Markdown: Allow to escape domain name

This commit is contained in:
Raphael Michel
2023-06-23 15:32:00 +02:00
parent 34e7a0fc31
commit b7f3f7a7a1
2 changed files with 154 additions and 25 deletions

View File

@@ -48,6 +48,8 @@ from django.utils.http import url_has_allowed_host_and_scheme
from django.utils.safestring import mark_safe
from markdown import Extension
from markdown.inlinepatterns import SubstituteTagInlineProcessor
from markdown.postprocessors import Postprocessor
from markdown.treeprocessors import UnescapeTreeprocessor
from tlds import tld_set
register = template.Library()
@@ -185,6 +187,111 @@ class EmailNl2BrExtension(Extension):
md.inlinePatterns.register(br_tag, 'nl', 5)
class LinkifyPostprocessor(Postprocessor):
    """
    Markdown postprocessor that hands the serialized document to a linker.

    :param linker: Object exposing a ``linkify(text)`` method whose return value
                   becomes the new document text.
    """

    def __init__(self, linker):
        super().__init__()
        self.linker = linker

    def run(self, text):
        # Whatever the linker returns replaces the document text as-is.
        linkified = self.linker.linkify(text)
        return linkified
class CleanPostprocessor(Postprocessor):
    """
    Markdown postprocessor that sanitizes the serialized document with ``bleach.clean``.

    The constructor arguments are stored on the instance and forwarded unchanged
    as the ``tags``, ``attributes``, ``protocols`` and ``strip`` keyword
    arguments of ``bleach.clean`` every time the processor runs.
    """

    def __init__(self, tags, attributes, protocols, strip):
        super().__init__()
        self.tags = tags
        self.attributes = attributes
        self.protocols = protocols
        self.strip = strip

    def run(self, text):
        # Read the settings at call time so later changes to the attributes
        # are picked up.
        clean_options = {
            'tags': self.tags,
            'attributes': self.attributes,
            'protocols': self.protocols,
            'strip': self.strip,
        }
        return bleach.clean(text, **clean_options)
class CustomUnescapeTreeprocessor(UnescapeTreeprocessor):
    """
    Un-escapes every escaped character except the dot (\\.).

    An escaped dot is turned into a visible text placeholder instead of a real
    ".", so that it can be told apart from an ordinary dot later on.
    """

    def _unescape(self, m):
        # Group 1 of the match holds the decimal code point of the escaped
        # character as a string; 46 is the code point of ".".
        code_point = m.group(1)
        if code_point != "46":
            return chr(int(code_point))
        return "|escaped-dot-sGnY9LMK|"
class CustomUnescapePostprocessor(Postprocessor):
    """
    Restore escaped dots.

    Replaces every occurrence of the ``|escaped-dot-sGnY9LMK|`` placeholder in
    the serialized document with a literal ".".
    """

    # NOTE: an unused ``RE = re.compile(r'.')`` class attribute was removed here.
    # Nothing referenced it, and the pattern matched any character rather than
    # a literal dot.

    def run(self, text):
        # Plain string replacement is sufficient: the placeholder is a fixed token.
        return text.replace("|escaped-dot-sGnY9LMK|", ".")
class LinkifyAndCleanExtension(Extension):
    r"""
    We want to do:

    input --> markdown --> bleach clean --> linkify --> output

    Internally, the markdown library does:

    source --> parse --> (tree|inline)processors --> serializing --> postprocessors

    All escaped characters such as \. will be turned to something like <STX>46<ETX> in the processors
    step and then will be converted to . back again in the last tree processor, before serialization.
    Therefore, linkify does not see the escaped character anymore. This is annoying for the one case
    where you want to type "rich_text.py" and *not* have it turned into a link, since you can't type
    "rich_text\.py" either.

    A simple solution would be to run linkify before markdown, but that may cause other issues when
    linkify messes with the markdown syntax and it makes handling our attributes etc. harder.

    So we do a weird hack where we modify the unescape processor to unescape everything EXCEPT for the
    dot and then unescape that one manually after linkify. However, to make things even harder, the bleach
    clean step removes any invisible characters, so we need to cheat a bit more.

    :param linker: Linker object handed to ``LinkifyPostprocessor``.
    :param tags: Forwarded to ``CleanPostprocessor`` (``tags`` of ``bleach.clean``).
    :param attributes: Forwarded to ``CleanPostprocessor`` (``attributes`` of ``bleach.clean``).
    :param protocols: Forwarded to ``CleanPostprocessor`` (``protocols`` of ``bleach.clean``).
    :param strip: Forwarded to ``CleanPostprocessor`` (``strip`` of ``bleach.clean``).
    """

    def __init__(self, linker, tags, attributes, protocols, strip):
        self.linker = linker
        self.tags = tags
        self.attributes = attributes
        self.protocols = protocols
        self.strip = strip
        super().__init__()

    def extendMarkdown(self, md):
        # Swap the stock unescape tree processor for the variant that leaves
        # a text placeholder in place of escaped dots.
        md.treeprocessors.deregister('unescape')
        md.treeprocessors.register(
            CustomUnescapeTreeprocessor(md),
            'unescape',
            0
        )

        # The priorities 2 > 1 > 0 are meant to give the order described above:
        # clean, then linkify, then restore the escaped dots.
        md.postprocessors.register(
            CleanPostprocessor(self.tags, self.attributes, self.protocols, self.strip),
            'clean',
            2
        )
        md.postprocessors.register(
            LinkifyPostprocessor(self.linker),
            'linkify',
            1
        )
        md.postprocessors.register(
            # The inherited constructor takes the Markdown instance, not the
            # linker, so pass ``md`` like for the tree processor above.
            CustomUnescapePostprocessor(md),
            'unescape_dot',
            0
        )
def markdown_compile_email(source):
linker = bleach.Linker(
url_re=URL_RE,
@@ -192,18 +299,20 @@ def markdown_compile_email(source):
callbacks=DEFAULT_CALLBACKS + [truelink_callback, abslink_callback],
parse_email=True
)
return linker.linkify(bleach.clean(
markdown.markdown(
source,
extensions=[
'markdown.extensions.sane_lists',
EmailNl2BrExtension(),
]
),
tags=ALLOWED_TAGS,
attributes=ALLOWED_ATTRIBUTES,
protocols=ALLOWED_PROTOCOLS,
))
return markdown.markdown(
source,
extensions=[
'markdown.extensions.sane_lists',
EmailNl2BrExtension(),
LinkifyAndCleanExtension(
linker,
tags=ALLOWED_TAGS,
attributes=ALLOWED_ATTRIBUTES,
protocols=ALLOWED_PROTOCOLS,
strip=False,
)
]
)
class SnippetExtension(markdown.extensions.Extension):
@@ -213,23 +322,24 @@ class SnippetExtension(markdown.extensions.Extension):
md.parser.blockprocessors.deregister('quote')
def markdown_compile(source, snippet=False):
def markdown_compile(source, linker, snippet=False):
tags = ALLOWED_TAGS_SNIPPET if snippet else ALLOWED_TAGS
exts = [
'markdown.extensions.sane_lists',
'markdown.extensions.nl2br'
'markdown.extensions.nl2br',
LinkifyAndCleanExtension(
linker,
tags=tags,
attributes=ALLOWED_ATTRIBUTES,
protocols=ALLOWED_PROTOCOLS,
strip=snippet,
)
]
if snippet:
exts.append(SnippetExtension())
return bleach.clean(
markdown.markdown(
source,
extensions=exts
),
strip=snippet,
tags=tags,
attributes=ALLOWED_ATTRIBUTES,
protocols=ALLOWED_PROTOCOLS,
return markdown.markdown(
source,
extensions=exts
)
@@ -245,7 +355,7 @@ def rich_text(text: str, **kwargs):
callbacks=DEFAULT_CALLBACKS + ([truelink_callback, safelink_callback] if kwargs.get('safelinks', True) else [truelink_callback, abslink_callback]),
parse_email=True
)
body_md = linker.linkify(markdown_compile(text))
body_md = markdown_compile(text, linker)
return mark_safe(body_md)
@@ -261,5 +371,5 @@ def rich_text_snippet(text: str, **kwargs):
callbacks=DEFAULT_CALLBACKS + ([truelink_callback, safelink_callback] if kwargs.get('safelinks', True) else [truelink_callback, abslink_callback]),
parse_email=True
)
body_md = linker.linkify(markdown_compile(text, snippet=True))
body_md = markdown_compile(text, linker, snippet=True)
return mark_safe(body_md)

View File

@@ -30,6 +30,8 @@ from pretix.base.templatetags.rich_text import (
# Test link detection
("google.com",
'<a href="http://google.com" rel="noopener" target="_blank">google.com</a>'),
# Test link escaping
("google\\.com", 'google.com'),
# Test abslink_callback
("[Call](tel:+12345)",
'<a href="tel:+12345" rel="nofollow">Call</a>'),
@@ -79,3 +81,20 @@ def test_newline_handling(content, result):
])
def test_newline_handling_email(content, result):
assert markdown_compile_email(content) == result
@pytest.mark.parametrize(
    "content,result,result_snippet",
    [
        # attributes
        (
            '<a onclick="javascript:foo()">foo</a>',
            '<p><a>foo</a></p>',
            '<a>foo</a>',
        ),
        (
            '<strong color="red">foo</strong>',
            '<p><strong>foo</strong></p>',
            '<strong>foo</strong>',
        ),
        # protocols
        (
            '<a href="javascript:foo()">foo</a>',
            '<p><a>foo</a></p>',
            '<a>foo</a>',
        ),
        # tags
        (
            '<script>foo</script>',
            '&lt;script&gt;foo&lt;/script&gt;',
            'foo',
        ),
    ],
)
def test_cleanup(content, result, result_snippet):
    """
    Check that disallowed attributes, protocols and tags are cleaned up.

    ``result`` is the expected output of both ``rich_text`` and
    ``markdown_compile_email``; ``result_snippet`` is the expected output of
    ``rich_text_snippet``.
    """
    full_html = rich_text(content)
    assert full_html == result

    snippet_html = rich_text_snippet(content)
    assert snippet_html == result_snippet

    email_html = markdown_compile_email(content)
    assert email_html == result