Markdown: Allow to escape domain name

2023-06-23 15:32:00 +02:00
parent 34e7a0fc31
commit b7f3f7a7a1
2 changed files with 154 additions and 25 deletions
--- a/src/pretix/base/templatetags/rich_text.py
+++ b/src/pretix/base/templatetags/rich_text.py
@@ -48,6 +48,8 @@ from django.utils.http import url_has_allowed_host_and_scheme
 from django.utils.safestring import mark_safe
 from markdown import Extension
 from markdown.inlinepatterns import SubstituteTagInlineProcessor
+from markdown.postprocessors import Postprocessor
+from markdown.treeprocessors import UnescapeTreeprocessor
 from tlds import tld_set

 register = template.Library()
@@ -185,6 +187,111 @@ class EmailNl2BrExtension(Extension):
        md.inlinePatterns.register(br_tag, 'nl', 5)


+class LinkifyPostprocessor(Postprocessor):
+    """
+    Markdown postprocessor that hands the serialized output to a linker.
+
+    ``linker`` is any object with a ``linkify(text)`` method; the callers in this
+    module pass a ``bleach.Linker``.
+    """
+
+    def __init__(self, linker):
+        super().__init__()
+        self.linker = linker
+
+    def run(self, text):
+        """Return ``text`` as rewritten by ``self.linker.linkify``."""
+        linkified = self.linker.linkify(text)
+        return linkified
+
+
+class CleanPostprocessor(Postprocessor):
+    """
+    Markdown postprocessor that sanitizes the serialized output with ``bleach.clean``.
+
+    The constructor arguments are stored and forwarded unchanged as the ``tags``,
+    ``attributes``, ``protocols`` and ``strip`` keyword arguments of ``bleach.clean``.
+    """
+
+    def __init__(self, tags, attributes, protocols, strip):
+        super().__init__()
+        self.tags, self.attributes = tags, attributes
+        self.protocols, self.strip = protocols, strip
+
+    def run(self, text):
+        """Return ``text`` cleaned according to the stored settings."""
+        clean_kwargs = {
+            'tags': self.tags,
+            'attributes': self.attributes,
+            'protocols': self.protocols,
+            'strip': self.strip,
+        }
+        return bleach.clean(text, **clean_kwargs)
+
+
+class CustomUnescapeTreeprocessor(UnescapeTreeprocessor):
+    """
+    Un-escape every backslash-escaped character except the dot.
+
+    An escaped dot (character code 46) is replaced by a textual placeholder instead
+    of ".", which CustomUnescapePostprocessor turns back into "." later.
+    """
+
+    def _unescape(self, m):
+        # m.group(1) holds the character code as a string of digits.
+        code = m.group(1)
+        if code != "46":
+            return chr(int(code))
+        # 46 == ord("."): keep the dot hidden behind a placeholder for now.
+        return "|escaped-dot-sGnY9LMK|"
+
+
+class CustomUnescapePostprocessor(Postprocessor):
+    """
+    Restore escaped dots: replace the placeholder text with a literal ".".
+    """
+
+    # NOTE(review): nothing in this class reads RE; run() uses str.replace instead.
+    RE = re.compile(r'&#46;')
+
+    def run(self, text):
+        """Return ``text`` with every placeholder occurrence replaced by "."."""
+        placeholder = "|escaped-dot-sGnY9LMK|"
+        return text.replace(placeholder, ".")
+
+
+class LinkifyAndCleanExtension(Extension):
+    r"""
+    We want to do:
+
+    input --> markdown --> bleach clean --> linkify --> output
+
+    Internally, the markdown library does:
+
+    source --> parse --> (tree|inline)processors --> serializing --> postprocessors
+
+    All escaped characters such as \. will be turned to something like <STX>46<ETX> in the processors
+    step and then will be converted to . back again in the last tree processor, before serialization.
+    Therefore, linkify does not see the escaped character anymore. This is annoying for the one case
+    where you want to type "rich_text.py" and *not* have it turned into a link, since you can't type
+    "rich_text\.py" either.
+
+    A simple solution would be to run linkify before markdown, but that may cause other issues when
+    linkify messes with the markdown syntax and it makes handling our attributes etc. harder.
+
+    So we do a weird hack where we modify the unescape processor to unescape everything EXCEPT for the
+    dot and then unescape that one manually after linkify. However, to make things even harder, the bleach
+    clean step removes any invisible characters, so we need to cheat a bit more: the escaped dot travels
+    through clean and linkify as a visible placeholder text and is only turned back into "." at the end.
+
+    :param linker: object whose ``linkify()`` is applied after cleaning (callers pass a ``bleach.Linker``)
+    :param tags: forwarded to ``bleach.clean`` as ``tags``
+    :param attributes: forwarded to ``bleach.clean`` as ``attributes``
+    :param protocols: forwarded to ``bleach.clean`` as ``protocols``
+    :param strip: forwarded to ``bleach.clean`` as ``strip``
+    """
+
+    def __init__(self, linker, tags, attributes, protocols, strip):
+        self.linker = linker
+        self.tags = tags
+        self.attributes = attributes
+        self.protocols = protocols
+        self.strip = strip
+        super().__init__()
+
+    def extendMarkdown(self, md):
+        """Swap in the dot-preserving unescape step and register clean, linkify and dot restore."""
+        # Replace the stock 'unescape' tree processor with the variant that leaves a
+        # placeholder in place of escaped dots.
+        md.treeprocessors.deregister('unescape')
+        md.treeprocessors.register(
+            CustomUnescapeTreeprocessor(md),
+            'unescape',
+            0
+        )
+        # Priorities 2 > 1 > 0 give the order described above:
+        # clean, then linkify, then restore the escaped dots.
+        md.postprocessors.register(
+            CleanPostprocessor(self.tags, self.attributes, self.protocols, self.strip),
+            'clean',
+            2
+        )
+        md.postprocessors.register(
+            LinkifyPostprocessor(self.linker),
+            'linkify',
+            1
+        )
+        # CustomUnescapePostprocessor has no __init__ of its own, so its only constructor
+        # argument is the base class's Markdown instance; pass md, not the linker.
+        md.postprocessors.register(
+            CustomUnescapePostprocessor(md),
+            'unescape_dot',
+            0
+        )
+
+
 def markdown_compile_email(source):
    linker = bleach.Linker(
        url_re=URL_RE,
@@ -192,18 +299,20 @@ def markdown_compile_email(source):
        callbacks=DEFAULT_CALLBACKS + [truelink_callback, abslink_callback],
        parse_email=True
    )
-    return linker.linkify(bleach.clean(
-        markdown.markdown(
-            source,
-            extensions=[
-                'markdown.extensions.sane_lists',
-                EmailNl2BrExtension(),
-            ]
-        ),
-        tags=ALLOWED_TAGS,
-        attributes=ALLOWED_ATTRIBUTES,
-        protocols=ALLOWED_PROTOCOLS,
-    ))
+    return markdown.markdown(
+        source,
+        extensions=[
+            'markdown.extensions.sane_lists',
+            EmailNl2BrExtension(),
+            LinkifyAndCleanExtension(
+                linker,
+                tags=ALLOWED_TAGS,
+                attributes=ALLOWED_ATTRIBUTES,
+                protocols=ALLOWED_PROTOCOLS,
+                strip=False,
+            )
+        ]
+    )


 class SnippetExtension(markdown.extensions.Extension):
@@ -213,23 +322,24 @@ class SnippetExtension(markdown.extensions.Extension):
        md.parser.blockprocessors.deregister('quote')


-def markdown_compile(source, snippet=False):
+def markdown_compile(source, linker, snippet=False):
+    """
+    Render markdown ``source`` with ``markdown.markdown`` and return the result.
+
+    Cleaning and linkifying are delegated to LinkifyAndCleanExtension, which receives ``linker``
+    and the bleach settings. With ``snippet=True`` the ALLOWED_TAGS_SNIPPET tag list is used,
+    ``strip`` is enabled for the clean step and SnippetExtension is added.
+    """
    tags = ALLOWED_TAGS_SNIPPET if snippet else ALLOWED_TAGS
    exts = [
        'markdown.extensions.sane_lists',
-        'markdown.extensions.nl2br'
+        'markdown.extensions.nl2br',
+        LinkifyAndCleanExtension(
+            linker,
+            tags=tags,
+            attributes=ALLOWED_ATTRIBUTES,
+            protocols=ALLOWED_PROTOCOLS,
+            strip=snippet,
+        )
    ]
    if snippet:
        exts.append(SnippetExtension())
-    return bleach.clean(
-        markdown.markdown(
-            source,
-            extensions=exts
-        ),
-        strip=snippet,
-        tags=tags,
-        attributes=ALLOWED_ATTRIBUTES,
-        protocols=ALLOWED_PROTOCOLS,
+    return markdown.markdown(
+        source,
+        extensions=exts
    )


@@ -245,7 +355,7 @@ def rich_text(text: str, **kwargs):
        callbacks=DEFAULT_CALLBACKS + ([truelink_callback, safelink_callback] if kwargs.get('safelinks', True) else [truelink_callback, abslink_callback]),
        parse_email=True
    )
-    body_md = linker.linkify(markdown_compile(text))
+    body_md = markdown_compile(text, linker)
    return mark_safe(body_md)


@@ -261,5 +371,5 @@ def rich_text_snippet(text: str, **kwargs):
        callbacks=DEFAULT_CALLBACKS + ([truelink_callback, safelink_callback] if kwargs.get('safelinks', True) else [truelink_callback, abslink_callback]),
        parse_email=True
    )
-    body_md = linker.linkify(markdown_compile(text, snippet=True))
+    body_md = markdown_compile(text, linker, snippet=True)
    return mark_safe(body_md)
--- a/src/tests/base/test_rich_text.py
+++ b/src/tests/base/test_rich_text.py
@@ -30,6 +30,8 @@ from pretix.base.templatetags.rich_text import (
    # Test link detection
    ("google.com",
     '<a href="http://google.com" rel="noopener" target="_blank">google.com</a>'),
+    # Test link escaping
+    ("google\\.com", 'google.com'),
    # Test abslink_callback
    ("[Call](tel:+12345)",
     '<a href="tel:+12345" rel="nofollow">Call</a>'),
@@ -79,3 +81,20 @@ def test_newline_handling(content, result):
 ])
 def test_newline_handling_email(content, result):
    assert markdown_compile_email(content) == result
+
+
+@pytest.mark.parametrize("content,result,result_snippet", [
+    # attributes: onclick and color are expected to be removed, the tags stay
+    ('<a onclick="javascript:foo()">foo</a>', '<p><a>foo</a></p>', '<a>foo</a>'),
+    ('<strong color="red">foo</strong>',
+     '<p><strong>foo</strong></p>',
+     '<strong>foo</strong>'),
+    # protocols: an href with a javascript: URL is expected to be removed
+    ('<a href="javascript:foo()">foo</a>', '<p><a>foo</a></p>', '<a>foo</a>'),
+    # tags: <script> is expected to be escaped in full mode and stripped in snippet mode
+    ('<script>foo</script>', '&lt;script&gt;foo&lt;/script&gt;', 'foo'),
+])
+def test_cleanup(content, result, result_snippet):
+    """
+    Check the clean step in all three entry points: rich_text and markdown_compile_email
+    must produce ``result``, rich_text_snippet must produce ``result_snippet``.
+    """
+    assert rich_text(content) == result
+    assert rich_text_snippet(content) == result_snippet
+    assert markdown_compile_email(content) == result