From b7f3f7a7a1b0132fcbd6fe6e82c8ccd1243f4d73 Mon Sep 17 00:00:00 2001 From: Raphael Michel Date: Fri, 23 Jun 2023 15:32:00 +0200 Subject: [PATCH] Markdown: Allow to escape domain name --- src/pretix/base/templatetags/rich_text.py | 160 ++++++++++++++++++---- src/tests/base/test_rich_text.py | 19 +++ 2 files changed, 154 insertions(+), 25 deletions(-) diff --git a/src/pretix/base/templatetags/rich_text.py b/src/pretix/base/templatetags/rich_text.py index f75188f325..91331f7a40 100644 --- a/src/pretix/base/templatetags/rich_text.py +++ b/src/pretix/base/templatetags/rich_text.py @@ -48,6 +48,8 @@ from django.utils.http import url_has_allowed_host_and_scheme from django.utils.safestring import mark_safe from markdown import Extension from markdown.inlinepatterns import SubstituteTagInlineProcessor +from markdown.postprocessors import Postprocessor +from markdown.treeprocessors import UnescapeTreeprocessor from tlds import tld_set register = template.Library() @@ -185,6 +187,111 @@ class EmailNl2BrExtension(Extension): md.inlinePatterns.register(br_tag, 'nl', 5) +class LinkifyPostprocessor(Postprocessor): + def __init__(self, linker): + self.linker = linker + super().__init__() + + def run(self, text): + return self.linker.linkify(text) + + +class CleanPostprocessor(Postprocessor): + def __init__(self, tags, attributes, protocols, strip): + self.tags = tags + self.attributes = attributes + self.protocols = protocols + self.strip = strip + super().__init__() + + def run(self, text): + return bleach.clean( + text, + tags=self.tags, + attributes=self.attributes, + protocols=self.protocols, + strip=self.strip + ) + + +class CustomUnescapeTreeprocessor(UnescapeTreeprocessor): + """ + This un-escapes everything except \\. + """ + + def _unescape(self, m): + if m.group(1) == "46": + return "|escaped-dot-sGnY9LMK|" + return chr(int(m.group(1))) + + +class CustomUnescapePostprocessor(Postprocessor): + """ + Restore escaped . 
+ """ + + RE = re.compile(r'.') + + def run(self, text): + return text.replace("|escaped-dot-sGnY9LMK|", ".") + + +class LinkifyAndCleanExtension(Extension): + r""" + We want to do: + + input --> markdown --> bleach clean --> linkify --> output + + Internally, the markdown library does: + + source --> parse --> (tree|inline)processors --> serializing --> postprocessors + + All escaped characters such as \. will be turned to a placeholder like <STX>46<ETX> (the character code wrapped in invisible control characters) in the processors + step and then will be converted back to "." in the last tree processor, before serialization. + Therefore, linkify does not see the escaped character anymore. This is annoying for the one case + where you want to type "rich_text.py" and *not* have it turned into a link, since you can't type + "rich_text\.py" either. + + A simple solution would be to run linkify before markdown, but that may cause other issues when + linkify messes with the markdown syntax and it makes handling our attributes etc. harder. + + So we do a weird hack where we modify the unescape processor to unescape everything EXCEPT for the + dot and then unescape that one manually after linkify. However, to make things even harder, the bleach + clean step removes any invisible characters, so we need to cheat a bit more. 
+ """ + + def __init__(self, linker, tags, attributes, protocols, strip): + self.linker = linker + self.tags = tags + self.attributes = attributes + self.protocols = protocols + self.strip = strip + super().__init__() + + def extendMarkdown(self, md): + md.treeprocessors.deregister('unescape') + md.treeprocessors.register( + CustomUnescapeTreeprocessor(md), + 'unescape', + 0 + ) + md.postprocessors.register( + CleanPostprocessor(self.tags, self.attributes, self.protocols, self.strip), + 'clean', + 2 + ) + md.postprocessors.register( + LinkifyPostprocessor(self.linker), + 'linkify', + 1 + ) + md.postprocessors.register( + CustomUnescapePostprocessor(self.linker), + 'unescape_dot', + 0 + ) + + def markdown_compile_email(source): linker = bleach.Linker( url_re=URL_RE, @@ -192,18 +299,20 @@ def markdown_compile_email(source): callbacks=DEFAULT_CALLBACKS + [truelink_callback, abslink_callback], parse_email=True ) - return linker.linkify(bleach.clean( - markdown.markdown( - source, - extensions=[ - 'markdown.extensions.sane_lists', - EmailNl2BrExtension(), - ] - ), - tags=ALLOWED_TAGS, - attributes=ALLOWED_ATTRIBUTES, - protocols=ALLOWED_PROTOCOLS, - )) + return markdown.markdown( + source, + extensions=[ + 'markdown.extensions.sane_lists', + EmailNl2BrExtension(), + LinkifyAndCleanExtension( + linker, + tags=ALLOWED_TAGS, + attributes=ALLOWED_ATTRIBUTES, + protocols=ALLOWED_PROTOCOLS, + strip=False, + ) + ] + ) class SnippetExtension(markdown.extensions.Extension): @@ -213,23 +322,24 @@ class SnippetExtension(markdown.extensions.Extension): md.parser.blockprocessors.deregister('quote') -def markdown_compile(source, snippet=False): +def markdown_compile(source, linker, snippet=False): tags = ALLOWED_TAGS_SNIPPET if snippet else ALLOWED_TAGS exts = [ 'markdown.extensions.sane_lists', - 'markdown.extensions.nl2br' + 'markdown.extensions.nl2br', + LinkifyAndCleanExtension( + linker, + tags=tags, + attributes=ALLOWED_ATTRIBUTES, + protocols=ALLOWED_PROTOCOLS, + 
strip=snippet, + ) ] if snippet: exts.append(SnippetExtension()) - return bleach.clean( - markdown.markdown( - source, - extensions=exts - ), - strip=snippet, - tags=tags, - attributes=ALLOWED_ATTRIBUTES, - protocols=ALLOWED_PROTOCOLS, + return markdown.markdown( + source, + extensions=exts ) @@ -245,7 +355,7 @@ def rich_text(text: str, **kwargs): callbacks=DEFAULT_CALLBACKS + ([truelink_callback, safelink_callback] if kwargs.get('safelinks', True) else [truelink_callback, abslink_callback]), parse_email=True ) - body_md = linker.linkify(markdown_compile(text)) + body_md = markdown_compile(text, linker) return mark_safe(body_md) @@ -261,5 +371,5 @@ def rich_text_snippet(text: str, **kwargs): callbacks=DEFAULT_CALLBACKS + ([truelink_callback, safelink_callback] if kwargs.get('safelinks', True) else [truelink_callback, abslink_callback]), parse_email=True ) - body_md = linker.linkify(markdown_compile(text, snippet=True)) + body_md = markdown_compile(text, linker, snippet=True) return mark_safe(body_md) diff --git a/src/tests/base/test_rich_text.py b/src/tests/base/test_rich_text.py index 921b1efd20..435b3d5408 100644 --- a/src/tests/base/test_rich_text.py +++ b/src/tests/base/test_rich_text.py @@ -30,6 +30,8 @@ from pretix.base.templatetags.rich_text import ( # Test link detection ("google.com", 'google.com'), + # Test link escaping + ("google\\.com", 'google.com'), # Test abslink_callback ("[Call](tel:+12345)", 'Call'), @@ -79,3 +81,20 @@ def test_newline_handling(content, result): ]) def test_newline_handling_email(content, result): assert markdown_compile_email(content) == result + + +@pytest.mark.parametrize("content,result,result_snippet", [ + # attributes + ('foo', '

foo

', 'foo'), + ('foo', + '

foo

', + 'foo'), + # protocols + ('foo', '

foo

', 'foo'), + # tags + ('', '<script>foo</script>', 'foo'), +]) +def test_cleanup(content, result, result_snippet): + assert rich_text(content) == result + assert rich_text_snippet(content) == result_snippet + assert markdown_compile_email(content) == result