Markdown link parser: Fix fediverse URLs and URLs with user or path (#5563)

This commit is contained in:
Raphael Michel
2025-10-29 10:01:05 +01:00
committed by GitHub
parent 9461ac27f9
commit 4b9f1712f0
2 changed files with 52 additions and 1 deletions

View File

@@ -54,6 +54,19 @@ from tlds import tld_set
register = template.Library()
def build_fediverse_re(tlds):
    """Build a compiled regex that recognises fediverse handles.

    A handle has the shape ``@user@host.tld``. It may be preceded by any
    number of opening parentheses and followed by a ``:port`` suffix.

    :param tlds: iterable of top-level domain strings; they are sorted and
        joined into a single alternation that the host name must end in.
    :returns: a pattern compiled with ``re.IGNORECASE``, ``re.VERBOSE`` and
        ``re.UNICODE``.
    """
    # The template is compiled in VERBOSE mode, so the line breaks and the
    # trailing "# ..." remarks inside the literal are ignored by the engine.
    template = r"""\(* # Match any opening parentheses.
@[^@]+@
([\w-]+\.)+(?:{0})(?:\:[0-9]+)?(?!\.\w)\b # xx.yy.tld(:##)?
"""
    tld_alternation = "|".join(sorted(tlds))
    flags = re.IGNORECASE | re.VERBOSE | re.UNICODE
    return re.compile(template.format(tld_alternation), flags)
ALLOWED_TAGS_SNIPPET = {
'a',
'abbr',
@@ -112,6 +125,8 @@ URL_RE = SimpleLazyObject(lambda: build_url_re(tlds=sorted(tld_set, key=len, rev
EMAIL_RE = SimpleLazyObject(lambda: build_email_re(tlds=sorted(tld_set, key=len, reverse=True)))
FEDIVERSE_RE = SimpleLazyObject(lambda: build_fediverse_re(tlds=sorted(tld_set, key=len, reverse=True)))
DOT_ESCAPE = "|escaped-dot-sGnY9LMK|"
@@ -144,9 +159,11 @@ def truelink_callback(attrs, new=False):
<a href="https://maps.google.com/location/foo">https://maps.google.com</a>
"""
text = re.sub(r'[^a-zA-Z0-9.\-/_ ]', '', attrs.get('_text')) # clean up link text
text = re.sub(r'[^a-zA-Z0-9.\-/_@: ]', '', attrs.get('_text')) # clean up link text
url = attrs.get((None, 'href'), '/')
href_url = urllib.parse.urlparse(url)
# Verify that link text which looks like a URL points to the same server (and path prefix) as the href
if (None, 'href') in attrs and URL_RE.match(text) and href_url.scheme not in ('tel', 'mailto'):
# link text looks like a url
if text.startswith('//'):
@@ -154,10 +171,20 @@ def truelink_callback(attrs, new=False):
elif not text.startswith('http'):
text = 'https://' + text
text_url = urllib.parse.urlparse(text)
if text_url.netloc.split("@")[-1] != href_url.netloc.split("@")[-1] or not href_url.path.startswith(text_url.path):
# link text contains a URL that has a different base than the actual URL
attrs['_text'] = attrs[None, 'href']
# Verify server name of mastodon display names (@name@server.tld): the handle
# is rewritten to the profile URL it implies (https://server.tld/@name) and
# that URL is compared against the real link target.
if (None, 'href') in attrs and FEDIVERSE_RE.match(text):
    # FEDIVERSE_RE only matches text containing at least two "@", so
    # parts[1] is the user name and parts[2] the server part.
    parts = text.split('@')
    text = f'https://{parts[2]}/@{parts[1]}'
    text_url = urllib.parse.urlparse(text)
    # The href path must be checked against the path implied by the link
    # text; comparing href_url.path with itself would always succeed and
    # leave the path unverified.
    if text_url.netloc != href_url.netloc or not href_url.path.startswith(text_url.path):
        # link text names a different server or profile than the actual URL
        attrs['_text'] = attrs[None, 'href']
return attrs

View File

@@ -73,10 +73,34 @@ from pretix.base.templatetags.rich_text import (
'<a href="https://goodsite.com.evilsite.com">goodsite.com</a>',
'<a href="https://goodsite.com.evilsite.com" rel="noopener" target="_blank">https://goodsite.com.evilsite.com</a>',
),
(
'<a href="https://evilsite.com/deep/path">evilsite.com/bad/path/</a>',
'<a href="https://evilsite.com/deep/path" rel="noopener" target="_blank">https://evilsite.com/deep/path</a>',
),
(
'<a href="https://evilsite.com/deep/path">evilsite.com/deep</a>',
'<a href="https://evilsite.com/deep/path" rel="noopener" target="_blank">evilsite.com/deep</a>',
),
(
'<a href="https://evilsite.com/deep/path">evilsite.com</a>',
'<a href="https://evilsite.com/deep/path" rel="noopener" target="_blank">evilsite.com</a>',
),
(
'<a href="https://user:pass@evilsite.com/deep/path">evilsite.com</a>',
'<a href="https://user:pass@evilsite.com/deep/path" rel="noopener" target="_blank">evilsite.com</a>',
),
(
'<a href="https://foo:bar@evilsite.com/deep/path">https://foo:bar@goodsite.com</a>',
'<a href="https://foo:bar@evilsite.com/deep/path" rel="noopener" target="_blank">https://foo:bar@evilsite.com/deep/path</a>',
),
(
'<a href="https://pretix.social/@pretix">@pretix@pretix.social</a>',
'<a href="https://pretix.social/@pretix" rel="noopener" target="_blank">@pretix@pretix.social</a>',
),
(
'<a href="https://evilsite.social/@pretix">@pretix@pretix.social</a>',
'<a href="https://evilsite.social/@pretix" rel="noopener" target="_blank">https://evilsite.social/@pretix</a>',
),
("<a>broken</a>", "<a>broken</a>"),
],
)