From 247a61489f8018d6f3150efa837ec75d9eabf28b Mon Sep 17 00:00:00 2001
From: Raphael Michel
Date: Fri, 9 Dec 2022 17:41:33 +0100
Subject: [PATCH] XLSX generation: Remove invalid unicode characters

---
 src/pretix/helpers/safe_openpyxl.py     | 9 +++++++--
 src/tests/helpers/test_safe_openpyxl.py | 4 +++-
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/src/pretix/helpers/safe_openpyxl.py b/src/pretix/helpers/safe_openpyxl.py
index 4210703e7..7d460b5ba 100644
--- a/src/pretix/helpers/safe_openpyxl.py
+++ b/src/pretix/helpers/safe_openpyxl.py
@@ -24,8 +24,7 @@ from inspect import isgenerator
 
 from openpyxl import Workbook
 from openpyxl.cell.cell import (
-    ILLEGAL_CHARACTERS_RE, KNOWN_TYPES, TIME_TYPES, TYPE_FORMULA, TYPE_STRING,
-    Cell,
+    KNOWN_TYPES, TIME_TYPES, TYPE_FORMULA, TYPE_STRING, Cell,
 )
 from openpyxl.compat import NUMERIC_TYPES
 from openpyxl.utils import column_index_from_string
@@ -49,6 +48,12 @@ There are mainly two problems this solves:
 - It removes characters considered invalid by Excel to avoid exporter crashes.
 """
 
+ILLEGAL_CHARACTERS_RE = re.compile(
+    # From the XML specification
+    # Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
+    r'[^\u0020-\uD7FF\u0009\u000A\u000D\uE000-\uFFFD\U00010000-\U0010FFFF]'
+)
+
 
 def remove_invalid_excel_chars(val):
     if isinstance(val, Cell):
diff --git a/src/tests/helpers/test_safe_openpyxl.py b/src/tests/helpers/test_safe_openpyxl.py
index e9a314598..64e53e155 100644
--- a/src/tests/helpers/test_safe_openpyxl.py
+++ b/src/tests/helpers/test_safe_openpyxl.py
@@ -24,11 +24,13 @@ from openpyxl.cell.cell import TYPE_STRING
 from pretix.helpers.safe_openpyxl import SafeWorkbook
 
 
-def test_nullbyte_removed():
+def test_invalid_byte_removed():
     wb = SafeWorkbook()
     ws = wb.create_sheet()
     ws.append(["foo\u0000bar"])
     assert ws.cell(1, 1).value == "foobar"
+    ws.append(["foo\uffffbaz"])
+    assert ws.cell(2, 1).value == "foobaz"
 
 
 def test_no_formulas():