XLSX generation: Remove invalid Unicode characters

This commit is contained in:
Raphael Michel
2022-12-09 17:41:33 +01:00
parent 979d23e997
commit 247a61489f
2 changed files with 10 additions and 3 deletions

View File

@@ -24,8 +24,7 @@ from inspect import isgenerator
from openpyxl import Workbook
from openpyxl.cell.cell import (
ILLEGAL_CHARACTERS_RE, KNOWN_TYPES, TIME_TYPES, TYPE_FORMULA, TYPE_STRING,
Cell,
KNOWN_TYPES, TIME_TYPES, TYPE_FORMULA, TYPE_STRING, Cell,
)
from openpyxl.compat import NUMERIC_TYPES
from openpyxl.utils import column_index_from_string
@@ -49,6 +48,12 @@ There are mainly two problems this solves:
- It removes characters considered invalid by Excel to avoid exporter crashes.
"""
# Matches any single character that falls outside the ``Char`` production of
# the XML specification:
#   Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
# The character class is negated, so everything it matches is a character
# that is not allowed by that production.
ILLEGAL_CHARACTERS_RE = re.compile(
    r'[^\u0020-\uD7FF\u0009\u000A\u000D\uE000-\uFFFD\U00010000-\U0010FFFF]'
)
def remove_invalid_excel_chars(val):
if isinstance(val, Cell):

View File

@@ -24,11 +24,13 @@ from openpyxl.cell.cell import TYPE_STRING
from pretix.helpers.safe_openpyxl import SafeWorkbook
def test_nullbyte_removed():
def test_invalid_byte_removed():
    """Appended strings come back from the sheet without the invalid characters."""
    workbook = SafeWorkbook()
    sheet = workbook.create_sheet()
    # Each case is appended as its own row and checked immediately, so the
    # row index follows the order of the cases (starting at row 1).
    cases = [
        ("foo\u0000bar", "foobar"),
        ("foo\uffffbaz", "foobaz"),
    ]
    for row_index, (raw_value, cleaned_value) in enumerate(cases, start=1):
        sheet.append([raw_value])
        assert sheet.cell(row_index, 1).value == cleaned_value
def test_no_formulas():