forked from CGM_Public/pretix_original
Data shredder optimizations (#3429)
Co-authored-by: Martin Gross <gross@rami.io>
This commit is contained in:
@@ -32,11 +32,12 @@
|
||||
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
# License for the specific language governing permissions and limitations under the License.
|
||||
|
||||
import copy
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
from typing import List, Tuple
|
||||
|
||||
from django.db import transaction
|
||||
from django.db.models import Max, Q
|
||||
from django.db.models.functions import Greatest
|
||||
from django.dispatch import receiver
|
||||
@@ -99,11 +100,13 @@ class BaseDataShredder:
|
||||
"""
|
||||
raise NotImplementedError() # NOQA
|
||||
|
||||
def shred_data(self):
|
||||
def shred_data(self, progress_callback=None):
|
||||
"""
|
||||
This method is called to actually remove the data from the system. You should remove any database objects
|
||||
here.
|
||||
|
||||
You can call ``progress_callback`` with an integer value between 0 and 100 to communicate back your progress.
|
||||
|
||||
You should never delete ``LogEntry`` objects, but you might modify them to remove personal data. In this
|
||||
case, set the ``LogEntry.shredded`` attribute to ``True`` to show that this is no longer original log data.
|
||||
"""
|
||||
@@ -151,6 +154,7 @@ class BaseDataShredder:
|
||||
|
||||
def shred_log_fields(logentry, banlist=None, whitelist=None):
|
||||
d = logentry.parsed_data
|
||||
initial_data = copy.copy(d)
|
||||
shredded = False
|
||||
if whitelist:
|
||||
for k, v in d.items():
|
||||
@@ -162,9 +166,61 @@ def shred_log_fields(logentry, banlist=None, whitelist=None):
|
||||
if f in d:
|
||||
d[f] = '█'
|
||||
shredded = True
|
||||
logentry.data = json.dumps(d)
|
||||
logentry.shredded = logentry.shredded or shredded
|
||||
logentry.save(update_fields=['data', 'shredded'])
|
||||
if d != initial_data:
|
||||
logentry.data = json.dumps(d)
|
||||
logentry.shredded = logentry.shredded or shredded
|
||||
logentry.save(update_fields=['data', 'shredded'])
|
||||
|
||||
|
||||
def slow_update(qs, batch_size=1000, sleep_time=.5, progress_callback=None, progress_offset=0, progress_total=None, **update):
    """
    Doing UPDATE queries on hundreds of thousands of rows can cause outages due to high write load on the database.
    This provides a throttled way to update rows. The condition for this to work properly is that the queryset has a
    filter condition that no longer applies after the update!
    Otherwise, this will be an endless loop!

    :param qs: Queryset selecting the rows that still need updating.
    :param batch_size: Maximum number of rows touched per UPDATE statement.
    :param sleep_time: Seconds to pause between batches once substantial work has been done.
    :param progress_callback: Optional callable receiving a value between 0 and 100.
    :param progress_offset: Number of rows already processed by earlier steps (for a combined progress scale).
    :param progress_total: Total number of rows across all steps; progress is only reported if this is set.
    :param update: Field values passed through to ``QuerySet.update()``.
    :return: Total number of rows updated.
    """
    total_updated = 0
    while True:
        # Restrict the UPDATE to a bounded batch of primary keys so each write
        # transaction stays short. order_by() clears any default ordering,
        # which is required for slicing and keeps the subquery cheap.
        updated = qs.order_by().filter(
            pk__in=qs.order_by().values_list('pk', flat=True)[:batch_size]
        ).update(**update)
        total_updated += updated
        if not updated:
            break
        if total_updated >= 0.8 * batch_size:
            # Only throttle once a substantial amount of work has been done,
            # so small updates finish without an artificial delay.
            time.sleep(sleep_time)
        if progress_callback and progress_total:
            # Scale to 0..100 to match the progress contract used by
            # BaseDataShredder.shred_data() and _progress_helper().
            progress_callback((progress_offset + total_updated) / progress_total * 100)

    return total_updated
|
||||
|
||||
|
||||
def slow_delete(qs, batch_size=1000, sleep_time=.5, progress_callback=None, progress_offset=0, progress_total=None):
    """
    Doing DELETE queries on hundreds of thousands of rows can cause outages due to high write load on the database.
    This provides a throttled way to delete rows.

    :param qs: Queryset selecting the rows to delete.
    :param batch_size: Maximum number of rows deleted per DELETE statement.
    :param sleep_time: Seconds to pause between batches once substantial work has been done.
    :param progress_callback: Optional callable receiving a value between 0 and 100.
    :param progress_offset: Number of rows already processed by earlier steps (for a combined progress scale).
    :param progress_total: Total number of rows across all steps; progress is only reported if this is set.
    :return: Total number of rows deleted.
    """
    total_deleted = 0
    while True:
        # Delete a bounded batch of primary keys so each write transaction
        # stays short; order_by() clears any default ordering (required for
        # slicing). QuerySet.delete() returns (count, per_model_dict).
        deleted = qs.order_by().filter(
            pk__in=qs.order_by().values_list('pk', flat=True)[:batch_size]
        ).delete()[0]
        total_deleted += deleted
        if not deleted:
            break
        if total_deleted >= 0.8 * batch_size:
            # Only throttle once a substantial amount of work has been done.
            time.sleep(sleep_time)
        if progress_callback and progress_total:
            # Fix: the callback parameters were accepted but never used, so
            # callers passing progress_callback got no progress reports.
            # Report on a 0..100 scale, consistent with slow_update().
            progress_callback((progress_offset + total_deleted) / progress_total * 100)
    return total_deleted
|
||||
|
||||
|
||||
def _progress_helper(queryset, progress_callback, offset, total):
|
||||
if not progress_callback:
|
||||
yield from queryset
|
||||
else:
|
||||
for i, o in enumerate(queryset):
|
||||
yield o
|
||||
if i % 10 == 0:
|
||||
progress_callback((i + offset) / total * 100)
|
||||
|
||||
|
||||
class PhoneNumberShredder(BaseDataShredder):
|
||||
@@ -177,18 +233,26 @@ class PhoneNumberShredder(BaseDataShredder):
|
||||
o.code: o.phone for o in self.event.orders.filter(phone__isnull=False)
|
||||
}, cls=CustomJSONEncoder, indent=4)
|
||||
|
||||
@transaction.atomic
|
||||
def shred_data(self):
|
||||
for o in self.event.orders.all():
|
||||
def shred_data(self, progress_callback=None):
|
||||
qs_orders = self.event.orders.all()
|
||||
qs_orders_cnt = qs_orders.count()
|
||||
qs_le = self.event.logentry_set.filter(action_type="pretix.event.order.phone.changed")
|
||||
qs_le_cnt = qs_le.count()
|
||||
total = qs_le_cnt + qs_orders_cnt
|
||||
|
||||
for o in _progress_helper(qs_orders, progress_callback, 0, total):
|
||||
changed = bool(o.phone)
|
||||
o.phone = None
|
||||
d = o.meta_info_data
|
||||
if d:
|
||||
if 'contact_form_data' in d and 'phone' in d['contact_form_data']:
|
||||
changed = True
|
||||
del d['contact_form_data']['phone']
|
||||
o.meta_info = json.dumps(d)
|
||||
o.save(update_fields=['meta_info', 'phone'])
|
||||
o.meta_info = json.dumps(d)
|
||||
if changed:
|
||||
o.save(update_fields=['meta_info', 'phone'])
|
||||
|
||||
for le in self.event.logentry_set.filter(action_type="pretix.event.order.phone.changed"):
|
||||
for le in _progress_helper(qs_le, progress_callback, qs_orders_cnt, total):
|
||||
shred_log_fields(le, banlist=['old_phone', 'new_phone'])
|
||||
|
||||
|
||||
@@ -207,37 +271,66 @@ class EmailAddressShredder(BaseDataShredder):
|
||||
for op in OrderPosition.all.filter(order__event=self.event, attendee_email__isnull=False)
|
||||
}, indent=4)
|
||||
|
||||
@transaction.atomic
|
||||
def shred_data(self):
|
||||
OrderPosition.all.filter(order__event=self.event, attendee_email__isnull=False).update(attendee_email=None)
|
||||
def shred_data(self, progress_callback=None):
|
||||
qs_op = OrderPosition.all.filter(order__event=self.event, attendee_email__isnull=False)
|
||||
qs_op_cnt = qs_op.count()
|
||||
|
||||
for o in self.event.orders.all():
|
||||
qs_orders = self.event.orders.all()
|
||||
qs_orders_cnt = qs_orders.count()
|
||||
|
||||
qs_le = self.event.logentry_set.filter(
|
||||
Q(action_type__contains="order.email") | Q(action_type__contains="position.email") |
|
||||
Q(action_type="pretix.event.order.contact.changed") |
|
||||
Q(action_type="pretix.event.order.modified")
|
||||
).exclude(data="")
|
||||
qs_le_cnt = qs_le.count()
|
||||
|
||||
total = qs_op_cnt + qs_orders_cnt + qs_le_cnt
|
||||
|
||||
slow_update(
|
||||
qs_op,
|
||||
attendee_email=None,
|
||||
progress_callback=progress_callback,
|
||||
progress_offset=0,
|
||||
progress_total=total,
|
||||
# Updates to order position table are slow, since PostgreSQL needs to update many indexes, so let's
|
||||
# take them really slowly to not overwhelm the database.
|
||||
batch_size=100,
|
||||
sleep_time=2,
|
||||
)
|
||||
|
||||
for o in _progress_helper(qs_orders, progress_callback, qs_op_cnt, total):
|
||||
changed = bool(o.email) or bool(o.customer)
|
||||
o.email = None
|
||||
o.customer = None
|
||||
d = o.meta_info_data
|
||||
if d:
|
||||
if 'contact_form_data' in d and 'email' in d['contact_form_data']:
|
||||
del d['contact_form_data']['email']
|
||||
o.meta_info = json.dumps(d)
|
||||
o.save(update_fields=['meta_info', 'email', 'customer'])
|
||||
changed = True
|
||||
o.meta_info = json.dumps(d)
|
||||
if 'contact_form_data' in d and 'email_repeat' in d['contact_form_data']:
|
||||
del d['contact_form_data']['email_repeat']
|
||||
changed = True
|
||||
if changed:
|
||||
if d:
|
||||
o.meta_info = json.dumps(d)
|
||||
o.save(update_fields=['meta_info', 'email', 'customer'])
|
||||
|
||||
for le in self.event.logentry_set.filter(
|
||||
Q(action_type__contains="order.email") | Q(action_type__contains="position.email"),
|
||||
):
|
||||
shred_log_fields(le, banlist=['recipient', 'message', 'subject', 'full_mail'])
|
||||
|
||||
for le in self.event.logentry_set.filter(action_type="pretix.event.order.contact.changed"):
|
||||
shred_log_fields(le, banlist=['old_email', 'new_email'])
|
||||
|
||||
for le in self.event.logentry_set.filter(action_type="pretix.event.order.modified").exclude(data=""):
|
||||
d = le.parsed_data
|
||||
if 'data' in d:
|
||||
for row in d['data']:
|
||||
if 'attendee_email' in row:
|
||||
row['attendee_email'] = '█'
|
||||
le.data = json.dumps(d)
|
||||
le.shredded = True
|
||||
le.save(update_fields=['data', 'shredded'])
|
||||
for le in _progress_helper(qs_le, progress_callback, qs_op_cnt + qs_orders_cnt, total):
|
||||
if le.action_type == "pretix.event.order.modified":
|
||||
d = le.parsed_data
|
||||
if 'data' in d:
|
||||
for row in d['data']:
|
||||
if 'attendee_email' in row:
|
||||
row['attendee_email'] = '█'
|
||||
le.data = json.dumps(d)
|
||||
le.shredded = True
|
||||
le.save(update_fields=['data', 'shredded'])
|
||||
else:
|
||||
shred_log_fields(le, banlist=[
|
||||
'recipient', 'message', 'subject', 'full_mail', 'old_email', 'new_email'
|
||||
])
|
||||
|
||||
|
||||
class WaitingListShredder(BaseDataShredder):
|
||||
@@ -251,16 +344,35 @@ class WaitingListShredder(BaseDataShredder):
|
||||
for wle in self.event.waitinglistentries.all()
|
||||
], indent=4)
|
||||
|
||||
@transaction.atomic
|
||||
def shred_data(self):
|
||||
self.event.waitinglistentries.update(name_cached=None, name_parts={'_shredded': True}, email='█', phone='█')
|
||||
def shred_data(self, progress_callback=None):
|
||||
qs_wle = self.event.waitinglistentries.exclude(email='█')
|
||||
qs_wle_cnt = qs_wle.count()
|
||||
|
||||
for wle in self.event.waitinglistentries.select_related('voucher').filter(voucher__isnull=False):
|
||||
qs_voucher = self.event.waitinglistentries.select_related('voucher').filter(voucher__isnull=False)
|
||||
qs_voucher_cnt = qs_voucher.count()
|
||||
|
||||
qs_le = self.event.logentry_set.filter(action_type="pretix.voucher.added.waitinglist").exclude(data="")
|
||||
qs_le_cnt = qs_le.count()
|
||||
|
||||
total = qs_voucher_cnt + qs_wle_cnt + qs_le_cnt
|
||||
|
||||
slow_update(
|
||||
qs_wle,
|
||||
name_cached=None,
|
||||
name_parts={'_shredded': True},
|
||||
email='█',
|
||||
phone='█',
|
||||
progress_callback=progress_callback,
|
||||
progress_offset=0,
|
||||
progress_total=total,
|
||||
)
|
||||
|
||||
for wle in _progress_helper(qs_voucher, progress_callback, qs_wle_cnt, total):
|
||||
if '@' in wle.voucher.comment:
|
||||
wle.voucher.comment = '█'
|
||||
wle.voucher.save(update_fields=['comment'])
|
||||
|
||||
for le in self.event.logentry_set.filter(action_type="pretix.voucher.added.waitinglist").exclude(data=""):
|
||||
for le in _progress_helper(qs_le, progress_callback, qs_wle_cnt + qs_voucher_cnt, total):
|
||||
d = le.parsed_data
|
||||
if 'name' in d:
|
||||
d['name'] = '█'
|
||||
@@ -298,17 +410,41 @@ class AttendeeInfoShredder(BaseDataShredder):
|
||||
)
|
||||
}, indent=4)
|
||||
|
||||
@transaction.atomic
|
||||
def shred_data(self):
|
||||
OrderPosition.all.filter(
|
||||
def shred_data(self, progress_callback=None):
|
||||
qs_op = OrderPosition.all.filter(
|
||||
order__event=self.event
|
||||
).filter(
|
||||
Q(attendee_name_cached__isnull=False) | Q(attendee_name_parts__isnull=False) |
|
||||
Q(company__isnull=False) | Q(street__isnull=False) | Q(zipcode__isnull=False) | Q(city__isnull=False)
|
||||
).update(attendee_name_cached=None, attendee_name_parts={'_shredded': True}, company=None, street=None,
|
||||
zipcode=None, city=None)
|
||||
Q(attendee_name_cached__isnull=False) |
|
||||
Q(company__isnull=False) |
|
||||
Q(street__isnull=False) |
|
||||
Q(zipcode__isnull=False) |
|
||||
Q(city__isnull=False)
|
||||
)
|
||||
qs_op_cnt = qs_op.count()
|
||||
|
||||
for le in self.event.logentry_set.filter(action_type="pretix.event.order.modified").exclude(data=""):
|
||||
qs_le = self.event.logentry_set.filter(action_type="pretix.event.order.modified").exclude(data="")
|
||||
qs_le_cnt = qs_le.count()
|
||||
|
||||
total = qs_op_cnt + qs_le_cnt
|
||||
|
||||
slow_update(
|
||||
qs_op,
|
||||
attendee_name_cached=None,
|
||||
attendee_name_parts={'_shredded': True},
|
||||
company=None,
|
||||
street=None,
|
||||
zipcode=None,
|
||||
city=None,
|
||||
progress_callback=progress_callback,
|
||||
progress_total=total,
|
||||
progress_offset=0,
|
||||
# Updates to order position table are slow, since PostgreSQL needs to update many indexes, so let's
|
||||
# take them really slowly to not overwhelm the database.
|
||||
batch_size=100,
|
||||
sleep_time=2,
|
||||
)
|
||||
|
||||
for le in _progress_helper(qs_le, progress_callback, qs_op_cnt, total):
|
||||
d = le.parsed_data
|
||||
if 'data' in d:
|
||||
for i, row in enumerate(d['data']):
|
||||
@@ -343,11 +479,18 @@ class InvoiceAddressShredder(BaseDataShredder):
|
||||
for ia in InvoiceAddress.objects.filter(order__event=self.event)
|
||||
}, indent=4)
|
||||
|
||||
@transaction.atomic
|
||||
def shred_data(self):
|
||||
InvoiceAddress.objects.filter(order__event=self.event).delete()
|
||||
def shred_data(self, progress_callback=None):
|
||||
qs_ia = InvoiceAddress.objects.filter(order__event=self.event)
|
||||
qs_ia_cnt = qs_ia.count()
|
||||
|
||||
for le in self.event.logentry_set.filter(action_type="pretix.event.order.modified").exclude(data=""):
|
||||
qs_le = self.event.logentry_set.filter(action_type="pretix.event.order.modified").exclude(data="")
|
||||
qs_le_cnt = qs_le.count()
|
||||
|
||||
total = qs_ia_cnt + qs_le_cnt
|
||||
|
||||
slow_delete(qs_ia, progress_callback=progress_callback, progress_total=total, progress_offset=0)
|
||||
|
||||
for le in _progress_helper(qs_le, progress_callback, qs_ia_cnt, total):
|
||||
d = le.parsed_data
|
||||
if 'invoice_data' in d and not isinstance(d['invoice_data'], bool):
|
||||
for field in d['invoice_data']:
|
||||
@@ -375,11 +518,18 @@ class QuestionAnswerShredder(BaseDataShredder):
|
||||
).data
|
||||
yield 'question-answers.json', 'application/json', json.dumps(d, indent=4)
|
||||
|
||||
@transaction.atomic
|
||||
def shred_data(self):
|
||||
QuestionAnswer.objects.filter(orderposition__order__event=self.event).delete()
|
||||
def shred_data(self, progress_callback=None):
|
||||
qs_qa = QuestionAnswer.objects.filter(orderposition__order__event=self.event)
|
||||
qs_qa_cnt = qs_qa.count()
|
||||
|
||||
for le in self.event.logentry_set.filter(action_type="pretix.event.order.modified").exclude(data=""):
|
||||
qs_le = self.event.logentry_set.filter(action_type="pretix.event.order.modified").exclude(data="")
|
||||
qs_le_cnt = qs_le.count()
|
||||
|
||||
total = qs_qa_cnt + qs_le_cnt
|
||||
|
||||
slow_delete(qs_qa, progress_callback=progress_callback, progress_total=total, progress_offset=0)
|
||||
|
||||
for le in _progress_helper(qs_le, progress_callback, qs_qa_cnt, total):
|
||||
d = le.parsed_data
|
||||
if 'data' in d:
|
||||
for i, row in enumerate(d['data']):
|
||||
@@ -408,9 +558,11 @@ class InvoiceShredder(BaseDataShredder):
|
||||
yield 'invoices/{}.pdf'.format(i.number), 'application/pdf', i.file.read()
|
||||
i.file.close()
|
||||
|
||||
@transaction.atomic
|
||||
def shred_data(self):
|
||||
for i in self.event.invoices.filter(shredded=False):
|
||||
def shred_data(self, progress_callback=None):
|
||||
qs_i = self.event.invoices.filter(shredded=False)
|
||||
total = qs_i.count()
|
||||
|
||||
for i in _progress_helper(qs_i, progress_callback, 0, total):
|
||||
if i.file:
|
||||
i.file.delete()
|
||||
i.shredded = True
|
||||
@@ -430,10 +582,17 @@ class CachedTicketShredder(BaseDataShredder):
|
||||
def generate_files(self) -> List[Tuple[str, str, str]]:
|
||||
pass
|
||||
|
||||
@transaction.atomic
|
||||
def shred_data(self):
|
||||
CachedTicket.objects.filter(order_position__order__event=self.event).delete()
|
||||
CachedCombinedTicket.objects.filter(order__event=self.event).delete()
|
||||
def shred_data(self, progress_callback=None):
|
||||
qs_1 = CachedTicket.objects.filter(order_position__order__event=self.event)
|
||||
qs_1_cnt = qs_1.count()
|
||||
|
||||
qs_2 = CachedCombinedTicket.objects.filter(order__event=self.event)
|
||||
qs_2_cnt = qs_2.count()
|
||||
|
||||
total = qs_1_cnt + qs_2_cnt
|
||||
|
||||
slow_delete(qs_1, progress_callback=progress_callback, progress_total=total, progress_offset=0)
|
||||
slow_delete(qs_2, progress_callback=progress_callback, progress_total=total, progress_offset=qs_1_cnt)
|
||||
|
||||
|
||||
class PaymentInfoShredder(BaseDataShredder):
|
||||
@@ -446,14 +605,21 @@ class PaymentInfoShredder(BaseDataShredder):
|
||||
def generate_files(self) -> List[Tuple[str, str, str]]:
|
||||
pass
|
||||
|
||||
@transaction.atomic
|
||||
def shred_data(self):
|
||||
def shred_data(self, progress_callback=None):
|
||||
qs_p = OrderPayment.objects.filter(order__event=self.event)
|
||||
qs_p_count = qs_p.count()
|
||||
qs_r = OrderRefund.objects.filter(order__event=self.event)
|
||||
qs_r_count = qs_r.count()
|
||||
|
||||
total = qs_p_count + qs_r_count
|
||||
|
||||
provs = self.event.get_payment_providers()
|
||||
for obj in OrderPayment.objects.filter(order__event=self.event):
|
||||
for obj in _progress_helper(qs_p, progress_callback, 0, total):
|
||||
pprov = provs.get(obj.provider)
|
||||
if pprov:
|
||||
pprov.shred_payment_info(obj)
|
||||
for obj in OrderRefund.objects.filter(order__event=self.event):
|
||||
|
||||
for obj in _progress_helper(qs_r, progress_callback, qs_p_count, total):
|
||||
pprov = provs.get(obj.provider)
|
||||
if pprov:
|
||||
pprov.shred_payment_info(obj)
|
||||
|
||||
Reference in New Issue
Block a user