Order search: Fight the database optimizer to actually optimize the query

Raphael Michel
2019-02-15 11:45:06 +01:00
parent 166f8b8a2a
commit 0933fc848d


@@ -46,7 +46,7 @@ class OrderSearch(PaginationMixin, ListView):
         return ctx
 
     def get_queryset(self):
-        qs = Order.objects.select_related('invoice_address').using(settings.DATABASE_REPLICA)
+        qs = Order.objects.using(settings.DATABASE_REPLICA)
         if not self.request.user.has_active_staff_session(self.request.session.session_key):
             qs = qs.filter(
@@ -59,9 +59,47 @@ class OrderSearch(PaginationMixin, ListView):
         if self.filter_form.is_valid():
             qs = self.filter_form.filter_qs(qs)
 
+        if self.filter_form.cleaned_data.get('query'):
"""
We need to work around a bug in PostgreSQL's (and likely MySQL's) query plan optimizer here.
The database lacks statistical data to predict how common our search filter is and therefore
assumes that it is cheaper to first ORDER *all* orders in the system (since we got an index on
datetime), then filter out with a full scan until OFFSET/LIMIT condition is fulfilled. If we
look for something rare (such as an email address used once within hundreds of thousands of
orders, this ends up to be pathologically slow.
For some search queries on pretix.eu, we see search times of >30s, just due to the ORDER BY and
LIMIT clause. Without them. the query runs in roughly 0.6s. This heuristical approach tries to
detect these cases and rewrite the query as a nested subquery that strongly suggests sorting
before filtering. However, since even that fails in some cases because PostgreSQL thinks it knows
better, we literally force it by evaluating the subquery explicitly. We only do this for n<=200,
to avoid memory leaks and problems with maximum parameter count on SQLite. In cases where the
search query yields lots of results, this will actually be slower since it requires two queries,
sorry.
Phew.
"""
+            page = int(self.kwargs.get(self.page_kwarg) or self.request.GET.get(self.page_kwarg) or 1)
+            limit = self.get_paginate_by(None)
+            offset = (page - 1) * limit
+            resultids = list(qs.order_by().values_list('id', flat=True)[:201])
+            if len(resultids) <= 200 and len(resultids) <= offset + limit:
+                qs = Order.objects.using(settings.DATABASE_REPLICA).filter(
+                    id__in=resultids
+                )
"""
We use prefetch_related here instead of select_related for a reason, even though select_related
would be the common choice for a foreign key and doesn't require an additional database query.
The problem is, that if our results contain the same event 25 times, select_related will create
25 Django objects which will all try to pull their ownsettings cache to show the event properly,
leading to lots of unnecessary queries. Due to the way prefetch_related works differently, it
will only create one shared Django object.
"""
         return qs.only(
             'id', 'invoice_address__name_cached', 'invoice_address__name_parts', 'code', 'event', 'email',
             'datetime', 'total', 'status', 'require_approval'
         ).prefetch_related(
             'event', 'event__organizer'
-        )
+        ).select_related('invoice_address')
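
A minimal, self-contained sketch of the pattern this commit applies, for anyone who wants to try it outside of this view. Order and the indexed datetime column come from the diff above; the import path, the function name, the search_filter parameter and the explicit order_by('-datetime') are illustrative assumptions, not part of the commit.

    from django.db.models import Q, QuerySet
    from pretix.base.models import Order  # assumed import path

    CAP = 200  # same cutoff as above: keeps the id list within SQLite's parameter limit

    def search_page(search_filter: Q, offset: int, limit: int) -> QuerySet:
        qs = Order.objects.filter(search_filter)
        # Fetch up to CAP+1 matching ids *without* any ORDER BY, so the planner
        # has no reason to walk the datetime index before filtering.
        ids = list(qs.order_by().values_list('id', flat=True)[:CAP + 1])
        if len(ids) <= CAP:
            # The filter is selective and we hold the complete result set: the
            # database only has to sort this small id list in a second, cheap query.
            qs = Order.objects.filter(id__in=ids)
        return qs.order_by('-datetime')[offset:offset + limit]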
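
The "nested subquery that strongly suggests filtering before sorting" mentioned in the comment would look roughly like the sketch below (inner_ids is an illustrative name). It stays a single SQL query, but it is only a hint: PostgreSQL may flatten the subquery back into the outer plan, which is exactly why the commit materializes the id list with list(...) and accepts a second query instead.

    # Hint variant: keep the unordered id selection as an uncorrelated subquery.
    inner_ids = qs.order_by().values('id')
    qs = Order.objects.filter(id__in=inner_ids)
    # One round-trip, but the planner is free to ignore the hint. Evaluating
    # the ids eagerly, as the commit does, removes that freedom.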
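
The prefetch_related remark can be checked in a Django shell: select_related joins in SQL and instantiates a separate Event object per order row, while prefetch_related issues one extra query for the distinct events and attaches the same Python object to every order that references it, so per-instance caches (such as the settings cache mentioned above) are shared. A sketch, assuming the first two orders belong to the same event:

    orders = list(Order.objects.select_related('event')[:25])
    print(orders[0].event is orders[1].event)   # False: one Event instance per row

    orders = list(Order.objects.prefetch_related('event')[:25])
    print(orders[0].event is orders[1].event)   # True: one shared Event instance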