Order search: Fight the database optimizer to actually optimize the query

Raphael Michel
2019-02-15 11:45:06 +01:00
parent 166f8b8a2a
commit 0933fc848d


@@ -46,7 +46,7 @@ class OrderSearch(PaginationMixin, ListView):
         return ctx
 
     def get_queryset(self):
-        qs = Order.objects.select_related('invoice_address').using(settings.DATABASE_REPLICA)
+        qs = Order.objects.using(settings.DATABASE_REPLICA)
         if not self.request.user.has_active_staff_session(self.request.session.session_key):
             qs = qs.filter(
@@ -59,9 +59,47 @@ class OrderSearch(PaginationMixin, ListView):
         if self.filter_form.is_valid():
             qs = self.filter_form.filter_qs(qs)
 
+        if self.filter_form.cleaned_data.get('query'):
"""
We need to work around a bug in PostgreSQL's (and likely MySQL's) query plan optimizer here.
The database lacks statistical data to predict how common our search filter is and therefore
assumes that it is cheaper to first ORDER *all* orders in the system (since we got an index on
datetime), then filter out with a full scan until OFFSET/LIMIT condition is fulfilled. If we
look for something rare (such as an email address used once within hundreds of thousands of
orders, this ends up to be pathologically slow.
For some search queries on pretix.eu, we see search times of >30s, just due to the ORDER BY and
LIMIT clause. Without them. the query runs in roughly 0.6s. This heuristical approach tries to
detect these cases and rewrite the query as a nested subquery that strongly suggests sorting
before filtering. However, since even that fails in some cases because PostgreSQL thinks it knows
better, we literally force it by evaluating the subquery explicitly. We only do this for n<=200,
to avoid memory leaks and problems with maximum parameter count on SQLite. In cases where the
search query yields lots of results, this will actually be slower since it requires two queries,
sorry.
Phew.
"""
+            page = int(self.kwargs.get(self.page_kwarg) or self.request.GET.get(self.page_kwarg) or 1)
+            limit = self.get_paginate_by(None)
+            offset = (page - 1) * limit
+            resultids = list(qs.order_by().values_list('id', flat=True)[:201])
+            if len(resultids) <= 200 and len(resultids) <= offset + limit:
+                qs = Order.objects.using(settings.DATABASE_REPLICA).filter(
+                    id__in=resultids
+                )
"""
We use prefetch_related here instead of select_related for a reason, even though select_related
would be the common choice for a foreign key and doesn't require an additional database query.
The problem is, that if our results contain the same event 25 times, select_related will create
25 Django objects which will all try to pull their ownsettings cache to show the event properly,
leading to lots of unnecessary queries. Due to the way prefetch_related works differently, it
will only create one shared Django object.
"""
         return qs.only(
             'id', 'invoice_address__name_cached', 'invoice_address__name_parts', 'code', 'event', 'email',
             'datetime', 'total', 'status', 'require_approval'
         ).prefetch_related(
             'event', 'event__organizer'
-        )
+        ).select_related('invoice_address')
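
A minimal, self-contained sketch of the pattern this commit applies, for anyone who wants to try it outside of this view. Order and the indexed datetime column come from the diff above; the import path, the function name, the search_filter parameter and the explicit order_by('-datetime') are illustrative assumptions, not part of the commit.

    from django.db.models import Q, QuerySet
    from pretix.base.models import Order  # assumed import path

    CAP = 200  # same cutoff as above: keeps the id list within SQLite's parameter limit

    def search_page(search_filter: Q, offset: int, limit: int) -> QuerySet:
        qs = Order.objects.filter(search_filter)
        # Fetch up to CAP+1 matching ids *without* any ORDER BY, so the planner
        # has no reason to walk the datetime index before filtering.
        ids = list(qs.order_by().values_list('id', flat=True)[:CAP + 1])
        if len(ids) <= CAP:
            # The filter is selective and we hold the complete result set: the
            # database only has to sort this small id list in a second, cheap query.
            qs = Order.objects.filter(id__in=ids)
        return qs.order_by('-datetime')[offset:offset + limit]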
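
The "nested subquery that strongly suggests filtering before sorting" mentioned in the comment would look roughly like the sketch below (inner_ids is an illustrative name). It stays a single SQL query, but it is only a hint: PostgreSQL may flatten the subquery back into the outer plan, which is exactly why the commit materializes the id list with list(...) and accepts a second query instead.

    # Hint variant: keep the unordered id selection as an uncorrelated subquery.
    inner_ids = qs.order_by().values('id')
    qs = Order.objects.filter(id__in=inner_ids)
    # One round-trip, but the planner is free to ignore the hint. Evaluating
    # the ids eagerly, as the commit does, removes that freedom.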
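
The prefetch_related remark can be checked in a Django shell: select_related joins in SQL and instantiates a separate Event object per order row, while prefetch_related issues one extra query for the distinct events and attaches the same Python object to every order that references it, so per-instance caches (such as the settings cache mentioned above) are shared. A sketch, assuming the first two orders belong to the same event:

    orders = list(Order.objects.select_related('event')[:25])
    print(orders[0].event is orders[1].event)   # False: one Event instance per row

    orders = list(Order.objects.prefetch_related('event')[:25])
    print(orders[0].event is orders[1].event)   # True: one shared Event instance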