From 0933fc848d713303365a8facba35cb532a6ebb47 Mon Sep 17 00:00:00 2001
From: Raphael Michel
Date: Fri, 15 Feb 2019 11:45:06 +0100
Subject: [PATCH] Order search: Fight the database optimizer to actually
 optimize the query

---
 src/pretix/control/views/search.py | 42 ++++++++++++++++++++++++++++--
 1 file changed, 40 insertions(+), 2 deletions(-)

diff --git a/src/pretix/control/views/search.py b/src/pretix/control/views/search.py
index f4e647c37..ad47c8657 100644
--- a/src/pretix/control/views/search.py
+++ b/src/pretix/control/views/search.py
@@ -46,7 +46,7 @@ class OrderSearch(PaginationMixin, ListView):
         return ctx
 
     def get_queryset(self):
-        qs = Order.objects.select_related('invoice_address').using(settings.DATABASE_REPLICA)
+        qs = Order.objects.using(settings.DATABASE_REPLICA)
 
         if not self.request.user.has_active_staff_session(self.request.session.session_key):
             qs = qs.filter(
@@ -59,9 +59,47 @@ class OrderSearch(PaginationMixin, ListView):
         if self.filter_form.is_valid():
             qs = self.filter_form.filter_qs(qs)
 
+            if self.filter_form.cleaned_data.get('query'):
+                """
+                We need to work around a shortcoming in PostgreSQL's (and likely MySQL's) query plan
+                optimizer here. The database lacks the statistics to predict how selective our search
+                filter is, so it assumes it is cheaper to first ORDER *all* orders in the system (since
+                we have an index on datetime) and then filter with a full scan until the OFFSET/LIMIT
+                condition is fulfilled. If we look for something rare, such as an email address used
+                once among hundreds of thousands of orders, this is pathologically slow.
+
+                For some search queries on pretix.eu, we see search times of more than 30s caused purely
+                by the ORDER BY and LIMIT clauses; without them, the query runs in roughly 0.6s. This
+                heuristic tries to detect these cases and rewrite the query as a nested subquery that
+                strongly suggests filtering before sorting. However, since even that fails in some cases
+                because PostgreSQL thinks it knows better, we literally force it by evaluating the
+                subquery explicitly. We only do this for n <= 200 results, to limit memory usage and to
+                avoid problems with the maximum parameter count on SQLite. When the search yields lots
+                of results, this is actually slower since it requires two queries, sorry.
+
+                Phew.
+                """
+
+                page = int(self.kwargs.get(self.page_kwarg) or self.request.GET.get(self.page_kwarg) or 1)
+                limit = self.get_paginate_by(None)
+                offset = (page - 1) * limit
+                resultids = list(qs.order_by().values_list('id', flat=True)[:201])
+                if len(resultids) <= 200 and len(resultids) <= offset + limit:
+                    qs = Order.objects.using(settings.DATABASE_REPLICA).filter(
+                        id__in=resultids
+                    )
+
+        """
+        We use prefetch_related here instead of select_related for a reason, even though
+        select_related would be the common choice for a foreign key and does not require an
+        additional database query. The problem is that if our results contain the same event 25
+        times, select_related will create 25 Django objects which will all try to pull their own
+        settings cache to show the event properly, leading to lots of unnecessary queries. Because
+        prefetch_related works differently, it creates only one shared Django object per event.
+        """
         return qs.only(
             'id', 'invoice_address__name_cached', 'invoice_address__name_parts', 'code', 'event', 'email',
             'datetime', 'total', 'status', 'require_approval'
         ).prefetch_related(
             'event', 'event__organizer'
-        )
+        ).select_related('invoice_address')
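
To make the "evaluate the subquery explicitly" trick above easier to follow outside the diff,
here is a minimal sketch of it as a standalone helper. It assumes pretix's Order model (imported
from pretix.base.models); the helper name and signature are hypothetical, since the patch inlines
this logic in get_queryset() instead.

from django.conf import settings

from pretix.base.models import Order


def force_small_result_page(qs, page, limit):
    # Strip the ORDER BY so the planner cannot choose "walk the datetime index
    # in sorted order, then filter rows until LIMIT is satisfied". Fetch one
    # row more than the threshold to tell a small result set apart from
    # "201 or more matches".
    resultids = list(qs.order_by().values_list('id', flat=True)[:201])
    offset = (int(page) - 1) * limit
    if len(resultids) <= 200 and len(resultids) <= offset + limit:
        # We fetched *all* matching ids (<= 200) and the requested page window
        # covers them, so re-filter by primary key: the outer query then only
        # has to sort a handful of rows instead of the whole table.
        return Order.objects.using(settings.DATABASE_REPLICA).filter(id__in=resultids)
    # Many matches: the two-query detour would be slower, keep the original queryset.
    return qs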
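
The second comment, on prefetch_related versus select_related, can also be illustrated with a
short, hypothetical shell session (not pretix code). The assertions rely on documented Django
behaviour: select_related builds one related instance per joined row, while prefetch_related
fetches the related rows once and maps them by primary key, sharing instances.

from pretix.base.models import Order

# select_related joins the event columns into every row, so 25 orders of the
# same event yield 25 distinct Event instances, each pulling its own settings cache.
orders = list(Order.objects.select_related('event')[:25])
assert len({id(o.event) for o in orders}) == len(orders)

# prefetch_related issues one extra query for the events and assigns them by
# primary key, so all orders of the same event share a single Event instance.
orders = list(Order.objects.prefetch_related('event')[:25])
assert len({id(o.event) for o in orders}) == len({o.event_id for o in orders})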