New retry logic for webhooks (#2790)

Co-authored-by: Richard Schreiber <wiffbi@gmail.com>
This commit is contained in:
Raphael Michel
2022-09-15 09:41:39 +02:00
committed by GitHub
parent 1a401ec1e9
commit c1233ed692
7 changed files with 249 additions and 47 deletions

View File

@@ -0,0 +1,29 @@
# Generated by Django 3.2.12 on 2022-09-13 14:48
import django.db.models.deletion
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('pretixbase', '0218_checkinlist_addon_match'),
('pretixapi', '0007_alter_webhookcall_target_url'),
]
operations = [
migrations.CreateModel(
name='WebHookCallRetry',
fields=[
('id', models.BigAutoField(primary_key=True, serialize=False)),
('retry_not_before', models.DateTimeField(auto_now_add=True)),
('retry_count', models.PositiveIntegerField(default=0)),
('action_type', models.CharField(max_length=255)),
('logentry', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='webhook_retries', to='pretixbase.logentry')),
('webhook', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='retries', to='pretixapi.webhook')),
],
options={
'unique_together': {('webhook', 'logentry')},
},
),
]

View File

@@ -133,6 +133,18 @@ class WebHookCall(models.Model):
ordering = ("-datetime",)
class WebHookCallRetry(models.Model):
id = models.BigAutoField(primary_key=True)
webhook = models.ForeignKey('WebHook', on_delete=models.CASCADE, related_name='retries')
logentry = models.ForeignKey('pretixbase.LogEntry', on_delete=models.CASCADE, related_name='webhook_retries')
retry_not_before = models.DateTimeField(auto_now_add=True)
retry_count = models.PositiveIntegerField(default=0)
action_type = models.CharField(max_length=255)
class Meta:
unique_together = (('webhook', 'logentry'),)
class ApiCall(models.Model):
idempotency_key = models.CharField(max_length=190, db_index=True)
auth_hash = models.CharField(max_length=190, db_index=True)

View File

@@ -23,19 +23,24 @@ import json
import logging
import time
from collections import OrderedDict
from datetime import timedelta
import requests
from celery.exceptions import MaxRetriesExceededError
from django.db import DatabaseError, connection, transaction
from django.db.models import Exists, OuterRef, Q
from django.dispatch import receiver
from django.utils.timezone import now
from django.utils.translation import gettext_lazy as _, pgettext_lazy
from django_scopes import scope, scopes_disabled
from requests import RequestException
from pretix.api.models import WebHook, WebHookCall, WebHookEventListener
from pretix.api.models import (
WebHook, WebHookCall, WebHookCallRetry, WebHookEventListener,
)
from pretix.api.signals import register_webhook_events
from pretix.base.models import LogEntry
from pretix.base.services.tasks import ProfiledTask, TransactionAwareTask
from pretix.base.signals import periodic_task
from pretix.celery_app import app
logger = logging.getLogger(__name__)
@@ -356,59 +361,162 @@ def notify_webhooks(logentry_ids: list):
send_webhook.apply_async(args=(logentry.id, notification_type.action_type, wh.pk))
@app.task(base=ProfiledTask, bind=True, max_retries=9, acks_late=True)
def send_webhook(self, logentry_id: int, action_type: str, webhook_id: int):
# 9 retries with 2**(2*x) timing is roughly 72 hours
@app.task(base=ProfiledTask, bind=True, max_retries=5, default_retry_delay=60, acks_late=True, autoretry_for=(DatabaseError,),)
def send_webhook(self, logentry_id: int, action_type: str, webhook_id: int, retry_count: int = 0):
"""
Sends out a specific webhook using adequate retry and error handling logic.
Our retry logic is a little complex since we have different constraints here:
1. We historically documented that we retry for up to three days, so we want to keep that
promise. We want to use (approximately) exponentially increasing times to keep load
manageable.
2. We want to use Celery's ``acks_late=True`` options which prevents lost tasks if a worker
crashes.
3. A limitation of Celery's redis broker implementation is that it can not properly handle
tasks that *run or wait* longer than `visibility_timeout`, which defaults to 1h, when
``acks_late`` is enabled. So any task with a *retry interval* of >1h will be restarted
many times because celery believes the worker has crashed.
4. We do like that the first few retries happen within a few seconds to work around very
intermittent connectivity issues quickly. For the longer retries with multiple hours,
we don't care if they are emitted a few minutes too late.
We therefore have a two-phase retry process:
- For all retry intervals below 5 minutes, which is the first 3 retries currently, we
schedule a new celery task directly with an increased retry_count. We do *not* use
celery's retry() call currently to make the retry process in both phases more similar,
there should not be much of a difference though (except that the initial task will be in
SUCCESS state, but we don't check that status anywhere).
- For all retry intervals of at least 5 minutes, we create a database entry. Then, the
periodic task ``schedule_webhook_retries_on_celery`` will schedule celery tasks for them
once their time has come.
"""
retry_intervals = (
5, # + 5 seconds
30, # + 30 seconds
60, # + 1 minute
300, # + 5 minutes
1200, # + 20 minutes
3600, # + 60 minutes
1440, # + 4 hours
21600, # + 6 hours
43200, # + 12 hours
43200, # + 24 hours
86400, # + 24 hours
) # added up, these are approximately 3 days, as documented
retry_celery_cutoff = 300
with scopes_disabled():
webhook = WebHook.objects.get(id=webhook_id)
with scope(organizer=webhook.organizer):
with scope(organizer=webhook.organizer), transaction.atomic():
logentry = LogEntry.all.get(id=logentry_id)
types = get_all_webhook_events()
event_type = types.get(action_type)
if not event_type or not webhook.enabled:
return # Ignore, e.g. plugin not installed
return 'obsolete-webhook' # Ignore, e.g. plugin not installed
payload = event_type.build_payload(logentry)
if payload is None:
# Content object deleted?
return
return 'obsolete-payload'
t = time.time()
try:
try:
resp = requests.post(
webhook.target_url,
json=payload,
allow_redirects=False
resp = requests.post(
webhook.target_url,
json=payload,
allow_redirects=False,
timeout=30,
)
WebHookCall.objects.create(
webhook=webhook,
action_type=logentry.action_type,
target_url=webhook.target_url,
is_retry=self.request.retries > 0,
execution_time=time.time() - t,
return_code=resp.status_code,
payload=json.dumps(payload),
response_body=resp.text[:1024 * 1024],
success=200 <= resp.status_code <= 299
)
if resp.status_code == 410:
webhook.enabled = False
webhook.save()
return 'gone'
elif resp.status_code > 299:
if retry_count >= len(retry_intervals):
return 'retry-given-up'
elif retry_intervals[retry_count] < retry_celery_cutoff:
send_webhook.apply_async(args=(logentry_id, action_type, webhook_id, retry_count + 1))
return 'retry-via-celery'
else:
webhook.retries.update_or_create(
logentry=logentry,
defaults=dict(
retry_not_before=now() + timedelta(seconds=retry_intervals[retry_count]),
retry_count=retry_count + 1,
action_type=action_type,
),
)
return 'retry-via-db'
return 'ok'
except RequestException as e:
WebHookCall.objects.create(
webhook=webhook,
action_type=logentry.action_type,
target_url=webhook.target_url,
is_retry=self.request.retries > 0,
execution_time=time.time() - t,
return_code=0,
payload=json.dumps(payload),
response_body=str(e)[:1024 * 1024]
)
if retry_count >= len(retry_intervals):
return 'retry-given-up'
elif retry_intervals[retry_count] < retry_celery_cutoff:
send_webhook.apply_async(args=(logentry_id, action_type, webhook_id, retry_count + 1))
return 'retry-via-celery'
else:
webhook.retries.update_or_create(
logentry=logentry,
defaults=dict(
retry_not_before=now() + timedelta(seconds=retry_intervals[retry_count]),
retry_count=retry_count + 1,
action_type=action_type,
),
)
WebHookCall.objects.create(
webhook=webhook,
action_type=logentry.action_type,
target_url=webhook.target_url,
is_retry=self.request.retries > 0,
execution_time=time.time() - t,
return_code=resp.status_code,
payload=json.dumps(payload),
response_body=resp.text[:1024 * 1024],
success=200 <= resp.status_code <= 299
)
if resp.status_code == 410:
webhook.enabled = False
webhook.save()
elif resp.status_code > 299:
raise self.retry(countdown=2 ** (self.request.retries * 2)) # max is 2 ** (8*2) = 65536 seconds = ~18 hours
except RequestException as e:
WebHookCall.objects.create(
webhook=webhook,
action_type=logentry.action_type,
target_url=webhook.target_url,
is_retry=self.request.retries > 0,
execution_time=time.time() - t,
return_code=0,
payload=json.dumps(payload),
response_body=str(e)[:1024 * 1024]
)
raise self.retry(countdown=2 ** (self.request.retries * 2)) # max is 2 ** (8*2) = 65536 seconds = ~18 hours
except MaxRetriesExceededError:
pass
return 'retry-via-db'
@app.task(base=TransactionAwareTask)
def manually_retry_all_calls(webhook_id: int):
with scopes_disabled():
webhook = WebHook.objects.get(id=webhook_id)
with scope(organizer=webhook.organizer), transaction.atomic():
for whcr in webhook.retries.select_for_update(
skip_locked=connection.features.has_select_for_update_skip_locked
):
send_webhook.apply_async(
args=(whcr.logentry_id, whcr.action_type, whcr.webhook_id, whcr.retry_count),
)
whcr.delete()
@receiver(signal=periodic_task, dispatch_uid='pretixapi_schedule_webhook_retries_on_celery')
@scopes_disabled()
def schedule_webhook_retries_on_celery(sender, **kwargs):
with transaction.atomic():
for whcr in WebHookCallRetry.objects.select_for_update(
skip_locked=connection.features.has_select_for_update_skip_locked
).filter(retry_not_before__lt=now()):
send_webhook.apply_async(
args=(whcr.logentry_id, whcr.action_type, whcr.webhook_id, whcr.retry_count),
)
whcr.delete()

View File

@@ -319,6 +319,8 @@ def pretixcontrol_logentry_display(sender: Event, logentry: LogEntry, **kwargs):
'pretix.giftcards.acceptance.removed': _('Gift card acceptance for another organizer has been removed.'),
'pretix.webhook.created': _('The webhook has been created.'),
'pretix.webhook.changed': _('The webhook has been changed.'),
'pretix.webhook.retries.expedited': _('The webhook call retry jobs have been manually expedited.'),
'pretix.webhook.retries.dropped': _('The webhook call retry jobs have been dropped.'),
'pretix.membershiptype.created': _('The membership type has been created.'),
'pretix.membershiptype.changed': _('The membership type has been changed.'),
'pretix.membershiptype.deleted': _('The membership type has been deleted.'),

View File

@@ -6,13 +6,42 @@
<p>
{% trans "This page shows all calls to your webhook in the past 30 days." %}
</p>
{% if retry_count %}
<form action="" method="post">
{% csrf_token %}
<div class="alert alert-info">
<p>
{% blocktranslate trimmed count count=retry_count %}
One webhook is scheduled to be retried.
{% plural %}
{{ count }} webhooks are scheduled to be retried.
{% endblocktranslate %}
</p>
<p>
<button type="submit" name="action" value="expedite" class="btn btn-success">
{% trans "Retry now" %}
</button>
<button type="submit" name="action" value="drop" class="btn btn-danger">
{% trans "Stop retrying" %}
</button>
</p>
<p class="help-block">
{% blocktranslate trimmed with minutes=5 %}
Webhooks scheduled to be retried in less than {{ minutes }} minutes may not be listed here and can
no longer be stopped or expedited.
{% endblocktranslate %}
</p>
</div>
</form>
{% endif %}
{% for c in calls %}
<details class="panel panel-default">
<summary class="panel-heading">
<div class="row">
<div class="col-md-4 col-sm-12 col-xs-12">
{% if c.is_retry %}
<span class="fa fa-repeat fa-fw" data-toggle="tooltip" title="{% trans "This webhook was retried since it previously failed." %}"></span>
<span class="fa fa-repeat fa-fw" data-toggle="tooltip"
title="{% trans "This webhook was retried since it previously failed." %}"></span>
{% else %}
<span class="fa fa-clock-o fa-fw"></span>
{% endif %}

View File

@@ -62,6 +62,7 @@ from django.views.generic import (
)
from pretix.api.models import WebHook
from pretix.api.webhooks import manually_retry_all_calls
from pretix.base.auth import get_auth_backends
from pretix.base.channels import get_all_sales_channels
from pretix.base.i18n import language
@@ -1252,6 +1253,7 @@ class WebHookLogsView(OrganizerDetailViewMixin, OrganizerPermissionRequiredMixin
def get_context_data(self, **kwargs):
ctx = super().get_context_data(**kwargs)
ctx['webhook'] = self.webhook
ctx['retry_count'] = self.webhook.retries.count()
return ctx
@cached_property
@@ -1263,6 +1265,25 @@ class WebHookLogsView(OrganizerDetailViewMixin, OrganizerPermissionRequiredMixin
def get_queryset(self):
return self.webhook.calls.order_by('-datetime')
def post(self, request, *args, **kwargs):
if request.POST.get("action") == "expedite":
self.request.organizer.log_action('pretix.webhook.retries.expedited', user=self.request.user, data={
'webhook': self.webhook.pk,
})
manually_retry_all_calls.apply_async(args=(self.webhook.pk,))
messages.success(request, _('All requests will now be scheduled for an immediate attempt. Please '
'allow for a few minutes before they are processed.'))
elif request.POST.get("action") == "drop":
self.request.organizer.log_action('pretix.webhook.retries.dropped', user=self.request.user, data={
'webhook': self.webhook.pk,
})
self.webhook.retries.all().delete()
messages.success(request, _('All unprocessed webhooks have been stopped from retrying.'))
return redirect(reverse('control:organizer.webhook.logs', kwargs={
'organizer': self.request.organizer.slug,
'webhook': self.webhook.pk,
}))
class GiftCardListView(OrganizerDetailViewMixin, OrganizerPermissionRequiredMixin, ListView):
model = GiftCard