forked from CGM_Public/pretix_original
New retry logic for webhooks (#2790)
Co-authored-by: Richard Schreiber <wiffbi@gmail.com>
This commit is contained in:
@@ -92,9 +92,10 @@ If any other status code is returned, we will assume you did not receive the cal
|
|||||||
or ``304 Not Modified`` response will be treated as a failure. pretix will not follow any ``301`` or ``302`` redirect
|
or ``304 Not Modified`` response will be treated as a failure. pretix will not follow any ``301`` or ``302`` redirect
|
||||||
headers and pretix will ignore all other information in your response headers or body.
|
headers and pretix will ignore all other information in your response headers or body.
|
||||||
|
|
||||||
If we do not receive a status code in the range of ``200`` and ``299``, pretix will retry to deliver for up to three
|
If we do not receive a status code in the range of ``200`` and ``299`` or do not receive any response within a 30 second
|
||||||
days with an exponential back off. Therefore, we recommend that you implement your endpoint in a way where calling it
|
time frame, pretix will retry to deliver for up to three days with an exponential back off. Therefore, we recommend that
|
||||||
multiple times for the same event due to a perceived error does not do any harm.
|
you implement your endpoint in a way where calling it multiple times for the same event due to a perceived error does
|
||||||
|
not do any harm.
|
||||||
|
|
||||||
There is only one exception: If status code ``410 Gone`` is returned, we will assume the
|
There is only one exception: If status code ``410 Gone`` is returned, we will assume the
|
||||||
endpoint does not exist any more and automatically disable the webhook.
|
endpoint does not exist any more and automatically disable the webhook.
|
||||||
|
|||||||
29
src/pretix/api/migrations/0008_webhookcallretry.py
Normal file
29
src/pretix/api/migrations/0008_webhookcallretry.py
Normal file
@@ -0,0 +1,29 @@
|
|||||||
|
# Generated by Django 3.2.12 on 2022-09-13 14:48
|
||||||
|
|
||||||
|
import django.db.models.deletion
|
||||||
|
from django.db import migrations, models
|
||||||
|
|
||||||
|
|
||||||
|
class Migration(migrations.Migration):
|
||||||
|
|
||||||
|
dependencies = [
|
||||||
|
('pretixbase', '0218_checkinlist_addon_match'),
|
||||||
|
('pretixapi', '0007_alter_webhookcall_target_url'),
|
||||||
|
]
|
||||||
|
|
||||||
|
operations = [
|
||||||
|
migrations.CreateModel(
|
||||||
|
name='WebHookCallRetry',
|
||||||
|
fields=[
|
||||||
|
('id', models.BigAutoField(primary_key=True, serialize=False)),
|
||||||
|
('retry_not_before', models.DateTimeField(auto_now_add=True)),
|
||||||
|
('retry_count', models.PositiveIntegerField(default=0)),
|
||||||
|
('action_type', models.CharField(max_length=255)),
|
||||||
|
('logentry', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='webhook_retries', to='pretixbase.logentry')),
|
||||||
|
('webhook', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='retries', to='pretixapi.webhook')),
|
||||||
|
],
|
||||||
|
options={
|
||||||
|
'unique_together': {('webhook', 'logentry')},
|
||||||
|
},
|
||||||
|
),
|
||||||
|
]
|
||||||
@@ -133,6 +133,18 @@ class WebHookCall(models.Model):
|
|||||||
ordering = ("-datetime",)
|
ordering = ("-datetime",)
|
||||||
|
|
||||||
|
|
||||||
|
class WebHookCallRetry(models.Model):
|
||||||
|
id = models.BigAutoField(primary_key=True)
|
||||||
|
webhook = models.ForeignKey('WebHook', on_delete=models.CASCADE, related_name='retries')
|
||||||
|
logentry = models.ForeignKey('pretixbase.LogEntry', on_delete=models.CASCADE, related_name='webhook_retries')
|
||||||
|
retry_not_before = models.DateTimeField(auto_now_add=True)
|
||||||
|
retry_count = models.PositiveIntegerField(default=0)
|
||||||
|
action_type = models.CharField(max_length=255)
|
||||||
|
|
||||||
|
class Meta:
|
||||||
|
unique_together = (('webhook', 'logentry'),)
|
||||||
|
|
||||||
|
|
||||||
class ApiCall(models.Model):
|
class ApiCall(models.Model):
|
||||||
idempotency_key = models.CharField(max_length=190, db_index=True)
|
idempotency_key = models.CharField(max_length=190, db_index=True)
|
||||||
auth_hash = models.CharField(max_length=190, db_index=True)
|
auth_hash = models.CharField(max_length=190, db_index=True)
|
||||||
|
|||||||
@@ -23,19 +23,24 @@ import json
|
|||||||
import logging
|
import logging
|
||||||
import time
|
import time
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
|
from datetime import timedelta
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
from celery.exceptions import MaxRetriesExceededError
|
from django.db import DatabaseError, connection, transaction
|
||||||
from django.db.models import Exists, OuterRef, Q
|
from django.db.models import Exists, OuterRef, Q
|
||||||
from django.dispatch import receiver
|
from django.dispatch import receiver
|
||||||
|
from django.utils.timezone import now
|
||||||
from django.utils.translation import gettext_lazy as _, pgettext_lazy
|
from django.utils.translation import gettext_lazy as _, pgettext_lazy
|
||||||
from django_scopes import scope, scopes_disabled
|
from django_scopes import scope, scopes_disabled
|
||||||
from requests import RequestException
|
from requests import RequestException
|
||||||
|
|
||||||
from pretix.api.models import WebHook, WebHookCall, WebHookEventListener
|
from pretix.api.models import (
|
||||||
|
WebHook, WebHookCall, WebHookCallRetry, WebHookEventListener,
|
||||||
|
)
|
||||||
from pretix.api.signals import register_webhook_events
|
from pretix.api.signals import register_webhook_events
|
||||||
from pretix.base.models import LogEntry
|
from pretix.base.models import LogEntry
|
||||||
from pretix.base.services.tasks import ProfiledTask, TransactionAwareTask
|
from pretix.base.services.tasks import ProfiledTask, TransactionAwareTask
|
||||||
|
from pretix.base.signals import periodic_task
|
||||||
from pretix.celery_app import app
|
from pretix.celery_app import app
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
@@ -356,59 +361,162 @@ def notify_webhooks(logentry_ids: list):
|
|||||||
send_webhook.apply_async(args=(logentry.id, notification_type.action_type, wh.pk))
|
send_webhook.apply_async(args=(logentry.id, notification_type.action_type, wh.pk))
|
||||||
|
|
||||||
|
|
||||||
@app.task(base=ProfiledTask, bind=True, max_retries=9, acks_late=True)
|
@app.task(base=ProfiledTask, bind=True, max_retries=5, default_retry_delay=60, acks_late=True, autoretry_for=(DatabaseError,),)
|
||||||
def send_webhook(self, logentry_id: int, action_type: str, webhook_id: int):
|
def send_webhook(self, logentry_id: int, action_type: str, webhook_id: int, retry_count: int = 0):
|
||||||
# 9 retries with 2**(2*x) timing is roughly 72 hours
|
"""
|
||||||
|
Sends out a specific webhook using adequate retry and error handling logic.
|
||||||
|
|
||||||
|
Our retry logic is a little complex since we have different constraints here:
|
||||||
|
|
||||||
|
1. We historically documented that we retry for up to three days, so we want to keep that
|
||||||
|
promise. We want to use (approximately) exponentially increasing times to keep load
|
||||||
|
manageable.
|
||||||
|
|
||||||
|
2. We want to use Celery's ``acks_late=True`` options which prevents lost tasks if a worker
|
||||||
|
crashes.
|
||||||
|
|
||||||
|
3. A limitation of Celery's redis broker implementation is that it can not properly handle
|
||||||
|
tasks that *run or wait* longer than `visibility_timeout`, which defaults to 1h, when
|
||||||
|
``acks_late`` is enabled. So any task with a *retry interval* of >1h will be restarted
|
||||||
|
many times because celery believes the worker has crashed.
|
||||||
|
|
||||||
|
4. We do like that the first few retries happen within a few seconds to work around very
|
||||||
|
intermittent connectivity issues quickly. For the longer retries with multiple hours,
|
||||||
|
we don't care if they are emitted a few minutes too late.
|
||||||
|
|
||||||
|
We therefore have a two-phase retry process:
|
||||||
|
|
||||||
|
- For all retry intervals below 5 minutes, which is the first 3 retries currently, we
|
||||||
|
schedule a new celery task directly with an increased retry_count. We do *not* use
|
||||||
|
celery's retry() call currently to make the retry process in both phases more similar,
|
||||||
|
there should not be much of a difference though (except that the initial task will be in
|
||||||
|
SUCCESS state, but we don't check that status anywhere).
|
||||||
|
|
||||||
|
- For all retry intervals of at least 5 minutes, we create a database entry. Then, the
|
||||||
|
periodic task ``schedule_webhook_retries_on_celery`` will schedule celery tasks for them
|
||||||
|
once their time has come.
|
||||||
|
"""
|
||||||
|
retry_intervals = (
|
||||||
|
5, # + 5 seconds
|
||||||
|
30, # + 30 seconds
|
||||||
|
60, # + 1 minute
|
||||||
|
300, # + 5 minutes
|
||||||
|
1200, # + 20 minutes
|
||||||
|
3600, # + 60 minutes
|
||||||
|
1440, # + 4 hours
|
||||||
|
21600, # + 6 hours
|
||||||
|
43200, # + 12 hours
|
||||||
|
43200, # + 24 hours
|
||||||
|
86400, # + 24 hours
|
||||||
|
) # added up, these are approximately 3 days, as documented
|
||||||
|
retry_celery_cutoff = 300
|
||||||
|
|
||||||
with scopes_disabled():
|
with scopes_disabled():
|
||||||
webhook = WebHook.objects.get(id=webhook_id)
|
webhook = WebHook.objects.get(id=webhook_id)
|
||||||
with scope(organizer=webhook.organizer):
|
|
||||||
|
with scope(organizer=webhook.organizer), transaction.atomic():
|
||||||
logentry = LogEntry.all.get(id=logentry_id)
|
logentry = LogEntry.all.get(id=logentry_id)
|
||||||
types = get_all_webhook_events()
|
types = get_all_webhook_events()
|
||||||
event_type = types.get(action_type)
|
event_type = types.get(action_type)
|
||||||
if not event_type or not webhook.enabled:
|
if not event_type or not webhook.enabled:
|
||||||
return # Ignore, e.g. plugin not installed
|
return 'obsolete-webhook' # Ignore, e.g. plugin not installed
|
||||||
|
|
||||||
payload = event_type.build_payload(logentry)
|
payload = event_type.build_payload(logentry)
|
||||||
if payload is None:
|
if payload is None:
|
||||||
# Content object deleted?
|
# Content object deleted?
|
||||||
return
|
return 'obsolete-payload'
|
||||||
|
|
||||||
t = time.time()
|
t = time.time()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
try:
|
resp = requests.post(
|
||||||
resp = requests.post(
|
webhook.target_url,
|
||||||
webhook.target_url,
|
json=payload,
|
||||||
json=payload,
|
allow_redirects=False,
|
||||||
allow_redirects=False
|
timeout=30,
|
||||||
|
)
|
||||||
|
WebHookCall.objects.create(
|
||||||
|
webhook=webhook,
|
||||||
|
action_type=logentry.action_type,
|
||||||
|
target_url=webhook.target_url,
|
||||||
|
is_retry=self.request.retries > 0,
|
||||||
|
execution_time=time.time() - t,
|
||||||
|
return_code=resp.status_code,
|
||||||
|
payload=json.dumps(payload),
|
||||||
|
response_body=resp.text[:1024 * 1024],
|
||||||
|
success=200 <= resp.status_code <= 299
|
||||||
|
)
|
||||||
|
if resp.status_code == 410:
|
||||||
|
webhook.enabled = False
|
||||||
|
webhook.save()
|
||||||
|
return 'gone'
|
||||||
|
elif resp.status_code > 299:
|
||||||
|
if retry_count >= len(retry_intervals):
|
||||||
|
return 'retry-given-up'
|
||||||
|
elif retry_intervals[retry_count] < retry_celery_cutoff:
|
||||||
|
send_webhook.apply_async(args=(logentry_id, action_type, webhook_id, retry_count + 1))
|
||||||
|
return 'retry-via-celery'
|
||||||
|
else:
|
||||||
|
webhook.retries.update_or_create(
|
||||||
|
logentry=logentry,
|
||||||
|
defaults=dict(
|
||||||
|
retry_not_before=now() + timedelta(seconds=retry_intervals[retry_count]),
|
||||||
|
retry_count=retry_count + 1,
|
||||||
|
action_type=action_type,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
return 'retry-via-db'
|
||||||
|
return 'ok'
|
||||||
|
except RequestException as e:
|
||||||
|
WebHookCall.objects.create(
|
||||||
|
webhook=webhook,
|
||||||
|
action_type=logentry.action_type,
|
||||||
|
target_url=webhook.target_url,
|
||||||
|
is_retry=self.request.retries > 0,
|
||||||
|
execution_time=time.time() - t,
|
||||||
|
return_code=0,
|
||||||
|
payload=json.dumps(payload),
|
||||||
|
response_body=str(e)[:1024 * 1024]
|
||||||
|
)
|
||||||
|
if retry_count >= len(retry_intervals):
|
||||||
|
return 'retry-given-up'
|
||||||
|
elif retry_intervals[retry_count] < retry_celery_cutoff:
|
||||||
|
send_webhook.apply_async(args=(logentry_id, action_type, webhook_id, retry_count + 1))
|
||||||
|
return 'retry-via-celery'
|
||||||
|
else:
|
||||||
|
webhook.retries.update_or_create(
|
||||||
|
logentry=logentry,
|
||||||
|
defaults=dict(
|
||||||
|
retry_not_before=now() + timedelta(seconds=retry_intervals[retry_count]),
|
||||||
|
retry_count=retry_count + 1,
|
||||||
|
action_type=action_type,
|
||||||
|
),
|
||||||
)
|
)
|
||||||
WebHookCall.objects.create(
|
return 'retry-via-db'
|
||||||
webhook=webhook,
|
|
||||||
action_type=logentry.action_type,
|
|
||||||
target_url=webhook.target_url,
|
@app.task(base=TransactionAwareTask)
|
||||||
is_retry=self.request.retries > 0,
|
def manually_retry_all_calls(webhook_id: int):
|
||||||
execution_time=time.time() - t,
|
with scopes_disabled():
|
||||||
return_code=resp.status_code,
|
webhook = WebHook.objects.get(id=webhook_id)
|
||||||
payload=json.dumps(payload),
|
with scope(organizer=webhook.organizer), transaction.atomic():
|
||||||
response_body=resp.text[:1024 * 1024],
|
for whcr in webhook.retries.select_for_update(
|
||||||
success=200 <= resp.status_code <= 299
|
skip_locked=connection.features.has_select_for_update_skip_locked
|
||||||
)
|
):
|
||||||
if resp.status_code == 410:
|
send_webhook.apply_async(
|
||||||
webhook.enabled = False
|
args=(whcr.logentry_id, whcr.action_type, whcr.webhook_id, whcr.retry_count),
|
||||||
webhook.save()
|
)
|
||||||
elif resp.status_code > 299:
|
whcr.delete()
|
||||||
raise self.retry(countdown=2 ** (self.request.retries * 2)) # max is 2 ** (8*2) = 65536 seconds = ~18 hours
|
|
||||||
except RequestException as e:
|
|
||||||
WebHookCall.objects.create(
|
@receiver(signal=periodic_task, dispatch_uid='pretixapi_schedule_webhook_retries_on_celery')
|
||||||
webhook=webhook,
|
@scopes_disabled()
|
||||||
action_type=logentry.action_type,
|
def schedule_webhook_retries_on_celery(sender, **kwargs):
|
||||||
target_url=webhook.target_url,
|
with transaction.atomic():
|
||||||
is_retry=self.request.retries > 0,
|
for whcr in WebHookCallRetry.objects.select_for_update(
|
||||||
execution_time=time.time() - t,
|
skip_locked=connection.features.has_select_for_update_skip_locked
|
||||||
return_code=0,
|
).filter(retry_not_before__lt=now()):
|
||||||
payload=json.dumps(payload),
|
send_webhook.apply_async(
|
||||||
response_body=str(e)[:1024 * 1024]
|
args=(whcr.logentry_id, whcr.action_type, whcr.webhook_id, whcr.retry_count),
|
||||||
)
|
)
|
||||||
raise self.retry(countdown=2 ** (self.request.retries * 2)) # max is 2 ** (8*2) = 65536 seconds = ~18 hours
|
whcr.delete()
|
||||||
except MaxRetriesExceededError:
|
|
||||||
pass
|
|
||||||
|
|||||||
@@ -319,6 +319,8 @@ def pretixcontrol_logentry_display(sender: Event, logentry: LogEntry, **kwargs):
|
|||||||
'pretix.giftcards.acceptance.removed': _('Gift card acceptance for another organizer has been removed.'),
|
'pretix.giftcards.acceptance.removed': _('Gift card acceptance for another organizer has been removed.'),
|
||||||
'pretix.webhook.created': _('The webhook has been created.'),
|
'pretix.webhook.created': _('The webhook has been created.'),
|
||||||
'pretix.webhook.changed': _('The webhook has been changed.'),
|
'pretix.webhook.changed': _('The webhook has been changed.'),
|
||||||
|
'pretix.webhook.retries.expedited': _('The webhook call retry jobs have been manually expedited.'),
|
||||||
|
'pretix.webhook.retries.dropped': _('The webhook call retry jobs have been dropped.'),
|
||||||
'pretix.membershiptype.created': _('The membership type has been created.'),
|
'pretix.membershiptype.created': _('The membership type has been created.'),
|
||||||
'pretix.membershiptype.changed': _('The membership type has been changed.'),
|
'pretix.membershiptype.changed': _('The membership type has been changed.'),
|
||||||
'pretix.membershiptype.deleted': _('The membership type has been deleted.'),
|
'pretix.membershiptype.deleted': _('The membership type has been deleted.'),
|
||||||
|
|||||||
@@ -6,13 +6,42 @@
|
|||||||
<p>
|
<p>
|
||||||
{% trans "This page shows all calls to your webhook in the past 30 days." %}
|
{% trans "This page shows all calls to your webhook in the past 30 days." %}
|
||||||
</p>
|
</p>
|
||||||
|
{% if retry_count %}
|
||||||
|
<form action="" method="post">
|
||||||
|
{% csrf_token %}
|
||||||
|
<div class="alert alert-info">
|
||||||
|
<p>
|
||||||
|
{% blocktranslate trimmed count count=retry_count %}
|
||||||
|
One webhook is scheduled to be retried.
|
||||||
|
{% plural %}
|
||||||
|
{{ count }} webhooks are scheduled to be retried.
|
||||||
|
{% endblocktranslate %}
|
||||||
|
</p>
|
||||||
|
<p>
|
||||||
|
<button type="submit" name="action" value="expedite" class="btn btn-success">
|
||||||
|
{% trans "Retry now" %}
|
||||||
|
</button>
|
||||||
|
<button type="submit" name="action" value="drop" class="btn btn-danger">
|
||||||
|
{% trans "Stop retrying" %}
|
||||||
|
</button>
|
||||||
|
</p>
|
||||||
|
<p class="help-block">
|
||||||
|
{% blocktranslate trimmed with minutes=5 %}
|
||||||
|
Webhooks scheduled to be retried in less than {{ minutes }} minutes may not be listed here and can
|
||||||
|
no longer be stopped or expedited.
|
||||||
|
{% endblocktranslate %}
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
|
</form>
|
||||||
|
{% endif %}
|
||||||
{% for c in calls %}
|
{% for c in calls %}
|
||||||
<details class="panel panel-default">
|
<details class="panel panel-default">
|
||||||
<summary class="panel-heading">
|
<summary class="panel-heading">
|
||||||
<div class="row">
|
<div class="row">
|
||||||
<div class="col-md-4 col-sm-12 col-xs-12">
|
<div class="col-md-4 col-sm-12 col-xs-12">
|
||||||
{% if c.is_retry %}
|
{% if c.is_retry %}
|
||||||
<span class="fa fa-repeat fa-fw" data-toggle="tooltip" title="{% trans "This webhook was retried since it previously failed." %}"></span>
|
<span class="fa fa-repeat fa-fw" data-toggle="tooltip"
|
||||||
|
title="{% trans "This webhook was retried since it previously failed." %}"></span>
|
||||||
{% else %}
|
{% else %}
|
||||||
<span class="fa fa-clock-o fa-fw"></span>
|
<span class="fa fa-clock-o fa-fw"></span>
|
||||||
{% endif %}
|
{% endif %}
|
||||||
|
|||||||
@@ -62,6 +62,7 @@ from django.views.generic import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
from pretix.api.models import WebHook
|
from pretix.api.models import WebHook
|
||||||
|
from pretix.api.webhooks import manually_retry_all_calls
|
||||||
from pretix.base.auth import get_auth_backends
|
from pretix.base.auth import get_auth_backends
|
||||||
from pretix.base.channels import get_all_sales_channels
|
from pretix.base.channels import get_all_sales_channels
|
||||||
from pretix.base.i18n import language
|
from pretix.base.i18n import language
|
||||||
@@ -1252,6 +1253,7 @@ class WebHookLogsView(OrganizerDetailViewMixin, OrganizerPermissionRequiredMixin
|
|||||||
def get_context_data(self, **kwargs):
|
def get_context_data(self, **kwargs):
|
||||||
ctx = super().get_context_data(**kwargs)
|
ctx = super().get_context_data(**kwargs)
|
||||||
ctx['webhook'] = self.webhook
|
ctx['webhook'] = self.webhook
|
||||||
|
ctx['retry_count'] = self.webhook.retries.count()
|
||||||
return ctx
|
return ctx
|
||||||
|
|
||||||
@cached_property
|
@cached_property
|
||||||
@@ -1263,6 +1265,25 @@ class WebHookLogsView(OrganizerDetailViewMixin, OrganizerPermissionRequiredMixin
|
|||||||
def get_queryset(self):
|
def get_queryset(self):
|
||||||
return self.webhook.calls.order_by('-datetime')
|
return self.webhook.calls.order_by('-datetime')
|
||||||
|
|
||||||
|
def post(self, request, *args, **kwargs):
|
||||||
|
if request.POST.get("action") == "expedite":
|
||||||
|
self.request.organizer.log_action('pretix.webhook.retries.expedited', user=self.request.user, data={
|
||||||
|
'webhook': self.webhook.pk,
|
||||||
|
})
|
||||||
|
manually_retry_all_calls.apply_async(args=(self.webhook.pk,))
|
||||||
|
messages.success(request, _('All requests will now be scheduled for an immediate attempt. Please '
|
||||||
|
'allow for a few minutes before they are processed.'))
|
||||||
|
elif request.POST.get("action") == "drop":
|
||||||
|
self.request.organizer.log_action('pretix.webhook.retries.dropped', user=self.request.user, data={
|
||||||
|
'webhook': self.webhook.pk,
|
||||||
|
})
|
||||||
|
self.webhook.retries.all().delete()
|
||||||
|
messages.success(request, _('All unprocessed webhooks have been stopped from retrying.'))
|
||||||
|
return redirect(reverse('control:organizer.webhook.logs', kwargs={
|
||||||
|
'organizer': self.request.organizer.slug,
|
||||||
|
'webhook': self.webhook.pk,
|
||||||
|
}))
|
||||||
|
|
||||||
|
|
||||||
class GiftCardListView(OrganizerDetailViewMixin, OrganizerPermissionRequiredMixin, ListView):
|
class GiftCardListView(OrganizerDetailViewMixin, OrganizerPermissionRequiredMixin, ListView):
|
||||||
model = GiftCard
|
model = GiftCard
|
||||||
|
|||||||
Reference in New Issue
Block a user