From c0078a732d45f2e18564728f740325b59f49fe0f Mon Sep 17 00:00:00 2001 From: Pasi Sarolahti Date: Mon, 9 May 2022 15:58:58 +0300 Subject: [PATCH] Automatic retries of incomplete grading jobs Sometimes it happens that a grading job never completes because of some technical issue, and it remains in "In Grading" state until user submits it for regrading. This commit implements automatic regrading of grading jobs that have not completed within configured timeout. There is also a simple heuristic to limit the (likely unsuccesful) automatic retries when it seems that the grader is more persistently unavailable. Closes #988. --- aplus/celery.py | 32 +++++++++++++++++++ aplus/settings.py | 13 ++++++++ course/templatetags/base.py | 6 +++- exercise/admin.py | 10 ++++++ .../migrations/0046_auto_20220501_2152.py | 32 +++++++++++++++++++ exercise/submission_models.py | 26 ++++++++++++++- locale/en/LC_MESSAGES/django.po | 4 +++ locale/fi/LC_MESSAGES/django.po | 4 +++ 8 files changed, 125 insertions(+), 2 deletions(-) create mode 100644 exercise/migrations/0046_auto_20220501_2152.py diff --git a/aplus/celery.py b/aplus/celery.py index 4a1030790..6bea02d61 100644 --- a/aplus/celery.py +++ b/aplus/celery.py @@ -2,10 +2,15 @@ import celery import datetime from datetime import timedelta +import logging +from dateutil.relativedelta import relativedelta from time import sleep from django.conf import settings +from django.db.models import Max + +logger = logging.getLogger('aplus.celery') os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'aplus.settings') app = celery.Celery('aplus') @@ -17,6 +22,10 @@ def setup_periodic_tasks(sender, **kwargs): if hasattr(settings, 'SIS_ENROLL_SCHEDULE'): sender.add_periodic_task(settings.SIS_ENROLL_SCHEDULE, enroll.s(), name='enroll') +@app.on_after_configure.connect +def setup_periodic_tasks(sender, **kwargs): + sender.add_periodic_task(settings.SUBMISSION_EXPIRY_TIMEOUT, retry_submissions.s(), name='retry_submissions') + @app.task def enroll(): """ @@ -34,3 +43,26 @@ def enroll(): i.enroll_from_sis() if settings.SIS_ENROLL_DELAY: sleep(settings.SIS_ENROLL_DELAY) + +@app.task +def retry_submissions(): + from exercise.submission_models import PendingSubmission, grader_stable + + # Recovery state: only send one grading request to probe the state of grader + # We pick the one with most attempts, so that total_retries comes down more quickly + # when things are back to normal, and we can return to normal state + if not grader_stable(): + max_retries = PendingSubmission.objects.aggregate(maxretries=Max('num_retries'))['maxretries'] + pending = PendingSubmission.objects.filter(num_retries=max_retries)[0] + logging.info(f"Recovery state: retrying expired submission {pending.submission}") + pending.submission.exercise.grade(pending.submission) + return + + # Stable state: retry all expired submissions + expiry_time = datetime.datetime.now(datetime.timezone.utc) - relativedelta(seconds=settings.SUBMISSION_EXPIRY_TIMEOUT) + expired = PendingSubmission.objects.filter(submission_time__lt=expiry_time) + + for pending in expired: + logger.info(f"Retrying expired submission {pending.submission}") + pending.submission.exercise.grade(pending.submission) + sleep(0.5) # Delay 500 ms to avoid choking grader diff --git a/aplus/settings.py b/aplus/settings.py index 2018d7551..461020321 100644 --- a/aplus/settings.py +++ b/aplus/settings.py @@ -237,6 +237,19 @@ SIS_ENROLL_DELAY = 15 ########################################################################## +# Settings related automatic retries of unfinished grading tasks + +# Number of seconds after which a submission can be resent to grader +SUBMISSION_EXPIRY_TIMEOUT = 120 + +# Number of unresponded retries beyond which we move to recovery state. +# In recovery state there likely is more persistent problem with the grader +# or network that needs fixing. +# In recovery state A+ periodically probes the state of the grader, sending only one +# grading request out every SUBMISSION_EXPIRY_TIMEOUT seconds. +# We don not want to congest the potentially broken system unnecessarily with several +# requests in this case. +GRADER_STABLE_THRESHOLD = 5 ## Celery CELERY_ACCEPT_CONTENT = ['json'] diff --git a/course/templatetags/base.py b/course/templatetags/base.py index 710139e76..2fb3d2296 100644 --- a/course/templatetags/base.py +++ b/course/templatetags/base.py @@ -3,8 +3,9 @@ from django import template from django.conf import settings from django.utils.safestring import mark_safe -from django.utils.translation import get_language +from django.utils.translation import get_language, gettext_lazy as _ from lib.helpers import remove_query_param_from_url, settings_text, update_url_params +from exercise.submission_models import grader_stable register = template.Library() @@ -44,6 +45,9 @@ def site_alert(): if message: return mark_safe('
{}
' .format(pick_localized(message))) + if not grader_stable(): + # Prefer configured alert text, if one is set + return mark_safe('
{}
'.format(_('GRADER_PROBLEMS_ALERT'))) return '' diff --git a/exercise/admin.py b/exercise/admin.py index a62e56365..c4fb2d32e 100644 --- a/exercise/admin.py +++ b/exercise/admin.py @@ -17,6 +17,7 @@ RevealRule, ExerciseTask, LearningObjectDisplay, + PendingSubmission, ) from exercise.exercisecollection_models import ExerciseCollection from lib.admin_helpers import RecentCourseInstanceListFilter @@ -440,6 +441,14 @@ class LearningObjectDisplayAdmin(admin.ModelAdmin): readonly_fields = ('timestamp',) +class PendingSubmissionAdmin(admin.ModelAdmin): + list_display = ( + 'submission', + 'submission_time', + 'num_retries', + ) + + admin.site.register(CourseChapter, CourseChapterAdmin) admin.site.register(BaseExercise, BaseExerciseAdmin) admin.site.register(StaticExercise, StaticExerciseAdmin) @@ -452,3 +461,4 @@ class LearningObjectDisplayAdmin(admin.ModelAdmin): admin.site.register(RevealRule, RevealRuleAdmin) admin.site.register(ExerciseTask, ExerciseTaskAdmin) admin.site.register(LearningObjectDisplay, LearningObjectDisplayAdmin) +admin.site.register(PendingSubmission, PendingSubmissionAdmin) diff --git a/exercise/migrations/0046_auto_20220501_2152.py b/exercise/migrations/0046_auto_20220501_2152.py new file mode 100644 index 000000000..c545d3cf1 --- /dev/null +++ b/exercise/migrations/0046_auto_20220501_2152.py @@ -0,0 +1,32 @@ +# Generated by Django 3.2.12 on 2022-05-01 18:52 + +from django.db import migrations, models +import django.db.models.deletion +import lib.fields + + +class Migration(migrations.Migration): + + dependencies = [ + ('exercise', '0045_auto_20220211_1540'), + ] + + operations = [ + migrations.CreateModel( + name='PendingSubmission', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('submission_time', models.DateTimeField(null=True, verbose_name='LABEL_SUBMISSION_TIME')), + ('num_retries', models.IntegerField(default=0, verbose_name='LABEL_NUMBER_OF_RETRIES')), + ('submission', lib.fields.DefaultForeignKey(on_delete=django.db.models.deletion.CASCADE, to='exercise.submission', verbose_name='LABEL_SUBMISSION')), + ], + options={ + 'verbose_name': 'MODEL_NAME_PENDING_SUBMISSION', + 'verbose_name_plural': 'MODEL_NAME_PENDING_SUBMISSION_PLURAL', + }, + ), + migrations.AddConstraint( + model_name='pendingsubmission', + constraint=models.UniqueConstraint(fields=('submission',), name='unique_submission'), + ), + ] diff --git a/exercise/submission_models.py b/exercise/submission_models.py index 327f0cea3..17b0f26ff 100644 --- a/exercise/submission_models.py +++ b/exercise/submission_models.py @@ -1,6 +1,7 @@ import itertools import json import logging +from django.utils import timezone from mimetypes import guess_type import os from typing import IO, Dict, Iterable, List, Tuple, TYPE_CHECKING @@ -12,8 +13,9 @@ from django.http.request import HttpRequest from django.utils import timezone from django.utils.translation import get_language, gettext_lazy as _ -from exercise.protocol.exercise_page import ExercisePage +from django.db.models import Sum +from exercise.protocol.exercise_page import ExercisePage from authorization.models import JWTAccessible from authorization.object_permissions import register_jwt_accessible_class from lib.fields import DefaultForeignKey, JSONField, PercentField @@ -26,6 +28,7 @@ ) from lib.models import UrlMixin from userprofile.models import UserProfile +from aplus.celery import retry_submissions from . import exercise_models @@ -565,9 +568,11 @@ def scale_grade_to(self, percentage): def set_waiting(self): self.status = self.STATUS.WAITING + self.mark_pending() def set_ready(self, approve_unofficial=False): self.grading_time = timezone.now() + self.clear_pending() if self.status != self.STATUS.UNOFFICIAL or self.force_exercise_points or approve_unofficial: self.status = self.STATUS.READY @@ -581,11 +586,17 @@ def set_ready(self, approve_unofficial=False): "site": settings.BASE_URL, }) + if not grader_stable(): + # We have succesful grading task in recovery state. It may be a sign that problems + # are resolved, so immediately retry next pending submission, to speed up recovery + retry_submissions() + def set_rejected(self): self.status = self.STATUS.REJECTED def set_error(self): self.status = self.STATUS.ERROR + self.clear_pending() @property def is_graded(self): @@ -613,6 +624,19 @@ def get_url_kwargs(self): def get_inspect_url(self): return self.get_url("submission-inspect") + def mark_pending(self): + pending, created = PendingSubmission.objects.get_or_create(submission=self) + if not created: + pending.num_retries += 1 + pending.submission_time = timezone.now() + pending.save() + + def clear_pending(self): + try: + pending = PendingSubmission.objects.get(submission=self) + pending.delete() + except PendingSubmission.DoesNotExist: + logger.warn(f"Submission {self} completed, but was not on PendingSubmissions table.") class SubmissionDraft(models.Model): """ diff --git a/locale/en/LC_MESSAGES/django.po b/locale/en/LC_MESSAGES/django.po index e859ad11b..038a4afd2 100644 --- a/locale/en/LC_MESSAGES/django.po +++ b/locale/en/LC_MESSAGES/django.po @@ -1426,6 +1426,10 @@ msgstr "" "

Are you sure you want to ban this user from the course?

Banned " "users cannot re-enrol in the course.

" +#: course/templatetags/base.py +msgid "GRADER_PROBLEMS_ALERT" +msgstr "Automatic grading has problems at the moment. You may experience delay in receiving feedback." + #: course/views.py msgid "COURSE_INSTANCES_NOT_VISIBLE_TO_STUDENTS" msgstr "The course instances of this course are not visible to students." diff --git a/locale/fi/LC_MESSAGES/django.po b/locale/fi/LC_MESSAGES/django.po index cb082b80e..d97f947f8 100644 --- a/locale/fi/LC_MESSAGES/django.po +++ b/locale/fi/LC_MESSAGES/django.po @@ -1433,6 +1433,10 @@ msgstr "" "

Haluatko varmasti estää tämän käyttäjän pääsyn kurssille?

Estetyt " "käyttäjät eivät pysty ilmoittautumaan kurssille uudelleen.

" +#: course/templatetags/base.py +msgid "GRADER_PROBLEMS_ALERT" +msgstr "Automaattisessa arvioinnissa on tällä hetkellä ongelmia. Palautteen saamisessa saattaa esiintyä viivettä." + #: course/views.py msgid "COURSE_INSTANCES_NOT_VISIBLE_TO_STUDENTS" msgstr "Kurssikerrat eivät ole opiskelijoiden nähtävissä."