From a41a1b98946c596c6a2af4a49af77d2abecae230 Mon Sep 17 00:00:00 2001 From: Pasi Sarolahti Date: Mon, 9 May 2022 15:58:58 +0300 Subject: [PATCH] Automatic retries of incomplete grading jobs Sometimes it happens that a grading job never completes because of some technical issue, and it remains in "In Grading" state until user submits it for regrading. This commit implements automatic regrading of grading jobs that have not completed within configured timeout. There is also a simple heuristic to limit the (likely unsuccesful) automatic retries when it seems that the grader is more persistently unavailable. This commit uses the PendingSubmission model that was added in commit 62150ce. Closes #988. --- aplus/celery.py | 33 +++++++++++++++++++++++++++++++++ aplus/settings.py | 18 ++++++++++++++++++ course/templatetags/base.py | 6 +++++- exercise/admin.py | 10 ++++++++++ exercise/submission_models.py | 30 +++++++++++++++++++++++++++++- locale/en/LC_MESSAGES/django.po | 6 ++++++ locale/fi/LC_MESSAGES/django.po | 6 ++++++ 7 files changed, 107 insertions(+), 2 deletions(-) diff --git a/aplus/celery.py b/aplus/celery.py index 4a1030790..f462282a2 100644 --- a/aplus/celery.py +++ b/aplus/celery.py @@ -2,10 +2,14 @@ import celery import datetime from datetime import timedelta +import logging +from dateutil.relativedelta import relativedelta from time import sleep from django.conf import settings + +logger = logging.getLogger('aplus.celery') os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'aplus.settings') app = celery.Celery('aplus') @@ -17,6 +21,12 @@ def setup_periodic_tasks(sender, **kwargs): if hasattr(settings, 'SIS_ENROLL_SCHEDULE'): sender.add_periodic_task(settings.SIS_ENROLL_SCHEDULE, enroll.s(), name='enroll') +@app.on_after_configure.connect +def setup_periodic_tasks(sender, **kwargs): + if settings.SUBMISSION_EXPIRY_TIMEOUT: + # Run timed check twice in timeout period, for more timely retries + sender.add_periodic_task(settings.SUBMISSION_EXPIRY_TIMEOUT/2, retry_submissions.s(), name='retry_submissions') + @app.task def enroll(): """ @@ -34,3 +44,26 @@ def enroll(): i.enroll_from_sis() if settings.SIS_ENROLL_DELAY: sleep(settings.SIS_ENROLL_DELAY) + +@app.task +def retry_submissions(): + from exercise.submission_models import PendingSubmission, PendingSubmissionManager + + # Recovery state: only send one grading request to probe the state of grader + # We pick the one with most attempts, so that total_retries comes down more quickly + # when things are back to normal, and we can return to normal state + if not PendingSubmissionManager.is_grader_stable: + pending = PendingSubmission.objects.order_by('-num_retries').first() + logging.info(f"Recovery state: retrying expired submission {pending.submission}") + pending.submission.exercise.grade(pending.submission) + return + + # Stable state: retry all expired submissions + expiry_time = datetime.datetime.now(datetime.timezone.utc) - relativedelta(seconds=settings.SUBMISSION_EXPIRY_TIMEOUT) + expired = PendingSubmission.objects.filter(submission_time__lt=expiry_time) + + for pending in expired: + if pending.submission.exercise.can_regrade: + logger.info(f"Retrying expired submission {pending.submission}") + pending.submission.exercise.grade(pending.submission) + sleep(0.5) # Delay 500 ms to avoid choking grader diff --git a/aplus/settings.py b/aplus/settings.py index 2018d7551..26d30a508 100644 --- a/aplus/settings.py +++ b/aplus/settings.py @@ -237,6 +237,24 @@ SIS_ENROLL_DELAY = 15 ########################################################################## +# Settings related automatic retries of unfinished grading tasks + +# Number of seconds after which a submission can be resent to grader, if not completed +# If set to None, retries are disabled. +SUBMISSION_EXPIRY_TIMEOUT = 30 * 60 + +# List of services with automatic grading where retries are allowed +# Network location is sufficient, e.g. "localhost:8080" or "grader.cs.aalto.fi" +SUBMISSION_RETRY_SERVICES = [] + +# Number of unresponded retries beyond which we move to recovery state. +# In recovery state there likely is more persistent problem with the grader +# or network that needs fixing. +# In recovery state A+ periodically probes the state of the grader, sending only one +# grading request out every SUBMISSION_EXPIRY_TIMEOUT seconds. +# We do not want to congest the potentially broken system unnecessarily with several +# requests in this case. +GRADER_STABLE_THRESHOLD = 5 ## Celery CELERY_ACCEPT_CONTENT = ['json'] diff --git a/course/templatetags/base.py b/course/templatetags/base.py index 710139e76..dc4e0e698 100644 --- a/course/templatetags/base.py +++ b/course/templatetags/base.py @@ -3,8 +3,9 @@ from django import template from django.conf import settings from django.utils.safestring import mark_safe -from django.utils.translation import get_language +from django.utils.translation import get_language, gettext_lazy as _ from lib.helpers import remove_query_param_from_url, settings_text, update_url_params +from exercise.submission_models import PendingSubmissionManager register = template.Library() @@ -44,6 +45,9 @@ def site_alert(): if message: return mark_safe('
{}
' .format(pick_localized(message))) + if not PendingSubmissionManager.is_grader_stable(): + # Prefer configured alert text, if one is set + return mark_safe('
{}
'.format(_('GRADER_PROBLEMS_ALERT'))) return '' diff --git a/exercise/admin.py b/exercise/admin.py index a62e56365..c4fb2d32e 100644 --- a/exercise/admin.py +++ b/exercise/admin.py @@ -17,6 +17,7 @@ RevealRule, ExerciseTask, LearningObjectDisplay, + PendingSubmission, ) from exercise.exercisecollection_models import ExerciseCollection from lib.admin_helpers import RecentCourseInstanceListFilter @@ -440,6 +441,14 @@ class LearningObjectDisplayAdmin(admin.ModelAdmin): readonly_fields = ('timestamp',) +class PendingSubmissionAdmin(admin.ModelAdmin): + list_display = ( + 'submission', + 'submission_time', + 'num_retries', + ) + + admin.site.register(CourseChapter, CourseChapterAdmin) admin.site.register(BaseExercise, BaseExerciseAdmin) admin.site.register(StaticExercise, StaticExerciseAdmin) @@ -452,3 +461,4 @@ class LearningObjectDisplayAdmin(admin.ModelAdmin): admin.site.register(RevealRule, RevealRuleAdmin) admin.site.register(ExerciseTask, ExerciseTaskAdmin) admin.site.register(LearningObjectDisplay, LearningObjectDisplayAdmin) +admin.site.register(PendingSubmission, PendingSubmissionAdmin) diff --git a/exercise/submission_models.py b/exercise/submission_models.py index 327f0cea3..149669886 100644 --- a/exercise/submission_models.py +++ b/exercise/submission_models.py @@ -4,16 +4,18 @@ from mimetypes import guess_type import os from typing import IO, Dict, Iterable, List, Tuple, TYPE_CHECKING +from urllib.parse import urlparse from binaryornot.check import is_binary from django.conf import settings from django.db import models, DatabaseError +from django.db.models import F from django.db.models.signals import post_delete from django.http.request import HttpRequest from django.utils import timezone from django.utils.translation import get_language, gettext_lazy as _ -from exercise.protocol.exercise_page import ExercisePage +from exercise.protocol.exercise_page import ExercisePage from authorization.models import JWTAccessible from authorization.object_permissions import register_jwt_accessible_class from lib.fields import DefaultForeignKey, JSONField, PercentField @@ -26,6 +28,7 @@ ) from lib.models import UrlMixin from userprofile.models import UserProfile +from aplus.celery import retry_submissions from . import exercise_models @@ -565,9 +568,11 @@ def scale_grade_to(self, percentage): def set_waiting(self): self.status = self.STATUS.WAITING + self.mark_pending() def set_ready(self, approve_unofficial=False): self.grading_time = timezone.now() + self.clear_pending() if self.status != self.STATUS.UNOFFICIAL or self.force_exercise_points or approve_unofficial: self.status = self.STATUS.READY @@ -581,11 +586,18 @@ def set_ready(self, approve_unofficial=False): "site": settings.BASE_URL, }) + if not PendingSubmissionManager.is_grader_stable: + # We have a successful grading task in the recovery state. It may be a sign that problems + # have been resolved, so immediately retry the next pending submission, to speed up recovery + retry_submissions() + def set_rejected(self): self.status = self.STATUS.REJECTED + self.clear_pending() def set_error(self): self.status = self.STATUS.ERROR + self.clear_pending() @property def is_graded(self): @@ -613,6 +625,22 @@ def get_url_kwargs(self): def get_inspect_url(self): return self.get_url("submission-inspect") + def mark_pending(self): + grading_host = urlparse(self.exercise.service_url).netloc + if grading_host in settings.SUBMISSION_RETRY_SERVICES: + pending, created = PendingSubmission.objects.get_or_create(submission=self) + if not created: + pending.num_retries = F('num_retries') + 1 + pending.submission_time = timezone.now() + pending.save() + + def clear_pending(self): + try: + pending = PendingSubmission.objects.get(submission=self) + pending.delete() + except PendingSubmission.DoesNotExist: + pass + class SubmissionDraft(models.Model): """ diff --git a/locale/en/LC_MESSAGES/django.po b/locale/en/LC_MESSAGES/django.po index b39119b37..cfd5bea50 100644 --- a/locale/en/LC_MESSAGES/django.po +++ b/locale/en/LC_MESSAGES/django.po @@ -1429,6 +1429,12 @@ msgstr "" "

Are you sure you want to ban this user from the course?

Banned " "users cannot re-enrol in the course.

" +#: course/templatetags/base.py +msgid "GRADER_PROBLEMS_ALERT" +msgstr "" +"Automatic grading has currently issues. " +"You may experience delay in receiving feedback." + #: course/views.py msgid "COURSE_INSTANCES_NOT_VISIBLE_TO_STUDENTS" msgstr "The course instances of this course are not visible to students." diff --git a/locale/fi/LC_MESSAGES/django.po b/locale/fi/LC_MESSAGES/django.po index 760634fb6..a30605e29 100644 --- a/locale/fi/LC_MESSAGES/django.po +++ b/locale/fi/LC_MESSAGES/django.po @@ -1437,6 +1437,12 @@ msgstr "" "

Haluatko varmasti estää tämän käyttäjän pääsyn kurssille?

Estetyt " "käyttäjät eivät pysty ilmoittautumaan kurssille uudelleen.

" +#: course/templatetags/base.py +msgid "GRADER_PROBLEMS_ALERT" +msgstr "" +"Automaattisessa arvioinnissa on tällä hetkellä ongelmia. " +"Palautteen saamisessa saattaa esiintyä viivettä." + #: course/views.py msgid "COURSE_INSTANCES_NOT_VISIBLE_TO_STUDENTS" msgstr "Kurssikerrat eivät ole opiskelijoiden nähtävissä."