Skip to content

Commit

Permalink
Automatic retries of incomplete grading jobs
Browse files Browse the repository at this point in the history
Sometimes it happens that a grading job never completes because of
some technical issue, and it remains in "In Grading" state until
user submits it for regrading. This commit implements automatic
regrading of grading jobs that have not completed within configured
timeout. There is also a simple heuristic to limit the
(likely unsuccesful) automatic retries when it seems that the
grader is more persistently unavailable. This commit uses the
PendingSubmission model that was added in commit 62150ce.

Closes apluslms#988.
  • Loading branch information
PasiSa committed Sep 14, 2022
1 parent f0ef3db commit a41a1b9
Show file tree
Hide file tree
Showing 7 changed files with 107 additions and 2 deletions.
33 changes: 33 additions & 0 deletions aplus/celery.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,14 @@
import celery
import datetime
from datetime import timedelta
import logging
from dateutil.relativedelta import relativedelta
from time import sleep

from django.conf import settings


logger = logging.getLogger('aplus.celery')
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'aplus.settings')

app = celery.Celery('aplus')
Expand All @@ -17,6 +21,12 @@ def setup_periodic_tasks(sender, **kwargs):
if hasattr(settings, 'SIS_ENROLL_SCHEDULE'):
sender.add_periodic_task(settings.SIS_ENROLL_SCHEDULE, enroll.s(), name='enroll')

@app.on_after_configure.connect
def setup_periodic_tasks(sender, **kwargs):
if settings.SUBMISSION_EXPIRY_TIMEOUT:
# Run timed check twice in timeout period, for more timely retries
sender.add_periodic_task(settings.SUBMISSION_EXPIRY_TIMEOUT/2, retry_submissions.s(), name='retry_submissions')

@app.task
def enroll():
"""
Expand All @@ -34,3 +44,26 @@ def enroll():
i.enroll_from_sis()
if settings.SIS_ENROLL_DELAY:
sleep(settings.SIS_ENROLL_DELAY)

@app.task
def retry_submissions():
from exercise.submission_models import PendingSubmission, PendingSubmissionManager

# Recovery state: only send one grading request to probe the state of grader
# We pick the one with most attempts, so that total_retries comes down more quickly
# when things are back to normal, and we can return to normal state
if not PendingSubmissionManager.is_grader_stable:
pending = PendingSubmission.objects.order_by('-num_retries').first()
logging.info(f"Recovery state: retrying expired submission {pending.submission}")
pending.submission.exercise.grade(pending.submission)
return

# Stable state: retry all expired submissions
expiry_time = datetime.datetime.now(datetime.timezone.utc) - relativedelta(seconds=settings.SUBMISSION_EXPIRY_TIMEOUT)
expired = PendingSubmission.objects.filter(submission_time__lt=expiry_time)

for pending in expired:
if pending.submission.exercise.can_regrade:
logger.info(f"Retrying expired submission {pending.submission}")
pending.submission.exercise.grade(pending.submission)
sleep(0.5) # Delay 500 ms to avoid choking grader
18 changes: 18 additions & 0 deletions aplus/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,24 @@
SIS_ENROLL_DELAY = 15

##########################################################################
# Settings related automatic retries of unfinished grading tasks

# Number of seconds after which a submission can be resent to grader, if not completed
# If set to None, retries are disabled.
SUBMISSION_EXPIRY_TIMEOUT = 30 * 60

# List of services with automatic grading where retries are allowed
# Network location is sufficient, e.g. "localhost:8080" or "grader.cs.aalto.fi"
SUBMISSION_RETRY_SERVICES = []

# Number of unresponded retries beyond which we move to recovery state.
# In recovery state there likely is more persistent problem with the grader
# or network that needs fixing.
# In recovery state A+ periodically probes the state of the grader, sending only one
# grading request out every SUBMISSION_EXPIRY_TIMEOUT seconds.
# We do not want to congest the potentially broken system unnecessarily with several
# requests in this case.
GRADER_STABLE_THRESHOLD = 5

## Celery
CELERY_ACCEPT_CONTENT = ['json']
Expand Down
6 changes: 5 additions & 1 deletion course/templatetags/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,9 @@
from django import template
from django.conf import settings
from django.utils.safestring import mark_safe
from django.utils.translation import get_language
from django.utils.translation import get_language, gettext_lazy as _
from lib.helpers import remove_query_param_from_url, settings_text, update_url_params
from exercise.submission_models import PendingSubmissionManager


register = template.Library()
Expand Down Expand Up @@ -44,6 +45,9 @@ def site_alert():
if message:
return mark_safe('<div class="alert alert-danger">{}</div>'
.format(pick_localized(message)))
if not PendingSubmissionManager.is_grader_stable():
# Prefer configured alert text, if one is set
return mark_safe('<div class="alert alert-danger">{}</div>'.format(_('GRADER_PROBLEMS_ALERT')))
return ''


Expand Down
10 changes: 10 additions & 0 deletions exercise/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
RevealRule,
ExerciseTask,
LearningObjectDisplay,
PendingSubmission,
)
from exercise.exercisecollection_models import ExerciseCollection
from lib.admin_helpers import RecentCourseInstanceListFilter
Expand Down Expand Up @@ -440,6 +441,14 @@ class LearningObjectDisplayAdmin(admin.ModelAdmin):
readonly_fields = ('timestamp',)


class PendingSubmissionAdmin(admin.ModelAdmin):
list_display = (
'submission',
'submission_time',
'num_retries',
)


admin.site.register(CourseChapter, CourseChapterAdmin)
admin.site.register(BaseExercise, BaseExerciseAdmin)
admin.site.register(StaticExercise, StaticExerciseAdmin)
Expand All @@ -452,3 +461,4 @@ class LearningObjectDisplayAdmin(admin.ModelAdmin):
admin.site.register(RevealRule, RevealRuleAdmin)
admin.site.register(ExerciseTask, ExerciseTaskAdmin)
admin.site.register(LearningObjectDisplay, LearningObjectDisplayAdmin)
admin.site.register(PendingSubmission, PendingSubmissionAdmin)
30 changes: 29 additions & 1 deletion exercise/submission_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,18 @@
from mimetypes import guess_type
import os
from typing import IO, Dict, Iterable, List, Tuple, TYPE_CHECKING
from urllib.parse import urlparse

from binaryornot.check import is_binary
from django.conf import settings
from django.db import models, DatabaseError
from django.db.models import F
from django.db.models.signals import post_delete
from django.http.request import HttpRequest
from django.utils import timezone
from django.utils.translation import get_language, gettext_lazy as _
from exercise.protocol.exercise_page import ExercisePage

from exercise.protocol.exercise_page import ExercisePage
from authorization.models import JWTAccessible
from authorization.object_permissions import register_jwt_accessible_class
from lib.fields import DefaultForeignKey, JSONField, PercentField
Expand All @@ -26,6 +28,7 @@
)
from lib.models import UrlMixin
from userprofile.models import UserProfile
from aplus.celery import retry_submissions
from . import exercise_models


Expand Down Expand Up @@ -565,9 +568,11 @@ def scale_grade_to(self, percentage):

def set_waiting(self):
self.status = self.STATUS.WAITING
self.mark_pending()

def set_ready(self, approve_unofficial=False):
self.grading_time = timezone.now()
self.clear_pending()
if self.status != self.STATUS.UNOFFICIAL or self.force_exercise_points or approve_unofficial:
self.status = self.STATUS.READY

Expand All @@ -581,11 +586,18 @@ def set_ready(self, approve_unofficial=False):
"site": settings.BASE_URL,
})

if not PendingSubmissionManager.is_grader_stable:
# We have a successful grading task in the recovery state. It may be a sign that problems
# have been resolved, so immediately retry the next pending submission, to speed up recovery
retry_submissions()

def set_rejected(self):
self.status = self.STATUS.REJECTED
self.clear_pending()

def set_error(self):
self.status = self.STATUS.ERROR
self.clear_pending()

@property
def is_graded(self):
Expand Down Expand Up @@ -613,6 +625,22 @@ def get_url_kwargs(self):
def get_inspect_url(self):
return self.get_url("submission-inspect")

def mark_pending(self):
grading_host = urlparse(self.exercise.service_url).netloc
if grading_host in settings.SUBMISSION_RETRY_SERVICES:
pending, created = PendingSubmission.objects.get_or_create(submission=self)
if not created:
pending.num_retries = F('num_retries') + 1
pending.submission_time = timezone.now()
pending.save()

def clear_pending(self):
try:
pending = PendingSubmission.objects.get(submission=self)
pending.delete()
except PendingSubmission.DoesNotExist:
pass


class SubmissionDraft(models.Model):
"""
Expand Down
6 changes: 6 additions & 0 deletions locale/en/LC_MESSAGES/django.po
Original file line number Diff line number Diff line change
Expand Up @@ -1429,6 +1429,12 @@ msgstr ""
"<p>Are you sure you want to ban this user from the course?</p><p>Banned "
"users cannot re-enrol in the course.</p>"

#: course/templatetags/base.py
msgid "GRADER_PROBLEMS_ALERT"
msgstr ""
"Automatic grading has currently issues. "
"You may experience delay in receiving feedback."

#: course/views.py
msgid "COURSE_INSTANCES_NOT_VISIBLE_TO_STUDENTS"
msgstr "The course instances of this course are not visible to students."
Expand Down
6 changes: 6 additions & 0 deletions locale/fi/LC_MESSAGES/django.po
Original file line number Diff line number Diff line change
Expand Up @@ -1437,6 +1437,12 @@ msgstr ""
"<p>Haluatko varmasti estää tämän käyttäjän pääsyn kurssille?</p><p>Estetyt "
"käyttäjät eivät pysty ilmoittautumaan kurssille uudelleen.</p>"

#: course/templatetags/base.py
msgid "GRADER_PROBLEMS_ALERT"
msgstr ""
"Automaattisessa arvioinnissa on tällä hetkellä ongelmia. "
"Palautteen saamisessa saattaa esiintyä viivettä."

#: course/views.py
msgid "COURSE_INSTANCES_NOT_VISIBLE_TO_STUDENTS"
msgstr "Kurssikerrat eivät ole opiskelijoiden nähtävissä."
Expand Down

0 comments on commit a41a1b9

Please sign in to comment.