Skip to content

Commit

Permalink
Automatic retries of incomplete grading jobs
Browse files Browse the repository at this point in the history
Sometimes it happens that a grading job never completes because of
some technical issue, and it remains in "In Grading" state until
user submits it for regrading. This commit implements automatic
regrading of grading jobs that have not completed within configured
timeout. There is also a simple heuristic to limit the
(likely unsuccesful) automatic retries when it seems that the
grader is more persistently unavailable.

Closes apluslms#988.
  • Loading branch information
PasiSa committed Aug 20, 2022
1 parent 67e5c05 commit c0078a7
Show file tree
Hide file tree
Showing 8 changed files with 125 additions and 2 deletions.
32 changes: 32 additions & 0 deletions aplus/celery.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,15 @@
import celery
import datetime
from datetime import timedelta
import logging
from dateutil.relativedelta import relativedelta
from time import sleep

from django.conf import settings
from django.db.models import Max


logger = logging.getLogger('aplus.celery')
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'aplus.settings')

app = celery.Celery('aplus')
Expand All @@ -17,6 +22,10 @@ def setup_periodic_tasks(sender, **kwargs):
if hasattr(settings, 'SIS_ENROLL_SCHEDULE'):
sender.add_periodic_task(settings.SIS_ENROLL_SCHEDULE, enroll.s(), name='enroll')

@app.on_after_configure.connect
def setup_periodic_tasks(sender, **kwargs):
sender.add_periodic_task(settings.SUBMISSION_EXPIRY_TIMEOUT, retry_submissions.s(), name='retry_submissions')

@app.task
def enroll():
"""
Expand All @@ -34,3 +43,26 @@ def enroll():
i.enroll_from_sis()
if settings.SIS_ENROLL_DELAY:
sleep(settings.SIS_ENROLL_DELAY)

@app.task
def retry_submissions():
from exercise.submission_models import PendingSubmission, grader_stable

# Recovery state: only send one grading request to probe the state of grader
# We pick the one with most attempts, so that total_retries comes down more quickly
# when things are back to normal, and we can return to normal state
if not grader_stable():
max_retries = PendingSubmission.objects.aggregate(maxretries=Max('num_retries'))['maxretries']
pending = PendingSubmission.objects.filter(num_retries=max_retries)[0]
logging.info(f"Recovery state: retrying expired submission {pending.submission}")
pending.submission.exercise.grade(pending.submission)
return

# Stable state: retry all expired submissions
expiry_time = datetime.datetime.now(datetime.timezone.utc) - relativedelta(seconds=settings.SUBMISSION_EXPIRY_TIMEOUT)
expired = PendingSubmission.objects.filter(submission_time__lt=expiry_time)

for pending in expired:
logger.info(f"Retrying expired submission {pending.submission}")
pending.submission.exercise.grade(pending.submission)
sleep(0.5) # Delay 500 ms to avoid choking grader
13 changes: 13 additions & 0 deletions aplus/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,19 @@
SIS_ENROLL_DELAY = 15

##########################################################################
# Settings related automatic retries of unfinished grading tasks

# Number of seconds after which a submission can be resent to grader
SUBMISSION_EXPIRY_TIMEOUT = 120

# Number of unresponded retries beyond which we move to recovery state.
# In recovery state there likely is more persistent problem with the grader
# or network that needs fixing.
# In recovery state A+ periodically probes the state of the grader, sending only one
# grading request out every SUBMISSION_EXPIRY_TIMEOUT seconds.
# We don not want to congest the potentially broken system unnecessarily with several
# requests in this case.
GRADER_STABLE_THRESHOLD = 5

## Celery
CELERY_ACCEPT_CONTENT = ['json']
Expand Down
6 changes: 5 additions & 1 deletion course/templatetags/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,9 @@
from django import template
from django.conf import settings
from django.utils.safestring import mark_safe
from django.utils.translation import get_language
from django.utils.translation import get_language, gettext_lazy as _
from lib.helpers import remove_query_param_from_url, settings_text, update_url_params
from exercise.submission_models import grader_stable


register = template.Library()
Expand Down Expand Up @@ -44,6 +45,9 @@ def site_alert():
if message:
return mark_safe('<div class="alert alert-danger">{}</div>'
.format(pick_localized(message)))
if not grader_stable():
# Prefer configured alert text, if one is set
return mark_safe('<div class="alert alert-danger">{}</div>'.format(_('GRADER_PROBLEMS_ALERT')))
return ''


Expand Down
10 changes: 10 additions & 0 deletions exercise/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
RevealRule,
ExerciseTask,
LearningObjectDisplay,
PendingSubmission,
)
from exercise.exercisecollection_models import ExerciseCollection
from lib.admin_helpers import RecentCourseInstanceListFilter
Expand Down Expand Up @@ -440,6 +441,14 @@ class LearningObjectDisplayAdmin(admin.ModelAdmin):
readonly_fields = ('timestamp',)


class PendingSubmissionAdmin(admin.ModelAdmin):
list_display = (
'submission',
'submission_time',
'num_retries',
)


admin.site.register(CourseChapter, CourseChapterAdmin)
admin.site.register(BaseExercise, BaseExerciseAdmin)
admin.site.register(StaticExercise, StaticExerciseAdmin)
Expand All @@ -452,3 +461,4 @@ class LearningObjectDisplayAdmin(admin.ModelAdmin):
admin.site.register(RevealRule, RevealRuleAdmin)
admin.site.register(ExerciseTask, ExerciseTaskAdmin)
admin.site.register(LearningObjectDisplay, LearningObjectDisplayAdmin)
admin.site.register(PendingSubmission, PendingSubmissionAdmin)
32 changes: 32 additions & 0 deletions exercise/migrations/0046_auto_20220501_2152.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Generated by Django 3.2.12 on 2022-05-01 18:52

from django.db import migrations, models
import django.db.models.deletion
import lib.fields


class Migration(migrations.Migration):

dependencies = [
('exercise', '0045_auto_20220211_1540'),
]

operations = [
migrations.CreateModel(
name='PendingSubmission',
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('submission_time', models.DateTimeField(null=True, verbose_name='LABEL_SUBMISSION_TIME')),
('num_retries', models.IntegerField(default=0, verbose_name='LABEL_NUMBER_OF_RETRIES')),
('submission', lib.fields.DefaultForeignKey(on_delete=django.db.models.deletion.CASCADE, to='exercise.submission', verbose_name='LABEL_SUBMISSION')),
],
options={
'verbose_name': 'MODEL_NAME_PENDING_SUBMISSION',
'verbose_name_plural': 'MODEL_NAME_PENDING_SUBMISSION_PLURAL',
},
),
migrations.AddConstraint(
model_name='pendingsubmission',
constraint=models.UniqueConstraint(fields=('submission',), name='unique_submission'),
),
]
26 changes: 25 additions & 1 deletion exercise/submission_models.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import itertools
import json
import logging
from django.utils import timezone
from mimetypes import guess_type
import os
from typing import IO, Dict, Iterable, List, Tuple, TYPE_CHECKING
Expand All @@ -12,8 +13,9 @@
from django.http.request import HttpRequest
from django.utils import timezone
from django.utils.translation import get_language, gettext_lazy as _
from exercise.protocol.exercise_page import ExercisePage
from django.db.models import Sum

from exercise.protocol.exercise_page import ExercisePage
from authorization.models import JWTAccessible
from authorization.object_permissions import register_jwt_accessible_class
from lib.fields import DefaultForeignKey, JSONField, PercentField
Expand All @@ -26,6 +28,7 @@
)
from lib.models import UrlMixin
from userprofile.models import UserProfile
from aplus.celery import retry_submissions
from . import exercise_models


Expand Down Expand Up @@ -565,9 +568,11 @@ def scale_grade_to(self, percentage):

def set_waiting(self):
self.status = self.STATUS.WAITING
self.mark_pending()

def set_ready(self, approve_unofficial=False):
self.grading_time = timezone.now()
self.clear_pending()
if self.status != self.STATUS.UNOFFICIAL or self.force_exercise_points or approve_unofficial:
self.status = self.STATUS.READY

Expand All @@ -581,11 +586,17 @@ def set_ready(self, approve_unofficial=False):
"site": settings.BASE_URL,
})

if not grader_stable():
# We have succesful grading task in recovery state. It may be a sign that problems
# are resolved, so immediately retry next pending submission, to speed up recovery
retry_submissions()

def set_rejected(self):
self.status = self.STATUS.REJECTED

def set_error(self):
self.status = self.STATUS.ERROR
self.clear_pending()

@property
def is_graded(self):
Expand Down Expand Up @@ -613,6 +624,19 @@ def get_url_kwargs(self):
def get_inspect_url(self):
return self.get_url("submission-inspect")

def mark_pending(self):
pending, created = PendingSubmission.objects.get_or_create(submission=self)
if not created:
pending.num_retries += 1
pending.submission_time = timezone.now()
pending.save()

def clear_pending(self):
try:
pending = PendingSubmission.objects.get(submission=self)
pending.delete()
except PendingSubmission.DoesNotExist:
logger.warn(f"Submission {self} completed, but was not on PendingSubmissions table.")

class SubmissionDraft(models.Model):
"""
Expand Down
4 changes: 4 additions & 0 deletions locale/en/LC_MESSAGES/django.po
Original file line number Diff line number Diff line change
Expand Up @@ -1426,6 +1426,10 @@ msgstr ""
"<p>Are you sure you want to ban this user from the course?</p><p>Banned "
"users cannot re-enrol in the course.</p>"

#: course/templatetags/base.py
msgid "GRADER_PROBLEMS_ALERT"
msgstr "Automatic grading has problems at the moment. You may experience delay in receiving feedback."

#: course/views.py
msgid "COURSE_INSTANCES_NOT_VISIBLE_TO_STUDENTS"
msgstr "The course instances of this course are not visible to students."
Expand Down
4 changes: 4 additions & 0 deletions locale/fi/LC_MESSAGES/django.po
Original file line number Diff line number Diff line change
Expand Up @@ -1433,6 +1433,10 @@ msgstr ""
"<p>Haluatko varmasti estää tämän käyttäjän pääsyn kurssille?</p><p>Estetyt "
"käyttäjät eivät pysty ilmoittautumaan kurssille uudelleen.</p>"

#: course/templatetags/base.py
msgid "GRADER_PROBLEMS_ALERT"
msgstr "Automaattisessa arvioinnissa on tällä hetkellä ongelmia. Palautteen saamisessa saattaa esiintyä viivettä."

#: course/views.py
msgid "COURSE_INSTANCES_NOT_VISIBLE_TO_STUDENTS"
msgstr "Kurssikerrat eivät ole opiskelijoiden nähtävissä."
Expand Down

0 comments on commit c0078a7

Please sign in to comment.