From 43345ee71927192c349ddabb63d8596bbdd0eeb8 Mon Sep 17 00:00:00 2001 From: Gerrod Ubben Date: Wed, 27 Oct 2021 14:47:41 -0400 Subject: [PATCH] Add `last_serial` sync optimization fixes: #351 --- CHANGES/351.feature | 2 + docs/tech-preview.rst | 1 + .../0011_pythonrepository_last_serial.py | 18 ++++++++ pulp_python/app/models.py | 33 ++++++++++++- pulp_python/app/serializers.py | 10 +++- pulp_python/app/tasks/sync.py | 24 +++++++--- pulp_python/app/viewsets.py | 22 ++++++++- pulp_python/tests/unit/test_models.py | 46 +++++++++++++++++++ 8 files changed, 147 insertions(+), 9 deletions(-) create mode 100644 CHANGES/351.feature create mode 100644 pulp_python/app/migrations/0011_pythonrepository_last_serial.py diff --git a/CHANGES/351.feature b/CHANGES/351.feature new file mode 100644 index 00000000..779ebb52 --- /dev/null +++ b/CHANGES/351.feature @@ -0,0 +1,2 @@ +Added ``last_serial`` sync optimization to Python repositories. +Subsequent syncs will use ``last_serial`` to get the changed packages since the previous sync. diff --git a/docs/tech-preview.rst b/docs/tech-preview.rst index a2699c1f..c6c362e1 100644 --- a/docs/tech-preview.rst +++ b/docs/tech-preview.rst @@ -8,3 +8,4 @@ The following features are currently being released as part of a tech preview * PyPI’s json API at content endpoint ‘/pypi/{package-name}/json’. Allows for basic Pulp-to-Pulp syncing. * Fully mirror Python repositories like PyPI. * ``Twine`` upload packages to indexes at endpoints '/simple` or '/legacy'. +* Auto-optimize subsequent full mirror syncs using PyPI's last_serial field. diff --git a/pulp_python/app/migrations/0011_pythonrepository_last_serial.py b/pulp_python/app/migrations/0011_pythonrepository_last_serial.py new file mode 100644 index 00000000..c3e812e7 --- /dev/null +++ b/pulp_python/app/migrations/0011_pythonrepository_last_serial.py @@ -0,0 +1,18 @@ +# Generated by Django 3.2.8 on 2021-10-21 20:25 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('python', '0010_update_json_field'), + ] + + operations = [ + migrations.AddField( + model_name='pythonrepository', + name='last_serial', + field=models.IntegerField(default=0), + ), + ] diff --git a/pulp_python/app/models.py b/pulp_python/app/models.py index 7c477d9c..fdef2f38 100644 --- a/pulp_python/app/models.py +++ b/pulp_python/app/models.py @@ -3,8 +3,9 @@ from aiohttp.web import json_response from django.contrib.postgres.fields import ArrayField from django.core.exceptions import ObjectDoesNotExist -from django.db import models +from django.db import models, transaction from django.conf import settings +from django_lifecycle import hook, BEFORE_UPDATE from yarl import URL from pulpcore.plugin.models import ( @@ -216,6 +217,28 @@ class PythonRemote(Remote): class Meta: default_related_name = "%(app_label)s_%(model_name)s" + @hook( + BEFORE_UPDATE, + when_any=[ + "excludes", + "prereleases", + "package_types", + "keep_latest_packages", + "exclude_platforms", + "url", + "policy" + ], + has_changed=True + ) + def clear_last_serial(self): + with transaction.atomic(): + repos = PythonRepository.objects.filter(remote_id=self.pk, last_serial__gt=0) + if repos: + for repo in repos: + repo.last_serial = 0 + PythonRepository.objects.bulk_update(repos, ["last_serial"]) + + class PythonRepository(Repository): """ @@ -227,6 +250,7 @@ class PythonRepository(Repository): REMOTE_TYPES = [PythonRemote] autopublish = models.BooleanField(default=False) + last_serial = models.IntegerField(default=0) class Meta: default_related_name = "%(app_label)s_%(model_name)s" @@ -252,3 +276,10 @@ def finalize_new_version(self, new_version): """ remove_duplicates(new_version) validate_repo_version(new_version) + + @hook(BEFORE_UPDATE, when="remote", has_changed=True) + def clear_last_serial(self): + """ + Reset `last_serial` when remote on repository changes. + """ + self.last_serial = 0 diff --git a/pulp_python/app/serializers.py b/pulp_python/app/serializers.py index b823df5d..a89329a1 100644 --- a/pulp_python/app/serializers.py +++ b/pulp_python/app/serializers.py @@ -28,9 +28,17 @@ class PythonRepositorySerializer(core_serializers.RepositorySerializer): default=False, required=False, ) + last_serial = serializers.IntegerField( + help_text=_( + "The serial number from the last successful sync. Used in the sync process to " + "optimize the sync based on changes from previous sync. Use mirror=True to bypass" + "this optimization." + ), + read_only=True, + ) class Meta: - fields = core_serializers.RepositorySerializer.Meta.fields + ("autopublish",) + fields = core_serializers.RepositorySerializer.Meta.fields + ("autopublish", "last_serial") model = python_models.PythonRepository diff --git a/pulp_python/app/tasks/sync.py b/pulp_python/app/tasks/sync.py index 641aeab5..4edc5d46 100644 --- a/pulp_python/app/tasks/sync.py +++ b/pulp_python/app/tasks/sync.py @@ -3,9 +3,10 @@ from gettext import gettext as _ from os import environ +from django.db import transaction from rest_framework import serializers -from pulpcore.plugin.models import Artifact, ProgressReport, Remote, Repository +from pulpcore.plugin.models import Artifact, ProgressReport, Remote from pulpcore.plugin.stages import ( DeclarativeArtifact, DeclarativeContent, @@ -16,6 +17,7 @@ from pulp_python.app.models import ( PythonPackageContent, PythonRemote, + PythonRepository, ) from pulp_python.app.utils import parse_metadata @@ -43,15 +45,21 @@ def sync(remote_pk, repository_pk, mirror): """ remote = PythonRemote.objects.get(pk=remote_pk) - repository = Repository.objects.get(pk=repository_pk) + repository = PythonRepository.objects.get(pk=repository_pk) if not remote.url: raise serializers.ValidationError( detail=_("A remote must have a url attribute to sync.") ) - first_stage = PythonBanderStage(remote) - DeclarativeVersion(first_stage, repository, mirror).create() + same_remote = getattr(repository.remote, "pk", None) == remote_pk + serial = repository.last_serial if same_remote else 0 + first_stage = PythonBanderStage(remote, mirror, serial) + version = DeclarativeVersion(first_stage, repository, mirror).create() + if version is not None and same_remote: + if first_stage.next_serial and first_stage.next_serial != repository.last_serial: + repository.last_serial = first_stage.next_serial + repository.save() def create_bandersnatch_config(remote): @@ -97,10 +105,13 @@ class PythonBanderStage(Stage): Python Package Syncing Stage using Bandersnatch """ - def __init__(self, remote): + def __init__(self, remote, mirror, last_serial): """Initialize the stage and Bandersnatch config""" super().__init__() self.remote = remote + # If mirror=True, then sync everything, don't use serial + self.serial = last_serial if not mirror else 0 + self.next_serial = None create_bandersnatch_config(remote) async def run(self): @@ -119,7 +130,7 @@ async def run(self): message="Fetching Project Metadata", code="sync.fetching.project" ) as p: pmirror = PulpMirror( - serial=0, # Serial currently isn't supported by Pulp + serial=self.serial, master=master, workers=workers, deferred_download=deferred_download, @@ -132,6 +143,7 @@ async def run(self): Requirement(pkg).name for pkg in self.remote.includes ] await pmirror.synchronize(packages_to_sync) + self.next_serial = pmirror.target_serial class PulpMirror(Mirror): diff --git a/pulp_python/app/viewsets.py b/pulp_python/app/viewsets.py index 88f38121..cb211d79 100644 --- a/pulp_python/app/viewsets.py +++ b/pulp_python/app/viewsets.py @@ -11,7 +11,7 @@ AsyncOperationResponseSerializer, RepositorySyncURLSerializer, ) -from pulpcore.plugin.tasking import dispatch +from pulpcore.plugin.tasking import dispatch, general_update from pulp_python.app import models as python_models from pulp_python.app import serializers as python_serializers @@ -135,6 +135,26 @@ class PythonRemoteViewSet(core_viewsets.RemoteViewSet): queryset = python_models.PythonRemote.objects.all() serializer_class = python_serializers.PythonRemoteSerializer + @extend_schema( + description="Trigger an asynchronous update task", + responses={202: AsyncOperationResponseSerializer}, + ) + def update(self, request, pk, **kwargs): + """Update remote.""" + partial = kwargs.pop("partial", False) + lock = [self.get_object()] + serializer = self.get_serializer(lock[0], data=request.data, partial=partial) + serializer.is_valid(raise_exception=True) + repos = python_models.PythonRepository.objects.filter(remote_id=pk, last_serial__gt=0) + lock.extend(repos) + async_result = dispatch( + general_update, + exclusive_resources=lock, + args=(pk, lock[1:]), + kwargs={"data": request.data, "partial": partial}, + ) + return core_viewsets.OperationPostponedResponse(async_result, request) + @extend_schema( summary="Create from Bandersnatch", responses={201: python_serializers.PythonRemoteSerializer}, diff --git a/pulp_python/tests/unit/test_models.py b/pulp_python/tests/unit/test_models.py index 16e9754d..dc33d4c4 100644 --- a/pulp_python/tests/unit/test_models.py +++ b/pulp_python/tests/unit/test_models.py @@ -1,5 +1,12 @@ from django.test import TestCase +from pulp_python.app.models import PythonRemote, PythonRepository +from pulp_python.app.tasks import update_remote + + +DEFAULT_SERIAL = 10000 +MAX_SERIAL = 20000 + class TestNothing(TestCase): """Test Nothing (placeholder).""" @@ -7,3 +14,42 @@ class TestNothing(TestCase): def test_nothing_at_all(self): """Test that the tests are running and that's it.""" self.assertTrue(True) + + +class TestRepositoryLastSerial(TestCase): + """Tests `last_serial` gets properly set and reset with remote changes.""" + + def setUp(self): + """Set up class with repository with `last_serial` set.""" + self.remote = PythonRemote.objects.create(name="test", url="https://pypi.org") + self.repo = PythonRepository.objects.create( + name="test", remote=self.remote, last_serial=DEFAULT_SERIAL + ) + + def test_remote_change(self): + """Test that `last_serial` gets reset upon remote change.""" + self.assertEqual(self.repo.remote.pk, self.remote.pk) + self.assertEqual(self.repo.last_serial, DEFAULT_SERIAL) + self.repo.remote = None + self.repo.save() + self.repo.refresh_from_db() + self.assertEqual(self.repo.last_serial, 0) + + def test_remote_update(self): + """Test that updating a remote will reset `last_serial`.""" + self.assertEqual(self.repo.remote.pk, self.remote.pk) + self.assertEqual(self.repo.last_serial, DEFAULT_SERIAL) + # Remote is only updated through update task + new_body = {"url": "https://test.pypi.org"} + update_remote(self.remote.pk, (self.repo.pk,), data=new_body, partial=True) + self.repo.refresh_from_db() + self.assertEqual(self.repo.last_serial, 0) + + def test_remote_update_no_change(self): + """Test that changing 'includes' field doesn't reset `last_serial`.""" + self.assertEqual(self.repo.remote.pk, self.remote.pk) + self.assertEqual(self.repo.last_serial, DEFAULT_SERIAL) + new_body = {"includes": ["shelf-reader"]} + update_remote(self.remote.pk, (self.repo.pk,), data=new_body, partial=True) + self.repo.refresh_from_db() + self.assertEqual(self.repo.last_serial, DEFAULT_SERIAL)