Skip to content

Commit

Permalink
Add last_serial sync optimization
Browse files Browse the repository at this point in the history
fixes: pulp#351
  • Loading branch information
gerrod3 committed Nov 12, 2021
1 parent ecdb2df commit 43345ee
Show file tree
Hide file tree
Showing 8 changed files with 147 additions and 9 deletions.
2 changes: 2 additions & 0 deletions CHANGES/351.feature
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Added ``last_serial`` sync optimization to Python repositories.
Subsequent syncs will use ``last_serial`` to get the changed packages since the previous sync.
1 change: 1 addition & 0 deletions docs/tech-preview.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@ The following features are currently being released as part of a tech preview
* PyPI’s json API at content endpoint ‘/pypi/{package-name}/json’. Allows for basic Pulp-to-Pulp syncing.
* Fully mirror Python repositories like PyPI.
* ``Twine`` upload packages to indexes at endpoints '/simple` or '/legacy'.
* Auto-optimize subsequent full mirror syncs using PyPI's last_serial field.
18 changes: 18 additions & 0 deletions pulp_python/app/migrations/0011_pythonrepository_last_serial.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Generated by Django 3.2.8 on 2021-10-21 20:25

from django.db import migrations, models


class Migration(migrations.Migration):

dependencies = [
('python', '0010_update_json_field'),
]

operations = [
migrations.AddField(
model_name='pythonrepository',
name='last_serial',
field=models.IntegerField(default=0),
),
]
33 changes: 32 additions & 1 deletion pulp_python/app/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,9 @@
from aiohttp.web import json_response
from django.contrib.postgres.fields import ArrayField
from django.core.exceptions import ObjectDoesNotExist
from django.db import models
from django.db import models, transaction
from django.conf import settings
from django_lifecycle import hook, BEFORE_UPDATE
from yarl import URL

from pulpcore.plugin.models import (
Expand Down Expand Up @@ -216,6 +217,28 @@ class PythonRemote(Remote):
class Meta:
default_related_name = "%(app_label)s_%(model_name)s"

@hook(
BEFORE_UPDATE,
when_any=[
"excludes",
"prereleases",
"package_types",
"keep_latest_packages",
"exclude_platforms",
"url",
"policy"
],
has_changed=True
)
def clear_last_serial(self):
with transaction.atomic():
repos = PythonRepository.objects.filter(remote_id=self.pk, last_serial__gt=0)
if repos:
for repo in repos:
repo.last_serial = 0
PythonRepository.objects.bulk_update(repos, ["last_serial"])



class PythonRepository(Repository):
"""
Expand All @@ -227,6 +250,7 @@ class PythonRepository(Repository):
REMOTE_TYPES = [PythonRemote]

autopublish = models.BooleanField(default=False)
last_serial = models.IntegerField(default=0)

class Meta:
default_related_name = "%(app_label)s_%(model_name)s"
Expand All @@ -252,3 +276,10 @@ def finalize_new_version(self, new_version):
"""
remove_duplicates(new_version)
validate_repo_version(new_version)

@hook(BEFORE_UPDATE, when="remote", has_changed=True)
def clear_last_serial(self):
"""
Reset `last_serial` when remote on repository changes.
"""
self.last_serial = 0
10 changes: 9 additions & 1 deletion pulp_python/app/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,17 @@ class PythonRepositorySerializer(core_serializers.RepositorySerializer):
default=False,
required=False,
)
last_serial = serializers.IntegerField(
help_text=_(
"The serial number from the last successful sync. Used in the sync process to "
"optimize the sync based on changes from previous sync. Use mirror=True to bypass"
"this optimization."
),
read_only=True,
)

class Meta:
fields = core_serializers.RepositorySerializer.Meta.fields + ("autopublish",)
fields = core_serializers.RepositorySerializer.Meta.fields + ("autopublish", "last_serial")
model = python_models.PythonRepository


Expand Down
24 changes: 18 additions & 6 deletions pulp_python/app/tasks/sync.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,10 @@
from gettext import gettext as _
from os import environ

from django.db import transaction
from rest_framework import serializers

from pulpcore.plugin.models import Artifact, ProgressReport, Remote, Repository
from pulpcore.plugin.models import Artifact, ProgressReport, Remote
from pulpcore.plugin.stages import (
DeclarativeArtifact,
DeclarativeContent,
Expand All @@ -16,6 +17,7 @@
from pulp_python.app.models import (
PythonPackageContent,
PythonRemote,
PythonRepository,
)
from pulp_python.app.utils import parse_metadata

Expand Down Expand Up @@ -43,15 +45,21 @@ def sync(remote_pk, repository_pk, mirror):
"""
remote = PythonRemote.objects.get(pk=remote_pk)
repository = Repository.objects.get(pk=repository_pk)
repository = PythonRepository.objects.get(pk=repository_pk)

if not remote.url:
raise serializers.ValidationError(
detail=_("A remote must have a url attribute to sync.")
)

first_stage = PythonBanderStage(remote)
DeclarativeVersion(first_stage, repository, mirror).create()
same_remote = getattr(repository.remote, "pk", None) == remote_pk
serial = repository.last_serial if same_remote else 0
first_stage = PythonBanderStage(remote, mirror, serial)
version = DeclarativeVersion(first_stage, repository, mirror).create()
if version is not None and same_remote:
if first_stage.next_serial and first_stage.next_serial != repository.last_serial:
repository.last_serial = first_stage.next_serial
repository.save()


def create_bandersnatch_config(remote):
Expand Down Expand Up @@ -97,10 +105,13 @@ class PythonBanderStage(Stage):
Python Package Syncing Stage using Bandersnatch
"""

def __init__(self, remote):
def __init__(self, remote, mirror, last_serial):
"""Initialize the stage and Bandersnatch config"""
super().__init__()
self.remote = remote
# If mirror=True, then sync everything, don't use serial
self.serial = last_serial if not mirror else 0
self.next_serial = None
create_bandersnatch_config(remote)

async def run(self):
Expand All @@ -119,7 +130,7 @@ async def run(self):
message="Fetching Project Metadata", code="sync.fetching.project"
) as p:
pmirror = PulpMirror(
serial=0, # Serial currently isn't supported by Pulp
serial=self.serial,
master=master,
workers=workers,
deferred_download=deferred_download,
Expand All @@ -132,6 +143,7 @@ async def run(self):
Requirement(pkg).name for pkg in self.remote.includes
]
await pmirror.synchronize(packages_to_sync)
self.next_serial = pmirror.target_serial


class PulpMirror(Mirror):
Expand Down
22 changes: 21 additions & 1 deletion pulp_python/app/viewsets.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
AsyncOperationResponseSerializer,
RepositorySyncURLSerializer,
)
from pulpcore.plugin.tasking import dispatch
from pulpcore.plugin.tasking import dispatch, general_update

from pulp_python.app import models as python_models
from pulp_python.app import serializers as python_serializers
Expand Down Expand Up @@ -135,6 +135,26 @@ class PythonRemoteViewSet(core_viewsets.RemoteViewSet):
queryset = python_models.PythonRemote.objects.all()
serializer_class = python_serializers.PythonRemoteSerializer

@extend_schema(
description="Trigger an asynchronous update task",
responses={202: AsyncOperationResponseSerializer},
)
def update(self, request, pk, **kwargs):
"""Update remote."""
partial = kwargs.pop("partial", False)
lock = [self.get_object()]
serializer = self.get_serializer(lock[0], data=request.data, partial=partial)
serializer.is_valid(raise_exception=True)
repos = python_models.PythonRepository.objects.filter(remote_id=pk, last_serial__gt=0)
lock.extend(repos)
async_result = dispatch(
general_update,
exclusive_resources=lock,
args=(pk, lock[1:]),
kwargs={"data": request.data, "partial": partial},
)
return core_viewsets.OperationPostponedResponse(async_result, request)

@extend_schema(
summary="Create from Bandersnatch",
responses={201: python_serializers.PythonRemoteSerializer},
Expand Down
46 changes: 46 additions & 0 deletions pulp_python/tests/unit/test_models.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,55 @@
from django.test import TestCase

from pulp_python.app.models import PythonRemote, PythonRepository
from pulp_python.app.tasks import update_remote


DEFAULT_SERIAL = 10000
MAX_SERIAL = 20000


class TestNothing(TestCase):
"""Test Nothing (placeholder)."""

def test_nothing_at_all(self):
"""Test that the tests are running and that's it."""
self.assertTrue(True)


class TestRepositoryLastSerial(TestCase):
"""Tests `last_serial` gets properly set and reset with remote changes."""

def setUp(self):
"""Set up class with repository with `last_serial` set."""
self.remote = PythonRemote.objects.create(name="test", url="https://pypi.org")
self.repo = PythonRepository.objects.create(
name="test", remote=self.remote, last_serial=DEFAULT_SERIAL
)

def test_remote_change(self):
"""Test that `last_serial` gets reset upon remote change."""
self.assertEqual(self.repo.remote.pk, self.remote.pk)
self.assertEqual(self.repo.last_serial, DEFAULT_SERIAL)
self.repo.remote = None
self.repo.save()
self.repo.refresh_from_db()
self.assertEqual(self.repo.last_serial, 0)

def test_remote_update(self):
"""Test that updating a remote will reset `last_serial`."""
self.assertEqual(self.repo.remote.pk, self.remote.pk)
self.assertEqual(self.repo.last_serial, DEFAULT_SERIAL)
# Remote is only updated through update task
new_body = {"url": "https://test.pypi.org"}
update_remote(self.remote.pk, (self.repo.pk,), data=new_body, partial=True)
self.repo.refresh_from_db()
self.assertEqual(self.repo.last_serial, 0)

def test_remote_update_no_change(self):
"""Test that changing 'includes' field doesn't reset `last_serial`."""
self.assertEqual(self.repo.remote.pk, self.remote.pk)
self.assertEqual(self.repo.last_serial, DEFAULT_SERIAL)
new_body = {"includes": ["shelf-reader"]}
update_remote(self.remote.pk, (self.repo.pk,), data=new_body, partial=True)
self.repo.refresh_from_db()
self.assertEqual(self.repo.last_serial, DEFAULT_SERIAL)

0 comments on commit 43345ee

Please sign in to comment.