not prefetching images when not needed #8676

Merged 21 commits on Dec 4, 2024

Changes from 6 commits
@@ -0,0 +1,4 @@
+### Fixed
+
+- Optimized memory consumption and reduced the number of database queries when importing annotations into a task with many jobs and images
+  (<https://github.com/cvat-ai/cvat/pull/8676>)
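The change makes image prefetching opt-in for `JobAnnotation`. A minimal sketch of the resulting call pattern, using only names that appear in the diff below:

```python
# Default (prefetch_images=False): the per-frame image prefetch is skipped.
# Suits callers that only touch annotations, e.g. the PUT/PATCH/DELETE
# job data paths in cvat/apps/dataset_manager/task.py.
annotation = JobAnnotation(job_id)

# Callers that need frame information (export_job, import_job_annotations,
# the quality report comparison) now opt in explicitly.
annotation = JobAnnotation(job_id, prefetch_images=True)
```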
32 changes: 22 additions & 10 deletions cvat/apps/dataset_manager/task.py
@@ -13,7 +13,7 @@
 from datumaro.components.errors import DatasetError, DatasetImportError, DatasetNotFoundError

 from django.db import transaction
-from django.db.models.query import Prefetch
+from django.db.models.query import Prefetch, QuerySet
 from django.conf import settings
 from rest_framework.exceptions import ValidationError

@@ -81,9 +81,10 @@ def merge_table_rows(rows, keys_for_merge, field_id):

     return list(merged_rows.values())

+
 class JobAnnotation:
     @classmethod
-    def add_prefetch_info(cls, queryset):
+    def add_prefetch_info(cls, queryset: QuerySet, prefetch_images: bool = True):
         assert issubclass(queryset.model, models.Job)

         label_qs = add_prefetch_fields(models.Label.objects.all(), [
@@ -93,6 +94,12 @@ def add_prefetch_info(cls, queryset):
         ])
         label_qs = JobData.add_prefetch_info(label_qs)

+        task_data_queryset = models.Data.objects.all()
+        if prefetch_images:
+            task_data_queryset = task_data_queryset.select_related('video').prefetch_related(
+                Prefetch('images', queryset=models.Image.objects.order_by('frame'))
+            )
+
         return queryset.select_related(
             'segment',
             'segment__task',
Comment on lines 103 to 105
@zhiltsov-max (Contributor), Nov 26, 2024

FYI, I don't think this impacts memory use heavily at the moment. It seems that using select_related results in different Segment and Task objects in Python, even if they are actually the same DB row. prefetch_related, however, results in the same objects with the same ids. As there are many segments using the same task, it makes sense to use prefetch_related instead in such cases if memory use is the concern. prefetch_related will result in separate queries, though.
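A minimal sketch of the identity difference described here, assuming a configured Django environment with the CVAT models and two jobs whose segments belong to the same task (illustrative only, not part of this PR):

```python
from cvat.apps.engine import models

# select_related joins in SQL and builds a fresh related instance per
# result row, so jobs sharing a task still hold distinct Task objects.
a, b = models.Job.objects.select_related('segment__task')[:2]
assert a.segment.task.id == b.segment.task.id  # same DB row (assumed)
assert a.segment.task is not b.segment.task    # but duplicated in memory

# prefetch_related runs a separate query per relation and maps results
# by primary key, so the shared row becomes one shared Python object.
a, b = models.Job.objects.prefetch_related('segment__task')[:2]
assert a.segment.task is b.segment.task        # one object, shared
```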

@@ -103,18 +110,15 @@ def add_prefetch_info(cls, queryset):
             'segment__task__project__owner',
             'segment__task__project__assignee',

-            Prefetch('segment__task__data',
-                queryset=models.Data.objects.select_related('video').prefetch_related(
-                    Prefetch('images', queryset=models.Image.objects.order_by('frame'))
-                )),
+            Prefetch('segment__task__data', queryset=task_data_queryset),

             Prefetch('segment__task__label_set', queryset=label_qs),
             Prefetch('segment__task__project__label_set', queryset=label_qs),
         )

-    def __init__(self, pk, *, is_prefetched=False, queryset=None):
+    def __init__(self, pk, *, is_prefetched: bool = False, queryset: QuerySet = None, prefetch_images: bool = False):
         if queryset is None:
-            queryset = self.add_prefetch_info(models.Job.objects)
+            queryset = self.add_prefetch_info(models.Job.objects, prefetch_images=prefetch_images)

         if is_prefetched:
             self.db_job: models.Job = queryset.select_related(
@@ -1006,6 +1010,7 @@ def get_job_data(pk):

     return annotation.data

+
 @silk_profile(name="POST job data")
 @transaction.atomic
 def put_job_data(pk, data):
@@ -1014,6 +1019,7 @@

     return annotation.data

+
 @silk_profile(name="UPDATE job data")
 @plugin_decorator
 @transaction.atomic
@@ -1028,26 +1034,29 @@ def patch_job_data(pk, data, action):

     return annotation.data

+
 @silk_profile(name="DELETE job data")
 @transaction.atomic
 def delete_job_data(pk):
     annotation = JobAnnotation(pk)
     annotation.delete()

+
 def export_job(job_id, dst_file, format_name, server_url=None, save_images=False):
     # For big tasks the dump function may run for a long time, and
     # we don't need to hold the lock after the task has been initialized from the DB.
     # But there is a bug with a corrupted dump file when two or
     # more dump requests are received at the same time:
     # https://github.com/cvat-ai/cvat/issues/217
     with transaction.atomic():
-        job = JobAnnotation(job_id)
+        job = JobAnnotation(job_id, prefetch_images=True)
         job.init_from_db()

     exporter = make_exporter(format_name)
     with open(dst_file, 'wb') as f:
         job.export(f, exporter, host=server_url, save_images=save_images)

+
 @silk_profile(name="GET task data")
 @transaction.atomic
 def get_task_data(pk):
@@ -1056,6 +1065,7 @@

     return annotation.data

+
 @silk_profile(name="POST task data")
 @transaction.atomic
 def put_task_data(pk, data):
@@ -1064,6 +1074,7 @@

     return annotation.data

+
 @silk_profile(name="UPDATE task data")
 @transaction.atomic
 def patch_task_data(pk, data, action):
@@ -1108,9 +1119,10 @@ def import_task_annotations(src_file, task_id, format_name, conv_mask_to_poly):
     except (DatasetError, DatasetImportError, DatasetNotFoundError) as ex:
         raise CvatImportError(str(ex))

+
 @transaction.atomic
 def import_job_annotations(src_file, job_id, format_name, conv_mask_to_poly):
-    job = JobAnnotation(job_id)
+    job = JobAnnotation(job_id, prefetch_images=True)

     importer = make_importer(format_name)
     with open(src_file, 'rb') as f:
2 changes: 1 addition & 1 deletion cvat/apps/quality_control/quality_reports.py
@@ -572,7 +572,7 @@ def add_prefetch_info(cls, queryset):
     @transaction.atomic
     def __init__(self, job_id: int, *, queryset=None, included_frames=None) -> None:
         self.job_id = job_id
-        self.job_annotation = JobAnnotation(job_id, queryset=queryset)
+        self.job_annotation = JobAnnotation(job_id, queryset=queryset, prefetch_images=True)
         self.job_annotation.init_from_db()
         self.job_data = JobData(
             annotation_ir=self.job_annotation.ir_data,