From cf9c12b5aba5e52a3085989a83e3d330460e56c1 Mon Sep 17 00:00:00 2001 From: Andrey Zhavoronkov Date: Mon, 30 Oct 2023 14:54:03 +0200 Subject: [PATCH 1/7] chunk preparation optimization --- cvat/apps/engine/cache.py | 10 ++-- cvat/apps/engine/media_extractors.py | 71 +++++++++++++----------- cvat/apps/engine/task.py | 81 ++++++++++++++++++++-------- cvat/apps/engine/utils.py | 10 +++- cvat/settings/base.py | 3 ++ 5 files changed, 117 insertions(+), 58 deletions(-) diff --git a/cvat/apps/engine/cache.py b/cvat/apps/engine/cache.py index 6f88ed51290c..168bc23eaf02 100644 --- a/cvat/apps/engine/cache.py +++ b/cvat/apps/engine/cache.py @@ -33,7 +33,7 @@ from cvat.apps.engine.mime_types import mimetypes from cvat.apps.engine.models import (DataChoice, DimensionType, Job, Image, StorageChoice, CloudStorage) -from cvat.apps.engine.utils import md5_hash +from cvat.apps.engine.utils import md5_hash, preload_images from utils.dataset_manifest import ImageManifestManager slogger = ServerLogManager(__name__) @@ -117,7 +117,7 @@ def _get_frame_provider_class(): @staticmethod @contextmanager - def _get_images(db_data, chunk_number): + def _get_images(db_data, chunk_number, dimension): images = [] tmp_dir = None upload_dir = { @@ -168,6 +168,8 @@ def _get_images(db_data, chunk_number): images.append((fs_filename, fs_filename, None)) cloud_storage_instance.bulk_download_to_dir(files=files_to_download, upload_dir=tmp_dir) + if dimension == DimensionType.DIM_2D: + images = preload_images(images) for checksum, (_, fs_filename, _) in zip(checksums, images): if checksum and not md5_hash(fs_filename) == checksum: @@ -176,6 +178,8 @@ def _get_images(db_data, chunk_number): for item in reader: source_path = os.path.join(upload_dir, f"{item['name']}{item['extension']}") images.append((source_path, source_path, None)) + if dimension == DimensionType.DIM_2D: + images = preload_images(images) yield images finally: @@ -199,7 +203,7 @@ def _prepare_task_chunk(self, db_data, quality, 
chunk_number): writer = writer_classes[quality](image_quality, **kwargs) buff = BytesIO() - with self._get_images(db_data, chunk_number) as images: + with self._get_images(db_data, chunk_number, self._dimension) as images: writer.save_as_chunk(images, buff) buff.seek(0) diff --git a/cvat/apps/engine/media_extractors.py b/cvat/apps/engine/media_extractors.py index 5f64a93f3024..3de0c0829d4d 100644 --- a/cvat/apps/engine/media_extractors.py +++ b/cvat/apps/engine/media_extractors.py @@ -12,6 +12,7 @@ from enum import IntEnum from abc import ABC, abstractmethod from contextlib import closing +from typing import Iterable import av import numpy as np @@ -587,12 +588,17 @@ def __init__(self, quality, dimension=DimensionType.DIM_2D): self._dimension = dimension @staticmethod - def _compress_image(image_path, quality): - if isinstance(image_path, av.VideoFrame): - image = image_path.to_image() - else: - with Image.open(image_path) as source_image: - image = ImageOps.exif_transpose(source_image) + def _compress_image(source_image: av.VideoFrame | io.IOBase | Image.Image, quality: int) -> tuple[int, int, io.BytesIO]: + image = None + if isinstance(source_image, av.VideoFrame): + image = source_image.to_image() + elif isinstance(source_image, io.IOBase): + with Image.open(source_image) as _img: + image = ImageOps.exif_transpose(_img) + elif isinstance(source_image, Image.Image): + image = source_image + + assert image is not None # Ensure image data fits into 8bit per pixel before RGB conversion as PIL clips values on conversion if image.mode == "I": @@ -619,7 +625,7 @@ def _compress_image(image_path, quality): image = ImageOps.equalize(image) # The Images need equalization. 
High resolution with 16-bit but only small range that actually contains information converted_image = image.convert('RGB') - image.close() + try: buf = io.BytesIO() converted_image.save(buf, format='JPEG', quality=quality, optimize=True) @@ -637,7 +643,7 @@ class ZipChunkWriter(IChunkWriter): IMAGE_EXT = 'jpeg' POINT_CLOUD_EXT = 'pcd' - def _write_pcd_file(self, image): + def _write_pcd_file(self, image: str|io.BytesIO) -> tuple[io.BytesIO, str, int, int]: image_buf = open(image, "rb") if isinstance(image, str) else image try: properties = ValidateDimension.get_pcd_properties(image_buf) @@ -648,33 +654,32 @@ def _write_pcd_file(self, image): if isinstance(image, str): image_buf.close() - def save_as_chunk(self, images, chunk_path): + def save_as_chunk(self, images: Iterable[tuple[Image.Image|io.IOBase|str, str, str]], chunk_path: str): with zipfile.ZipFile(chunk_path, 'x') as zip_chunk: for idx, (image, path, _) in enumerate(images): ext = os.path.splitext(path)[1].replace('.', '') output = io.BytesIO() if self._dimension == DimensionType.DIM_2D: - with Image.open(image) as pil_image: - if has_exif_rotation(pil_image): - rot_image = ImageOps.exif_transpose(pil_image) - try: - if rot_image.format == 'TIFF': - # https://pillow.readthedocs.io/en/stable/handbook/image-file-formats.html - # use loseless lzw compression for tiff images - rot_image.save(output, format='TIFF', compression='tiff_lzw') - else: - rot_image.save( - output, - format=rot_image.format if rot_image.format else self.IMAGE_EXT, - quality=100, - subsampling=0 - ) - finally: - rot_image.close() - else: - output = image + if has_exif_rotation(image): + rot_image = ImageOps.exif_transpose(image) + try: + if rot_image.format == 'TIFF': + # https://pillow.readthedocs.io/en/stable/handbook/image-file-formats.html + # use loseless lzw compression for tiff images + rot_image.save(output, format='TIFF', compression='tiff_lzw') + else: + rot_image.save( + output, + format=rot_image.format if rot_image.format 
else self.IMAGE_EXT, + quality=100, + subsampling=0 + ) + finally: + rot_image.close() + else: + output = path else: - output, ext = self._write_pcd_file(image)[0:2] + output, ext = self._write_pcd_file(path)[0:2] arcname = '{:06d}.{}'.format(idx, ext) if isinstance(output, io.BytesIO): @@ -687,11 +692,13 @@ def save_as_chunk(self, images, chunk_path): class ZipCompressedChunkWriter(ZipChunkWriter): def save_as_chunk( - self, images, chunk_path, *, compress_frames: bool = True, zip_compress_level: int = 0 + self, + images: Iterable[tuple[Image.Image|io.IOBase|str, str, str]], + chunk_path: str, *, compress_frames: bool = True, zip_compress_level: int = 0 ): image_sizes = [] with zipfile.ZipFile(chunk_path, 'x', compresslevel=zip_compress_level) as zip_chunk: - for idx, (image, _, _) in enumerate(images): + for idx, (image, path, _) in enumerate(images): if self._dimension == DimensionType.DIM_2D: if compress_frames: w, h, image_buf = self._compress_image(image, self._image_quality) @@ -702,7 +709,7 @@ def save_as_chunk( w, h = img.size extension = self.IMAGE_EXT else: - image_buf, extension, w, h = self._write_pcd_file(image) + image_buf, extension, w, h = self._write_pcd_file(path) image_sizes.append((w, h)) arcname = '{:06d}.{}'.format(idx, extension) zip_chunk.writestr(arcname, image_buf.getvalue()) diff --git a/cvat/apps/engine/task.py b/cvat/apps/engine/task.py index f9956e7a4e77..199c173bedbb 100644 --- a/cvat/apps/engine/task.py +++ b/cvat/apps/engine/task.py @@ -7,7 +7,7 @@ import fnmatch import os import sys -from typing import Any, Dict, Iterator, List, NamedTuple, Optional, Union +from typing import Any, Dict, Iterator, List, NamedTuple, Optional, Union, Iterable from rest_framework.serializers import ValidationError import rq import re @@ -17,6 +17,10 @@ from urllib import request as urlrequest import django_rq import pytz +import concurrent.futures +import queue + +from PIL import Image from django.conf import settings from django.db import transaction 
@@ -27,7 +31,7 @@ from cvat.apps.engine.log import ServerLogManager from cvat.apps.engine.media_extractors import (MEDIA_TYPES, ImageListReader, Mpeg4ChunkWriter, Mpeg4CompressedChunkWriter, ValidateDimension, ZipChunkWriter, ZipCompressedChunkWriter, get_mime, sort) -from cvat.apps.engine.utils import av_scan_paths,get_rq_job_meta, define_dependent_job, get_rq_lock_by_user +from cvat.apps.engine.utils import av_scan_paths,get_rq_job_meta, define_dependent_job, get_rq_lock_by_user, preload_images from cvat.utils.http import make_requests_session, PROXIES_FOR_UNTRUSTED_URLS from utils.dataset_manifest import ImageManifestManager, VideoManifestManager, is_manifest from utils.dataset_manifest.core import VideoManifestValidator, is_dataset_manifest @@ -1025,37 +1029,70 @@ def _update_status(msg): frame=frame, width=w, height=h) for (path, frame), (w, h) in zip(chunk_paths, img_sizes) ]) - if db_data.storage_method == models.StorageMethodChoice.FILE_SYSTEM or not settings.USE_CACHE: counter = itertools.count() - generator = itertools.groupby(extractor, lambda x: next(counter) // db_data.chunk_size) - for chunk_idx, chunk_data in generator: - chunk_data = list(chunk_data) - original_chunk_path = db_data.get_original_chunk_path(chunk_idx) - original_chunk_writer.save_as_chunk(chunk_data, original_chunk_path) + generator = itertools.groupby(extractor, lambda _: next(counter) // db_data.chunk_size) + generator = ((idx, list(chunk_data)) for idx, chunk_data in generator) + + def save_chunks( + executor: concurrent.futures.ThreadPoolExecutor, + chunk_idx: int, + chunk_data: Iterable[tuple[str, str, str]]) -> list[tuple[str, int, tuple[int, int]]]: + if (isinstance(extractor, MEDIA_TYPES['image']['extractor']) or + isinstance(extractor, MEDIA_TYPES['zip']['extractor']) or + isinstance(extractor, MEDIA_TYPES['pdf']['extractor']) or + isinstance(extractor, MEDIA_TYPES['archive']['extractor'])): + chunk_data = preload_images(chunk_data) + + fs_original = executor.submit( + 
original_chunk_writer.save_as_chunk, + images=chunk_data, + chunk_path=db_data.get_original_chunk_path(chunk_idx) + ) + fs_compressed = executor.submit( + compressed_chunk_writer.save_as_chunk, + images=chunk_data, + chunk_path=db_data.get_compressed_chunk_path(chunk_idx), + ) + fs_original.result() + image_sizes = fs_compressed.result() - compressed_chunk_path = db_data.get_compressed_chunk_path(chunk_idx) - img_sizes = compressed_chunk_writer.save_as_chunk(chunk_data, compressed_chunk_path) + # (path, frame, size) + return list((i[0][1], i[0][2], i[1]) for i in zip(chunk_data, image_sizes)) + + def process_results(img_meta: list[tuple[str, int, tuple[int, int]]]): + nonlocal db_images, db_data, video_path, video_size + db_data.size += len(img_meta) if db_task.mode == 'annotation': - db_images.extend([ + db_images.extend( models.Image( data=db_data, - path=os.path.relpath(data[1], upload_dir), - frame=data[2], - width=size[0], - height=size[1]) - - for data, size in zip(chunk_data, img_sizes) - ]) + path=os.path.relpath(frame_path, upload_dir), + frame=frame_number, + width=frame_size[0], + height=frame_size[1]) + for frame_path, frame_number, frame_size in img_meta) else: - video_size = img_sizes[0] - video_path = chunk_data[0][1] + video_size = img_meta[0][2] + video_path = img_meta[0][0] - db_data.size += len(chunk_data) - progress = extractor.get_progress(chunk_data[-1][2]) + progress = extractor.get_progress(img_meta[-1][1]) update_progress(progress) + futures = queue.Queue(maxsize=settings.CVAT_CONCURRENT_CHUNK_PROCESSING) + with concurrent.futures.ThreadPoolExecutor(max_workers=2*settings.CVAT_CONCURRENT_CHUNK_PROCESSING) as executor: + for chunk_idx, chunk_data in generator: + if not futures.full(): + futures.put(executor.submit(save_chunks, executor, chunk_idx, chunk_data)) + continue + + process_results(futures.get().result()) + futures.put(executor.submit(save_chunks, executor, chunk_idx, chunk_data)) + + while not futures.empty(): + 
process_results(futures.get().result()) + if db_task.mode == 'annotation': models.Image.objects.bulk_create(db_images) created_images = models.Image.objects.filter(data_id=db_data.id) diff --git a/cvat/apps/engine/utils.py b/cvat/apps/engine/utils.py index 0e17b24dd788..0a1b29801907 100644 --- a/cvat/apps/engine/utils.py +++ b/cvat/apps/engine/utils.py @@ -10,7 +10,7 @@ import sys import traceback from contextlib import suppress, nullcontext -from typing import Any, Dict, Optional, Callable, Union +from typing import Any, Dict, Optional, Callable, Union, Iterable import subprocess import os import urllib.parse @@ -375,3 +375,11 @@ def sendfile( attachment_filename = make_attachment_file_name(attachment_filename) return _sendfile(request, filename, attachment, attachment_filename, mimetype, encoding) + +def preload_image(image: tuple[str, str, str])-> tuple[Image.Image, str, str]: + pil_img = Image.open(image[0]) + pil_img.load() + return pil_img, image[1], image[2] + +def preload_images(images: Iterable[tuple[str, str, str]]) -> list[tuple[Image.Image, str, str]]: + return list(map(preload_image, images)) diff --git a/cvat/settings/base.py b/cvat/settings/base.py index 24eefc23fca9..89f5f1e2c1b8 100644 --- a/cvat/settings/base.py +++ b/cvat/settings/base.py @@ -720,3 +720,6 @@ class CVAT_QUEUES(Enum): EMAIL_BACKEND = None ONE_RUNNING_JOB_IN_QUEUE_PER_USER = strtobool(os.getenv('ONE_RUNNING_JOB_IN_QUEUE_PER_USER', 'false')) + +# How many chunks can be prepared simultaneously during task creation in case the cache is not used +CVAT_CONCURRENT_CHUNK_PROCESSING = int(os.getenv('CVAT_CONCURRENT_CHUNK_PROCESSING', 1)) From b6e5d5390502c583c4dccd74e0a13666aa0eab17 Mon Sep 17 00:00:00 2001 From: Andrey Zhavoronkov Date: Mon, 30 Oct 2023 14:58:33 +0200 Subject: [PATCH 2/7] fix pylint --- cvat/apps/engine/task.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/cvat/apps/engine/task.py b/cvat/apps/engine/task.py index 199c173bedbb..ac1bbbb9ae18 100644 --- 
a/cvat/apps/engine/task.py +++ b/cvat/apps/engine/task.py @@ -20,8 +20,6 @@ import concurrent.futures import queue -from PIL import Image - from django.conf import settings from django.db import transaction from datetime import datetime From e48a2d137145e715a10e9f32e65928b7e665a30f Mon Sep 17 00:00:00 2001 From: Andrey Zhavoronkov Date: Mon, 30 Oct 2023 17:09:10 +0200 Subject: [PATCH 3/7] fix unit tests --- cvat/apps/engine/task.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cvat/apps/engine/task.py b/cvat/apps/engine/task.py index ac1bbbb9ae18..9d969931e81c 100644 --- a/cvat/apps/engine/task.py +++ b/cvat/apps/engine/task.py @@ -1036,7 +1036,9 @@ def save_chunks( executor: concurrent.futures.ThreadPoolExecutor, chunk_idx: int, chunk_data: Iterable[tuple[str, str, str]]) -> list[tuple[str, int, tuple[int, int]]]: - if (isinstance(extractor, MEDIA_TYPES['image']['extractor']) or + nonlocal db_data, db_task, extractor, original_chunk_writer, compressed_chunk_writer + if db_task.dimension == models.DimensionType.DIM_2D and ( + isinstance(extractor, MEDIA_TYPES['image']['extractor']) or isinstance(extractor, MEDIA_TYPES['zip']['extractor']) or isinstance(extractor, MEDIA_TYPES['pdf']['extractor']) or isinstance(extractor, MEDIA_TYPES['archive']['extractor'])): @@ -1060,7 +1062,6 @@ def save_chunks( def process_results(img_meta: list[tuple[str, int, tuple[int, int]]]): nonlocal db_images, db_data, video_path, video_size - db_data.size += len(img_meta) if db_task.mode == 'annotation': db_images.extend( @@ -1081,6 +1082,7 @@ def process_results(img_meta: list[tuple[str, int, tuple[int, int]]]): futures = queue.Queue(maxsize=settings.CVAT_CONCURRENT_CHUNK_PROCESSING) with concurrent.futures.ThreadPoolExecutor(max_workers=2*settings.CVAT_CONCURRENT_CHUNK_PROCESSING) as executor: for chunk_idx, chunk_data in generator: + db_data.size += len(chunk_data) if not futures.full(): futures.put(executor.submit(save_chunks, executor, chunk_idx, chunk_data))
continue From 9d604eb270af29d0f0918c9927d4a7bb356ee048 Mon Sep 17 00:00:00 2001 From: Andrey Zhavoronkov Date: Thu, 2 Nov 2023 10:54:41 +0200 Subject: [PATCH 4/7] apply comment --- cvat/apps/engine/task.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/cvat/apps/engine/task.py b/cvat/apps/engine/task.py index 9d969931e81c..b82fcf6aaf2b 100644 --- a/cvat/apps/engine/task.py +++ b/cvat/apps/engine/task.py @@ -1037,11 +1037,13 @@ def save_chunks( chunk_idx: int, chunk_data: Iterable[tuple[str, str, str]]) -> list[tuple[str, int, tuple[int, int]]]: nonlocal db_data, db_task, extractor, original_chunk_writer, compressed_chunk_writer - if db_task.dimension == models.DimensionType.DIM_2D and ( - isinstance(extractor, MEDIA_TYPES['image']['extractor']) or - isinstance(extractor, MEDIA_TYPES['zip']['extractor']) or - isinstance(extractor, MEDIA_TYPES['pdf']['extractor']) or - isinstance(extractor, MEDIA_TYPES['archive']['extractor'])): + if (db_task.dimension == models.DimensionType.DIM_2D and + isinstance(extractor, ( + MEDIA_TYPES['image']['extractor'], + MEDIA_TYPES['zip']['extractor'], + MEDIA_TYPES['pdf']['extractor'], + MEDIA_TYPES['archive']['extractor'], + ))): chunk_data = preload_images(chunk_data) fs_original = executor.submit( From 665f8f61b669035029cd63078b51e800e69058d8 Mon Sep 17 00:00:00 2001 From: Andrey Zhavoronkov Date: Thu, 2 Nov 2023 10:59:39 +0200 Subject: [PATCH 5/7] add changelog entry --- .../20231102_105602_andrey_optimization_creation_of_tasks.md | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 changelog.d/20231102_105602_andrey_optimization_creation_of_tasks.md diff --git a/changelog.d/20231102_105602_andrey_optimization_creation_of_tasks.md b/changelog.d/20231102_105602_andrey_optimization_creation_of_tasks.md new file mode 100644 index 000000000000..44fd5f67cef9 --- /dev/null +++ b/changelog.d/20231102_105602_andrey_optimization_creation_of_tasks.md @@ -0,0 +1,4 @@ +### Changed + +- Improved 
performance of chunk preparation when creating tasks + () From 755ff83949fd2a299fe551cd7b078ece635e764a Mon Sep 17 00:00:00 2001 From: Andrey Zhavoronkov Date: Thu, 2 Nov 2023 13:52:02 +0200 Subject: [PATCH 6/7] Update cvat/apps/engine/task.py Co-authored-by: Maria Khrustaleva --- cvat/apps/engine/task.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/cvat/apps/engine/task.py b/cvat/apps/engine/task.py index b82fcf6aaf2b..a1194c62c2ac 100644 --- a/cvat/apps/engine/task.py +++ b/cvat/apps/engine/task.py @@ -1085,11 +1085,8 @@ def process_results(img_meta: list[tuple[str, int, tuple[int, int]]]): with concurrent.futures.ThreadPoolExecutor(max_workers=2*settings.CVAT_CONCURRENT_CHUNK_PROCESSING) as executor: for chunk_idx, chunk_data in generator: db_data.size += len(chunk_data) - if not futures.full(): - futures.put(executor.submit(save_chunks, executor, chunk_idx, chunk_data)) - continue - - process_results(futures.get().result()) + if futures.full(): + process_results(futures.get().result()) futures.put(executor.submit(save_chunks, executor, chunk_idx, chunk_data)) while not futures.empty(): From 1c0cd66f76ae082d70353cb12450bebcc7528029 Mon Sep 17 00:00:00 2001 From: Andrey Zhavoronkov Date: Thu, 2 Nov 2023 15:05:40 +0200 Subject: [PATCH 7/7] apply comment --- cvat/apps/engine/cache.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cvat/apps/engine/cache.py b/cvat/apps/engine/cache.py index 168bc23eaf02..a1139b4bf16e 100644 --- a/cvat/apps/engine/cache.py +++ b/cvat/apps/engine/cache.py @@ -168,8 +168,7 @@ def _get_images(db_data, chunk_number, dimension): images.append((fs_filename, fs_filename, None)) cloud_storage_instance.bulk_download_to_dir(files=files_to_download, upload_dir=tmp_dir) - if dimension == DimensionType.DIM_2D: - images = preload_images(images) + images = preload_images(images) for checksum, (_, fs_filename, _) in zip(checksums, images): if checksum and not md5_hash(fs_filename) == checksum: