From e7cca0ec6443a186a7c22f74e32e5dd8585351b7 Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Tue, 4 May 2021 19:53:02 +0300 Subject: [PATCH] Manifest fixes (#3146) * Add logger, reverse func * Fix image filtering * Fix upload video manifest Co-authored-by: Nikita Manovich --- CHANGELOG.md | 1 + cvat/apps/engine/migrations/0038_manifest.py | 92 ++++++++++++++++++-- utils/dataset_manifest/core.py | 14 ++- 3 files changed, 97 insertions(+), 10 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 19c3951da13e..806d551c342e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Changing a label on canvas does not work when 'Show object details' enabled () - Make sure frame unzip web worker correctly terminates after unzipping all images in a requested chunk () - Reset password link was unavailable before login () +- Manifest: migration () ### Security diff --git a/cvat/apps/engine/migrations/0038_manifest.py b/cvat/apps/engine/migrations/0038_manifest.py index 7447aa6f5740..05a5b11b0b8e 100644 --- a/cvat/apps/engine/migrations/0038_manifest.py +++ b/cvat/apps/engine/migrations/0038_manifest.py @@ -1,7 +1,10 @@ # Generated by Django 3.1.1 on 2021-02-20 08:36 import glob +import itertools +import logging import os +import sys from re import search from django.conf import settings @@ -9,40 +12,109 @@ from cvat.apps.engine.models import (DimensionType, StorageChoice, StorageMethodChoice) +from cvat.apps.engine.media_extractors import get_mime from utils.dataset_manifest import ImageManifestManager, VideoManifestManager -def migrate_data(apps, shema_editor): +def get_logger(): + migration = os.path.basename(__file__).split(".")[0] + logger = logging.getLogger(name=migration) + logger.setLevel(logging.INFO) + file_handler = logging.FileHandler(os.path.join(settings.MIGRATIONS_LOGS_ROOT, f"{migration}.log")) + formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') + file_handler.setFormatter(formatter) + logger.addHandler(file_handler) + logger.addHandler(logging.StreamHandler(sys.stdout)) + logger.addHandler(logging.StreamHandler(sys.stderr)) + return logger + +def _get_query_set(apps): Data = apps.get_model("engine", "Data") query_set = Data.objects.filter(storage_method=StorageMethodChoice.CACHE) + return query_set + +def migrate2meta(apps, shema_editor): + logger = get_logger() + query_set = _get_query_set(apps) + for db_data in query_set: + try: + upload_dir = '{}/{}/raw'.format(settings.MEDIA_DATA_ROOT, db_data.id) + logger.info('Migrate data({}), folder - {}'.format(db_data.id, upload_dir)) + meta_path = os.path.join(upload_dir, "meta_info.txt") + if os.path.exists(os.path.join(upload_dir, 'manifest.jsonl')): + os.remove(os.path.join(upload_dir, 'manifest.jsonl')) + logger.info('A manifest file has been deleted') + if os.path.exists(os.path.join(upload_dir, 'index.json')): + os.remove(os.path.join(upload_dir, 'index.json')) + logger.info('A manifest index file has been deleted') + data_dir = upload_dir if db_data.storage == StorageChoice.LOCAL else settings.SHARE_ROOT + if hasattr(db_data, 'video'): + if os.path.exists(meta_path): + logger.info('A meta_info.txt already exists') + continue + media_file = os.path.join(data_dir, db_data.video.path) + logger.info('Preparing of the video meta has begun') + meta = VideoManifestManager(manifest_path=upload_dir) \ + .prepare_meta(media_file=media_file, force=True) + with open(meta_path, "w") as meta_file: + for idx, pts, _ in meta: + meta_file.write(f"{idx} {pts}\n") + else: + name_format = "dummy_{}.txt" + sources = [db_image.path for db_image in db_data.images.all().order_by('frame')] + counter = itertools.count() + logger.info('Preparing of the dummy chunks has begun') + for idx, img_paths in itertools.groupby(sources, lambda x: next(counter) // db_data.chunk_size): + if os.path.exists(os.path.join(upload_dir, name_format.format(idx))): + logger.info(name_format.format(idx) + " already exists") + continue + with open(os.path.join(upload_dir, name_format.format(idx)), "w") as dummy_chunk: + dummy_chunk.writelines([f"{img_path}\n" for img_path in img_paths]) + logger.info('Succesfull migration for the data({})'.format(db_data.id)) + except Exception as ex: + logger.error(str(ex)) + +def migrate2manifest(apps, shema_editor): + logger = get_logger() + logger.info('The data migration has been started for creating manifest`s files') + query_set = _get_query_set(apps) + logger.info('Need to update {} data objects'.format(len(query_set))) for db_data in query_set: try: upload_dir = '{}/{}/raw'.format(settings.MEDIA_DATA_ROOT, db_data.id) + logger.info('Migrate data({}), folder - {}'.format(db_data.id, upload_dir)) if os.path.exists(os.path.join(upload_dir, 'meta_info.txt')): - os.remove(os.path.join(upload_dir, 'meta_info.txt')) + os.remove(os.path.join(upload_dir, 'meta_info.txt')) + logger.info('{}/meta_info.txt has been deleted'.format(upload_dir)) else: for path in glob.glob(f'{upload_dir}/dummy_*.txt'): os.remove(path) + logger.info(f"{path} has been deleted") # it's necessary for case with long data migration if os.path.exists(os.path.join(upload_dir, 'manifest.jsonl')): + logger.info('Manifest file already exists') continue data_dir = upload_dir if db_data.storage == StorageChoice.LOCAL else settings.SHARE_ROOT if hasattr(db_data, 'video'): media_file = os.path.join(data_dir, db_data.video.path) manifest = VideoManifestManager(manifest_path=upload_dir) - meta_info = manifest.prepare_meta(media_file=media_file) + logger.info('Preparing of the video meta information has begun') + meta_info = manifest.prepare_meta(media_file=media_file, force=True) + logger.info('Manifest creating has begun') manifest.create(meta_info) + logger.info('Index creating has begun') manifest.init_index() else: manifest = ImageManifestManager(manifest_path=upload_dir) sources = [] if db_data.storage == StorageChoice.LOCAL: for (root, _, files) in os.walk(data_dir): - sources.extend([os.path.join(root, f) for f in files]) + sources.extend([os.path.join(root, f) for f in files if get_mime(f) == 'image']) sources.sort() # using share, this means that we can not explicitly restore the entire data structure else: sources = [os.path.join(data_dir, db_image.path) for db_image in db_data.images.all().order_by('frame')] if any(list(filter(lambda x: x.dimension==DimensionType.DIM_3D, db_data.tasks.all()))): + logger.info('Preparing of images 3d meta information has begun') content = [] for source in sources: name, ext = os.path.splitext(os.path.relpath(source, upload_dir)) @@ -51,6 +123,7 @@ def migrate_data(apps, shema_editor): 'extension': ext }) else: + logger.info('Preparing of 2d images meta information has begun') meta_info = manifest.prepare_meta(sources=sources, data_dir=data_dir) content = meta_info.content @@ -58,6 +131,7 @@ def migrate_data(apps, shema_editor): def _get_frame_step(str_): match = search("step\s*=\s*([1-9]\d*)", str_) return int(match.group(1)) if match else 1 + logger.info('Data is located on the share, metadata update has been started') step = _get_frame_step(db_data.frame_filter) start = db_data.start_frame stop = db_data.stop_frame + 1 @@ -67,10 +141,13 @@ def _get_frame_step(str_): item = content.pop(0) if i in images_range else dict() result_content.append(item) content = result_content + logger.info('Manifest creating has begun') manifest.create(content) + logger.info('Index creating has begun') manifest.init_index() + logger.info('Succesfull migration for the data({})'.format(db_data.id)) except Exception as ex: - print(str(ex)) + logger.error(str(ex)) class Migration(migrations.Migration): @@ -79,5 +156,8 @@ class Migration(migrations.Migration): ] operations = [ - migrations.RunPython(migrate_data) + migrations.RunPython( + code=migrate2manifest, + reverse_code=migrate2meta + ) ] diff --git a/utils/dataset_manifest/core.py b/utils/dataset_manifest/core.py index 78a00b0b98bf..edb68fb21af6 100644 --- a/utils/dataset_manifest/core.py +++ b/utils/dataset_manifest/core.py @@ -325,7 +325,7 @@ def index(self): return self._index class VideoManifestManager(_ManifestManager): - def __init__(self, manifest_path, *args, **kwargs): + def __init__(self, manifest_path): super().__init__(manifest_path) setattr(self._manifest, 'TYPE', 'video') self.BASE_INFORMATION['properties'] = 3 @@ -381,9 +381,15 @@ def validate_base_info(self): assert self._manifest.TYPE != json.loads(manifest_file.readline())['type'] class VideoManifestValidator(VideoManifestManager): - def __init__(self, **kwargs): - self.source_path = kwargs.pop('source_path') - super().__init__(self, **kwargs) + def __init__(self, source_path, manifest_path): + self.source_path = source_path + super().__init__(manifest_path) + + @staticmethod + def _get_video_stream(container): + video_stream = next(stream for stream in container.streams if stream.type == 'video') + video_stream.thread_type = 'AUTO' + return video_stream def validate_key_frame(self, container, video_stream, key_frame): for packet in container.demux(video_stream):