From 5b4e199eccf6900d30b52a1feceb189d1a0e7a39 Mon Sep 17 00:00:00 2001 From: Arseny Boykov <36469655+Bobronium@users.noreply.github.com> Date: Tue, 6 Aug 2024 14:48:12 +0200 Subject: [PATCH] Lazy list for array field (#8229) ### Motivation and context Decided to split changes in this PR: https://github.com/cvat-ai/cvat/pull/8223 1. Annotations import (https://github.com/cvat-ai/cvat/pull/8226) 2. Array fields optimization (this PR) 3. Logging function optimization (https://github.com/cvat-ai/cvat/pull/8228) ### How has this been tested? ### Checklist - [x] I submit my changes into the `develop` branch - [ ] I have created a changelog fragment - [ ] I have updated the documentation accordingly - [ ] I have added tests to cover my changes - [ ] I have linked related issues (see [GitHub docs]( https://help.github.com/en/github/managing-your-work-on-github/linking-a-pull-request-to-an-issue#linking-a-pull-request-to-an-issue-using-a-keyword)) - [ ] I have increased versions of npm packages if it is necessary ([cvat-canvas](https://github.com/cvat-ai/cvat/tree/develop/cvat-canvas#versioning), [cvat-core](https://github.com/cvat-ai/cvat/tree/develop/cvat-core#versioning), [cvat-data](https://github.com/cvat-ai/cvat/tree/develop/cvat-data#versioning) and [cvat-ui](https://github.com/cvat-ai/cvat/tree/develop/cvat-ui#versioning)) ### License - [x] I submit _my code changes_ under the same [MIT License]( https://github.com/cvat-ai/cvat/blob/develop/LICENSE) that covers the project. Feel free to contact the maintainers if that's a concern. ## Summary by CodeRabbit - **New Features** - Introduced the `LazyList` class for efficient, on-demand parsing of list elements from strings. - Added support for custom transformations through a converter function. - Enhanced lazy evaluation with new decorators for improved performance on list operations. - **Tests** - Implemented a comprehensive test suite for the `LazyList` class, validating core functionalities and ensuring robustness. --- cvat-cli/pyproject.toml | 6 - cvat-sdk/pyproject.toml | 6 - cvat/apps/analytics_report/pyproject.toml | 6 - cvat/apps/engine/lazy_list.py | 254 ++++++++++++++++++++++ cvat/apps/engine/models.py | 13 +- cvat/apps/engine/tests/test_lazy_list.py | 183 ++++++++++++++++ cvat/apps/quality_control/pyproject.toml | 6 - dev/format_python_code.sh | 1 + pyproject.toml | 9 + tests/python/pyproject.toml | 6 - 10 files changed, 455 insertions(+), 35 deletions(-) create mode 100644 cvat/apps/engine/lazy_list.py create mode 100644 cvat/apps/engine/tests/test_lazy_list.py create mode 100644 pyproject.toml diff --git a/cvat-cli/pyproject.toml b/cvat-cli/pyproject.toml index 67280a49cac1..ce8cba3ffba6 100644 --- a/cvat-cli/pyproject.toml +++ b/cvat-cli/pyproject.toml @@ -7,9 +7,3 @@ profile = "black" forced_separate = ["tests"] line_length = 100 skip_gitignore = true # align tool behavior with Black - -# Can't just use a pyproject in the root dir, so duplicate -# https://github.com/psf/black/issues/2863 -[tool.black] -line-length = 100 -target-version = ['py38'] diff --git a/cvat-sdk/pyproject.toml b/cvat-sdk/pyproject.toml index 67280a49cac1..ce8cba3ffba6 100644 --- a/cvat-sdk/pyproject.toml +++ b/cvat-sdk/pyproject.toml @@ -7,9 +7,3 @@ profile = "black" forced_separate = ["tests"] line_length = 100 skip_gitignore = true # align tool behavior with Black - -# Can't just use a pyproject in the root dir, so duplicate -# https://github.com/psf/black/issues/2863 -[tool.black] -line-length = 100 -target-version = ['py38'] diff --git a/cvat/apps/analytics_report/pyproject.toml b/cvat/apps/analytics_report/pyproject.toml index 567b78362580..b74ee17d74aa 100644 --- a/cvat/apps/analytics_report/pyproject.toml +++ b/cvat/apps/analytics_report/pyproject.toml @@ -4,9 +4,3 @@ forced_separate = ["tests"] line_length = 100 skip_gitignore = true # align tool behavior with Black known_first_party = ["cvat"] - -# Can't just use a pyproject in the root dir, so duplicate -# https://github.com/psf/black/issues/2863 -[tool.black] -line-length = 100 -target-version = ['py38'] diff --git a/cvat/apps/engine/lazy_list.py b/cvat/apps/engine/lazy_list.py new file mode 100644 index 000000000000..21b5959c22d5 --- /dev/null +++ b/cvat/apps/engine/lazy_list.py @@ -0,0 +1,254 @@ +# Copyright (C) 2024 CVAT.ai Corporation +# +# SPDX-License-Identifier: MIT + +from functools import wraps +from itertools import islice +from typing import Any, Callable, Iterator, TypeVar, overload + +import attrs +from attr import field + +T = TypeVar("T", bound=int | float | str) + + +def _parse_self_and_other_before_accessing(list_method: Callable[..., Any]) -> Callable[..., Any]: + @wraps(list_method) + def wrapper(self: "LazyList", other: Any) -> "LazyList": + self._parse_up_to(-1) + if isinstance(other, LazyList): + other._parse_up_to(-1) + if not isinstance(other, list): + # explicitly calling list.__add__ with + # np.ndarray raises TypeError instead of it returning NotImplemented + # this prevents python from executing np.ndarray.__radd__ + return NotImplemented + + return list_method(self, other) + + return wrapper + + +def _parse_self_before_accessing(list_method: Callable[..., Any]) -> Callable[..., Any]: + """Wrapper for original list methods. Forces LazyList to parse itself before accessing them.""" + + @wraps(list_method) + def wrapper(self: "LazyList", *args, **kwargs) -> "LazyList": + self._parse_up_to(-1) + + return list_method(self, *args, **kwargs) + + return wrapper + + +class LazyListMeta(type): + def __new__( + mcs, + name: str, + bases: tuple[type, ...], + namespace: dict[str, Any], + ): + # add pre-parse for list methods + for method_name in [ + "append", + "copy", + "insert", + "pop", + "remove", + "reverse", + "sort", + "clear", + "index", + "count", + "__setitem__", + "__delitem__", + "__contains__", + "__len__", + "__reversed__", + "__mul__", + "__rmul__", + "__imul__", + ]: + namespace[method_name] = _parse_self_before_accessing(getattr(list, method_name)) + + for method_name in [ + "extend", + "__add__", + "__iadd__", + "__eq__", + "__gt__", + "__ge__", + "__lt__", + "__le__", + ]: + namespace[method_name] = _parse_self_and_other_before_accessing( + getattr(list, method_name) + ) + + return super().__new__(mcs, name, bases, namespace) + + +@attrs.define(slots=True, repr=False) +class LazyList(list[T], metaclass=LazyListMeta): + """ + Evaluates elements from the string representation as needed. + Lazy evaluation is supported for __getitem__ and __iter__ methods. + Using any other method will result in parsing the whole string. + Once instance of LazyList is fully parsed (either by accessing list methods + or by iterating over all elements), it will behave just as a regular python list. + """ + + _string: str = "" + _separator: str = "," + _converter: Callable[[str], T] = lambda s: s + _probable_length: int | None = field(init=False, default=None) + _parsed: bool = field(init=False, default=False) + + def __repr__(self) -> str: + if self._parsed: + return f"LazyList({list.__repr__(self)})" + current_index = list.__len__(self) + current_position = 1 if self._string.startswith("[") else 0 + separator_offset = len(self._separator) + + for _ in range(current_index): + current_position = ( + self._string.find(self._separator, current_position) + separator_offset + ) + + parsed_elements = list.__repr__(self).removesuffix("]") + unparsed_elements = self._string[current_position:] + return ( + f"LazyList({parsed_elements}... + {unparsed_elements}', " + f"({list.__len__(self) / self._compute_max_length(self._string) * 100:.02f}% parsed))" + ) + + def __deepcopy__(self, memodict: Any = None) -> list[T]: + """ + Since our elements are scalar, this should be sufficient + Without this, deepcopy would copy the state of the object, + then would try to append its elements. + + However, since copy will contain initial string, + it will compute its elements on the first on the first append, + resulting in value duplication. + """ + return list(self) + + @overload + def __getitem__(self, index: int) -> T: ... + + @overload + def __getitem__(self, index: slice) -> list[T]: ... + + def __getitem__(self, index: int | slice) -> T | list[T]: + if self._parsed: + return list.__getitem__(self, index) + + if isinstance(index, slice): + self._parse_up_to(index.indices(self._compute_max_length(self._string))[1] - 1) + return list.__getitem__(self, index) + + self._parse_up_to(index) + return list.__getitem__(self, index) + + def __iter__(self) -> Iterator[T]: + yield from list.__iter__(self) + yield from self._iter_unparsed() + + def __str__(self) -> str: + if not self._parsed: + return self._string.strip("[]") + return self._separator.join(map(str, self)) + + def _parse_up_to(self, index: int) -> None: + if self._parsed: + return + + if index < 0: + index += self._compute_max_length(self._string) + + start = list.__len__(self) + if start > index: + return + end = index - start + 1 + for _ in islice(self._iter_unparsed(), end + 1): + pass + + if index == self._compute_max_length(self._string) - 1: + self._mark_parsed() + + def _mark_parsed(self): + self._parsed = True + self._string = "" # freeing the memory + + def _iter_unparsed(self): + if self._parsed: + return + string = self._string + current_index = list.__len__(self) + current_position = 1 if string.startswith("[") else 0 + string_length = len(string) - 1 if string.endswith("]") else len(string) + separator_offset = len(self._separator) + + for _ in range(current_index): + current_position = string.find(self._separator, current_position) + separator_offset + + while current_index < self._compute_max_length(string): + end = string.find(self._separator, current_position, string_length) + if end == -1: + end = string_length + self._mark_parsed() + + element_str = string[current_position:end] + current_position = end + separator_offset + if not element_str: + self._probable_length -= 1 + continue + element = self._converter(element_str) + if list.__len__(self) <= current_index: + # We need to handle special case when instance of lazy list becomes parsed after + # this function is called: + # ll = LazyList("1,2,3", _converter=int) + # iterator = iter(ll) + # next(iterator) # > 1 (will generate next element and append to self) + # list(ll) # > [1, 2, 3] + # next(iterator) # > 2 (will generate next element, however will not append it) + # assert list(ll) == [1, 2, 3] + list.append(self, element) + yield element + current_index += 1 + + def _compute_max_length(self, string) -> int: + if self._probable_length is None: + if not self._string: + return 0 + self._probable_length = string.count(self._separator) + 1 + return self._probable_length + + # support pickling + + def __reduce__(self): + return self.__class__, (self._string, self._separator, self._converter), self.__getstate__() + + def __reduce_ex__(self, protocol: int): + return self.__reduce__() + + def __getstate__(self): + return { + "string": self._string, + "_separator": self._separator, + "_converter": self._converter, + "_probable_length": self._probable_length, + "parsed": self._parsed, + "parsed_elements": list(self) if self._parsed else None, + } + + def __setstate__(self, state): + self._string = state["string"] + self._separator = state["_separator"] + self._converter = state["_converter"] + self._probable_length = state["_probable_length"] + self._parsed = state["parsed"] + if self._parsed: + self.extend(state["parsed_elements"]) diff --git a/cvat/apps/engine/models.py b/cvat/apps/engine/models.py index ff03e16814af..fe7fcbf79ad0 100644 --- a/cvat/apps/engine/models.py +++ b/cvat/apps/engine/models.py @@ -22,6 +22,7 @@ from django.db.models import Q from drf_spectacular.types import OpenApiTypes from drf_spectacular.utils import extend_schema_field +from cvat.apps.engine.lazy_list import LazyList from cvat.apps.engine.utils import parse_specific_attributes, chunked_list from cvat.apps.events.utils import cache_deleted @@ -181,6 +182,7 @@ def choices(cls): def __str__(self): return self.value + class AbstractArrayField(models.TextField): separator = "," converter = staticmethod(lambda x: x) @@ -193,19 +195,20 @@ def __init__(self, *args, store_sorted:Optional[bool]=False, unique_values:Optio def from_db_value(self, value, expression, connection): if not value: return [] - if value.startswith('[') and value.endswith(']'): - value = value[1:-1] - return [self.converter(v) for v in value.split(self.separator) if v] + return LazyList(string=value, separator=self.separator, converter=self.converter) def to_python(self, value): - if isinstance(value, list): + if isinstance(value, list | LazyList): return value return self.from_db_value(value, None, None) def get_prep_value(self, value): + if isinstance(value, LazyList) and not (self._unique_values or self._store_sorted): + return str(value) + if self._unique_values: - value = list(dict.fromkeys(value)) + value = dict.fromkeys(value) if self._store_sorted: value = sorted(value) return self.separator.join(map(str, value)) diff --git a/cvat/apps/engine/tests/test_lazy_list.py b/cvat/apps/engine/tests/test_lazy_list.py new file mode 100644 index 000000000000..52c2f979bf4f --- /dev/null +++ b/cvat/apps/engine/tests/test_lazy_list.py @@ -0,0 +1,183 @@ +import unittest +import copy +import pickle +from typing import TypeVar +from cvat.apps.engine.lazy_list import LazyList + + +T = TypeVar('T') + + +class TestLazyList(unittest.TestCase): + + def setUp(self): + self.lazy_list = LazyList(string="1,2,3", converter=int) + + def test_skipped_values(self): + ll = LazyList("1,2,,4", converter=int) + self.assertEqual(len(ll), 3) + self.assertEqual(ll, [1, 2, 4]) + + def test_len(self): + self.assertEqual(len(self.lazy_list), 3) + list(self.lazy_list) + self.assertEqual(len(self.lazy_list), 3) + + def test_repr(self): + self.assertEqual(repr(self.lazy_list), "LazyList([... + 1,2,3', (0.00% parsed))") + next(iter(self.lazy_list)) # Trigger parsing of the first element + self.assertIn("1... + 2,3", repr(self.lazy_list)) + list(self.lazy_list) + self.assertEqual(repr(self.lazy_list), "LazyList([1, 2, 3])") + + def test_deepcopy(self): + copied_list = copy.deepcopy(self.lazy_list) + self.assertEqual(copied_list, [1, 2, 3]) + self.assertNotEqual(id(copied_list), id(self.lazy_list)) + self.assertEqual(len(copied_list), 3) + + def test_getitem(self): + self.assertEqual(self.lazy_list[1], 2) + self.assertEqual(self.lazy_list[:2], [1, 2]) + + def test_iter(self): + iterator = iter(self.lazy_list) + self.assertEqual(next(iterator), 1) + self.assertEqual(next(iterator), 2) + self.assertEqual(list(self.lazy_list), [1, 2, 3]) + self.assertEqual(next(iterator), 3) + self.assertEqual(list(self.lazy_list), [1, 2, 3]) + + def test_append(self): + self.lazy_list.append(4) + self.assertEqual(self.lazy_list, [1, 2, 3, 4]) + + def test_copy(self): + copied_list = self.lazy_list.copy() + self.assertEqual(copied_list, [1, 2, 3]) + + def test_insert(self): + self.lazy_list.insert(0, 0) + self.assertEqual(self.lazy_list, [0, 1, 2, 3]) + + def test_pop(self): + value = self.lazy_list.pop() + self.assertEqual(value, 3) + self.assertEqual(self.lazy_list, [1, 2]) + + def test_remove(self): + self.lazy_list.remove(2) + self.assertEqual(self.lazy_list, [1, 3]) + + def test_reverse(self): + self.lazy_list.reverse() + self.assertEqual(self.lazy_list, [3, 2, 1]) + + def test_sort(self): + unsorted_list = LazyList(string="3,1,2", converter=int) + unsorted_list.sort() + self.assertEqual(unsorted_list, [1, 2, 3]) + + def test_clear(self): + self.lazy_list.clear() + self.assertEqual(len(self.lazy_list), 0) + + def test_index(self): + self.assertEqual(self.lazy_list.index(2), 1) + + def test_count(self): + self.assertEqual(self.lazy_list.count(2), 1) + + def test_setitem(self): + self.lazy_list[0] = 4 + self.assertEqual(self.lazy_list[0], 4) + + def test_delitem(self): + del self.lazy_list[0] + self.assertEqual(self.lazy_list, [2, 3]) + + def test_contains(self): + self.assertIn(2, self.lazy_list) + + def test_reversed(self): + self.assertEqual(list(reversed(self.lazy_list)), [3, 2, 1]) + + def test_mul(self): + self.assertEqual(self.lazy_list * 2, [1, 2, 3, 1, 2, 3]) + + def test_rmul(self): + self.assertEqual(2 * self.lazy_list, [1, 2, 3, 1, 2, 3]) + + def test_imul(self): + self.lazy_list *= 2 + self.assertEqual(self.lazy_list, [1, 2, 3, 1, 2, 3]) + + def test_extend(self): + self.lazy_list.extend([4, 5]) + self.assertEqual(self.lazy_list, [1, 2, 3, 4, 5]) + + def test_add(self): + new_list = self.lazy_list + [4, 5] + self.assertEqual(new_list, [1, 2, 3, 4, 5]) + + def test_eq(self): + self.assertTrue(self.lazy_list == [1, 2, 3]) + + def test_iadd(self): + self.lazy_list += [4, 5] + self.assertEqual(self.lazy_list, [1, 2, 3, 4, 5]) + + def test_gt(self): + self.assertTrue(self.lazy_list > [1, 2]) + + def test_ge(self): + self.assertTrue(self.lazy_list >= [1, 2, 3]) + + def test_lt(self): + self.assertTrue(self.lazy_list < [1, 2, 3, 4]) + + def test_le(self): + self.assertTrue(self.lazy_list <= [1, 2, 3]) + + def test_lazy_list_with_lazy_list(self): + other_lazy_list = LazyList(string="4,5,6", converter=int) + combined_list = self.lazy_list + other_lazy_list + self.assertEqual(combined_list, [1, 2, 3, 4, 5, 6]) + + def test_pickle_support(self): + pickled = pickle.dumps(self.lazy_list) + unpickled = pickle.loads(pickled) + self.assertEqual(unpickled, [1, 2, 3]) + self.assertEqual(unpickled._string, "") + self.assertTrue(unpickled._parsed) + + def test_parse_before_accessing_decorator(self): + lazy_list_copy = LazyList(string="1,2,3", converter=int) + lazy_list_copy.append(4) + self.assertEqual(lazy_list_copy, [1, 2, 3, 4]) + + def test_parse_both_before_accessing_decorator(self): + other_list = LazyList(string="4,5", converter=int) + result = self.lazy_list + other_list + self.assertEqual(result, [1, 2, 3, 4, 5]) + + def test_length_on_iteration(self): + elements = [] + for element in self.lazy_list: + self.assertEqual(len(self.lazy_list), 3) + elements.append(element) + + self.assertEqual(elements, [1, 2, 3]) + + def test_str(self): + self.assertEqual(str(self.lazy_list), "1,2,3") + self.assertEqual(self.lazy_list, LazyList(str(self.lazy_list), converter=int)) + + def test_str_parsed(self): + list(self.lazy_list) + self.assertEqual(str(self.lazy_list), "1,2,3") + self.assertEqual(self.lazy_list, LazyList(str(self.lazy_list), converter=int)) + + +if __name__ == "__main__": + unittest.main() diff --git a/cvat/apps/quality_control/pyproject.toml b/cvat/apps/quality_control/pyproject.toml index 567b78362580..b74ee17d74aa 100644 --- a/cvat/apps/quality_control/pyproject.toml +++ b/cvat/apps/quality_control/pyproject.toml @@ -4,9 +4,3 @@ forced_separate = ["tests"] line_length = 100 skip_gitignore = true # align tool behavior with Black known_first_party = ["cvat"] - -# Can't just use a pyproject in the root dir, so duplicate -# https://github.com/psf/black/issues/2863 -[tool.black] -line-length = 100 -target-version = ['py38'] diff --git a/dev/format_python_code.sh b/dev/format_python_code.sh index a67bf08572e7..9483c5fbcb2e 100755 --- a/dev/format_python_code.sh +++ b/dev/format_python_code.sh @@ -23,6 +23,7 @@ for paths in \ "tests/python/" \ "cvat/apps/quality_control" \ "cvat/apps/analytics_report" \ + "cvat/apps/engine/lazy_list.py" \ ; do ${BLACK} -- ${paths} ${ISORT} -- ${paths} diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 000000000000..581552a67ebc --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,9 @@ +[tool.isort] +profile = "black" +forced_separate = ["tests"] +line_length = 100 +skip_gitignore = true # align tool behavior with Black + +[tool.black] +line-length = 100 +target-version = ['py38'] diff --git a/tests/python/pyproject.toml b/tests/python/pyproject.toml index e91d405b8e20..ab4db6695977 100644 --- a/tests/python/pyproject.toml +++ b/tests/python/pyproject.toml @@ -3,9 +3,3 @@ profile = "black" forced_separate = ["tests"] line_length = 100 skip_gitignore = true # align tool behavior with Black - -# Can't just use a pyproject in the root dir, so duplicate -# https://github.com/psf/black/issues/2863 -[tool.black] -line-length = 100 -target-version = ['py38']