Lazy list for array field (#8229)

### Motivation and context Decided to split changes in this PR: #8223 1. Annotations import (#8226) 2. Array fields optimization (this PR) 3. Logging function optimization (#8228) ### How has this been tested?  ### Checklist  - [x] I submit my changes into the `develop` branch - [ ] I have created a changelog fragment  - [ ] I have updated the documentation accordingly - [ ] I have added tests to cover my changes - [ ] I have linked related issues (see [GitHub docs]( https://help.github.com/en/github/managing-your-work-on-github/linking-a-pull-request-to-an-issue#linking-a-pull-request-to-an-issue-using-a-keyword)) - [ ] I have increased versions of npm packages if it is necessary ([cvat-canvas](https://github.com/cvat-ai/cvat/tree/develop/cvat-canvas#versioning), [cvat-core](https://github.com/cvat-ai/cvat/tree/develop/cvat-core#versioning), [cvat-data](https://github.com/cvat-ai/cvat/tree/develop/cvat-data#versioning) and [cvat-ui](https://github.com/cvat-ai/cvat/tree/develop/cvat-ui#versioning)) ### License - [x] I submit _my code changes_ under the same [MIT License]( https://github.com/cvat-ai/cvat/blob/develop/LICENSE) that covers the project. Feel free to contact the maintainers if that's a concern.  ## Summary by CodeRabbit - **New Features** - Introduced the `LazyList` class for efficient, on-demand parsing of list elements from strings. - Added support for custom transformations through a converter function. - Enhanced lazy evaluation with new decorators for improved performance on list operations. - **Tests** - Implemented a comprehensive test suite for the `LazyList` class, validating core functionalities and ensuring robustness.
cvat-ai · Aug 6, 2024 · 5b4e199 · 5b4e199
1 parent f3dcc3e
commit 5b4e199
Show file tree

Hide file tree

Showing 10 changed files with 455 additions and 35 deletions.
diff --git a/cvat-cli/pyproject.toml b/cvat-cli/pyproject.toml
@@ -7,9 +7,3 @@ profile = "black"
 forced_separate = ["tests"]
 line_length = 100
 skip_gitignore = true # align tool behavior with Black
-
-# Can't just use a pyproject in the root dir, so duplicate
-# https://github.com/psf/black/issues/2863
-[tool.black]
-line-length = 100
-target-version = ['py38']
diff --git a/cvat-sdk/pyproject.toml b/cvat-sdk/pyproject.toml
@@ -7,9 +7,3 @@ profile = "black"
 forced_separate = ["tests"]
 line_length = 100
 skip_gitignore = true # align tool behavior with Black
-
-# Can't just use a pyproject in the root dir, so duplicate
-# https://github.com/psf/black/issues/2863
-[tool.black]
-line-length = 100
-target-version = ['py38']
diff --git a/cvat/apps/analytics_report/pyproject.toml b/cvat/apps/analytics_report/pyproject.toml
@@ -4,9 +4,3 @@ forced_separate = ["tests"]
 line_length = 100
 skip_gitignore = true # align tool behavior with Black
 known_first_party = ["cvat"]
-
-# Can't just use a pyproject in the root dir, so duplicate
-# https://github.com/psf/black/issues/2863
-[tool.black]
-line-length = 100
-target-version = ['py38']
diff --git a/cvat/apps/engine/lazy_list.py b/cvat/apps/engine/lazy_list.py
@@ -0,0 +1,254 @@
+# Copyright (C) 2024 CVAT.ai Corporation
+#
+# SPDX-License-Identifier: MIT
+
+from functools import wraps
+from itertools import islice
+from typing import Any, Callable, Iterator, TypeVar, overload
+
+import attrs
+from attr import field
+
+T = TypeVar("T", bound=int | float | str)
+
+
+def _parse_self_and_other_before_accessing(list_method: Callable[..., Any]) -> Callable[..., Any]:
+    @wraps(list_method)
+    def wrapper(self: "LazyList", other: Any) -> "LazyList":
+        self._parse_up_to(-1)
+        if isinstance(other, LazyList):
+            other._parse_up_to(-1)
+        if not isinstance(other, list):
+            # explicitly calling list.__add__ with
+            # np.ndarray raises TypeError instead of it returning NotImplemented
+            # this prevents python from executing np.ndarray.__radd__
+            return NotImplemented
+
+        return list_method(self, other)
+
+    return wrapper
+
+
+def _parse_self_before_accessing(list_method: Callable[..., Any]) -> Callable[..., Any]:
+    """Wrapper for original list methods. Forces LazyList to parse itself before accessing them."""
+
+    @wraps(list_method)
+    def wrapper(self: "LazyList", *args, **kwargs) -> "LazyList":
+        self._parse_up_to(-1)
+
+        return list_method(self, *args, **kwargs)
+
+    return wrapper
+
+
+class LazyListMeta(type):
+    def __new__(
+        mcs,
+        name: str,
+        bases: tuple[type, ...],
+        namespace: dict[str, Any],
+    ):
+        # add pre-parse for list methods
+        for method_name in [
+            "append",
+            "copy",
+            "insert",
+            "pop",
+            "remove",
+            "reverse",
+            "sort",
+            "clear",
+            "index",
+            "count",
+            "__setitem__",
+            "__delitem__",
+            "__contains__",
+            "__len__",
+            "__reversed__",
+            "__mul__",
+            "__rmul__",
+            "__imul__",
+        ]:
+            namespace[method_name] = _parse_self_before_accessing(getattr(list, method_name))
+
+        for method_name in [
+            "extend",
+            "__add__",
+            "__iadd__",
+            "__eq__",
+            "__gt__",
+            "__ge__",
+            "__lt__",
+            "__le__",
+        ]:
+            namespace[method_name] = _parse_self_and_other_before_accessing(
+                getattr(list, method_name)
+            )
+
+        return super().__new__(mcs, name, bases, namespace)
+
+
+@attrs.define(slots=True, repr=False)
+class LazyList(list[T], metaclass=LazyListMeta):
+    """
+    Evaluates elements from the string representation as needed.
+    Lazy evaluation is supported for __getitem__ and __iter__ methods.
+    Using any other method will result in parsing the whole string.
+    Once instance of LazyList is fully parsed (either by accessing list methods
+    or by iterating over all elements), it will behave just as a regular python list.
+    """
+
+    _string: str = ""
+    _separator: str = ","
+    _converter: Callable[[str], T] = lambda s: s
+    _probable_length: int | None = field(init=False, default=None)
+    _parsed: bool = field(init=False, default=False)
+
+    def __repr__(self) -> str:
+        if self._parsed:
+            return f"LazyList({list.__repr__(self)})"
+        current_index = list.__len__(self)
+        current_position = 1 if self._string.startswith("[") else 0
+        separator_offset = len(self._separator)
+
+        for _ in range(current_index):
+            current_position = (
+                self._string.find(self._separator, current_position) + separator_offset
+            )
+
+        parsed_elements = list.__repr__(self).removesuffix("]")
+        unparsed_elements = self._string[current_position:]
+        return (
+            f"LazyList({parsed_elements}... + {unparsed_elements}', "
+            f"({list.__len__(self) / self._compute_max_length(self._string) * 100:.02f}% parsed))"
+        )
+
+    def __deepcopy__(self, memodict: Any = None) -> list[T]:
+        """
+        Since our elements are scalar, this should be sufficient
+        Without this, deepcopy would copy the state of the object,
+        then would try to append its elements.
+
+        However, since copy will contain initial string,
+        it will compute its elements on the first on the first append,
+        resulting in value duplication.
+        """
+        return list(self)
+
+    @overload
+    def __getitem__(self, index: int) -> T: ...
+
+    @overload
+    def __getitem__(self, index: slice) -> list[T]: ...
+
+    def __getitem__(self, index: int | slice) -> T | list[T]:
+        if self._parsed:
+            return list.__getitem__(self, index)
+
+        if isinstance(index, slice):
+            self._parse_up_to(index.indices(self._compute_max_length(self._string))[1] - 1)
+            return list.__getitem__(self, index)
+
+        self._parse_up_to(index)
+        return list.__getitem__(self, index)
+
+    def __iter__(self) -> Iterator[T]:
+        yield from list.__iter__(self)
+        yield from self._iter_unparsed()
+
+    def __str__(self) -> str:
+        if not self._parsed:
+            return self._string.strip("[]")
+        return self._separator.join(map(str, self))
+
+    def _parse_up_to(self, index: int) -> None:
+        if self._parsed:
+            return
+
+        if index < 0:
+            index += self._compute_max_length(self._string)
+
+        start = list.__len__(self)
+        if start > index:
+            return
+        end = index - start + 1
+        for _ in islice(self._iter_unparsed(), end + 1):
+            pass
+
+        if index == self._compute_max_length(self._string) - 1:
+            self._mark_parsed()
+
+    def _mark_parsed(self):
+        self._parsed = True
+        self._string = ""  # freeing the memory
+
+    def _iter_unparsed(self):
+        if self._parsed:
+            return
+        string = self._string
+        current_index = list.__len__(self)
+        current_position = 1 if string.startswith("[") else 0
+        string_length = len(string) - 1 if string.endswith("]") else len(string)
+        separator_offset = len(self._separator)
+
+        for _ in range(current_index):
+            current_position = string.find(self._separator, current_position) + separator_offset
+
+        while current_index < self._compute_max_length(string):
+            end = string.find(self._separator, current_position, string_length)
+            if end == -1:
+                end = string_length
+                self._mark_parsed()
+
+            element_str = string[current_position:end]
+            current_position = end + separator_offset
+            if not element_str:
+                self._probable_length -= 1
+                continue
+            element = self._converter(element_str)
+            if list.__len__(self) <= current_index:
+                # We need to handle special case when instance of lazy list becomes parsed after
+                # this function is called:
+                # ll = LazyList("1,2,3", _converter=int)
+                # iterator = iter(ll)
+                # next(iterator)  # > 1 (will generate next element and append to self)
+                # list(ll)  # > [1, 2, 3]
+                # next(iterator)  # > 2 (will generate next element, however will not append it)
+                # assert list(ll) == [1, 2, 3]
+                list.append(self, element)
+            yield element
+            current_index += 1
+
+    def _compute_max_length(self, string) -> int:
+        if self._probable_length is None:
+            if not self._string:
+                return 0
+            self._probable_length = string.count(self._separator) + 1
+        return self._probable_length
+
+    # support pickling
+
+    def __reduce__(self):
+        return self.__class__, (self._string, self._separator, self._converter), self.__getstate__()
+
+    def __reduce_ex__(self, protocol: int):
+        return self.__reduce__()
+
+    def __getstate__(self):
+        return {
+            "string": self._string,
+            "_separator": self._separator,
+            "_converter": self._converter,
+            "_probable_length": self._probable_length,
+            "parsed": self._parsed,
+            "parsed_elements": list(self) if self._parsed else None,
+        }
+
+    def __setstate__(self, state):
+        self._string = state["string"]
+        self._separator = state["_separator"]
+        self._converter = state["_converter"]
+        self._probable_length = state["_probable_length"]
+        self._parsed = state["parsed"]
+        if self._parsed:
+            self.extend(state["parsed_elements"])
diff --git a/cvat/apps/engine/models.py b/cvat/apps/engine/models.py
@@ -22,6 +22,7 @@
 from django.db.models import Q
 from drf_spectacular.types import OpenApiTypes
 from drf_spectacular.utils import extend_schema_field
+from cvat.apps.engine.lazy_list import LazyList
 
 from cvat.apps.engine.utils import parse_specific_attributes, chunked_list
 from cvat.apps.events.utils import cache_deleted
@@ -181,6 +182,7 @@ def choices(cls):
     def __str__(self):
         return self.value
 
+
 class AbstractArrayField(models.TextField):
     separator = ","
     converter = staticmethod(lambda x: x)
@@ -193,19 +195,20 @@ def __init__(self, *args, store_sorted:Optional[bool]=False, unique_values:Optio
     def from_db_value(self, value, expression, connection):
         if not value:
             return []
-        if value.startswith('[') and value.endswith(']'):
-            value = value[1:-1]
-        return [self.converter(v) for v in value.split(self.separator) if v]
+        return LazyList(string=value, separator=self.separator, converter=self.converter)
 
     def to_python(self, value):
-        if isinstance(value, list):
+        if isinstance(value, list | LazyList):
             return value
 
         return self.from_db_value(value, None, None)
 
     def get_prep_value(self, value):
+        if isinstance(value, LazyList) and not (self._unique_values or self._store_sorted):
+            return str(value)
+
         if self._unique_values:
-            value = list(dict.fromkeys(value))
+            value = dict.fromkeys(value)
         if self._store_sorted:
             value = sorted(value)
         return self.separator.join(map(str, value))