From 5b4e199eccf6900d30b52a1feceb189d1a0e7a39 Mon Sep 17 00:00:00 2001
From: Arseny Boykov <36469655+Bobronium@users.noreply.github.com>
Date: Tue, 6 Aug 2024 14:48:12 +0200
Subject: [PATCH] Lazy list for array field (#8229)

<!-- Raise an issue to propose your change
(https://github.com/cvat-ai/cvat/issues).
It helps to avoid duplication of efforts from multiple independent
contributors.
Discuss your ideas with maintainers to be sure that changes will be
approved and merged.
Read the [Contribution guide](https://docs.cvat.ai/docs/contributing/).
-->

<!-- Provide a general summary of your changes in the Title above -->

### Motivation and context
Decided to split changes in this PR:
https://github.com/cvat-ai/cvat/pull/8223

1. Annotations import (https://github.com/cvat-ai/cvat/pull/8226)
2. Array fields optimization (this PR)
3. Logging function optimization
(https://github.com/cvat-ai/cvat/pull/8228)

### How has this been tested?
<!-- Please describe in detail how you tested your changes.
Include details of your testing environment, and the tests you ran to
see how your change affects other areas of the code, etc. -->

### Checklist
<!-- Go over all the following points, and put an `x` in all the boxes
that apply.
If an item isn't applicable for some reason, then ~~explicitly
strikethrough~~ the whole
line. If you don't do that, GitHub will show incorrect progress for the
pull request.
If you're unsure about any of these, don't hesitate to ask. We're here
to help! -->
- [x] I submit my changes into the `develop` branch
- [ ] I have created a changelog fragment <!-- see top comment in
CHANGELOG.md -->
- [ ] I have updated the documentation accordingly
- [ ] I have added tests to cover my changes
- [ ] I have linked related issues (see [GitHub docs](

https://help.github.com/en/github/managing-your-work-on-github/linking-a-pull-request-to-an-issue#linking-a-pull-request-to-an-issue-using-a-keyword))
- [ ] I have increased versions of npm packages if it is necessary

([cvat-canvas](https://github.com/cvat-ai/cvat/tree/develop/cvat-canvas#versioning),

[cvat-core](https://github.com/cvat-ai/cvat/tree/develop/cvat-core#versioning),

[cvat-data](https://github.com/cvat-ai/cvat/tree/develop/cvat-data#versioning)
and

[cvat-ui](https://github.com/cvat-ai/cvat/tree/develop/cvat-ui#versioning))

### License

- [x] I submit _my code changes_ under the same [MIT License](
https://github.com/cvat-ai/cvat/blob/develop/LICENSE) that covers the
project.
  Feel free to contact the maintainers if that's a concern.


<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->

## Summary by CodeRabbit

- **New Features**
- Introduced the `LazyList` class for efficient, on-demand parsing of
list elements from strings.
- Added support for custom transformations through a converter function.
- Enhanced lazy evaluation with new decorators for improved performance
on list operations.

- **Tests**
- Implemented a comprehensive test suite for the `LazyList` class,
validating core functionalities and ensuring robustness.

<!-- end of auto-generated comment: release notes by coderabbit.ai -->
---
 cvat-cli/pyproject.toml                   |   6 -
 cvat-sdk/pyproject.toml                   |   6 -
 cvat/apps/analytics_report/pyproject.toml |   6 -
 cvat/apps/engine/lazy_list.py             | 254 ++++++++++++++++++++++
 cvat/apps/engine/models.py                |  13 +-
 cvat/apps/engine/tests/test_lazy_list.py  | 183 ++++++++++++++++
 cvat/apps/quality_control/pyproject.toml  |   6 -
 dev/format_python_code.sh                 |   1 +
 pyproject.toml                            |   9 +
 tests/python/pyproject.toml               |   6 -
 10 files changed, 455 insertions(+), 35 deletions(-)
 create mode 100644 cvat/apps/engine/lazy_list.py
 create mode 100644 cvat/apps/engine/tests/test_lazy_list.py
 create mode 100644 pyproject.toml

diff --git a/cvat-cli/pyproject.toml b/cvat-cli/pyproject.toml
index 67280a49cac1..ce8cba3ffba6 100644
--- a/cvat-cli/pyproject.toml
+++ b/cvat-cli/pyproject.toml
@@ -7,9 +7,3 @@ profile = "black"
 forced_separate = ["tests"]
 line_length = 100
 skip_gitignore = true # align tool behavior with Black
-
-# Can't just use a pyproject in the root dir, so duplicate
-# https://github.com/psf/black/issues/2863
-[tool.black]
-line-length = 100
-target-version = ['py38']
diff --git a/cvat-sdk/pyproject.toml b/cvat-sdk/pyproject.toml
index 67280a49cac1..ce8cba3ffba6 100644
--- a/cvat-sdk/pyproject.toml
+++ b/cvat-sdk/pyproject.toml
@@ -7,9 +7,3 @@ profile = "black"
 forced_separate = ["tests"]
 line_length = 100
 skip_gitignore = true # align tool behavior with Black
-
-# Can't just use a pyproject in the root dir, so duplicate
-# https://github.com/psf/black/issues/2863
-[tool.black]
-line-length = 100
-target-version = ['py38']
diff --git a/cvat/apps/analytics_report/pyproject.toml b/cvat/apps/analytics_report/pyproject.toml
index 567b78362580..b74ee17d74aa 100644
--- a/cvat/apps/analytics_report/pyproject.toml
+++ b/cvat/apps/analytics_report/pyproject.toml
@@ -4,9 +4,3 @@ forced_separate = ["tests"]
 line_length = 100
 skip_gitignore = true # align tool behavior with Black
 known_first_party = ["cvat"]
-
-# Can't just use a pyproject in the root dir, so duplicate
-# https://github.com/psf/black/issues/2863
-[tool.black]
-line-length = 100
-target-version = ['py38']
diff --git a/cvat/apps/engine/lazy_list.py b/cvat/apps/engine/lazy_list.py
new file mode 100644
index 000000000000..21b5959c22d5
--- /dev/null
+++ b/cvat/apps/engine/lazy_list.py
@@ -0,0 +1,254 @@
+# Copyright (C) 2024 CVAT.ai Corporation
+#
+# SPDX-License-Identifier: MIT
+
+from functools import wraps
+from itertools import islice
+from typing import Any, Callable, Iterator, TypeVar, overload
+
+import attrs
+from attr import field
+
+T = TypeVar("T", bound=int | float | str)
+
+
+def _parse_self_and_other_before_accessing(list_method: Callable[..., Any]) -> Callable[..., Any]:
+    @wraps(list_method)
+    def wrapper(self: "LazyList", other: Any) -> "LazyList":
+        self._parse_up_to(-1)
+        if isinstance(other, LazyList):
+            other._parse_up_to(-1)
+        if not isinstance(other, list):
+            # explicitly calling list.__add__ with
+            # np.ndarray raises TypeError instead of it returning NotImplemented
+            # this prevents python from executing np.ndarray.__radd__
+            return NotImplemented
+
+        return list_method(self, other)
+
+    return wrapper
+
+
+def _parse_self_before_accessing(list_method: Callable[..., Any]) -> Callable[..., Any]:
+    """Wrapper for original list methods. Forces LazyList to parse itself before accessing them."""
+
+    @wraps(list_method)
+    def wrapper(self: "LazyList", *args, **kwargs) -> "LazyList":
+        self._parse_up_to(-1)
+
+        return list_method(self, *args, **kwargs)
+
+    return wrapper
+
+
+class LazyListMeta(type):
+    def __new__(
+        mcs,
+        name: str,
+        bases: tuple[type, ...],
+        namespace: dict[str, Any],
+    ):
+        # add pre-parse for list methods
+        for method_name in [
+            "append",
+            "copy",
+            "insert",
+            "pop",
+            "remove",
+            "reverse",
+            "sort",
+            "clear",
+            "index",
+            "count",
+            "__setitem__",
+            "__delitem__",
+            "__contains__",
+            "__len__",
+            "__reversed__",
+            "__mul__",
+            "__rmul__",
+            "__imul__",
+        ]:
+            namespace[method_name] = _parse_self_before_accessing(getattr(list, method_name))
+
+        for method_name in [
+            "extend",
+            "__add__",
+            "__iadd__",
+            "__eq__",
+            "__gt__",
+            "__ge__",
+            "__lt__",
+            "__le__",
+        ]:
+            namespace[method_name] = _parse_self_and_other_before_accessing(
+                getattr(list, method_name)
+            )
+
+        return super().__new__(mcs, name, bases, namespace)
+
+
+@attrs.define(slots=True, repr=False)
+class LazyList(list[T], metaclass=LazyListMeta):
+    """
+    Evaluates elements from the string representation as needed.
+    Lazy evaluation is supported for __getitem__ and __iter__ methods.
+    Using any other method will result in parsing the whole string.
+    Once instance of LazyList is fully parsed (either by accessing list methods
+    or by iterating over all elements), it will behave just as a regular python list.
+    """
+
+    _string: str = ""
+    _separator: str = ","
+    _converter: Callable[[str], T] = lambda s: s
+    _probable_length: int | None = field(init=False, default=None)
+    _parsed: bool = field(init=False, default=False)
+
+    def __repr__(self) -> str:
+        if self._parsed:
+            return f"LazyList({list.__repr__(self)})"
+        current_index = list.__len__(self)
+        current_position = 1 if self._string.startswith("[") else 0
+        separator_offset = len(self._separator)
+
+        for _ in range(current_index):
+            current_position = (
+                self._string.find(self._separator, current_position) + separator_offset
+            )
+
+        parsed_elements = list.__repr__(self).removesuffix("]")
+        unparsed_elements = self._string[current_position:]
+        return (
+            f"LazyList({parsed_elements}... + {unparsed_elements}', "
+            f"({list.__len__(self) / self._compute_max_length(self._string) * 100:.02f}% parsed))"
+        )
+
+    def __deepcopy__(self, memodict: Any = None) -> list[T]:
+        """
+        Since our elements are scalar, this should be sufficient
+        Without this, deepcopy would copy the state of the object,
+        then would try to append its elements.
+
+        However, since copy will contain initial string,
+        it will compute its elements on the first on the first append,
+        resulting in value duplication.
+        """
+        return list(self)
+
+    @overload
+    def __getitem__(self, index: int) -> T: ...
+
+    @overload
+    def __getitem__(self, index: slice) -> list[T]: ...
+
+    def __getitem__(self, index: int | slice) -> T | list[T]:
+        if self._parsed:
+            return list.__getitem__(self, index)
+
+        if isinstance(index, slice):
+            self._parse_up_to(index.indices(self._compute_max_length(self._string))[1] - 1)
+            return list.__getitem__(self, index)
+
+        self._parse_up_to(index)
+        return list.__getitem__(self, index)
+
+    def __iter__(self) -> Iterator[T]:
+        yield from list.__iter__(self)
+        yield from self._iter_unparsed()
+
+    def __str__(self) -> str:
+        if not self._parsed:
+            return self._string.strip("[]")
+        return self._separator.join(map(str, self))
+
+    def _parse_up_to(self, index: int) -> None:
+        if self._parsed:
+            return
+
+        if index < 0:
+            index += self._compute_max_length(self._string)
+
+        start = list.__len__(self)
+        if start > index:
+            return
+        end = index - start + 1
+        for _ in islice(self._iter_unparsed(), end + 1):
+            pass
+
+        if index == self._compute_max_length(self._string) - 1:
+            self._mark_parsed()
+
+    def _mark_parsed(self):
+        self._parsed = True
+        self._string = ""  # freeing the memory
+
+    def _iter_unparsed(self):
+        if self._parsed:
+            return
+        string = self._string
+        current_index = list.__len__(self)
+        current_position = 1 if string.startswith("[") else 0
+        string_length = len(string) - 1 if string.endswith("]") else len(string)
+        separator_offset = len(self._separator)
+
+        for _ in range(current_index):
+            current_position = string.find(self._separator, current_position) + separator_offset
+
+        while current_index < self._compute_max_length(string):
+            end = string.find(self._separator, current_position, string_length)
+            if end == -1:
+                end = string_length
+                self._mark_parsed()
+
+            element_str = string[current_position:end]
+            current_position = end + separator_offset
+            if not element_str:
+                self._probable_length -= 1
+                continue
+            element = self._converter(element_str)
+            if list.__len__(self) <= current_index:
+                # We need to handle special case when instance of lazy list becomes parsed after
+                # this function is called:
+                # ll = LazyList("1,2,3", _converter=int)
+                # iterator = iter(ll)
+                # next(iterator)  # > 1 (will generate next element and append to self)
+                # list(ll)  # > [1, 2, 3]
+                # next(iterator)  # > 2 (will generate next element, however will not append it)
+                # assert list(ll) == [1, 2, 3]
+                list.append(self, element)
+            yield element
+            current_index += 1
+
+    def _compute_max_length(self, string) -> int:
+        if self._probable_length is None:
+            if not self._string:
+                return 0
+            self._probable_length = string.count(self._separator) + 1
+        return self._probable_length
+
+    # support pickling
+
+    def __reduce__(self):
+        return self.__class__, (self._string, self._separator, self._converter), self.__getstate__()
+
+    def __reduce_ex__(self, protocol: int):
+        return self.__reduce__()
+
+    def __getstate__(self):
+        return {
+            "string": self._string,
+            "_separator": self._separator,
+            "_converter": self._converter,
+            "_probable_length": self._probable_length,
+            "parsed": self._parsed,
+            "parsed_elements": list(self) if self._parsed else None,
+        }
+
+    def __setstate__(self, state):
+        self._string = state["string"]
+        self._separator = state["_separator"]
+        self._converter = state["_converter"]
+        self._probable_length = state["_probable_length"]
+        self._parsed = state["parsed"]
+        if self._parsed:
+            self.extend(state["parsed_elements"])
diff --git a/cvat/apps/engine/models.py b/cvat/apps/engine/models.py
index ff03e16814af..fe7fcbf79ad0 100644
--- a/cvat/apps/engine/models.py
+++ b/cvat/apps/engine/models.py
@@ -22,6 +22,7 @@
 from django.db.models import Q
 from drf_spectacular.types import OpenApiTypes
 from drf_spectacular.utils import extend_schema_field
+from cvat.apps.engine.lazy_list import LazyList
 
 from cvat.apps.engine.utils import parse_specific_attributes, chunked_list
 from cvat.apps.events.utils import cache_deleted
@@ -181,6 +182,7 @@ def choices(cls):
     def __str__(self):
         return self.value
 
+
 class AbstractArrayField(models.TextField):
     separator = ","
     converter = staticmethod(lambda x: x)
@@ -193,19 +195,20 @@ def __init__(self, *args, store_sorted:Optional[bool]=False, unique_values:Optio
     def from_db_value(self, value, expression, connection):
         if not value:
             return []
-        if value.startswith('[') and value.endswith(']'):
-            value = value[1:-1]
-        return [self.converter(v) for v in value.split(self.separator) if v]
+        return LazyList(string=value, separator=self.separator, converter=self.converter)
 
     def to_python(self, value):
-        if isinstance(value, list):
+        if isinstance(value, list | LazyList):
             return value
 
         return self.from_db_value(value, None, None)
 
     def get_prep_value(self, value):
+        if isinstance(value, LazyList) and not (self._unique_values or self._store_sorted):
+            return str(value)
+
         if self._unique_values:
-            value = list(dict.fromkeys(value))
+            value = dict.fromkeys(value)
         if self._store_sorted:
             value = sorted(value)
         return self.separator.join(map(str, value))
diff --git a/cvat/apps/engine/tests/test_lazy_list.py b/cvat/apps/engine/tests/test_lazy_list.py
new file mode 100644
index 000000000000..52c2f979bf4f
--- /dev/null
+++ b/cvat/apps/engine/tests/test_lazy_list.py
@@ -0,0 +1,183 @@
+import unittest
+import copy
+import pickle
+from typing import TypeVar
+from cvat.apps.engine.lazy_list import LazyList
+
+
+T = TypeVar('T')
+
+
+class TestLazyList(unittest.TestCase):
+
+    def setUp(self):
+        self.lazy_list = LazyList(string="1,2,3", converter=int)
+
+    def test_skipped_values(self):
+        ll = LazyList("1,2,,4", converter=int)
+        self.assertEqual(len(ll), 3)
+        self.assertEqual(ll, [1, 2, 4])
+
+    def test_len(self):
+        self.assertEqual(len(self.lazy_list), 3)
+        list(self.lazy_list)
+        self.assertEqual(len(self.lazy_list), 3)
+
+    def test_repr(self):
+        self.assertEqual(repr(self.lazy_list), "LazyList([... + 1,2,3', (0.00% parsed))")
+        next(iter(self.lazy_list))  # Trigger parsing of the first element
+        self.assertIn("1... + 2,3", repr(self.lazy_list))
+        list(self.lazy_list)
+        self.assertEqual(repr(self.lazy_list), "LazyList([1, 2, 3])")
+
+    def test_deepcopy(self):
+        copied_list = copy.deepcopy(self.lazy_list)
+        self.assertEqual(copied_list, [1, 2, 3])
+        self.assertNotEqual(id(copied_list), id(self.lazy_list))
+        self.assertEqual(len(copied_list), 3)
+
+    def test_getitem(self):
+        self.assertEqual(self.lazy_list[1], 2)
+        self.assertEqual(self.lazy_list[:2], [1, 2])
+
+    def test_iter(self):
+        iterator = iter(self.lazy_list)
+        self.assertEqual(next(iterator), 1)
+        self.assertEqual(next(iterator), 2)
+        self.assertEqual(list(self.lazy_list), [1, 2, 3])
+        self.assertEqual(next(iterator), 3)
+        self.assertEqual(list(self.lazy_list), [1, 2, 3])
+
+    def test_append(self):
+        self.lazy_list.append(4)
+        self.assertEqual(self.lazy_list, [1, 2, 3, 4])
+
+    def test_copy(self):
+        copied_list = self.lazy_list.copy()
+        self.assertEqual(copied_list, [1, 2, 3])
+
+    def test_insert(self):
+        self.lazy_list.insert(0, 0)
+        self.assertEqual(self.lazy_list, [0, 1, 2, 3])
+
+    def test_pop(self):
+        value = self.lazy_list.pop()
+        self.assertEqual(value, 3)
+        self.assertEqual(self.lazy_list, [1, 2])
+
+    def test_remove(self):
+        self.lazy_list.remove(2)
+        self.assertEqual(self.lazy_list, [1, 3])
+
+    def test_reverse(self):
+        self.lazy_list.reverse()
+        self.assertEqual(self.lazy_list, [3, 2, 1])
+
+    def test_sort(self):
+        unsorted_list = LazyList(string="3,1,2", converter=int)
+        unsorted_list.sort()
+        self.assertEqual(unsorted_list, [1, 2, 3])
+
+    def test_clear(self):
+        self.lazy_list.clear()
+        self.assertEqual(len(self.lazy_list), 0)
+
+    def test_index(self):
+        self.assertEqual(self.lazy_list.index(2), 1)
+
+    def test_count(self):
+        self.assertEqual(self.lazy_list.count(2), 1)
+
+    def test_setitem(self):
+        self.lazy_list[0] = 4
+        self.assertEqual(self.lazy_list[0], 4)
+
+    def test_delitem(self):
+        del self.lazy_list[0]
+        self.assertEqual(self.lazy_list, [2, 3])
+
+    def test_contains(self):
+        self.assertIn(2, self.lazy_list)
+
+    def test_reversed(self):
+        self.assertEqual(list(reversed(self.lazy_list)), [3, 2, 1])
+
+    def test_mul(self):
+        self.assertEqual(self.lazy_list * 2, [1, 2, 3, 1, 2, 3])
+
+    def test_rmul(self):
+        self.assertEqual(2 * self.lazy_list, [1, 2, 3, 1, 2, 3])
+
+    def test_imul(self):
+        self.lazy_list *= 2
+        self.assertEqual(self.lazy_list, [1, 2, 3, 1, 2, 3])
+
+    def test_extend(self):
+        self.lazy_list.extend([4, 5])
+        self.assertEqual(self.lazy_list, [1, 2, 3, 4, 5])
+
+    def test_add(self):
+        new_list = self.lazy_list + [4, 5]
+        self.assertEqual(new_list, [1, 2, 3, 4, 5])
+
+    def test_eq(self):
+        self.assertTrue(self.lazy_list == [1, 2, 3])
+
+    def test_iadd(self):
+        self.lazy_list += [4, 5]
+        self.assertEqual(self.lazy_list, [1, 2, 3, 4, 5])
+
+    def test_gt(self):
+        self.assertTrue(self.lazy_list > [1, 2])
+
+    def test_ge(self):
+        self.assertTrue(self.lazy_list >= [1, 2, 3])
+
+    def test_lt(self):
+        self.assertTrue(self.lazy_list < [1, 2, 3, 4])
+
+    def test_le(self):
+        self.assertTrue(self.lazy_list <= [1, 2, 3])
+
+    def test_lazy_list_with_lazy_list(self):
+        other_lazy_list = LazyList(string="4,5,6", converter=int)
+        combined_list = self.lazy_list + other_lazy_list
+        self.assertEqual(combined_list, [1, 2, 3, 4, 5, 6])
+
+    def test_pickle_support(self):
+        pickled = pickle.dumps(self.lazy_list)
+        unpickled = pickle.loads(pickled)
+        self.assertEqual(unpickled, [1, 2, 3])
+        self.assertEqual(unpickled._string, "")
+        self.assertTrue(unpickled._parsed)
+
+    def test_parse_before_accessing_decorator(self):
+        lazy_list_copy = LazyList(string="1,2,3", converter=int)
+        lazy_list_copy.append(4)
+        self.assertEqual(lazy_list_copy, [1, 2, 3, 4])
+
+    def test_parse_both_before_accessing_decorator(self):
+        other_list = LazyList(string="4,5", converter=int)
+        result = self.lazy_list + other_list
+        self.assertEqual(result, [1, 2, 3, 4, 5])
+
+    def test_length_on_iteration(self):
+        elements = []
+        for element in self.lazy_list:
+            self.assertEqual(len(self.lazy_list), 3)
+            elements.append(element)
+
+        self.assertEqual(elements, [1, 2, 3])
+
+    def test_str(self):
+        self.assertEqual(str(self.lazy_list), "1,2,3")
+        self.assertEqual(self.lazy_list, LazyList(str(self.lazy_list), converter=int))
+
+    def test_str_parsed(self):
+        list(self.lazy_list)
+        self.assertEqual(str(self.lazy_list), "1,2,3")
+        self.assertEqual(self.lazy_list, LazyList(str(self.lazy_list), converter=int))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/cvat/apps/quality_control/pyproject.toml b/cvat/apps/quality_control/pyproject.toml
index 567b78362580..b74ee17d74aa 100644
--- a/cvat/apps/quality_control/pyproject.toml
+++ b/cvat/apps/quality_control/pyproject.toml
@@ -4,9 +4,3 @@ forced_separate = ["tests"]
 line_length = 100
 skip_gitignore = true # align tool behavior with Black
 known_first_party = ["cvat"]
-
-# Can't just use a pyproject in the root dir, so duplicate
-# https://github.com/psf/black/issues/2863
-[tool.black]
-line-length = 100
-target-version = ['py38']
diff --git a/dev/format_python_code.sh b/dev/format_python_code.sh
index a67bf08572e7..9483c5fbcb2e 100755
--- a/dev/format_python_code.sh
+++ b/dev/format_python_code.sh
@@ -23,6 +23,7 @@ for paths in \
     "tests/python/" \
     "cvat/apps/quality_control" \
     "cvat/apps/analytics_report" \
+    "cvat/apps/engine/lazy_list.py" \
     ; do
     ${BLACK} -- ${paths}
     ${ISORT} -- ${paths}
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 000000000000..581552a67ebc
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,9 @@
+[tool.isort]
+profile = "black"
+forced_separate = ["tests"]
+line_length = 100
+skip_gitignore = true # align tool behavior with Black
+
+[tool.black]
+line-length = 100
+target-version = ['py38']
diff --git a/tests/python/pyproject.toml b/tests/python/pyproject.toml
index e91d405b8e20..ab4db6695977 100644
--- a/tests/python/pyproject.toml
+++ b/tests/python/pyproject.toml
@@ -3,9 +3,3 @@ profile = "black"
 forced_separate = ["tests"]
 line_length = 100
 skip_gitignore = true # align tool behavior with Black
-
-# Can't just use a pyproject in the root dir, so duplicate
-# https://github.com/psf/black/issues/2863
-[tool.black]
-line-length = 100
-target-version = ['py38']