Fix several bugs with LazyList

1. Support pickling. 2. Handle cases where a LazyList is extended with, or added to, another LazyList. 3. Fix cases where comparing an unparsed LazyList gave an incorrect result. 4. Fix cases where deep copying resulted in duplicate values.
cvat-ai · Jul 29, 2024 · 2a68e50 · 2a68e50
1 parent 078669e
commit 2a68e50
Show file tree

Hide file tree

Showing 2 changed files with 325 additions and 71 deletions.
diff --git a/cvat/apps/engine/models.py b/cvat/apps/engine/models.py
@@ -4,7 +4,7 @@
 # SPDX-License-Identifier: MIT
 
 from __future__ import annotations
-from dataclasses import dataclass, field
+
 from itertools import islice
 
 import os
@@ -187,29 +187,34 @@ def __str__(self):
 T = TypeVar("T")
 
 
+def _parse_both_before_accessing(fn):
+    @wraps(fn)
+    def wrapper(self: 'LazyList', other: Any) -> 'LazyList':
+        self._parse_up_to(-1)
+        if not isinstance(other, list):
+            # explicitly calling list.__add__ with
+            # np.ndarray raises TypeError instead of it returning NotImplemented
+            # this prevents python from executing np.ndarray.__radd__
+            return NotImplemented
+        if isinstance(other, LazyList):
+            other._parse_up_to(-1)
+
+        return fn(self, other)
+
+    return wrapper
+
+
 def _parse_before_accessing(fn: Callable[..., Any]) -> Callable[..., Any]:
     """Wrapper for original list methods. Forces LazyList to parse itself before accessing them."""
-    if fn.__name__ in {"__add__", "__mul__"}:
-        @wraps(fn)
-        def wrapper(self: 'LazyList', other):
-            self._parse_up_to(-1)
-            if not isinstance(other, list):
-                # explicitly calling list.__add__ with
-                # np.ndarray raises TypeError instead of it returning NotImplemented
-                # this prevents python from executing np.ndarray.__radd__
-                return NotImplemented
-            return fn(self, other)
-    else:
-        @wraps(fn)
-        def wrapper(self: 'LazyList', *args, **kwargs) -> 'LazyList':
-            self._parse_up_to(-1)
-
-            return fn(self, *args, **kwargs)
+    @wraps(fn)
+    def wrapper(self: 'LazyList', *args, **kwargs) -> 'LazyList':
+        self._parse_up_to(-1)
+
+        return fn(self, *args, **kwargs)
 
     return wrapper
 
 
-@dataclass(slots=True)
 class LazyList(list[T]):
     """
     Evaluates elements from the string representation as needed.
@@ -218,43 +223,48 @@ class LazyList(list[T]):
     Once instance of LazyList is fully parsed (either by accessing list methods
     or by iterating over all elements), it will behave just as a regular python list.
     """
+    __slots__ = ("string", "_separator", "_converter", "_probable_length", "parsed")
     string: str
     _separator: str
     _converter: Callable[[str], T]
-    _probable_length: int | None = field(init=False, default=None)
-    parsed: bool = field(init=False, default=False)
+    _probable_length: int | None
 
-    for method in [
-        "append",
-        "copy",
-        "extend",
-        "insert",
-        "pop",
-        "remove",
-        "reverse",
-        "sort",
-        "clear",
-        "__setitem__",
-        "__delitem__",
-        "__eq__",
-        "__contains__",
-        "__len__",
-        "__add__",
-        "__iadd__",
-        "__mul__",
-        "__rmul__",
-        "__imul__",
-        "__contains__",
-        "__reversed__",
-        "__gt__",
-        "__ge__",
-        "__lt__",
-        "__le__",
-        "__eq__",
-        "__repr__",
-        "__len__",
-    ]:
-        locals()[method] = _parse_before_accessing(getattr(list, method))
+    def __init__(self, string: str = "", separator: str = ",", converter: Callable[[str], T] = lambda s: s) -> None:
+        super().__init__()
+        self.string = string
+        self._separator = separator
+        self._converter = converter
+        self._probable_length = None
+        self.parsed = False
+
+    def __repr__(self) -> str:
+        if self.parsed:
+            return f"LazyList({list.__repr__(self)})"
+        current_index = list.__len__(self)
+        current_position = 1 if self.string.startswith('[') else 0
+        separator_offset = len(self._separator)
+
+        for _ in range(current_index):
+            current_position = self.string.find(self._separator, current_position) + separator_offset
+
+        parsed_elements = list.__repr__(self).removesuffix("]")
+        unparsed_elements = self.string[current_position:]
+        return (
+            f"LazyList({parsed_elements}... + {unparsed_elements}', "
+            f"({list.__len__(self) / self._compute_max_length(self.string) * 100:.02f}% parsed))"
+        )
+
+    def __deepcopy__(self, memodict: Any = None) -> list[T]:
+        """
+        Since our elements are scalar, this should be sufficient
+        Without this, deepcopy would copy the state of the object,
+        then would try to append its elements.
+
+        However, since the copy will contain the initial string,
+        it will compute its elements on the first append,
+        resulting in value duplication.
+        """
+        return list(self)
 
     @overload
     def __getitem__(self, index: int) -> T: ...
@@ -267,7 +277,7 @@ def __getitem__(self, index: int | slice) -> T | list[T]:
             return list.__getitem__(self, index)
 
         if isinstance(index, slice):
-            self._parse_up_to(index.indices(self._compute_max_length())[1] - 1)
+            self._parse_up_to(index.indices(self._compute_max_length(self.string))[1] - 1)
             return list.__getitem__(self, index)
 
         self._parse_up_to(index)
@@ -282,54 +292,132 @@ def _parse_up_to(self, index: int) -> None:
             return
 
         if index < 0:
-            index += self._compute_max_length()
-        if index < 0 or index >= self._compute_max_length():
-            raise IndexError('Index out of range')
+            index += self._compute_max_length(self.string)
 
         start = list.__len__(self)
         if start > index:
             return
-
         end = index - start + 1
-        for _ in islice(self._iter_unparsed(), end):
+        for _ in islice(self._iter_unparsed(), end + 1):
             pass
 
-        if index == self._compute_max_length() - 1:
-            self.parsed = True
-            self.string = ""  # freeing the memory
+        if index == self._compute_max_length(self.string) - 1:
+            self._mark_parsed()
+
+    def _mark_parsed(self):
+        self.parsed = True
+        self.string = ""  # freeing the memory
 
     def _iter_unparsed(self):
         if self.parsed:
             return
+        string = self.string
         current_index = list.__len__(self)
-        current_position = 1 if self.string.startswith('[') else 0
-        string_length = len(self.string) - 1 if self.string.endswith(']') else len(self.string)
+        current_position = 1 if string.startswith('[') else 0
+        string_length = len(string) - 1 if string.endswith(']') else len(string)
         separator_offset = len(self._separator)
 
         for _ in range(current_index):
-            current_position = self.string.find(self._separator, current_position) + separator_offset
+            current_position = string.find(self._separator, current_position) + separator_offset
 
-        while current_index < self._compute_max_length():
-            end = self.string.find(self._separator, current_position, string_length)
+        while current_index < self._compute_max_length(string):
+            end = string.find(self._separator, current_position, string_length)
             if end == -1:
                 end = string_length
-                self.parsed = True
+                self._mark_parsed()
 
-            element_str = self.string[current_position:end]
+            element_str = string[current_position:end]
             current_position = end + separator_offset
             if not element_str:
                 self._probable_length -= 1
                 continue
             element = self._converter(element_str)
-            list.append(self, element)
+            if list.__len__(self) <= current_index:
+                # We need to handle special case when instance of lazy list becomes parsed after
+                # this function is called:
+                # ll = LazyList("1,2,3", converter=int)
+                # iterator = iter(ll)
+                # next(iterator)  # > 1 (will generate next element and append to self)
+                # list(ll)  # > [1, 2, 3]
+                # next(iterator)  # > 2 (will generate next element, however will not append it)
+                # assert list(ll) == [1, 2, 3]
+                list.append(self, element)
             yield element
             current_index += 1
 
-    def _compute_max_length(self) -> int:
+    def _compute_max_length(self, string) -> int:
         if self._probable_length is None:
-            self._probable_length = self.string.count(self._separator) + 1
+            if not self.string:
+                return 0
+            self._probable_length = string.count(self._separator) + 1
         return self._probable_length
 
+    # support pickling
+
+    def __reduce__(self):
+        return self.__class__, (self.string, self._separator, self._converter), self.__getstate__()
+
+    def __reduce_ex__(self, protocol: int):
+        return self.__reduce__()
+
+    def __getstate__(self):
+        return {
+            'string': self.string,
+            '_separator': self._separator,
+            '_converter': self._converter,
+            '_probable_length': self._probable_length,
+            'parsed': self.parsed,
+            'parsed_elements': list(self) if self.parsed else None
+        }
+
+    def __setstate__(self, state):
+        self.string = state['string']
+        self._separator = state['_separator']
+        self._converter = state['_converter']
+        self._probable_length = state['_probable_length']
+        self.parsed = state['parsed']
+        if self.parsed:
+            self.extend(state['parsed_elements'])
+
+    # add pre-parse for list methods
+    for method in [
+        "append",
+        "copy",
+        "insert",
+        "pop",
+        "remove",
+        "reverse",
+        "sort",
+        "clear",
+        "index",
+        "count",
+        "__setitem__",
+        "__delitem__",
+        "__contains__",
+        "__len__",
+        "__contains__",
+        "__reversed__",
+        "__mul__",
+        "__rmul__",
+        "__imul__",
+    ]:
+        locals()[method] = _parse_before_accessing(getattr(list, method))
+
+    for method in [
+        "extend",
+        "__add__",
+        "__eq__",
+        "__iadd__",
+        "__gt__",
+        "__ge__",
+        "__lt__",
+        "__le__",
+        "__eq__",
+    ]:
+        locals()[method] = _parse_both_before_accessing(getattr(list, method))
+
+    del method
+
 
 class AbstractArrayField(models.TextField):
     separator = ","
@@ -343,7 +431,7 @@ def __init__(self, *args, store_sorted:Optional[bool]=False, unique_values:Optio
     def from_db_value(self, value, expression, connection):
         if not value:
             return []
-        return LazyList(value, self.separator, self.converter)
+        return LazyList(string=value, separator=self.separator, converter=self.converter)
 
     def to_python(self, value):
         if isinstance(value, list | LazyList):