Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Lazy list for array field #8229

Merged
merged 17 commits into from
Aug 6, 2024
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
264 changes: 257 additions & 7 deletions cvat/apps/engine/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,15 @@

from __future__ import annotations

from itertools import islice

import os
import re
import shutil
import uuid
from enum import Enum
from functools import cached_property
from typing import Any, Dict, Optional, Sequence
from functools import cached_property, wraps
from typing import Any, Callable, Dict, Iterator, Optional, Sequence, TypeVar, overload

from django.conf import settings
from django.contrib.auth.models import User
Expand Down Expand Up @@ -181,6 +183,253 @@ def choices(cls):
def __str__(self):
    """Return this object's `value` attribute as its string representation."""
    return self.value


# Element type of a LazyList: bounded to the scalar types int, float and str.
T = TypeVar("T", bound=int | float | str)


def _parse_self_and_other_before_accessing(list_method: Callable[..., Any]) -> Callable[..., Any]:
    """
    Wrap a binary `list` method so that both operands are fully parsed before it runs.

    The returned wrapper:
      * fully parses `self` (`self._parse_up_to(-1)`);
      * fully parses `other` as well when it is a LazyList;
      * returns `NotImplemented` when `other` is not a list at all;
      * otherwise delegates to `list_method(self, other)` and returns its result.
    """
    @wraps(list_method)
    def wrapper(self: 'LazyList', other: Any) -> Any:
        self._parse_up_to(-1)

        if isinstance(other, LazyList):
            other._parse_up_to(-1)

        if not isinstance(other, list):
            # explicitly calling list.__add__ with
            # np.ndarray raises TypeError instead of it returning NotImplemented
            # this prevents python from executing np.ndarray.__radd__
            return NotImplemented

        return list_method(self, other)

    return wrapper


def _parse_self_before_accessing(list_method: Callable[..., Any]) -> Callable[..., Any]:
    """
    Wrap an original `list` method so that the LazyList parses itself completely first.

    The returned wrapper calls `self._parse_up_to(-1)` and then forwards every argument
    to `list_method`, returning whatever that method returns.
    """
    @wraps(list_method)
    def wrapper(self: 'LazyList', *args: Any, **kwargs: Any) -> Any:
        self._parse_up_to(-1)
        return list_method(self, *args, **kwargs)

    return wrapper


class LazyListMeta(type):
    """
    Metaclass that makes the inherited `list` API safe to use on a lazily parsed list.

    While the class is being created, the listed `list` methods are placed into the
    class namespace behind wrappers that force parsing before delegating to the
    original `list` implementation.
    """

    def __new__(
        mcs,
        name: str,
        bases: tuple[type, ...],
        namespace: dict[str, Any],
    ):
        # add pre-parse for list methods that only need `self` to be parsed
        for method_name in [
            "append",
            "copy",
            "insert",
            "pop",
            "remove",
            "reverse",
            "sort",
            "clear",
            "index",
            "count",
            # `extend` accepts any iterable; it must not go through the binary-operator
            # wrapper below, which returns NotImplemented for non-list arguments and
            # would therefore silently ignore tuples, generators, etc.
            "extend",
            "__setitem__",
            "__delitem__",
            "__contains__",
            "__len__",
            "__reversed__",
            "__mul__",
            "__rmul__",
            "__imul__",
        ]:
            namespace[method_name] = _parse_self_before_accessing(
                getattr(list, method_name)
            )

        # binary operations: the other operand is parsed too (when it is a LazyList),
        # and NotImplemented is returned for operands that are not lists
        for method_name in [
            "__add__",
            "__iadd__",
            "__eq__",
            "__gt__",
            "__ge__",
            "__lt__",
            "__le__",
        ]:
            namespace[method_name] = _parse_self_and_other_before_accessing(
                getattr(list, method_name)
            )

        return super().__new__(mcs, name, bases, namespace)


class LazyList(list[T], metaclass=LazyListMeta):
    """
    Evaluates elements from the string representation as needed.
    Lazy evaluation is supported for __getitem__ and __iter__ methods.
    Using any other method will result in parsing the whole string.
    Once instance of LazyList is fully parsed (either by accessing list methods
    or by iterating over all elements), it will behave just as a regular python list.

    The source string may be wrapped in square brackets ("[1,2,3]").
    Empty elements ("1,,2") are skipped and never stored.
    """
    __slots__ = ("_string", "_separator", "_converter", "_probable_length", "_parsed")

    def __init__(self, string: str = "", separator: str = ",", converter: Callable[[str], T] = lambda s: s) -> None:
        super().__init__()
        self._string = string
        self._separator = separator
        self._converter = converter
        # Upper bound for the number of elements (separator count + 1, reduced by one for
        # every empty element met while parsing); None until it is first computed.
        self._probable_length: int | None = None
        self._parsed: bool = False

    def __repr__(self) -> str:
        if self._parsed:
            return f"LazyList({list.__repr__(self)})"

        max_length = self._compute_max_length(self._string)
        if not max_length:
            # Empty source string: there is nothing left to parse, and computing the
            # parsed percentage would divide by zero.
            return f"LazyList({list.__repr__(self)})"

        parsed_elements = list.__repr__(self).removesuffix("]")
        unparsed_elements = self._string[self._position_after_parsed(self._string):]
        return (
            f"LazyList({parsed_elements}... + {unparsed_elements}', "
            f"({list.__len__(self) / max_length * 100:.02f}% parsed))"
        )

    def __deepcopy__(self, memodict: Any = None) -> list[T]:
        """
        Since our elements are scalar, this should be sufficient.

        Without this, deepcopy would copy the state of the object,
        then would try to append its elements.

        However, since the copy will contain the initial string,
        it will compute its elements on the first append,
        resulting in value duplication.
        """
        return list(self)

    @overload
    def __getitem__(self, index: int) -> T: ...

    @overload
    def __getitem__(self, index: slice) -> list[T]: ...

    def __getitem__(self, index: int | slice) -> T | list[T]:
        """
        Return the element or slice at `index`, parsing only as much as required.

        Negative indices, negative slice bounds and negative steps are relative to the
        real length of the list, which is only known once everything is parsed, so
        they trigger a full parse.
        """
        if self._parsed:
            return list.__getitem__(self, index)

        if isinstance(index, slice):
            _, stop, step = index.indices(self._compute_max_length(self._string))
            counts_from_end = step < 0 or any(
                bound is not None and bound.__index__() < 0
                for bound in (index.start, index.stop)
            )
            if counts_from_end:
                self._parse_up_to(-1)
            elif stop > 0:
                self._parse_up_to(stop - 1)
            return list.__getitem__(self, index)

        self._parse_up_to(-1 if index < 0 else index)
        return list.__getitem__(self, index)

    def __iter__(self) -> Iterator[T]:
        yield from list.__iter__(self)
        yield from self._iter_unparsed()

    def __str__(self) -> str:
        if not self._parsed:
            return self._string.strip("[]")
        return self._separator.join(map(str, self))

    def _parse_up_to(self, index: int) -> None:
        """
        Make sure the element at `index` is parsed, if it exists.

        A negative `index` is counted from the (estimated) end of the list;
        `-1` therefore parses the whole string.
        """
        if self._parsed:
            return

        max_length = self._compute_max_length(self._string)
        if max_length == 0:
            # Nothing to parse: from now on this is a regular list. Without this,
            # elements added later would be ignored by __str__.
            self._mark_parsed()
            return

        if index < 0:
            index += max_length

        start = list.__len__(self)
        if start > index:
            return

        # exactly (index - start + 1) elements are missing
        for _ in islice(self._iter_unparsed(), index - start + 1):
            pass

        if index == self._compute_max_length(self._string) - 1:
            self._mark_parsed()

    def _mark_parsed(self):
        self._parsed = True
        self._string = ""  # freeing the memory

    def _position_after_parsed(self, string: str) -> int:
        """Return the offset in `string` of the first element that is not parsed yet."""
        separator = self._separator
        separator_offset = len(separator)
        position = 1 if string.startswith('[') else 0

        for _ in range(list.__len__(self)):
            # Empty elements are skipped during parsing and never stored, so they must
            # not be counted as parsed elements here either.
            while separator and string.startswith(separator, position):
                position += separator_offset
            end = string.find(separator, position)
            if end == -1:
                return len(string)
            position = end + separator_offset

        return position

    def _iter_unparsed(self):
        """
        Parse, store and yield the elements that are not in the list yet.

        NOTE(review): several partially consumed iterators over a string that contains
        empty elements may each decrement the length estimate for the same empty
        element; this is not addressed here.
        """
        if self._parsed:
            return
        string = self._string
        current_index = list.__len__(self)
        current_position = self._position_after_parsed(string)
        string_length = len(string) - 1 if string.endswith(']') else len(string)
        separator_offset = len(self._separator)

        while current_index < self._compute_max_length(string):
            end = string.find(self._separator, current_position, string_length)
            if end == -1:
                # last element of the string
                end = string_length
                self._mark_parsed()

            element_str = string[current_position:end]
            current_position = end + separator_offset
            if not element_str:
                self._probable_length -= 1
                continue
            element = self._converter(element_str)
            if list.__len__(self) <= current_index:
                # We need to handle special case when instance of lazy list becomes parsed after
                # this function is called:
                # ll = LazyList("1,2,3", converter=int)
                # iterator = iter(ll)
                # next(iterator)  # > 1 (will generate next element and append to self)
                # list(ll)  # > [1, 2, 3]
                # next(iterator)  # > 2 (will generate next element, however will not append it)
                # assert list(ll) == [1, 2, 3]
                list.append(self, element)
            yield element
            current_index += 1

    def _compute_max_length(self, string: str) -> int:
        """Return the cached upper bound for the number of elements in `string`."""
        if self._probable_length is None:
            if not string:
                return 0
            self._probable_length = string.count(self._separator) + 1
        return self._probable_length

    # support pickling

    def __reduce__(self):
        return self.__class__, (self._string, self._separator, self._converter), self.__getstate__()

    def __reduce_ex__(self, protocol: int):
        return self.__reduce__()

    def __getstate__(self):
        return {
            'string': self._string,
            '_separator': self._separator,
            '_converter': self._converter,
            # Elements of a partially parsed list are not stored, so the restored copy
            # parses the string from the beginning and has to recompute the estimate
            # (it may already have been reduced by skipped empty elements).
            '_probable_length': self._probable_length if self._parsed else None,
            'parsed': self._parsed,
            'parsed_elements': list(self) if self._parsed else None
        }

    def __setstate__(self, state):
        self._string = state['string']
        self._separator = state['_separator']
        self._converter = state['_converter']
        self._probable_length = state['_probable_length']
        self._parsed = state['parsed']
        if self._parsed:
            self.extend(state['parsed_elements'])


class AbstractArrayField(models.TextField):
separator = ","
converter = staticmethod(lambda x: x)
Expand All @@ -193,19 +442,20 @@ def __init__(self, *args, store_sorted:Optional[bool]=False, unique_values:Optio
def from_db_value(self, value, expression, connection):
    """
    Convert the stored string into a list of values.

    An empty value yields a plain empty list; anything else is wrapped in a LazyList
    that parses the string on demand, using this field's separator and converter.
    The pre-change eager parsing lines were leftovers of the diff's removed side and
    made the LazyList return unreachable, so they are dropped.
    """
    if not value:
        return []
    return LazyList(string=value, separator=self.separator, converter=self.converter)

def to_python(self, value):
    """Return lists (including LazyList) unchanged; parse any other value like a DB value."""
    if isinstance(value, list | LazyList):
        return value

    return self.from_db_value(value, None, None)

def get_prep_value(self, value):
    """
    Serialize the list into the separator-joined string.

    A LazyList that needs neither de-duplication nor sorting is converted with
    `str()` directly. Otherwise duplicates are dropped (keeping first occurrences)
    when `_unique_values` is set, the values are sorted when `_store_sorted` is set,
    and the result is joined with the field separator.
    """
    if isinstance(value, LazyList) and not (self._unique_values or self._store_sorted):
        return str(value)

    if self._unique_values:
        # dict keys keep insertion order, so this de-duplicates without reordering
        value = dict.fromkeys(value)
    if self._store_sorted:
        value = sorted(value)
    return self.separator.join(map(str, value))
Expand Down
Loading
Loading