Skip to content

Commit

Permalink
core: add helper for more_itertools to check that all types involved a…
Browse files Browse the repository at this point in the history
…re hashable

Otherwise unique_everseen performance may degrade to quadratic rather than linear

For now hidden behind HPI_CHECK_UNIQUE_EVERSEEN flag

also switch some modules to use it
  • Loading branch information
karlicoss committed Oct 31, 2023
1 parent d678608 commit 71cb66d
Show file tree
Hide file tree
Showing 8 changed files with 90 additions and 23 deletions.
73 changes: 72 additions & 1 deletion my/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,25 @@
import os
import sys
import types
from typing import Union, Callable, Dict, Iterable, TypeVar, Sequence, List, Optional, Any, cast, Tuple, TYPE_CHECKING, NoReturn
from typing import (
Any,
Callable,
Dict,
Iterable,
Iterator,
List,
NoReturn,
Optional,
Sequence,
TYPE_CHECKING,
Tuple,
TypeVar,
Union,
cast,
get_args,
get_type_hints,
get_origin,
)
import warnings
from . import warnings as core_warnings

Expand Down Expand Up @@ -628,6 +646,59 @@ def assert_never(value: NoReturn) -> NoReturn:
assert False, f'Unhandled value: {value} ({type(value).__name__})'


def _check_all_hashable(fun):
    """
    Check that the element types ``fun`` is annotated to produce are hashable.

    The element type is taken from the first type parameter of the return
    annotation of ``fun`` (e.g. ``X`` in ``Iterator[X]`` or in
    ``Generator[X, None, None]``). If that parameter is a union
    (``Union[A, B]`` or ``A | B``), every member of the union is checked.

    If the return annotation is missing or has no type parameters, there is
    nothing to inspect: a warning is emitted and the check is skipped.

    Raises ``AssertionError`` listing the offending types if any of them is unhashable.
    """
    # TODO ok, take callable?
    hints = get_type_hints(fun)
    # TODO needs to be defensive like in cachew?
    return_type = hints.get('return')

    # get_args returns an empty tuple both for a missing annotation (None) and for a bare one like Iterator
    type_args = get_args(return_type)
    if len(type_args) == 0:
        warnings.warn(
            f'Cannot determine the element type of {fun!r} from its return annotation {return_type!r}, '
            'skipping hashability check'
        )
        return

    # for Iterator/Iterable/List etc this is the only parameter, for Generator it is the yield type
    arg = type_args[0]
    # options we wanna handle are simple type on the top level or union
    arg_origin = get_origin(arg)

    if sys.version_info[:2] >= (3, 10):
        # X | Y syntax produces types.UnionType rather than typing.Union
        is_uniontype = arg_origin is types.UnionType
    else:
        is_uniontype = False

    is_union = arg_origin is Union or is_uniontype
    to_check = get_args(arg) if is_union else (arg,)

    no_hash = [
        t
        for t in to_check
        # seems that objects that have not overridden hash have the attribute but it's set to None
        # NOTE: parameterized aliases like typing.List[int] define their own __hash__,
        # so we need to look at the underlying runtime class (e.g. list) instead
        if getattr(get_origin(t) or t, '__hash__', None) is None
    ]
    if len(no_hash) > 0:
        # raised explicitly (rather than via assert statement) so the opt-in check still works under python -O
        raise AssertionError(
            f'Types {no_hash} are not hashable, this will result in significant performance downgrade for unique_everseen'
        )


_UET = TypeVar('_UET')  # type of the elements being deduplicated
_UEU = TypeVar('_UEU')  # type of the value computed by the key function


def unique_everseen(
    fun: Callable[[], Iterable[_UET]],
    key: Optional[Callable[[_UET], _UEU]] = None,
) -> Iterator[_UET]:
    """
    Wrapper around ``more_itertools.unique_everseen`` that takes a callable producing the iterable.

    ``fun`` is called with no arguments to obtain the items; ``key`` is passed
    through to ``more_itertools.unique_everseen`` unchanged.

    When no ``key`` is given and the ``HPI_CHECK_UNIQUE_EVERSEEN`` environment
    variable is set (to any value), the type annotations of ``fun`` are passed
    to ``_check_all_hashable`` before deduplication starts.
    """
    # TODO support normal iterable as well?
    from more_itertools import unique_everseen as _dedupe

    # NOTE: the callable itself is required (not its result), since the type annotations
    # used by the hashability check live on the function object
    items = fun()

    # todo check key return type as well? but it's more likely to be hashable
    if key is None and 'HPI_CHECK_UNIQUE_EVERSEEN' in os.environ:
        _check_all_hashable(fun)

    return _dedupe(iterable=items, key=key)


## legacy imports, keeping them here for backwards compatibility
from functools import cached_property as cproperty
from typing import Literal
Expand Down
5 changes: 2 additions & 3 deletions my/fbmessenger/android.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,8 @@
import sqlite3
from typing import Iterator, Sequence, Optional, Dict, Union, List

from more_itertools import unique_everseen

from my.core import get_files, Paths, datetime_aware, Res, assert_never, LazyLogger, make_config
from my.core.common import unique_everseen
from my.core.error import echain
from my.core.sqlite import sqlite_connection

Expand Down Expand Up @@ -242,7 +241,7 @@ def messages() -> Iterator[Res[Message]]:
senders: Dict[str, Sender] = {}
msgs: Dict[str, Message] = {}
threads: Dict[str, Thread] = {}
for x in unique_everseen(_entities()):
for x in unique_everseen(_entities):
if isinstance(x, Exception):
yield x
continue
Expand Down
5 changes: 2 additions & 3 deletions my/instagram/android.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,6 @@
import sqlite3
from typing import Iterator, Sequence, Optional, Dict, Union

from more_itertools import unique_everseen

from my.core import (
get_files,
Paths,
Expand All @@ -22,6 +20,7 @@
Res,
assert_never,
)
from my.core.common import unique_everseen
from my.core.cachew import mcachew
from my.core.error import echain
from my.core.sqlite import sqlite_connect_immutable, select
Expand Down Expand Up @@ -196,7 +195,7 @@ def _entities() -> Iterator[Res[Union[User, _Message]]]:
@mcachew(depends_on=inputs)
def messages() -> Iterator[Res[Message]]:
id2user: Dict[str, User] = {}
for x in unique_everseen(_entities()):
for x in unique_everseen(_entities):
if isinstance(x, Exception):
yield x
continue
Expand Down
5 changes: 3 additions & 2 deletions my/instagram/gdpr.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from pathlib import Path
from typing import Iterator, Sequence, Dict, Union

from more_itertools import bucket, unique_everseen
from more_itertools import bucket

from my.core import (
get_files,
Expand All @@ -17,6 +17,7 @@
assert_never,
make_logger,
)
from my.core.common import unique_everseen

from my.config import instagram as user_config

Expand Down Expand Up @@ -196,7 +197,7 @@ def _entitites_from_path(path: Path) -> Iterator[Res[Union[User, _Message]]]:
# TODO basically copy pasted from android.py... hmm
def messages() -> Iterator[Res[Message]]:
id2user: Dict[str, User] = {}
for x in unique_everseen(_entities()):
for x in unique_everseen(_entities):
if isinstance(x, Exception):
yield x
continue
Expand Down
5 changes: 2 additions & 3 deletions my/tinder/android.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,8 @@
import sqlite3
from typing import Sequence, Iterator, Union, Dict, List, Mapping

from more_itertools import unique_everseen

from my.core import Paths, get_files, Res, assert_never, stat, Stats, datetime_aware, make_logger
from my.core.common import unique_everseen
from my.core.error import echain
from my.core.sqlite import sqlite_connection
import my.config
Expand Down Expand Up @@ -162,7 +161,7 @@ def _parse_msg(row: sqlite3.Row) -> _Message:
def entities() -> Iterator[Res[Entity]]:
id2person: Dict[str, Person] = {}
id2match: Dict[str, Match] = {}
for x in unique_everseen(_entities()):
for x in unique_everseen(_entities):
if isinstance(x, Exception):
yield x
continue
Expand Down
7 changes: 3 additions & 4 deletions my/twitter/talon.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,8 @@
import sqlite3
from typing import Iterator, Sequence, Union

from more_itertools import unique_everseen

from my.core import Paths, Res, datetime_aware, get_files
from my.core.common import unique_everseen
from my.core.sqlite import sqlite_connection

from .common import TweetId, permalink
Expand Down Expand Up @@ -133,15 +132,15 @@ def _parse_tweet(row: sqlite3.Row) -> Tweet:


def tweets() -> Iterator[Res[Tweet]]:
for x in unique_everseen(_entities()):
for x in unique_everseen(_entities):
if isinstance(x, Exception):
yield x
elif isinstance(x, _IsTweet):
yield x.tweet


def likes() -> Iterator[Res[Tweet]]:
for x in unique_everseen(_entities()):
for x in unique_everseen(_entities):
if isinstance(x, Exception):
yield x
elif isinstance(x, _IsFavorire):
Expand Down
8 changes: 4 additions & 4 deletions my/vk/vk_messages_backup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,12 @@
from datetime import datetime
from dataclasses import dataclass
import json
from typing import Dict, Iterator, NamedTuple
from typing import Dict, Iterator

from more_itertools import unique_everseen
import pytz

from my.core import stat, Stats, Json, Res, datetime_aware
from my.core import stat, Stats, Json, Res, datetime_aware, get_files
from my.core.common import unique_everseen

from my.config import vk_messages_backup as config

Expand Down Expand Up @@ -147,7 +147,7 @@ def _messages() -> Iterator[Res[Message]]:

def messages() -> Iterator[Res[Message]]:
# seems that during backup messages were sometimes duplicated..
yield from unique_everseen(_messages())
yield from unique_everseen(_messages)


def stats() -> Stats:
Expand Down
5 changes: 2 additions & 3 deletions my/whatsapp/android.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,8 @@
import sqlite3
from typing import Sequence, Iterator, Optional

from more_itertools import unique_everseen

from my.core import get_files, Paths, datetime_aware, Res, make_logger, make_config
from my.core.common import unique_everseen
from my.core.error import echain, notnone
from my.core.sqlite import sqlite_connection
import my.config
Expand Down Expand Up @@ -202,4 +201,4 @@ def _messages() -> Iterator[Res[Message]]:


def messages() -> Iterator[Res[Message]]:
yield from unique_everseen(_messages())
yield from unique_everseen(_messages)

0 comments on commit 71cb66d

Please sign in to comment.