Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

core/stats: less duplication in output, report stats of first item, report input stats #324

Merged
merged 3 commits into from
Oct 21, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 32 additions & 13 deletions my/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -401,45 +401,55 @@ def quick_stats():
Stats = Dict[str, Any]
StatsFun = Callable[[], Stats]
# todo not sure about return type...
def stat(
    func: Union[Callable[[], Iterable[C]], Iterable[C]],
    *,
    quick: bool = False,
    name: Optional[str] = None,
) -> Stats:
    """
    Compute summary stats for a data provider, or for already computed data.

    :param func: either a zero-argument callable that returns the data, or the data itself
    :param quick: forwarded to ``_stat_iterable`` for non-DataFrame data
    :param name: key to report the stats under. When omitted, the callable's
        ``__name__`` is used; if there is none (plain data, ``functools.partial``
        objects, ...), a generated ``unnamed_<id>`` key is used instead
    :return: a single-entry dict mapping the chosen name to the computed stats
    """
    fname: Optional[str]
    if callable(func):
        fr = func()
        # not every callable carries __name__ (e.g. functools.partial), so don't assume it
        fname = getattr(func, '__name__', None)
    else:
        # meh. means it's just a list.. not sure how to generate a name then
        fr = func
        fname = None
    if fname is None:
        fname = f'unnamed_{id(fr)}'

    type_name = type(fr).__name__
    if type_name == 'DataFrame':
        # dynamic, because pandas is an optional dependency..
        df = cast(Any, fr)  # todo ugh, not sure how to annotate properly
        res = {
            'dtypes': df.dtypes.to_dict(),
            'rows': len(df),
        }
    else:
        res = _stat_iterable(fr, quick=quick)

    stat_name = name if name is not None else fname
    return {
        stat_name: res,
    }


def _stat_iterable(it: Iterable[C], quick: bool=False) -> Any:
def _stat_iterable(it: Iterable[C], quick: bool = False) -> Any:
from more_itertools import ilen, take, first

# todo not sure if there is something in more_itertools to compute this?
total = 0
errors = 0
last = None
first_item = None
last_item = None

def funcit():
nonlocal errors, last, total
nonlocal errors, first_item, last_item, total
for x in it:
total += 1
if isinstance(x, Exception):
errors += 1
else:
last = x
last_item = x
if first_item is None:
first_item = x
yield x

eit = funcit()
Expand All @@ -464,10 +474,19 @@ def funcit():
if errors > 0:
res['errors'] = errors

if last is not None:
dt = guess_datetime(last)
if dt is not None:
res['last'] = dt
def stat_item(item):
if item is None:
return None
if isinstance(item, Path):
return str(item)
return guess_datetime(item)

if (stat_first := stat_item(first_item)) is not None:
res['first'] = stat_first

if (stat_last := stat_item(last_item)) is not None:
res['last'] = stat_last

return res


Expand Down
38 changes: 30 additions & 8 deletions my/core/stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,16 +12,42 @@

# TODO maybe could be enough to annotate OUTPUTS or something like that?
# then stats could just use them as hints?
def guess_stats(module_name: str, quick: bool = False) -> Optional[StatsFun]:
    """
    Build a stats function for a module from its automatically detected data providers.

    :param module_name: name of the module to inspect, passed to ``guess_data_providers``
    :param quick: forwarded to ``stat`` for every provider
    :return: ``None`` if no providers were detected; otherwise a zero-argument
        function that runs ``stat`` on each provider and merges the results
        into one flat dict keyed by provider name
    """
    providers = guess_data_providers(module_name)
    if not providers:
        return None

    def auto_stats() -> Stats:
        res: Stats = {}
        for provider_name, provider in providers.items():
            # stat() returns a single-entry {name: stats} dict, so merging keeps the output flat
            res.update(stat(provider, quick=quick, name=provider_name))
        return res

    return auto_stats


def test_guess_stats() -> None:
    """Check the stats guessed for the helper module ``my.core.tests.auto_stats``."""
    from datetime import datetime
    import my.core.tests.auto_stats as M

    auto_stats = guess_stats(M.__name__)
    assert auto_stats is not None
    res = auto_stats()

    # one entry per detected provider, keyed by the provider's name
    assert res == {
        'inputs': {
            'count': 3,
            'first': 'file1.json',
            'last': 'file3.json',
        },
        'iter_data': {
            'count': 9,
            'first': datetime(2020, 1, 1, 1, 1, 1),
            'last': datetime(2020, 1, 3, 1, 1, 1),
        },
    }


def guess_data_providers(module_name: str) -> Dict[str, Callable]:
module = importlib.import_module(module_name)
mfunctions = inspect.getmembers(module, inspect.isfunction)
Expand All @@ -34,7 +60,6 @@ def is_data_provider(fun: Any) -> bool:
1. returns iterable or something like that
2. takes no arguments? (otherwise not callable by stats anyway?)
3. doesn't start with an underscore (those are probably helper functions?)
4. functions isn't the 'inputs' function (or ends with '_inputs')
"""
# todo maybe for 2 allow default arguments? not sure
# one example which could benefit is my.pdfs
Expand All @@ -54,9 +79,6 @@ def is_data_provider(fun: Any) -> bool:
# probably a helper function?
if fun.__name__.startswith('_'):
return False
# ignore def inputs; something like comment_inputs or backup_inputs should also be ignored
if fun.__name__ == 'inputs' or fun.__name__.endswith('_inputs'):
return False

# inspect.signature might return str instead of a proper type object
# if from __future__ import annotations is used
Expand Down Expand Up @@ -96,11 +118,11 @@ def _helper_func() -> Iterator[Any]:

def inputs() -> Iterator[Any]:
yield 1
assert not idp(inputs)
assert idp(inputs)

def producer_inputs() -> Iterator[Any]:
yield 1
assert not idp(producer_inputs)
assert idp(producer_inputs)


# return any parameters the user is required to provide - those which don't have default values
Expand Down
30 changes: 30 additions & 0 deletions my/core/tests/auto_stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
"""
Helper 'module' for test_guess_stats
"""

from dataclasses import dataclass
from datetime import datetime, timedelta
from pathlib import Path
from typing import Iterable, Sequence


@dataclass
class Item:
    """Sample record produced by the test data provider in this helper module."""

    # identifier of the item, as a string
    id: str
    # timestamp associated with the item
    dt: datetime
    # file the item is attributed to
    source: Path


def inputs() -> Sequence[Path]:
    """Return the fixed list of three sample input paths used by the test."""
    return [
        Path('file1.json'),
        Path('file2.json'),
        Path('file3.json'),
    ]


def iter_data() -> Iterable[Item]:
    """
    Yield three items for every path returned by ``inputs()``.

    Within each path the items have ids '0', '1', '2' and timestamps
    2020-01-01 01:01:01 plus 0, 1 and 2 days respectively.
    """
    dt = datetime.fromisoformat('2020-01-01 01:01:01')
    for path in inputs():
        for i in range(3):
            yield Item(id=str(i), dt=dt + timedelta(days=i), source=path)
Loading