Update docs #118

Merged · 8 commits · Oct 6, 2022
cascade/base/meta_handler.py (64 additions, 48 deletions)
@@ -16,9 +16,8 @@
 import os
 import json
-from typing import Union
 import datetime
-from typing import List, Dict
+from typing import Union, List, Dict
 from json import JSONEncoder
 
 import yaml
@@ -61,15 +60,15 @@ def default(self, obj):
 
         return super(CustomEncoder, self).default(obj)
 
-    def obj_to_dict(self, obj):
+    def obj_to_dict(self, obj) -> Dict:
        return json.loads(self.encode(obj))
 
 
 class BaseHandler:
-    def read(self, path) -> List[Dict]:
+    def read(self, path: str) -> Union[Dict, List[Dict]]:
         raise NotImplementedError()
 
-    def write(self, path, obj, overwrite=True) -> None:
+    def write(self, path: str, obj, overwrite=True) -> None:
         raise NotImplementedError()
 
     def _raise_io_error(self, path, exc):
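The `obj_to_dict` method above relies on an encode-then-decode roundtrip to turn arbitrary encodable objects into plain dicts. A minimal standalone sketch of that trick, with a hypothetical datetime-aware encoder standing in for the repository's CustomEncoder:

    import json
    import datetime
    from json import JSONEncoder

    class DatetimeEncoder(JSONEncoder):
        # Hypothetical stand-in for CustomEncoder: datetimes become ISO strings
        def default(self, obj):
            if isinstance(obj, (datetime.date, datetime.datetime)):
                return obj.isoformat()
            return super().default(obj)

    def obj_to_dict(obj) -> dict:
        # Encode to a JSON string, then parse back into plain dicts and lists
        return json.loads(DatetimeEncoder().encode(obj))

    assert obj_to_dict({'when': datetime.date(2022, 10, 6)}) == {'when': '2022-10-06'}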
@@ -80,24 +79,7 @@ def _raise_io_error(self, path, exc):
 
 
 class JSONHandler(BaseHandler):
-    """
-    Handles the logic of dumping and loading json files
-    """
-    def read(self, path) -> Union[Dict, List[Dict]]:
-        """
-        Reads json from path
-
-        Parameters
-        ----------
-        path:
-            Path to the file. If no extension provided,
-            then .json will be added
-
-        Raises
-        ------
-        IOError
-            when decoding errors occur
-        """
+    def read(self, path: str) -> Union[Dict, List[Dict]]:
         _, ext = os.path.splitext(path)
         if ext == '':
             path += '.json'
@@ -111,32 +93,16 @@ def read(self, path) -> Union[Dict, List[Dict]]:
             self._raise_io_error(path, e)
         return meta
 
-    def write(self, name, obj: List[Dict], overwrite=True) -> None:
-        """
-        Writes json to path using custom encoder
-        """
-        if not overwrite and os.path.exists(name):
+    def write(self, path: str, obj: List[Dict], overwrite=True) -> None:
+        if not overwrite and os.path.exists(path):
             return
 
-        with open(name, 'w') as f:
+        with open(path, 'w') as f:
             json.dump(obj, f, cls=CustomEncoder, indent=4)
 
 
 class YAMLHandler(BaseHandler):
-    def read(self, path) -> Union[Dict, List[Dict]]:
-        """
-        Reads yaml from path
-
-        Parameters
-        ----------
-        path:
-            Path to the file. If no extension provided, then .yml will be added
-
-        Raises
-        ------
-        IOError
-            when decoding errors occur
-        """
+    def read(self, path: str) -> Union[Dict, List[Dict]]:
         _, ext = os.path.splitext(path)
         if ext == '':
             path += '.yml'
@@ -148,7 +114,7 @@ def read(self, path) -> Union[Dict, List[Dict]]:
             self._raise_io_error(path, e)
         return meta
 
-    def write(self, path, obj, overwrite=True) -> None:
+    def write(self, path: str, obj, overwrite=True) -> None:
         if not overwrite and os.path.exists(path):
             return

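Both `read` implementations above share the same extension-defaulting idiom via `os.path.splitext`. As a standalone illustration (the helper name is invented for this sketch):

    import os

    def with_default_ext(path: str, default: str) -> str:
        # Append the default extension only when the path has none
        _, ext = os.path.splitext(path)
        return path + default if ext == '' else path

    assert with_default_ext('meta', '.json') == 'meta.json'
    assert with_default_ext('meta.yml', '.json') == 'meta.yml'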
@@ -158,14 +124,14 @@ def write(self, path, obj, overwrite=True) -> None:
 
 
 class TextHandler(BaseHandler):
-    def read(self, path) -> Dict:
+    def read(self, path: str) -> Dict:
         """
         Reads text file from path and returns dict
         in the form {path: 'text from file'}
 
         Parameters
         ----------
-        path:
+        path: str
             Path to the file
         """
 
@@ -179,11 +145,61 @@ def write(self, path, obj, overwrite=True) -> None:
 
 
 class MetaHandler:
-    def read(self, path) -> List[Dict]:
-        """
+    """
+    Encapsulates the logic of reading and writing metadata to disk.
+
+    Supported read-write formats are `json` and `yml`. Other formats
+    are supported as read-only. For example, one can read meta from a txt or md file.
+
+    Examples
+    --------
+    >>> from cascade.base import MetaHandler
+    >>> mh = MetaHandler()
+    >>> mh.write('meta.json', {'hello': 'world'})
+    >>> obj = mh.read('meta.json')
+    >>> mh.write('meta.yml', {'hello': 'world'})
+    >>> obj = mh.read('meta.yml')
+    """
+    def read(self, path: str) -> Union[Dict, List[Dict]]:
+        """
+        Reads object from path.
+
+        Parameters
+        ----------
+        path: str
+            Path to the object.
+
+        Returns
+        -------
+        obj: Union[Dict, List[Dict]]
+
+        Raises
+        ------
+        IOError
+            when decoding errors occur
+        """
         handler = self._get_handler(path)
         return handler.read(path)
 
-    def write(self, path, obj, overwrite=True) -> None:
+    def write(self, path: str, obj, overwrite: bool = True) -> None:
+        """
+        Writes object to path.
+
+        Parameters
+        ----------
+        path: str
+            Path where to write the object, with name and extension
+        obj
+            An object to be serialized and saved
+        overwrite: bool, optional
+            Whether to overwrite the file if it already exists. If False
+            and the file already exists, will silently return without saving.
+
+        Raises
+        ------
+        IOError
+            when encoding errors occur
+        """
         handler = self._get_handler(path)
         return handler.write(path, obj, overwrite=overwrite)
 
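Taken together, the new MetaHandler docstrings describe extension-based dispatch plus an opt-out overwrite flag. A short usage sketch based purely on those docstrings, not on code beyond this diff:

    from cascade.base import MetaHandler

    mh = MetaHandler()

    # The extension picks the handler: .json and .yml are read-write
    mh.write('meta.json', {'created_at': '2022-10-06'})
    meta = mh.read('meta.json')

    # With overwrite=False an existing file is left untouched:
    # write() returns silently instead of saving
    mh.write('meta.json', {'created_at': 'changed'}, overwrite=False)
    assert mh.read('meta.json') == meta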
cascade/base/traceable.py (21 additions, 5 deletions)
@@ -3,7 +3,23 @@
 
 
 class Traceable:
-    def __init__(self, *args, meta_prefix=None, **kwargs) -> None:
+    """
+    Base class for everything that has metadata in cascade.
+    Handles the logic of getting and updating the internal meta prefix.
+    """
+    def __init__(self, *args, meta_prefix: Union[Dict, str] = None, **kwargs) -> None:
+        """
+        Parameters
+        ----------
+        meta_prefix: Union[Dict, str], optional
+            The dictionary that is used to update the object's meta in the `get_meta` call.
+            Since `update` is used, it can overwrite default values.
+            If str, the prefix is assumed to be a path and is loaded using MetaHandler.
+
+        See also
+        --------
+        cascade.base.MetaHandler
+        """
         if meta_prefix is None:
             meta_prefix = {}
         elif isinstance(meta_prefix, str):
@@ -22,8 +38,8 @@ def get_meta(self) -> List[Dict]:
         meta: List[Dict]
             A list where last element is this object's metadata.
             Meta can be anything that is worth to document about
-            the object and its properties. This is done in form
-            of list to enable cascade-like calls in Modifiers and Samplers.
+            the object and its properties.
+            Meta is a list to allow the formation of pipelines.
         """
         meta = {
             'name': repr(self)
@@ -36,8 +52,8 @@ def get_meta(self) -> List[Dict]:
 
     def update_meta(self, obj: Union[Dict, str]) -> None:
         """
-        Updates _meta_prefix, which is then updates
-        dataset's meta when get_meta() is called
+        Updates `_meta_prefix`, which then updates
+        the dataset's meta when `get_meta()` is called
         """
         if isinstance(obj, str):
             obj = self._read_meta_from_file(obj)
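To make the meta prefix mechanics concrete, here is a hedged sketch of how a Traceable might be used, assuming `get_meta()` merges the prefix into the object's meta dict as the docstrings describe:

    from cascade.base import Traceable

    tr = Traceable(meta_prefix={'author': 'user'})
    meta = tr.get_meta()                 # a list; the last element describes this object
    assert meta[-1]['author'] == 'user'  # prefix keys merged in via update

    tr.update_meta({'comment': 'reviewed'})
    assert tr.get_meta()[-1]['comment'] == 'reviewed'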
cascade/data/apply_modifier.py (13 additions, 3 deletions)
@@ -20,17 +20,27 @@
 
 class ApplyModifier(Modifier):
     """
-    Modifier that maps a function to previous dataset's elements in a lazy way.
+    Modifier that maps a function to the given dataset's items in a lazy way.
     """
     def __init__(self, dataset: Dataset, func: Callable, *args, **kwargs) -> None:
         """
         Parameters
         ----------
         dataset: Dataset
-            a dataset to modify
+            A dataset to modify
         func: Callable
-            a function to be applied to every item of a dataset -
+            A function to be applied to every item of a dataset -
             each `__getitem__` would call `func` on an item obtained from a previous dataset
+
+        Examples
+        --------
+        >>> from cascade import data as cdd
+        >>> ds = cdd.Wrapper([0, 1, 2, 3, 4])
+        >>> ds = cdd.ApplyModifier(ds, lambda x: x ** 2)
+
+        Now the function will only be applied when items are retrieved
+
+        >>> assert [item for item in ds] == [0, 1, 4, 9, 16]
         """
         super().__init__(dataset, *args, **kwargs)
         self._func = func
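The laziness the docstring emphasizes can be checked directly: the mapped function should not run at construction time, only on item access. A small probe sketch under that assumption:

    from cascade import data as cdd

    calls = []

    def probe(x):
        calls.append(x)  # record each time the function actually runs
        return x * 10

    ds = cdd.ApplyModifier(cdd.Wrapper([1, 2, 3]), probe)
    assert calls == []   # nothing computed while building the pipeline

    first = ds[0]        # __getitem__ triggers exactly one call
    assert first == 10 and calls == [1]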
cascade/data/bruteforce_cacher.py (30 additions, 3 deletions)
@@ -20,14 +20,41 @@
 
 class BruteforceCacher(Modifier):
     """
-    Unusual modifier which loads everything in memory in initialization phase
-    and then returns values from cache
+    Identity modifier that calls the whole previous pipeline in `__init__`, loading everything
+    into memory. This is useful in combination with `Pickler` when the pipeline
+    has heavy operations upstream. You can load everything and pickle it to turn off
+    the heavy part of the pipeline.
+
+    Examples
+    --------
+    >>> from cascade import data as cdd
+    >>> ds = cdd.Wrapper([0 for _ in range(1000000)])
+    >>> ds = cdd.ApplyModifier(ds, lambda x: x + 1)
+    >>> ds = cdd.ApplyModifier(ds, lambda x: x + 1)
+    >>> ds = cdd.ApplyModifier(ds, lambda x: x + 1)
+
+    Cache the heavy upstream part once
+
+    >>> ds = cdd.BruteforceCacher(ds)
+
+    Then pickle it
+
+    >>> ds = cdd.Pickler('ds', ds)
+
+    Unpickle and use further
+
+    >>> ds = cdd.Pickler('ds')
+    >>> ds = cdd.RandomSampler(ds, 1000)
 
     See also
     --------
-    Cascade.data.SequentialCacher
+    cascade.data.SequentialCacher
+    cascade.data.Pickler
     """
     def __init__(self, dataset: Dataset, *args, **kwargs) -> None:
+        """
+        Loads every item of the dataset into an internal list.
+        """
         super().__init__(dataset, *args, **kwargs)
         # forcibly calling all previous datasets in the init
         if hasattr(self._dataset, '__len__') and hasattr(self._dataset, '__getitem__'):
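The in-code comment "forcibly calling all previous datasets in the init" summarizes the whole class. For illustration only, the pattern boils down to something like the following sketch; this is not the library's actual implementation:

    class EagerCache:
        # Materialize the wrapped dataset once, up front;
        # afterwards __getitem__ is a plain list lookup
        def __init__(self, dataset) -> None:
            if hasattr(dataset, '__len__') and hasattr(dataset, '__getitem__'):
                self._data = [dataset[i] for i in range(len(dataset))]
            else:
                # fall back to plain iteration for datasets without __len__
                self._data = list(dataset)

        def __getitem__(self, index):
            return self._data[index]

        def __len__(self) -> int:
            return len(self._data)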
cascade/data/concatenator.py (10 additions, 2 deletions)
@@ -23,15 +23,23 @@
 class Concatenator(Dataset):
     """
     Unifies several Datasets under one, calling them sequentially in the provided order.
+
+    Examples
+    --------
+    >>> from cascade.data import Wrapper, Concatenator
+    >>> ds_1 = Wrapper([0, 1, 2])
+    >>> ds_2 = Wrapper([2, 1, 0])
+    >>> ds = Concatenator((ds_1, ds_2))
+    >>> assert [item for item in ds] == [0, 1, 2, 2, 1, 0]
     """
     def __init__(self, datasets: Iterable[Dataset], *args, **kwargs) -> None:
         """
         Creates concatenated dataset from the list of datasets provided
 
         Parameters
         ----------
-        datasets: Iterable[Dataset]
-            a list or tuple of datasets to concatenate
+        datasets: Union[Iterable[Dataset], Mapping[Dataset]]
+            A list or tuple of datasets to concatenate
         """
         self._datasets = datasets
         lengths = [len(ds) for ds in self._datasets]
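Behind the doctest above sits a routine piece of index arithmetic: a global index is mapped onto per-dataset ranges using cumulative lengths. A sketch of that technique, not Concatenator's verbatim code:

    from itertools import accumulate
    from typing import List, Tuple

    def locate(index: int, lengths: List[int]) -> Tuple[int, int]:
        # Map a global index to (dataset number, index within that dataset)
        bounds = list(accumulate(lengths))  # e.g. [3, 6] for two datasets of length 3
        for ds_num, upper in enumerate(bounds):
            if index < upper:
                lower = bounds[ds_num - 1] if ds_num > 0 else 0
                return ds_num, index - lower
        raise IndexError(f'{index} is out of range for total length {bounds[-1]}')

    assert locate(0, [3, 3]) == (0, 0)  # first item of the first dataset
    assert locate(4, [3, 3]) == (1, 1)  # second item of the second dataset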
cascade/data/cyclic_sampler.py (2 additions, 9 deletions)
@@ -25,15 +25,8 @@ class CyclicSampler(Sampler):
     -------
     >>> from cascade.data import CyclicSampler, Wrapper
     >>> ds = Wrapper([1,2,3])
-    >>> ds = CyclicSampler(ds, 5)
-    >>> for item in ds:
-    ...     print(item)
-    ...
-    1
-    2
-    3
-    1
-    2
+    >>> ds = CyclicSampler(ds, 7)
+    >>> assert [item for item in ds] == [1, 2, 3, 1, 2, 3, 1]
     """
     def __getitem__(self, index) -> T:
         internal_index = index % len(self._dataset)
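The new doctest doubles as a spec for `__getitem__`: indices wrap around by modulo, as the visible line `internal_index = index % len(self._dataset)` shows. The same behavior in a dependency-free sketch:

    from typing import Any, List

    def cyclic_get(data: List[Any], index: int, virtual_len: int) -> Any:
        # Pretend 'data' has 'virtual_len' items, wrapping with a modulo
        if not 0 <= index < virtual_len:
            raise IndexError(index)
        return data[index % len(data)]

    assert [cyclic_get([1, 2, 3], i, 7) for i in range(7)] == [1, 2, 3, 1, 2, 3, 1]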