Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Establish MagicDataInterface #22

Merged
merged 1 commit into from
Mar 4, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 7 additions & 8 deletions datatc/data_directory.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from typing import Any, Dict, List, Union
import warnings

from datatc.data_interface import DataInterfaceManager
from datatc.data_interface import MagicDataInterface
from datatc.self_aware_data import SelfAwareData, SelfAwareDataInterface


Expand Down Expand Up @@ -186,15 +186,15 @@ class DataDirectory:
data_dir_manager = DataDirectoryManager

def __init__(self, path: str, contents: Dict[str, 'DataDirectory'] = None,
data_interface_manager=DataInterfaceManager):
magic_data_interface=MagicDataInterface):
"""
Initialize a DataDirectory at a path. The contents of that DataDirectory are recursively characterized and the
DataDirectory's data_type set. For testing purposes, the contents can also be set directly.

Args:
path: A file path at which to instantiate the DataDirectory.
contents: The files and subdirectories contained in the directory.
data_interface_manager: DataInterfaceManager object to use to interface with files.
magic_data_interface: MagicDataInterface object to use to interface with files.
"""
self.path = Path(path).expanduser().resolve()
if not self.path.exists():
Expand All @@ -206,7 +206,7 @@ def __init__(self, path: str, contents: Dict[str, 'DataDirectory'] = None,
self.contents = contents
# determine_data_type has to be done _after_ characterize dir because it inspects the children
self.data_type = self._determine_data_type()
self.data_interface_manager = data_interface_manager
self.magic_data_interface = magic_data_interface

@classmethod
def register_project(cls, project_hint: str, project_path: str) -> None:
Expand Down Expand Up @@ -305,8 +305,7 @@ def save(self, data: Any, file_name: str, **kwargs) -> None:
self.contents[new_data_dir.name] = new_data_dir

def _save_file(self, data: Any, file_name: str, **kwargs) -> 'DataFile':
data_interface = self.data_interface_manager.select(file_name)
saved_file_path = data_interface.save(data, file_name, self.path, **kwargs)
saved_file_path = self.magic_data_interface.save(data, str(Path(self.path, file_name)), **kwargs)
return DataFile(saved_file_path)

def _save_self_aware_data(self, data: Any, file_name: str, **kwargs) -> 'SelfAwareDataDirectory':
Expand Down Expand Up @@ -477,8 +476,8 @@ def load(self, data_interface_hint=None, **kwargs) -> Any:

"""
if data_interface_hint is None:
data_interface = self.data_interface_manager.select(self.data_type)
data_interface = self.magic_data_interface.select_data_interface(self.data_type)
else:
data_interface = self.data_interface_manager.select(data_interface_hint)
data_interface = self.magic_data_interface.select_data_interface(data_interface_hint)
print('Loading {}'.format(self.path))
return data_interface.load(self.path, **kwargs)
65 changes: 40 additions & 25 deletions datatc/data_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,32 +185,30 @@ def _interface_specific_load(cls, file_path, **kwargs):
return {'data': 42}


class DataInterfaceManagerBase:
class MagicDataInterfaceBase:

def __init__(self):
self.registered_interfaces = {}

def register_data_interface(self, data_interface: Type[DataInterfaceBase]) -> None:
self.registered_interfaces[data_interface.file_extension] = data_interface
def save(self, data: Any, file_path: str, mode: str = None, **kwargs) -> str:
    """
    Save data to file_path using the data interface selected from the file name.

    Args:
        data: The object to save.
        file_path: Destination path; it is split into a directory part and a file name part.
        mode: Forwarded unchanged to the selected data interface's ``save``.
        **kwargs: Forwarded unchanged to the selected data interface's ``save``.

    Returns: Whatever the selected data interface's ``save`` returns (annotated as the saved file path).
    """
    # os.path.split yields (dirname, basename) in a single call
    parent_dir, base_name = os.path.split(file_path)
    interface = self.select_data_interface(base_name)
    return interface.save(data, base_name, parent_dir, mode=mode, **kwargs)

def instantiate_data_interface(self, file_type: str) -> DataInterfaceBase:
if file_type in self.registered_interfaces:
return self.registered_interfaces[file_type]()
def load(self, file_path: str, data_interface_hint: str = None, **kwargs) -> Any:
if data_interface_hint is None:
data_interface = self.select_data_interface(file_path)
else:
raise ValueError("File type {} not recognized. Supported file types include {}".format(
file_type, list(self.registered_interfaces.keys())))
data_interface = self.select_data_interface(data_interface_hint)
print('Loading {}'.format(file_path))
return data_interface.load(file_path, **kwargs)

@staticmethod
def parse_file_hint(file_hint: str) -> str:
if type(file_hint) is PosixPath:
file_hint = file_hint.__str__()
if '.' in file_hint:
file_name, file_extension = file_hint.split('.')
return file_extension
else:
return file_hint
def register_data_interface(self, data_interface: Type[DataInterfaceBase]) -> None:
self.registered_interfaces[data_interface.file_extension] = data_interface

def select(self, file_hint: str, default_file_type=None) -> DataInterfaceBase:
def select_data_interface(self, file_hint: str, default_file_type=None) -> DataInterfaceBase:
"""
Select the appropriate data interface based on the file_hint.

Expand All @@ -219,15 +217,32 @@ def select(self, file_hint: str, default_file_type=None) -> DataInterfaceBase:
default_file_type: default file type to use, if the file_hint doesn't specify.
Returns: A DataInterface.
"""
file_hint = self.parse_file_hint(file_hint)
file_hint = self._parse_file_hint(file_hint)
if file_hint in self.registered_interfaces:
return self.instantiate_data_interface(file_hint)
return self._instantiate_data_interface(file_hint)
elif default_file_type is not None:
return self.instantiate_data_interface(default_file_type)
return self._instantiate_data_interface(default_file_type)
else:
raise ValueError("File hint {} not recognized. Supported file types include {}".format(
file_hint, list(self.registered_interfaces.keys())))

def _instantiate_data_interface(self, file_type: str) -> DataInterfaceBase:
    """
    Build a fresh instance of the data interface registered under file_type.

    Args:
        file_type: Key to look up in ``self.registered_interfaces``.

    Returns: A new instance of the registered data interface class.

    Raises:
        ValueError: If file_type has no registered data interface.
    """
    # Guard clause: reject unknown file types before touching the registry entry.
    if file_type not in self.registered_interfaces:
        raise ValueError("File type {} not recognized. Supported file types include {}".format(
            file_type, list(self.registered_interfaces.keys())))
    interface_cls = self.registered_interfaces[file_type]
    return interface_cls()

@staticmethod
def _parse_file_hint(file_hint: str) -> str:
if type(file_hint) is PosixPath:
file_hint = file_hint.__str__()
if '.' in file_hint:
file_name, file_extension = file_hint.split('.')
return file_extension
else:
return file_hint


all_live_interfaces = [
PickleDataInterface,
Expand All @@ -241,9 +256,9 @@ def select(self, file_hint: str, default_file_type=None) -> DataInterfaceBase:
YAMLDataInterface,
]

DataInterfaceManager = DataInterfaceManagerBase()
MagicDataInterface = MagicDataInterfaceBase()
for interface in all_live_interfaces:
DataInterfaceManager.register_data_interface(interface)
MagicDataInterface.register_data_interface(interface)

TestDataInterfaceManager = DataInterfaceManagerBase()
TestDataInterfaceManager.register_data_interface(TestingDataInterface)
TestMagicDataInterface = MagicDataInterfaceBase()
TestMagicDataInterface.register_data_interface(TestingDataInterface)
4 changes: 2 additions & 2 deletions datatc/data_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def save(self, data: Any, processing_func: Callable, file_name: str, data_file_t
if self.check_name_already_exists(file_name, file_dir_path):
raise ValueError("That data processor name is already in use")

data_interface = di.DataInterfaceManager.select(data_file_type)
data_interface = di.MagicDataInterface.select(data_file_type)
data = processing_func(data)
data_interface.save(data, file_name, file_dir_path)
self.processor_data_interface.save(processing_func, file_name + self.processor_designation, file_dir_path)
Expand Down Expand Up @@ -71,7 +71,7 @@ def load(self, file_name: str, file_dir_path: str) -> DataProcessor:
# find and load the data
if data_file_extension is None:
data_file_extension = self.get_data_processor_data_type(file_name, file_dir_path)
data_interface = di.DataInterfaceManager.select(data_file_extension)
data_interface = di.MagicDataInterface.select(data_file_extension)
data = data_interface.load(file_name, file_dir_path)
return DataProcessor(data, processing_func, code)

Expand Down
14 changes: 6 additions & 8 deletions datatc/self_aware_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from pathlib import Path
from typing import Any, Callable, Dict, List, Tuple, Type, Union

from datatc.data_interface import DataInterfaceManager, DillDataInterface, TextDataInterface, YAMLDataInterface
from datatc.data_interface import MagicDataInterface, DillDataInterface, TextDataInterface, YAMLDataInterface
from datatc.git_utilities import get_git_repo_of_func, check_for_uncommitted_git_changes_at_path, get_git_hash_from_path


Expand Down Expand Up @@ -347,8 +347,7 @@ def load_from_file(cls, file_path: str) -> 'SelfAwareData':
Returns: SelfAwareData with a TransformSequence containing a FileSourceTransformStep pointing to file_path

"""
data_interface = DataInterfaceManager.select(file_path)
data = data_interface.load(file_path)
data = MagicDataInterface.load(file_path)
metadata = [{'file_path': file_path}]
return cls(data, metadata)

Expand Down Expand Up @@ -521,7 +520,7 @@ def save(cls, sad: SelfAwareData, parent_path: str, file_name: str, **kwargs) -
new_transform_dir_path = Path(parent_path, transform_dir_name)
os.makedirs(new_transform_dir_path)

data_interface = DataInterfaceManager.select(data_file_type)
data_interface = MagicDataInterface.select_data_interface(data_file_type)
data_interface.save(sad.data, 'data', new_transform_dir_path, **kwargs)
cls.file_component_interfaces['func'].save(transformer_func, 'func', new_transform_dir_path)
cls.file_component_interfaces['code'].save(code, 'code', new_transform_dir_path)
Expand All @@ -548,8 +547,7 @@ def load(cls, path: str, data_interface_hint=None, load_function: bool = True, *
func_file = file_map['func']
code_file = file_map['code']

data_interface = DataInterfaceManager.select(data_file, default_file_type=data_interface_hint)
data = data_interface.load(data_file, **kwargs)
data = MagicDataInterface.load(data_file, data_interface_hint=data_interface_hint, **kwargs)
if load_function:
transformer_func = cls.file_component_interfaces['func'].load(func_file)
else:
Expand Down Expand Up @@ -640,7 +638,7 @@ def save(cls, sad: SelfAwareData, parent_path: str, file_name: str, **kwargs) -
new_transform_dir_path = Path(parent_path, transform_dir_name)
os.makedirs(new_transform_dir_path)

data_interface = DataInterfaceManager.select(data_file_type)
data_interface = MagicDataInterface.select_data_interface(data_file_type)
data_interface.save(sad.data, 'data', new_transform_dir_path, **kwargs)

cls.file_component_interfaces['sad'].save(sad, 'sad', new_transform_dir_path)
Expand Down Expand Up @@ -677,7 +675,7 @@ def load(cls, path: str, data_interface_hint=None, load_function: bool = True, *
sad = cls.file_component_interfaces['sad'].load(sad_file)
return sad
else:
data_interface = DataInterfaceManager.select(data_file, default_file_type=data_interface_hint)
data_interface = MagicDataInterface.select_data_interface(data_file, default_file_type=data_interface_hint)
data = data_interface.load(data_file, **kwargs)
metadata = cls.file_component_interfaces['provenance'].load(metadata_file)
# TODO: make function to add and extract version from sequence file
Expand Down
6 changes: 6 additions & 0 deletions docs/api_reference.rst
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,9 @@ SelfAwareData
.. autoclass:: datatc.self_aware_data.SelfAwareData
:members:


MagicDataInterface
------------------

.. autoclass:: datatc.data_interface.MagicDataInterfaceBase
:members:
28 changes: 22 additions & 6 deletions docs/getting_started.rst
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ To load a file, navigate the file system using ``[]`` operators, and then call `
>>> raw_df = dd['data_extracts']['2020-02-04_Extract_3months']['2020-02-04_Extract_3months.xlsx'].load()


Don't worry about what format the file is in- `datatc` will intuit how to load the file. See :ref:`supported_formats`.
Don't worry about what format the file is in: ``datatc`` will infer how to load it. See `Supported Formats`.


Shortcuts for loading data files *faster*
Expand Down Expand Up @@ -307,15 +307,31 @@ Here's a toy example of working with ``SelfAwareData``:
----


Working with File Types via `DataInterface`
Working with File Types via `MagicDataInterface`
------------------------------------------------

`DataInterface` provides a standard interface for interacting with all file types: ``save()`` and ``load()``. This abstracts away the exact saving and loading operations for specific file types.
``MagicDataInterface`` provides a one-stop shop for interacting with common file types.
Just point ``MagicDataInterface`` to a file path and call ``save()`` and ``load()``.

If you want to work with a file type that `datatc` doesn't know about yet, you can create a `DataInterface` for it:
.. code-block:: python

from datatc import MagicDataInterface

iris_df = MagicDataInterface.load('iris.csv')
config = MagicDataInterface.load('config/model_params.yaml')

MagicDataInterface.save(results, 'results/iris_results.pkl')

See `Supported Formats` for a list of data types that ``MagicDataInterface`` knows how to work with.


Add a Data Type to MagicDataInterface
.....................................

If you want to work with a file type that ``MagicDataInterface`` doesn't know about yet, you can create a ``DataInterface`` for it:

1. Create a ``DataInterface`` that subclasses from ``DataInterfaceBase``, and implement the ``_interface_specific_save`` and ``_interface_specific_load`` functions.

2. Register your new `DataInterface` with `DataInterfaceManager`:
2. Register your new ``DataInterface`` with ``MagicDataInterface``:

>>> DataInterfaceManager.register_data_interface(MyNewDataInterface)
>>> MagicDataInterface.register_data_interface(MyNewDataInterface)
4 changes: 2 additions & 2 deletions tests/test_data_directory.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import warnings
import pytest
from datatc.data_directory import DataDirectory, DataFile
from datatc.data_interface import TestDataInterfaceManager
from datatc.data_interface import TestMagicDataInterface


# suppress 'DataDirectory path does not exist' warnings
Expand Down Expand Up @@ -136,7 +136,7 @@ def test_save_adds_to_dir_contents(self):
expected_directory_contents_after_save = {file_name: DataFile(expected_file_path)}

data_directory = DataDirectory(path='$HOME', contents=initial_directory_contents,
data_interface_manager=TestDataInterfaceManager)
magic_data_interface=TestMagicDataInterface)
data_directory.save(42, file_name)

# check that the contents keys (the file names) are the same
Expand Down
28 changes: 27 additions & 1 deletion tests/test_data_interface.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,23 @@
import unittest
from datatc.data_interface import TestingDataInterface
import tempfile
import pandas as pd
import shutil
from datatc.data_interface import MagicDataInterface, TestingDataInterface


class TestDataInterface(unittest.TestCase):

def setUp(self):
# Create a temporary directory
self.test_dir = tempfile.mkdtemp()

# establish a dataframe
self.raw_df = pd.DataFrame({'col_1': range(50), 'col_2': range(0, 100, 2)})

def tearDown(self):
# Remove the directory after the test
shutil.rmtree(self.test_dir)

def test_construct_file_path_name_without_extension(self):
file_name = 'file'
file_dir_path = '/home'
Expand All @@ -15,3 +29,15 @@ def test_construct_file_path_name_with_extension(self):
file_dir_path = '/home'
expected_result = '/home/file.yaml'
self.assertEqual(TestingDataInterface.construct_file_path(file_name, file_dir_path), expected_result)

def test_data_interface_save(self):
    """Saving a DataFrame through MagicDataInterface writes a CSV that pandas reads back unchanged."""
    import os  # function-scope import: this block does not edit the module's import lines

    # Join with a path separator so the file is created INSIDE the temp directory that tearDown removes.
    # Plain string concatenation produced a sibling path (e.g. '<tmpdir>test_save.csv') that was never cleaned up.
    p = os.path.join(self.test_dir, 'test_save.csv')
    MagicDataInterface.save(self.raw_df, p, index=False)
    reloaded_data = pd.read_csv(p)
    pd.testing.assert_frame_equal(self.raw_df, reloaded_data)

def test_data_interface_load(self):
    """Loading a CSV through MagicDataInterface returns the DataFrame that pandas wrote."""
    import os  # function-scope import: this block does not edit the module's import lines

    # Join with a path separator so the fixture file lives INSIDE the temp directory that tearDown removes.
    # Plain string concatenation produced a sibling path (e.g. '<tmpdir>test_load.csv') that was never cleaned up.
    p = os.path.join(self.test_dir, 'test_load.csv')
    self.raw_df.to_csv(p, index=False)
    reloaded_data = MagicDataInterface.load(p)
    pd.testing.assert_frame_equal(self.raw_df, reloaded_data)
1 change: 0 additions & 1 deletion tests/test_self_aware_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import shutil
import tempfile
from datatc.self_aware_data import SelfAwareData, SelfAwareDataInterface, FileSourceTransformStep
from datatc.data_interface import DataInterfaceManager


class TestSelfAwareData(unittest.TestCase):
Expand Down