Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MNT: Refactoring changes to CSV adapter + CSVArrayAdapter #803

Merged
merged 31 commits into from
Jan 24, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
931e1a8
MNT: small refactoring changes to CSV adapter
genematx Oct 29, 2024
ac4a265
MNT: refactor and remove dataframe_adapter property
genematx Oct 29, 2024
1e6d78a
MNT: add changelog entry
genematx Oct 29, 2024
e02f849
ENH: draft CSV Array adapter
genematx Oct 30, 2024
0be72ab
ENH: constructor methods for CSVArrayAdapter
genematx Nov 1, 2024
de240a8
ENH: Subclass CSVArrayAdapter from ArrayAdapter
genematx Nov 6, 2024
9c6ade4
Merge branch 'refactor-csv' into csv-array
genematx Nov 7, 2024
bfce5ff
ENH: add from_assets constructor for Adapters
genematx Nov 7, 2024
c00a2e9
ENH: pass dtype parameter to CSV adapter from structure
genematx Nov 8, 2024
feca99c
FIX: instance checking of data_type
genematx Nov 8, 2024
d7f5c4a
ENH: allow to pass nrows parameter to CSVAdapter
genematx Nov 8, 2024
8bbd481
FIX: reshape and rechunk when extending a csv array
genematx Nov 13, 2024
bc7dd51
Merge branch 'main' into refactor-csv
genematx Dec 10, 2024
8d10672
FIX: typo
genematx Dec 10, 2024
0e876ef
Merge branch 'main' into refactor-csv
genematx Jan 13, 2025
3510bdd
MNT: remove unused read_csv function
genematx Jan 13, 2025
5acdfa1
TST: Tests for CSVArrayAdapter
genematx Jan 13, 2025
a4149d5
ENH: from_uris for CSVArrayAdapter
genematx Jan 14, 2025
4f785de
TST: fix relative links in test
genematx Jan 14, 2025
2b59ef8
MNT: remove access_policy attribute from adapters
genematx Jan 17, 2025
0159b7c
TST: ensure old tests pass
genematx Jan 17, 2025
1b7b113
ENH: rename from_assets to from_catalog
genematx Jan 21, 2025
d07a3f9
MNT: avoid heavy imports
genematx Jan 21, 2025
1859da6
MNT: lint
genematx Jan 21, 2025
e77eb12
MNT: fix dask imports
genematx Jan 21, 2025
b1b835e
Merge branch 'main' into refactor-csv
genematx Jan 22, 2025
a26e4d5
MNT: Update Changelog
genematx Jan 22, 2025
5d6d812
ENH: use variadic args for adapter.from_uris
genematx Jan 22, 2025
b28bac4
ENH: declare data_source and node as positional-only arguments
genematx Jan 22, 2025
a0f90f4
TST: try with extra slash in uris
genematx Jan 22, 2025
7a7ae78
MNT: use standardized init_storage method
genematx Jan 22, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ Write the date in place of the "Unreleased" in the case a new version is release
`public_keys`, which can be fetched at initialization (server startup) time.
See examples `example_configs/orcid_auth.yml`,
`example_configs/google_auth.yml`, and `example_configs/simple_oidc`.
- Refactor and standardize Adapter API: implement from_uris and from_catalog
classmethods for instantiation from files and registered Tiled nodes, respectively.
- Refactor CSVAdapter to allow pd.read_csv kwargs
genematx marked this conversation as resolved.
Show resolved Hide resolved

### Maintenance

Expand Down
2 changes: 1 addition & 1 deletion docs/source/reference/service.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ or its dask counterpart.
.. autosummary::
:toctree: generated

tiled.adapters.csv.read_csv
tiled.adapters.csv.CSVAdapter
tiled.adapters.excel.ExcelAdapter
tiled.adapters.hdf5.HDF5Adapter
tiled.adapters.netcdf.read_netcdf
Expand Down
4 changes: 2 additions & 2 deletions tiled/_tests/test_catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
import tifffile
import xarray

from ..adapters.csv import read_csv
from ..adapters.csv import CSVAdapter
from ..adapters.dataframe import ArrayAdapter
from ..adapters.tiff import TiffAdapter
from ..catalog import in_memory
Expand Down Expand Up @@ -236,7 +236,7 @@ async def test_write_dataframe_external_direct(a, tmpdir):
filepath = str(tmpdir / "file.csv")
data_uri = ensure_uri(filepath)
df.to_csv(filepath, index=False)
dfa = read_csv(data_uri)
dfa = CSVAdapter.from_uris(data_uri)
structure = asdict(dfa.structure())
await a.create_node(
key="x",
Expand Down
141 changes: 141 additions & 0 deletions tiled/_tests/test_csv_adapter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
from pathlib import Path

import numpy
import pandas
import pytest

from ..adapters.csv import CSVArrayAdapter
from ..catalog import in_memory
from ..client import Context, from_context
from ..server.app import build_app
from ..structures.array import ArrayStructure
from ..structures.core import StructureFamily
from ..structures.data_source import Asset, DataSource, Management
from ..structures.table import TableStructure

# Seeded RNG keeps the randomly generated test arrays reproducible across runs.
rng = numpy.random.default_rng(12345)

# Small mixed-dtype table exercised by the table-adapter test.
_table_data = {
    "C": ["red", "green", "blue", "white"],
    "D": [10.0, 20.0, 30.0, 40.0],
    "E": [0, 1, 2, 3],
}
df1 = pandas.DataFrame(_table_data)

# NOTE: the two draws below must stay in this order so the seeded generator
# produces the same values as before.
arr1 = rng.integers(0, 255, size=(13, 17), dtype="uint8")
arr2 = rng.random(size=(15, 19), dtype="float64")

# DataFrame views of the arrays; the fixtures write these to headerless CSVs.
df_arr1 = pandas.DataFrame(arr1)
df_arr2 = pandas.DataFrame(arr2)


@pytest.fixture(scope="module")
def tree(tmp_path_factory):
    """Module-scoped in-memory catalog with writable temporary storage."""
    storage_root = tmp_path_factory.getbasetemp()
    return in_memory(writable_storage=storage_root)


@pytest.fixture(scope="module")
def context(tree):
    """Serve the catalog via an in-process app; pre-create container 'x'."""
    with Context.from_app(build_app(tree)) as ctx:
        from_context(ctx).create_container(key="x")
        yield ctx


@pytest.fixture
def csv_table_file(tmpdir):
    """Write df1 to a CSV (with header row) and yield its path as a string."""
    path = Path(tmpdir) / "table.csv"
    df1.to_csv(path, index=False)
    yield str(path)


@pytest.fixture
def csv_array1_file(tmpdir):
    """Write the uint8 array to a headerless CSV; yield its path as a string."""
    path = Path(tmpdir) / "array_1.csv"
    df_arr1.to_csv(path, index=False, header=False)
    yield str(path)


@pytest.fixture
def csv_array2_file(tmpdir):
    """Write the float array to a headerless CSV; yield its path as a string."""
    path = Path(tmpdir) / "array_2.csv"
    df_arr2.to_csv(path, index=False, header=False)
    yield str(path)


def test_csv_table(context, csv_table_file):
    """Register an externally managed CSV table and read it back intact."""
    client = from_context(context)

    asset = Asset(
        data_uri=f"file://localhost/{csv_table_file}",
        is_directory=False,
        parameter="data_uris",
        num=0,
    )
    source = DataSource(
        mimetype="text/csv;header=present",
        assets=[asset],
        structure_family=StructureFamily.table,
        structure=TableStructure.from_pandas(df1),
        management=Management.external,
    )

    client["x"].new(
        structure_family=StructureFamily.table,
        data_sources=[source],
        key="table",
    )

    result = client["x"]["table"].read()
    assert set(result.columns) == set(df1.columns)
    assert (result == df1).all().all()


def test_csv_arrays(context, csv_array1_file, csv_array2_file):
    """Register two externally managed headerless CSV arrays and read them back."""
    client = from_context(context)

    cases = [
        ("array1", csv_array1_file, arr1),
        ("array2", csv_array2_file, arr2),
    ]
    for key, fpath, expected in cases:
        asset = Asset(
            data_uri=f"file://localhost/{fpath}",
            is_directory=False,
            parameter="data_uris",
            num=0,
        )
        source = DataSource(
            mimetype="text/csv;header=absent",
            assets=[asset],
            structure_family=StructureFamily.array,
            structure=ArrayStructure.from_array(expected),
            management=Management.external,
        )
        client["x"].new(
            structure_family=StructureFamily.array,
            data_sources=[source],
            key=key,
        )

    # Integer data round-trips through CSV exactly; floats go through text
    # formatting, so compare those with a tolerance.
    assert numpy.array_equal(client["x"]["array1"].read(), arr1)
    assert numpy.isclose(client["x"]["array2"].read(), arr2).all()


def test_csv_arrays_from_uris(csv_array1_file, csv_array2_file):
    """Instantiate CSVArrayAdapter directly from file URIs and verify contents."""
    adapter = CSVArrayAdapter.from_uris(f"file://localhost/{csv_array1_file}")
    # arr1 is integer-valued (uint8), so the CSV round-trip is exact; use
    # array_equal here for strictness and consistency with test_csv_arrays.
    assert numpy.array_equal(adapter.read(), arr1)

    adapter = CSVArrayAdapter.from_uris(f"file://localhost/{csv_array2_file}")
    # Float data passes through text formatting; compare with a tolerance.
    assert numpy.isclose(adapter.read(), arr2).all()
4 changes: 2 additions & 2 deletions tiled/_tests/test_custom_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ async def test_xdi_round_trip(tmpdir):
"readable_storage": [tmpdir / "files"],
"init_if_not_exists": True,
"adapters_by_mimetype": {
"application/x-xdi": "tiled.examples.xdi:read_xdi"
"application/x-xdi": "tiled.examples.xdi:XDIAdapter"
},
},
}
Expand All @@ -46,7 +46,7 @@ async def test_xdi_round_trip(tmpdir):
await register(
client,
tmpdir / "files",
adapters_by_mimetype={"application/x-xdi": "tiled.examples.xdi:read_xdi"},
adapters_by_mimetype={"application/x-xdi": "tiled.examples.xdi:XDIAdapter"},
mimetypes_by_file_ext={".xdi": "application/x-xdi"},
)
client["example"].export(str(tmpdir / "exported.xdi"))
Expand Down
27 changes: 20 additions & 7 deletions tiled/_tests/test_directory_walker.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

from ..adapters.hdf5 import HDF5Adapter
from ..adapters.tiff import TiffAdapter
from ..adapters.utils import init_adapter_from_catalog
from ..catalog import in_memory
from ..client import Context, from_context
from ..client.register import (
Expand Down Expand Up @@ -203,17 +204,29 @@ async def test_image_file_with_sidecar_metadata_file(tmpdir):
with open(metadata_filepath, "w") as file:
yaml.dump(metadata, file)

def read_tiff_with_yaml_metadata(image_uri, metadata_uri, metadata=None, **kwargs):
with open(path_from_uri(metadata_uri)) as file:
metadata = yaml.safe_load(file)
return TiffAdapter(image_uri, metadata=metadata, **kwargs)
class TiffAdapterWithSidecar(TiffAdapter):
def __init__(self, image_uri, metadata_uri, metadata=None, **kwargs):
with open(path_from_uri(metadata_uri)) as file:
metadata = yaml.safe_load(file)

super().__init__(image_uri, metadata=metadata, **kwargs)

@classmethod
def from_catalog(
cls,
data_source,
node,
/,
**kwargs,
):
return init_adapter_from_catalog(cls, data_source, node, **kwargs)

catalog = in_memory(
writable_storage=tmpdir,
adapters_by_mimetype={MIMETYPE: read_tiff_with_yaml_metadata},
adapters_by_mimetype={MIMETYPE: TiffAdapterWithSidecar},
)
with Context.from_app(build_app(catalog)) as context:
adapter = read_tiff_with_yaml_metadata(
adapter = TiffAdapterWithSidecar(
ensure_uri(image_filepath), ensure_uri(metadata_filepath)
)
client = from_context(context)
Expand Down Expand Up @@ -295,7 +308,7 @@ async def test_hdf5_virtual_datasets(tmpdir):
)
catalog = in_memory(writable_storage=tmpdir)
with Context.from_app(build_app(catalog)) as context:
adapter = HDF5Adapter.from_uri(ensure_uri(filepath))
adapter = HDF5Adapter.from_uris(ensure_uri(filepath))
client = from_context(context)
client.new(
key="VDS",
Expand Down
4 changes: 2 additions & 2 deletions tiled/_tests/test_jpeg.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,9 @@ def client(tmpdir_module):

tree = MapAdapter(
{
"color": JPEGAdapter(ensure_uri(path)),
"color": JPEGAdapter.from_uris(ensure_uri(path)),
"sequence": JPEGSequenceAdapter.from_uris(
[ensure_uri(filepath) for filepath in filepaths]
*[ensure_uri(filepath) for filepath in filepaths]
),
}
)
Expand Down
9 changes: 0 additions & 9 deletions tiled/_tests/test_protocols.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@ def __init__(
structure: ArrayStructure,
metadata: Optional[JSON] = None,
specs: Optional[List[Spec]] = None,
access_policy: Optional[AccessPolicy] = None,
) -> None:
self._array = array
self._structure = structure
Expand Down Expand Up @@ -109,13 +108,11 @@ def __init__(
structure: AwkwardStructure,
metadata: Optional[JSON] = None,
specs: Optional[List[Spec]] = None,
access_policy: Optional[AccessPolicy] = None,
) -> None:
self.container = container
self._metadata = metadata or {}
self._structure = structure
self._specs = list(specs or [])
self.access_policy = access_policy

def structure(self) -> AwkwardStructure:
return self._structure
Expand Down Expand Up @@ -188,7 +185,6 @@ def __init__(
*,
metadata: Optional[JSON] = None,
specs: Optional[List[Spec]] = None,
access_policy: Optional[AccessPolicy] = None,
) -> None:
"""
Construct from blocks with coords given in block-local reference frame.
Expand All @@ -198,13 +194,11 @@ def __init__(
structure :
metadata :
specs :
access_policy :
"""
self.blocks = blocks
self._metadata = metadata or {}
self._structure = structure
self._specs = specs or []
self.access_policy = access_policy

all_coords = [[1], [1]]
all_data = [[1], [1]]
Expand Down Expand Up @@ -277,7 +271,6 @@ def __init__(
*,
metadata: Optional[JSON] = None,
specs: Optional[List[Spec]] = None,
access_policy: Optional[AccessPolicy] = None,
) -> None:
"""

Expand All @@ -287,13 +280,11 @@ def __init__(
structure :
metadata :
specs :
access_policy :
"""
self._metadata = metadata or {}
self._partitions = list(partitions)
self._structure = structure
self._specs = specs or []
self.access_policy = access_policy

def structure(self) -> TableStructure:
return self._structure
Expand Down
4 changes: 2 additions & 2 deletions tiled/_tests/test_tiff.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,9 @@ def client(tmpdir_module):
{
"color": TiffAdapter(ensure_uri(path)),
"sequence": TiffSequenceAdapter.from_uris(
[ensure_uri(filepath) for filepath in filepaths]
*[ensure_uri(filepath) for filepath in filepaths]
),
"5d_sequence": TiffSequenceAdapter.from_uris(
"5d_sequence": TiffSequenceAdapter(
[ensure_uri(filepath) for filepath in filepaths],
structure=ArrayStructure(
shape=(3, 1, 5, 7, 4),
Expand Down
6 changes: 5 additions & 1 deletion tiled/_tests/test_writing.py
Original file line number Diff line number Diff line change
Expand Up @@ -600,7 +600,11 @@ def test_write_with_specified_mimetype(tree):
df = pandas.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
structure = TableStructure.from_pandas(df)

for mimetype in [PARQUET_MIMETYPE, "text/csv", APACHE_ARROW_FILE_MIME_TYPE]:
for mimetype in [
PARQUET_MIMETYPE,
"text/csv",
APACHE_ARROW_FILE_MIME_TYPE,
]:
x = client.new(
"table",
[
Expand Down
Loading
Loading