Skip to content
This repository has been archived by the owner on Oct 31, 2024. It is now read-only.

Commit

Permalink
Lazy load datasets
Browse files Browse the repository at this point in the history
  • Loading branch information
astrojuanlu committed Sep 3, 2024
1 parent dbb22df commit f56d621
Showing 1 changed file with 26 additions and 7 deletions.
33 changes: 26 additions & 7 deletions src/kedro_catalog/catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,22 +26,41 @@ def save(self, name: str, data: t.Any) -> None: ...

@dataclass
class DataCatalog:
_datasets: dict[str, DatasetProtocol]
_dataset_configs: dict[str, DatasetConfig]

# Alternate constructor: build a catalog from a mapping of dataset name to
# raw dataset configuration (each value is itself a dict).
# NOTE(review): this is a flattened diff view, so lines from before and after
# the commit appear back to back, without +/- markers or indentation.
@classmethod
def from_config(cls, config: dict[str, dict[str, t.Any]]) -> DataCatalog:
# Validate every raw entry into a DatasetConfig, keyed by dataset name.
dataset_configs = {
name: DatasetConfig.model_validate(dataset_config)
for name, dataset_config in config.items()
}
# Presumably the lines removed by this commit: every dataset is instantiated
# up front through find_dataset_class(...).from_spec(...) and the catalog is
# built from the resulting instances.
datasets = {
name: find_dataset_class(dataset_config.type).from_spec(dataset_config.spec)
for name, dataset_config in dataset_configs.items()
}
return cls(datasets)

# Presumably the lines added by this commit start here.
# This performs eager instantiation, which we want to defer!
# https://github.com/kedro-org/kedro/issues/2829
# datasets = {
# name: find_dataset_class(dataset_config.type).from_spec(
# dataset_config.spec
# )
# for name, dataset_config in dataset_configs.items()
# }

# Alternatively, we could find all the dataset classes already,
# but that might trigger expensive imports
# dataset_proxys = {
# name: (find_dataset_class(dataset_config.type), dataset_config.spec)
# for name, dataset_config in dataset_configs.items()
# }

# We just store the validated configs instead
# (no dataset class is looked up and no dataset is created at this point).
return cls(dataset_configs)

# Return the dataset registered under `name`.
# NOTE(review): flattened diff view; the first `return` below is presumably
# the line this commit removed, and the rest is the replacement body.
def get_dataset(self, name: str) -> DatasetProtocol:
# Old body: look up an already-built dataset instance.
return self._datasets[name]
# New body: the dataset is built on demand from its stored config.
# NOTE: This could be a cache, maybe with @lru_cache
# but will that mean that datasets stay in memory for longer?
# Might need to use weakrefs or give up
dataset_config = self._dataset_configs[name]
# Resolve the dataset class from the configured type only now, at access time.
ds_class, spec = find_dataset_class(dataset_config.type), dataset_config.spec
# Nothing is stored back on the catalog, so each call constructs the dataset again.
return ds_class.from_spec(spec)

# Load and return the data of the dataset called `name`.
# Delegates to get_dataset(name) and then calls that dataset's own load().
def load(self, name: str) -> t.Any:
return self.get_dataset(name).load()
Expand Down

0 comments on commit f56d621

Please sign in to comment.