diff --git a/src/kedro_catalog/catalog.py b/src/kedro_catalog/catalog.py index fc11b4e..375a4af 100644 --- a/src/kedro_catalog/catalog.py +++ b/src/kedro_catalog/catalog.py @@ -26,7 +26,7 @@ def save(self, name: str, data: t.Any) -> None: ... @dataclass class DataCatalog: - _datasets: dict[str, DatasetProtocol] + _dataset_configs: dict[str, DatasetConfig] @classmethod def from_config(cls, config: dict[str, dict[str, t.Any]]) -> DataCatalog: @@ -34,14 +34,33 @@ def from_config(cls, config: dict[str, dict[str, t.Any]]) -> DataCatalog: name: DatasetConfig.model_validate(dataset_config) for name, dataset_config in config.items() } - datasets = { - name: find_dataset_class(dataset_config.type).from_spec(dataset_config.spec) - for name, dataset_config in dataset_configs.items() - } - return cls(datasets) + + # This performs eager instantiation, which we want to defer! + # https://github.com/kedro-org/kedro/issues/2829 + # datasets = { + # name: find_dataset_class(dataset_config.type).from_spec( + # dataset_config.spec + # ) + # for name, dataset_config in dataset_configs.items() + # } + + # Alternatively, we could find all the dataset classes already, + # but that might trigger expensive imports + # dataset_proxys = { + # name: (find_dataset_class(dataset_config.type), dataset_config.spec) + # for name, dataset_config in dataset_configs.items() + # } + + # We just store the validated configs instead + return cls(dataset_configs) def get_dataset(self, name: str) -> DatasetProtocol: - return self._datasets[name] + # NOTE: This could be a cache, maybe with @lru_cache + # but will that mean that datasets stay in memory for longer? + # Might need to use weakrefs or give up + dataset_config = self._dataset_configs[name] + ds_class, spec = find_dataset_class(dataset_config.type), dataset_config.spec + return ds_class.from_spec(spec) def load(self, name: str) -> t.Any: return self.get_dataset(name).load()