Skip to content

Commit cc202ce

Browse files
committed
wip
1 parent b829f3e commit cc202ce

File tree

1 file changed

+54
-99
lines changed

1 file changed

+54
-99
lines changed

spd/clustering/storage.py

Lines changed: 54 additions & 99 deletions
Original file line numberDiff line numberDiff line change
@@ -40,21 +40,21 @@ class ClusteringStorage:
4040
"""
4141

4242
# Directory structure constants (all relative to the run directory)
DATASET_DIR: str = "dataset"
BATCHES_DIR: str = "batches"
HISTORIES_DIR: str = "merge_histories"
ENSEMBLE_DIR: str = "ensemble"
DISTANCES_DIR: str = "distances"

# File naming constants (plain names and str.format templates)
RUN_CONFIG_FILE: str = "run_config.json"
DATASET_CONFIG_FILE: str = "dataset_config.json"
ENSEMBLE_META_FILE: str = "ensemble_meta.json"
ENSEMBLE_ARRAY_FILE: str = "ensemble_merge_array.npz"
BATCH_FILE_FMT: str = "batch_{batch_idx:02d}.npz"  # zero-padded so lexicographic sort == index order
HISTORY_FILE_FMT: str = "data_{batch_id}"
MERGE_HISTORY_FILE: str = "merge_history.zip"
DISTANCES_FILE_FMT: str = "distances.{method}.npz"

5959
def __init__(self, base_path: Path, run_identifier: str | None = None):
6060
"""Initialize storage with base path and optional run identifier.
@@ -63,7 +63,7 @@ def __init__(self, base_path: Path, run_identifier: str | None = None):
6363
base_path: Root directory for all storage operations
6464
run_identifier: Optional identifier to create a subdirectory for this run
6565
"""
66-
self.base_path = base_path
66+
self.base_path: Path = base_path
6767
if run_identifier:
6868
self.run_path = base_path / run_identifier
6969
else:
@@ -72,122 +72,94 @@ def __init__(self, base_path: Path, run_identifier: str | None = None):
7272
# Ensure base directory exists
7373
self.run_path.mkdir(parents=True, exist_ok=True)
7474

# directories
@property
def dataset_dir(self) -> Path:
    """Directory holding the dataset config and batches."""
    return self.run_path / self.DATASET_DIR

@property
def batches_dir(self) -> Path:
    """Directory holding the saved batch .npz files (inside dataset_dir)."""
    return self.dataset_dir / self.BATCHES_DIR

@property
def histories_dir(self) -> Path:
    """Directory holding per-batch merge-history subdirectories."""
    return self.run_path / self.HISTORIES_DIR

@property
def ensemble_dir(self) -> Path:
    """Directory holding ensemble metadata and merge array."""
    return self.run_path / self.ENSEMBLE_DIR

@property
def distances_dir(self) -> Path:
    """Directory holding per-method distance arrays."""
    return self.run_path / self.DISTANCES_DIR

# files
@property
def run_config_file(self) -> Path:
    """Path of the run configuration JSON inside the run directory."""
    return self.run_path / self.RUN_CONFIG_FILE

@property
def dataset_config_file(self) -> Path:
    """Path of the dataset configuration JSON inside the dataset directory."""
    return self.dataset_dir / self.DATASET_CONFIG_FILE

@property
def ensemble_meta_file(self) -> Path:
    """Path of the ensemble metadata JSON."""
    return self.ensemble_dir / self.ENSEMBLE_META_FILE

@property
def ensemble_array_file(self) -> Path:
    """Path of the compressed ensemble merge-array .npz."""
    return self.ensemble_dir / self.ENSEMBLE_ARRAY_FILE
# dynamic
def batch_path(self, batch_idx: int) -> Path:
    """Path of the .npz file holding batch number *batch_idx*."""
    return self.batches_dir / self.BATCH_FILE_FMT.format(batch_idx=batch_idx)

def history_path(self, batch_id: str) -> Path:
    """Path of the merge-history archive for *batch_id* (inside its data_<id> subdir)."""
    subdir: Path = self.histories_dir / self.HISTORY_FILE_FMT.format(batch_id=batch_id)
    return subdir / self.MERGE_HISTORY_FILE
# Batch storage methods

def save_dataset_config(self, config: dict[str, Any]) -> Path:
    """Write the dataset configuration as pretty-printed JSON.

    Creates the dataset directory if needed; returns the written path.
    """
    self.dataset_dir.mkdir(parents=True, exist_ok=True)
    self.dataset_config_file.write_text(json.dumps(config, indent=2))
    return self.dataset_config_file

def save_batch(self, batch: Tensor, batch_idx: int) -> Path:
    """Save one token batch as a compressed .npz file and return its path.

    The tensor is moved to CPU and stored under the "input_ids" key.
    """
    self.batches_dir.mkdir(parents=True, exist_ok=True)
    batch_path: Path = self.batch_path(batch_idx)
    np.savez_compressed(batch_path, input_ids=batch.cpu().numpy())
    return batch_path

def save_batches(self, batches: Iterator[Tensor], config: dict[str, Any]) -> list[Path]:
    """Persist *config* and every batch from *batches*.

    The dataset config is written first; batches are saved in iteration
    order with their enumeration index. Returns the saved batch paths.
    """
    self.save_dataset_config(config)
    return [self.save_batch(batch, idx) for idx, batch in enumerate(batches)]

def load_batch(self, batch_path: Path) -> "Int[Tensor, 'batch_size n_ctx']":
    """Load a batch saved by save_batch and return it as a tensor.

    Reads the "input_ids" array from the .npz archive at *batch_path*.
    """
    data = np.load(batch_path)
    return torch.tensor(data["input_ids"])

146-
def load_batches(self) -> list[Tensor]:
147-
"""Load all batches from the batches directory.
148-
149-
Returns:
150-
List of loaded batch tensors
151-
"""
152-
batch_files = sorted(self.batches_dir.glob("batch_*.npz"))
153-
return [self.load_batch(path) for path in batch_files]
154-
def get_batch_paths(self) -> list[Path]:
    """Return all saved batch files, sorted by name (zero-padded == index order)."""
    return sorted(self.batches_dir.glob("batch_*.npz"))

# History storage methods

def save_history(self, history: "MergeHistory", batch_id: str) -> Path:
    """Save *history* and return the path of the written archive.

    Delegates path construction to history_path() so the layout is
    defined in exactly one place; creates the parent directory first.
    """
    history_path: Path = self.history_path(batch_id)
    history_path.parent.mkdir(parents=True, exist_ok=True)
    history.save(history_path)
    return history_path

def load_history(self, batch_id: str) -> "MergeHistory":
    """Load the merge history previously saved for *batch_id*.

    Uses history_path() so loading reads exactly where save_history wrote.
    """
    return MergeHistory.read(self.history_path(batch_id))
197169

def get_history_paths(self) -> list[Path]:
    """Return sorted paths of every saved merge-history archive."""
    return sorted(self.histories_dir.glob(f"*/{self.MERGE_HISTORY_FILE}"))

def load_histories(self) -> "list[MergeHistory]":
    """Load every saved merge history, in get_history_paths() order."""
    return [MergeHistory.read(path) for path in self.get_history_paths()]

# Ensemble storage methods

def save_ensemble(self, ensemble: "NormalizedEnsemble") -> tuple[Path, Path]:
    """Save normalized ensemble data.

    Writes ensemble.metadata as pretty-printed JSON and
    ensemble.merge_array as a compressed .npz (key "merges").
    Returns (metadata_path, array_path).
    """
    self.ensemble_dir.mkdir(parents=True, exist_ok=True)

    # Save metadata
    metadata_path: Path = self.ensemble_meta_file
    metadata_path.write_text(json.dumps(ensemble.metadata, indent=2))

    # Save merge array
    array_path: Path = self.ensemble_array_file
    np.savez_compressed(array_path, merges=ensemble.merge_array)

    return metadata_path, array_path

def save_distances(self, distances: "DistancesArray", method: "DistancesMethod") -> Path:
    """Save a distances array for *method* (key "distances"); return the written path."""
    self.distances_dir.mkdir(parents=True, exist_ok=True)

    distances_path: Path = self.distances_dir / self.DISTANCES_FILE_FMT.format(method=method)
    np.savez_compressed(distances_path, distances=distances)
    return distances_path

def load_distances(self, method: "DistancesMethod") -> "DistancesArray":
    """Load the distances array previously saved for *method* by save_distances."""
    distances_path: Path = self.distances_dir / self.DISTANCES_FILE_FMT.format(method=method)
    data = np.load(distances_path)
    return data["distances"]

def save_run_config(self, config: "RunConfig") -> Path:
    """Write the run configuration as JSON into the run directory.

    Uses the run_config_file property so the location is defined once;
    *config* is serialized via its model_dump_json (pydantic) method.
    Returns the written path.
    """
    config_path: Path = self.run_config_file
    config_path.write_text(config.model_dump_json(indent=2))
    return config_path

0 commit comments

Comments
 (0)