Skip to content

Commit bc37ae3

Browse files
committed
wip
1 parent 3ea89ae commit bc37ae3

File tree

5 files changed

+77
-46
lines changed

5 files changed

+77
-46
lines changed

spd/clustering/clustering_pipeline.py

Lines changed: 22 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,24 @@
77

88
from pathlib import Path
99

10-
from spd.clustering.merge_run_config import MergeRunConfig
10+
from pydantic import BaseModel
11+
12+
from spd.clustering.merge_run_config import RunFilePaths, MergeRunConfig
13+
14+
15+
class RunRecord(BaseModel):
    """Snapshot of the arguments a clustering run was launched with."""

    # Full merge-run configuration used for the run.
    merge_run_config: MergeRunConfig
    # Directory the run's outputs are written under.
    output_dir: Path
    # Devices used for clustering (e.g. "cuda:0") — presumably torch device
    # strings; verify against caller.
    devices: list[str]
    # Maximum number of concurrent clustering processes.
    max_concurrency: int
    # Whether plots are generated for the run.
    plot: bool
1121

1222

1323
def main(
1424
config: MergeRunConfig,
1525
base_path: Path,
16-
n_workers: int,
1726
devices: list[str],
27+
workers_per_device: int,
1828
):
1929
"""
2030
The following is (hopefully) correct (though see there's some repetition I'd like to change)
@@ -42,38 +52,29 @@ def main(
4252
create_clustering_report,
4353
)
4454

45-
output_dir = base_path / config.config_identifier
46-
47-
histories_path = output_dir / "merge_histories"
48-
histories_path.mkdir(parents=True, exist_ok=True)
49-
50-
distances_dir = output_dir / "distances"
51-
distances_dir.mkdir(parents=True, exist_ok=True)
55+
run_path = base_path / config.config_identifier
56+
histories_path = run_path / "merge_histories"
57+
dataset_dir = run_path / "dataset"
58+
distances_dir = run_path / "distances"
59+
run_config_path = run_path / "run_config.json"
5260

53-
# TODO see if we actually need this
54-
# run_config_path = output_dir / "run_config.json"
55-
# run_config_path.write_text(
56-
# json.dumps(
57-
# dict(merge_run_config=config.model_dump(mode="json"), base_path=str(base_path), devices=devices, max_concurrency=n_workers, plot=True, # can we remove this? repo_root=str(REPO_ROOT), run_id=config.config_identifier, run_path=str(output_dir),),
58-
# indent="\t",
59-
# )
60-
# )
61-
# print(f"Run config saved to {run_config_path}")
61+
print(f"Run config saved to {run_config_path}")
62+
run_config_path.write_text(config.model_dump_json(indent=2))
6263

6364
print(f"Splitting dataset into {config.n_batches} batches...")
6465
data_files = split_and_save_dataset(
6566
config=config,
66-
output_path=output_dir,
67+
output_dir=dataset_dir,
6768
save_file_fmt="batch_{batch_idx}.npz",
6869
cfg_file_fmt="config.json", # just a place we save a raw dict of metadata
6970
)
7071

71-
print(f"Processing {len(data_files)} batches with {n_workers} workers...")
72+
print(f"Processing {len(data_files)} batches with {workers_per_device} workers per device...")
7273
results = process_batches_parallel(
7374
data_files=data_files,
7475
config=config,
7576
output_base_dir=histories_path,
76-
n_workers=n_workers,
77+
workers_per_device=workers_per_device,
7778
devices=devices,
7879
)
7980

spd/clustering/merge_run_config.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,13 +35,47 @@
3535
}
3636

3737

38+
class RunFilePaths:
    """Standard on-disk layout for a clustering run's outputs.

    All paths are derived lazily from the run's root directory, so the
    object is cheap to construct and has no side effects until
    :meth:`scaffold` is called.
    """

    def __init__(self, run_path: Path):
        # Root directory for this run's outputs.
        self.run_path = run_path

    @property
    def histories_path(self) -> Path:
        """Directory holding per-batch merge histories."""
        return self.run_path / "merge_histories"

    @property
    def dataset_dir(self) -> Path:
        """Directory holding the split dataset batches."""
        return self.run_path / "dataset"

    @property
    def distances_dir(self) -> Path:
        """Directory holding distance outputs."""
        return self.run_path / "distances"

    @property
    def run_config_path(self) -> Path:
        """File the serialized run configuration is written to."""
        return self.run_path / "run_config.json"

    def scaffold(self) -> None:
        """Create the run's output directories.

        Uses ``parents=True`` so a not-yet-existing ``run_path`` (freshly
        composed as ``base_path / config_identifier``) is created instead of
        raising ``FileNotFoundError``.
        """
        self.histories_path.mkdir(parents=True, exist_ok=True)
        self.distances_dir.mkdir(parents=True, exist_ok=True)
57+
58+
3859
class MergeRunConfig(MergeConfig):
3960
"""Configuration for a complete merge clustering run.
4061
4162
Extends MergeConfig with parameters for model, dataset, and batch configuration.
4263
CLI parameters (base_path, devices, workers_per_device) are included as config fields.
4364
"""
4465

66+
base_path: Path = Field(
67+
...,
68+
description="Base path for saving clustering outputs",
69+
)
70+
workers_per_device: int = Field(
71+
...,
72+
description="Maximum number of concurrent clustering processes per device",
73+
)
74+
devices: list[str] = Field(
75+
...,
76+
description="Devices to use for clustering",
77+
)
78+
4579
model_path: str = Field(
4680
description="WandB path to the model (format: wandb:entity/project/run_id)",
4781
)

spd/clustering/s1_split_dataset.py

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ def split_dataset_lm(
2121
model_path: str,
2222
n_batches: int,
2323
batch_size: int,
24-
output_path: Path,
24+
output_dir: Path,
2525
save_file_fmt: str,
2626
cfg_file_fmt: str,
2727
) -> list[Path]:
@@ -72,9 +72,9 @@ def split_dataset_lm(
7272
)
7373

7474
# make dirs
75-
output_path.mkdir(parents=True, exist_ok=True)
75+
output_dir.mkdir(parents=True, exist_ok=True)
7676
(
77-
output_path
77+
output_dir
7878
/ save_file_fmt.format(batch_size=batch_size, batch_idx="XX", n_batches=f"{n_batches:02d}")
7979
).parent.mkdir(parents=True, exist_ok=True)
8080
# iterate over the requested number of batches and save them
@@ -86,7 +86,7 @@ def split_dataset_lm(
8686
):
8787
if batch_idx >= n_batches:
8888
break
89-
batch_path: Path = output_path / save_file_fmt.format(
89+
batch_path: Path = output_dir / save_file_fmt.format(
9090
batch_size=batch_size,
9191
batch_idx=f"{batch_idx:02d}",
9292
n_batches=f"{n_batches:02d}",
@@ -98,7 +98,7 @@ def split_dataset_lm(
9898
output_paths.append(batch_path)
9999

100100
# save a config file
101-
cfg_path: Path = output_path / cfg_file_fmt.format(batch_size=batch_size)
101+
cfg_path: Path = output_dir / cfg_file_fmt.format(batch_size=batch_size)
102102
cfg_data: dict[str, Any] = dict(
103103
# args to this function
104104
model_path=model_path,
@@ -110,7 +110,7 @@ def split_dataset_lm(
110110
tokenizer_type=str(getattr(_tokenizer, "__class__", None)),
111111
# files we saved
112112
output_files=[str(p) for p in output_paths],
113-
output_dir=str(output_path),
113+
output_dir=str(output_dir),
114114
output_file_fmt=save_file_fmt,
115115
cfg_file_fmt=cfg_file_fmt,
116116
cfg_file=str(cfg_path),
@@ -127,7 +127,7 @@ def split_dataset_resid_mlp(
127127
model_path: str,
128128
n_batches: int,
129129
batch_size: int,
130-
output_path: Path,
130+
output_dir: Path,
131131
save_file_fmt: str,
132132
cfg_file_fmt: str,
133133
) -> list[Path]:
@@ -168,9 +168,9 @@ def split_dataset_resid_mlp(
168168
dataloader = DatasetGeneratedDataLoader(dataset, batch_size=batch_size, shuffle=False)
169169

170170
# make dirs
171-
output_path.mkdir(parents=True, exist_ok=True)
171+
output_dir.mkdir(parents=True, exist_ok=True)
172172
(
173-
output_path
173+
output_dir
174174
/ save_file_fmt.format(batch_size=batch_size, batch_idx="XX", n_batches=f"{n_batches:02d}")
175175
).parent.mkdir(parents=True, exist_ok=True)
176176

@@ -186,7 +186,7 @@ def split_dataset_resid_mlp(
186186
if batch_idx >= n_batches:
187187
break
188188

189-
batch_path: Path = output_path / save_file_fmt.format(
189+
batch_path: Path = output_dir / save_file_fmt.format(
190190
batch_size=batch_size,
191191
batch_idx=f"{batch_idx:02d}",
192192
n_batches=f"{n_batches:02d}",
@@ -198,7 +198,7 @@ def split_dataset_resid_mlp(
198198
output_paths.append(batch_path)
199199

200200
# save the config file
201-
cfg_path: Path = output_path / cfg_file_fmt.format(batch_size=batch_size)
201+
cfg_path: Path = output_dir / cfg_file_fmt.format(batch_size=batch_size)
202202
cfg_data: dict[str, Any] = dict(
203203
# args to this function
204204
model_path=model_path,
@@ -208,7 +208,7 @@ def split_dataset_resid_mlp(
208208
resid_mlp_dataset_kwargs=resid_mlp_dataset_kwargs,
209209
# files we saved
210210
output_files=[str(p) for p in output_paths],
211-
output_dir=str(output_path),
211+
output_dir=str(output_dir),
212212
output_file_fmt=save_file_fmt,
213213
cfg_file_fmt=cfg_file_fmt,
214214
cfg_file=str(cfg_path),
@@ -223,7 +223,7 @@ def split_dataset_resid_mlp(
223223

224224
def split_and_save_dataset(
225225
config: MergeRunConfig,
226-
output_path: Path,
226+
output_dir: Path,
227227
save_file_fmt: str,
228228
cfg_file_fmt: str,
229229
) -> list[Path]:
@@ -234,7 +234,7 @@ def split_and_save_dataset(
234234
model_path=config.model_path,
235235
n_batches=config.n_batches,
236236
batch_size=config.batch_size,
237-
output_path=output_path,
237+
output_dir=output_dir,
238238
save_file_fmt=save_file_fmt,
239239
cfg_file_fmt=cfg_file_fmt,
240240
)
@@ -243,7 +243,7 @@ def split_and_save_dataset(
243243
model_path=config.model_path,
244244
n_batches=config.n_batches,
245245
batch_size=config.batch_size,
246-
output_path=output_path,
246+
output_dir=output_dir,
247247
save_file_fmt=save_file_fmt,
248248
cfg_file_fmt=cfg_file_fmt,
249249
)

spd/clustering/s2_clustering.py

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -27,23 +27,19 @@ class ClusteringResult:
2727
wandb_url: str | None
2828

2929

30-
# TODO consider making this a generator
3130
def process_batches_parallel(
3231
config: MergeRunConfig,
3332
data_files: list[Path],
3433
output_base_dir: Path,
35-
n_workers: int,
34+
workers_per_device: int,
3635
devices: list[str],
3736
) -> list[ClusteringResult]:
38-
devices = devices or ["cuda:0"]
39-
40-
# Create worker arguments with device assignment
4137
worker_args = [
4238
(config, data_path, output_base_dir, devices[i % len(devices)])
4339
for i, data_path in enumerate(data_files)
4440
]
4541

46-
with Pool(n_workers) as pool:
42+
with Pool(workers_per_device * len(devices)) as pool:
4743
results = pool.map(_worker_fn, worker_args)
4844

4945
return results

spd/clustering/scripts/main.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -33,11 +33,11 @@ def cli():
3333
help="comma-separated list of devices to use for clustering (e.g., 'cuda:0,cuda:1')",
3434
)
3535
parser.add_argument(
36-
"--max-concurrency",
36+
"--workers-per-device",
3737
"-x",
3838
type=int,
39-
default=None,
40-
help="Maximum number of concurrent clustering processes (default: all devices)",
39+
default=1,
40+
help="Maximum number of concurrent clustering processes per device (default: 1)",
4141
)
4242
args = parser.parse_args()
4343

@@ -53,7 +53,7 @@ def cli():
5353
config=MergeRunConfig.from_file(args.config),
5454
base_path=args.base_path,
5555
devices=devices,
56-
n_workers=args.max_concurrency if args.max_concurrency is not None else len(devices),
56+
workers_per_device=args.workers_per_device,
5757
)
5858

5959

0 commit comments

Comments
 (0)