Skip to content

Commit 3ea89ae

Browse files
committed
wip
1 parent b44abe8 commit 3ea89ae

File tree

11 files changed

+4682
-143
lines changed

11 files changed

+4682
-143
lines changed

oli.patch

Lines changed: 4571 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
"""
2+
Orchestration layer - clean clustering pipeline coordination.
3+
4+
Replaces the original 370+ line subprocess/FD system with simple multiprocessing.Pool.
5+
Each batch loads its own model and WandB run to match original design.
6+
"""
7+
8+
from pathlib import Path
9+
10+
from spd.clustering.merge_run_config import MergeRunConfig
11+
12+
13+
def main(
    config: MergeRunConfig,
    base_path: Path,
    n_workers: int,
    devices: list[str],
) -> None:
    """Run the full clustering pipeline: split, cluster, ensemble, distances, report.

    Expected output layout under ``base_path`` (though see: there's some
    repetition here I'd like to change):

        base_dir/
            {config.config_identifier}/
                merge_histories/
                    {config.config_identifier}-data_{batch_id}/
                        merge_history.zip
                        plots/
                            activations_raw.pdf
                            activations_concat.pdf
                            activations_coact.pdf
                            activations_coact_log.pdf
                            merge_iteration.pdf
                distances/
                figures/
                run_config.json

    Args:
        config: Merge-run configuration; ``config.config_identifier`` names
            the per-run output directory.
        base_path: Root directory under which all pipeline outputs are written.
        n_workers: Number of parallel worker processes for batch clustering.
        devices: Device strings (e.g. ``"cuda:0"``) distributed across workers.
    """
    # Local imports keep module import cheap and free of heavy side effects.
    from spd.clustering.s1_split_dataset import split_and_save_dataset
    from spd.clustering.s2_clustering import process_batches_parallel
    from spd.clustering.s3_normalize_histories import normalize_and_ensemble_and_save
    from spd.clustering.s4_compute_distances import (
        compute_and_save_distances_new,
        create_clustering_report,
    )

    # Single source of truth for the distance metric -- used by both the
    # distance computation and the report, which must agree.
    distance_method: str = "perm_invariant_hamming"

    output_dir = base_path / config.config_identifier

    histories_path = output_dir / "merge_histories"
    histories_path.mkdir(parents=True, exist_ok=True)

    distances_dir = output_dir / "distances"
    distances_dir.mkdir(parents=True, exist_ok=True)

    # TODO: decide whether to persist a run_config.json snapshot in
    # `output_dir` (a commented-out draft existed here; see git history).

    # Stage 1: split the dataset into per-batch .npz files.
    print(f"Splitting dataset into {config.n_batches} batches...")
    data_files = split_and_save_dataset(
        config=config,
        output_path=output_dir,
        save_file_fmt="batch_{batch_idx}.npz",
        cfg_file_fmt="config.json",  # just a place we save a raw dict of metadata
    )

    # Stage 2: cluster every batch in parallel; each worker loads its own
    # model and WandB run, matching the original design.
    print(f"Processing {len(data_files)} batches with {n_workers} workers...")
    results = process_batches_parallel(
        data_files=data_files,
        config=config,
        output_base_dir=histories_path,
        n_workers=n_workers,
        devices=devices,
    )

    # Stage 3: normalize per-batch merge histories and save the ensembled
    # merge array. (Renamed from the misspelled `enseble_merge_arr_path`.)
    ensemble_merge_arr_path = normalize_and_ensemble_and_save(
        history_paths=[r.history_save_path for r in results],
        distances_dir=distances_dir,
    )

    # Stage 4: pairwise distances between batch clusterings, then the report.
    distances = compute_and_save_distances_new(
        merges_path=ensemble_merge_arr_path,
        method=distance_method,
    )

    create_clustering_report(
        distances=distances,
        method=distance_method,
        wandb_urls=[r.wandb_url for r in results if r.wandb_url],  # TODO clean up
        config_identifier=config.config_identifier,
    )

spd/clustering/math/perm_invariant_hamming.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import numpy as np
22
from jaxtyping import Float, Int
3+
from scipy.optimize import linear_sum_assignment
34

45

56
def perm_invariant_hamming_matrix(

spd/clustering/merge.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -259,9 +259,8 @@ def _wandb_iter_log(
259259
)
260260

261261
if iter_idx > 0 and iter_idx % config.intervals["artifact"] == 0:
262-
with tempfile.TemporaryFile() as tmp_file:
263-
file: Path = Path(tmp_file.name)
264-
file.parent.mkdir(parents=True, exist_ok=True)
262+
with tempfile.NamedTemporaryFile() as tmp_file:
263+
file = Path(tmp_file.name)
265264
merge_history.save(file)
266265
artifact = wandb.Artifact(
267266
name=f"merge_hist_iter.{batch_id}.iter_{iter_idx}",

spd/clustering/s25.py

Whitespace-only changes.

spd/clustering/s2_clustering.py

Lines changed: 7 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88
import wandb
99
from jaxtyping import Int
1010
from torch import Tensor
11-
from tqdm import tqdm
1211
from wandb.sdk.wandb_run import Run
1312

1413
from spd.clustering.activations import component_activations, process_activations
@@ -28,10 +27,6 @@ class ClusteringResult:
2827
wandb_url: str | None
2928

3029

31-
def _worker_fn(args: tuple[MergeRunConfig, Path, Path, str]) -> ClusteringResult:
32-
return run_clustering(*args)
33-
34-
3530
# TODO consider making this a generator
3631
def process_batches_parallel(
3732
config: MergeRunConfig,
@@ -48,22 +43,17 @@ def process_batches_parallel(
4843
for i, data_path in enumerate(data_files)
4944
]
5045

51-
# Simple pool without initializer
52-
# with Pool(n_workers) as pool:
53-
# # Process batches with progress bar
54-
# results = list(
55-
# tqdm(
56-
# pool.imap(_worker_fn, worker_args),
57-
# total=len(data_files),
58-
# desc="Processing batches",
59-
# )
60-
# )
61-
results = [_worker_fn(args) for args in worker_args]
46+
with Pool(n_workers) as pool:
47+
results = pool.map(_worker_fn, worker_args)
6248

6349
return results
6450

6551

66-
def run_clustering(
52+
def _worker_fn(args: tuple[MergeRunConfig, Path, Path, str]) -> ClusteringResult:
53+
return _run_clustering(*args)
54+
55+
56+
def _run_clustering(
6757
config: MergeRunConfig,
6858
data_path: Path,
6959
output_base_dir: Path,
@@ -167,29 +157,6 @@ def _setup_wandb(
167157
return run
168158

169159

170-
def _save_merge_history_to_wandb(
171-
run: Run,
172-
history_path: Path,
173-
batch_id: str,
174-
config_identifier: str,
175-
history: MergeHistory,
176-
):
177-
artifact = wandb.Artifact(
178-
name=f"merge_history_{batch_id}",
179-
type="merge_history",
180-
description=f"Merge history for batch {batch_id}",
181-
metadata={
182-
"batch_name": batch_id,
183-
"config_identifier": config_identifier,
184-
"n_iters_current": history.n_iters_current,
185-
"filename": history_path,
186-
},
187-
)
188-
# Add both files before logging the artifact
189-
artifact.add_file(str(history_path))
190-
run.log_artifact(artifact)
191-
192-
193160
def _log_merge_history_plots_to_wandb(run: Run, history: MergeHistory):
194161
fig_cs = plot_merge_history_cluster_sizes(history=history)
195162

spd/clustering/s3_normalize_histories.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,4 +46,4 @@ def normalize_and_ensemble_and_save(
4646
ZANJ().save(ensemble, path_hist_ensemble)
4747
logger.info(f"Ensemble saved to {path_hist_ensemble}")
4848

49-
return enseble_merge_arr_path
49+
return enseble_merge_arr_path

spd/clustering/scripts/main.py

Lines changed: 2 additions & 95 deletions
Original file line numberDiff line numberDiff line change
@@ -1,105 +1,11 @@
1-
"""
2-
Orchestration layer - clean clustering pipeline coordination.
3-
4-
Replaces the original 370+ line subprocess/FD system with simple multiprocessing.Pool.
5-
Each batch loads its own model and WandB run to match original design.
6-
"""
7-
81
import argparse
92
from pathlib import Path
103

4+
from spd.clustering.clustering_pipeline import main
115
from spd.clustering.merge_run_config import MergeRunConfig
12-
from spd.clustering.s1_split_dataset import split_and_save_dataset
13-
from spd.clustering.s2_clustering import process_batches_parallel
14-
from spd.clustering.s3_normalize_histories import normalize_and_ensemble_and_save
15-
from spd.clustering.s4_compute_distances import (
16-
compute_and_save_distances_new,
17-
create_clustering_report,
18-
)
196
from spd.settings import REPO_ROOT
207

218

22-
def main(
23-
config: MergeRunConfig,
24-
base_path: Path,
25-
n_workers: int,
26-
devices: list[str],
27-
):
28-
"""
29-
The following is (hopefully) correct (thought see there's some repetition I'd like to change)
30-
31-
base_dir/
32-
{config.config_identifier}/
33-
merge_histories/
34-
{config.config_identifier}-data_{batch_id}/
35-
merge_history.zip
36-
plots/
37-
activations_raw.pdf
38-
activations_concat.pdf
39-
activations_coact.pdf
40-
activations_coact_log.pdf
41-
merge_iteration.pdf
42-
distances/
43-
figures/
44-
run_config.json
45-
"""
46-
47-
output_dir = base_path / config.config_identifier
48-
49-
histories_path = output_dir / "merge_histories"
50-
histories_path.mkdir(parents=True, exist_ok=True)
51-
52-
# figures_path = output_dir / "figures"
53-
# figures_path.mkdir(parents=True, exist_ok=True)
54-
55-
distances_dir = output_dir / "distances"
56-
distances_dir.mkdir(parents=True, exist_ok=True)
57-
58-
# TODO see if we actually need this
59-
# run_config_path = output_dir / "run_config.json"
60-
# run_config_path.write_text(
61-
# json.dumps(
62-
# dict(merge_run_config=config.model_dump(mode="json"), base_path=str(base_path), devices=devices, max_concurrency=n_workers, plot=True, # can we remove this? repo_root=str(REPO_ROOT), run_id=config.config_identifier, run_path=str(output_dir),),
63-
# indent="\t",
64-
# )
65-
# )
66-
# print(f"Run config saved to {run_config_path}")
67-
68-
print(f"Splitting dataset into {config.n_batches} batches...")
69-
data_files = split_and_save_dataset(
70-
config=config,
71-
output_path=output_dir,
72-
save_file_fmt="batch_{batch_idx}.npz",
73-
cfg_file_fmt="config.json", # just a place we save a raw dict of metadata
74-
)
75-
76-
print(f"Processing {len(data_files)} batches with {n_workers} workers...")
77-
results = process_batches_parallel(
78-
data_files=data_files,
79-
config=config,
80-
output_base_dir=histories_path,
81-
n_workers=n_workers,
82-
devices=devices,
83-
)
84-
85-
enseble_merge_arr_path = normalize_and_ensemble_and_save(
86-
history_paths=[r.history_save_path for r in results],
87-
distances_dir=distances_dir,
88-
)
89-
90-
distances = compute_and_save_distances_new(
91-
merges_path=enseble_merge_arr_path,
92-
method="perm_invariant_hamming",
93-
)
94-
95-
create_clustering_report(
96-
distances=distances,
97-
method="perm_invariant_hamming",
98-
wandb_urls=[r.wandb_url for r in results if r.wandb_url], # Gross - clean up,
99-
config_identifier=config.config_identifier,
100-
)
101-
102-
1039
def cli():
10410
"""Command-line interface for clustering."""
10511
parser = argparse.ArgumentParser(
@@ -138,6 +44,7 @@ def cli():
13844
# Parse devices
13945
if args.devices is None:
14046
import torch
47+
14148
devices = ["cuda" if torch.cuda.is_available() else "cpu"]
14249
else:
14350
devices = args.devices.split(",")

spd/clustering/sweep.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,8 @@
55
import matplotlib.cm as cm
66
import matplotlib.pyplot as plt
77
import numpy as np
8-
import torch
98
from matplotlib.colors import LogNorm
109
from matplotlib.lines import Line2D
11-
from tqdm import tqdm
1210

1311
from spd.clustering.merge_config import MergeConfig
1412
from spd.clustering.merge_history import MergeHistory

spd/models/component_model.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ def from_path(cls, path: ModelPath) -> "SPDRunInfo":
6868

6969
return cls(checkpoint_path=comp_model_path, config=config)
7070

71+
7172
# TODO encapsulate Gates in a separate class (containing sigmoid type and sampling mode)
7273
class ComponentModel(LoadableModule):
7374
"""Wrapper around an arbitrary model for running SPD.

0 commit comments

Comments
 (0)