Clean up indexing slightly in interp_max_utils

Also remove unused imports and comment out old code in proof_max2_01_exhaustive
JasonGross · Sep 14, 2023 · d02b030 · d02b030
1 parent 092d2ac
commit d02b030
Show file tree

Hide file tree

Showing 2 changed files with 175 additions and 199 deletions.
diff --git a/training/interp_max_utils.py b/training/interp_max_utils.py
@@ -30,16 +30,16 @@ def logit_delta(model: HookedTransformer, renderer=None, histogram_all_incorrect
     """
 
     all_tokens = compute_all_tokens(model=model)
-    predicted_logits = model(all_tokens)[:,-1].detach().cpu()
+    predicted_logits = model(all_tokens)[:,-1,:].detach().cpu()
 
     # Extract statistics for each row
     # Use values in all_tokens as indices to gather correct logits
-    indices_of_max = all_tokens.max(dim=1, keepdim=True).values
-    correct_logits = torch.gather(predicted_logits, 1, indices_of_max)
+    indices_of_max = all_tokens.max(dim=-1, keepdim=True).values
+    correct_logits = torch.gather(predicted_logits, -1, indices_of_max)
     logits_above_correct = correct_logits - predicted_logits
     # replace correct logit indices with large number so that they don't get picked up by the min
     logits_above_correct[torch.arange(logits_above_correct.shape[0]), indices_of_max.squeeze()] = float('inf')
-    min_incorrect_logit = logits_above_correct.min(dim=1).values
+    min_incorrect_logit = logits_above_correct.min(dim=-1).values
 
     if histogram_all_incorrect_logit_differences:
         all_incorrect_logits = logits_above_correct[logits_above_correct != float('inf')]