Verify our code matches CAA (#76)
* adding a llama chat formatter and prompter based on CAA

* testing that our reading vectors match CAA reading vectors

* fixing linting

* fixing test
chanind authored Jan 22, 2024
1 parent 773db50 commit aa1dd24
Showing 8 changed files with 588 additions and 8 deletions.
22 changes: 22 additions & 0 deletions repepo/core/format.py
@@ -37,6 +37,28 @@ def apply(self, example: Example, **kwargs):
)


class LlamaChatFormatter(Formatter):
"""
Add [INST] and [/INST] tags to the instruction and input.
Based on: https://github.com/nrimsky/SycophancySteering/blob/main/utils/tokenize_llama.py#L30
"""

B_INST = "[INST]"
E_INST = "[/INST]"

@override
def apply(self, example: Example):
dialog_content_parts = []
if example.instruction:
dialog_content_parts.append(example.instruction.strip())
dialog_content_parts.append(example.input.strip())
dialog_content = "\n".join(dialog_content_parts)
prompt = f"{self.B_INST} {dialog_content} {self.E_INST} "
response = example.output.strip()
return Completion(prompt=prompt, response=response)
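
Below is a minimal usage sketch of what this formatter produces. The import path and the exact Example constructor signature are assumptions for illustration; adjust them to the actual module layout.

# Illustrative sketch only; the import path and Example constructor are assumed.
from repepo.core.types import Example

formatter = LlamaChatFormatter()
completion = formatter.apply(
    Example(instruction="", input="What is 2 + 2?", output="4")
)
# completion.prompt == "[INST] What is 2 + 2? [/INST] "
# completion.response == "4"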


class InstructionFormatter(Formatter):
"""Instruction formatter used for fine-tuning Alpaca."""

26 changes: 26 additions & 0 deletions repepo/core/prompt.py
@@ -1,4 +1,5 @@
import abc
from dataclasses import dataclass
from typing import List
from typing_extensions import override

@@ -28,6 +29,31 @@ def apply(self, completion):
return completion


@dataclass
class LlamaChatPrompter(Prompter):
"""
Prepend a system message before the first prompt. This is based on the SycophancySteering
prompt format, except that the system message is not placed inside the [INST] tags.
TODO: Put the system message inside the [INST] tags, or at least make it possible to do so;
this will require some rearchitecting.
Based on: https://github.com/nrimsky/SycophancySteering/blob/main/utils/tokenize_llama.py#L30
"""

system_prompt: str = "You are a helpful, honest and concise assistant."

B_SYS = "<<SYS>>\n"
E_SYS = "\n<</SYS>>\n\n"

@override
def apply(self, completion: Completion) -> Completion:
prompt = f"{self.B_SYS}{self.system_prompt}{self.E_SYS}{completion.prompt}"
response = completion.response
return Completion(prompt=prompt, response=response)
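
A minimal sketch of how this prompter composes with the LlamaChatFormatter above; exactly how the pipeline chains formatters and prompters is an assumption here.

# Illustrative sketch only; the chaining order is assumed.
formatter = LlamaChatFormatter()
prompter = LlamaChatPrompter()

completion = prompter.apply(formatter.apply(example))
# completion.prompt ==
#   "<<SYS>>\nYou are a helpful, honest and concise assistant.\n<</SYS>>\n\n"
#   "[INST] ... [/INST] "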


class FewShotPrompter(Prompter):
"""Compose examples few-shot"""

161 changes: 161 additions & 0 deletions tests/_original_caa/helpers.py
@@ -0,0 +1,161 @@
import torch as t
from typing import Optional
import os
from dataclasses import dataclass


@dataclass
class SteeringSettings:
"""
max_new_tokens: Maximum number of tokens to generate.
type: Type of test to run. One of "in_distribution", "out_of_distribution", "truthful_qa".
few_shot: Whether to test with few-shot examples in the prompt. One of "positive", "negative", "none".
do_projection: Whether to project activations onto orthogonal complement of steering vector.
override_vector: If not None, the steering vector generated from this layer's activations will be used at all layers. Use to test the effect of steering with a different layer's vector.
override_vector_model: If not None, steering vectors generated from this model will be used instead of the model being used for generation - use to test vector transference between models.
use_base_model: Whether to use the base model instead of the chat model.
model_size: Size of the model to use. One of "7b", "13b".
n_test_datapoints: Number of datapoints to test on. If None, all datapoints will be used.
add_every_token_position: Whether to add steering vector to every token position (including question), not only to token positions corresponding to the model's response to the user
override_model_weights_path: If not None, the model weights at this path will be used instead of the model being used for generation - use to test using activation steering on top of fine-tuned model.
"""

max_new_tokens: int = 100
type: str = "in_distribution"
few_shot: str = "none"
do_projection: bool = False
override_vector: Optional[int] = None
override_vector_model: Optional[str] = None
use_base_model: bool = False
model_size: str = "7b"
n_test_datapoints: Optional[int] = None
add_every_token_position: bool = False
override_model_weights_path: Optional[str] = None

def make_result_save_suffix(
self,
layer: Optional[int] = None,
multiplier: Optional[int] = None,
):
elements = {
"layer": layer,
"multiplier": multiplier,
"max_new_tokens": self.max_new_tokens,
"type": self.type,
"few_shot": self.few_shot,
"do_projection": self.do_projection,
"override_vector": self.override_vector,
"override_vector_model": self.override_vector_model,
"use_base_model": self.use_base_model,
"model_size": self.model_size,
"n_test_datapoints": self.n_test_datapoints,
"add_every_token_position": self.add_every_token_position,
"override_model_weights_path": self.override_model_weights_path,
}
return "_".join(
[
f"{k}={str(v).replace('/', '-')}"
for k, v in elements.items()
if v is not None
]
)
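
For reference, a sketch of the suffix this produces; illustrative only, with the exact string following from the field defaults above.

# Illustrative sketch only.
settings = SteeringSettings(type="truthful_qa", model_size="13b")
suffix = settings.make_result_save_suffix(layer=13, multiplier=1)
# suffix ==
#   "layer=13_multiplier=1_max_new_tokens=100_type=truthful_qa_few_shot=none_"
#   "do_projection=False_use_base_model=False_model_size=13b_add_every_token_position=False"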

def filter_result_files_by_suffix(
self,
directory: str,
layer: Optional[int] = None,
multiplier: Optional[int] = None,
):
elements = {
"layer": layer,
"multiplier": multiplier,
"max_new_tokens": self.max_new_tokens,
"type": self.type,
"few_shot": self.few_shot,
"do_projection": self.do_projection,
"override_vector": self.override_vector,
"override_vector_model": self.override_vector_model,
"use_base_model": self.use_base_model,
"model_size": self.model_size,
"n_test_datapoints": self.n_test_datapoints,
"add_every_token_position": self.add_every_token_position,
"override_model_weights_path": self.override_model_weights_path,
}

filtered_elements = {k: v for k, v in elements.items() if v is not None}
remove_elements = {k for k, v in elements.items() if v is None}

matching_files = []

print(self.override_model_weights_path)

for filename in os.listdir(directory):
if all(
f"{k}={str(v).replace('/', '-')}" in filename
for k, v in filtered_elements.items()
):
# ensure remove_elements are *not* present
if all(f"{k}=" not in filename for k in remove_elements):
matching_files.append(filename)

return [os.path.join(directory, f) for f in matching_files]


def project_onto_orthogonal_complement(tensor, onto):
"""
Projects tensor onto the orthogonal complement of the span of onto.
"""
# Get the projection of tensor onto onto
proj = (
t.sum(tensor * onto, dim=-1, keepdim=True)
* onto
/ (t.norm(onto, dim=-1, keepdim=True) ** 2 + 1e-10)
)
# Subtract to get the orthogonal component
return tensor - proj
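
A quick sanity check of the projection, illustrative only:

# The result should have no component along `onto`.
v = t.tensor([1.0, 1.0])
onto = t.tensor([1.0, 0.0])
out = project_onto_orthogonal_complement(v, onto)
# out is approximately tensor([0., 1.]) and t.dot(out, onto) is approximately 0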


def add_vector_after_position(
matrix, vector, position_ids, after=None, do_projection=True
):
after_id = after
if after_id is None:
after_id = position_ids.min().item() - 1

mask = position_ids > after_id
mask = mask.unsqueeze(-1)

if do_projection:
matrix = project_onto_orthogonal_complement(matrix, vector)

matrix += mask.float() * vector
return matrix
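
A small illustrative example of the masking behaviour (not from the original repo):

# Add `vec` only to rows whose position id is greater than `after`.
matrix = t.zeros(3, 2)
vec = t.tensor([1.0, 2.0])
pos = t.tensor([0, 1, 2])
out = add_vector_after_position(matrix, vec, pos, after=1, do_projection=False)
# out[0] and out[1] stay zero; out[2] == vec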


def find_last_subtensor_position(tensor, sub_tensor):
n, m = tensor.size(0), sub_tensor.size(0)
if m > n:
return -1
for i in range(n - m, -1, -1):
if t.equal(tensor[i : i + m], sub_tensor):
return i
return -1


def find_instruction_end_postion(tokens, end_str):
start_pos = find_last_subtensor_position(tokens, end_str)
if start_pos == -1:
return -1
return start_pos + len(end_str) - 1
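
An illustrative example of these two helpers on toy token ids:

# Toy example only; real usage passes the tokenized "[/INST]" as `end_str`.
tokens = t.tensor([5, 1, 2, 3, 9])
end_str = t.tensor([2, 3])
find_last_subtensor_position(tokens, end_str)  # -> 2
find_instruction_end_postion(tokens, end_str)  # -> 3 (index of the last token of end_str)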


def get_a_b_probs(logits, a_token_id, b_token_id):
last_token_logits = logits[0, -1, :]
last_token_probs = t.softmax(last_token_logits, dim=-1)
a_prob = last_token_probs[a_token_id].item()
b_prob = last_token_probs[b_token_id].item()
return a_prob, b_prob
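
An illustrative example with toy logits:

# Toy example only; token ids 3 and 7 stand in for the "A" and "B" answer tokens.
logits = t.zeros(1, 4, 10)
logits[0, -1, 3] = 2.0
logits[0, -1, 7] = 1.0
a_prob, b_prob = get_a_b_probs(logits, a_token_id=3, b_token_id=7)
# a_prob > b_prob; both are softmax probabilities at the final position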


def make_tensor_save_suffix(layer, model_name_path):
return f'{layer}_{model_name_path.split("/")[-1]}'
(Remaining changed files not shown.)
