-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
8 changed files
with
1,467 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,204 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"What happens when we use a random steering vector for steering?" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# NOTE: We need to extract a random steering vector, so we re-define the run_experiment function here\n", | ||
"\n", | ||
"import logging\n", | ||
"import sys\n", | ||
"import torch\n", | ||
"import functools\n", | ||
"\n", | ||
"from typing import cast\n", | ||
"from pprint import pformat\n", | ||
"from repepo.core.pipeline import Pipeline\n", | ||
"from repepo.steering.utils.helpers import (\n", | ||
" SteeringConfig,\n", | ||
" EmptyTorchCUDACache,\n", | ||
" get_model_and_tokenizer,\n", | ||
" get_formatter,\n", | ||
" make_dataset,\n", | ||
" get_experiment_path,\n", | ||
" get_eval_result_path,\n", | ||
" save_eval_result,\n", | ||
" load_eval_result,\n", | ||
" get_activation_path,\n", | ||
" save_activation,\n", | ||
" load_activation,\n", | ||
" save_metric,\n", | ||
")\n", | ||
"\n", | ||
"from repepo.steering.build_steering_training_data import (\n", | ||
" build_steering_vector_training_data,\n", | ||
")\n", | ||
"from repepo.steering.concept_metrics import (\n", | ||
" VarianceOfNormSimilarityMetric,\n", | ||
" EuclideanSimilarityMetric,\n", | ||
" CosineSimilarityMetric,\n", | ||
" compute_difference_vectors,\n", | ||
")\n", | ||
"\n", | ||
"from repepo.steering.utils.database import SteeringConfigDatabase\n", | ||
"\n", | ||
"from steering_vectors.train_steering_vector import (\n", | ||
" extract_activations,\n", | ||
" SteeringVector,\n", | ||
" LayerType,\n", | ||
")\n", | ||
"\n", | ||
"from repepo.core.evaluate import EvalResult\n", | ||
"from repepo.steering.get_aggregator import get_aggregator\n", | ||
"from repepo.steering.evaluate_steering_vector import (\n", | ||
" evaluate_steering_vector,\n", | ||
")\n", | ||
"\n", | ||
"from repepo.steering.run_experiment import setup_logger\n", | ||
"\n", | ||
"\n", | ||
"\n", | ||
"def run_experiment_with_random_steering_vector(\n", | ||
" config: SteeringConfig,\n", | ||
" force_rerun: bool = False,\n", | ||
" logging_level: str = \"INFO\",\n", | ||
") -> EvalResult:\n", | ||
" # Set up logger\n", | ||
" logger = setup_logger(logging_level)\n", | ||
" logger.info(f\"Running experiment with config: \\n{pformat(config)}\")\n", | ||
" \n", | ||
" # Set up pipeline\n", | ||
" model, tokenizer = get_model_and_tokenizer(config.model_name)\n", | ||
" formatter = get_formatter(config.formatter)\n", | ||
" pipeline = Pipeline(model, tokenizer, formatter=formatter)\n", | ||
"\n", | ||
" # Initialize a random steering vector\n", | ||
" layer_activations = {config.layer: torch.randn(4096)}\n", | ||
" steering_vector = SteeringVector(\n", | ||
" layer_activations = layer_activations,\n", | ||
" layer_type = cast(LayerType, config.layer_type),\n", | ||
" )\n", | ||
"\n", | ||
" # Evaluate steering vector\n", | ||
" test_dataset = make_dataset(config.test_dataset, config.test_split)\n", | ||
" with EmptyTorchCUDACache():\n", | ||
" eval_results = evaluate_steering_vector(\n", | ||
" pipeline=pipeline,\n", | ||
" steering_vector=steering_vector,\n", | ||
" dataset=test_dataset,\n", | ||
" layers=[config.layer],\n", | ||
" multipliers=[config.multiplier],\n", | ||
" completion_template=config.test_completion_template,\n", | ||
" patch_generation_tokens_only=config.patch_generation_tokens_only,\n", | ||
" skip_first_n_generation_tokens=config.skip_first_n_generation_tokens,\n", | ||
" logger=logger,\n", | ||
" )\n", | ||
" assert len(eval_results) == 1, \"Expected one result\"\n", | ||
" eval_result = eval_results[0]\n", | ||
"\n", | ||
" return eval_result\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import itertools\n", | ||
"from repepo.steering.sweeps.constants import ALL_ABSTRACT_CONCEPT_DATASETS\n", | ||
"\n", | ||
"datasets = ALL_ABSTRACT_CONCEPT_DATASETS\n", | ||
"layer = 13\n", | ||
"multipliers = [-2, -1, 0, 1, 2]\n", | ||
"\n", | ||
"def iter_config():\n", | ||
" for dataset, multiplier in itertools.product(datasets, multipliers):\n", | ||
" yield SteeringConfig(\n", | ||
" train_dataset=dataset,\n", | ||
" train_split=\"0%:+10\",\n", | ||
" formatter=\"llama-chat-formatter\",\n", | ||
" layer=layer,\n", | ||
" multiplier=multiplier,\n", | ||
" test_dataset=dataset,\n", | ||
" test_split=\"40%:+10\",\n", | ||
" test_completion_template=\"{prompt} My answer is: {response}\",\n", | ||
" patch_generation_tokens_only=True,\n", | ||
" skip_first_n_generation_tokens=1,\n", | ||
" )\n", | ||
"\n", | ||
"results = []\n", | ||
"for config in iter_config():\n", | ||
" with EmptyTorchCUDACache():\n", | ||
" result = run_experiment_with_random_steering_vector(config, force_rerun=True, logging_level=\"INFO\")\n", | ||
" results.append((config, result))\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import pandas as pd \n", | ||
"\n", | ||
"# Aggregate results\n", | ||
"rows = []\n", | ||
"\n", | ||
"for config, result in results:\n", | ||
" rows.append({\n", | ||
" 'dataset': config.test_dataset,\n", | ||
" 'multiplier': config.multiplier,\n", | ||
" 'mean_logit_diff': result.metrics['mean_logit_diff']\n", | ||
" })\n", | ||
"\n", | ||
"df = pd.DataFrame(rows)\n", | ||
"df.head()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Plot logit diff for each dataset.\n", | ||
"import seaborn as sns\n", | ||
"import matplotlib.pyplot as plt\n", | ||
"fig, ax = plt.subplots(figsize=(10, 6))\n", | ||
"for dataset, group in df.groupby('dataset'):\n", | ||
" sns.lineplot(x='multiplier', y='mean_logit_diff', data=group, label=dataset, ax=ax)" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": ".venv", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.10.13" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
Oops, something went wrong.