Skip to content

Commit

Permalink
Add notebooks to run experiments
Browse files Browse the repository at this point in the history
  • Loading branch information
dtch1997 committed Mar 13, 2024
1 parent 66cb2d7 commit 9c662c7
Show file tree
Hide file tree
Showing 8 changed files with 1,467 additions and 0 deletions.
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ repos:
- id: end-of-file-fixer
- id: check-yaml
- id: check-added-large-files
args: ['--maxkb=1000']
- repo: https://github.com/psf/black
rev: 23.10.1
hooks:
Expand Down
204 changes: 204 additions & 0 deletions repepo/experiments/ablate_random_steering_vector.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,204 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Ablation: what happens when we steer with a randomly sampled vector instead of an extracted steering vector?"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# NOTE: We steer with a randomly sampled vector rather than an extracted one, so we define a variant of the run_experiment function here\n",
"\n",
"import logging\n",
"import sys\n",
"import torch\n",
"import functools\n",
"\n",
"from typing import cast\n",
"from pprint import pformat\n",
"from repepo.core.pipeline import Pipeline\n",
"from repepo.steering.utils.helpers import (\n",
" SteeringConfig,\n",
" EmptyTorchCUDACache,\n",
" get_model_and_tokenizer,\n",
" get_formatter,\n",
" make_dataset,\n",
" get_experiment_path,\n",
" get_eval_result_path,\n",
" save_eval_result,\n",
" load_eval_result,\n",
" get_activation_path,\n",
" save_activation,\n",
" load_activation,\n",
" save_metric,\n",
")\n",
"\n",
"from repepo.steering.build_steering_training_data import (\n",
" build_steering_vector_training_data,\n",
")\n",
"from repepo.steering.concept_metrics import (\n",
" VarianceOfNormSimilarityMetric,\n",
" EuclideanSimilarityMetric,\n",
" CosineSimilarityMetric,\n",
" compute_difference_vectors,\n",
")\n",
"\n",
"from repepo.steering.utils.database import SteeringConfigDatabase\n",
"\n",
"from steering_vectors.train_steering_vector import (\n",
" extract_activations,\n",
" SteeringVector,\n",
" LayerType,\n",
")\n",
"\n",
"from repepo.core.evaluate import EvalResult\n",
"from repepo.steering.get_aggregator import get_aggregator\n",
"from repepo.steering.evaluate_steering_vector import (\n",
" evaluate_steering_vector,\n",
")\n",
"\n",
"from repepo.steering.run_experiment import setup_logger\n",
"\n",
"\n",
"\n",
"def run_experiment_with_random_steering_vector(\n",
" config: SteeringConfig,\n",
" force_rerun: bool = False,\n",
" logging_level: str = \"INFO\",\n",
") -> EvalResult:\n",
" # Set up logger\n",
" logger = setup_logger(logging_level)\n",
" logger.info(f\"Running experiment with config: \\n{pformat(config)}\")\n",
" \n",
" # Set up pipeline\n",
" model, tokenizer = get_model_and_tokenizer(config.model_name)\n",
" formatter = get_formatter(config.formatter)\n",
" pipeline = Pipeline(model, tokenizer, formatter=formatter)\n",
"\n",
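"    # Initialize a random steering vector. A fresh vector is sampled on every call,\n",
"    # so runs with different multipliers do not share the same direction.\n",
"    # NOTE(review): 4096 is presumably the model's hidden size -- confirm it matches config.model_name.\n",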
" layer_activations = {config.layer: torch.randn(4096)}\n",
" steering_vector = SteeringVector(\n",
" layer_activations = layer_activations,\n",
" layer_type = cast(LayerType, config.layer_type),\n",
" )\n",
"\n",
" # Evaluate steering vector\n",
" test_dataset = make_dataset(config.test_dataset, config.test_split)\n",
" with EmptyTorchCUDACache():\n",
" eval_results = evaluate_steering_vector(\n",
" pipeline=pipeline,\n",
" steering_vector=steering_vector,\n",
" dataset=test_dataset,\n",
" layers=[config.layer],\n",
" multipliers=[config.multiplier],\n",
" completion_template=config.test_completion_template,\n",
" patch_generation_tokens_only=config.patch_generation_tokens_only,\n",
" skip_first_n_generation_tokens=config.skip_first_n_generation_tokens,\n",
" logger=logger,\n",
" )\n",
" assert len(eval_results) == 1, \"Expected one result\"\n",
" eval_result = eval_results[0]\n",
"\n",
" return eval_result\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import itertools\n",
"from repepo.steering.sweeps.constants import ALL_ABSTRACT_CONCEPT_DATASETS\n",
"\n",
"datasets = ALL_ABSTRACT_CONCEPT_DATASETS\n",
"layer = 13\n",
"multipliers = [-2, -1, 0, 1, 2]\n",
"\n",
"def iter_config():\n",
" for dataset, multiplier in itertools.product(datasets, multipliers):\n",
" yield SteeringConfig(\n",
" train_dataset=dataset,\n",
" train_split=\"0%:+10\",\n",
" formatter=\"llama-chat-formatter\",\n",
" layer=layer,\n",
" multiplier=multiplier,\n",
" test_dataset=dataset,\n",
" test_split=\"40%:+10\",\n",
" test_completion_template=\"{prompt} My answer is: {response}\",\n",
" patch_generation_tokens_only=True,\n",
" skip_first_n_generation_tokens=1,\n",
" )\n",
"\n",
"results = []\n",
"for config in iter_config():\n",
" with EmptyTorchCUDACache():\n",
" result = run_experiment_with_random_steering_vector(config, force_rerun=True, logging_level=\"INFO\")\n",
" results.append((config, result))\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd \n",
"\n",
"# Aggregate results\n",
"rows = []\n",
"\n",
"for config, result in results:\n",
" rows.append({\n",
" 'dataset': config.test_dataset,\n",
" 'multiplier': config.multiplier,\n",
" 'mean_logit_diff': result.metrics['mean_logit_diff']\n",
" })\n",
"\n",
"df = pd.DataFrame(rows)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Plot logit diff for each dataset.\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"fig, ax = plt.subplots(figsize=(10, 6))\n",
"for dataset, group in df.groupby('dataset'):\n",
" sns.lineplot(x='multiplier', y='mean_logit_diff', data=group, label=dataset, ax=ax)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading

0 comments on commit 9c662c7

Please sign in to comment.