-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy patheval_neural_exec.py
136 lines (103 loc) · 5.4 KB
/
eval_neural_exec.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import os, sys, importlib, argparse, glob
import torch
from NeuralExec.llm import load_llm
from NeuralExec.discrete_opt import WhiteBoxTokensOpt
from NeuralExec.utility import read_pickle, write_pickle, _hash
from NeuralExec.evaluation.tester import run_injection, FuzzyCheckerPromptInjcetion
from NeuralExec.ex_triggers import NeuralExec
from NeuralExec.utility import mkdir
from confs.evaluation_setup import vhparams
def make_logfile_path(hparams, llm_name, trigger_str, test_path, logtype):
    """Build a deterministic log-file path for one evaluation run.

    The file name is assembled from 8-character hash prefixes of the
    target LLM name, the trigger string pair, and the test-set path, plus
    a suffix selecting the log type.

    Args:
        hparams: config dict; only ``hparams['result_dir_log']`` is read here.
        llm_name: identifier of the target LLM.
        trigger_str: pair of strings (prefix/suffix of the trigger);
            both halves are concatenated before hashing.
        test_path: path of the test-prompt pickle.
        logtype: 0 for the injection-run log, 1 for the verifier log.

    Returns:
        Tuple ``(full_path, name)`` — the path inside the result log
        directory and the bare file name.
    """
    suffixes = ('runs', 'verifier')

    def short_hash(value):
        # 8-char prefix of the project hash, matching the original scheme.
        return str(_hash(value))[:8]

    pieces = (
        short_hash(llm_name),
        short_hash(trigger_str[0] + trigger_str[1]),
        short_hash(test_path),
        suffixes[logtype],
    )
    name = '_'.join(pieces)
    return os.path.join(hparams['result_dir_log'], name), name
if __name__ == '__main__':
    # Two-phase evaluation driver:
    #   phase 1 — run the prompt-injection attack for every trigger log and
    #             cache the raw model outputs to disk;
    #   phase 2 — score the cached outputs with a separate verifier LLM.
    # Both phases skip work whose log file already exists, so the script is
    # safely re-runnable.

    # Create the parser
    parser = argparse.ArgumentParser(description="Evaluate an execution trigger against a target LLM.")

    # Add arguments
    parser.add_argument("log_path", type=str,
                        help="Path to the log file for execution trigger or pattern (e.g., './logs/baselines/baseline_*')")
    parser.add_argument("gpus", type=str,
                        help='Comma-separated list of GPUs to use (e.g., "0,1,2,3").')
    parser.add_argument("--target_llm", type=str, default=None,
                        help="String defining the LLM to attack. Default is the target LLM for the Neural Exec.")
    parser.add_argument("--path_test_prompts", type=str, default=None,
                        help="Path to the test prompts file. Default is None.")

    # Parse the arguments
    args = parser.parse_args()

    # set gpus — must happen before any CUDA initialization by torch.
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpus

    # log_path may be a glob pattern; each match is one trigger log to test.
    paths = glob.glob(args.log_path)
    print(f'Triggers to test: {len(paths)}')

    # load first for default parameters
    # NOTE(review): raises IndexError if the glob matched nothing — presumably
    # intentional fail-fast behavior; confirm with the authors.
    logger = read_pickle(paths[0])
    trigger, _ = logger.get_last_adv_tok(best=True)
    hparams = logger.confs
    # Evaluation-specific settings override the training-time config.
    hparams.update(vhparams)

    mkdir(hparams['result_dir_log'])

    if args.target_llm is None:
        # Attack the same LLM the trigger was optimized against.
        llm_name = hparams['llm']
        print(f"Getting target LLM ({llm_name}) from trigger's hparameters")
        # load LLM
        print(f"Loading {llm_name}...")
        llm = load_llm(llm_name)
        # No cross-tokenizer conversion needed when target == training LLM.
        source_tokenizer = None
    else:
        # Transfer setting: evaluate the trigger against a different LLM.
        llm_name = args.target_llm
        # load LLM
        print(f"Loading {llm_name}...")
        llm = load_llm(llm_name)
        # Keep the training LLM's tokenizer so trigger tokens can be
        # re-encoded for the new target's vocabulary.
        source_tokenizer = load_llm(hparams['llm'], tokenizer_only=True).tokenizer

    # load data
    if args.path_test_prompts is None:
        test_path = hparams['testset_path']
        print(f"Getting test set ({test_path}) from trigger's hparameters")
    else:
        test_path = args.path_test_prompts
    test_prompts = read_pickle(test_path)

    # setup opt class
    wbo = WhiteBoxTokensOpt(llm, hparams)

    # ---- phase 1: run injections for every trigger, caching results ----
    for path in paths:
        logger = read_pickle(path)
        trigger, _ = logger.get_last_adv_tok(best=True)
        if not source_tokenizer is None:
            # Map trigger tokens from the training tokenizer to the target's.
            trigger = NeuralExec.convert_tokens_to_other_tokenizer(trigger, source_tokenizer, llm.tokenizer)
        trigger_str = trigger.decode(llm.tokenizer)
        print(trigger_str)
        # phase-1] run prompt injection and collects outputs
        run_log_path, _ = make_logfile_path(hparams, llm_name, trigger_str, test_path, 0)
        if os.path.isfile(run_log_path):
            print(f"{run_log_path} already computed. Skipping...")
            info_runs, injection_runs = read_pickle(run_log_path)
        else:
            print(f"Running step-1: Run injection attacks on target {llm_name}. Saving logs in {run_log_path}...")
            injection_runs = run_injection(wbo, trigger, test_prompts, batch_size=hparams['batch_size'], max_new_tokens=hparams['max_new_tokens'])
            # Provenance tuple stored alongside the raw outputs.
            info_runs = (llm_name, trigger, path, test_path, hparams)
            write_pickle(run_log_path, (info_runs, injection_runs))

    # Free target-LLM GPU memory before loading the verifier model.
    torch.cuda.empty_cache()
    print("\tInit verifier...")
    llm_ver = load_llm(hparams['llm_for_verification'])

    # ---- phase 2: verify cached outputs with the verification LLM ----
    for path in paths:
        # Recompute the same trigger string as in phase 1 so the cached
        # phase-1 log file can be located again.
        logger = read_pickle(path)
        trigger, _ = logger.get_last_adv_tok(best=True)
        if not source_tokenizer is None:
            trigger = NeuralExec.convert_tokens_to_other_tokenizer(trigger, source_tokenizer, llm.tokenizer)
        trigger_str = trigger.decode(llm.tokenizer)
        print(trigger_str)
        run_log_path, _ = make_logfile_path(hparams, llm_name, trigger_str, test_path, 0)
        _, injection_runs = read_pickle(run_log_path)
        # phase-2] run verification LLM on collected outputs
        ver_log_path, _ = make_logfile_path(hparams, llm_name, trigger_str, test_path, 1)
        if os.path.isfile(ver_log_path):
            print(f"{ver_log_path} already computed. Skipping...")
        else:
            print(f"Running step-2: Run verification LLM ({hparams['llm_for_verification']}) on collected outputs. Saving logs in {ver_log_path}...")
            # NOTE: class name typo ("Injcetion") comes from the imported
            # project module and must stay as-is to match it.
            verifier = FuzzyCheckerPromptInjcetion(llm_ver, hparams['max_new_tokens_ver'])
            ver_results = verifier(injection_runs, hparams['batch_size_ver'])
            info_runs = (llm_name, trigger, path, test_path, hparams)
            write_pickle(ver_log_path, (info_runs, ver_results))
            print(f'Results: {ver_results}')

    print(f"Evaluation completed. Logs saved in {hparams['result_dir_log']}")