-
Notifications
You must be signed in to change notification settings - Fork 24
/
utils.py
100 lines (80 loc) · 3.38 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import numpy as np
from language_models import HuggingFace
def insert_adv_string(msg, adv):
return msg + adv
def schedule_n_to_change_fixed(max_n_to_change, it):
""" Piece-wise constant schedule for `n_to_change` (both characters and tokens) """
# it = int(it / n_iters * 10000)
if 0 < it <= 10:
n_to_change = max_n_to_change
elif 10 < it <= 25:
n_to_change = max_n_to_change // 2
elif 25 < it <= 50:
n_to_change = max_n_to_change // 4
elif 50 < it <= 100:
n_to_change = max_n_to_change // 8
elif 100 < it <= 500:
n_to_change = max_n_to_change // 16
else:
n_to_change = max_n_to_change // 32
n_to_change = max(n_to_change, 1)
return n_to_change
def schedule_n_to_change_prob(max_n_to_change, prob, target_model):
""" Piece-wise constant schedule for `n_to_change` based on the best prob """
# it = int(it / n_iters * 10000)
# need to adjust since llama and r2d2 are harder to attack
if isinstance(target_model.model, HuggingFace):
if 0 <= prob <= 0.01:
n_to_change = max_n_to_change
elif 0.01 < prob <= 0.1:
n_to_change = max_n_to_change // 2
elif 0.1 < prob <= 1.0:
n_to_change = max_n_to_change // 4
else:
raise ValueError(f'Wrong prob {prob}')
else:
if 0 <= prob <= 0.1:
n_to_change = max_n_to_change
elif 0.1 < prob <= 0.5:
n_to_change = max_n_to_change // 2
elif 0.5 < prob <= 1.0:
n_to_change = max_n_to_change // 4
else:
raise ValueError(f'Wrong prob {prob}')
n_to_change = max(n_to_change, 1)
return n_to_change
def extract_logprob(logprob_dict, target_token):
logprobs = []
if ' ' + target_token in logprob_dict:
logprobs.append(logprob_dict[' ' + target_token])
if target_token in logprob_dict:
logprobs.append(logprob_dict[target_token])
if logprobs == []:
return -np.inf
else:
return max(logprobs)
def early_stopping_condition(best_logprobs, target_model, logprob_dict, target_token, determinstic_jailbreak, no_improvement_history=750,
prob_start=0.02, no_improvement_threshold_prob=0.01):
if determinstic_jailbreak and logprob_dict != {}:
argmax_token = max(logprob_dict, key=logprob_dict.get)
if argmax_token in [target_token, ' '+target_token]:
return True
else:
return False
if len(best_logprobs) == 0:
return False
best_logprob = best_logprobs[-1]
if no_improvement_history < len(best_logprobs):
prob_best, prob_history = np.exp(best_logprobs[-1]), np.exp(best_logprobs[-no_improvement_history])
no_sufficient_improvement_condition = prob_best - prob_history < no_improvement_threshold_prob
else:
no_sufficient_improvement_condition = False
if isinstance(target_model.model, HuggingFace) and np.exp(best_logprob) > prob_start and no_sufficient_improvement_condition:
return True
if isinstance(target_model.model, HuggingFace) and np.exp(best_logprob) > 0.1:
return True
# for all other models
# note: for GPT models, `best_logprob` is the maximum logprob over the randomness of the model (so it's not the "real" best logprob)
if np.exp(best_logprob) > 0.4:
return True
return False