create_backdoored_dataset.py
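"""Create backdoored example datasets for TrojAI models.

For every model listed in the selected round's METADATA.csv, this script builds
backdoored copies of the model's clean example data using two trigger families:
  * square polygon triggers of sizes 30/40/50, in a 'random' color and in gray (127, 127, 127)
  * Instagram-style filter triggers: gotham, kelvin, lomo, nashville, toaster
Dataset creation runs in parallel with multiprocessing; datasets that are already
complete are skipped, and partial ones are deleted and regenerated.
"""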
import os
import ast
import shutil
import socket  # used only by the commented-out per-host work splitting in main()
from datetime import datetime

import pandas as pd
import multiprocessing as mp

from tools.logistics import get_project_root_path
from tools.data import create_backdoored_dataset


def create_dataset_multiprocessing(dict_params):
    images_count_per_class = dict_params['images_per_class']
    num_classes = dict_params['num_classes']
    params_method = dict_params['params_method']

    path_data_backd = params_method['dir_backdoored_data']
    items = path_data_backd.split(os.path.sep)
    short_path = os.path.join(items[-3], items[-2], items[-1])

    # skip datasets that are already complete; delete and regenerate partial ones
    if os.path.isdir(path_data_backd):
        files_count_in_backd_dataset = len(os.listdir(path_data_backd))
        if files_count_in_backd_dataset == (num_classes * images_count_per_class + 1):  # +1 for the CSV metadata file
            print(f'already complete ({files_count_in_backd_dataset}) {short_path}')
            return
        shutil.rmtree(path_data_backd)
        print(f'deleted ({files_count_in_backd_dataset}) {path_data_backd}')

    create_backdoored_dataset(**params_method)
    print(f'done {short_path}')
def main():
    path_root_project = get_project_root_path()
    # path_root = os.path.join(path_root_project, 'TrojAI-data', 'round1-dataset-train')
    # path_root = os.path.join(path_root_project, 'TrojAI-data', 'round1-holdout-dataset')
    # path_root = os.path.join(path_root_project, 'TrojAI-data', 'round2-train-dataset')
    # path_root = os.path.join(path_root_project, 'TrojAI-data', 'round2-holdout-dataset')
    # path_root = os.path.join(path_root_project, 'TrojAI-data', 'round3-train-dataset')
    path_root = os.path.join(path_root_project, 'TrojAI-data', 'round3-holdout-dataset')

    path_metadata = os.path.join(path_root, 'METADATA.csv')
    metadata = pd.read_csv(path_metadata)

    mp_mapping_params = []
    list_trigger_sizes = [30, 40, 50]
    list_filters = ['gotham', 'kelvin', 'lomo', 'nashville', 'toaster']

    # per-host model-id ranges for splitting the work across machines
    # list_limits = {
    #     # 'openlab30.umiacs.umd.edu': (0, 275),
    #     # 'openlab31.umiacs.umd.edu': (276, 551),
    #     # 'openlab30.umiacs.umd.edu': (0, 275),
    #     'openlab08.umiacs.umd.edu': (0, 1103)
    # }
    # IMAGES_PER_CLASS = 100  # 50 for round 1
    # IMAGES_PER_CLASS = 10  # 50 for round 2

    for _, row in metadata.iterrows():
        model_name = row['model_name']
        # model_id = int(model_name[3:])
        # left, right = list_limits[socket.gethostname()]
        # if left <= model_id <= right:
        # if os.path.isdir(os.path.join(path_root, model_name)) and str(row['trigger_color']) != 'None':
        if True:
            num_classes = row['number_classes']
            number_example_images = int(row['number_example_images'])
            trigger_color = row['trigger_color']

            triggered_classes = row['triggered_classes']
            if triggered_classes.lower() == 'none':
                triggered_classes = '[]'
            triggered_classes = ast.literal_eval(triggered_classes.replace(' ', ', '))

            # default class for clean models:
            # trigger_target_class is set to 0 so that a valid parameter is passed to create_backdoored_dataset
            trigger_target_class = row['trigger_target_class']
            trigger_target_class = int(trigger_target_class) if trigger_target_class.lower() != 'none' else 0

            ###########################################################################################################

            path_model = os.path.join(path_root, model_name)
            if os.path.isdir(path_model):
                path_data_clean = os.path.join(path_model, 'clean_example_data')

                # for each clean dataset, generate backdoored datasets with a square trigger,
                # one per (trigger color, trigger size) combination
                for trigger_color in ['random', (127, 127, 127)]:
                    for p_trigger_size in list_trigger_sizes:
                        path_data_backd = os.path.join(path_model, f'backdoored_data_square-{p_trigger_size}')
                        if type(trigger_color) is str:
                            path_data_backd += f'_{trigger_color}'
                        mapping_param_dict = dict(
                            num_classes=num_classes,
                            images_per_class=number_example_images,
                            params_method=dict(
                                dir_clean_data=path_data_clean,
                                dir_backdoored_data=path_data_backd,
                                trigger_type='polygon',
                                trigger_name='square',
                                trigger_color=trigger_color,
                                trigger_size=p_trigger_size,
                                # triggered_classes=triggered_classes,
                                triggered_classes='all',
                                trigger_target_class=trigger_target_class)
                        )
                        mp_mapping_params.append(mapping_param_dict)
                        # create_dataset_multiprocessing(mapping_param_dict)

                # generate backdoored datasets with each Instagram-style filter trigger
                for p_filter_name in list_filters:
                    path_data_backd = os.path.join(path_model, f'backdoored_data_filter_{p_filter_name}')
                    mapping_param_dict = dict(
                        num_classes=num_classes,
                        images_per_class=number_example_images,
                        params_method=dict(
                            dir_clean_data=path_data_clean,
                            dir_backdoored_data=path_data_backd,
                            trigger_type='filter',
                            trigger_name=p_filter_name,
                            trigger_color=None,
                            trigger_size=None,
                            # triggered_classes=triggered_classes,
                            triggered_classes='all',
                            trigger_target_class=trigger_target_class)
                    )
                    mp_mapping_params.append(mapping_param_dict)
                    # create_dataset_multiprocessing(mapping_param_dict)

    cpus = max(1, mp.cpu_count() - 4)  # leave a few cores free, but keep at least one worker
    print(f'Creating {len(mp_mapping_params)} datasets using {cpus} CPU cores')
    with mp.Pool(processes=cpus) as pool:
        pool.map(create_dataset_multiprocessing, mp_mapping_params)


if __name__ == '__main__':
    t = datetime.now()
    main()
    print(f'script ended, took {datetime.now() - t}')