-
Notifications
You must be signed in to change notification settings - Fork 14
/
generate_multiply.py
172 lines (143 loc) · 5.91 KB
/
generate_multiply.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
import argparse
import os
import csv
import logging
import fnmatch
import numpy as np
import time
from datetime import datetime
from tqdm import tqdm
from pathlib import Path
def get_files_recursively(folder_path):
    """Return paths (as strings) of all image files under ``folder_path``.

    Searches recursively; png/jpg/jpeg/gif/webp are matched in any
    letter case via character-class glob patterns.
    """
    image_glob_patterns = [
        '*.[Pp][Nn][Gg]', '*.[Jj][Pp][Gg]', '*.[Jj][Pp][Ee][Gg]',
        '*.[Gg][Ii][Ff]', '*.[Ww][Ee][Bb][Pp]',
    ]
    root = Path(folder_path)
    found = []
    for glob_pattern in image_glob_patterns:
        found.extend(str(path) for path in root.rglob(glob_pattern))
    return found
def read_weight_mapping(weight_mapping_csv):
    """Read a two-column CSV of (directory pattern, weight) into a dict.

    Args:
        weight_mapping_csv: path to a CSV whose rows are ``pattern,weight``.

    Returns:
        dict mapping pattern -> weight. Weights are kept as strings;
        callers convert with ``float()`` when the weight is used.
    """
    weight_mapping = {}
    # newline='' is the documented way to open files for the csv module
    with open(weight_mapping_csv, 'r', newline='') as f:
        reader = csv.reader(f)
        for row in reader:
            # Tolerate blank lines in hand-edited CSVs instead of
            # crashing on the 2-tuple unpack below
            if not row:
                continue
            pattern, weight = row
            weight_mapping[pattern] = weight
    return weight_mapping
class WeightTree(object):
    """A directory tree annotated with sampling weights.

    Each node corresponds to one directory under the scanned root. A node
    records how many image files sit directly in its directory
    (``n_images``), keeps child nodes only for subdirectories that
    (transitively) contain images, and carries a ``weight`` used to bias
    sampling between siblings.
    """

    def __init__(self, dirname, weight_mapping=None, progress_bar=None):
        """Recursively scan ``dirname`` and build the subtree.

        Args:
            dirname: directory to scan.
            weight_mapping: optional dict mapping a directory basename, or
                an fnmatch pattern matched against the full path, to a
                weight (string or number).
            progress_bar: optional tqdm-like object; ``update(1)`` is
                called once per image found.
        """
        self.dirname = dirname
        self.n_images = 0
        self.contain_images = False
        self.children = []
        for entry in os.listdir(dirname):
            path = os.path.join(self.dirname, entry)
            if os.path.isfile(path):
                extension = os.path.splitext(path)[1]
                # '.gif' included so this list stays consistent with the
                # glob patterns in get_files_recursively (which sizes the
                # progress bar); comparison is case-insensitive via lower()
                if extension.lower() in [
                        '.jpg', '.jpeg', '.png', '.gif', '.webp']:
                    if progress_bar is not None:
                        progress_bar.update(1)
                    self.n_images += 1
                    self.contain_images = True
            elif os.path.isdir(path):
                sub_weight_tree = WeightTree(
                    path, weight_mapping, progress_bar)
                # Keep only subtrees that actually lead to images
                if (sub_weight_tree.contain_images
                        or len(sub_weight_tree.children) > 0):
                    self.children.append(sub_weight_tree)
        self.weight = self.modify_weight(weight_mapping)

    def modify_weight(self, training_weights):
        """Return this directory's weight from ``training_weights``.

        An exact basename match takes precedence over fnmatch patterns
        (matched against the full directory path); defaults to 1 when no
        mapping is given or nothing matches.
        """
        if training_weights is None:
            return 1
        basename = os.path.basename(self.dirname)
        if basename in training_weights:
            return float(training_weights[basename])
        for pattern in training_weights:
            if fnmatch.fnmatch(self.dirname, pattern):
                return float(training_weights[pattern])
        return 1

    def compute_sampling_prob(
            self, baseprob, dir_list, prob_list, n_images_list):
        """Distribute ``baseprob`` over this subtree.

        Sibling weights (children plus, if this directory holds images
        directly, this node's own weight) are normalized into
        probabilities. ``dir_list``, ``prob_list`` and ``n_images_list``
        are extended in place with one entry per directory that directly
        contains images.
        """
        weights_list = [child.weight for child in self.children]
        if self.contain_images:
            # This directory's own images compete with the children;
            # its weight is the last entry, hence probs[-1] below
            weights_list.append(self.weight)
        probs = np.array(weights_list) / np.sum(weights_list)
        if self.contain_images:
            dir_list.append(self.dirname)
            prob_list.append(baseprob * probs[-1])
            n_images_list.append(self.n_images)
        for i, child in enumerate(self.children):
            child.compute_sampling_prob(
                baseprob * probs[i], dir_list, prob_list, n_images_list)
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Compute per-directory repeat counts and write them '
                    'to multiply.txt in each image directory')
    parser.add_argument('--src_dir', type=str,
                        help='Directory to generate multiply.txt')
    parser.add_argument('--logdir', default='logs',
                        help='Directory to save log file')
    parser.add_argument(
        '--max_multiply', type=int, default=100,
        help='maximum multiply of each image')
    parser.add_argument(
        '--min_multiply', type=float, default=1,
        help='minimum multiply of each image')
    parser.add_argument(
        '--weight_csv', default=None,
        help='If provided use the provided csv to modify weights')
    args = parser.parse_args()

    if args.weight_csv is not None:
        weight_mapping = read_weight_mapping(args.weight_csv)
    else:
        weight_mapping = None

    # Count once up front only to size the progress bar; WeightTree does
    # the authoritative per-directory counting during its scan.
    n_images_found = len(get_files_recursively(args.src_dir))
    bar = tqdm(total=n_images_found)
    weight_tree = WeightTree(args.src_dir, weight_mapping, bar)

    dir_list = []
    prob_list = []
    n_images_list = []
    weight_tree.compute_sampling_prob(1, dir_list, prob_list, n_images_list)

    probs = np.array(prob_list)
    n_images_array = np.array(n_images_list)
    per_image_weights = probs / n_images_array
    # Normalize so the smallest per-image weight maps to min_multiply,
    # then round to 2 decimals and clip at max_multiply
    per_image_multiply = per_image_weights / np.min(per_image_weights)
    per_image_multiply = per_image_multiply * args.min_multiply
    per_image_multiply_final = np.minimum(
        np.around(per_image_multiply, 2), args.max_multiply)

    if args.logdir is not None:
        os.makedirs(args.logdir, exist_ok=True)
        # strftime keeps the filename free of characters (':', ' ') that
        # str(datetime.now()) would produce and some filesystems reject
        timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
        logfile = os.path.join(args.logdir, f'log_{timestamp}.txt')
        logging.basicConfig(
            filename=logfile, level=logging.INFO, filemode='w')

    n_images_total = 0
    n_images_virtual_total = 0
    # Iterate smallest-to-largest multiply so the log reads in order
    for k in np.argsort(per_image_multiply):
        dirname = dir_list[k]
        n_images = n_images_list[k]
        multiply = per_image_multiply_final[k]
        n_images_total += n_images
        n_images_virtual_total += n_images * multiply
        with open(os.path.join(dirname, 'multiply.txt'), 'w') as f:
            f.write(str(multiply))
        if args.logdir is not None:
            logging.info(dirname)
            logging.info(f'sampling probability: {prob_list[k]}')
            logging.info(f'number of images: {n_images}')
            logging.info(f'original multiply: {per_image_multiply[k]}')
            logging.info(f'final multiply: {multiply}\n')
    logging.info(f'Number of images: {n_images_found}')
    logging.info(f'Virtual dataset size: {n_images_virtual_total}')
    # NOTE(review): presumably gives the log handler time to flush before
    # exit — logging.shutdown() would be deterministic; confirm intent
    time.sleep(1)
    print(f'Number of images: {n_images_found}')
    print(f'Virtual dataset size: {n_images_virtual_total}')