-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcreate_data_file.py
43 lines (32 loc) · 1.44 KB
/
create_data_file.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
# from tqdm import tqdm
import random
import argparse
import concurrent.futures
# One-letter amino-acid codes that random peptides are built from
# (20 entries, kept as a list).
AMINO_ACIDS = list('ARNDCEQGHILKMFPSTWYV')
def generate_peptide(length):
    """Return a random peptide string made of ``length`` residues.

    Residues are drawn with replacement from ``AMINO_ACIDS`` using
    ``random.choices`` and concatenated without a separator.
    """
    residues = random.choices(AMINO_ACIDS, k=length)
    return ''.join(residues)
def generate_peptides(amount, min_length=5, max_length=50):
    """Return a list of ``amount`` random peptides.

    The length of every peptide is chosen independently with
    ``random.randint(min_length, max_length)`` (both bounds inclusive).
    """
    peptides = []
    for _ in range(amount):
        size = random.randint(min_length, max_length)
        peptides.append(generate_peptide(size))
    return peptides
def generate_column_peptides(amount, min_length=5, max_length=50):
    """Return ``amount`` rows, each holding two tab-separated random peptides.

    Both peptides of a row get their own length, drawn with
    ``random.randint(min_length, max_length)`` (both bounds inclusive).
    """
    rows = []
    for _ in range(amount):
        # Left column is generated before the right one, so the order in
        # which random numbers are consumed stays fixed.
        left = generate_peptide(random.randint(min_length, max_length))
        right = generate_peptide(random.randint(min_length, max_length))
        rows.append('\t'.join((left, right)))
    return rows
def _build_parser():
    """Create the command-line parser; every option takes an integer."""
    cli = argparse.ArgumentParser()
    integer_options = (
        ('--min-length', 5),
        ('--max-length', 50),
        ('--amount', 1_000_000),
        ('--batch-size', 1_000),
    )
    for flag, default in integer_options:
        cli.add_argument(flag, type=int, default=default)
    return cli


# NOTE(review): the command line is parsed at import time, outside the
# ``__main__`` guard.
parser = _build_parser()
args = parser.parse_args()
if __name__ == '__main__':
    # Script entry point: generate ``args.amount`` tab-separated peptide
    # pairs in parallel worker processes and print them to stdout.

    # A zero batch size would divide by zero and a negative one makes no
    # sense, so reject both with a regular usage error.
    if args.batch_size <= 0:
        parser.error('--batch-size must be a positive integer')

    # A non-positive amount produces no rows.
    total = max(args.amount, 0)

    # Split the request into full batches plus one final partial batch, so
    # exactly ``total`` rows are produced even when the amount is not a
    # multiple of the batch size (floor division alone drops the remainder).
    full_batches, remainder = divmod(total, args.batch_size)
    batch_sizes = [args.batch_size] * full_batches
    if remainder:
        batch_sizes.append(remainder)

    with concurrent.futures.ProcessPoolExecutor() as executor:
        futures = [
            executor.submit(
                generate_column_peptides,
                size,
                args.min_length,
                args.max_length,
            )
            for size in batch_sizes
        ]
        # Batches are printed as they finish, not in submission order.
        for future in concurrent.futures.as_completed(futures):
            print('\n'.join(future.result()))