-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathviral_families.py
executable file
·43 lines (34 loc) · 1015 Bytes
/
viral_families.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
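"""
Script that extracts the viral name from every row of an EVE csv.
Anneliek ter Horst 11/8/17

Takes in a csv with all EVE information and puts out a csv with only the
viral names, one per row. The viral name is the text enclosed in square
brackets in the 'sequence' column. Counting the viruses and assigning them
to a family is not done in this script.

Usage: viral_families.py <input.csv> <output.csv>
"""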
# imports
import csv
import sys
import pandas as pd
import time
from collections import Counter
def extract_viral_name(sequence, start='[', end=']'):
    """Return the viral name enclosed by *start* and *end* in *sequence*.

    The name is the text between the first *start* marker and the first
    *end* marker that follows it, with any stray markers inside the name
    removed (so "[[Some virus]" yields "Some virus"). Digits are kept:
    only the bracket markers are stripped.

    An empty string is returned when *sequence* is not a string (pandas
    typically represents an empty csv cell as a float NaN) or when it does
    not contain a complete start/end pair.
    """
    if not isinstance(sequence, str):
        return ''

    begin = sequence.find(start)
    if begin == -1:
        return ''

    # Look for the closing marker only after the opening one, so an
    # unrelated end marker earlier in the string cannot produce an
    # empty or truncated name.
    stop = sequence.find(end, begin + len(start))
    if stop == -1:
        return ''

    name = sequence[begin + len(start):stop]
    return name.replace(start, '').replace(end, '')


def main(argv=None):
    """Read the EVE csv named in argv[1] and write viral names to argv[2].

    The number of rows read is printed to stdout. The output csv has one
    viral name per row, in input order; rows without a bracketed name
    produce an empty field so that row counts stay aligned with the input.

    argv defaults to sys.argv. The process exits with a usage message when
    the input or output path is missing.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 3:
        sys.exit('usage: {} <input.csv> <output.csv>'.format(argv[0]))

    # DataFrame.from_csv was removed in pandas 1.0. read_csv with
    # index_col=0 matches its default of using the first column as the
    # index, which is then discarded by reset_index(drop=True). Passing the
    # path also lets pandas close the file itself.
    df = pd.read_csv(argv[1], index_col=0).reset_index(drop=True)

    # print length of csv
    print(len(df))

    viral_names = [extract_viral_name(seq) for seq in df['sequence']]

    # newline='' stops the csv module's own line endings from being
    # translated a second time (blank lines between rows on Windows).
    with open(argv[2], 'w', newline='') as handle:
        csv.writer(handle).writerows([name] for name in viral_names)


if __name__ == '__main__':
    main()