-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathgenetoprotein.py
70 lines (42 loc) · 1.89 KB
/
genetoprotein.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
from Bio import SeqIO
import pandas as pd
def to_file(data, filename):
    """Dump a mapping to *filename* as tab-separated ``key<TAB>value`` lines.

    Lines are joined with a newline; no trailing newline is written, so an
    empty mapping produces an empty file. The file is opened in write mode
    with UTF-8 encoding, replacing any existing content.

    Args:
        data: Mapping whose ``items()`` are written in iteration order.
        filename: Path of the file to create or overwrite.
    """
    with open(filename, 'w', encoding='utf-8') as out:
        rows = []
        for name, entry in data.items():
            rows.append(f'{name}\t{entry}')
        out.write('\n'.join(rows))
def get_mapping(fasta_path="uniprot_sprot.fasta", species="MOUSE"):
    """Build a gene-name -> accession lookup from a FASTA file.

    Each record id is split on "|" and is expected to have at least three
    fields: the second is taken as the accession and the third as an entry
    name of the form ``<NAME>_<SPECIES>``. The gene name is read from the
    ``GN=<gene>`` token of the space-separated record description.

    Args:
        fasta_path: Path of the FASTA file to parse. Defaults to the file
            name this script has always used.
        species: Species suffix (text after the first "_" of the entry
            name) that a record must carry to be kept.

    Returns:
        dict mapping gene name to accession. If several kept records share
        a gene name, the record appearing last in the file wins, because
        each assignment overwrites the previous one.
    """
    mapping = {}
    with open(fasta_path) as handle:
        for record in SeqIO.parse(handle, "fasta"):
            fields = record.id.split("|")
            if len(fields) < 3:
                # Header is not in the db|ACCESSION|ENTRY form; skip it
                # instead of failing with an IndexError.
                continue
            accession = fields[1]
            entry_parts = fields[2].split("_")
            if len(entry_parts) < 2 or entry_parts[1] != species:
                continue
            for token in record.description.split(" "):
                if token.startswith("GN="):
                    # Split once so a value containing "=" is not truncated.
                    gene = token.split("=", 1)[1]
                    mapping[gene] = accession
    return mapping
# Load the luminal-fluid proteomics sheet of the collaboration workbook.
dfs = pd.read_excel("Cheng_Collaboration_data.xlsx", sheet_name="Protein from luminal fluid")

# Measurement columns that must all hold a value for a row to be kept.
luminal_columns = [
    'luminal protein estrus', 'luminal protein 0.5',
    'luminal protein 1.5', 'luminal protein 2.5',
    'luminal protein so estrus', 'luminal protein so 0.5',
    'luminal protein so 1.5', 'luminal protein so 2.5',
]

# Retain only rows that have a value in every one of the columns above.
# NOTE(review): this comment used to read "at least one value", but the
# filter has always AND-ed the not-null checks. If "at least one" is what
# is actually wanted, use .any(axis=1) instead of .all(axis=1).
selected_rows = dfs[dfs[luminal_columns].notna().all(axis=1)]
joy_proteins = set(selected_rows['Accession'].tolist())
print(joy_proteins)

# Gene -> accession lookup, restricted to accessions present in the sheet.
mapping = get_mapping()
mapping = {gene: acc for gene, acc in mapping.items() if acc in joy_proteins}

# Accessions from the sheet that no gene in the lookup maps to.
rem = joy_proteins - set(mapping.values())
print(rem)
print(len(rem))

# The script deliberately stops here (same stop point as before).
# `raise SystemExit` replaces the `exit()` helper, which is injected by the
# `site` module for interactive sessions and is not guaranteed to exist.
raise SystemExit

# NOTE(review): everything below is currently unreachable because of the
# early stop above. It is kept so the remaining steps are not lost; remove
# the stop above (and the one below) to run them.
to_file(mapping, 'output.csv')
print(f"Found {len(mapping)} proteins, remaining {len(joy_proteins) - len(mapping)}")
raise SystemExit

# Overlap between the 'Accession' column of the bulk RNA-seq literature
# sheet and the gene names found in the lookup.
dfs = pd.read_excel("Cheng_Collaboration_data.xlsx", sheet_name="bulkRNA-seq literature")
joy_genes = set(dfs['Accession'].tolist())
yo = set(mapping.keys())
print(len(joy_genes.intersection(yo)))