-
Notifications
You must be signed in to change notification settings - Fork 0
/
Gene_Attribute Matrix.py
206 lines (191 loc) · 11.9 KB
/
Gene_Attribute Matrix.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
# -*- coding: utf-8 -*-
"""
Created on Tue May 22 16:47:16 2018
@author: bjwil
"""
import requests
import os
import zlib
def _download_file(response, filename):
    """Save the body of ``response`` to ``filename`` byte-for-byte.

    ``response`` must provide ``iter_content(chunk_size=...)``. Every chunk it
    yields is written, in order, to ``filename``, which is opened in binary
    write mode (created, or truncated if it already exists).
    """
    with open(filename, 'wb') as out:
        # writelines() pulls the chunks from the iterator one at a time and
        # writes each as it arrives.
        out.writelines(response.iter_content(chunk_size=1024))
def _download_and_decompress_file(response, filename):
    """Stream a gzip-compressed response body to disk, decompressing on the fly.

    Args:
        response: Object exposing a file-like ``raw`` attribute whose
            ``read(n)`` returns the still-compressed bytes and an empty
            value once the stream is exhausted.
        filename: Name of the compressed download. Its last three characters
            are dropped to form the output path (this assumes the name ends
            in ".gz").
    """
    # 16 + MAX_WBITS tells zlib to expect a gzip header and trailer.
    decompressor = zlib.decompressobj(16 + zlib.MAX_WBITS)
    filename = filename[:-3]
    # decompress() returns bytes, so the output must be opened in binary
    # mode; a text-mode handle ('w+') raises TypeError on Python 3.
    with open(filename, 'wb') as f:
        while True:
            chunk = response.raw.read(1024)
            if not chunk:
                break
            f.write(decompressor.decompress(chunk))
        # Write out anything the decompressor is still holding back.
        f.write(decompressor.flush())
def download_datasets(selected_datasets, selected_downloads, decompress=False):
    """Download the chosen files for each chosen dataset.

    A directory named after each dataset is created if it does not exist, and
    every requested file the server answers with status 200 is saved into it.
    After each dataset has been processed a "<dataset> downloaded." line is
    printed.

    Args:
        selected_datasets: Iterable of ``(dataset, path)`` pairs. ``dataset``
            is used as the local directory name, ``path`` as the dataset's
            component of the download URL.
        selected_downloads: Sequence of file names to request for every
            dataset (it is iterated once per dataset).
        decompress: When true, files whose name ends with ``txt.gz`` are
            decompressed while downloading and saved without the ``.gz``
            suffix; all other files are saved unchanged.
    """
    for dataset, path in selected_datasets:
        if not os.path.exists(dataset):
            # makedirs also creates missing parent directories.
            os.makedirs(dataset)

        for downloadable in selected_downloads:
            url = 'http://amp.pharm.mssm.edu/static/hdfs/harmonizome/data/%s/%s' %\
                (path, downloadable)
            response = requests.get(url, stream=True)
            try:
                # Not every dataset has all downloadables.
                if response.status_code != 200:
                    continue

                filename = '%s/%s' % (dataset, downloadable)
                # Test the file name itself (not the joined path) and only
                # its suffix, because the decompressing helper strips the
                # last three characters to build the output name.
                if decompress and downloadable.endswith('txt.gz'):
                    _download_and_decompress_file(response, filename)
                else:
                    _download_file(response, filename)
            finally:
                # A streamed response keeps its connection open until it is
                # closed, including on the skipped (non-200) path.
                response.close()

        print('%s downloaded.' % dataset)
if __name__ == '__main__':
    # Script entry point: call download_datasets() with the datasets and
    # files selected below. `decompress` is not passed, so it keeps its
    # default (False) and every file is saved exactly as served.
    #
    # First list: (dataset name, URL path) pairs. The name is used as the
    # local directory and the path is substituted into the download URL.
    # While every entry stays commented out the list is empty and nothing is
    # downloaded.
    # Uncomment a dataset to download it.
    download_datasets([
        # ('Achilles Cell Line Gene Essentiality Profiles', 'achilles'),
        # ('Allen Brain Atlas Adult Human Brain Tissue Gene Expression Profiles', 'brainatlasadulthuman'),
        # ('Allen Brain Atlas Adult Mouse Brain Tissue Gene Expression Profiles', 'brainatlasadultmouse'),
        # ('Allen Brain Atlas Developing Human Brain Tissue Gene Expression Profiles by Microarray', 'brainatlasdevelopmentalhumanmicroarray'),
        # ('Allen Brain Atlas Developing Human Brain Tissue Gene Expression Profiles by RNA-seq', 'brainatlasdevelopmentalhumanrnaseq'),
        # ('Allen Brain Atlas Prenatal Human Brain Tissue Gene Expression Profiles', 'brainatlasprenatalhuman'),
        # ('BIND Biomolecular Interactions', 'bind'),
        # ('BioGPS Cell Line Gene Expression Profiles', 'biogpsnci60'),
        # ('BioGPS Human Cell Type and Tissue Gene Expression Profiles', 'biogpshuman'),
        # ('BioGPS Mouse Cell Type and Tissue Gene Expression Profiles', 'biogpsmouse'),
        # ('BioGRID Protein-Protein Interactions', 'biogrid'),
        # ('Biocarta Pathways', 'biocarta'),
        # ('CCLE Cell Line Gene CNV Profiles', 'cclecnv'),
        # ('CCLE Cell Line Gene Expression Profiles', 'cclemrna'),
        # ('CCLE Cell Line Gene Mutation Profiles', 'cclemut'),
        # ('CHEA Transcription Factor Binding Site Profiles', 'chea'),
        # ('CHEA Transcription Factor Targets', 'cheappi'),
        # ('CMAP Signatures of Differentially Expressed Genes for Small Molecules', 'cmap'),
        # ('COMPARTMENTS Curated Protein Localization Evidence Scores', 'jensencompartmentcurated'),
        # ('COMPARTMENTS Experimental Protein Localization Evidence Scores', 'jensencompartmentexpts'),
        # ('COMPARTMENTS Text-mining Protein Localization Evidence Scores', 'jensencompartmenttextmining'),
        # ('CORUM Protein Complexes', 'corum'),
        # ('COSMIC Cell Line Gene CNV Profiles', 'cosmiccnv'),
        # ('COSMIC Cell Line Gene Mutation Profiles', 'cosmicmut'),
        # ('CTD Gene-Chemical Interactions', 'ctdchemical'),
        # ('CTD Gene-Disease Associations', 'ctddisease'),
        # ('ClinVar SNP-Phenotype Associations', 'clinvar'),
        # ('Combined Pathways Pathways', 'combinedpathways'),
        # ('dbGAP Gene-Trait Associations', 'dbgap'),
        # ('DEPOD Substrates of Phosphatases', 'depod'),
        # ('DIP Protein-Protein Interactions', 'dip'),
        # ('DISEASES Curated Gene-Disease Assocation Evidence Scores', 'jensendiseasecurated'),
        # ('DISEASES Experimental Gene-Disease Assocation Evidence Scores', 'jensendiseaseexpts'),
        # ('DISEASES Text-mining Gene-Disease Assocation Evidence Scores', 'jensendiseasetextmining'),
        # ('DrugBank Drug Targets', 'drugbank'),
        # ('ENCODE Histone Modification Site Profiles', 'encodehm'),
        # ('ENCODE Transcription Factor Binding Site Profiles', 'encodetf'),
        # ('ENCODE Transcription Factor Targets', 'encodetfppi'),
        # ('ESCAPE Omics Signatures of Genes and Proteins for Stem Cells', 'escape'),
        # ('GAD Gene-Disease Associations', 'gad'),
        # ('GAD High Level Gene-Disease Associations', 'gadhighlevel'),
        # ('GDSC Cell Line Gene Expression Profiles', 'gdsc'),
        # ('GEO Signatures of Differentially Expressed Genes for Diseases', 'geodisease'),
        # ('GEO Signatures of Differentially Expressed Genes for Gene Perturbations', 'geogene'),
        # ('GEO Signatures of Differentially Expressed Genes for Kinase Perturbations', 'geokinase'),
        # ('GEO Signatures of Differentially Expressed Genes for Small Molecules', 'geochemical'),
        # ('GEO Signatures of Differentially Expressed Genes for Transcription Factor Perturbations', 'geotf'),
        # ('GEO Signatures of Differentially Expressed Genes for Viral Infections', 'geovirus'),
        # ('GO Biological Process Annotations', 'gobp'),
        # ('GO Cellular Component Annotations', 'gocc'),
        # ('GO Molecular Function Annotations', 'gomf'),
        # ('GTEx Tissue Gene Expression Profiles', 'gtextissue'),
        # ('GTEx Tissue Sample Gene Expression Profiles', 'gtexsample'),
        # ('GTEx eQTL', 'gtexeqtl'),
        # ('GWAS Catalog SNP-Phenotype Associations', 'gwascatalog'),
        # ('GWASdb SNP-Disease Associations', 'gwasdbdisease'),
        # ('GWASdb SNP-Phenotype Associations', 'gwasdbphenotype'),
        # ('GeneRIF Biological Term Annotations', 'generif'),
        # ('GeneSigDB Published Gene Signatures', 'genesigdb'),
        # ('Graph of Medicine EHR Text-mining Clinical Term Annotations', 'graphofmedicine'),
        # ('Guide to Pharmacology Chemical Ligands of Receptors', 'guidetopharmchemical'),
        # ('Guide to Pharmacology Protein Ligands of Receptors', 'guidetopharmprotein'),
        # ('HMDB Metabolites of Enzymes', 'hmdb'),
        # ('HPA Cell Line Gene Expression Profiles', 'hpacelllines'),
        # ('HPA Tissue Gene Expression Profiles', 'hpatissuesmrna'),
        # ('HPA Tissue Protein Expression Profiles', 'hpatissuesprotein'),
        # ('HPA Tissue Sample Gene Expression Profiles', 'hpasamples'),
        # ('HPM Cell Type and Tissue Protein Expression Profiles', 'hpm'),
        # ('HPO Gene-Disease Associations', 'hpo'),
        # ('HPRD Protein-Protein Interactions', 'hprd'),
        # ('Heiser et al., PNAS, 2011 Cell Line Gene Expression Profiles', 'heiser'),
        # ('HuGE Navigator Gene-Phenotype Associations', 'hugenavigator'),
        # ('Hub Proteins Protein-Protein Interactions', 'hubs'),
        # ('HumanCyc Biomolecular Interactions', 'humancycppi'),
        # ('HumanCyc Pathways', 'humancyc'),
        # ('IntAct Biomolecular Interactions', 'intact'),
        # ('InterPro Predicted Protein Domain Annotations', 'interpro'),
        # ('JASPAR Predicted Transcription Factor Targets', 'jasparpwm'),
        # ('KEA Substrates of Kinases', 'kea'),
        # ('KEGG Biomolecular Interactions', 'keggppi'),
        # ('KEGG Pathways', 'kegg'),
        # ('Kinativ Kinase Inhibitor Bioactivity Profiles', 'kinativ'),
        # ('KinomeScan Kinase Inhibitor Targets', 'kinomescan'),
        # ('Klijn et al., Nat. Biotechnol., 2015 Cell Line Gene CNV Profiles', 'klijncnv'),
        # ('Klijn et al., Nat. Biotechnol., 2015 Cell Line Gene Expression Profiles', 'klijnmrna'),
        # ('Klijn et al., Nat. Biotechnol., 2015 Cell Line Gene Mutation Profiles', 'klijnmut'),
        # ('LINCS L1000 CMAP Signatures of Differentially Expressed Genes for Gene Knockdowns', 'lincscmapgene'),
        # ('LINCS L1000 CMAP Signatures of Differentially Expressed Genes for Small Molecules', 'lincscmapchemical'),
        # ('LOCATE Curated Protein Localization Annotations', 'locate'),
        # ('LOCATE Predicted Protein Localization Annotations', 'locatepredicted'),
        # ('MPO Gene-Phenotype Associations', 'mgimpo'),
        # ('MSigDB Cancer Gene Co-expression Modules', 'msigdbcomp'),
        # ('MSigDB Signatures of Differentially Expressed Genes for Cancer Gene Perturbations', 'msigdbonc'),
        # ('MiRTarBase microRNA Targets', 'mirtarbase'),
        # ('MotifMap Predicted Transcription Factor Targets', 'motifmap'),
        # ('NURSA Protein Complexes', 'nursa'),
        # ('NURSA Protein-Protein Interactions', 'nursappi'),
        # ('OMIM Gene-Disease Associations', 'omim'),
        # ('PANTHER Biomolecular Interactions', 'pantherppi'),
        # ('PANTHER Pathways', 'panther'),
        # ('PID Biomolecular Interactions', 'pidppi'),
        # ('PID Pathways', 'pid'),
        # ('Pathway Commons Protein-Protein Interactions', 'pc'),
        # ('PhosphoSitePlus Phosphosite-Disease Associations', 'phosphositeplusdisease'),
        # ('PhosphoSitePlus Substrates of Kinases', 'phosphositeplus'),
        # ('Phosphosite Textmining Biological Term Annotations', 'phosphositetextmining'),
        # ('ProteomicsDB Cell Type and Tissue Protein Expression Profiles', 'proteomicsdb'),
        # ('Reactome Biomolecular Interactions', 'reactomeppi'),
        # ('Reactome Pathways', 'reactome'),
        # ('Recon X Predicted Biomolecular Interactions', 'reconx'),
        # ('Roadmap Epigenomics Cell and Tissue DNA Accessibility Profiles', 'epigenomicsdnaaccessibility'),
        # ('Roadmap Epigenomics Cell and Tissue DNA Methylation Profiles', 'epigenomicsdnamethylation'),
        # ('Roadmap Epigenomics Cell and Tissue Gene Expression Profiles', 'epigenomicsmrna'),
        # ('Roadmap Epigenomics Histone Modification Site Profiles', 'epigenomicshm'),
        # ('SILAC Phosphoproteomics Signatures of Differentially Phosphorylated Proteins for Drugs', 'silacdrug'),
        # ('SILAC Phosphoproteomics Signatures of Differentially Phosphorylated Proteins for Gene Perturbations', 'silacgene'),
        # ('SILAC Phosphoproteomics Signatures of Differentially Phosphorylated Proteins for Protein Ligands', 'silacligand'),
        # ('SNPedia SNP-Phenotype Associations', 'snpedia'),
        # ('TCGA Signatures of Differentially Expressed Genes for Tumors', 'tcga'),
        # ('TISSUES Curated Tissue Protein Expression Evidence Scores', 'jensentissuecurated'),
        # ('TISSUES Experimental Tissue Protein Expression Evidence Scores', 'jensentissueexpts'),
        # ('TISSUES Text-mining Tissue Protein Expression Evidence Scores', 'jensentissuetextmining'),
        # ('TRANSFAC Curated Transcription Factor Targets', 'transfac'),
        # ('TRANSFAC Predicted Transcription Factor Targets', 'transfacpwm'),
        # ('TargetScan Predicted Conserved microRNA Targets', 'targetscan'),
        # ('TargetScan Predicted Nonconserved microRNA Targets', 'targetscannonconserved'),
        # ('Virus MINT Protein-Viral Protein Interactions', 'virusmintppi'),
        # ('Virus MINT Protein-Virus Interactions', 'virusmint'),
        # ('Wikipathways Pathways', 'wikipathways'),
    ], [
        # Second list: file names requested for every selected dataset.
        # Commented-out names are not requested.
        'gene_attribute_matrix.txt.gz',
        # 'gene_attribute_edges.txt.gz',
        # 'gene_set_library_crisp.txt.gz',
        # 'gene_set_library_up_crisp.txt.gz',
        # 'gene_set_library_dn_crisp.txt.gz',
        # 'attribute_set_library_crisp.txt.gz',
        # 'attribute_set_library_up_crisp.txt.gz',
        # 'attribute_set_library_dn_crisp.txt.gz',
        # 'gene_similarity_matrix_cosine.txt.gz',
        # 'attribute_similarity_matrix_cosine.txt.gz',
        # 'gene_list_terms.txt.gz',
        # 'attribute_list_entries.txt.gz',
        'processing_script.m'
    ])
import numpy as np
import pandas as pd
# Load two matrix files from the current working directory.
# NOTE(review): nothing visible in this file writes either file name used
# below (download_datasets() saves into a per-dataset directory) -- presumably
# they are prepared by a separate step; confirm.
matrix = np.loadtxt('gene_attribute_matrix_cleaned.txt')
# read_csv is called with its defaults (comma separator, first row as header).
# NOTE(review): if this file is tab-delimited, sep='\t' is needed -- confirm.
data = pd.read_csv("gene_attribute_matrix.txt")
# Bare expression: the selected value is discarded when this runs as a script
# (it would only be displayed in an interactive session).
# NOTE(review): data[2] looks up a column *labelled* 2; with header-derived
# labels this likely raises KeyError -- data.iloc[:, 2] may be what was
# intended; confirm.
data[2]