From cfa238d936f16da4139cc6a1506c464798e70489 Mon Sep 17 00:00:00 2001 From: Pedro Assis Date: Wed, 21 Aug 2024 15:54:32 -1000 Subject: [PATCH 01/20] rebase after reverts --- data/active_adapters.py | 230 +++++++++++++++--- data/adapters/AFGR_caqtl_adapter.py | 36 +-- data/adapters/AFGR_eqtl_adapter.py | 40 +-- data/adapters/AFGR_sqtl_adapter.py | 38 +-- data/adapters/__init__.py | 13 +- data/adapters/adastra_asb_adapter.py | 39 +-- data/adapters/biogrid_gene_gene_adapter.py | 40 +-- data/adapters/ccre_adapter.py | 35 +-- data/adapters/cellosaurus_ontology_adapter.py | 38 +-- .../clingen_variant_disease_adapter.py | 44 +--- data/adapters/coxpresdb_adapter.py | 38 +-- data/adapters/dbSNFP_adapter.py | 60 +---- data/adapters/depmap_adapter.py | 36 +-- data/adapters/ebi_complex_adapter.py | 44 +--- data/adapters/encode_E2G_CRISPR_adapter.py | 43 +--- data/adapters/encode_caqtl_adapter.py | 42 +--- data/adapters/encode_element_gene_adapter.py | 72 +++--- data/adapters/encode_mpra_adapter.py | 43 +--- data/adapters/favor_adapter.py | 57 +---- data/adapters/gaf_adapter.py | 41 +--- data/adapters/gencode_adapter.py | 48 ++-- data/adapters/gencode_gene_adapter.py | 40 +-- .../gencode_gene_structure_adapter.py | 45 +--- data/adapters/gtex_eqtl_adapter.py | 52 ++-- data/adapters/gtex_sqtl_adapter.py | 42 ++-- data/adapters/gvatdb_asb_adapter.py | 39 +-- data/adapters/gwas_adapter.py | 44 +--- data/adapters/human_mouse_element_adapter.py | 46 ++-- .../mgi_human_mouse_ortholog_adapter.py | 36 +-- data/adapters/motif_adapter.py | 38 +-- .../adapters/mouse_genomes_project_adapter.py | 64 ++--- data/adapters/oncotree_adapter.py | 45 ++-- data/adapters/ontologies_adapter.py | 57 ++--- data/adapters/orphanet_disease_adapter.py | 36 +-- data/adapters/pQTL_adapter.py | 38 +-- data/adapters/pharmgkb_drug_adapter.py | 42 +--- data/adapters/proteins_interaction_adapter.py | 39 +-- data/adapters/reactome_adapter.py | 42 +--- data/adapters/reactome_pathway_adapter.py | 42 +--- data/adapters/topld_adapter.py | 64 +---- data/adapters/uniprot_adapter.py | 41 +--- data/adapters/uniprot_protein_adapter.py | 44 +--- data/adapters/writer.py | 49 +++- data/data_loader.py | 22 -- data/data_parser.py | 77 ++++++ data/pytest.ini | 2 + data/tests/test_AFGR_caqtl_adapter.py | 27 ++ data/tests/test_AFGR_eqtl_adapter.py | 26 ++ data/tests/test_AFGR_sqtl.py | 26 ++ data/tests/test_adapter.py | 166 ++++++------- data/tests/test_biogrid_gene_gene.py | 32 +++ data/tests/test_ccre_adapter.py | 28 +++ .../test_cellosaurus_ontology_adapter.py | 70 ++++++ .../test_clingen_variant_disease_adapter.py | 63 +++++ data/tests/test_coexpresdb_adapter.py | 44 ++++ data/tests/test_dbSNFP_adapter.py | 87 +++++++ data/tests/test_depmap_adapter.py | 81 ++++++ data/tests/test_ebi_complex_adapter.py | 88 +++++++ data/tests/test_encode_E2G_CRISPR_adapter.py | 83 +++++++ data/tests/test_encode_caqtl_adapter.py | 58 +++++ data/tests/test_encode_mpra_adapter.py | 80 ++++++ data/tests/test_gaf_adapter.py | 76 ++++++ data/tests/test_gencode_adapter.py | 105 ++++++++ data/tests/test_gencode_gene_adapter.py | 53 ++++ .../test_gencode_gene_structure_adapter.py | 112 +++++++++ data/tests/test_gtex_eqtl_adapter.py | 75 ++++++ data/tests/test_gtex_sqtl_adapter.py | 78 ++++++ data/tests/test_gvatdb_asb_adapter.py | 45 ++++ data/tests/test_gwas_adapter.py | 77 ++++++ .../tests/test_human_mouse_element_adapter.py | 89 +++++++ .../test_mgi_human_mouse_ortholog_adapter.py | 44 ++++ data/tests/test_motif_adapter.py | 65 +++++ 
data/tests/test_orphanet_disease_adapter.py | 38 +++ data/tests/test_pQTL_adapter.py | 17 ++ data/tests/test_pharmagkb_drug_adapter.py | 76 ++++++ .../test_proteins_interaction_adapter.py | 65 +++++ data/tests/test_reactome_adapter.py | 45 ++++ data/tests/test_topld_adapter.py | 79 ++++++ data/tests/test_uniprot_adapter.py | 101 ++++++++ data/tests/test_uniprot_protein_adapter.py | 134 ++++++++++ data/tests/test_writer.py | 9 +- 81 files changed, 3078 insertions(+), 1427 deletions(-) delete mode 100644 data/data_loader.py create mode 100644 data/data_parser.py create mode 100644 data/pytest.ini create mode 100644 data/tests/test_AFGR_caqtl_adapter.py create mode 100644 data/tests/test_AFGR_eqtl_adapter.py create mode 100644 data/tests/test_AFGR_sqtl.py create mode 100644 data/tests/test_biogrid_gene_gene.py create mode 100644 data/tests/test_ccre_adapter.py create mode 100644 data/tests/test_cellosaurus_ontology_adapter.py create mode 100644 data/tests/test_clingen_variant_disease_adapter.py create mode 100644 data/tests/test_coexpresdb_adapter.py create mode 100644 data/tests/test_dbSNFP_adapter.py create mode 100644 data/tests/test_depmap_adapter.py create mode 100644 data/tests/test_ebi_complex_adapter.py create mode 100644 data/tests/test_encode_E2G_CRISPR_adapter.py create mode 100644 data/tests/test_encode_caqtl_adapter.py create mode 100644 data/tests/test_encode_mpra_adapter.py create mode 100644 data/tests/test_gaf_adapter.py create mode 100644 data/tests/test_gencode_adapter.py create mode 100644 data/tests/test_gencode_gene_adapter.py create mode 100644 data/tests/test_gencode_gene_structure_adapter.py create mode 100644 data/tests/test_gtex_eqtl_adapter.py create mode 100644 data/tests/test_gtex_sqtl_adapter.py create mode 100644 data/tests/test_gvatdb_asb_adapter.py create mode 100644 data/tests/test_gwas_adapter.py create mode 100644 data/tests/test_human_mouse_element_adapter.py create mode 100644 data/tests/test_mgi_human_mouse_ortholog_adapter.py create mode 100644 data/tests/test_motif_adapter.py create mode 100644 data/tests/test_orphanet_disease_adapter.py create mode 100644 data/tests/test_pQTL_adapter.py create mode 100644 data/tests/test_pharmagkb_drug_adapter.py create mode 100644 data/tests/test_proteins_interaction_adapter.py create mode 100644 data/tests/test_reactome_adapter.py create mode 100644 data/tests/test_topld_adapter.py create mode 100644 data/tests/test_uniprot_adapter.py create mode 100644 data/tests/test_uniprot_protein_adapter.py diff --git a/data/active_adapters.py b/data/active_adapters.py index 4c0da0ff..eec4199a 100644 --- a/data/active_adapters.py +++ b/data/active_adapters.py @@ -33,7 +33,7 @@ from adapters.AFGR_eqtl_adapter import AFGREQtl from adapters.AFGR_sqtl_adapter import AFGRSQtl from adapters.AFGR_caqtl_adapter import AFGRCAQtl -from adapters.dbSNFP_adapter import DbSNFPAdapter +from adapters.dbSNFP_adapter import DbSNFP from adapters.pQTL_adapter import pQTL from adapters.biogrid_gene_gene_adapter import GeneGeneBiogrid from adapters.encode_E2G_CRISPR_adapter import ENCODE2GCRISPR @@ -55,7 +55,7 @@ 'eqtl_term': GtexEQtl(filepath='./samples/GTEx_eQTL', label='GTEx_eqtl_term'), 'AFGR_eqtl': AFGREQtl(filepath='./samples/AFGR/sorted.dist.hwe.af.AFR_META.eQTL.example.txt.gz', label='AFGR_eqtl'), 'AFGR_eqtl_term': AFGREQtl(filepath='./samples/AFGR/sorted.dist.hwe.af.AFR_META.eQTL.example.txt.gz', label='AFGR_eqtl_term'), - 'topld': TopLD('chr22', './samples/topld_sample.csv', './samples/topld_info_annotation.csv', ancestry='SAS'), + 
'topld': TopLD(filepath='./samples/topld_sample.csv', annotation_filepath='./samples/topld_info_annotation.csv', chr='chr22', ancestry='SAS'), 'caqtl_ocr': CAQtl(filepath='./samples/caqtl-sample.bed', source='PMID:34017130', label='regulatory_region'), 'caqtl': CAQtl(filepath='./samples/caqtl-sample.bed', source='PMID:34017130', label='encode_caqtl'), 'AFGR_caqtl_ocr': AFGRCAQtl(filepath='./samples/AFGR/sorted.dist.hwe.af.AFR.caQTL.example.txt.gz', label='regulatory_region'), @@ -73,18 +73,18 @@ 'gtex_splice_qtl_term': GtexSQtl('./samples/GTEx_sQTL', label='GTEx_splice_QTL_term'), 'AFGR_sqtl': AFGRSQtl(filepath='./samples/AFGR/sorted.all.AFR.Meta.sQTL.example.txt.gz', label='AFGR_sqtl'), 'AFGR_sqtl_term': AFGRSQtl(filepath='./samples/AFGR/sorted.all.AFR.Meta.sQTL.example.txt.gz', label='AFGR_sqtl_term'), - 'encode_regulatory_region': EncodeElementGeneLink('./samples/epiraction_ENCFF712SUP.bed.gz', 'regulatory_region', 'ENCODE_EpiRaction', 'https://www.encodeproject.org/files/ENCFF712SUP/', 'CL_0000765'), - 'encode_regulatory_region_gene': EncodeElementGeneLink('./samples/epiraction_ENCFF712SUP.bed.gz', 'regulatory_region_gene', 'ENCODE_EpiRaction', 'https://www.encodeproject.org/files/ENCFF712SUP/', 'CL_0000765'), - 'encode_regulatory_region_gene_biosample': EncodeElementGeneLink('./samples/E2G_ENCFF617FJH.bed.gz', 'regulatory_region_gene_biosample', 'ENCODE-E2G-DNaseOnly', 'https://www.encodeproject.org/files/ENCFF617FJH/', 'EFO_0001203'), - 'encode_regulatory_region_gene_treatment_CHEBI': EncodeElementGeneLink('./samples/E2G_ENCFF617FJH.bed.gz', 'regulatory_region_gene_biosample_treatment_CHEBI', 'ENCODE-E2G-DNaseOnly', 'https://www.encodeproject.org/files/ENCFF617FJH/', 'EFO_0001203'), - 'encode_regulatory_region_gene_treatment_protein': EncodeElementGeneLink('./samples/E2G_ENCFF728HSS.bed.gz', 'regulatory_region_gene_biosample_treatment_protein', 'ENCODE-E2G-DNaseOnly', 'https://www.encodeproject.org/files/ENCFF728HSS/', 'NTR_0000502'), - 'encode_donor': EncodeElementGeneLink('./samples/E2G_ENCFF617FJH.bed.gz', 'donor', 'ENCODE-E2G-DNaseOnly', 'https://www.encodeproject.org/files/ENCFF617FJH/', 'EFO_0001203'), - 'encode_biosample': EncodeElementGeneLink('./samples/E2G_ENCFF728HSS.bed.gz', 'ontology_term', 'ENCODE-E2G-DNaseOnly', 'https://www.encodeproject.org/files/ENCFF728HSS/', 'NTR_0000502'), - 'encode_regulatory_region_gene_donor': EncodeElementGeneLink('./samples/E2G_ENCFF617FJH.bed.gz', 'regulatory_region_gene_biosample_donor', 'ENCODE-E2G-DNaseOnly', 'https://www.encodeproject.org/files/ENCFF617FJH/', 'EFO_0001203'), - 'encode_mpra_regulatory_region': EncodeMPRA('./samples/MPRA_ENCFF802FUV_example.bed.gz', 'regulatory_region', 'https://www.encodeproject.org/files/ENCFF802FUV/', 'EFO_0002067'), - 'encode_mpra_regulatory_region_biosample': EncodeMPRA('./samples/MPRA_ENCFF802FUV_example.bed.gz', 'regulatory_region_biosample', 'https://www.encodeproject.org/files/ENCFF802FUV/', 'EFO_0002067'), - 'encode_regulatory_region_crispr': ENCODE2GCRISPR('./samples/ENCODE_E2G_CRISPR_example.tsv', 'regulatory_region'), - 'encode_regulatory_region_gene_crispr': ENCODE2GCRISPR('./samples/ENCODE_E2G_CRISPR_example.tsv', 'regulatory_region_gene'), + 'encode_regulatory_region': EncodeElementGeneLink(filepath='./samples/epiraction_ENCFF712SUP.bed.gz', label='regulatory_region', source='ENCODE_EpiRaction', source_url='https://www.encodeproject.org/files/ENCFF712SUP/', biological_context='CL_0000765'), + 'encode_regulatory_region_gene': 
EncodeElementGeneLink(filepath='./samples/epiraction_ENCFF712SUP.bed.gz', label='regulatory_region_gene', source='ENCODE_EpiRaction', source_url='https://www.encodeproject.org/files/ENCFF712SUP/', biological_context='CL_0000765'), + 'encode_regulatory_region_gene_biosample': EncodeElementGeneLink(filepath='./samples/E2G_ENCFF617FJH.bed.gz', label='regulatory_region_gene_biosample', source='ENCODE-E2G-DNaseOnly', source_url='https://www.encodeproject.org/files/ENCFF617FJH/', biological_context='EFO_0001203'), + 'encode_regulatory_region_gene_treatment_CHEBI': EncodeElementGeneLink(filepath='./samples/E2G_ENCFF617FJH.bed.gz', label='regulatory_region_gene_biosample_treatment_CHEBI', source='ENCODE-E2G-DNaseOnly', source_url='https://www.encodeproject.org/files/ENCFF617FJH/', biological_context='EFO_0001203'), + 'encode_regulatory_region_gene_treatment_protein': EncodeElementGeneLink(filepath='./samples/E2G_ENCFF728HSS.bed.gz', label='regulatory_region_gene_biosample_treatment_protein', source='ENCODE-E2G-DNaseOnly', source_url='https://www.encodeproject.org/files/ENCFF728HSS/', biological_context='NTR_0000502'), + 'encode_donor': EncodeElementGeneLink(filepath='./samples/E2G_ENCFF617FJH.bed.gz', label='donor', source='ENCODE-E2G-DNaseOnly', source_url='https://www.encodeproject.org/files/ENCFF617FJH/', biological_context='EFO_0001203'), + 'encode_biosample': EncodeElementGeneLink(filepath='./samples/E2G_ENCFF728HSS.bed.gz', label='ontology_term', source='ENCODE-E2G-DNaseOnly', source_url='https://www.encodeproject.org/files/ENCFF728HSS/', biological_context='NTR_0000502'), + 'encode_regulatory_region_gene_donor': EncodeElementGeneLink(filepath='./samples/E2G_ENCFF617FJH.bed.gz', label='regulatory_region_gene_biosample_donor', source='ENCODE-E2G-DNaseOnly', source_url='https://www.encodeproject.org/files/ENCFF617FJH/', biological_context='EFO_0001203'), + 'encode_mpra_regulatory_region': EncodeMPRA(filepath='./samples/MPRA_ENCFF802FUV_example.bed.gz', label='regulatory_region', source_url='https://www.encodeproject.org/files/ENCFF802FUV/', biological_context='EFO_0002067'), + 'encode_mpra_regulatory_region_biosample': EncodeMPRA(filepath='./samples/MPRA_ENCFF802FUV_example.bed.gz', label='regulatory_region_biosample', source_url='https://www.encodeproject.org/files/ENCFF802FUV/', biological_context='EFO_0002067'), + 'encode_regulatory_region_crispr': ENCODE2GCRISPR(filepath='./samples/ENCODE_E2G_CRISPR_example.tsv', label='regulatory_region'), + 'encode_regulatory_region_gene_crispr': ENCODE2GCRISPR(filepath='./samples/ENCODE_E2G_CRISPR_example.tsv', label='regulatory_region_gene'), 'gaf': GAF(filepath='./samples/goa_human_sample.gaf.gz'), 'gaf_mouse': GAF(filepath='./samples/mgi_sample.gaf.gz', gaf_type='mouse'), 'gaf_isoform': GAF(filepath='./samples/goa_human_isoform.gaf.gz', gaf_type='human_isoform'), @@ -94,30 +94,30 @@ 'gwas_var_phenotypes_studies': GWAS(variants_to_ontology='./samples/gwas_v2d_igvf_sample.tsv', variants_to_genes='./samples/gwas_v2g_igvf_sample.tsv', gwas_collection='variants_phenotypes_studies'), 'motif': Motif(filepath='./samples/motifs', label='motif'), 'motif to protein': Motif(filepath='./samples/motifs', label='motif_protein_link'), - 'coxpresdb': Coxpresdb('./samples/coxpresdb/1'), - 'pathway': ReactomePathway('./samples/reactome/ReactomePathways_20240603.txt'), - 'genes_pathways': Reactome('./samples/reactome/Ensembl2Reactome_All_Levels_sample.txt', 'genes_pathways'), - 'parent_pathway_of': Reactome('./samples/reactome/ReactomePathwaysRelation_20240603.txt', 
'parent_pathway_of'), - 'cellosaurus_terms': Cellosaurus('./samples/cellosaurus_example.obo.txt', type='node'), - 'cellosaurus_relationships': Cellosaurus('./samples/cellosaurus_example.obo.txt', type='edge'), - 'drug': PharmGKB('./samples/pharmGKB', label='drug'), - 'variant_drug': PharmGKB('./samples/pharmGKB', label='variant_drug'), - 'variant_drug_gene': PharmGKB('./samples/pharmGKB', label='variant_drug_gene'), - 'disease_gene': Disease('./samples/orphanet_example.xml'), + 'coxpresdb': Coxpresdb(filepath='./samples/coxpresdb/1'), + 'pathway': ReactomePathway(filepath='./samples/reactome/ReactomePathways.txt'), + 'genes_pathways': Reactome(filepath='./samples/reactome/Ensembl2Reactome_All_Levels_sample.txt', label='genes_pathways'), + 'parent_pathway_of': Reactome(filepath='./samples/reactome/ReactomePathwaysRelation.txt', label='parent_pathway_of'), + 'cellosaurus_terms': Cellosaurus(filepath='./samples/cellosaurus_example.obo.txt', type='node'), + 'cellosaurus_relationships': Cellosaurus(filepath='./samples/cellosaurus_example.obo.txt', type='edge'), + 'drug': PharmGKB(filepath='./samples/pharmGKB', label='drug'), + 'variant_drug': PharmGKB(filepath='./samples/pharmGKB', label='variant_drug'), + 'variant_drug_gene': PharmGKB(filepath='./samples/pharmGKB', label='variant_drug_gene'), + 'disease_gene': Disease(filepath='./samples/orphanet_example.xml'), 'oncotree_terms': Oncotree(type='node'), 'oncotree_relationships': Oncotree(type='edge'), - 'gene_term': DepMap('./samples/DepMap/CRISPRGeneDependency_transposed_example.csv', type='edge', label='gene_term'), - 'complex': EBIComplex('./samples/EBI_complex_example.tsv', label='complex'), - 'complex_protein': EBIComplex('./samples/EBI_complex_example.tsv', label='complex_protein'), - 'complex_term': EBIComplex('./samples/EBI_complex_example.tsv', label='complex_term'), - 'protein_protein': ProteinsInteraction('./samples/merged_PPI.UniProt.example.csv', label='protein_protein'), - 'gene_gene_biogrid': GeneGeneBiogrid('./samples/merged_PPI.UniProt.example.csv', label='gene_gene_biogrid'), - 'mouse_gene_gene_biogrid': GeneGeneBiogrid('./samples/merged_PPI_mouse.UniProt.example.csv', label='mouse_gene_gene_biogrid'), - 'regulatory_region_mm_regulatory_region': HumanMouseElementAdapter('./samples/element_mapping_example.txt.gz', label='regulatory_region_mm_regulatory_region'), - 'mm_orthologs': MGIHumanMouseOrthologAdapter('./samples/HOM_MouseHumanSequence_sample.rpt'), - 'coding_variants': DbSNFPAdapter('./samples/dbNSFP4.5a_variant.chrY_sample'), - 'variants_coding_variants': DbSNFPAdapter('./samples/dbNSFP4.5a_variant.chrY_sample', collection='variants_coding_variants'), - 'coding_variants_proteins': DbSNFPAdapter('./samples/dbNSFP4.5a_variant.chrY_sample', collection='coding_variants_proteins'), + 'gene_term': DepMap(filepath='./samples/DepMap/CRISPRGeneDependency_transposed_example.csv', type='edge', label='gene_term'), + 'complex': EBIComplex(filepath='./samples/EBI_complex_example.tsv', label='complex'), + 'complex_protein': EBIComplex(filepath='./samples/EBI_complex_example.tsv', label='complex_protein'), + 'complex_term': EBIComplex(filepath='./samples/EBI_complex_example.tsv', label='complex_term'), + 'protein_protein': ProteinsInteraction(filepath='./samples/merged_PPI.UniProt.example.csv', label='protein_protein'), + 'gene_gene_biogrid': GeneGeneBiogrid(filepath='./samples/merged_PPI.UniProt.example.csv', label='gene_gene_biogrid'), + 'mouse_gene_gene_biogrid': 
GeneGeneBiogrid(filepath='./samples/merged_PPI_mouse.UniProt.example.csv', label='mouse_gene_gene_biogrid'), + 'regulatory_region_mm_regulatory_region': HumanMouseElementAdapter(filepath='./samples/element_mapping_example.txt.gz', label='regulatory_region_mm_regulatory_region'), + 'mm_orthologs': MGIHumanMouseOrthologAdapter(filepath='./samples/HOM_MouseHumanSequence_sample.rpt'), + 'coding_variants': DbSNFP(filepath='./samples/dbNSFP4.5a_variant.chrY_sample'), + 'variants_coding_variants': DbSNFP(filepath='./samples/dbNSFP4.5a_variant.chrY_sample', collection='variants_coding_variants'), + 'coding_variants_proteins': DbSNFP(filepath='./samples/dbNSFP4.5a_variant.chrY_sample', collection='coding_variants_proteins'), 'mouse_variant': MouseGenomesProjectAdapter(filepath='./samples/mouse_variants/mouse_variant_snps_rsid_sample.vcf'), 'variant_disease': ClinGen('./samples/clinGen_variant_pathogenicity_example.csv', label='variant_disease'), 'variant_disease_gene': ClinGen('./samples/clinGen_variant_pathogenicity_example.csv', label='variant_disease_gene'), @@ -127,6 +127,85 @@ 'SEM_variant_protein': SEMPred('./samples/SEM/', label='sem_predicted_asb') } +
+LABEL_TO_ADAPTER = { + 'gencode_genes': GencodeGene, + 'gencode_transcripts': Gencode, + 'transcribed_to': Gencode, + 'transcribed_from': Gencode, + 'eqtl': GtexEQtl, + 'eqtl_term': GtexEQtl, + 'AFGR_eqtl': AFGREQtl, + 'AFGR_eqtl_term': AFGREQtl, + 'topld': TopLD, + 'caqtl_ocr': CAQtl, + 'caqtl': CAQtl, + 'AFGR_caqtl_ocr': AFGRCAQtl, + 'AFGR_caqtl': AFGRCAQtl, + 'ccre': CCRE, + 'UniProtKB_sprot': UniprotProtein, + 'UniProtKB_trembl': UniprotProtein, + 'UniProtKB_Translates_To': Uniprot, + 'UniProtKB_Translation_Of': Uniprot, + 'favor': Favor, + 'pQTL': pQTL, + 'allele_specific_binding': ASB, + 'allele_specific_binding_cell': ASB, + 'allele_specific_binding_GVATdb': ASB_GVATDB, + 'gtex_splice_qtl': GtexSQtl, + 'gtex_splice_qtl_term': GtexSQtl, + 'AFGR_sqtl': AFGRSQtl, + 'AFGR_sqtl_term': AFGRSQtl, + 'encode_regulatory_region': EncodeElementGeneLink, + 'encode_regulatory_region_gene': EncodeElementGeneLink, + 'encode_regulatory_region_gene_biosample': EncodeElementGeneLink, + 'encode_regulatory_region_gene_treatment_CHEBI': EncodeElementGeneLink, + 'encode_regulatory_region_gene_treatment_protein': EncodeElementGeneLink, + 'encode_donor': EncodeElementGeneLink, + 'encode_biosample': EncodeElementGeneLink, + 'encode_regulatory_region_gene_donor': EncodeElementGeneLink, + 'encode_mpra_regulatory_region': EncodeMPRA, + 'encode_mpra_regulatory_region_biosample': EncodeMPRA, + 'encode_regulatory_region_crispr': ENCODE2GCRISPR, + 'encode_regulatory_region_gene_crispr': ENCODE2GCRISPR, + 'gaf': GAF, + 'gaf_mouse': GAF, + 'gaf_isoform': GAF, + 'gaf_rna': GAF, + 'gwas_studies': GWAS, + 'gwas_var_phenotypes': GWAS, + 'gwas_var_phenotypes_studies': GWAS, + 'motif': Motif, + 'motif to protein': Motif, + 'coxpresdb': Coxpresdb, + 'pathway': ReactomePathway, + 'genes_pathways': Reactome, + 'parent_pathway_of': Reactome, + 'cellosaurus_terms': Cellosaurus, + 'cellosaurus_relationships': Cellosaurus, + 'drug': PharmGKB, + 'variant_drug': PharmGKB, + 'variant_drug_gene': PharmGKB, + 'disease_gene': Disease, + 'oncotree_terms': Oncotree, + 'oncotree_relationships': Oncotree, + 'gene_term': DepMap, + 'complex': EBIComplex, + 'complex_protein': EBIComplex, + 'complex_term': EBIComplex, + 'protein_protein': ProteinsInteraction, + 'gene_gene_biogrid': GeneGeneBiogrid, + 'mouse_gene_gene_biogrid': GeneGeneBiogrid, + 'regulatory_region_mm_regulatory_region': HumanMouseElementAdapter, + 'mm_orthologs': MGIHumanMouseOrthologAdapter, + 'coding_variants': DbSNFP, + 'variants_coding_variants': DbSNFP, + 'coding_variants_proteins': DbSNFP, + 'mouse_variant': MouseGenomesProjectAdapter, + 'variant_disease': ClinGen, + 'variant_disease_gene': ClinGen +} +
 in_docker = os.environ.get('IN_DOCKER') == 'TRUE' if not in_docker: diff --git a/data/adapters/AFGR_caqtl_adapter.py b/data/adapters/AFGR_caqtl_adapter.py index 3e31dbb4..1cd1a140 100644 --- a/data/adapters/AFGR_caqtl_adapter.py +++ b/data/adapters/AFGR_caqtl_adapter.py @@ -3,18 +3,18 @@ import json from math import log10 import os +from typing import Optional -from adapters import Adapter from adapters.helpers import build_variant_id, build_regulatory_region_id from db.arango_db import ArangoDB - + +from adapters.writer import Writer # Example row from
sorted.dist.hwe.af.AFR.caQTL.genPC.maf05.90.qn.idr.txt.gz # chr snp_pos snp_pos2 ref alt variant effect_af_eqtl p_hwe feature dist_start dist_end pvalue beta se # 1 66435 66435 ATT A 1_66435_ATT_A 0.125 0.644802 1:1001657:1002109 -935222 -935674 0.616173 0.055905 0.111128 -class AFGRCAQtl(Adapter): +class AFGRCAQtl: ALLOWED_LABELS = ['regulatory_region', 'AFGR_caqtl'] SOURCE = 'AFGR' @@ -23,11 +23,10 @@ class AFGRCAQtl(Adapter): CLASS_NAME = 'accessible_dna_element' ONTOLOGY_TERM_ID = 'EFO_0005292' # lymphoblastoid cell line ONTOLOGY_TERM_NAME = 'lymphoblastoid cell line' - OUTPUT_PATH = './parsed-data' - def __init__(self, filepath, label, dry_run=True): + def __init__(self, filepath, label, dry_run=True, writer: Optional[Writer] = None, **kwargs): if label not in AFGRCAQtl.ALLOWED_LABELS: - raise ValueError('Ivalid label. Allowed values: ' + + raise ValueError('Invalid label. Allowed values: ' + ','.join(AFGRCAQtl.ALLOWED_LABELS)) self.filepath = filepath @@ -37,16 +36,10 @@ def __init__(self, filepath, label, dry_run=True): self.type = 'edge' if(self.label == 'regulatory_region'): self.type = 'node' - - self.output_filepath = '{}/{}.json'.format( - self.OUTPUT_PATH, - self.dataset - ) - - super(AFGRCAQtl, self).__init__() + self.writer = writer def process_file(self): - parsed_data_file = open(self.output_filepath, 'w') + self.writer.open() with gzip.open(self.filepath, 'rt') as qtl_file: qtl_csv = csv.reader(qtl_file, delimiter='\t') @@ -100,16 +93,7 @@ def process_file(self): 'inverse_name': 'associates with' } - json.dump(_props, parsed_data_file) - parsed_data_file.write('\n') - parsed_data_file.close() - self.save_to_arango() - - def save_to_arango(self): - if self.dry_run: - print(self.arangodb()[0]) - else: - os.system(self.arangodb()[0]) + self.writer.write(json.dumps(_props)) + self.writer.write('\n') - def arangodb(self): - return ArangoDB().generate_json_import_statement(self.output_filepath, self.collection, type=self.type) + self.writer.close() diff --git a/data/adapters/AFGR_eqtl_adapter.py b/data/adapters/AFGR_eqtl_adapter.py index b0a2b0a7..c4db326a 100644 --- a/data/adapters/AFGR_eqtl_adapter.py +++ b/data/adapters/AFGR_eqtl_adapter.py @@ -2,27 +2,26 @@ import gzip import hashlib import json -import os -from adapters import Adapter -from adapters.helpers import build_variant_id -from db.arango_db import ArangoDB +from typing import Optional +from adapters.helpers import build_variant_id +from adapters.writer import Writer # Example row from sorted.dist.hwe.af.AFR_META.eQTL.nominal.hg38a.txt.gz # chr snp_pos snp_pos2 ref alt effect_af_eqtl variant feature log10p pvalue beta se qstat df p_het p_hwe dist_start dist_end geneSymbol geneType # 1 16103 16103 T G 0.0336427 1_16103_T_G ENSG00000187583.10 0.1944867 0.6390183 0.242489 0.516955 NA 1.0 NA 1.000000 -950394 -959762 PLEKHN1 protein_coding -class AFGREQtl(Adapter): + +class AFGREQtl: ALLOWED_LABELS = ['AFGR_eqtl', 'AFGR_eqtl_term'] SOURCE = 'AFGR' SOURCE_URL = 'https://github.com/smontgomlab/AFGR' BIOLOGICAL_CONTEXT = 'lymphoblastoid cell line' ONTOLOGY_TERM = 'EFO_0005292' # lymphoblastoid cell line - OUTPUT_PATH = './parsed-data' - def __init__(self, filepath, label='AFGR_eqtl', dry_run=True): + def __init__(self, filepath, label='AFGR_eqtl', dry_run=True, writer: Optional[Writer] = None, **kwargs): if label not in AFGREQtl.ALLOWED_LABELS: - raise ValueError('Ivalid label. Allowed values: ' + + raise ValueError('Invalid label. 
Allowed values: ' + ','.join(AFGREQtl.ALLOWED_LABELS)) self.filepath = filepath @@ -30,16 +29,10 @@ def __init__(self, filepath, label='AFGR_eqtl', dry_run=True): self.dataset = label self.dry_run = dry_run self.type = 'edge' - - self.output_filepath = '{}/{}.json'.format( - self.OUTPUT_PATH, - self.dataset - ) - - super(AFGREQtl, self).__init__() + self.writer = writer def process_file(self): - parsed_data_file = open(self.output_filepath, 'w') + self.writer.open() with gzip.open(self.filepath, 'rt') as qtl_file: qtl_csv = csv.reader(qtl_file, delimiter='\t') next(qtl_csv) @@ -93,16 +86,7 @@ def process_file(self): 'name': 'occurs in', 'inverse_name': 'has measurement' } - json.dump(_props, parsed_data_file) - parsed_data_file.write('\n') - parsed_data_file.close() - self.save_to_arango() - - def save_to_arango(self): - if self.dry_run: - print(self.arangodb()[0]) - else: - os.system(self.arangodb()[0]) + self.writer.write(json.dumps(_props)) + self.writer.write('\n') - def arangodb(self): - return ArangoDB().generate_json_import_statement(self.output_filepath, self.collection, type=self.type) + self.writer.close() diff --git a/data/adapters/AFGR_sqtl_adapter.py b/data/adapters/AFGR_sqtl_adapter.py index a69bf117..b3c4db1a 100644 --- a/data/adapters/AFGR_sqtl_adapter.py +++ b/data/adapters/AFGR_sqtl_adapter.py @@ -4,18 +4,17 @@ import json import pickle from math import log10 -import os +from typing import Optional -from adapters import Adapter from adapters.helpers import build_variant_id -from db.arango_db import ArangoDB +from adapters.writer import Writer # sorted.all.AFR.Meta.sQTL.genPC.nominal.maf05.mvmeta.fe.txt.gz # chr pos ref alt snp feature beta se zstat p 95pct_ci_lower 95pct_ci_upper qstat df p_het # chr1 88338 G A 1_88338_G_A 1:187577:187755:clu_2352 0.0723108199416329 0.0685894841949755 1.05425519363987 0.291766096608984 -0.0621220987986983 0.206743738681964 1.23511015771854 5 0.941465002419174 -class AFGRSQtl(Adapter): +class AFGRSQtl: ALLOWED_LABELS = ['AFGR_sqtl', 'AFGR_sqtl_term'] SOURCE = 'AFGR' SOURCE_URL = 'https://github.com/smontgomlab/AFGR' @@ -23,11 +22,10 @@ class AFGRSQtl(Adapter): BIOLOGICAL_CONTEXT = 'lymphoblastoid cell line' ONTOLOGY_TERM = 'EFO_0005292' # lymphoblastoid cell line MAX_LOG10_PVALUE = 400 # set the same value as gtex qtl - OUTPUT_PATH = './parsed-data' - def __init__(self, filepath, label='AFGR_sqtl', dry_run=True): + def __init__(self, filepath, label='AFGR_sqtl', dry_run=True, writer: Optional[Writer] = None, **kwargs): if label not in AFGRSQtl.ALLOWED_LABELS: - raise ValueError('Ivalid label. Allowed values: ' + + raise ValueError('Invalid label. 
Allowed values: ' + ','.join(AFGRSQtl.ALLOWED_LABELS)) self.filepath = filepath @@ -35,16 +33,10 @@ def __init__(self, filepath, label='AFGR_sqtl', dry_run=True): self.dataset = label self.dry_run = dry_run self.type = 'edge' - self.output_filepath = '{}/{}.json'.format( - self.OUTPUT_PATH, - self.dataset - ) - - super(AFGRSQtl, self).__init__() + self.writer = writer def process_file(self): - parsed_data_file = open(self.output_filepath, 'w') - + self.writer.open() self.load_intron_gene_mapping() with gzip.open(self.filepath, 'rt') as qtl_file: @@ -112,22 +104,12 @@ def process_file(self): 'name': 'occurs in', 'inverse_name': 'has measurement' } - json.dump(_props, parsed_data_file) - parsed_data_file.write('\n') - parsed_data_file.close() - self.save_to_arango() + self.writer.write(json.dumps(_props)) + self.writer.write('\n') + self.writer.close() def load_intron_gene_mapping(self): # key: intron_id (e.g. 1:187577:187755:clu_2352); value: gene ensembl id self.intron_gene_mapping = {} with open(AFGRSQtl.INTRON_GENE_MAPPING_PATH, 'rb') as mapfile: self.intron_gene_mapping = pickle.load(mapfile) - - def save_to_arango(self): - if self.dry_run: - print(self.arangodb()[0]) - else: - os.system(self.arangodb()[0]) - - def arangodb(self): - return ArangoDB().generate_json_import_statement(self.output_filepath, self.collection, type=self.type) diff --git a/data/adapters/__init__.py b/data/adapters/__init__.py index 425d52e1..25ca668e 100644 --- a/data/adapters/__init__.py +++ b/data/adapters/__init__.py @@ -1,5 +1,6 @@ import yaml import glob +import os from db.arango_db import ArangoDB @@ -34,9 +35,6 @@ def __init__(self): self.collection = self.schema_config['db_collection_name'] - def write_file(self): - self.process_file() - def has_indexes(self): return 'db_indexes' in self.schema_config @@ -115,3 +113,12 @@ def arangodb(self): self.element_type, self.has_edge_id ) + + def save_to_arango(self): + arango_imp = ArangoDB().generate_json_import_statement( + self.output_filepath, self.collection, type=self.type) + + if self.dry_run: + print(arango_imp[0]) + else: + os.system(arango_imp[0]) diff --git a/data/adapters/adastra_asb_adapter.py b/data/adapters/adastra_asb_adapter.py index 87c7c37d..4fd529a2 100644 --- a/data/adapters/adastra_asb_adapter.py +++ b/data/adapters/adastra_asb_adapter.py @@ -1,10 +1,10 @@ import csv import json import os +from typing import Optional -from adapters import Adapter -from db.arango_db import ArangoDB from adapters.helpers import build_variant_id +from adapters.writer import Writer # ADASTRA allele-specific binding (ASB) file downloaded from: https://adastra.autosome.org/assets/cltfdata/adastra.cltf.bill_cipher.zip # Cell ontology available from GTRD (Gene Transcription Regulation Database): http://gtrd.biouml.org/ @@ -14,7 +14,7 @@ # chr11 129321262.0 rs10750410 A G 1.25 518.5 73.0 2.0 -1.508200583122832 1.5220631227173078 0.9999438590195968 1.0 3.776801544248756e-06 0.0024711390339222 2.668977799202377 2.9239362481887974 0.8469536347168959 19 + No Hit False -class ASB(Adapter): +class ASB: # 1-based coordinate system ALLOWED_LABELS = ['asb', 'asb_cell_ontology'] ONTOLOGY_PRIORITY_LIST = ['CL:', 'UBERON:', 'CLO:', 'EFO:'] @@ -23,11 +23,9 @@ class ASB(Adapter): SOURCE = 'ADASTRA allele-specific TF binding calls' MOTIF_SOURCE = 'HOCOMOCOv11' - OUTPUT_PATH = './parsed-data' - - def __init__(self, filepath, label='asb', dry_run=True): + def __init__(self, filepath, label='asb', dry_run=True, writer: Optional[Writer] = None, **kwargs): if label not in 
ASB.ALLOWED_LABELS: - raise ValueError('Ivalid label. Allowed values: ' + + raise ValueError('Invalid label. Allowed values: ' + ','.join(ASB.ALLOWED_LABELS)) self.filepath = filepath @@ -38,15 +36,8 @@ def __init__(self, filepath, label='asb', dry_run=True): self.collection = 'variants_proteins' else: self.collection = 'variants_proteins_terms' - self.dry_run = dry_run - - self.output_filepath = '{}/{}.json'.format( - ASB.OUTPUT_PATH, - self.dataset - ) - - super(ASB, self).__init__() + self.writer = writer def load_tf_uniprot_id_mapping(self): self.tf_uniprot_id_mapping = {} # e.g. key: 'ANDR_HUMAN'; value: 'P10275' @@ -71,7 +62,7 @@ def load_cell_ontology_id_mapping(self): cell_ontology_id, cell_gtrd_id, cell_gtrd_name] def process_file(self): - parsed_data_file = open(self.output_filepath, 'w') + self.writer.open() self.load_tf_uniprot_id_mapping() self.load_cell_ontology_id_mapping() @@ -148,17 +139,7 @@ def process_file(self): 'inverse_name': 'has measurement' } - json.dump(props, parsed_data_file) - parsed_data_file.write('\n') - - parsed_data_file.close() - self.save_to_arango() - - def save_to_arango(self): - if self.dry_run: - print(self.arangodb()[0]) - else: - os.system(self.arangodb()[0]) + self.writer.write(json.dumps(props)) + self.writer.write('\n') - def arangodb(self): - return ArangoDB().generate_json_import_statement(self.output_filepath, self.collection, type=self.type) + self.writer.close() diff --git a/data/adapters/biogrid_gene_gene_adapter.py b/data/adapters/biogrid_gene_gene_adapter.py index bd32144a..f8a5f15c 100644 --- a/data/adapters/biogrid_gene_gene_adapter.py +++ b/data/adapters/biogrid_gene_gene_adapter.py @@ -1,11 +1,11 @@ import csv -import os import json import hashlib import obonet import pickle -from adapters import Adapter -from db.arango_db import ArangoDB +from typing import Optional + +from adapters.writer import Writer # Example lines in merged_PPI.UniProt.csv (and merged_PPI_mouse.UniProt.csv for mouse): # (Only loading lines with 'genetic interference' in Detection Method column, the other lines are loaded in ProteinsInteraction Adapter) @@ -17,17 +17,17 @@ # psi-mi.obo is downloaded from https://github.com/HUPO-PSI/psi-mi-CV/blob/master/psi-mi.obo -class GeneGeneBiogrid(Adapter): +class GeneGeneBiogrid: INTERACTION_MI_CODE_PATH = './data_loading_support_files/Biogrid_gene_gene/psi-mi.obo' - OUTPUT_PATH = './parsed-data' - def __init__(self, filepath, label, dry_run=True): + def __init__(self, filepath, label, dry_run=True, writer: Optional[Writer] = None, **kwargs): self.filepath = filepath self.dataset = label self.label = label self.dry_run = dry_run self.type = 'edge' + self.writer = writer if 'mouse' in self.filepath.split('/')[-1]: self.gene_collection = 'mm_genes' @@ -36,15 +36,8 @@ def __init__(self, filepath, label, dry_run=True): self.gene_collection = 'genes' self.protein_to_gene_mapping_path = './data_loading_support_files/Biogrid_gene_gene/biogrid_protein_mapping.pkl' - self.output_filepath = '{}/{}.json'.format( - GeneGeneBiogrid.OUTPUT_PATH, - self.dataset, - ) - - super(GeneGeneBiogrid, self).__init__() - def process_file(self): - parsed_data_file = open(self.output_filepath, 'w') + self.writer.open() print('Loading MI code mappings') self.load_MI_code_mapping() @@ -102,12 +95,12 @@ def process_file(self): 'z_score:long': 0, 'name': 'interacts with', 'inverse_name': 'interacts with', - 'molecular_function': 'ontology_terms/GO_0005515'} - json.dump(props, parsed_data_file) - parsed_data_file.write('\n') + 'molecular_function': 
'ontology_terms/GO_0005515', + } + self.writer.write(json.dumps(props)) + self.writer.write('\n') - parsed_data_file.close() - self.save_to_arango() + self.writer.close() def load_MI_code_mapping(self): # get mapping for MI code -> name from obo file (e.g. MI:2370 -> synthetic lethality (sensu BioGRID)) @@ -121,12 +114,3 @@ def load_protein_gene_mapping(self): self.protein_gene_mapping = {} with open(self.protein_to_gene_mapping_path, 'rb') as mapfile: self.protein_gene_mapping = pickle.load(mapfile) - - def save_to_arango(self): - if self.dry_run: - print(self.arangodb()[0]) - else: - os.system(self.arangodb()[0]) - - def arangodb(self): - return ArangoDB().generate_json_import_statement(self.output_filepath, self.collection, type=self.type) diff --git a/data/adapters/ccre_adapter.py b/data/adapters/ccre_adapter.py index 0d6e8746..787ecefe 100644 --- a/data/adapters/ccre_adapter.py +++ b/data/adapters/ccre_adapter.py @@ -1,9 +1,9 @@ import gzip import csv import json -import os -from adapters import Adapter -from db.arango_db import ArangoDB +from typing import Optional + +from adapters.writer import Writer # cCRE,all input file has 10 columns: chromsome, start, end, ID, score (all 0), strand (NA), start, end, color, biochemical_activity # There are 8 types of biochemical_activity: @@ -24,7 +24,7 @@ # chr1 29320 29517 EH38E3951274 0 . 29320 29517 6,218,147 CA -class CCRE(Adapter): +class CCRE: BIOCHEMICAL_DESCRIPTION = { 'pELS': 'proximal Enhancer-like signal', 'CA': 'chromatin accessible', @@ -35,9 +35,8 @@ class CCRE(Adapter): 'CA-H3K4me3': 'chromatin accessible + H3K4me3 high signal', 'PLS': 'Promoter-like signal' } - OUTPUT_PATH = './parsed-data' - def __init__(self, filepath, label='regulatory_region', dry_run=True): + def __init__(self, filepath, label='regulatory_region', dry_run=True, writer: Optional[Writer] = None, **kwargs): self.filepath = filepath self.label = label self.dataset = label @@ -45,14 +44,10 @@ def __init__(self, filepath, label='regulatory_region', dry_run=True): filepath.split('/')[-1].split('.')[0] self.dry_run = dry_run self.type = 'node' - self.output_filepath = '{}/{}.json'.format( - self.OUTPUT_PATH, - self.dataset - ) - super(CCRE, self).__init__() + self.writer = writer def process_file(self): - parsed_data_file = open(self.output_filepath, 'w') + self.writer.open() with gzip.open(self.filepath, 'rt') as input_file: reader = csv.reader(input_file, delimiter='\t') @@ -71,20 +66,10 @@ def process_file(self): 'source': 'ENCODE_SCREEN (ccREs)', 'source_url': self.source_url } - json.dump(_props, parsed_data_file) - parsed_data_file.write('\n') + self.writer.write(json.dumps(_props)) + self.writer.write('\n') except: print(f'fail to process: {row}') pass - parsed_data_file.close() - self.save_to_arango() - - def save_to_arango(self): - if self.dry_run: - print(self.arangodb()[0]) - else: - os.system(self.arangodb()[0]) - - def arangodb(self): - return ArangoDB().generate_json_import_statement(self.output_filepath, self.collection, type=self.type) + self.writer.close() diff --git a/data/adapters/cellosaurus_ontology_adapter.py b/data/adapters/cellosaurus_ontology_adapter.py index b0c5d6ac..981c65bc 100644 --- a/data/adapters/cellosaurus_ontology_adapter.py +++ b/data/adapters/cellosaurus_ontology_adapter.py @@ -1,9 +1,8 @@ import obonet import json -import os +from typing import Optional -from db.arango_db import ArangoDB -from adapters import Adapter +from adapters.writer import Writer # cellosaurus.obo is downloaded from: 
https://ftp.expasy.org/databases/cellosaurus/ # Example node from the obo file: @@ -21,7 +20,7 @@ # creation_date: 2020-10-29T00:00:00Z -class Cellosaurus(Adapter): +class Cellosaurus: SOURCE = 'Cellosaurus' SOURCE_URL_PREFIX = 'https://www.cellosaurus.org/' NODE_KEYS = ['name', 'synonym', 'subset'] @@ -31,9 +30,7 @@ class Cellosaurus(Adapter): # NBCI TaxID for Human and Mouse SPECIES_IDS = ['NCBI_TaxID:9606', 'NCBI_TaxID:10090'] - OUTPUT_PATH = './parsed-data' - - def __init__(self, filepath, type='node', species_filter=True, dry_run=True): + def __init__(self, filepath, type='node', species_filter=True, dry_run=True, writer: Optional[Writer] = None, **kwargs): self.filepath = filepath self.type = type self.species_filter = species_filter @@ -43,17 +40,10 @@ def __init__(self, filepath, type='node', species_filter=True, dry_run=True): else: self.dataset = 'ontology_relationship' self.label = self.dataset - - self.output_filepath = '{}/{}_{}.json'.format( - Cellosaurus.OUTPUT_PATH, - self.dataset, - Cellosaurus.SOURCE - ) - - super(Cellosaurus, self).__init__() + self.writer = writer def process_file(self): - self.parsed_data_file = open(self.output_filepath, 'w') + self.writer.open() graph = obonet.read_obo(self.filepath) same_individual_pairs = [] @@ -152,21 +142,11 @@ def process_file(self): self.save_props(props) - self.parsed_data_file.close() - self.save_to_arango() + self.writer.close() def save_props(self, props): - json.dump(props, self.parsed_data_file) - self.parsed_data_file.write('\n') - - def save_to_arango(self): - if self.dry_run: - print(self.arangodb()[0]) - else: - os.system(self.arangodb()[0]) - - def arangodb(self): - return ArangoDB().generate_json_import_statement(self.output_filepath, self.collection, type=self.type) + self.writer.write(json.dumps(props)) + self.writer.write('\n') def to_key(self, xref): key = xref.replace(':', '_').replace('/', '_').replace(' ', '_') diff --git a/data/adapters/clingen_variant_disease_adapter.py b/data/adapters/clingen_variant_disease_adapter.py index 9c090d93..deb66dcb 100644 --- a/data/adapters/clingen_variant_disease_adapter.py +++ b/data/adapters/clingen_variant_disease_adapter.py @@ -2,10 +2,9 @@ import pickle import hashlib import json -import os +from typing import Optional -from adapters import Adapter -from db.arango_db import ArangoDB +from adapters.writer import Writer # Example row from variant_pathogenicity.tsv # ClinVar Variation Id chr start stop Gene ID HGNC Gene Symbol Mondo Id Disease Mode of Inheritance Assertion Summary of interpretation PubMed Articles Evidence Repo Link Retracted Allele HGVS Expressions Allele Registry Id @@ -13,18 +12,15 @@ # 2574002, 2574002, 9450897 https://erepo.genome.network/evrepo/ui/classification/CA114360/MONDO:0009861/006 FALSE [T/A/C] NM_000277.2:c.1A>G, NC_000012.12:g.102917130T>C, CM000674.2:g.102917130T>C, NC_000012.11:g.103310908T>C, CM000674.1:g.103310908T>C, NC_000012.10:g.101835038T>C, NG_008690.1:g.5473A>G, NG_008690.2:g.46281A>G, NM_000277.1:c.1A>G, XM_011538422.1:c.1A>G, NM_001354304.1:c.1A>G, XM_017019370.2:c.1A>G, NM_000277.3:c.1A>G, ENST00000307000.7:c.-147A>G, ENST00000546844.1:c.1A>G, ENST00000547319.1:n.312A>G, ENST00000549111.5:n.97A>G, ENST00000551337.5:c.1A>G, ENST00000551988.5:n.90A>G, ENST00000553106.5:c.1A>G, ENST00000635500.1:n.29-4232A>G, NM_000277.2(PAH):c.1A>G (p.Met1Val) CA114360 -class ClinGen(Adapter): +class ClinGen: ALLOWED_LABELS = ['variant_disease', 'variant_disease_gene'] VARIANT_ID_MAPPING_PATH = 
'./data_loading_support_files/clingen_variant_id_mapping.pkl' - SOURCE = 'ClinGen' SOURCE_URL = 'https://search.clinicalgenome.org/kb/downloads' - OUTPUT_PATH = './parsed-data' - - def __init__(self, filepath, label, dry_run=True): + def __init__(self, filepath, label, dry_run=True, writer: Optional[Writer] = None, **kwargs): if label not in ClinGen.ALLOWED_LABELS: - raise ValueError('Ivalid label. Allowed values: ' + + raise ValueError('Invalid label. Allowed values: ' + ','.join(ClinGen.ALLOWED_LABELS)) self.filepath = filepath @@ -32,16 +28,10 @@ def __init__(self, filepath, label, dry_run=True): self.dataset = label self.dry_run = dry_run self.type = 'edge' - - self.output_filepath = '{}/{}.json'.format( - ClinGen.OUTPUT_PATH, - self.dataset, - ) - - super(ClinGen, self).__init__() + self.writer = writer def process_file(self): - parsed_data_file = open(self.output_filepath, 'w') + self.writer.open() self.load_variant_id_mapping() with open(self.filepath, 'r') as clingen_file: @@ -76,8 +66,8 @@ def process_file(self): 'source': ClinGen.SOURCE, 'source_url': ClinGen.SOURCE_URL } - json.dump(props, parsed_data_file) - parsed_data_file.write('\n') + self.writer.write(json.dumps(props)) + self.writer.write('\n') elif self.label == 'variant_disease_gene': variant_disease_gene_id = hashlib.sha256( @@ -94,23 +84,13 @@ def process_file(self): 'source': ClinGen.SOURCE, 'source_url': ClinGen.SOURCE_URL } - json.dump(props, parsed_data_file) - parsed_data_file.write('\n') + self.writer.write(json.dumps(props)) + self.writer.write('\n') - parsed_data_file.close() - self.save_to_arango() + self.writer.close() def load_variant_id_mapping(self): # key: ClinVar Variation Id; value: internal hashed variant id self.variant_id_mapping = {} with open(ClinGen.VARIANT_ID_MAPPING_PATH, 'rb') as mapfile: self.variant_id_mapping = pickle.load(mapfile) - - def save_to_arango(self): - if self.dry_run: - print(self.arangodb()[0]) - else: - os.system(self.arangodb()[0]) - - def arangodb(self): - return ArangoDB().generate_json_import_statement(self.output_filepath, self.collection, type=self.type) diff --git a/data/adapters/coxpresdb_adapter.py b/data/adapters/coxpresdb_adapter.py index 52a4fc34..8aa638c7 100644 --- a/data/adapters/coxpresdb_adapter.py +++ b/data/adapters/coxpresdb_adapter.py @@ -1,36 +1,30 @@ -from adapters import Adapter import pickle -import os import json -from db.arango_db import ArangoDB +from typing import Optional + +from adapters.writer import Writer # https://coxpresdb.jp/download/Hsa-r.c6-0/coex/Hsa-r.v22-05.G16651-S235187.combat_pca.subagging.z.d.zip # There is 16651 files. The file name is entrez gene id. The total genes annotated are 16651, one gene per file, each file contain logit score of other 16650 genes. 
# There are two fields in each row: entrez gene id and logit score -class Coxpresdb(Adapter): - OUTPUT_PATH = './parsed-data' +class Coxpresdb: - def __init__(self, file_path, dry_run=True): + def __init__(self, filepath, dry_run=True, writer: Optional[Writer] = None, **kwargs): - self.file_path = file_path + self.file_path = filepath self.dataset = 'coxpresdb' self.label = 'coxpresdb' self.source = 'CoXPresdb' self.source_url = 'https://coxpresdb.jp/' self.type = 'edge' self.dry_run = dry_run - self.output_filepath = '{}/{}.json'.format( - self.OUTPUT_PATH, - self.dataset - ) - - super(Coxpresdb, self).__init__() + self.writer = writer def process_file(self): - parsed_data_file = open(self.output_filepath, 'w') + self.writer.open() # entrez_to_ensembl.pkl is generated using those two files: # gencode file: https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_43/gencode.v43.chr_patch_hapl_scaff.annotation.gtf.gz # Homo_sapiens.gene_info.gz file: https://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz @@ -62,16 +56,6 @@ def process_file(self): 'inverse_name': 'coexpressed with', 'associated process': 'ontology_terms/GO_0010467' } - json.dump(_props, parsed_data_file) - parsed_data_file.write('\n') - parsed_data_file.close() - self.save_to_arango() - - def save_to_arango(self): - if self.dry_run: - print(self.arangodb()[0]) - else: - os.system(self.arangodb()[0]) - - def arangodb(self): - return ArangoDB().generate_json_import_statement(self.output_filepath, self.collection, type=self.type) + self.writer.write(json.dumps(_props)) + self.writer.write('\n') + self.writer.close() diff --git a/data/adapters/dbSNFP_adapter.py b/data/adapters/dbSNFP_adapter.py index cfa8176a..ad740b0d 100644 --- a/data/adapters/dbSNFP_adapter.py +++ b/data/adapters/dbSNFP_adapter.py @@ -1,9 +1,8 @@ -import hashlib -import os import json -from db.arango_db import ArangoDB -from adapters import Adapter -from adapters.helpers import build_variant_id, build_coding_variant_id +from typing import Optional + +from adapters.helpers import build_variant_id +from adapters.writer import Writer # Sample file - file has 709 columns: # #chr pos(1-based) ref alt aaref aaalt rs_dbSNP hg19_chr hg19_pos(1-based) hg18_chr ... Interpro_domain GTEx_V8_gene GTEx_V8_tissue Geuvadis_eQTL_target_gene @@ -11,27 +10,17 @@ # Y 2786990 T C X W . Y 2655031 Y 2715031 205 SRY ENSG00000184895 ENST00000383070 ENSP00000372547 ... . . . . . . 
-class DbSNFPAdapter(Adapter): +class DbSNFP: LABEL = 'dbSNFP_protein_variants' - OUTPUT_PATH = './parsed-data' - WRITE_THRESHOLD = 1000000 - - def __init__(self, filepath=None, collection='coding_variants', dry_run=True): - self.output_filepath = '{}/{}-{}-{}.json'.format( - DbSNFPAdapter.OUTPUT_PATH, - DbSNFPAdapter.LABEL, - collection, - filepath.split('/')[-1] - ) + def __init__(self, filepath=None, collection='coding_variants', dry_run=True, writer: Optional[Writer] = None, **kwargs): self.filepath = filepath - self.label = DbSNFPAdapter.LABEL + self.label = DbSNFP.LABEL self.dataset = self.label self.dry_run = dry_run self.collection_name = collection - - super(DbSNFPAdapter, self).__init__() + self.writer = writer def multiple_records(self, data_line): indexes = [11, 12, 13, 14, 15, 17] @@ -82,9 +71,7 @@ def breakdown_line(self, original_data_line): return data_lines def process_file(self): - parsed_data_file = open(self.output_filepath, 'w') - - record_count = 0 + self.writer.open() for line in open(self.filepath, 'r'): if line.startswith('#chr'): @@ -190,29 +177,6 @@ def long_data(pos): 'source_url': 'http://database.liulab.science/dbNSFP' } - json.dump(to_json, parsed_data_file) - parsed_data_file.write('\n') - record_count += 1 - - if record_count > DbSNFPAdapter.WRITE_THRESHOLD: - parsed_data_file.close() - self.save_to_arango() - - os.remove(self.output_filepath) - record_count = 0 - - parsed_data_file = open(self.output_filepath, 'w') - - parsed_data_file.close() - self.save_to_arango() - - def save_to_arango(self): - collection_type = 'node' if self.collection_name == 'coding_variants' else 'edge' - - import_sts = ArangoDB().generate_json_import_statement( - self.output_filepath, self.collection_name, type=collection_type)[0] - - if self.dry_run: - print(import_sts) - else: - os.system(import_sts) + self.writer.write(json.dumps(to_json)) + self.writer.write('\n') + self.writer.close() diff --git a/data/adapters/depmap_adapter.py b/data/adapters/depmap_adapter.py index b03a8b57..61d81b8d 100644 --- a/data/adapters/depmap_adapter.py +++ b/data/adapters/depmap_adapter.py @@ -1,11 +1,10 @@ import csv from collections import defaultdict import json -import os import pickle +from typing import Optional -from adapters import Adapter -from db.arango_db import ArangoDB +from adapters.writer import Writer # CRISPRGeneDependency.csv is downloaded from DepMap portal: https://depmap.org/portal/download/all/ in DepMap Public 23Q2 Primary Files set. # The original matrix in file is organized as ModelID (1,095 rows) X Gene (17,931 coloumns). @@ -28,31 +27,24 @@ # DepMap_gene_id_mapping.tsv is premapped file from gene symbol to gene ensembl id, queried from IGVF catalog gene collection. 
-class DepMap(Adapter): +class DepMap: SOURCE = 'DepMap' SOURCE_URL = 'https://depmap.org/portal/' SOURCE_FILE = 'CRISPRGeneDependency.csv' GENE_ID_MAPPING_PATH = './data_loading_support_files/DepMap/DepMap_gene_id_mapping.pkl' CELL_ONTOLOGY_ID_MAPPING_PATH = './data_loading_support_files/DepMap/DepMap_model.csv' - OUTPUT_PATH = './parsed-data' - CUTOFF = 0.5 # only load genes with dependency scores greater or equal to 0.5 for each cell - def __init__(self, filepath, type, label, dry_run=True): + def __init__(self, filepath, type, label, dry_run=True, writer: Optional[Writer] = None, **kwargs): self.filepath = filepath self.dataset = label self.label = label self.type = type self.dry_run = dry_run - self.output_filepath = '{}/{}.json'.format( - self.OUTPUT_PATH, - self.dataset - ) - - super(DepMap, self).__init__() + self.writer = writer def process_file(self): - parsed_data_file = open(self.output_filepath, 'w') + self.writer.open() self.load_cell_ontology_id_mapping() self.load_gene_id_mapping() @@ -107,10 +99,9 @@ def process_file(self): 'inverse_name': 'dependent on' } - json.dump(_props, parsed_data_file) - parsed_data_file.write('\n') - parsed_data_file.close() - self.save_to_arango() + self.writer.write(json.dumps(_props)) + self.writer.write('\n') + self.writer.close() def load_cell_ontology_id_mapping(self): # key: DepMap Model ID; value: ontology ids (i.e. CVCL ids) and properties of each cell @@ -134,12 +125,3 @@ def load_gene_id_mapping(self): self.gene_id_mapping = {} # key: gene symbol; value: gene ensembl id with open(DepMap.GENE_ID_MAPPING_PATH, 'rb') as gene_id_mapping_file: self.gene_id_mapping = pickle.load(gene_id_mapping_file) - - def save_to_arango(self): - if self.dry_run: - print(self.arangodb()[0]) - else: - os.system(self.arangodb()[0]) - - def arangodb(self): - return ArangoDB().generate_json_import_statement(self.output_filepath, self.collection, type=self.type) diff --git a/data/adapters/ebi_complex_adapter.py b/data/adapters/ebi_complex_adapter.py index 2fc74f13..7062603e 100644 --- a/data/adapters/ebi_complex_adapter.py +++ b/data/adapters/ebi_complex_adapter.py @@ -1,10 +1,9 @@ import csv -import os import json import pickle +from typing import Optional -from db.arango_db import ArangoDB -from adapters import Adapter +from adapters.writer import Writer # The complex tsv file for human was downloaded from EBI complex portal:http://ftp.ebi.ac.uk/pub/databases/intact/complex/current/complextab/9606.tsv # An example line with header: @@ -20,55 +19,42 @@ # Heterotrimer - - - - - psi-mi:"MI:0469"(IntAct) P84022(1)|Q13485(1)|Q15796(1) -class EBIComplex(Adapter): +class EBIComplex: ALLOWED_LABELS = ['complex', 'complex_protein', 'complex_term'] SOURCE = 'EBI' SOURCE_URL = 'https://www.ebi.ac.uk/complexportal/' - # cross-references to ontology terms we want to load XREF_SOURCES = ['efo', 'intact', 'mondo', 'orphanet', 'pubmed'] # removed biorxiv, -> only one case, and difficult to convert to key id - # path to pre-calculated dict containing binding regions pulled from api LINKED_FEATURE_PATH = './data_loading_support_files/EBI_complex/EBI_complex_linkedFeatures_09-26-23.pkl' SUBONTOLOGIES = './data_loading_support_files/complexes_terms_subontologies.json' - OUTPUT_PATH = './parsed-data' - - def __init__(self, filepath, label='complex', dry_run=True): + def __init__(self, filepath, label='complex', dry_run=True, writer: Optional[Writer] = None, **kwargs): if label not in EBIComplex.ALLOWED_LABELS: - raise ValueError('Ivalid labelS. 
Allowed values: ' + + raise ValueError('Invalid label. Allowed values: ' + ','.join(EBIComplex.ALLOWED_LABELS)) self.filepath = filepath self.label = label self.dataset = label self.dry_run = dry_run + self.writer = writer if label == 'complex': self.type = 'node' else: self.type = 'edge' - self.output_filepath = '{}/{}_{}.json'.format( - EBIComplex.OUTPUT_PATH, - self.dataset, - EBIComplex.SOURCE - ) - - super(EBIComplex, self).__init__() - def process_file(self): - self.parsed_data_file = open(self.output_filepath, 'w') + self.writer.open() self.load_subontologies() - with open(self.filepath, 'r') as complex_file: complex_tsv = csv.reader(complex_file, delimiter='\t') next(complex_tsv) for complex_row in complex_tsv: skip_flag = None complex_ac = complex_row[0] - molecules = complex_row[4].split('|') for molecule in molecules: if molecule.startswith('CHEBI:') or molecule.startswith('URS'): @@ -246,8 +232,7 @@ def process_file(self): self.save_props(props) - self.parsed_data_file.close() - self.save_to_arango() + self.writer.close() def get_chain_id(self, protein): if len(protein.split('-')) > 1: @@ -275,14 +260,5 @@ def load_subontologies(self): self.subontologies[sub['name']] = sub['subontology'] def save_props(self, props): - json.dump(props, self.parsed_data_file) - self.parsed_data_file.write('\n') - - def save_to_arango(self): - if self.dry_run: - print(self.arangodb()[0]) - else: - os.system(self.arangodb()[0]) - - def arangodb(self): - return ArangoDB().generate_json_import_statement(self.output_filepath, self.collection, type=self.type) + self.writer.write(json.dumps(props)) + self.writer.write('\n') diff --git a/data/adapters/encode_E2G_CRISPR_adapter.py b/data/adapters/encode_E2G_CRISPR_adapter.py index 2499902a..2bffb9a9 100644 --- a/data/adapters/encode_E2G_CRISPR_adapter.py +++ b/data/adapters/encode_E2G_CRISPR_adapter.py @@ -1,11 +1,11 @@ import csv import json -import os import pickle -from adapters import Adapter -from adapters.helpers import build_regulatory_region_id from math import log10 -from db.arango_db import ArangoDB +from typing import Optional + +from adapters.helpers import build_regulatory_region_id +from adapters.writer import Writer # Example lines from ENCFF968BZL.tsv (CRISPR tested data for ENCODE E2G training) # chrom chromStart chromEnd name EffectSize strandPerturbationTarget PerturbationTargetID chrTSS startTSS endTSS strandGene EffectSize95ConfidenceIntervalLow EffectSize95ConfidenceIntervalHigh measuredGeneSymbol measuredEnsemblID guideSpacerSeq guideSeq Significant pValue pValueAdjusted PowerAtEffectSize25 PowerAtEffectSize10 PowerAtEffectSize15 PowerAtEffectSize20 PowerAtEffectSize50 ValidConnection Notes Reference @@ -16,7 +16,7 @@ # Rename significant:boolean to significant in header file; Replace 'True' with 'true', 'False' with 'false' in parsed data files -class ENCODE2GCRISPR(Adapter): +class ENCODE2GCRISPR: ALLOWED_LABELS = ['regulatory_region', 'regulatory_region_gene'] SOURCE = 'ENCODE-E2G-CRISPR' @@ -25,9 +25,8 @@ class ENCODE2GCRISPR(Adapter): FILE_ACCESSION = 'ENCFF968BZL' BIOLOGICAL_CONTEXT = 'EFO_0002067' MAX_LOG10_PVALUE = 240 # max log10pvalue from file is 235 - OUTPUT_PATH = './parsed-data' - def __init__(self, filepath, label, dry_run=True): + def __init__(self, filepath, label, dry_run=True, writer: Optional[Writer] = None, **kwargs): if label not in ENCODE2GCRISPR.ALLOWED_LABELS: raise ValueError('Invalid label. 
diff --git a/data/adapters/encode_E2G_CRISPR_adapter.py b/data/adapters/encode_E2G_CRISPR_adapter.py
index 2499902a..2bffb9a9 100644
--- a/data/adapters/encode_E2G_CRISPR_adapter.py
+++ b/data/adapters/encode_E2G_CRISPR_adapter.py
@@ -1,11 +1,11 @@
 import csv
 import json
-import os
 import pickle
-from adapters import Adapter
-from adapters.helpers import build_regulatory_region_id
 from math import log10
-from db.arango_db import ArangoDB
+from typing import Optional
+
+from adapters.helpers import build_regulatory_region_id
+from adapters.writer import Writer
 
 # Example lines from ENCFF968BZL.tsv (CRISPR tested data for ENCODE E2G training)
 # chrom chromStart chromEnd name EffectSize strandPerturbationTarget PerturbationTargetID chrTSS startTSS endTSS strandGene EffectSize95ConfidenceIntervalLow EffectSize95ConfidenceIntervalHigh measuredGeneSymbol measuredEnsemblID guideSpacerSeq guideSeq Significant pValue pValueAdjusted PowerAtEffectSize25 PowerAtEffectSize10 PowerAtEffectSize15 PowerAtEffectSize20 PowerAtEffectSize50 ValidConnection Notes Reference
@@ -16,7 +16,7 @@
 # Rename significant:boolean to significant in header file; Replace 'True' with 'true', 'False' with 'false' in parsed data files
 
-class ENCODE2GCRISPR(Adapter):
+class ENCODE2GCRISPR:
     ALLOWED_LABELS = ['regulatory_region', 'regulatory_region_gene']
 
     SOURCE = 'ENCODE-E2G-CRISPR'
@@ -25,9 +25,8 @@ class ENCODE2GCRISPR:
     FILE_ACCESSION = 'ENCFF968BZL'
     BIOLOGICAL_CONTEXT = 'EFO_0002067'
     MAX_LOG10_PVALUE = 240  # max log10pvalue from file is 235
-    OUTPUT_PATH = './parsed-data'
 
-    def __init__(self, filepath, label, dry_run=True):
+    def __init__(self, filepath, label, dry_run=True, writer: Optional[Writer] = None, **kwargs):
         if label not in ENCODE2GCRISPR.ALLOWED_LABELS:
             raise ValueError('Invalid label. Allowed values: ' +
                              ','.join(ENCODE2GCRISPR.ALLOWED_LABELS))
@@ -39,16 +38,10 @@ def __init__(self, filepath, label, dry_run=True):
         self.filepath = filepath
         self.label = label
         self.dataset = label
         self.dry_run = dry_run
         self.type = 'edge'
         if(self.label == 'regulatory_region'):
             self.type = 'node'
-
-        self.output_filepath = '{}/{}.json'.format(
-            self.OUTPUT_PATH,
-            self.dataset
-        )
-
-        super(ENCODE2GCRISPR, self).__init__()
+        self.writer = writer
 
     def process_file(self):
-        parsed_data_file = open(self.output_filepath, 'w')
+        self.writer.open()
         if self.label == 'regulatory_region':
             print('loading regulatory regions')
             self.load_regulatory_region()
@@ -70,8 +63,8 @@ def process_file(self):
                     'source_url': ENCODE2GCRISPR.SOURCE_URL
                 }
 
-                json.dump(_props, parsed_data_file)
-                parsed_data_file.write('\n')
+                self.writer.write(json.dumps(_props))
+                self.writer.write('\n')
 
         elif self.label == 'regulatory_region_gene':
             self.load_gene_id_mapping()
@@ -121,10 +114,9 @@ def process_file(self):
                     'source_url': ENCODE2GCRISPR.SOURCE_URL,
                     'biological_context': 'ontology_terms/' + ENCODE2GCRISPR.BIOLOGICAL_CONTEXT
                 }
-                json.dump(_props, parsed_data_file)
-                parsed_data_file.write('\n')
-        parsed_data_file.close()
-        self.save_to_arango()
+                self.writer.write(json.dumps(_props))
+                self.writer.write('\n')
+        self.writer.close()
 
     def load_regulatory_region(self):
         # each row is a pair of tested regulatory region <-> gene, significant column can be TRUE/FALSE
@@ -152,12 +144,3 @@ def load_gene_id_mapping(self):
         self.gene_id_mapping = {}
         with open(ENCODE2GCRISPR.GENE_ID_MAPPING_PATH, 'rb') as mapfile:
             self.gene_id_mapping = pickle.load(mapfile)
-
-    def save_to_arango(self):
-        if self.dry_run:
-            print(self.arangodb()[0])
-        else:
-            os.system(self.arangodb()[0])
-
-    def arangodb(self):
-        return ArangoDB().generate_json_import_statement(self.output_filepath, self.collection, type=self.type)
diff --git a/data/adapters/encode_caqtl_adapter.py b/data/adapters/encode_caqtl_adapter.py
index e1f2e388..f6642554 100644
--- a/data/adapters/encode_caqtl_adapter.py
+++ b/data/adapters/encode_caqtl_adapter.py
@@ -1,8 +1,9 @@
 import json
 import os
-from adapters import Adapter
+from typing import Optional
+
 from adapters.helpers import build_variant_id, build_regulatory_region_id
-from db.arango_db import ArangoDB
+from adapters.writer import Writer
 
 # Example Encode caQTL input file:
 # chr1 766454 766455 chr1_766455_T_C chr1 766455 T C 1 778381 779150 FALSE 1_778381_779150 C T rs189800799 Progenitor
@@ -16,7 +17,7 @@
 # last column: cell name
 
-class CAQtl(Adapter):
+class CAQtl:
     # 1-based coordinate system
 
     ALLOWED_LABELS = ['regulatory_region', 'encode_caqtl']
@@ -36,11 +37,10 @@ class CAQtl:
             'term_name': 'liver'
         }
     }
-    OUTPUT_PATH = './parsed-data'
 
-    def __init__(self, filepath, source, label, dry_run=True):
+    def __init__(self, filepath, source, label, dry_run=True, writer: Optional[Writer] = None, **kwargs):
         if label not in CAQtl.ALLOWED_LABELS:
-            raise ValueError('Ivalid label. Allowed values: ' +
+            raise ValueError('Invalid label. 
Allowed values: ' + ','.join(CAQtl.ALLOWED_LABELS)) self.filepath = filepath @@ -51,16 +51,10 @@ def __init__(self, filepath, source, label, dry_run=True): self.type = 'edge' if(self.label == 'regulatory_region'): self.type = 'node' - - self.output_filepath = '{}/{}.json'.format( - self.OUTPUT_PATH, - self.dataset - ) - - super(CAQtl, self).__init__() + self.writer = writer def process_file(self): - parsed_data_file = open(self.output_filepath, 'w') + self.writer.open() for line in open(self.filepath, 'r'): data_line = line.strip().split() @@ -97,8 +91,8 @@ def process_file(self): 'inverse_name': 'associates with' } - json.dump(_props, parsed_data_file) - parsed_data_file.write('\n') + self.writer.write(json.dumps(_props)) + self.writer.write('\n') elif self.label == 'regulatory_region': _id = regulatory_region_id @@ -112,16 +106,6 @@ def process_file(self): 'type': 'accessible dna elements' } - json.dump(_props, parsed_data_file) - parsed_data_file.write('\n') - parsed_data_file.close() - self.save_to_arango() - - def save_to_arango(self): - if self.dry_run: - print(self.arangodb()[0]) - else: - os.system(self.arangodb()[0]) - - def arangodb(self): - return ArangoDB().generate_json_import_statement(self.output_filepath, self.collection, type=self.type) + self.writer.write(json.dumps(_props)) + self.writer.write('\n') + self.writer.close() diff --git a/data/adapters/encode_element_gene_adapter.py b/data/adapters/encode_element_gene_adapter.py index e34d5185..1213c691 100644 --- a/data/adapters/encode_element_gene_adapter.py +++ b/data/adapters/encode_element_gene_adapter.py @@ -1,11 +1,11 @@ import gzip import csv import json -import os -from adapters import Adapter -from adapters.helpers import build_regulatory_region_id -from db.arango_db import ArangoDB import requests +from typing import Optional + +from adapters.helpers import build_regulatory_region_id +from adapters.writer import Writer # There are 4 sources from encode: # ABC (Engrietz) @@ -58,7 +58,7 @@ # ENCODE-E2G: intergenic(ENH), promoter(PRO) and genic(ENH) -class EncodeElementGeneLink(Adapter): +class EncodeElementGeneLink: ALLOWED_LABELS = [ 'regulatory_region_gene', # regulatory_region --(edge)--> gene @@ -87,14 +87,13 @@ class EncodeElementGeneLink(Adapter): 'ENCODE-E2G-DNaseOnly': -1, 'ENCODE-E2G-Full': -1, } - OUTPUT_PATH = './parsed-data' - def __init__(self, filepath, label, source, source_url, biological_context, dry_run=True): + def __init__(self, filepath, label, source, source_url, biological_context, dry_run=True, writer: Optional[Writer] = None, **kwargs): if label not in EncodeElementGeneLink.ALLOWED_LABELS: - raise ValueError('Ivalid label. Allowed values: ' + + raise ValueError('Invalid label. Allowed values: ' + ','.join(EncodeElementGeneLink.ALLOWED_LABELS)) if source not in EncodeElementGeneLink.ALLOWED_SOURCES: - raise ValueError('Ivalid source. Allowed values: ' + + raise ValueError('Invalid source. 
Allowed values: ' + ','.join(EncodeElementGeneLink.ALLOWED_SOURCES)) self.filepath = filepath @@ -106,17 +105,12 @@ def __init__(self, filepath, label, source, source_url, biological_context, dry_ self.biological_context = biological_context self.dry_run = dry_run self.type = 'edge' - if(self.label in ['donor', 'ontology_term', 'regulatory_region']): + if (self.label in ['donor', 'ontology_term', 'regulatory_region']): self.type = 'node' - self.output_filepath = '{}/{}.json'.format( - self.OUTPUT_PATH, - self.dataset - ) - - super(EncodeElementGeneLink, self).__init__() + self.writer = writer def process_file(self): - parsed_data_file = open(self.output_filepath, 'w') + self.writer.open() # Check if needs to create those hyper-hyper edges from the input file, before opening & iterating over file rows if self.label == 'regulatory_region_gene_biosample_treatment_CHEBI': treatments = self.get_treatment_info() @@ -145,8 +139,8 @@ def process_file(self): return else: _props = self.get_biosample_term_info() - json.dump(_props, parsed_data_file) - parsed_data_file.write('\n') + self.writer.write(json.dumps(_props)) + self.writer.write('\n') with gzip.open(self.filepath, 'rt') as input_file: reader = csv.reader(input_file, delimiter='\t') @@ -179,8 +173,8 @@ def process_file(self): 'source_url': self.source_url, 'biological_context': 'ontology_terms/' + self.biological_context } - json.dump(_props, parsed_data_file) - parsed_data_file.write('\n') + self.writer.write(json.dumps(_props)) + self.writer.write('\n') elif self.label == 'regulatory_region': _id = regulatory_element_id @@ -217,8 +211,8 @@ def process_file(self): class_name, regulatory_element_id)) continue - json.dump(_props, parsed_data_file) - parsed_data_file.write('\n') + self.writer.write(json.dumps(_props)) + self.writer.write('\n') elif self.label == 'regulatory_region_gene_biosample': # edge --(hyper-edge)--> biosample (ontology_term) @@ -236,8 +230,8 @@ def process_file(self): 'source': self.source, 'source_url': self.source_url } - json.dump(_props, parsed_data_file) - parsed_data_file.write('\n') + self.writer.write(json.dumps(_props)) + self.writer.write('\n') elif self.label == 'regulatory_region_gene_biosample_treatment_CHEBI': # hyper-edge --(hyper-hyper-edge)--> treatment (ontology_term) @@ -265,8 +259,8 @@ def process_file(self): 'source': self.source, 'source_url': self.source_url } - json.dump(_props, parsed_data_file) - parsed_data_file.write('\n') + self.writer.write(json.dumps(_props)) + self.writer.write('\n') elif self.label == 'regulatory_region_gene_biosample_treatment_protein': # hyper-edge --(hyper-hyper-edge)--> treatment (protein) @@ -295,8 +289,8 @@ def process_file(self): 'source': self.source, 'source_url': self.source_url } - json.dump(_props, parsed_data_file) - parsed_data_file.write('\n') + self.writer.write(json.dumps(_props)) + self.writer.write('\n') elif self.label == 'regulatory_region_gene_biosample_donor': # hyper-edge --(hyper-hyper-edge)--> donor @@ -315,8 +309,8 @@ def process_file(self): 'source': self.source, 'source_url': self.source_url, } - json.dump(_props, parsed_data_file) - parsed_data_file.write('\n') + self.writer.write(json.dumps(_props)) + self.writer.write('\n') elif self.label == 'donor': for donor in donors: @@ -333,10 +327,9 @@ def process_file(self): 'source': 'ENCODE', 'source_url': self.source_url, } - json.dump(_props, parsed_data_file) - parsed_data_file.write('\n') - parsed_data_file.close() - self.save_to_arango() + self.writer.write(json.dumps(_props)) + 
self.writer.write('\n') + self.writer.close() def get_treatment_info(self): # get the treatment info of its annotation from the file url @@ -390,12 +383,3 @@ def get_biosample_term_info(self): } return props - - def save_to_arango(self): - if self.dry_run: - print(self.arangodb()[0]) - else: - os.system(self.arangodb()[0]) - - def arangodb(self): - return ArangoDB().generate_json_import_statement(self.output_filepath, self.collection, type=self.type) diff --git a/data/adapters/encode_mpra_adapter.py b/data/adapters/encode_mpra_adapter.py index 4d31a935..0b3caecb 100644 --- a/data/adapters/encode_mpra_adapter.py +++ b/data/adapters/encode_mpra_adapter.py @@ -1,17 +1,17 @@ import csv import gzip import json -import os -from adapters import Adapter -from adapters.helpers import build_regulatory_region_id -from db.arango_db import ArangoDB +from typing import Optional +from adapters.helpers import build_regulatory_region_id +from adapters.writer import Writer # Example rows from ENCODE lenti-MPRA bed file ENCFF802FUV.bed: (the last two columns are the same for all rows) # Column 7: activity score (i.e. log2(RNA/DNA)); Column 8: DNA count; Column 9: RNA count # chr1 10410 10610 HepG2_DNasePeakNoPromoter1 212 + -0.843 0.307 0.171 -1 -1 -class EncodeMPRA(Adapter): + +class EncodeMPRA: SOURCE = 'ENCODE_MPRA' @@ -19,9 +19,8 @@ class EncodeMPRA(Adapter): 'regulatory_region', 'regulatory_region_biosample' ] - OUTPUT_PATH = './parsed-data' - def __init__(self, filepath, label, source_url, biological_context, dry_run=True): # other? + def __init__(self, filepath, label, source_url, biological_context, dry_run=True, writer: Optional[Writer] = None, **kwargs): if label not in EncodeMPRA.ALLOWED_LABELS: raise ValueError('Ivalid label. Allowed values: ' + ','.join(EncodeMPRA.ALLOWED_LABELS)) @@ -35,16 +34,10 @@ def __init__(self, filepath, label, source_url, biological_context, dry_run=True self.type = 'edge' if(self.label == 'regulatory_region'): self.type = 'node' - - self.output_filepath = '{}/{}.json'.format( - self.OUTPUT_PATH, - self.dataset - ) - - super(EncodeMPRA, self).__init__() + self.writer = writer def process_file(self): - parsed_data_file = open(self.output_filepath, 'w') + self.writer.open() with gzip.open(self.filepath, 'rt') as mpra_file: mpra_csv = csv.reader(mpra_file, delimiter='\t') for row in mpra_csv: @@ -68,8 +61,8 @@ def process_file(self): 'source_url': self.source_url } - json.dump(_props, parsed_data_file) - parsed_data_file.write('\n') + self.writer.write(json.dumps(_props)) + self.writer.write('\n') elif self.label == 'regulatory_region_biosample': _id = '_'.join( @@ -90,16 +83,6 @@ def process_file(self): 'source': EncodeMPRA.SOURCE, 'source_url': self.source_url } - json.dump(_props, parsed_data_file) - parsed_data_file.write('\n') - parsed_data_file.close() - self.save_to_arango() - - def save_to_arango(self): - if self.dry_run: - print(self.arangodb()[0]) - else: - os.system(self.arangodb()[0]) - - def arangodb(self): - return ArangoDB().generate_json_import_statement(self.output_filepath, self.collection, type=self.type) + self.writer.write(json.dumps(_props)) + self.writer.write('\n') + self.writer.close() diff --git a/data/adapters/favor_adapter.py b/data/adapters/favor_adapter.py index 9bfb3b96..81c95526 100644 --- a/data/adapters/favor_adapter.py +++ b/data/adapters/favor_adapter.py @@ -1,14 +1,13 @@ +import json +from typing import Optional from ga4gh.vrs.extras.translator import Translator from ga4gh.vrs.dataproxy import create_dataproxy from biocommons.seqrepo 
import SeqRepo -from adapters import Adapter from adapters.helpers import build_variant_id from scripts.variants_spdi import build_spdi, build_hgvs_from_spdi -from db.arango_db import ArangoDB -import json -import os +from adapters.writer import Writer # Example file format for FAVOR (from chr 21) @@ -58,14 +57,11 @@ # RFullDB/ucsc_info=ENST00000612610.4,ENST00000620481.4,ENST00000623795.1,ENST00000623903.3,ENST00000623960.3 -class Favor(Adapter): +class Favor: # Originally 1-based coordinate system # Converted to 0-based DATASET = 'favor' - OUTPUT_PATH = './parsed-data' - - WRITE_THRESHOLD = 1000000 NUMERIC_FIELDS = ['start_position', 'end_position', 'vid', 'linsight', 'gc', 'cpg', 'priphcons', 'mamphcons', 'verphcons', 'priphylop', 'mamphylop', 'verphylop', 'bstatistic', 'freq10000bp', 'rare10000', 'k36_umap', 'k50_umap', 'k100_uma', 'nucdiv'] @@ -91,19 +87,13 @@ class Favor(Adapter): 'rare10000', 'k36_umap', 'k50_umap', 'k100_uma', 'nucdiv' ] - def __init__(self, filepath=None, chr_x_y=None, dry_run=True): + def __init__(self, filepath=None, chr_x_y=None, dry_run=True, writer: Optional[Writer] = None, **kwargs): self.filepath = filepath self.dataset = Favor.DATASET self.label = Favor.DATASET - self.output_filepath = '{}/{}-{}.json'.format( - Favor.OUTPUT_PATH, - self.dataset, - filepath.split('/')[-1], - ) self.dry_run = dry_run self.chr_x_y = chr_x_y - - super(Favor, self).__init__() + self.writer = writer def convert_freq_value(self, value): if value == '.': @@ -177,8 +167,7 @@ def parse_metadata(self, info): return info_obj def process_file(self): - parsed_data_file = open(self.output_filepath, 'w') - + self.writer.open() # Install instructions: https://github.com/biocommons/biocommons.seqrepo dp = create_dataproxy( 'seqrepo+file:///usr/local/share/seqrepo/2018-11-26') @@ -186,7 +175,6 @@ def process_file(self): translator = Translator(data_proxy=dp) reading_data = False - record_count = 0 json_objects = [] json_object_keys = set() @@ -263,35 +251,14 @@ def process_file(self): store_json = json_objects.pop(0) json_object_keys.remove(store_json['_key']) - json.dump(store_json, parsed_data_file) - parsed_data_file.write('\n') - record_count += 1 + self.writer.write(json.dumps(store_json)) + self.writer.write('\n') else: json_objects = [to_json] json_object_keys.add(to_json['_key']) - if record_count > Favor.WRITE_THRESHOLD: - parsed_data_file.close() - self.save_to_arango() - - os.remove(self.output_filepath) - record_count = 0 - - parsed_data_file = open(self.output_filepath, 'w') - for object in json_objects: - json.dump(object, parsed_data_file) - parsed_data_file.write('\n') - record_count += 1 - - parsed_data_file.close() - self.save_to_arango() - - def arangodb(self): - return ArangoDB().generate_json_import_statement(self.output_filepath, self.collection) + self.writer.write(json.dumps(object)) + self.writer.write('\n') - def save_to_arango(self): - if self.dry_run: - print(self.arangodb()[0]) - else: - os.system(self.arangodb()[0]) + self.writer.close() diff --git a/data/adapters/gaf_adapter.py b/data/adapters/gaf_adapter.py index 80b53aa8..001d345b 100644 --- a/data/adapters/gaf_adapter.py +++ b/data/adapters/gaf_adapter.py @@ -1,13 +1,12 @@ -import os import gzip import json import hashlib import pickle +from typing import Optional from Bio.UniProt.GOA import gafiterator -from adapters import Adapter -from db.arango_db import ArangoDB +from adapters.writer import Writer # GAF files are defined here: https://geneontology.github.io/docs/go-annotation-file-gaf-format-2.2/ # @@ 
-43,9 +42,8 @@ # URS0000000C0D ENSEMBL_GENCODE ENST00000582841 9606 lncRNA ENSG00000265443.1 # URS0000000CF3 ENSEMBL_GENCODE ENST00000414886 9606 lncRNA ENSG00000226856.9 -class GAF(Adapter): +class GAF: DATASET = 'gaf' - OUTPUT_PATH = './parsed-data' RNACENTRAL_ID_MAPPING_PATH = './samples/rnacentral_ensembl_gencode.tsv.gz' # generated from current proteins collection in the Catalog MOUSE_MGI_TO_UNIPROT_PATH = './data_loading_support_files/mgi_to_ensembl.pkl' @@ -57,9 +55,9 @@ class GAF(Adapter): 'rnacentral': 'https://ftp.ebi.ac.uk/pub/databases/RNAcentral/current_release/id_mapping/database_mappings/ensembl_gencode.tsv' } - def __init__(self, filepath, gaf_type='human', dry_run=True): + def __init__(self, filepath, gaf_type='human', dry_run=True, writer: Optional[Writer] = None, **kwargs): if gaf_type not in GAF.SOURCES.keys(): - raise ValueError('Ivalid type. Allowed values: ' + + raise ValueError('Invalid type. Allowed values: ' + ', '.join(GAF.SOURCES.keys())) self.filepath = filepath @@ -67,13 +65,7 @@ def __init__(self, filepath, gaf_type='human', dry_run=True): self.label = GAF.DATASET self.dry_run = dry_run self.type = gaf_type - self.output_filepath = '{}/{}-{}.json'.format( - GAF.OUTPUT_PATH, - self.dataset, - filepath.split('/')[-1] - ) - - super(GAF, self).__init__() + self.writer = writer def load_rnacentral_mapping(self): self.rnacentral_mapping = {} @@ -88,7 +80,7 @@ def load_mouse_mgi_to_uniprot(self): open(GAF.MOUSE_MGI_TO_UNIPROT_PATH, 'rb')) def process_file(self): - parsed_data_file = open(self.output_filepath, 'w') + self.writer.open() if self.type == 'rna': self.load_rnacentral_mapping() @@ -156,20 +148,7 @@ def process_file(self): props['name'] = 'has the function' props['inverse_name'] = 'is a function of' - json.dump(props, parsed_data_file) - parsed_data_file.write('\n') - - parsed_data_file.close() - self.save_to_arango() + self.writer.write(json.dumps(props)) + self.writer.write('\n') - def save_to_arango(self): - if self.dry_run: - print(self.arangodb()[0]) - else: - os.system(self.arangodb()[0]) - - def arangodb(self): - collection = 'go_terms_annotations' - if self.type == 'mouse': - collection = 'go_terms_mm_proteins' - return ArangoDB().generate_json_import_statement(self.output_filepath, collection, type='edges') + self.writer.close() diff --git a/data/adapters/gencode_adapter.py b/data/adapters/gencode_adapter.py index cfdf593a..0a62c186 100644 --- a/data/adapters/gencode_adapter.py +++ b/data/adapters/gencode_adapter.py @@ -1,8 +1,7 @@ import json -import os -from adapters import Adapter -from db.arango_db import ArangoDB +from typing import Optional +from adapters.writer import Writer # Example genocde gtf input file: # ##description: evidence-based annotation of the human genome (GRCh38), version 43 (Ensembl 109) # ##provider: GENCODE @@ -15,7 +14,7 @@ # chr1 HAVANA exon 12613 12721 . + . 
gene_id "ENSG00000290825.1"; transcript_id "ENST00000456328.2"; gene_type "lncRNA"; gene_name "DDX11L2"; transcript_type "lncRNA"; transcript_name "DDX11L2-202"; exon_number 2; exon_id "ENSE00003582793.1"; level 2; transcript_support_level "1"; tag "basic"; tag "Ensembl_canonical"; havana_transcript "OTTHUMT00000362751.1"; -class Gencode(Adapter): +class Gencode: ALLOWED_LABELS = ['gencode_transcript', 'mm_gencode_transcript', 'transcribed_to', 'transcribed_from'] @@ -24,11 +23,10 @@ class Gencode(Adapter): ALLOWED_ORGANISMS = ['HUMAN', 'MOUSE'] INDEX = {'chr': 0, 'type': 2, 'coord_start': 3, 'coord_end': 4, 'info': 8} - OUTPUT_PATH = './parsed-data' - def __init__(self, filepath=None, label='gencode_transcript', organism='HUMAN', chr='all', dry_run=True): + def __init__(self, filepath=None, label='gencode_transcript', organism='HUMAN', chr='all', dry_run=True, writer: Optional[Writer] = None, **kwargs): if label not in Gencode.ALLOWED_LABELS: - raise ValueError('Ivalid labelS. Allowed values: ' + + raise ValueError('Invalid labelS. Allowed values: ' + ','.join(Gencode.ALLOWED_LABELS)) self.filepath = filepath @@ -49,13 +47,7 @@ def __init__(self, filepath=None, label='gencode_transcript', organism='HUMAN', self.type = 'edge' if(self.label in ['gencode_transcript', 'mm_gencode_transcript']): self.type = 'node' - - self.output_filepath = '{}/{}.json'.format( - self.OUTPUT_PATH, - self.dataset - ) - - super(Gencode, self).__init__() + self.writer = writer def parse_info_metadata(self, info): parsed_info = {} @@ -65,7 +57,7 @@ def parse_info_metadata(self, info): return parsed_info def process_file(self): - parsed_data_file = open(self.output_filepath, 'w') + self.writer.open() for line in open(self.filepath, 'r'): if line.startswith('#'): continue @@ -97,8 +89,9 @@ def process_file(self): 'version': self.version, 'source_url': self.source_url } - json.dump(props, parsed_data_file) - parsed_data_file.write('\n') + self.writer.write(json.dumps(props)) + self.writer.write('\n') + elif self.label == 'transcribed_to': _id = gene_key + '_' + transcript_key _source = self.gene_endpoint + gene_key @@ -114,8 +107,9 @@ def process_file(self): 'inverse_name': 'transcribed by', 'biological_process': 'ontology_terms/GO_0010467' } - json.dump(_props, parsed_data_file) - parsed_data_file.write('\n') + self.writer.write(json.dumps(_props)) + self.writer.write('\n') + elif self.label == 'transcribed_from': _id = transcript_key + '_' + gene_key _source = self.transcript_endpoint + transcript_key @@ -131,19 +125,9 @@ def process_file(self): 'inverse_name': 'transcribes', 'biological_process': 'ontology_terms/GO_0010467' } - json.dump(_props, parsed_data_file) - parsed_data_file.write('\n') + self.writer.write(json.dumps(_props)) + self.writer.write('\n') except: print( f'fail to process for label to load: {self.label}, data: {line}') - parsed_data_file.close() - self.save_to_arango() - - def save_to_arango(self): - if self.dry_run: - print(self.arangodb()[0]) - else: - os.system(self.arangodb()[0]) - - def arangodb(self): - return ArangoDB().generate_json_import_statement(self.output_filepath, self.collection, type=self.type) + self.writer.close() diff --git a/data/adapters/gencode_gene_adapter.py b/data/adapters/gencode_gene_adapter.py index 4c99f420..da8af488 100644 --- a/data/adapters/gencode_gene_adapter.py +++ b/data/adapters/gencode_gene_adapter.py @@ -1,9 +1,8 @@ -from adapters import Adapter import gzip import json -import os -from db.arango_db import ArangoDB +from typing import Optional +from 
adapters.writer import Writer # Example genocde gtf input file: # ##description: evidence-based annotation of the human genome (GRCh38), version 43 (Ensembl 109) @@ -17,30 +16,24 @@ # chr1 HAVANA exon 12613 12721 . + . gene_id "ENSG00000290825.1"; transcript_id "ENST00000456328.2"; gene_type "lncRNA"; gene_name "DDX11L2"; transcript_type "lncRNA"; transcript_name "DDX11L2-202"; exon_number 2; exon_id "ENSE00003582793.1"; level 2; transcript_support_level "1"; tag "basic"; tag "Ensembl_canonical"; havana_transcript "OTTHUMT00000362751.1"; -class GencodeGene(Adapter): +class GencodeGene: ALLOWED_KEYS = ['gene_id', 'gene_type', 'gene_name', 'transcript_id', 'transcript_type', 'transcript_name', 'hgnc_id', 'mgi_id'] INDEX = {'chr': 0, 'type': 2, 'coord_start': 3, 'coord_end': 4, 'info': 8} - OUTPUT_FOLDER = './parsed-data' ALLOWED_LABELS = [ 'gencode_gene', 'mm_gencode_gene', ] - def __init__(self, filepath=None, gene_alias_file_path=None, chr='all', label='gencode_gene', dry_run=False): + def __init__(self, filepath=None, gene_alias_file_path=None, chr='all', label='gencode_gene', dry_run=False, writer: Optional[Writer] = None, **kwargs): if label not in GencodeGene.ALLOWED_LABELS: - raise ValueError('Ivalid label. Allowed values: ' + + raise ValueError('Invalid label. Allowed values: ' + ','.join(GencodeGene.ALLOWED_LABELS)) self.filepath = filepath self.chr = chr self.label = label self.gene_alias_file_path = gene_alias_file_path - if not os.path.exists(GencodeGene.OUTPUT_FOLDER): - os.makedirs(GencodeGene.OUTPUT_FOLDER) - self.output_filepath = '{}/{}.json'.format( - GencodeGene.OUTPUT_FOLDER, - self.label, - ) + self.writer = writer self.dry_run = dry_run if self.label == 'gencode_gene': self.version = 'v43' @@ -49,8 +42,6 @@ def __init__(self, filepath=None, gene_alias_file_path=None, chr='all', label='g self.version = 'vM33' self.source_url = 'https://www.gencodegenes.org/mouse/' - super(GencodeGene, self).__init__() - def parse_info_metadata(self, info): parsed_info = {} for key, value in zip(info, info[1:]): @@ -138,7 +129,7 @@ def get_entrez_id(self, alias): def process_file(self): alias_dict = self.get_collection_alias() - parsed_data_file = open(self.output_filepath, 'w') + self.writer.open() for line in open(self.filepath, 'r'): if line.startswith('#'): continue @@ -185,17 +176,6 @@ def process_file(self): 'entrez': alias['entrez'] } ) - json.dump(to_json, parsed_data_file) - parsed_data_file.write('\n') - - parsed_data_file.close() - self.save_to_arango() - - def arangodb(self): - return ArangoDB().generate_json_import_statement(self.output_filepath, self.collection) - - def save_to_arango(self): - if self.dry_run: - print(self.arangodb()[0]) - else: - os.system(self.arangodb()[0]) + self.writer.write(json.dumps(to_json)) + self.writer.write('\n') + self.writer.close() diff --git a/data/adapters/gencode_gene_structure_adapter.py b/data/adapters/gencode_gene_structure_adapter.py index cf635d65..e5e2dfe0 100644 --- a/data/adapters/gencode_gene_structure_adapter.py +++ b/data/adapters/gencode_gene_structure_adapter.py @@ -1,8 +1,7 @@ -from adapters import Adapter -import gzip import json -import os -from db.arango_db import ArangoDB +from typing import Optional + +from adapters.writer import Writer # Example genocde gtf input file: # ##description: evidence-based annotation of the human genome (GRCh38), version 43 (Ensembl 109) @@ -18,7 +17,7 @@ # Column three has the gene structure info we want to load. 
Each exon can have substructures of CDS, UTR, start_condon, and stop_condon, which will have the same exon_id. -class GencodeStructure(Adapter): +class GencodeStructure: ALLOWED_KEYS = ['gene_id', 'gene_name', 'transcript_id', 'transcript_name', 'exon_number', 'exon_id'] @@ -36,11 +35,9 @@ class GencodeStructure(Adapter): 'mm_transcript_contains_mm_gene_structure' ] - OUTPUT_FOLDER = './parsed-data' - - def __init__(self, filepath=None, chr='all', label='gene_structure', dry_run=True): + def __init__(self, filepath=None, chr='all', label='gene_structure', dry_run=True, writer: Optional[Writer] = None, **kwargs): if label not in GencodeStructure.ALLOWED_LABELS: - raise ValueError('Ivalid label. Allowed values: ' + + raise ValueError('Invalid label. Allowed values: ' + ','.join(GencodeStructure.ALLOWED_LABELS)) self.filepath = filepath self.chr = chr @@ -56,11 +53,6 @@ def __init__(self, filepath=None, chr='all', label='gene_structure', dry_run=Tru if self.label == 'mm_transcript_contains_mm_gene_structure': self.transcript_endpoint = 'mm_transcripts/' self.gene_structure_endpoint = 'mm_genes_structure/' - self.output_filepath = '{}/{}.json'.format( - GencodeStructure.OUTPUT_FOLDER, - self.label - ) - self.SKIP_BIOCYPHER = True if self.label in ['gene_structure', 'transcript_contains_gene_structure']: self.version = 'v43' @@ -69,8 +61,7 @@ def __init__(self, filepath=None, chr='all', label='gene_structure', dry_run=Tru self.organism = 'Mus musculus' self.version = 'vM33' self.source_url = 'https://www.gencodegenes.org/mouse/' - - super(GencodeStructure, self).__init__() + self.writer = writer def parse_info_metadata(self, info): parsed_info = {} @@ -80,7 +71,7 @@ def parse_info_metadata(self, info): return parsed_info def process_file(self): - parsed_data_file = open(self.output_filepath, 'w') + self.writer.open() UTR_keys = set() exon_transcript = None last_exon_end = 0 @@ -148,8 +139,8 @@ def process_file(self): 'inverse_name': 'contained in' } - json.dump(to_json, parsed_data_file) - parsed_data_file.write('\n') + self.writer.write(json.dumps(to_json)) + self.writer.write('\n') # checked the gtf file is sorted by transcript_id & exon_number so this should work if gene_structure_type == 'exon': @@ -194,8 +185,8 @@ def process_file(self): 'inverse_name': 'contained in' } - json.dump(to_json, parsed_data_file) - parsed_data_file.write('\n') + self.writer.write(json.dumps(to_json)) + self.writer.write('\n') exon_transcript = info['transcript_id'] # the 'closer' end to the next exon @@ -203,14 +194,4 @@ def process_file(self): split_line[GencodeStructure.INDEX['coord_end']]) if split_line[GencodeStructure.INDEX['strand']] == '+' else int( split_line[GencodeStructure.INDEX['coord_start']]) - parsed_data_file.close() - self.save_to_arango() - - def arangodb(self): - return ArangoDB().generate_json_import_statement(self.output_filepath, self.collection, type=self.type) - - def save_to_arango(self): - if self.dry_run: - print(self.arangodb()[0]) - else: - os.system(self.arangodb()[0]) + self.writer.close() diff --git a/data/adapters/gtex_eqtl_adapter.py b/data/adapters/gtex_eqtl_adapter.py index 36dbaf5b..04f8b98b 100644 --- a/data/adapters/gtex_eqtl_adapter.py +++ b/data/adapters/gtex_eqtl_adapter.py @@ -4,12 +4,10 @@ import os import gzip from math import log10 +from typing import Optional -from adapters import Adapter from adapters.helpers import build_variant_id, to_float -from db.arango_db import ArangoDB - - +from adapters.writer import Writer # Example QTEx eQTL input file: # variant_id 
gene_id tss_distance ma_samples ma_count maf pval_nominal slope slope_se pval_nominal_threshold min_pval_nominal pval_beta # chr1_845402_A_G_b38 ENSG00000225972.1 216340 4 4 0.0155039 2.89394e-06 2.04385 0.413032 2.775e-05 2.89394e-06 0.00337661 @@ -20,19 +18,17 @@ # Brain - Amygdala Brain_Amygdala UBERON:0001876 -class GtexEQtl(Adapter): +class GtexEQtl: # 1-based coordinate system in variant_id ALLOWED_LABELS = ['GTEx_eqtl', 'GTEx_eqtl_term'] SOURCE = 'GTEx' SOURCE_URL_PREFIX = 'https://storage.googleapis.com/adult-gtex/bulk-qtl/v8/single-tissue-cis-qtl/GTEx_Analysis_v8_eQTL/' ONTOLOGY_ID_MAPPING_PATH = './data_loading_support_files/GTEx_UBERON_mapping.tsv' MAX_LOG10_PVALUE = 400 # based on max p_value from eqtl dataset - OUTPUT_PATH = './parsed-data' - - def __init__(self, filepath, label='GTEx_eqtl', dry_run=True): + def __init__(self, filepath=None, label='GTEx_eqtl', dry_run=True, writer: Optional[Writer] = None, **kwargs): if label not in GtexEQtl.ALLOWED_LABELS: - raise ValueError('Ivalid label. Allowed values: ' + + raise ValueError('Invalid label. Allowed values: ' + ','.join(GtexEQtl.ALLOWED_LABELS)) self.filepath = filepath @@ -40,16 +36,11 @@ def __init__(self, filepath, label='GTEx_eqtl', dry_run=True): self.label = label self.dry_run = dry_run self.type = 'edge' - self.output_filepath = '{}/{}.json'.format( - self.OUTPUT_PATH, - self.dataset - ) - - super(GtexEQtl, self).__init__() + self.writer = writer def process_file(self): - parsed_data_file = open(self.output_filepath, 'w') self.load_ontology_mapping() + self.writer.open() # Iterate over all tissues in the folder, example filename: Brain_Amygdala.v8.signif_variant_gene_pairs.txt.gz # Note: The server was crashed due to memory issues when iterating all the 49 tissues at once, had to split the files into 4 folders instead when loading. @@ -57,6 +48,7 @@ def process_file(self): if filename.endswith('signif_variant_gene_pairs.txt.gz'): print('Loading ' + filename) filename_biological_context = filename.split('.')[0] + print('Biological context: ' + filename_biological_context) if self.label == 'GTEx_eqtl_term': ontology_id = self.ontology_id_mapping.get( @@ -113,15 +105,10 @@ def process_file(self): 'pval_beta': to_float(row[-1]), 'label': 'eQTL', 'source': GtexEQtl.SOURCE, - 'source_url': GtexEQtl.SOURCE_URL_PREFIX + filename, - 'name': 'modulates expression of', - 'inverse_name': 'expression modulated by', - 'biological_process': 'ontology_terms/GO_0010468' + 'source_url': GtexEQtl.SOURCE_URL_PREFIX + filename } - json.dump(_props, parsed_data_file) - parsed_data_file.write('\n') - + self.writer.write(json.dumps(_props) + '\n') except: print(row) pass @@ -142,15 +129,13 @@ def process_file(self): 'name': 'occurs in', 'inverse_name': 'has measurement' } + print('_props:', _props) - json.dump(_props, parsed_data_file) - parsed_data_file.write('\n') - - except: + self.writer.write(json.dumps(_props) + '\n') + except Exception as e: print(row) pass - parsed_data_file.close() - self.save_to_arango() + self.writer.close() def load_ontology_mapping(self): self.ontology_id_mapping = {} # e.g. 
key: 'Brain_Amygdala', value: 'UBERON_0001876' @@ -164,12 +149,3 @@ def load_ontology_mapping(self): if row[1]: self.ontology_id_mapping[row[1]] = row[2].replace(':', '_') self.ontology_term_mapping[row[1]] = row[3] - - def save_to_arango(self): - if self.dry_run: - print(self.arangodb()[0]) - else: - os.system(self.arangodb()[0]) - - def arangodb(self): - return ArangoDB().generate_json_import_statement(self.output_filepath, self.collection, type=self.type) diff --git a/data/adapters/gtex_sqtl_adapter.py b/data/adapters/gtex_sqtl_adapter.py index abf0c643..ea2b73d4 100644 --- a/data/adapters/gtex_sqtl_adapter.py +++ b/data/adapters/gtex_sqtl_adapter.py @@ -4,9 +4,10 @@ import hashlib import csv from math import log10 -from adapters import Adapter +from typing import Optional + from adapters.helpers import build_variant_id, to_float -from db.arango_db import ArangoDB +from adapters.writer import Writer # The splice QTLs from GTEx are here: https://storage.googleapis.com/adult-gtex/bulk-qtl/v8/single-tissue-cis-qtl/GTEx_Analysis_v8_sQTL.tar # All the files use assembly grch38 @@ -29,17 +30,16 @@ # Brain - Amygdala Brain_Amygdala UBERON:0001876 -class GtexSQtl(Adapter): +class GtexSQtl: ALLOWED_LABELS = ['GTEx_splice_QTL', 'GTEx_splice_QTL_term'] SOURCE = 'GTEx' SOURCE_URL_PREFIX = 'https://storage.googleapis.com/adult-gtex/bulk-qtl/v8/single-tissue-cis-qtl/GTEx_Analysis_v8_sQTL/' ONTOLOGY_ID_MAPPING_PATH = './data_loading_support_files/GTEx_UBERON_mapping.tsv' # same as eqtl MAX_LOG10_PVALUE = 400 # based on max p_value from sqtl dataset - OUTPUT_PATH = './parsed-data' - def __init__(self, filepath, label='GTEx_splice_QTL', dry_run=True): + def __init__(self, filepath, label='GTEx_splice_QTL', dry_run=True, writer: Optional[Writer] = None, **kwargs): if label not in GtexSQtl.ALLOWED_LABELS: - raise ValueError('Ivalid label. Allowed values: ' + + raise ValueError('Invalid label. Allowed values: ' + ','.join(GtexSQtl.ALLOWED_LABELS)) self.filepath = filepath @@ -47,15 +47,10 @@ def __init__(self, filepath, label='GTEx_splice_QTL', dry_run=True): self.label = label self.dry_run = dry_run self.type = 'edge' - self.output_filepath = '{}/{}.json'.format( - self.OUTPUT_PATH, - self.dataset - ) - - super(GtexSQtl, self).__init__() + self.writer = writer def process_file(self): - parsed_data_file = open(self.output_filepath, 'w') + self.writer.open() self.load_ontology_mapping() # Iterate over all tissues in the folder, example filename: Brain_Amygdala.v8.sqtl_signifpairs.txt.gz @@ -132,8 +127,8 @@ def process_file(self): 'inverse_name': 'splicing modulated by', 'biological_process': 'ontology_terms/GO_0043484' } - json.dump(_props, parsed_data_file) - parsed_data_file.write('\n') + self.writer.write(json.dumps(_props)) + self.writer.write('\n') except: print( @@ -156,15 +151,15 @@ def process_file(self): 'inverse_name': 'has measurement' } - json.dump(_props, parsed_data_file) - parsed_data_file.write('\n') + self.writer.write(json.dumps(_props)) + self.writer.write('\n') except: print( f'fail to process edge for GTEx sQTL: {variant_id_info} and {phenotype_id}') pass - parsed_data_file.close() - self.save_to_arango() + + self.writer.close() def load_ontology_mapping(self): self.ontology_id_mapping = {} # e.g. 
key: 'Brain_Amygdala', value: 'UBERON_0001876' @@ -178,12 +173,3 @@ def load_ontology_mapping(self): if row[1]: self.ontology_id_mapping[row[1]] = row[2].replace(':', '_') self.ontology_term_mapping[row[1]] = row[3] - - def save_to_arango(self): - if self.dry_run: - print(self.arangodb()[0]) - else: - os.system(self.arangodb()[0]) - - def arangodb(self): - return ArangoDB().generate_json_import_statement(self.output_filepath, self.collection, type=self.type) diff --git a/data/adapters/gvatdb_asb_adapter.py b/data/adapters/gvatdb_asb_adapter.py index 734da22d..ba85dd64 100644 --- a/data/adapters/gvatdb_asb_adapter.py +++ b/data/adapters/gvatdb_asb_adapter.py @@ -1,11 +1,12 @@ import csv import json -import os import pickle -from adapters import Adapter -from adapters.helpers import build_variant_id from math import log10 -from db.arango_db import ArangoDB +from typing import Optional + +from adapters.helpers import build_variant_id +from adapters.writer import Writer + # Example rows from GVATdb_hg38.csv: the tested variants are in the center position of the oligo # The first three columns are variants coordinates in hg38, # which are liftovered from the hg19 coordinates in the original file GVATdb.csv @@ -18,27 +19,21 @@ # chr1,940255,940256,chr1_875636_C_T,C,T,ASCL1,novel_batch,4.21438970378755,0.000343998624005503,1.0421349006409,0.632781468874124 -class ASB_GVATDB(Adapter): +class ASB_GVATDB: TF_ID_MAPPING_PATH = './data_loading_support_files/GVATdb_TF_mapping.pkl' SOURCE = 'GVATdb allele-specific TF binding calls' SOURCE_URL = 'https://renlab.sdsc.edu/GVATdb/' - OUTPUT_PATH = './parsed-data' - def __init__(self, filepath, label, dry_run=True): + def __init__(self, filepath, label, dry_run=True, writer: Optional[Writer] = None, **kwargs): self.filepath = filepath self.label = label self.dataset = label self.dry_run = dry_run self.type = 'edge' - self.output_filepath = '{}/{}.json'.format( - self.OUTPUT_PATH, - self.dataset - ) - - super(ASB_GVATDB, self).__init__() + self.writer = writer def process_file(self): - parsed_data_file = open(self.output_filepath, 'w') + self.writer.open() self.load_tf_uniprot_id_mapping() with open(self.filepath, 'r') as asb_file: @@ -98,22 +93,12 @@ def process_file(self): 'biological_process': 'ontology_terms/GO_0051101' } - json.dump(_props, parsed_data_file) - parsed_data_file.write('\n') - parsed_data_file.close() - self.save_to_arango() + self.writer.write(json.dumps(_props)) + self.writer.write('\n') + self.writer.close() def load_tf_uniprot_id_mapping(self): # map tf names to uniprot ids self.tf_uniprot_id_mapping = {} with open(ASB_GVATDB.TF_ID_MAPPING_PATH, 'rb') as tf_uniprot_id_mapfile: self.tf_uniprot_id_mapping = pickle.load(tf_uniprot_id_mapfile) - - def save_to_arango(self): - if self.dry_run: - print(self.arangodb()[0]) - else: - os.system(self.arangodb()[0]) - - def arangodb(self): - return ArangoDB().generate_json_import_statement(self.output_filepath, self.collection, type=self.type) diff --git a/data/adapters/gwas_adapter.py b/data/adapters/gwas_adapter.py index 73c3e87d..3ea2d30f 100644 --- a/data/adapters/gwas_adapter.py +++ b/data/adapters/gwas_adapter.py @@ -1,11 +1,11 @@ -import os import json import hashlib import pickle from math import log10 -from adapters import Adapter +from typing import Optional + from adapters.helpers import build_variant_id -from db.arango_db import ArangoDB +from adapters.writer import Writer # GWAS variant to phenotype - v2d_igvf.tsv @@ -27,20 +27,19 @@ # 'eqtl']" [0.7 0.9 0.7 0. 0. 
0.3 0.9] -class GWAS(Adapter): +class GWAS: # studies, variants <-(edge)-> phenotypes, edge <-> studies (hyperedge with variant info & study-specific stats) # variants in GWAS is 1-based, need to convert gwas variant position from 1-based to 0-based MAX_LOG10_PVALUE = 27000 # max abs value on pval_exponent is 26677 ONTOLOGY_MAPPING_PATH = './data_loading_support_files/gwas_ontology_term_name_mapping.pkl' - OUTPUT_PATH = './parsed-data' ALLOWED_COLLECTIONS = ['studies', 'variants_phenotypes', 'variants_phenotypes_studies'] - def __init__(self, variants_to_ontology, variants_to_genes, gwas_collection='studies', dry_run=True): + def __init__(self, variants_to_ontology, variants_to_genes, gwas_collection='studies', dry_run=True, writer: Optional[Writer] = None, **kwargs): if gwas_collection not in GWAS.ALLOWED_COLLECTIONS: - raise ValueError('Ivalid collection. Allowed values: ' + + raise ValueError('Invalid collection. Allowed values: ' + ','.join(GWAS.ALLOWED_COLLECTIONS)) self.variants_to_ontology_filepath = variants_to_ontology @@ -57,14 +56,7 @@ def __init__(self, variants_to_ontology, variants_to_genes, gwas_collection='stu self.gwas_collection = gwas_collection self.dry_run = dry_run - - self.output_filepath = '{}/{}-{}.json'.format( - GWAS.OUTPUT_PATH, - self.gwas_collection, - variants_to_ontology.split('/')[-1] - ) - - super(GWAS, self).__init__() + self.writer = writer # trying to capture the breakline problem described in the comments above def line_appears_broken(self, row): @@ -173,7 +165,7 @@ def process_variants_phenotypes(self, row): key = hashlib.sha256( (variant_id + '_' + ontology_term_id).encode()).hexdigest() - if self.collection == 'variants_phenotypes': + if self.gwas_collection == 'variants_phenotypes': if key in self.processed_keys: return None self.processed_keys.add(key) @@ -190,6 +182,7 @@ def process_variants_phenotypes(self, row): } def process_file(self): + self.writer.open() # tagged variants & genes info go to heyperedge collection if self.gwas_collection == 'variants_phenotypes_studies': print('Collecting tagged variants...') @@ -206,9 +199,6 @@ def process_file(self): # Many records are duplicated with different tagged variants. # We are collecting all tagged variants at once. # For that, we need to keep track of which keys we already processed to avoid duplicated entries. 
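
# Standalone illustration of the dedup logic above (row values are made up):
# a variants_phenotypes edge is keyed by sha256(variant_id + '_' +
# ontology_term_id), so input records that repeat the same variant/phenotype
# pair, differing only in their tagged variants, collapse onto a single edge.

import hashlib


def edge_key(variant_id: str, ontology_term_id: str) -> str:
    # same expression as in GWAS.process_variants_phenotypes
    return hashlib.sha256(
        (variant_id + '_' + ontology_term_id).encode()).hexdigest()


processed_keys = set()
rows = [
    ('chr1_100_A_G', 'EFO_0000270'),  # hypothetical ids
    ('chr1_100_A_G', 'EFO_0000270'),  # duplicate pair, different tagged variant
]
for variant, term in rows:
    key = edge_key(variant, term)
    if key in processed_keys:
        continue  # already emitted for the variants_phenotypes collection
    processed_keys.add(key)
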
- - parsed_data_file = open(self.output_filepath, 'w') - print('Processing file...') for record in open(self.variants_to_ontology_filepath, 'r'): @@ -248,20 +238,10 @@ def process_file(self): if props is None: continue - json.dump(props, parsed_data_file) - parsed_data_file.write('\n') - - parsed_data_file.close() - self.save_to_arango() + self.writer.write(json.dumps(props)) + self.writer.write('\n') - def arangodb(self): - return ArangoDB().generate_json_import_statement(self.output_filepath, self.gwas_collection, type=self.type) - - def save_to_arango(self): - if self.dry_run: - print(self.arangodb()[0]) - else: - os.system(self.arangodb()[0]) + self.writer.close() def get_tagged_variants(self): header = None diff --git a/data/adapters/human_mouse_element_adapter.py b/data/adapters/human_mouse_element_adapter.py index 75604ff5..26ffa6fe 100644 --- a/data/adapters/human_mouse_element_adapter.py +++ b/data/adapters/human_mouse_element_adapter.py @@ -1,10 +1,10 @@ import gzip import csv import json -import os -from adapters import Adapter +from typing import Optional + from adapters.helpers import build_regulatory_region_id -from db.arango_db import ArangoDB +from adapters.writer import Writer # ENCFF078OEX – ENCODE contains a mapping of ENCODE mouse and human DNase HS regions # doc for headers: https://www.encodeproject.org/documents/924f991f-616f-4bfd-ae1f-6d22acb048b4/@@download/attachment/extended_score_txt_format.pdf @@ -22,7 +22,7 @@ # 1.100671 1438444 chr1:190772-190971 chr6:121518345-121518544 87 0.595 0 0 0.14204574384648785 0.16878 0.35295128811475457 0.45000000000000007 0.29554 0.6467372850126809 0.07660107275344429 0.290718 0.6516195185696351 0.44000000000000006 0.35876 0.749424691902632 0.09939259539511455 0.340664 0.7073855435688593 0.44000000000000006 0.34273 0.7261781138385377 0.12122052125065466 0.093576 0.6012641180271799 0.38 0.78846 0.9443415427587122 Human DHS -class HumanMouseElementAdapter(Adapter): +class HumanMouseElementAdapter: SOURCE = 'FUNCODE' ALLOWED_LABELS = [ 'regulatory_region', @@ -63,11 +63,10 @@ class HumanMouseElementAdapter(Adapter): 'cob_H3K4me3_fdr': 31, 'source': 32, } - OUTPUT_PATH = './parsed-data' - def __init__(self, filepath, label='regulatory_region_mm_regulatory_region', dry_run=True): + def __init__(self, filepath, label='regulatory_region_mm_regulatory_region', dry_run=True, writer: Optional[Writer] = None, **kwargs): if label not in HumanMouseElementAdapter.ALLOWED_LABELS: - raise ValueError('Ivalid label. Allowed values: ' + + raise ValueError('Invalid label. 
Allowed values: ' + ','.join(HumanMouseElementAdapter.ALLOWED_LABELS)) self.filepath = filepath self.label = label @@ -78,15 +77,10 @@ def __init__(self, filepath, label='regulatory_region_mm_regulatory_region', dry self.type = 'node' if(self.label == 'regulatory_region_mm_regulatory_region'): self.type = 'edge' - self.output_filepath = '{}/{}.json'.format( - self.OUTPUT_PATH, - self.dataset - ) - - super(HumanMouseElementAdapter, self).__init__() + self.writer = writer def process_file(self): - parsed_data_file = open(self.output_filepath, 'w') + self.writer.open() with gzip.open(self.filepath, 'rt') as input_file: reader = csv.reader(input_file, delimiter='\t') next(reader) @@ -111,8 +105,8 @@ def process_file(self): 'source': self.SOURCE, 'source_url': self.source_url } - json.dump(_props, parsed_data_file) - parsed_data_file.write('\n') + self.writer.write(json.dumps(_props)) + self.writer.write('\n') elif self.label == 'mm_regulatory_region': _props = { '_key': _id_mouse, @@ -123,8 +117,8 @@ def process_file(self): 'source': self.SOURCE, 'source_url': self.source_url } - json.dump(_props, parsed_data_file) - parsed_data_file.write('\n') + self.writer.write(json.dumps(_props)) + self.writer.write('\n') else: _id = _id_human + '_' + _id_mouse _target = 'regulatory_regions/' + _id_human @@ -163,16 +157,6 @@ def process_file(self): 'source': self.SOURCE, 'source_url': self.source_url } - json.dump(_props, parsed_data_file) - parsed_data_file.write('\n') - parsed_data_file.close() - self.save_to_arango() - - def save_to_arango(self): - if self.dry_run: - print(self.arangodb()[0]) - else: - os.system(self.arangodb()[0]) - - def arangodb(self): - return ArangoDB().generate_json_import_statement(self.output_filepath, self.collection, type=self.type) + self.writer.write(json.dumps(_props)) + self.writer.write('\n') + self.writer.close() diff --git a/data/adapters/mgi_human_mouse_ortholog_adapter.py b/data/adapters/mgi_human_mouse_ortholog_adapter.py index 69259225..0bf3e8e7 100644 --- a/data/adapters/mgi_human_mouse_ortholog_adapter.py +++ b/data/adapters/mgi_human_mouse_ortholog_adapter.py @@ -1,8 +1,8 @@ import json -import os import pickle -from adapters import Adapter -from db.arango_db import ArangoDB +from typing import Optional + +from adapters.writer import Writer # Sample file: # DB Class Key Common Organism Name NCBI Taxon ID Symbol EntrezGene ID Mouse MGI ID HGNC ID OMIM Gene ID Genetic Location Genome Coordinates (mouse: GRCm39 human: GRCh38) Nucleotide RefSeq IDs Protein RefSeq IDs SWISS_PROT IDs @@ -11,24 +11,18 @@ # 45916482 mouse, laboratory 10090 Hoxa4 15401 MGI:96176 Chr6 25.4 cM Chr6:52166662-52168683(-) NM_008265 NP_032291 P06798 -class MGIHumanMouseOrthologAdapter(Adapter): +class MGIHumanMouseOrthologAdapter: LABEL = 'human_mm_genes_ortholog' MGI_ENSEMBL_FILEPATH = 'data_loading_support_files/MRK_ENSEMBL.rpt' HUMAN_ENTREZ_TO_ENSEMBL_FILEPATH = './data_loading_support_files/entrez_to_ensembl.pkl' - OUTPUT_PATH = './parsed-data' - def __init__(self, filepath, dry_run=True): + def __init__(self, filepath, dry_run=True, writer: Optional[Writer] = None, **kwargs): self.filepath = filepath self.label = MGIHumanMouseOrthologAdapter.LABEL self.dataset = self.label self.dry_run = dry_run self.type = 'edge' - self.output_filepath = '{}/{}.json'.format( - self.OUTPUT_PATH, - self.dataset - ) - - super(MGIHumanMouseOrthologAdapter, self).__init__() + self.writer = writer def load_entrz_ensembl_mapping(self): with open(MGIHumanMouseOrthologAdapter.HUMAN_ENTREZ_TO_ENSEMBL_FILEPATH, 
'rb') as f: @@ -41,7 +35,7 @@ def load_mgi_ensembl_mapping(self): self.mm_gene_mapping[data_line[0]] = data_line[5] def process_file(self): - parsed_data_file = open(self.output_filepath, 'w') + self.writer.open() self.load_mgi_ensembl_mapping() self.load_entrz_ensembl_mapping() @@ -109,16 +103,6 @@ def process_file(self): 'source_url': 'https://www.informatics.jax.org/downloads/reports/HOM_MouseHumanSequence.rpt' } - json.dump(props, parsed_data_file) - parsed_data_file.write('\n') - parsed_data_file.close() - self.save_to_arango() - - def save_to_arango(self): - if self.dry_run: - print(self.arangodb()[0]) - else: - os.system(self.arangodb()[0]) - - def arangodb(self): - return ArangoDB().generate_json_import_statement(self.output_filepath, self.collection, type=self.type) + self.writer.write(json.dumps(props)) + self.writer.write('\n') + self.writer.close() diff --git a/data/adapters/motif_adapter.py b/data/adapters/motif_adapter.py index d342eacf..cc2824a9 100644 --- a/data/adapters/motif_adapter.py +++ b/data/adapters/motif_adapter.py @@ -1,9 +1,8 @@ import os import json -import csv +from typing import Optional -from adapters import Adapter -from db.arango_db import ArangoDB +from adapters.writer import Writer # Example TF motif file from HOCOMOCO (e.g. ATF1_HUMAN.H11MO.0.B.pwm), which adastra used. # Each pwm (position weight matrix) is a N x 4 matrix, where N is the length of the TF motif. @@ -21,17 +20,15 @@ # 0.7561011054759478 -0.7707228823699511 -0.2914989252431338 -0.4151773801942997 -class Motif(Adapter): +class Motif: ALLOWED_LABELS = ['motif', 'motif_protein_link'] SOURCE = 'HOCOMOCOv11' SOURCE_URL = 'hocomoco11.autosome.org/motif/' TF_ID_MAPPING_PATH = './samples/motifs/HOCOMOCOv11_core_annotation_HUMAN_mono.tsv' - OUTPUT_PATH = './parsed-data' - - def __init__(self, filepath, label='motif', dry_run=True): + def __init__(self, filepath, label='motif', dry_run=True, writer: Optional[Writer] = None, **kwargs): if label not in Motif.ALLOWED_LABELS: - raise ValueError('Ivalid label. Allowed values: ' + + raise ValueError('Invalid label. Allowed values: ' + ','.join(Motif.ALLOWED_LABELS)) self.filepath = filepath @@ -47,12 +44,7 @@ def __init__(self, filepath, label='motif', dry_run=True): self.tf_ids = Motif.TF_ID_MAPPING_PATH self.source = Motif.SOURCE self.source_url = Motif.SOURCE_URL - self.output_filepath = '{}/{}.json'.format( - Motif.OUTPUT_PATH, - self.dataset - ) - - super(Motif, self).__init__() + self.writer = writer def load_tf_uniprot_id_mapping(self): self.tf_uniprot_id_mapping = {} # e.g. 
key: 'ANDR_HUMAN'; value: 'P10275' @@ -62,7 +54,7 @@ def load_tf_uniprot_id_mapping(self): self.tf_uniprot_id_mapping[mapping[-2]] = mapping[-1] def process_file(self): - parsed_data_file = open(self.output_filepath, 'w') + self.writer.open() for filename in os.listdir(self.filepath): if filename.endswith('.pwm'): print(filename) @@ -112,17 +104,7 @@ def process_file(self): 'source': self.source } - json.dump(props, parsed_data_file) - parsed_data_file.write('\n') - - parsed_data_file.close() - self.save_to_arango() - - def save_to_arango(self): - if self.dry_run: - print(self.arangodb()[0]) - else: - os.system(self.arangodb()[0]) + self.writer.write(json.dumps(props)) + self.writer.write('\n') - def arangodb(self): - return ArangoDB().generate_json_import_statement(self.output_filepath, self.collection, type=self.type) + self.writer.close() diff --git a/data/adapters/mouse_genomes_project_adapter.py b/data/adapters/mouse_genomes_project_adapter.py index 81921490..34dbb68a 100644 --- a/data/adapters/mouse_genomes_project_adapter.py +++ b/data/adapters/mouse_genomes_project_adapter.py @@ -1,14 +1,14 @@ +import json +from typing import Optional + from ga4gh.vrs.extras.translator import Translator from ga4gh.vrs.dataproxy import create_dataproxy from biocommons.seqrepo import SeqRepo -from adapters import Adapter from adapters.helpers import build_mouse_variant_id -from scripts.variants_spdi import build_spdi, build_hgvs_from_spdi +from adapters.writer import Writer -from db.arango_db import ArangoDB -import json -import os +from scripts.variants_spdi import build_spdi, build_hgvs_from_spdi # source files are from here: https://ftp.ebi.ac.uk/pub/databases/mousegenomes/REL-2112-v8-SNPs_Indels/ # mouse genomes project info: https://www.sanger.ac.uk/data/mouse-genomes-project/ @@ -30,13 +30,11 @@ # 1 3050776 . T TAAA 7046.84 . 
AC=2;AF=0.019;AN=104;BaseQRankSum=-0.382;DP=4058;ExcessHet=0.1499;FS=3.125;InbreedingCoeff=0.1457;MLEAC=2;MLEAF=0.019;MQ=59.91;MQRankSum=0;QD=6.54;ReadPosRankSum=-0.611;SOR=0.99;CSQ=AAA|intergenic_variant|MODIFIER|||||||||||||||||||| GT:AD:DP:GQ:PGT:PID:PL:PS:FI 0/0:15,0:15:45:.:.:0,45,495:.:1 0/0:32,0:32:87:.:.:0,87,1305:.:1 0/0:7,0:7:18:.:.:0,18,270:.:0 0/0:48,0:48:99:.:.:0,120,1800:.:1 0/0:14,0:14:33:.:.:0,33,495:.:0 0/0:79,0:79:99:.:.:0,120,1800:.:1 0/0:40,0:40:79:.:.:0,79,1570:.:1 0/0:66,0:66:99:.:.:0,120,1800:.:1 0/0:52,0:52:99:.:.:0,120,1800:.:1 0/0:25,0:25:66:.:.:0,66,990:.:1 0/0:12,0:12:33:.:.:0,33,495:.:0 0/0:46,0:46:99:.:.:0,112,1800:.:1 0/0:71,0:71:99:.:.:0,120,1800:.:1 0/0:88,0:88:99:.:.:0,120,1800:.:1 0/0:97,0:97:99:.:.:0,120,1800:.:1 0/0:175,0:205:99:.:.:543,1068,7996:.:1 0/0:115,0:138:99:.:.:453,798,5117:.:1 0/0:74,0:74:99:.:.:0,120,1800:.:1 0/0:57,0:57:99:.:.:0,120,1800:.:1 0/0:37,0:37:90:.:.:0,90,1350:.:1 0/0:16,0:39:99:.:.:560,608,1280:.:1 0/0:708,0:708:99:.:.:0,120,1800:.:1 0/0:44,0:44:99:.:.:0,108,1620:.:1 0/0:22,0:22:60:.:.:0,60,900:.:1 0/0:30,0:30:84:.:.:0,84,1260:.:1 0/0:23,0:23:60:.:.:0,60,900:.:1 0/0:270,0:333:99:.:.:1588,2400,12886:.:1 0/0:103,0:103:99:.:.:0,120,1800:.:1 0/0:49,0:49:99:.:.:0,120,1800:.:1 0/0:10,0:10:27:.:.:0,27,405:.:0 0/0:24,0:24:70:.:.:0,70,975:.:1 0/0:19,0:19:54:.:.:0,54,810:.:1 0/0:157,0:181:99:.:.:339,811,6730:.:1 0/0:22,0:22:54:.:.:0,54,810:.:1 0/0:31,0:31:90:.:.:0,90,1142:.:1 0/0:31,0:56:99:.:.:558,651,1946:.:1 0/0:59,0:59:99:.:.:0,120,1800:.:1 0/0:71,0:71:99:.:.:0,120,1800:.:1 0/0:24,0:24:60:.:.:0,60,900:.:1 0/0:416,0:462:99:.:.:0,1251,17229:.:1 0/0:34,0:34:99:.:.:0,99,1485:.:1 0/0:30,0:30:90:.:.:0,90,1263:.:1 0/0:37,0:37:93:.:.:0,93,1395:.:1 0/0:53,0:53:99:.:.:0,120,1800:.:1 0/0:43,0:43:99:.:.:0,120,1800:.:1 0/0:26,0:26:66:.:.:0,66,990:.:1 0/0:5,0:5:12:.:.:0,12,180:.:0 1|1:0,34:34:99:1|1:3050776_T_TAAA:1530,102,0:3050776:1 0/0:37,0:37:99:.:.:0,102,1530:.:1 0/0:9,0:9:27:.:.:0,27,364:.:0 0/0:28,0:92:99:.:.:2102,2187,3180:.:1 0/0:15,0:15:45:.:.:0,45,626:.:1 -class MouseGenomesProjectAdapter(Adapter): +class MouseGenomesProjectAdapter: # Originally 1-based coordinate system # Converted to 0-based LABEL = 'mouse_variant' - OUTPUT_FOLDER = './parsed-data' - WRITE_THRESHOLD = 1000000 FILE_COLUMNS = ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', '129P2_OlaHsd', '129S1_SvImJ', '129S5SvEvBrd', 'A_J', 'AKR_J', 'B10.RIII', 'BALB_cByJ', 'BALB_cJ', 'BTBR_T+_Itpr3tf_J', 'BUB_BnJ', 'C3H_HeH', 'C3H_HeJ', 'C57BL_10J', 'C57BL_10SnJ', 'C57BL_6NJ', 'C57BR_cdJ', 'C57L_J', 'C58_J', 'CAST_EiJ', 'CBA_J', 'CE_J', 'CZECHII_EiJ', 'DBA_1J', 'DBA_2J', 'FVB_NJ', 'I_LnJ', 'JF1_MsJ', 'KK_HiJ', 'LEWES_EiJ', 'LG_J', 'LP_J', 'MAMy_J', 'MOLF_EiJ', 'NOD_ShiLtJ', 'NON_LtJ', 'NZB_B1NJ', 'NZO_HlLtJ', 'NZW_LacJ', 'PL_J', 'PWK_PhJ', 'QSi3', 'QSi5', 'RF_J', 'RIIIS_J', 'SEA_GnJ', 'SJL_J', 'SM_J', 'SPRET_EiJ', 'ST_bJ', 'SWR_J', 'WSB_EiJ', 'ZALENDE_EiJ'] @@ -44,24 +42,15 @@ class MouseGenomesProjectAdapter(Adapter): STRAINS = ['129S1_SvImJ', 'A_J', 'CAST_EiJ', 'NOD_ShiLtJ', 'NZO_HlLtJ', 'PWK_PhJ', 'WSB_EiJ'] - def __init__(self, filepath=None, dry_run=True): + def __init__(self, filepath=None, dry_run=True, writer: Optional[Writer] = None, **kwargs): self.filepath = filepath self.label = self.LABEL - if not os.path.exists(self.OUTPUT_FOLDER): - os.makedirs(self.OUTPUT_FOLDER) - self.output_filepath = '{}/{}-{}.json'.format( - self.OUTPUT_FOLDER, - self.label, - filepath.split('/')[-1], - ) self.organism = 'Mus musculus' self.dry_run = dry_run - - 
super(MouseGenomesProjectAdapter, self).__init__() + self.writer = writer def process_file(self): - parsed_data_file = open(self.output_filepath, 'w') - + self.writer.open() # Check install seqrepo instrunction in docs folder dp = create_dataproxy( 'seqrepo+file:///usr/local/share/seqrepo/mouse') @@ -69,10 +58,8 @@ def process_file(self): translator = Translator(data_proxy=dp, default_assembly_name='GRCm39') reading_data = False - record_count = 0 json_objects = [] json_object_keys = set() - line_index = 0 for line in open(self.filepath, 'r'): if line.startswith('#CHROM'): @@ -157,35 +144,14 @@ def process_file(self): store_json = json_objects.pop(0) json_object_keys.remove(store_json['_key']) - json.dump(store_json, parsed_data_file) - parsed_data_file.write('\n') - record_count += 1 + self.writer.write(json.dumps(store_json)) + self.writer.write('\n') else: json_objects = [to_json] json_object_keys.add(to_json['_key']) - if record_count > self.WRITE_THRESHOLD: - parsed_data_file.close() - self.save_to_arango() - - os.remove(self.output_filepath) - record_count = 0 - - parsed_data_file = open( - self.output_filepath, 'w') - for object in json_objects: - json.dump(object, parsed_data_file) - parsed_data_file.write('\n') - record_count += 1 - - parsed_data_file.close() - self.save_to_arango() - - def arangodb(self): - return ArangoDB().generate_json_import_statement(self.output_filepath, self.collection) + for json_object in json_objects: + self.writer.write(json.dumps(json_object)) + self.writer.write('\n') - def save_to_arango(self): - if self.dry_run: - print(self.arangodb()[0]) - else: - os.system(self.arangodb()[0]) + self.writer.close() diff --git a/data/adapters/oncotree_adapter.py b/data/adapters/oncotree_adapter.py index 381286e6..babf8487 100644 --- a/data/adapters/oncotree_adapter.py +++ b/data/adapters/oncotree_adapter.py @@ -1,8 +1,8 @@ import json -import os import requests -from adapters import Adapter -from db.arango_db import ArangoDB +from typing import Optional + +from adapters.writer import Writer # The tumor types are available from oncotree api: https://oncotree.mskcc.org:443/api/tumorTypes # Example for one tumor type node: @@ -22,30 +22,25 @@ # The hierarchical classification tree can also be explored from: https://oncotree.mskcc.org/ -class Oncotree(Adapter): +class Oncotree: SOURCE = 'Oncotree' SOURCE_URL = 'https://oncotree.mskcc.org/' API_URL = 'https://oncotree.mskcc.org:443/api/tumorTypes' - OUTPUT_PATH = './parsed-data' - def __init__(self, type, dry_run=True): + def __init__(self, type, dry_run=True, writer: Optional[Writer] = None, **kwargs): self.type = type - if type == 'node': + if self.type == 'node': self.dataset = 'ontology_term' self.label = 'ontology_term' else: self.dataset = 'ontology_relationship' self.label = 'ontology_relationship' self.dry_run = dry_run - self.output_filepath = '{}/{}.json'.format( - self.OUTPUT_PATH, - self.dataset - ) - super(Oncotree, self).__init__() + self.writer = writer def process_file(self): - parsed_data_file = open(self.output_filepath, 'w') + self.writer.open() oncotree_json = requests.get(Oncotree.API_URL).json() for node in oncotree_json: # reformating for one illegal term: MDS/MPN @@ -65,8 +60,8 @@ def process_file(self): 'uri': Oncotree.SOURCE_URL } - json.dump(_props, parsed_data_file) - parsed_data_file.write('\n') + self.writer.write(json.dumps(_props)) + self.writer.write('\n') else: _source = 'ontology_terms/Oncotree_' + key @@ -90,8 +85,8 @@ def process_file(self): 'source': Oncotree.SOURCE, } - 
json.dump(_props, parsed_data_file) - parsed_data_file.write('\n') + self.writer.write(json.dumps(_props)) + self.writer.write('\n') if node['externalReferences']: type = 'database cross-reference' @@ -113,16 +108,6 @@ def process_file(self): 'source': Oncotree.SOURCE, } - json.dump(_props, parsed_data_file) - parsed_data_file.write('\n') - parsed_data_file.close() - self.save_to_arango() - - def save_to_arango(self): - if self.dry_run: - print(self.arangodb()[0]) - else: - os.system(self.arangodb()[0]) - - def arangodb(self): - return ArangoDB().generate_json_import_statement(self.output_filepath, self.collection, type=self.type) + self.writer.write(json.dumps(_props)) + self.writer.write('\n') + self.writer.close() diff --git a/data/adapters/ontologies_adapter.py b/data/adapters/ontologies_adapter.py index b3aaf960..e5e34601 100644 --- a/data/adapters/ontologies_adapter.py +++ b/data/adapters/ontologies_adapter.py @@ -1,17 +1,16 @@ -import os import json import gzip import urllib import tempfile +from typing import Optional + import rdflib from owlready2 import * -from db.arango_db import ArangoDB -from adapters import Adapter +from adapters.writer import Writer -class Ontology(Adapter): - OUTPUT_PATH = './parsed-data' +class Ontology: ONTOLOGIES = { 'uberon': 'https://api.data.igvf.org/reference-files/IGVFFI7985BGYI/@@download/IGVFFI7985BGYI.owl.gz', @@ -60,7 +59,16 @@ class Ontology(Adapter): PREDICATES = [SUBCLASS, DB_XREF] RESTRICTION_PREDICATES = [HAS_PART, PART_OF] - def __init__(self, ontology, dry_run=True): + def __init__( + self, + ontology, + dry_run=True, + node_primary_writer: Optional[Writer] = None, + node_secondary_writer: Optional[Writer] = None, + edge_primary_writer: Optional[Writer] = None, + edge_secondary_writer: Optional[Writer] = None, + **kwargs + ): if ontology not in Ontology.ONTOLOGIES.keys(): raise ValueError('Ontology not supported.') @@ -69,8 +77,10 @@ def __init__(self, ontology, dry_run=True): self.dry_run = dry_run self.ontology = ontology - - super(Ontology, self).__init__() + self.node_primary_writer = node_primary_writer + self.node_secondary_writer = node_secondary_writer + self.edge_primary_writer = edge_primary_writer + self.edge_secondary_writer = edge_secondary_writer def process_file(self): - path = '{}/{}-'.format(Ontology.OUTPUT_PATH, self.ontology) @@ -80,12 +90,12 @@ # primary data will replace secondary data when loading into DB + self.node_primary_writer.open() + self.node_secondary_writer.open() + self.edge_primary_writer.open() + self.edge_secondary_writer.open() self.outputs = { 'node': { - 'primary': open(path + 'node-primary.json', 'w'), - 'secondary': open(path + 'node-secondary.json', 'w') + 'primary': self.node_primary_writer, + 'secondary': self.node_secondary_writer }, 'edge': { - 'primary': open(path + 'edge-primary.json', 'w'), - 'secondary': open(path + 'edge-secondary.json', 'w') + 'primary': self.edge_primary_writer, + 'secondary': self.edge_secondary_writer } } @@ -95,8 +105,6 @@ self.outputs[t]['primary'].close() self.outputs[t]['secondary'].close() - self.save_to_arango(type=t) - def process_ontology(self): print('Downloading {}...'.format(self.ontology)) @@ -248,8 +256,7 @@ if not primary: save_to = self.outputs[prop_type]['secondary'] - json.dump(props, save_to) - save_to.write('\n') + save_to.write(json.dumps(props) + '\n') def predicate_name(self, predicate): predicate = str(predicate) @@ -343,24 +350,6 @@ return isinstance(node, BLANK_NODE) - def arangodb(self, primary=True, type='node'): - 
collection = self.collection - if type == 'edge': - collection = self.collection + '_' + self.collection - - if primary is False: - return ArangoDB().generate_json_import_statement(self.outputs[type]['secondary'].name, collection, type=type) - - return ArangoDB().generate_json_import_statement(self.outputs[type]['primary'].name, collection, type=type, replace=True) - - def save_to_arango(self, type='node'): - if self.dry_run: - print(self.arangodb(primary=False, type=type)[0]) - print(self.arangodb(type=type)[0]) - else: - os.system(self.arangodb(primary=False, type=type)[0]) - os.system(self.arangodb(type=type)[0]) - # it's faster to load all subject/objects beforehand def clear_cache(self): self.cache = {} diff --git a/data/adapters/orphanet_disease_adapter.py b/data/adapters/orphanet_disease_adapter.py index 17122ac8..e129e2b7 100644 --- a/data/adapters/orphanet_disease_adapter.py +++ b/data/adapters/orphanet_disease_adapter.py @@ -1,9 +1,8 @@ import xml.etree.ElementTree as ET import json -import os +from typing import Optional -from adapters import Adapter -from db.arango_db import ArangoDB +from adapters.writer import Writer # The xml file was download from https://www.orphadata.com/genes/ # The disease-gene association elements are under each Disorder element in the tree from the xml file @@ -31,28 +30,21 @@ # -class Disease(Adapter): +class Disease: SOURCE = 'Orphanet' SOURCE_URL = 'https://www.orphadata.com/genes/' - OUTPUT_PATH = './parsed-data' - - def __init__(self, filepath, dry_run=True): + def __init__(self, filepath, dry_run=True, writer: Optional[Writer] = None, **kwargs): self.filepath = filepath self.dataset = 'disease_gene' self.label = 'disease_gene' self.collection = 'diseases_genes' self.type = 'edge' self.dry_run = dry_run - self.output_filepath = '{}/{}.json'.format( - Disease.OUTPUT_PATH, - self.dataset - ) - - super(Disease, self).__init__() + self.writer = writer def process_file(self): - parsed_data_file = open(self.output_filepath, 'w') + self.writer.open() # the xml file is relatively small, just parse at once here # or could return an iterator with ET.iterparse(xmlfile) @@ -104,17 +96,7 @@ def process_file(self): 'source_url': Disease.SOURCE_URL } - json.dump(props, parsed_data_file) - parsed_data_file.write('\n') - - parsed_data_file.close() - self.save_to_arango() - - def save_to_arango(self): - if self.dry_run: - print(self.arangodb()[0]) - else: - os.system(self.arangodb()[0]) + self.writer.write(json.dumps(props)) + self.writer.write('\n') - def arangodb(self): - return ArangoDB().generate_json_import_statement(self.output_filepath, self.collection, type=self.type) + self.writer.close() diff --git a/data/adapters/pQTL_adapter.py b/data/adapters/pQTL_adapter.py index 36344d59..de127fd9 100644 --- a/data/adapters/pQTL_adapter.py +++ b/data/adapters/pQTL_adapter.py @@ -1,38 +1,32 @@ import csv import json -import os -from adapters import Adapter +from typing import Optional + from adapters.helpers import build_variant_id -from db.arango_db import ArangoDB +from adapters.writer import Writer # Example rows from pQTL file (Supplementary Table 9) # Variant ID (CHROM:GENPOS (hg37):A0:A1:imp:v1) CHROM GENPOS (hg38) Region ID Region Start Region End MHC UKBPPP ProteinID Assay Target Target UniProt rsID A1FREQ (discovery) BETA (discovery, wrt. 
A1) SE (discovery) log10(p) (discovery) A1FREQ (replication) BETA (replication) SE (replication) log10(p) (replication) cis/trans cis gene Bioinfomatic annotated gene Ensembl gene ID Annotated gene consequence Biotype Distance to gene CADD_phred SIFT PolyPhen PHAST Phylop_score FitCons_score IMPACT # 2:27730940:T:C:imp:v1 2 27508073 975 26263266 29121418 0 A1BG:P04217:OID30771:v1 A1BG P04217 rs1260326 0.6084 -0.137 0.007 79.2 0.6306 -0.105 0.010 23.9 trans - GCKR ENSG00000084734 missense_variant,splice_region_variant protein_coding 0 T Benign 408 0.553676 MODERATE -class pQTL(Adapter): +class pQTL: SOURCE = 'UKB' SOURCE_URL = 'https://metabolomips.org/ukbbpgwas/' BIOLOGICAL_CONTEXT = 'blood plasma' - OUTPUT_PATH = './parsed-data' - def __init__(self, filepath, label, dry_run=True): + def __init__(self, filepath, label, dry_run=True, writer: Optional[Writer] = None, **kwargs): self.filepath = filepath self.label = label self.dataset = label self.dry_run = dry_run self.type = 'edge' - self.output_filepath = '{}/{}.json'.format( - self.OUTPUT_PATH, - self.dataset - ) - - super(pQTL, self).__init__() + self.writer = writer def process_file(self): - parsed_data_file = open(self.output_filepath, 'w') + self.writer.open() with open(self.filepath, 'r') as pqtl_file: pqtl_csv = csv.reader(pqtl_file) next(pqtl_csv) @@ -53,7 +47,7 @@ def process_file(self): '_from': _source, '_to': _target, 'rsid': row[10] if row[10] != '-' else None, - 'variant_' + # 'variant_' 'label': 'pQTL', 'log10pvalue': float(row[14]), 'beta': float(row[12]), # i.e. effect size @@ -69,16 +63,6 @@ def process_file(self): 'method': 'ontology_terms/BAO_0080027' } - json.dump(_props, parsed_data_file) - parsed_data_file.write('\n') - parsed_data_file.close() - self.save_to_arango() - - def save_to_arango(self): - if self.dry_run: - print(self.arangodb()[0]) - else: - os.system(self.arangodb()[0]) - - def arangodb(self): - return ArangoDB().generate_json_import_statement(self.output_filepath, self.collection, type=self.type) + self.writer.write(json.dumps(_props)) + self.writer.write('\n') + self.writer.close() diff --git a/data/adapters/pharmgkb_drug_adapter.py b/data/adapters/pharmgkb_drug_adapter.py index 616bd732..c3082be8 100644 --- a/data/adapters/pharmgkb_drug_adapter.py +++ b/data/adapters/pharmgkb_drug_adapter.py @@ -2,11 +2,11 @@ import json import csv import re +from collections import defaultdict +from typing import Optional -from db.arango_db import ArangoDB -from adapters import Adapter from adapters.helpers import build_variant_id_from_hgvs -from collections import defaultdict +from adapters.writer import Writer # Variant Annotation files downloaded from https://www.pharmgkb.org/downloads # Split into three files with most columns in common @@ -29,7 +29,7 @@ # genes.tsv: map gene symbols to Ensembl IDs -class PharmGKB(Adapter): +class PharmGKB: SOURCE = 'pharmGKB' SOURCE_URL_PREFIX = 'https://www.pharmgkb.org/' DRUG_ID_MAPPING_PATH = './data_loading_support_files/pharmGKB_chemicals.tsv' @@ -48,9 +48,7 @@ class PharmGKB(Adapter): 'variant_drug_gene', ] - OUTPUT_PATH = './parsed-data' - - def __init__(self, filepath, label, dry_run=True): + def __init__(self, filepath, label, dry_run=True, writer: Optional[Writer] = None, **kwargs): if label not in PharmGKB.ALLOWED_LABELS: raise ValueError('Invalid label. 
Allowed values: ' + ','.join(PharmGKB.ALLOWED_LABELS)) @@ -63,17 +61,10 @@ def __init__(self, filepath, label, dry_run=True): self.type = 'node' else: self.type = 'edge' - - self.output_filepath = '{}/{}_{}.json'.format( - PharmGKB.OUTPUT_PATH, - self.dataset, - PharmGKB.SOURCE - ) - - super(PharmGKB, self).__init__() + self.writer = writer def process_file(self): - self.parsed_data_file = open(self.output_filepath, 'w') + self.writer.open() if self.type == 'node': with open(PharmGKB.DRUG_ID_MAPPING_PATH, 'r') as drug_file: @@ -100,8 +91,7 @@ def process_file(self): self.save_props(props) - self.parsed_data_file.close() - self.save_to_arango() + self.writer.close() else: self.load_drug_id_mapping() @@ -281,8 +271,7 @@ def process_file(self): self.save_props(props) - self.parsed_data_file.close() - self.save_to_arango() + self.writer.close() def load_drug_id_mapping(self): # e.g. key: '17-alpha-dihydroequilenin sulfate', value: 'PA166238901' @@ -386,14 +375,5 @@ def match_variant_alleles(self, variant_hgvs_ids, variant_drug_row): return None def save_props(self, props): - json.dump(props, self.parsed_data_file) - self.parsed_data_file.write('\n') - - def save_to_arango(self): - if self.dry_run: - print(self.arangodb()[0]) - else: - os.system(self.arangodb()[0]) - - def arangodb(self): - return ArangoDB().generate_json_import_statement(self.output_filepath, self.collection, type=self.type) + self.writer.write(json.dumps(props)) + self.writer.write('\n') diff --git a/data/adapters/proteins_interaction_adapter.py b/data/adapters/proteins_interaction_adapter.py index ccd0bd6b..0d28f0cd 100644 --- a/data/adapters/proteins_interaction_adapter.py +++ b/data/adapters/proteins_interaction_adapter.py @@ -1,10 +1,10 @@ import csv -import os import json import hashlib +from typing import Optional + import obonet -from adapters import Adapter -from db.arango_db import ArangoDB +from adapters.writer import Writer # Example lines in merged_PPI.UniProt.csv (and merged_PPI_mouse.UniProt.csv for mouse): # Protein ID 1,Protein ID 2,PMID,Detection Method,Detection Method (PSI-MI),Interaction Type,Interaction Type (PSI-MI),Confidence Value (biogrid),Confidence Value (intact),Source @@ -12,30 +12,21 @@ # Q9Y243,Q9Y6H6,[33961781],affinity chromatography technology,MI:0004,physical association,MI:0915,0.990648979,,BioGRID -class ProteinsInteraction(Adapter): +class ProteinsInteraction: INTERACTION_MI_CODE_PATH = './data_loading_support_files/Biogrid_gene_gene/psi-mi.obo' - OUTPUT_PATH = './parsed-data' - def __init__(self, filepath, label, dry_run=True): + def __init__(self, filepath, label, dry_run=True, writer: Optional[Writer] = None, **kwargs): self.filepath = filepath self.dataset = label self.label = label self.dry_run = dry_run self.type = 'edge' - + self.writer = writer if 'mouse' in self.filepath.split('/')[-1]: self.organism = 'Mus musculus' else: self.organism = 'Homo sapiens' - self.output_filepath = '{}/{}_{}.json'.format( - ProteinsInteraction.OUTPUT_PATH, - self.dataset, - self.organism.replace(' ', '_') - ) - - super(ProteinsInteraction, self).__init__() - def load_MI_code_mapping(self): # get mapping for MI code -> name from obo file (e.g. 
MI:2370 -> synthetic lethality (sensu BioGRID)) self.MI_code_mapping = {} @@ -44,7 +35,7 @@ def load_MI_code_mapping(self): self.MI_code_mapping[node] = graph.nodes[node]['name'] def process_file(self): - parsed_data_file = open(self.output_filepath, 'w') + self.writer.open() print('Loading MI code mappings') self.load_MI_code_mapping() @@ -85,17 +76,7 @@ def process_file(self): 'inverse_name': 'physically interacts with', 'molecular_function': 'ontology_terms/GO_0005515' } - json.dump(props, parsed_data_file) - parsed_data_file.write('\n') - - parsed_data_file.close() - self.save_to_arango() - - def save_to_arango(self): - if self.dry_run: - print(self.arangodb()[0]) - else: - os.system(self.arangodb()[0]) + self.writer.write(json.dumps(props)) + self.writer.write('\n') - def arangodb(self): - return ArangoDB().generate_json_import_statement(self.output_filepath, self.collection, type=self.type) + self.writer.close() diff --git a/data/adapters/reactome_adapter.py b/data/adapters/reactome_adapter.py index c4d77ef7..325ecb76 100644 --- a/data/adapters/reactome_adapter.py +++ b/data/adapters/reactome_adapter.py @@ -1,7 +1,7 @@ import json -import os -from adapters import Adapter -from db.arango_db import ArangoDB +from typing import Optional + +from adapters.writer import Writer # Data file for genes_pathways: https://reactome.org/download/current/Ensembl2Reactome_All_Levels.txt # data format: @@ -28,30 +28,24 @@ # R-BTA-109582 R-BTA-140877 -class Reactome(Adapter): +class Reactome: ALLOWED_LABELS = ['genes_pathways', 'parent_pathway_of'] - OUTPUT_PATH = './parsed-data' - def __init__(self, filepath, label, dry_run=True): + def __init__(self, filepath, label, dry_run=True, writer: Optional[Writer] = None, **kwargs): if label not in Reactome.ALLOWED_LABELS: - raise ValueError('Ivalid label. Allowed values: ' + + raise ValueError('Invalid label. 
Allowed values: ' + ', '.join(Reactome.ALLOWED_LABELS)) self.filepath = filepath self.dataset = label self.label = label self.dry_run = dry_run self.type = 'edge' - self.output_filepath = '{}/{}.json'.format( - self.OUTPUT_PATH, - self.dataset - ) - - super(Reactome, self).__init__() + self.writer = writer def process_file(self): - parsed_data_file = open(self.output_filepath, 'w') + self.writer.open() with open(self.filepath) as input: _props = { 'source': 'Reactome', @@ -80,8 +74,8 @@ def process_file(self): 'inverse_name': 'has part' } ) - json.dump(_props, parsed_data_file) - parsed_data_file.write('\n') + self.writer.write(json.dumps(_props)) + self.writer.write('\n') else: parent, child = line.strip().split('\t') if parent.startswith('R-HSA'): @@ -97,16 +91,6 @@ def process_file(self): 'inverse_name': 'child of' } ) - json.dump(_props, parsed_data_file) - parsed_data_file.write('\n') - parsed_data_file.close() - self.save_to_arango() - - def save_to_arango(self): - if self.dry_run: - print(self.arangodb()[0]) - else: - os.system(self.arangodb()[0]) - - def arangodb(self): - return ArangoDB().generate_json_import_statement(self.output_filepath, self.collection, type=self.type) + self.writer.write(json.dumps(_props)) + self.writer.write('\n') + self.writer.close() diff --git a/data/adapters/reactome_pathway_adapter.py b/data/adapters/reactome_pathway_adapter.py index 53eef261..cf46146a 100644 --- a/data/adapters/reactome_pathway_adapter.py +++ b/data/adapters/reactome_pathway_adapter.py @@ -1,12 +1,12 @@ -from adapters import Adapter -import gzip import json -import os -from db.arango_db import ArangoDB +from typing import Optional + import requests from requests.adapters import HTTPAdapter, Retry from requests.exceptions import JSONDecodeError +from adapters.writer import Writer + # This adapter is used to parse Reactome pathway data. # the input file is last modified on 2024-06-03 and is available at: https://reactome.org/download/current/ReactomePathways.txt # Example pathway input file: @@ -17,27 +17,17 @@ # R-HSA-5619084 ABC transporter disorders Homo sapiens -class ReactomePathway(Adapter): - - OUTPUT_FOLDER = './parsed-data' - - def __init__(self, filepath=None, dry_run=False): +class ReactomePathway: + def __init__(self, filepath=None, dry_run=False, writer: Optional[Writer] = None, **kwargs): self.filepath = filepath self.label = 'pathway' self.dataset = 'pathway' - if not os.path.exists(ReactomePathway.OUTPUT_FOLDER): - os.makedirs(ReactomePathway.OUTPUT_FOLDER) - self.output_filepath = '{}/{}.json'.format( - ReactomePathway.OUTPUT_FOLDER, - self.dataset, - ) self.dry_run = dry_run - - super(ReactomePathway, self).__init__() + self.writer = writer def process_file(self): - parsed_data_file = open(self.output_filepath, 'w') + self.writer.open() session = requests.Session() retries = Retry(total=5, backoff_factor=1, status_forcelist=[500, 502, 503, 504]) @@ -95,21 +85,11 @@ def process_file(self): 'go_biological_process': 'ontology_terms/' + go_biological_process['databaseName'] + '_' + go_biological_process['accession'] } ) - json.dump(to_json, parsed_data_file) - parsed_data_file.write('\n') + self.writer.write(json.dumps(to_json)) + self.writer.write('\n') except JSONDecodeError as e: print( f'Can not query for {query}. The status code is {response.status_code}. 
The text is {response.text}') raise JSONDecodeError() - parsed_data_file.close() - self.save_to_arango() - - def arangodb(self): - return ArangoDB().generate_json_import_statement(self.output_filepath, self.collection) - - def save_to_arango(self): - if self.dry_run: - print(self.arangodb()[0]) - else: - os.system(self.arangodb()[0]) + self.writer.close() diff --git a/data/adapters/topld_adapter.py b/data/adapters/topld_adapter.py index e428ba1c..3eb37c1f 100644 --- a/data/adapters/topld_adapter.py +++ b/data/adapters/topld_adapter.py @@ -1,12 +1,9 @@ import csv import json -import os -from hashlib import sha256 +from typing import Optional -from adapters import Adapter from adapters.helpers import build_variant_id - -from db.arango_db import ArangoDB +from adapters.writer import Writer # Example TOPLD input data file: @@ -20,34 +17,23 @@ # 5031031,rs1441313282,0.010486891385767793,C,T,5031031:C:T,FP565260.3|FP565260.3|FP565260.3|FP565260.3|FP565260.3,"intron_variant|intron_variant|intron_variant|intron_variant,NMD_transcript_variant|intron_variant",2.135,.,. -class TopLD(Adapter): +class TopLD: DATASET = 'topld_linkage_disequilibrium' - OUTPUT_PATH = './parsed-data' - - def __init__(self, chr, data_filepath, annotation_filepath, ancestry='SAS', dry_run=True): - self.data_filepath = data_filepath - self.annotations_filepath = annotation_filepath - + def __init__(self, filepath, annotation_filepath, chr, ancestry='SAS', dry_run=True, writer: Optional[Writer] = None, **kwargs): + self.filepath = filepath + self.annotation_filepath = annotation_filepath + self.writer = writer self.chr = chr self.ancestry = ancestry self.dataset = TopLD.DATASET self.label = TopLD.DATASET - self.dry_run = dry_run - self.output_filepath = '{}/{}-{}.json'.format( - TopLD.OUTPUT_PATH, - self.dataset, - data_filepath.split('/')[-1] - ) - - super(TopLD, self).__init__() - def process_annotations(self): print('Processing annotations...') self.ids = {} - with open(self.annotations_filepath, 'r') as annotations: + with open(self.annotation_filepath, 'r') as annotations: annotations_csv = csv.reader(annotations) next(annotations_csv) @@ -68,10 +54,9 @@ def process_file(self): print('Processing data...') - parsed_data_file = open(self.output_filepath, 'w') - record_count = 0 + self.writer.open() - for line in open(self.data_filepath, 'r'): + for line in open(self.filepath, 'r'): row = line.split(',') if row[0] == 'SNP1': @@ -98,30 +83,7 @@ def process_file(self): 'source_url': 'http://topld.genetics.unc.edu/' } - json.dump(props, parsed_data_file) - parsed_data_file.write('\n') - record_count += 1 - - if record_count > 1000000: - parsed_data_file.close() - self.save_to_arango() - - os.remove(self.output_filepath) - record_count = 0 - - parsed_data_file = open(self.output_filepath, 'w') - - parsed_data_file.close() - self.save_to_arango() - - if not self.dry_run: - os.remove(self.output_filepath) - - def arangodb(self): - return ArangoDB().generate_json_import_statement(self.output_filepath, 'variants_variants', type='edges') + self.writer.write(json.dumps(props)) + self.writer.write('\n') - def save_to_arango(self): - if self.dry_run: - print(self.arangodb()[0]) - else: - os.system(self.arangodb()[0]) + self.writer.close() diff --git a/data/adapters/uniprot_adapter.py b/data/adapters/uniprot_adapter.py index 80e6a552..ee860ec0 100644 --- a/data/adapters/uniprot_adapter.py +++ b/data/adapters/uniprot_adapter.py @@ -1,9 +1,10 @@ import gzip import json -import os +from typing import Optional + from Bio import SeqIO -from 
adapters import Adapter -from db.arango_db import ArangoDB + +from adapters.writer import Writer # Data file is uniprot_sprot_human.dat.gz and uniprot_trembl_human.dat.gz at https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/taxonomic_divisions/. # We can use SeqIO from Bio to read the file. @@ -11,19 +12,18 @@ # id, name will be loaded for protein. Ensembl IDs(example: Ensembl:ENST00000372839.7) in dbxrefs will be used to create protein and transcript relationship. -class Uniprot(Adapter): +class Uniprot: ALLOWED_LABELS = ['UniProtKB_Translates_To'] ALLOWED_SOURCES = ['UniProtKB/Swiss-Prot', 'UniProtKB/TrEMBL'] ALLOWED_ORGANISMS = ['HUMAN', 'MOUSE'] - OUTPUT_PATH = './parsed-data' - def __init__(self, filepath, label, source, organism='HUMAN', dry_run=True): + def __init__(self, filepath, label, source, organism='HUMAN', dry_run=True, writer: Optional[Writer] = None, **kwargs): if label not in Uniprot.ALLOWED_LABELS: - raise ValueError('Ivalid label. Allowed values: ' + + raise ValueError('Invalid label. Allowed values: ' + ', '.join(Uniprot.ALLOWED_LABELS)) if organism not in Uniprot.ALLOWED_ORGANISMS: - raise ValueError('Ivalid organism. Allowed values: ' + + raise ValueError('Invalid organism. Allowed values: ' + ', '.join(Uniprot.ALLOWED_ORGANISMS)) self.filepath = filepath self.label = label @@ -37,15 +37,10 @@ def __init__(self, filepath, label, source, organism='HUMAN', dry_run=True): self.dataset = label self.dry_run = dry_run self.type = 'edge' - self.output_filepath = '{}/{}.json'.format( - self.OUTPUT_PATH, - self.dataset - ) - - super(Uniprot, self).__init__() + self.writer = writer def process_file(self): - parsed_data_file = open(self.output_filepath, 'w') + self.writer.open() with gzip.open(self.filepath, 'rt') as input_file: records = SeqIO.parse(input_file, 'swiss') for record in records: @@ -70,20 +65,10 @@ def process_file(self): _props['_key'] = _id _props['_from'] = _source _props['_to'] = _target - json.dump(_props, parsed_data_file) - parsed_data_file.write('\n') + self.writer.write(json.dumps(_props)) + self.writer.write('\n') except: print( f'fail to process for label {self.label}: {record.id}') pass - parsed_data_file.close() - self.save_to_arango() - - def save_to_arango(self): - if self.dry_run: - print(self.arangodb()[0]) - else: - os.system(self.arangodb()[0]) - - def arangodb(self): - return ArangoDB().generate_json_import_statement(self.output_filepath, self.collection, type=self.type) + self.writer.close() diff --git a/data/adapters/uniprot_protein_adapter.py b/data/adapters/uniprot_protein_adapter.py index 94768353..417f8766 100644 --- a/data/adapters/uniprot_protein_adapter.py +++ b/data/adapters/uniprot_protein_adapter.py @@ -1,10 +1,11 @@ import gzip import json -import os -from adapters import Adapter -from db.arango_db import ArangoDB +from typing import Optional + from Bio import SwissProt +from adapters.writer import Writer + # Data file is uniprot_sprot_human.dat.gz and uniprot_trembl_human.dat.gz at https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/taxonomic_divisions/. # We can use SeqIO from Bio to read the file. @@ -12,18 +13,17 @@ # id, name will be loaded for protein. Ensembl IDs(example: Ensembl:ENST00000372839.7) in dbxrefs will be used to create protein and transcript relationship. 
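+# For orientation, a minimal sketch of the Bio.SwissProt parsing loop this
+# adapter builds on; the path is one of the files named above and the prints
+# are illustrative only, not part of the adapter:
+#
+#   import gzip
+#   from Bio import SwissProt
+#
+#   with gzip.open('uniprot_sprot_human.dat.gz', 'rt') as handle:
+#       for record in SwissProt.parse(handle):
+#           print(record.entry_name, record.accessions[0])
+#           for xref in record.cross_references:
+#               if xref[0] == 'Ensembl':
+#                   print(xref[1])  # e.g. ENST00000372839.7, used for the protein-transcript edge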
-class UniprotProtein(Adapter): - OUTPUT_FOLDER = './parsed-data' +class UniprotProtein: ALLOWED_SOURCES = ['UniProtKB/Swiss-Prot', 'UniProtKB/TrEMBL'] # two taxonomy IDs are allowed: 9606 for Homo sapiens, and 10090 for Mus musculus ALLOWED_TAXONOMY_IDS = ['9606', '10090'] - def __init__(self, filepath, source, taxonomy_id='9606', dry_run=True): + def __init__(self, filepath, source, taxonomy_id='9606', dry_run=True, writer: Optional[Writer] = None, **kwargs): if source not in UniprotProtein.ALLOWED_SOURCES: - raise ValueError('Ivalid source. Allowed values: ' + + raise ValueError('Invalid source. Allowed values: ' + ', '.join(UniprotProtein.ALLOWED_SOURCES)) if taxonomy_id not in UniprotProtein.ALLOWED_TAXONOMY_IDS: - raise ValueError('Ivalid taxonomy id. Allowed values: ' + + raise ValueError('Invalid taxonomy id. Allowed values: ' + ', '.join(UniprotProtein.ALLOWED_TAXONOMY_IDS)) self.filepath = filepath self.dataset = 'UniProtKB_protein' @@ -34,15 +34,7 @@ def __init__(self, filepath, source, taxonomy_id='9606', dry_run=True): if taxonomy_id == '10090': self.organism = 'Mus musculus' self.dry_run = dry_run - - if not os.path.exists(UniprotProtein.OUTPUT_FOLDER): - os.makedirs(UniprotProtein.OUTPUT_FOLDER) - self.output_filepath = '{}/{}.json'.format( - UniprotProtein.OUTPUT_FOLDER, - self.dataset, - ) - - super(UniprotProtein, self).__init__() + self.writer = writer def get_dbxrefs(self, cross_references): dbxrefs = [] @@ -83,7 +75,7 @@ def get_full_name(self, description): return rec_name def process_file(self): - parsed_data_file = open(self.output_filepath, 'w') + self.writer.open() with gzip.open(self.filepath, 'rt') as input_file: records = SwissProt.parse(input_file) for record in records: @@ -100,16 +92,6 @@ def process_file(self): } if full_name: to_json['full_name'] = full_name - json.dump(to_json, parsed_data_file) - parsed_data_file.write('\n') - parsed_data_file.close() - self.save_to_arango() - - def arangodb(self): - return ArangoDB().generate_json_import_statement(self.output_filepath, self.collection) - - def save_to_arango(self): - if self.dry_run: - print(self.arangodb()[0]) - else: - os.system(self.arangodb()[0]) + self.writer.write(json.dumps(to_json)) + self.writer.write('\n') + self.writer.close() diff --git a/data/adapters/writer.py b/data/adapters/writer.py index 665b191f..512d97b2 100644 --- a/data/adapters/writer.py +++ b/data/adapters/writer.py @@ -1,21 +1,31 @@ +from abc import ABC, abstractmethod + import boto3 import smart_open from typing import Optional -class Writer: +class Writer(ABC): def __init__(self): pass + @abstractmethod def open(self): - raise NotImplementedError + pass + @abstractmethod def write(self, content): - raise NotImplementedError + pass + @abstractmethod def close(self): - raise NotImplementedError + pass + + @property + @abstractmethod + def destination(self): + pass class S3Writer(Writer): @@ -28,7 +38,7 @@ def __init__(self, bucket: str, key: str, session: boto3.Session) -> None: self.s3_file = None def open(self): - self.s3_file = smart_open.open(self.s3_uri, mode='w', transport_params={ + self.s3_file = smart_open.open(self.destination, mode='w', transport_params={ 'client': self.session.client('s3')}) def write(self, content): @@ -41,7 +51,7 @@ def _create_s3_uri(self): return f's3://{self.bucket}/{self.key}' @property - def s3_uri(self): + def destination(self): if self._s3_uri is None: self._s3_uri = self._create_s3_uri() return self._s3_uri @@ -64,6 +74,33 @@ def write(self, content): def close(self): self.file.close() + 
@property + def destination(self): + return self.filepath + + +class SpyWriter(Writer): + + def __init__(self) -> None: + self.container = [] + + def open(self): + pass + + def write(self, content): + self.container.append(content) + + def close(self): + pass + + @property + def contents(self): + return self.container + + @property + def destination(self): + pass + def get_writer( filepath: Optional[str] = None, diff --git a/data/data_loader.py b/data/data_loader.py deleted file mode 100644 index 2c984e04..00000000 --- a/data/data_loader.py +++ /dev/null @@ -1,22 +0,0 @@ -import os -import argparse -from active_adapters import ADAPTERS - -from db.arango_db import ArangoDB - -parser = argparse.ArgumentParser( - prog='IGVF Catalog Sample Data Loader', - description='Loads sample data into a local ArangoDB instance' -) - -parser.add_argument('-a', '--adapter', nargs='*', - help='Loads the sample data for an adapter', choices=ADAPTERS.keys()) - -args = parser.parse_args() -adapters = args.adapter or ADAPTERS.keys() - -import_cmds = [] - -for a in adapters: - adapter = ADAPTERS[a] - adapter.write_file() diff --git a/data/data_parser.py b/data/data_parser.py new file mode 100644 index 00000000..ca707759 --- /dev/null +++ b/data/data_parser.py @@ -0,0 +1,75 @@ +import argparse +import boto3 +from active_adapters import LABEL_TO_ADAPTER + +from adapters.writer import get_writer + +parser = argparse.ArgumentParser( + prog='IGVF Catalog Sample Data Parser', + description='Parses sample data for an adapter and writes it to a local file or S3' +) + +# arguments that are not adapter creation related +parser.add_argument('--output-bucket', type=str, + default=None, help='The S3 bucket to use') +parser.add_argument('--output-bucket-key', type=str, default=None, + help='The S3 location to use, for example "path/to/output.file".') +parser.add_argument('--output-local-path', type=str, default=None, + help='The local path to use, for example "path/to/output.file".') +parser.add_argument('--adapter', help='Parses the sample data for an adapter.', + choices=LABEL_TO_ADAPTER.keys(), required=True) +parser.add_argument('--aws-profile', type=str, default=None, + help='The AWS profile to use, for example "igvf-dev".') + +# arguments that are in at least one adapter signature +parser.add_argument('--gene-alias-file-path', type=str, + help='Gene alias file path for GencodeGene.') +parser.add_argument('--label', help='The label of the adapter to load.') +parser.add_argument('--ancestry', type=str, help='Ancestry for TopLD.') +parser.add_argument('--source', type=str) +parser.add_argument('--source-url', type=str) +parser.add_argument('--biological-context', type=str, + help='Biological context for EncodeElementGeneLink.') +parser.add_argument('--gaf-type', type=str, help='GAF type for GAF.') +parser.add_argument('--variants-to-genes', type=str, + help='Location of variants to genes TSV for GWAS.') +parser.add_argument('--gwas-collection', type=str, + help='GWAS collection for GWAS.') +parser.add_argument('--type', type=str, choices=['edge', 'node']) +parser.add_argument('--collection', type=str, help='Collection for DbSNFP.') +parser.add_argument('--annotation-filepath', type=str, + help='Annotation CSV path for TopLD.') +parser.add_argument('--filepath', type=str, + default=None, help='The path to the input file.', required=True) + +args = parser.parse_args() + +non_adapter_signature_args = [ + 'output_bucket', + 'output_bucket_key', + 'output_local_path', + 'adapter', + 'aws_profile' +] + 
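+# Example invocation (the adapter label and paths here are illustrative;
+# valid labels are the keys of LABEL_TO_ADAPTER):
+#
+#   python data_parser.py --adapter gencode_gene \
+#       --filepath ./samples/gencode_sample.gtf \
+#       --output-local-path ./parsed-data/gencode_gene.json
+#
+# get_writer() below receives both the local path and the S3 settings and is
+# expected to build the matching Writer from whichever is provided.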
+non_adapter_signature_namespace = argparse.Namespace() +adapter_signature_namespace = argparse.Namespace() + +# separate args into non adapter signature and adapter signature args +for arg in vars(args): + if arg in non_adapter_signature_args: + setattr(non_adapter_signature_namespace, arg, getattr(args, arg)) + else: + setattr(adapter_signature_namespace, arg, getattr(args, arg)) + +writer = get_writer( + filepath=non_adapter_signature_namespace.output_local_path, + bucket=non_adapter_signature_namespace.output_bucket, + key=non_adapter_signature_namespace.output_bucket_key, + session=boto3.Session( + profile_name=non_adapter_signature_namespace.aws_profile) +) + +adapter = LABEL_TO_ADAPTER[non_adapter_signature_namespace.adapter]( + **vars(adapter_signature_namespace), writer=writer) +adapter.process_file() diff --git a/data/pytest.ini b/data/pytest.ini new file mode 100644 index 00000000..a635c5c0 --- /dev/null +++ b/data/pytest.ini @@ -0,0 +1,2 @@ +[pytest] +pythonpath = . diff --git a/data/tests/test_AFGR_caqtl_adapter.py b/data/tests/test_AFGR_caqtl_adapter.py new file mode 100644 index 00000000..016db72a --- /dev/null +++ b/data/tests/test_AFGR_caqtl_adapter.py @@ -0,0 +1,27 @@ +import json + +from adapters.AFGR_caqtl_adapter import AFGRCAQtl +from adapters.writer import SpyWriter + + +def test_AFGR_caqtl_adapter_regulatory_region(): + writer = SpyWriter() + adapter = AFGRCAQtl(filepath='./samples/AFGR/sorted.dist.hwe.af.AFR.caQTL.example.txt.gz', + label='regulatory_region', writer=writer) + adapter.process_file() + first_item = json.loads(writer.contents[0]) + assert len(writer.contents) == 200 + assert len(first_item) == 8 + assert first_item['_key'] == 'accessible_dna_element_1_906596_907043_GRCh38' + + +def test_AFGR_caqtl_adapter_AFGR_caqtl(): + writer = SpyWriter() + adapter = AFGRCAQtl(filepath='./samples/AFGR/sorted.dist.hwe.af.AFR.caQTL.example.txt.gz', + label='AFGR_caqtl', writer=writer) + adapter.process_file() + first_item = json.loads(writer.contents[0]) + assert len(writer.contents) == 200 + assert len(first_item) == 13 + assert '_from' in first_item + assert first_item['_key'] == '701f175a69d51e1e7c526f8c8ca2b2165ba7a58aadfa797dfa737916120b8ce5_accessible_dna_element_1_906596_907043_GRCh38_AFGR' diff --git a/data/tests/test_AFGR_eqtl_adapter.py b/data/tests/test_AFGR_eqtl_adapter.py new file mode 100644 index 00000000..ec6b4292 --- /dev/null +++ b/data/tests/test_AFGR_eqtl_adapter.py @@ -0,0 +1,26 @@ +import json + +from adapters.AFGR_eqtl_adapter import AFGREQtl +from adapters.writer import SpyWriter + + +def test_AFGR_eqtl_adapter_AFGR_eqtl(): + writer = SpyWriter() + adapter = AFGREQtl(filepath='./samples/AFGR/sorted.dist.hwe.af.AFR_META.eQTL.example.txt.gz', + label='AFGR_eqtl', writer=writer) + adapter.process_file() + first_item = json.loads(writer.contents[0]) + assert len(writer.contents) == 200 + assert len(first_item) == 14 + assert first_item['inverse_name'] == 'expression modulated by' + + +def test_AFGR_eqtl_adapter_AFGR_eqtl_term(): + writer = SpyWriter() + adapter = AFGREQtl(filepath='./samples/AFGR/sorted.dist.hwe.af.AFR_META.eQTL.example.txt.gz', + label='AFGR_eqtl_term', writer=writer) + adapter.process_file() + first_item = json.loads(writer.contents[0]) + assert len(writer.contents) == 200 + assert len(first_item) == 8 + assert first_item['inverse_name'] == 'has measurement' diff --git a/data/tests/test_AFGR_sqtl.py b/data/tests/test_AFGR_sqtl.py new file mode 100644 index 00000000..4eb050d7 --- /dev/null +++ b/data/tests/test_AFGR_sqtl.py @@ 
-0,0 +1,26 @@ +import json + +from adapters.AFGR_sqtl_adapter import AFGRSQtl +from adapters.writer import SpyWriter + + +def test_AFGR_sqtl_adapter_AFGR_sqtl(): + writer = SpyWriter() + adapter = AFGRSQtl(filepath='./samples/AFGR/sorted.all.AFR.Meta.sQTL.example.txt.gz', + label='AFGR_sqtl', writer=writer) + adapter.process_file() + first_item = json.loads(writer.contents[0]) + assert len(writer.contents) == 214 + assert len(first_item) == 17 + assert first_item['intron_chr'].startswith('chr') + + +def test_AFGR_sqtl_adapter_AFGR_sqtl_term(): + writer = SpyWriter() + adapter = AFGRSQtl(filepath='./samples/AFGR/sorted.all.AFR.Meta.sQTL.example.txt.gz', + label='AFGR_sqtl_term', writer=writer) + adapter.process_file() + first_item = json.loads(writer.contents[0]) + assert len(writer.contents) == 214 + assert len(first_item) == 8 + assert first_item['inverse_name'] == 'has measurement' diff --git a/data/tests/test_adapter.py b/data/tests/test_adapter.py index 2282d65d..d241402e 100644 --- a/data/tests/test_adapter.py +++ b/data/tests/test_adapter.py @@ -1,6 +1,6 @@ -from adapters import Adapter, OUTPUT_PATH -from unittest.mock import patch, mock_open, Mock +import pytest import yaml +from adapters import Adapter, OUTPUT_PATH MOCK_TEST_NODE = ''' test gene: @@ -19,7 +19,6 @@ chr: str ''' - MOCK_TEST_EDGE = ''' test correlation: represented_as: edge @@ -34,13 +33,21 @@ ''' -@patch('builtins.open', new_callable=mock_open, read_data=MOCK_TEST_NODE) -def test_adapter_ingests_config_file_for_nodes(mock_op): +@pytest.fixture +def mock_node_config(mocker): + return mocker.patch('builtins.open', mocker.mock_open(read_data=MOCK_TEST_NODE)) + + +@pytest.fixture +def mock_edge_config(mocker): + return mocker.patch('builtins.open', mocker.mock_open(read_data=MOCK_TEST_EDGE)) + + +def test_adapter_ingests_config_file_for_nodes(mock_node_config): class TestAdapter(Adapter): def __init__(self): self.label = 'test node' - - super(TestAdapter, self).__init__() + super().__init__() adapter = TestAdapter() @@ -50,13 +57,11 @@ def __init__(self): assert adapter.file_prefix == 'TestGene' -@patch('builtins.open', new_callable=mock_open, read_data=MOCK_TEST_EDGE) -def test_adapter_ingests_config_file_for_edges(mock_op): +def test_adapter_ingests_config_file_for_edges(mock_edge_config): class TestAdapter(Adapter): def __init__(self): self.label = 'test edge' - - super(TestAdapter, self).__init__() + super().__init__() adapter = TestAdapter() @@ -67,120 +72,107 @@ def __init__(self): assert adapter.file_prefix == 'CORRELATION' -@patch('adapters.ArangoDB') -@patch('builtins.open', new_callable=mock_open, read_data=MOCK_TEST_NODE) -def test_adapter_generate_arangodb_import_sts_per_chr(mock_op, mock_arango): - with patch('glob.glob', return_value=['file1', 'file2']) as mock_glob: - class TestAdapter(Adapter): - def __init__(self): - self.label = 'test node' - self.file_prefix = 'test_prefix' - self.chr = 'chr1' - - super(TestAdapter, self).__init__() - - def process_file(self): - return 'Test file processed' +def test_adapter_generate_arangodb_import_sts_per_chr(mock_node_config, mocker): + mock_arango = mocker.patch('adapters.ArangoDB') + mock_glob = mocker.patch('glob.glob', return_value=['file1', 'file2']) - adapter = TestAdapter() + class TestAdapter(Adapter): + def __init__(self): + self.label = 'test node' + self.file_prefix = 'test_prefix' + self.chr = 'chr1' + super().__init__() - adapter.arangodb() + def process_file(self): + return 'Test file processed' - 
mock_arango().generate_import_statement.assert_called_with( - OUTPUT_PATH + adapter.file_prefix + '-header.csv', - mock_glob.return_value, - 'test_collection_chr1', - 'node', - True - ) + adapter = TestAdapter() + adapter.arangodb() + mock_arango().generate_import_statement.assert_called_with( + OUTPUT_PATH + adapter.file_prefix + '-header.csv', + mock_glob.return_value, + 'test_collection_chr1', + 'node', + True + ) -@patch('adapters.ArangoDB') -@patch('builtins.open', new_callable=mock_open, read_data=MOCK_TEST_EDGE) -def test_adapter_generate_arangodb_import_sts(mock_op, mock_arango): - with patch('glob.glob', return_value=['file1', 'file2']) as mock_glob: - class TestAdapter(Adapter): - def __init__(self): - self.label = 'test edge' - self.file_prefix = 'test_prefix' - super(TestAdapter, self).__init__() +def test_adapter_generate_arangodb_import_sts(mock_edge_config, mocker): + mock_arango = mocker.patch('adapters.ArangoDB') + mock_glob = mocker.patch('glob.glob', return_value=['file1', 'file2']) - def process_file(self): - return 'Test file processed' + class TestAdapter(Adapter): + def __init__(self): + self.label = 'test edge' + self.file_prefix = 'test_prefix' + super().__init__() - adapter = TestAdapter() + def process_file(self): + return 'Test file processed' - adapter.arangodb() + adapter = TestAdapter() + adapter.arangodb() - mock_arango().generate_import_statement.assert_called_with( - OUTPUT_PATH + adapter.file_prefix + '-header.csv', - mock_glob.return_value, - 'test_collection_edges', - 'edge', - True - ) + mock_arango().generate_import_statement.assert_called_with( + OUTPUT_PATH + adapter.file_prefix + '-header.csv', + mock_glob.return_value, + 'test_collection_edges', + 'edge', + True + ) -@patch('builtins.open', new_callable=mock_open, read_data=MOCK_TEST_NODE) -def test_adapter_has_indexes(mock_op): +def test_adapter_has_indexes(mock_node_config): class TestAdapter(Adapter): def __init__(self): self.label = 'test node' - super(TestAdapter, self).__init__() + super().__init__() adapter = TestAdapter() - assert adapter.has_indexes() == True -@patch('builtins.open', new_callable=mock_open, read_data=MOCK_TEST_EDGE) -def test_adapter_doesnt_have_indexes(mock_op): +def test_adapter_doesnt_have_indexes(mock_edge_config): class TestAdapter(Adapter): def __init__(self): self.label = 'test edge' - super(TestAdapter, self).__init__() + super().__init__() adapter = TestAdapter() - assert adapter.has_indexes() == False -@patch('builtins.open', new_callable=mock_open, read_data=MOCK_TEST_EDGE) -def test_adapter_doesnt_create_indexes_if_not_set(mock_op, capfd): +def test_adapter_doesnt_create_indexes_if_not_set(mock_edge_config, capsys): class TestAdapter(Adapter): def __init__(self): self.label = 'test edge' - super(TestAdapter, self).__init__() + super().__init__() adapter = TestAdapter() - adapter.create_indexes() - out, err = capfd.readouterr() - - assert out == 'No indexes registered in {} config\n'.format( - adapter.collection) + captured = capsys.readouterr() + err_msg = 'No indexes registered in ' + adapter.collection + ' config\n' + assert captured.out == err_msg -@patch('adapters.ArangoDB') -@patch('builtins.open', new_callable=mock_open, read_data=MOCK_TEST_NODE) -def test_adapter_creates_indexes(mock_op, mock_arango): - with patch('glob.glob', return_value=['file1', 'file2']) as mock_glob: - class TestAdapter(Adapter): - def __init__(self): - self.label = 'test node' - super(TestAdapter, self).__init__() +def test_adapter_creates_indexes(mock_node_config, mocker): + 
mock_arango = mocker.patch('adapters.ArangoDB') + + class TestAdapter(Adapter): + def __init__(self): + self.label = 'test node' + super().__init__() + + adapter = TestAdapter() + indexes = adapter.schema_config['db_indexes'] + index = next(iter(indexes)) + + adapter.create_indexes() + + mock_arango().create_index.assert_called_with( + adapter.collection, + indexes[index]['type'], + indexes[index]['fields'] + ) diff --git a/data/tests/test_biogrid_gene_gene.py b/data/tests/test_biogrid_gene_gene.py new file mode 100644 index 00000000..8b916639 --- /dev/null +++ b/data/tests/test_biogrid_gene_gene.py @@ -0,0 +1,31 @@ +import json + +from adapters.biogrid_gene_gene_adapter import GeneGeneBiogrid +from adapters.writer import SpyWriter + + +def test_biogrid_gene_gene_adapter_gene_gene_biogrid(): + writer = SpyWriter() + adapter = GeneGeneBiogrid( + filepath='./samples/merged_PPI.UniProt.example.csv', label='biogrid_gene_gene', writer=writer) + adapter.process_file() + first_item = json.loads(writer.contents[0]) + assert len(writer.contents) == 4 + assert len(first_item) == 15 + assert first_item['source'] == 'BioGRID' + assert first_item['confidence_value_biogrid:long'] is None + assert first_item['interaction_type'] == [ + 'negative genetic interaction (sensu BioGRID)'] + + +def test_biogrid_gene_gene_adapter_mouse_gene_gene_biogrid(): + writer = SpyWriter() + adapter = GeneGeneBiogrid(filepath='./samples/merged_PPI_mouse.UniProt.example.csv', + label='mouse_gene_gene_biogrid', writer=writer) + adapter.process_file() + first_item = json.loads(writer.contents[0]) + assert len(writer.contents) == 14 + assert len(first_item) == 15 + assert first_item['source'] == 'BioGRID' + assert first_item['interaction_type'] == [ + 'positive genetic interaction (sensu BioGRID)'] diff --git a/data/tests/test_ccre_adapter.py b/data/tests/test_ccre_adapter.py new file mode 100644 index 00000000..155dcc61 --- /dev/null +++ b/data/tests/test_ccre_adapter.py @@ -0,0 +1,28 @@ +import json + +from adapters.ccre_adapter import CCRE +from adapters.writer import SpyWriter + + +def test_ccre_adapter(): + writer = SpyWriter() + adapter = CCRE(filepath='./samples/ccre_example.bed.gz', + label='regulatory_region', writer=writer) + adapter.process_file() + assert len(writer.contents) == 5510 + first_item = json.loads(writer.contents[0]) + assert first_item['_key'] == 'EH38E4255188' + assert first_item['chr'] == 'chr20' + assert first_item['source_url'].startswith( + 'https://www.encodeproject.org/files/') + + +def test_ccre_adapter_initialization(): + adapter = CCRE(filepath='./samples/ccre_example.bed.gz', + label='custom_label') + assert adapter.filepath == './samples/ccre_example.bed.gz' + assert adapter.label == 'custom_label' + assert adapter.dataset == 'custom_label' + assert adapter.source_url.startswith( + 'https://www.encodeproject.org/files/') + assert adapter.type == 'node' diff --git a/data/tests/test_cellosaurus_ontology_adapter.py b/data/tests/test_cellosaurus_ontology_adapter.py new file mode 100644 index 00000000..a04bf5cf --- /dev/null +++ b/data/tests/test_cellosaurus_ontology_adapter.py @@ -0,0 +1,70 @@ +import json + +from adapters.cellosaurus_ontology_adapter import Cellosaurus +from adapters.writer import SpyWriter + + +def 
test_cellosaurus_adapter_node(): + writer = SpyWriter() + adapter = Cellosaurus(filepath='./samples/cellosaurus_example.obo.txt', + type='node', writer=writer) + adapter.process_file() + assert len(writer.contents) > 0 + first_item = json.loads(writer.contents[0]) + assert first_item['_key'].startswith('CVCL_') + assert 'name' in first_item + assert 'uri' in first_item + assert first_item['source'] == 'Cellosaurus' + + +def test_cellosaurus_adapter_edge(): + writer = SpyWriter() + adapter = Cellosaurus(filepath='./samples/cellosaurus_example.obo.txt', + type='edge', writer=writer) + adapter.process_file() + assert len(writer.contents) > 0 + first_item = json.loads(writer.contents[0]) + assert '_key' in first_item + assert '_from' in first_item + assert '_to' in first_item + assert 'name' in first_item + assert 'inverse_name' in first_item + assert first_item['source'] == 'Cellosaurus' + + +def test_cellosaurus_adapter_species_filter(): + writer = SpyWriter() + adapter = Cellosaurus(filepath='./samples/cellosaurus_example.obo.txt', + type='node', species_filter=True, writer=writer) + adapter.process_file() + filtered_count = len(writer.contents) + + writer = SpyWriter() + adapter = Cellosaurus(filepath='./samples/cellosaurus_example.obo.txt', + type='node', species_filter=False, writer=writer) + adapter.process_file() + unfiltered_count = len(writer.contents) + + assert filtered_count < unfiltered_count + + +def test_cellosaurus_adapter_to_key(): + adapter = Cellosaurus(filepath='./samples/cellosaurus_example.obo.txt') + assert adapter.to_key('NCBI_TaxID:9606') == 'NCBITaxon_9606' + assert adapter.to_key('ORDO:Orphanet_102') == 'Orphanet_102' + assert adapter.to_key('CLO:CLO_0001001') == 'CLO_0001001' + + +def test_cellosaurus_adapter_initialization(): + adapter = Cellosaurus( + filepath='./samples/cellosaurus_example.obo.txt', type='node') + assert adapter.filepath == './samples/cellosaurus_example.obo.txt' + assert adapter.type == 'node' + assert adapter.dataset == 'ontology_term' + assert adapter.label == 'ontology_term' + + adapter = Cellosaurus( + filepath='./samples/cellosaurus_example.obo.txt', type='edge') + assert adapter.type == 'edge' + assert adapter.dataset == 'ontology_relationship' + assert adapter.label == 'ontology_relationship' diff --git a/data/tests/test_clingen_variant_disease_adapter.py b/data/tests/test_clingen_variant_disease_adapter.py new file mode 100644 index 00000000..ae5f2f5d --- /dev/null +++ b/data/tests/test_clingen_variant_disease_adapter.py @@ -0,0 +1,63 @@ +import json + +from adapters.clingen_variant_disease_adapter import ClinGen +from adapters.writer import SpyWriter + + +def test_clingen_adapter_variant_disease(): + writer = SpyWriter() + adapter = ClinGen(filepath='./samples/clinGen_variant_pathogenicity_example.csv', + label='variant_disease', writer=writer) + adapter.process_file() + + assert len(writer.contents) > 0 + first_item = json.loads(writer.contents[0]) + + assert '_key' in first_item + assert '_from' in first_item + assert '_to' in first_item + assert first_item['name'] == 'associated with' + assert first_item['inverse_name'] == 'associated with' + assert 'gene_id' in first_item + assert 'assertion' in first_item + assert 'pmids' in first_item + assert first_item['source'] == 'ClinGen' + assert first_item['source_url'] == 'https://search.clinicalgenome.org/kb/downloads' + + +def test_clingen_adapter_variant_disease_gene(): + writer = SpyWriter() + adapter = ClinGen(filepath='./samples/clinGen_variant_pathogenicity_example.csv', + 
+                    label='variant_disease_gene', writer=writer)
+    adapter.process_file()
+
+    assert len(writer.contents) > 0
+    first_item = json.loads(writer.contents[0])
+
+    assert '_key' in first_item
+    assert '_from' in first_item
+    assert '_to' in first_item
+    assert first_item['name'] == 'associated with'
+    assert first_item['inverse_name'] == 'associated with'
+    assert 'inheritance_mode' in first_item
+    assert first_item['source'] == 'ClinGen'
+    assert first_item['source_url'] == 'https://search.clinicalgenome.org/kb/downloads'
+
+
+def test_clingen_adapter_invalid_label():
+    try:
+        ClinGen(filepath='./samples/clinGen_variant_pathogenicity_example.csv',
+                label='invalid_label')
+    except ValueError as e:
+        assert str(e).startswith('Invalid label. Allowed values:')
+    else:
+        assert False, 'Expected ValueError was not raised'
+
+
+def test_clingen_adapter_initialization():
+    adapter = ClinGen(
+        filepath='./samples/clinGen_variant_pathogenicity_example.csv', label='variant_disease')
+    assert adapter.filepath == './samples/clinGen_variant_pathogenicity_example.csv'
+    assert adapter.label == 'variant_disease'
+    assert adapter.dataset == 'variant_disease'
+    assert adapter.type == 'edge'
diff --git a/data/tests/test_coexpresdb_adapter.py b/data/tests/test_coexpresdb_adapter.py
new file mode 100644
index 00000000..c361747e
--- /dev/null
+++ b/data/tests/test_coexpresdb_adapter.py
@@ -0,0 +1,44 @@
+import json
+
+from adapters.coxpresdb_adapter import Coxpresdb
+from adapters.writer import SpyWriter
+
+
+def test_coxpresdb_adapter():
+    writer = SpyWriter()
+    adapter = Coxpresdb(filepath='./samples/coxpresdb/1', writer=writer)
+    adapter.process_file()
+
+    assert len(writer.contents) > 0
+    first_item = json.loads(writer.contents[0])
+
+    assert '_key' in first_item
+    assert '_from' in first_item
+    assert '_to' in first_item
+    assert 'z_score' in first_item
+    assert first_item['source'] == 'CoXPresdb'
+    assert first_item['source_url'] == 'https://coxpresdb.jp/'
+    assert first_item['name'] == 'coexpressed with'
+    assert first_item['inverse_name'] == 'coexpressed with'
+    assert first_item['associated process'] == 'ontology_terms/GO_0010467'
+
+
+def test_coxpresdb_adapter_z_score_filter():
+    writer = SpyWriter()
+    adapter = Coxpresdb(filepath='./samples/coxpresdb/1', writer=writer)
+    adapter.process_file()
+
+    for item in writer.contents:
+        if item.startswith('{'):
+            data = json.loads(item)
+            assert abs(float(data['z_score'])) >= 3
+
+
+def test_coxpresdb_adapter_initialization():
+    adapter = Coxpresdb(filepath='foobarbaz')
+    assert adapter.file_path == 'foobarbaz'
+    assert adapter.dataset == 'coxpresdb'
+    assert adapter.label == 'coxpresdb'
+    assert adapter.source == 'CoXPresdb'
+    assert adapter.source_url == 'https://coxpresdb.jp/'
+    assert adapter.type == 'edge'
diff --git a/data/tests/test_dbSNFP_adapter.py b/data/tests/test_dbSNFP_adapter.py
new file mode 100644
index 00000000..094cce38
--- /dev/null
+++ b/data/tests/test_dbSNFP_adapter.py
@@ -0,0 +1,87 @@
+import json
+from adapters.dbSNFP_adapter import DbSNFP
+from adapters.writer import SpyWriter
+
+
+def test_dbSNFP_adapter_coding_variants():
+    writer = SpyWriter()
+    adapter = DbSNFP(
+        filepath='./samples/dbNSFP4.5a_variant.chrY_sample', writer=writer)
+    adapter.process_file()
+
+    assert len(writer.contents) > 1
+    first_item = json.loads(writer.contents[0])
+
+    assert '_key' in first_item
+    assert 'name' in first_item
+    assert 'gene_name' in first_item
+    assert 'transcript_id' in first_item
+    assert 'source' in first_item
+    assert first_item['source'] == 'dbSNFP 4.5a'
+
+
+def test_dbSNFP_adapter_variants_coding_variants():
+    writer = SpyWriter()
+    adapter = DbSNFP(filepath='./samples/dbNSFP4.5a_variant.chrY_sample',
+                     collection='variants_coding_variants', writer=writer)
+    adapter.process_file()
+
+    assert len(writer.contents) > 1
+    first_item = json.loads(writer.contents[0])
+
+    assert '_from' in first_item
+    assert '_to' in first_item
+    assert 'name' in first_item
+    assert 'inverse_name' in first_item
+    assert 'source' in first_item
+    assert first_item['source'] == 'dbSNFP 4.5a'
+
+
+def test_dbSNFP_adapter_coding_variants_proteins():
+    writer = SpyWriter()
+    adapter = DbSNFP(filepath='./samples/dbNSFP4.5a_variant.chrY_sample',
+                     collection='coding_variants_proteins', writer=writer)
+    adapter.process_file()
+
+    assert len(writer.contents) > 1
+    first_item = json.loads(writer.contents[0])
+
+    assert '_from' in first_item
+    assert '_to' in first_item
+    assert 'name' in first_item
+    assert 'inverse_name' in first_item
+    assert 'type' in first_item
+    assert 'source' in first_item
+    assert first_item['source'] == 'dbSNFP 4.5a'
+
+
+def test_dbSNFP_adapter_multiple_records():
+    adapter = DbSNFP(filepath='./samples/dbNSFP4.5a_variant.chrY_sample')
+    data_line = ['Y', '2786989', 'C', 'A', 'X', 'Y', '.', 'Y', '2655030', 'Y', '2715030', '205;206', 'SRY;SRY',
+                 'ENSG00000184895;ENSG00000184895', 'ENST00000383070;ENST00000383070', 'ENSP00000372547;ENSP00000372547']
+
+    assert adapter.multiple_records(data_line) == True
+
+
+def test_dbSNFP_adapter_breakdown_line():
+    adapter = DbSNFP(filepath='./samples/dbNSFP4.5a_variant.chrY_sample')
+    original_data_line = ['Y', '2786989', 'C', 'A', 'X', 'Y', '.', 'Y', '2655030', 'Y', '2715030', '205;206', 'SRY;SRY',
+                          'ENSG00000184895;ENSG00000184895', 'ENST00000383070;ENST00000383070', 'ENSP00000372547;ENSP00000372547']
+
+    broken_down_lines = adapter.breakdown_line(original_data_line)
+
+    assert len(broken_down_lines) == 2
+    assert broken_down_lines[0][11] == '205'
+    assert broken_down_lines[1][11] == '206'
+    assert broken_down_lines[0][12] == 'SRY'
+    assert broken_down_lines[1][12] == 'SRY'
+
+
+def test_dbSNFP_adapter_initialization():
+    adapter = DbSNFP(filepath='./samples/dbNSFP4.5a_variant.chrY_sample',
+                     collection='custom_collection')
+
+    assert adapter.filepath == './samples/dbNSFP4.5a_variant.chrY_sample'
+    assert adapter.label == 'dbSNFP_protein_variants'
+    assert adapter.dataset == 'dbSNFP_protein_variants'
+    assert adapter.collection_name == 'custom_collection'
diff --git a/data/tests/test_depmap_adapter.py b/data/tests/test_depmap_adapter.py
new file mode 100644
index 00000000..c6031500
--- /dev/null
+++ b/data/tests/test_depmap_adapter.py
@@ -0,0 +1,81 @@
+import json
+from adapters.depmap_adapter import DepMap
+from adapters.writer import SpyWriter
+
+
+def test_depmap_adapter_process_file():
+    writer = SpyWriter()
+    adapter = DepMap(
+        filepath='./samples/DepMap/CRISPRGeneDependency_transposed_example.csv',
+        type='edge',
+        label='gene_term',
+        writer=writer
+    )
+    adapter.process_file()
+
+    assert len(writer.contents) > 1, 'No records were parsed.'
+    first_item = json.loads(writer.contents[0])
+
+    # Check for presence of essential keys
+    expected_keys = [
+        '_key', '_from', '_to', 'biology_context',
+        'model_id', 'model_type', 'cancer_term',
+        'gene_dependency', 'source', 'source_url',
+        'source_file', 'name', 'inverse_name'
+    ]
+    for key in expected_keys:
+        assert key in first_item, f'Missing key: {key}'
+
+    # Additional specific assertions
+    assert first_item['source'] == 'DepMap'
+    assert first_item['source_url'] == 'https://depmap.org/portal/'
+    assert first_item['source_file'] == 'CRISPRGeneDependency.csv'
+    assert first_item['name'] == 'essential in'
+    assert first_item['inverse_name'] == 'dependent on'
+
+
+def test_depmap_adapter_initialization():
+    adapter = DepMap(
+        filepath='./samples/DepMap/CRISPRGeneDependency_transposed_example.csv',
+        type='edge',
+        label='depmap'
+    )
+    assert adapter.filepath == './samples/DepMap/CRISPRGeneDependency_transposed_example.csv'
+    assert adapter.type == 'edge'
+    assert adapter.label == 'depmap'
+    assert adapter.dataset == 'depmap'
+    assert adapter.dry_run == True
+    assert adapter.writer is None, 'Writer should be None by default.'
+
+
+def test_depmap_adapter_missing_gene_id_mapping():
+    writer = SpyWriter()
+    adapter = DepMap(
+        filepath='./samples/DepMap/CRISPRGeneDependency_transposed_example.csv',
+        type='edge',
+        label='depmap',
+        writer=writer
+    )
+    adapter.process_file()
+
+    assert len(
+        writer.contents) > 0, 'No records were parsed despite missing gene mappings.'
+    first_item = json.loads(writer.contents[0])
+    assert 'gene_dependency' in first_item, "Record should contain 'gene_dependency'."
+    assert first_item['gene_dependency'] >= DepMap.CUTOFF, 'Dependency score below cutoff.'
+
+
+def test_depmap_adapter_dependency_cutoff():
+    writer = SpyWriter()
+    adapter = DepMap(
+        filepath='./samples/DepMap/CRISPRGeneDependency_transposed_example.csv',
+        type='edge',
+        label='depmap',
+        writer=writer
+    )
+    adapter.process_file()
+
+    first_item = json.loads(writer.contents[0])
+    assert first_item['gene_dependency'] >= DepMap.CUTOFF, (
+        f"Dependency score {first_item['gene_dependency']} below cutoff."
+    )
diff --git a/data/tests/test_ebi_complex_adapter.py b/data/tests/test_ebi_complex_adapter.py
new file mode 100644
index 00000000..03a79d51
--- /dev/null
+++ b/data/tests/test_ebi_complex_adapter.py
@@ -0,0 +1,88 @@
+import json
+import pytest
+from adapters.ebi_complex_adapter import EBIComplex
+from adapters.writer import SpyWriter
+
+
+def test_ebi_complex_initialization():
+    sample_filepath = './samples/EBI_complex_example.tsv'
+    for label in EBIComplex.ALLOWED_LABELS:
+        writer = SpyWriter()
+        adapter = EBIComplex(sample_filepath, label=label, writer=writer)
+        assert adapter.filepath == sample_filepath
+        assert adapter.label == label
+        assert adapter.dataset == label
+        assert adapter.dry_run == True
+        assert adapter.writer == writer
+
+        if label == 'complex':
+            assert adapter.type == 'node'
+        else:
+            assert adapter.type == 'edge'
+
+
+def test_ebi_complex_invalid_label():
+    sample_filepath = './samples/EBI_complex_example.tsv'
+    writer = SpyWriter()
+    with pytest.raises(ValueError, match='Invalid label. Allowed values: complex,complex_protein,complex_term'):
+        EBIComplex(sample_filepath, label='invalid_label', writer=writer)
+
+
+def test_ebi_complex_process_file():
+    sample_filepath = './samples/EBI_complex_example.tsv'
+    for label in EBIComplex.ALLOWED_LABELS:
+        writer = SpyWriter()
+        adapter = EBIComplex(sample_filepath, label=label, writer=writer)
+        adapter.process_file()
+
+        # Check that some data was written
+        assert len(writer.contents) > 0
+
+        # Check the structure of the first item
+        first_item = json.loads(writer.contents[0])
+        if label == 'complex':
+            assert '_key' in first_item
+            assert 'name' in first_item
+        elif label == 'complex_protein':
+            assert '_key' in first_item
+            assert '_from' in first_item
+            assert '_to' in first_item
+        elif label == 'complex_term':
+            assert '_key' in first_item
+            assert '_from' in first_item
+            assert '_to' in first_item
+
+
+def test_ebi_complex_get_chain_id():
+    adapter = EBIComplex('./samples/EBI_complex_example.tsv', label='complex')
+
+    assert adapter.get_chain_id('P12345') == None
+    assert adapter.get_chain_id('P12345-1') == None
+    assert adapter.get_chain_id('P12345-PRO_0000123456') == 'PRO_0000123456'
+
+
+def test_ebi_complex_get_isoform_id():
+    adapter = EBIComplex('./samples/EBI_complex_example.tsv', label='complex')
+
+    assert adapter.get_isoform_id('P12345') == None
+    assert adapter.get_isoform_id('P12345-1') == '1'
+    assert adapter.get_isoform_id('P12345-PRO_0000123456') == None
+
+
+def test_ebi_complex_load_linked_features_dict():
+    sample_filepath = './samples/EBI_complex_example.tsv'
+    writer = SpyWriter()
+    adapter = EBIComplex(
+        sample_filepath, label='complex_protein', writer=writer)
+    adapter.load_linked_features_dict()
+    assert hasattr(adapter, 'linked_features_dict')
+    assert isinstance(adapter.linked_features_dict, dict)
+
+
+def test_ebi_complex_load_subontologies():
+    sample_filepath = './samples/EBI_complex_example.tsv'
+    writer = SpyWriter()
+    adapter = EBIComplex(sample_filepath, label='complex', writer=writer)
+    adapter.load_subontologies()
+    assert hasattr(adapter, 'subontologies')
+    assert isinstance(adapter.subontologies, dict)
diff --git a/data/tests/test_encode_E2G_CRISPR_adapter.py b/data/tests/test_encode_E2G_CRISPR_adapter.py
new file mode 100644
index 00000000..5bf291e6
--- /dev/null
+++ b/data/tests/test_encode_E2G_CRISPR_adapter.py
@@ -0,0 +1,83 @@
+import json
+import pytest
+from adapters.encode_E2G_CRISPR_adapter import ENCODE2GCRISPR
+from adapters.writer import SpyWriter
+
+
+def test_encode2gcrispr_adapter_regulatory_region():
+    writer = SpyWriter()
+    adapter = ENCODE2GCRISPR(
+        filepath='./samples/ENCODE_E2G_CRISPR_example.tsv', label='regulatory_region', writer=writer)
+    adapter.process_file()
+    assert len(writer.contents) > 0
+    first_item = json.loads(writer.contents[0])
+    assert '_key' in first_item
+    assert 'chr' in first_item
+    assert 'start' in first_item
+    assert 'end' in first_item
+    assert 'type' in first_item
+    assert first_item['source'] == ENCODE2GCRISPR.SOURCE
+    assert first_item['source_url'] == ENCODE2GCRISPR.SOURCE_URL
+
+
+def test_encode2gcrispr_adapter_regulatory_region_gene():
+    writer = SpyWriter()
+    adapter = ENCODE2GCRISPR(filepath='./samples/ENCODE_E2G_CRISPR_example.tsv',
+                             label='regulatory_region_gene', writer=writer)
+    adapter.process_file()
+    assert len(writer.contents) > 0
+    first_item = json.loads(writer.contents[0])
+    assert '_key' in first_item
+    assert '_from' in first_item
+    assert '_to' in first_item
+    assert 'score' in first_item
+    assert 'p_value' in first_item
+    assert 'log10pvalue' in first_item
+    assert 'significant' in first_item
+    assert first_item['source'] == ENCODE2GCRISPR.SOURCE
+    assert first_item['source_url'] == ENCODE2GCRISPR.SOURCE_URL
+    assert first_item['biological_context'] == f'ontology_terms/{ENCODE2GCRISPR.BIOLOGICAL_CONTEXT}'
+
+
+def test_encode2gcrispr_adapter_invalid_label():
+    writer = SpyWriter()
+    with pytest.raises(ValueError, match='Invalid label. Allowed values: regulatory_region,regulatory_region_gene'):
+        ENCODE2GCRISPR(filepath='./samples/ENCODE_E2G_CRISPR_example.tsv',
+                       label='invalid_label', writer=writer)
+
+
+def test_encode2gcrispr_adapter_initialization():
+    writer = SpyWriter()
+    for label in ENCODE2GCRISPR.ALLOWED_LABELS:
+        adapter = ENCODE2GCRISPR(
+            filepath='./samples/ENCODE_E2G_CRISPR_example.tsv', label=label, writer=writer)
+        assert adapter.filepath == './samples/ENCODE_E2G_CRISPR_example.tsv'
+        assert adapter.label == label
+        assert adapter.dataset == label
+        assert adapter.dry_run == True
+        assert adapter.writer == writer
+
+        if label == 'regulatory_region':
+            assert adapter.type == 'node'
+        else:
+            assert adapter.type == 'edge'
+
+
+def test_encode2gcrispr_adapter_load_regulatory_region():
+    writer = SpyWriter()
+    adapter = ENCODE2GCRISPR(
+        filepath='./samples/ENCODE_E2G_CRISPR_example.tsv', label='regulatory_region', writer=writer)
+    adapter.load_regulatory_region()
+    assert hasattr(adapter, 'regulatory_region_nodes')
+    assert isinstance(adapter.regulatory_region_nodes, dict)
+    assert len(adapter.regulatory_region_nodes) > 0
+
+
+def test_encode2gcrispr_adapter_load_gene_id_mapping():
+    writer = SpyWriter()
+    adapter = ENCODE2GCRISPR(filepath='./samples/ENCODE_E2G_CRISPR_example.tsv',
+                             label='regulatory_region_gene', writer=writer)
+    adapter.load_gene_id_mapping()
+    assert hasattr(adapter, 'gene_id_mapping')
+    assert isinstance(adapter.gene_id_mapping, dict)
+    assert len(adapter.gene_id_mapping) > 0
diff --git a/data/tests/test_encode_caqtl_adapter.py b/data/tests/test_encode_caqtl_adapter.py
new file mode 100644
index 00000000..c80552f6
--- /dev/null
+++ b/data/tests/test_encode_caqtl_adapter.py
@@ -0,0 +1,58 @@
+import json
+import pytest
+from adapters.encode_caqtl_adapter import CAQtl
+from adapters.writer import SpyWriter
+
+
+def test_caqtl_adapter_regulatory_region():
+    writer = SpyWriter()
+    adapter = CAQtl(filepath='./samples/caqtl-sample.bed',
+                    source='PMID:34017130', label='regulatory_region', writer=writer)
+    adapter.process_file()
+    assert len(writer.contents) > 0
+    first_item = json.loads(writer.contents[0])
+    assert '_key' in first_item
+    assert 'chr' in first_item
+    assert 'start' in first_item
+    assert 'end' in first_item
+    assert first_item['type'] == 'accessible dna elements'
+
+
+def test_caqtl_adapter_encode_caqtl():
+    writer = SpyWriter()
+    adapter = CAQtl(filepath='./samples/caqtl-sample.bed',
+                    source='PMID:34017130', label='encode_caqtl', writer=writer)
+    adapter.process_file()
+    assert len(writer.contents) > 0
+    first_item = json.loads(writer.contents[0])
+    assert '_key' in first_item
+    assert '_from' in first_item
+    assert '_to' in first_item
+    assert first_item['label'] == 'caQTL'
+    assert first_item['name'] == 'associates with'
+    assert first_item['inverse_name'] == 'associates with'
+
+
+def test_caqtl_adapter_invalid_label():
+    writer = SpyWriter()
+    with pytest.raises(ValueError, match='Invalid label. Allowed values: regulatory_region,encode_caqtl'):
+        CAQtl(filepath='./samples/caqtl-sample.bed',
+              source='PMID:34017130', label='invalid_label', writer=writer)
+
+
+def test_caqtl_adapter_initialization():
+    writer = SpyWriter()
+    for label in CAQtl.ALLOWED_LABELS:
+        adapter = CAQtl(filepath='./samples/caqtl-sample.bed',
+                        source='PMID:34017130', label=label, writer=writer)
+        assert adapter.filepath == './samples/caqtl-sample.bed'
+        assert adapter.label == label
+        assert adapter.dataset == label
+        assert adapter.source == 'PMID:34017130'
+        assert adapter.dry_run == True
+        assert adapter.writer == writer
+
+        if label == 'regulatory_region':
+            assert adapter.type == 'node'
+        else:
+            assert adapter.type == 'edge'
diff --git a/data/tests/test_encode_mpra_adapter.py b/data/tests/test_encode_mpra_adapter.py
new file mode 100644
index 00000000..d39c5c31
--- /dev/null
+++ b/data/tests/test_encode_mpra_adapter.py
@@ -0,0 +1,80 @@
+import json
+import pytest
+from adapters.encode_mpra_adapter import EncodeMPRA
+from adapters.writer import SpyWriter
+
+
+def test_encode_mpra_adapter_regulatory_region():
+    writer = SpyWriter()
+    adapter = EncodeMPRA(filepath='./samples/MPRA_ENCFF802FUV_example.bed.gz',
+                         label='regulatory_region',
+                         source_url='https://www.encodeproject.org/files/ENCFF802FUV/',
+                         biological_context='EFO_0002067',
+                         writer=writer)
+    adapter.process_file()
+    assert len(writer.contents) > 0
+    first_item = json.loads(writer.contents[0])
+    assert '_key' in first_item
+    assert 'chr' in first_item
+    assert 'start' in first_item
+    assert 'end' in first_item
+    assert first_item['type'] == 'MPRA_tested_regulatory_element'
+    assert first_item['source'] == EncodeMPRA.SOURCE
+    assert first_item['source_url'] == 'https://www.encodeproject.org/files/ENCFF802FUV/'
+
+
+def test_encode_mpra_adapter_regulatory_region_biosample():
+    writer = SpyWriter()
+    adapter = EncodeMPRA(filepath='./samples/MPRA_ENCFF802FUV_example.bed.gz',
+                         label='regulatory_region_biosample',
+                         source_url='https://www.encodeproject.org/files/ENCFF802FUV/',
+                         biological_context='EFO_0002067',
+                         writer=writer)
+    adapter.process_file()
+    assert len(writer.contents) > 0
+    first_item = json.loads(writer.contents[0])
+    assert '_key' in first_item
+    assert '_from' in first_item
+    assert '_to' in first_item
+    assert first_item['type'] == 'MPRA_expression_tested'
+    assert 'element_name' in first_item
+    assert 'strand' in first_item
+    assert 'activity_score' in first_item
+    assert 'bed_score' in first_item
+    assert 'DNA_count' in first_item
+    assert 'RNA_count' in first_item
+    assert first_item['source'] == EncodeMPRA.SOURCE
+    assert first_item['source_url'] == 'https://www.encodeproject.org/files/ENCFF802FUV/'
+
+
+def test_encode_mpra_adapter_invalid_label():
+    writer = SpyWriter()
+    with pytest.raises(ValueError, match='Invalid label. Allowed values: regulatory_region,regulatory_region_biosample'):
+        EncodeMPRA(filepath='./samples/MPRA_ENCFF802FUV_example.bed.gz',
+                   label='invalid_label',
+                   source_url='https://www.encodeproject.org/files/ENCFF802FUV/',
+                   biological_context='EFO_0002067',
+                   writer=writer)
+
+
+def test_encode_mpra_adapter_initialization():
+    writer = SpyWriter()
+    for label in EncodeMPRA.ALLOWED_LABELS:
+        adapter = EncodeMPRA(filepath='./samples/MPRA_ENCFF802FUV_example.bed.gz',
+                             label=label,
+                             source_url='https://www.encodeproject.org/files/ENCFF802FUV/',
+                             biological_context='EFO_0002067',
+                             writer=writer)
+        assert adapter.filepath == './samples/MPRA_ENCFF802FUV_example.bed.gz'
+        assert adapter.label == label
+        assert adapter.dataset == label
+        assert adapter.source_url == 'https://www.encodeproject.org/files/ENCFF802FUV/'
+        assert adapter.file_accession == 'ENCFF802FUV'
+        assert adapter.biological_context == 'EFO_0002067'
+        assert adapter.dry_run == True
+        assert adapter.writer == writer
+
+        if label == 'regulatory_region':
+            assert adapter.type == 'node'
+        else:
+            assert adapter.type == 'edge'
diff --git a/data/tests/test_gaf_adapter.py b/data/tests/test_gaf_adapter.py
new file mode 100644
index 00000000..45e12792
--- /dev/null
+++ b/data/tests/test_gaf_adapter.py
@@ -0,0 +1,76 @@
+import json
+import pytest
+from adapters.gaf_adapter import GAF
+from adapters.writer import SpyWriter
+
+
+def test_gaf_adapter_human():
+    writer = SpyWriter()
+    adapter = GAF(filepath='./samples/goa_human_sample.gaf.gz',
+                  gaf_type='human', writer=writer)
+    adapter.process_file()
+    assert len(writer.contents) > 0
+    first_item = json.loads(writer.contents[0])
+    assert '_key' in first_item
+    assert '_from' in first_item
+    assert '_to' in first_item
+    assert first_item['organism'] == 'Homo sapiens'
+    assert first_item['source'] == 'Gene Ontology'
+    assert first_item['source_url'] == GAF.SOURCES['human']
+
+
+def test_gaf_adapter_mouse():
+    writer = SpyWriter()
+    adapter = GAF(filepath='./samples/mgi_sample.gaf.gz',
+                  gaf_type='mouse', writer=writer)
+    adapter.process_file()
+    assert len(writer.contents) > 0
+    first_item = json.loads(writer.contents[0])
+    assert '_key' in first_item
+    assert '_from' in first_item
+    assert '_to' in first_item
+    assert first_item['organism'] == 'Mus musculus'
+    assert first_item['source'] == 'Gene Ontology'
+    assert first_item['source_url'] == GAF.SOURCES['mouse']
+
+
+def test_gaf_adapter_rna():
+    writer = SpyWriter()
+    adapter = GAF(filepath='./samples/goa_human_rna.gaf.gz',
+                  gaf_type='rna', writer=writer)
+    adapter.process_file()
+    assert len(writer.contents) > 0
+    first_item = json.loads(writer.contents[0])
+    assert '_key' in first_item
+    assert '_from' in first_item
+    assert '_to' in first_item
+    assert first_item['organism'] == 'Homo sapiens'
+    assert first_item['source'] == 'Gene Ontology'
+    assert first_item['source_url'] == GAF.SOURCES['rna']
+
+
+def test_gaf_adapter_invalid_type():
+    writer = SpyWriter()
+    with pytest.raises(ValueError, match='Invalid type. Allowed values: human, human_isoform, mouse, rna, rnacentral'):
+        GAF(filepath='./samples/goa_human_sample.gaf.gz',
+            gaf_type='invalid_type', writer=writer)
+
+
+def test_gaf_adapter_load_rnacentral_mapping():
+    writer = SpyWriter()
+    adapter = GAF(filepath='./samples/goa_human_rna.gaf.gz',
+                  gaf_type='rna', writer=writer)
+    adapter.load_rnacentral_mapping()
+    assert hasattr(adapter, 'rnacentral_mapping')
+    assert isinstance(adapter.rnacentral_mapping, dict)
+    assert len(adapter.rnacentral_mapping) > 0
+
+
+def test_gaf_adapter_load_mouse_mgi_to_uniprot():
+    writer = SpyWriter()
+    adapter = GAF(filepath='./samples/mgi_sample.gaf.gz',
+                  gaf_type='mouse', writer=writer)
+    adapter.load_mouse_mgi_to_uniprot()
+    assert hasattr(adapter, 'mouse_mgi_mapping')
+    assert isinstance(adapter.mouse_mgi_mapping, dict)
+    assert len(adapter.mouse_mgi_mapping) > 0
diff --git a/data/tests/test_gencode_adapter.py b/data/tests/test_gencode_adapter.py
new file mode 100644
index 00000000..6316c920
--- /dev/null
+++ b/data/tests/test_gencode_adapter.py
@@ -0,0 +1,105 @@
+import json
+import pytest
+from adapters.gencode_adapter import Gencode
+from adapters.writer import SpyWriter
+
+
+def test_gencode_adapter_transcript():
+    writer = SpyWriter()
+    adapter = Gencode(filepath='./samples/gencode_sample.gtf',
+                      label='gencode_transcript', writer=writer)
+    adapter.process_file()
+    assert len(writer.contents) > 0
+    first_item = json.loads(writer.contents[0])
+    assert '_key' in first_item
+    assert 'transcript_id' in first_item
+    assert 'name' in first_item
+    assert 'transcript_type' in first_item
+    assert 'chr' in first_item
+    assert 'start' in first_item
+    assert 'end' in first_item
+    assert 'gene_name' in first_item
+    assert first_item['source'] == 'GENCODE'
+    assert first_item['version'] == 'v43'
+    assert first_item['source_url'] == 'https://www.gencodegenes.org/human/'
+
+
+def test_gencode_adapter_transcribed_to():
+    writer = SpyWriter()
+    adapter = Gencode(filepath='./samples/gencode_sample.gtf',
+                      label='transcribed_to', writer=writer)
+    adapter.process_file()
+    assert len(writer.contents) > 0
+    first_item = json.loads(writer.contents[0])
+    assert '_key' in first_item
+    assert '_from' in first_item
+    assert '_to' in first_item
+    assert first_item['name'] == 'transcribes'
+    assert first_item['inverse_name'] == 'transcribed by'
+    assert first_item['biological_process'] == 'ontology_terms/GO_0010467'
+
+
+def test_gencode_adapter_transcribed_from():
+    writer = SpyWriter()
+    adapter = Gencode(filepath='./samples/gencode_sample.gtf',
+                      label='transcribed_from', writer=writer)
+    adapter.process_file()
+    assert len(writer.contents) > 0
+    first_item = json.loads(writer.contents[0])
+    assert '_key' in first_item
+    assert '_from' in first_item
+    assert '_to' in first_item
+    assert first_item['name'] == 'transcribed by'
+    assert first_item['inverse_name'] == 'transcribes'
+    assert first_item['biological_process'] == 'ontology_terms/GO_0010467'
+
+
+def test_gencode_adapter_mouse():
+    writer = SpyWriter()
+    adapter = Gencode(filepath='./samples/gencode_sample.gtf',
+                      label='mm_gencode_transcript', organism='MOUSE', writer=writer)
+    adapter.process_file()
+    assert len(writer.contents) > 0
+    first_item = json.loads(writer.contents[0])
+    assert '_key' in first_item
+    assert first_item['source'] == 'GENCODE'
+    assert first_item['version'] == 'vM33'
+    assert first_item['source_url'] == 'https://www.gencodegenes.org/mouse/'
+
+
+def test_gencode_adapter_invalid_label():
+    writer = SpyWriter()
+    with pytest.raises(ValueError,
+                       match='Invalid label. Allowed values: gencode_transcript,mm_gencode_transcript,transcribed_to,transcribed_from'):
+        Gencode(filepath='./samples/gencode_sample.gtf',
+                label='invalid_label', writer=writer)
+
+
+def test_gencode_adapter_initialization():
+    writer = SpyWriter()
+    for label in Gencode.ALLOWED_LABELS:
+        adapter = Gencode(filepath='./samples/gencode_sample.gtf',
+                          label=label, writer=writer)
+        assert adapter.filepath == './samples/gencode_sample.gtf'
+        assert adapter.label == label
+        assert adapter.dataset == label
+        assert adapter.dry_run == True
+        assert adapter.writer == writer
+
+        if label in ['gencode_transcript', 'mm_gencode_transcript']:
+            assert adapter.type == 'node'
+        else:
+            assert adapter.type == 'edge'
+
+
+def test_gencode_adapter_parse_info_metadata():
+    adapter = Gencode(filepath='./samples/gencode_sample.gtf',
+                      label='gencode_transcript')
+    info = ['gene_id', '"ENSG00000223972.5";', 'transcript_id', '"ENST00000456328.2";', 'gene_type', '"transcribed_unprocessed_pseudogene";',
+            'gene_name', '"DDX11L1";', 'transcript_type', '"processed_transcript";', 'transcript_name', '"DDX11L1-202";']
+    parsed_info = adapter.parse_info_metadata(info)
+    assert parsed_info['gene_id'] == 'ENSG00000223972.5'
+    assert parsed_info['transcript_id'] == 'ENST00000456328.2'
+    assert parsed_info['gene_type'] == 'transcribed_unprocessed_pseudogene'
+    assert parsed_info['gene_name'] == 'DDX11L1'
+    assert parsed_info['transcript_type'] == 'processed_transcript'
+    assert parsed_info['transcript_name'] == 'DDX11L1-202'
diff --git a/data/tests/test_gencode_gene_adapter.py b/data/tests/test_gencode_gene_adapter.py
new file mode 100644
index 00000000..31a46b07
--- /dev/null
+++ b/data/tests/test_gencode_gene_adapter.py
@@ -0,0 +1,53 @@
+import json
+import pytest
+from adapters.gencode_gene_adapter import GencodeGene
+from adapters.writer import SpyWriter
+
+
+def test_gencode_gene_adapter_human():
+    writer = SpyWriter()
+    adapter = GencodeGene(filepath='./samples/gencode_sample.gtf',
+                          gene_alias_file_path='./samples/Homo_sapiens.gene_info.gz',
+                          label='gencode_gene',
+                          writer=writer)
+    adapter.process_file()
+    assert len(writer.contents) > 0
+    first_item = json.loads(writer.contents[0])
+    assert '_key' in first_item
+    assert 'gene_id' in first_item
+    assert 'gene_type' in first_item
+    assert 'chr' in first_item
+    assert 'start:long' in first_item
+    assert 'end:long' in first_item
+    assert 'name' in first_item
+    assert first_item['source'] == 'GENCODE'
+    assert first_item['version'] == 'v43'
+    assert first_item['source_url'] == 'https://www.gencodegenes.org/human/'
+
+
+def test_gencode_gene_adapter_invalid_label():
+    writer = SpyWriter()
+    with pytest.raises(ValueError, match='Invalid label. Allowed values: gencode_gene,mm_gencode_gene'):
+        GencodeGene(filepath='./samples/gencode_sample.gtf',
+                    gene_alias_file_path='./samples/Homo_sapiens.gene_info.gz',
+                    label='invalid_label',
+                    writer=writer)
+
+
+def test_gencode_gene_adapter_parse_info_metadata():
+    adapter = GencodeGene(filepath='./samples/gencode_sample.gtf',
+                          gene_alias_file_path='./samples/Homo_sapiens.gene_info.gz')
+    info = ['gene_id', '"ENSG00000223972.5";', 'gene_type',
+            '"transcribed_unprocessed_pseudogene";', 'gene_name', '"DDX11L1";']
+    parsed_info = adapter.parse_info_metadata(info)
+    assert parsed_info['gene_id'] == 'ENSG00000223972.5'
+    assert parsed_info['gene_type'] == 'transcribed_unprocessed_pseudogene'
+    assert parsed_info['gene_name'] == 'DDX11L1'
+
+
+def test_gencode_gene_adapter_get_collection_alias():
+    adapter = GencodeGene(filepath='./samples/gencode_sample.gtf',
+                          gene_alias_file_path='./samples/Homo_sapiens.gene_info.gz')
+    alias_dict = adapter.get_collection_alias()
+    assert isinstance(alias_dict, dict)
+    assert len(alias_dict) > 0
diff --git a/data/tests/test_gencode_gene_structure_adapter.py b/data/tests/test_gencode_gene_structure_adapter.py
new file mode 100644
index 00000000..993a29d0
--- /dev/null
+++ b/data/tests/test_gencode_gene_structure_adapter.py
@@ -0,0 +1,112 @@
+import json
+import pytest
+from adapters.gencode_gene_structure_adapter import GencodeStructure
+from adapters.writer import SpyWriter
+
+
+def test_gencode_structure_adapter_gene_structure():
+    writer = SpyWriter()
+    adapter = GencodeStructure(
+        filepath='./samples/gencode_sample.gtf', label='gene_structure', writer=writer)
+    adapter.process_file()
+    assert len(writer.contents) > 0
+    first_item = json.loads(writer.contents[0])
+    assert '_key' in first_item
+    assert 'name' in first_item
+    assert 'chr' in first_item
+    assert 'start:long' in first_item
+    assert 'end:long' in first_item
+    assert 'strand' in first_item
+    assert 'type' in first_item
+    assert 'gene_id' in first_item
+    assert 'gene_name' in first_item
+    assert 'transcript_id' in first_item
+    assert 'transcript_name' in first_item
+    assert 'exon_number' in first_item
+    assert 'exon_id' in first_item
+    assert first_item['source'] == 'GENCODE'
+    assert first_item['version'] == 'v43'
+    assert first_item['source_url'] == 'https://www.gencodegenes.org/human/'
+    assert first_item['organism'] == 'Homo sapiens'
+
+
+def test_gencode_structure_adapter_mm_gene_structure():
+    writer = SpyWriter()
+    adapter = GencodeStructure(
+        filepath='./samples/gencode_sample.gtf', label='mm_gene_structure', writer=writer)
+    adapter.process_file()
+    assert len(writer.contents) > 0
+    first_item = json.loads(writer.contents[0])
+    assert '_key' in first_item
+    assert first_item['source'] == 'GENCODE'
+    assert first_item['version'] == 'vM33'
+    assert first_item['source_url'] == 'https://www.gencodegenes.org/mouse/'
+    assert first_item['organism'] == 'Mus musculus'
+
+
+def test_gencode_structure_adapter_transcript_contains_gene_structure():
+    writer = SpyWriter()
+    adapter = GencodeStructure(filepath='./samples/gencode_sample.gtf',
+                               label='transcript_contains_gene_structure', writer=writer)
+    adapter.process_file()
+    assert len(writer.contents) > 0
+    first_item = json.loads(writer.contents[0])
+    assert '_from' in first_item
+    assert '_to' in first_item
+    assert 'name' in first_item
+    assert 'inverse_name' in first_item
+    assert first_item['source'] == 'GENCODE'
+    assert first_item['version'] == 'v43'
+    assert first_item['source_url'] == 'https://www.gencodegenes.org/human/'
+    assert first_item['organism'] == 'Homo sapiens'
+
+
+def test_gencode_structure_adapter_mm_transcript_contains_mm_gene_structure():
+    writer = SpyWriter()
+    adapter = GencodeStructure(filepath='./samples/gencode_sample.gtf',
+                               label='mm_transcript_contains_mm_gene_structure', writer=writer)
+    adapter.process_file()
+    assert len(writer.contents) > 0
+    first_item = json.loads(writer.contents[0])
+    assert '_from' in first_item
+    assert '_to' in first_item
+    assert first_item['source'] == 'GENCODE'
+    assert first_item['version'] == 'vM33'
+    assert first_item['source_url'] == 'https://www.gencodegenes.org/mouse/'
+    assert first_item['organism'] == 'Mus musculus'
+
+
+def test_gencode_structure_adapter_invalid_label():
+    writer = SpyWriter()
+    with pytest.raises(ValueError, match='Invalid label. Allowed values: gene_structure,mm_gene_structure,transcript_contains_gene_structure,mm_transcript_contains_mm_gene_structure'):
+        GencodeStructure(filepath='./samples/gencode_sample.gtf',
+                         label='invalid_label', writer=writer)
+
+
+def test_gencode_structure_adapter_initialization():
+    writer = SpyWriter()
+    for label in GencodeStructure.ALLOWED_LABELS:
+        adapter = GencodeStructure(
+            filepath='./samples/gencode_sample.gtf', label=label, writer=writer)
+        assert adapter.filepath == './samples/gencode_sample.gtf'
+        assert adapter.label == label
+        assert adapter.dry_run == True
+        assert adapter.writer == writer
+
+        if label in ['gene_structure', 'mm_gene_structure']:
+            assert adapter.type == 'node'
+        else:
+            assert adapter.type == 'edge'
+
+
+def test_gencode_structure_adapter_parse_info_metadata():
+    adapter = GencodeStructure(filepath='./samples/gencode_sample.gtf')
+    info = ['gene_id', '"ENSG00000223972.5";', 'transcript_id', '"ENST00000456328.2";', 'gene_name',
+            '"DDX11L1";', 'transcript_name', '"DDX11L1-202";', 'exon_number', '1', 'exon_id', '"ENSE00002234944.1";']
+    parsed_info = adapter.parse_info_metadata(info)
+    assert parsed_info['gene_id'] == 'ENSG00000223972.5'
+    assert parsed_info['transcript_id'] == 'ENST00000456328.2'
+    assert parsed_info['gene_name'] == 'DDX11L1'
+    assert parsed_info['transcript_name'] == 'DDX11L1-202'
+    assert parsed_info['exon_number'] == '1'
+    assert parsed_info['exon_id'] == 'ENSE00002234944.1'
diff --git a/data/tests/test_gtex_eqtl_adapter.py b/data/tests/test_gtex_eqtl_adapter.py
new file mode 100644
index 00000000..511bb09e
--- /dev/null
+++ b/data/tests/test_gtex_eqtl_adapter.py
@@ -0,0 +1,75 @@
+import json
+import pytest
+from adapters.gtex_eqtl_adapter import GtexEQtl
+from adapters.writer import SpyWriter
+import os
+
+
+def test_gtex_eqtl_adapter_eqtl():
+    writer = SpyWriter()
+    adapter = GtexEQtl(filepath='./samples/GTEx_eQTL',
+                       label='GTEx_eqtl', writer=writer)
+    adapter.process_file()
+    assert len(writer.contents) > 0
+    first_item = json.loads(writer.contents[0])
+    assert '_key' in first_item
+    assert '_from' in first_item
+    assert '_to' in first_item
+    assert 'biological_context' in first_item
+    assert 'chr' in first_item
+    assert 'p_value' in first_item
+    assert 'log10pvalue' in first_item
+    assert 'effect_size' in first_item
+    assert 'pval_beta' in first_item
+    assert first_item['label'] == 'eQTL'
+    assert first_item['source'] == GtexEQtl.SOURCE
+    assert first_item['source_url'].startswith(GtexEQtl.SOURCE_URL_PREFIX)
+
+
+def test_gtex_eqtl_adapter_eqtl_term():
+    writer = SpyWriter()
+    adapter = GtexEQtl(filepath='./samples/GTEx_eQTL',
+                       label='GTEx_eqtl_term', writer=writer)
+    adapter.process_file()
+
+    assert len(writer.contents) > 0
+    first_item = json.loads(writer.contents[0])
+    assert '_key' in first_item
+    assert '_from' in first_item
+    assert '_to' in first_item
+    assert 'biological_context' in first_item
+    assert first_item['source'] == GtexEQtl.SOURCE
+    assert first_item['source_url'].startswith(GtexEQtl.SOURCE_URL_PREFIX)
+    assert first_item['name'] == 'occurs in'
+    assert first_item['inverse_name'] == 'has measurement'
+
+
+def test_gtex_eqtl_adapter_invalid_label():
+    writer = SpyWriter()
+    with pytest.raises(ValueError, match='Invalid label. Allowed values: GTEx_eqtl,GTEx_eqtl_term'):
+        GtexEQtl(filepath='./samples/GTEx_eQTL',
+                 label='invalid_label', writer=writer)
+
+
+def test_gtex_eqtl_adapter_initialization():
+    writer = SpyWriter()
+    for label in GtexEQtl.ALLOWED_LABELS:
+        adapter = GtexEQtl(filepath='./samples/GTEx_eQTL',
+                           label=label, writer=writer)
+        assert adapter.filepath == './samples/GTEx_eQTL'
+        assert adapter.label == label
+        assert adapter.dataset == label
+        assert adapter.dry_run == True
+        assert adapter.type == 'edge'
+        assert adapter.writer == writer
+
+
+def test_gtex_eqtl_adapter_load_ontology_mapping():
+    adapter = GtexEQtl(filepath='./samples/GTEx_eQTL')
+    adapter.load_ontology_mapping()
+    assert hasattr(adapter, 'ontology_id_mapping')
+    assert isinstance(adapter.ontology_id_mapping, dict)
+    assert len(adapter.ontology_id_mapping) > 0
+    assert hasattr(adapter, 'ontology_term_mapping')
+    assert isinstance(adapter.ontology_term_mapping, dict)
+    assert len(adapter.ontology_term_mapping) > 0
diff --git a/data/tests/test_gtex_sqtl_adapter.py b/data/tests/test_gtex_sqtl_adapter.py
new file mode 100644
index 00000000..c06a9db2
--- /dev/null
+++ b/data/tests/test_gtex_sqtl_adapter.py
@@ -0,0 +1,78 @@
+import json
+import pytest
+from adapters.gtex_sqtl_adapter import GtexSQtl
+from adapters.writer import SpyWriter
+import os
+
+
+def test_gtex_sqtl_adapter_splice_qtl():
+    writer = SpyWriter()
+    adapter = GtexSQtl(filepath='./samples/GTEx_sQTL',
+                       label='GTEx_splice_QTL', writer=writer)
+    adapter.process_file()
+    assert len(writer.contents) > 0
+    first_item = json.loads(writer.contents[0])
+    assert '_key' in first_item
+    assert '_from' in first_item
+    assert '_to' in first_item
+    assert 'biological_context' in first_item
+    assert 'chr' in first_item
+    assert 'p_value' in first_item
+    assert 'log10pvalue' in first_item
+    assert 'effect_size' in first_item
+    assert 'effect_size_se' in first_item
+    assert 'pval_beta' in first_item
+    assert 'intron_chr' in first_item
+    assert 'intron_start' in first_item
+    assert 'intron_end' in first_item
+    assert first_item['label'] == 'splice_QTL'
+    assert first_item['source'] == GtexSQtl.SOURCE
+    assert first_item['source_url'].startswith(GtexSQtl.SOURCE_URL_PREFIX)
+    assert first_item['name'] == 'modulates splicing of'
+    assert first_item['inverse_name'] == 'splicing modulated by'
+    assert first_item['biological_process'] == 'ontology_terms/GO_0043484'
+
+
+def test_gtex_sqtl_adapter_splice_qtl_term():
+    writer = SpyWriter()
+    adapter = GtexSQtl(filepath='./samples/GTEx_sQTL',
+                       label='GTEx_splice_QTL_term', writer=writer)
+    adapter.process_file()
+    assert len(writer.contents) > 0
+    first_item = json.loads(writer.contents[0])
+    assert '_key' in first_item
+    assert '_from' in first_item
+    assert '_to' in first_item
+    assert 'biological_context' in first_item
+    assert first_item['source'] == GtexSQtl.SOURCE
+    assert first_item['source_url'].startswith(GtexSQtl.SOURCE_URL_PREFIX)
+    assert first_item['name'] == 'occurs in'
+    assert first_item['inverse_name'] == 'has measurement'
+
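+
+# A helper like the sketch below could factor out the "parse the first
+# written record" steps repeated across these tests; the name
+# first_written_record is illustrative only, not an existing utility in
+# this test suite or the adapters package.
+def first_written_record(writer):
+    # SpyWriter is assumed to buffer serialized records as JSON strings in
+    # writer.contents, as the surrounding tests suggest.
+    assert len(writer.contents) > 0, 'No records were written.'
+    return json.loads(writer.contents[0])
+
+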
+def test_gtex_sqtl_adapter_invalid_label():
+    writer = SpyWriter()
+    with pytest.raises(ValueError, match='Invalid label. Allowed values: GTEx_splice_QTL,GTEx_splice_QTL_term'):
+        GtexSQtl(filepath='./samples/GTEx_sQTL',
+                 label='invalid_label', writer=writer)
+
+
+def test_gtex_sqtl_adapter_initialization():
+    writer = SpyWriter()
+    for label in GtexSQtl.ALLOWED_LABELS:
+        adapter = GtexSQtl(filepath='./samples/GTEx_sQTL',
+                           label=label, writer=writer)
+        assert adapter.filepath == './samples/GTEx_sQTL'
+        assert adapter.label == label
+        assert adapter.dataset == label
+        assert adapter.dry_run == True
+        assert adapter.type == 'edge'
+        assert adapter.writer == writer
+
+
+def test_gtex_sqtl_adapter_load_ontology_mapping():
+    adapter = GtexSQtl(filepath='./samples/GTEx_sQTL')
+    adapter.load_ontology_mapping()
+    assert hasattr(adapter, 'ontology_id_mapping')
+    assert isinstance(adapter.ontology_id_mapping, dict)
+    assert len(adapter.ontology_id_mapping) > 0
diff --git a/data/tests/test_gvatdb_asb_adapter.py b/data/tests/test_gvatdb_asb_adapter.py
new file mode 100644
index 00000000..0193b421
--- /dev/null
+++ b/data/tests/test_gvatdb_asb_adapter.py
@@ -0,0 +1,45 @@
+import json
+from adapters.gvatdb_asb_adapter import ASB_GVATDB
+from adapters.writer import SpyWriter
+
+
+def test_asb_gvatdb_adapter_process():
+    writer = SpyWriter()
+    adapter = ASB_GVATDB(filepath='./samples/GVATdb_sample.csv',
+                         label='GVATdb_ASB', writer=writer)
+    adapter.process_file()
+    assert len(writer.contents) > 0
+    first_item = json.loads(writer.contents[0])
+    assert '_key' in first_item
+    assert '_from' in first_item
+    assert '_to' in first_item
+    assert 'log10pvalue' in first_item
+    assert 'p_value' in first_item
+    assert 'hg19_coordinate' in first_item
+    assert first_item['source'] == ASB_GVATDB.SOURCE
+    assert first_item['source_url'] == ASB_GVATDB.SOURCE_URL
+    assert first_item['label'] == 'allele-specific binding'
+    assert first_item['name'] == 'modulates binding of'
+    assert first_item['inverse_name'] == 'binding modulated by'
+    assert first_item['biological_process'] == 'ontology_terms/GO_0051101'
+
+
+def test_asb_gvatdb_adapter_initialization():
+    writer = SpyWriter()
+    adapter = ASB_GVATDB(filepath='./samples/GVATdb_sample.csv',
+                         label='GVATdb_ASB', writer=writer)
+    assert adapter.filepath == './samples/GVATdb_sample.csv'
+    assert adapter.label == 'GVATdb_ASB'
+    assert adapter.dataset == 'GVATdb_ASB'
+    assert adapter.dry_run == True
+    assert adapter.type == 'edge'
+    assert adapter.writer == writer
+
+
+def test_asb_gvatdb_adapter_load_tf_uniprot_id_mapping():
+    adapter = ASB_GVATDB(
+        filepath='./samples/GVATdb_sample.csv', label='GVATdb_ASB')
+    adapter.load_tf_uniprot_id_mapping()
+    assert hasattr(adapter, 'tf_uniprot_id_mapping')
+    assert isinstance(adapter.tf_uniprot_id_mapping, dict)
+    assert len(adapter.tf_uniprot_id_mapping) > 0
diff --git a/data/tests/test_gwas_adapter.py b/data/tests/test_gwas_adapter.py
new file mode 100644
index 00000000..cfe7ae74
--- /dev/null
+++ b/data/tests/test_gwas_adapter.py
@@ -0,0 +1,77 @@
+import json
+import pytest
+from adapters.gwas_adapter import GWAS
+from adapters.writer import SpyWriter
+
+
+@pytest.fixture
+def gwas_files():
+    return {
+        'variants_to_ontology': './samples/gwas_v2d_igvf_sample.tsv',
+        'variants_to_genes': './samples/gwas_v2g_igvf_sample.tsv'
+    }
+
+
+@pytest.fixture
+def spy_writer():
+    return SpyWriter()
+
+
+def test_variants_phenotypes_collection(gwas_files, spy_writer):
+    gwas = GWAS(gwas_files['variants_to_ontology'], gwas_files['variants_to_genes'],
+                gwas_collection='variants_phenotypes', writer=spy_writer)
+    gwas.process_file()
+
+    assert len(spy_writer.contents) > 0
+    for item in spy_writer.contents:
+        if item.startswith('{'):
+            data = json.loads(item)
+            assert '_from' in data
+            assert '_to' in data
+            assert '_key' in data
+            assert 'source' in data
+            assert 'name' in data
+
+
+def test_get_tagged_variants(gwas_files):
+    gwas = GWAS(gwas_files['variants_to_ontology'], gwas_files['variants_to_genes'],
+                gwas_collection='variants_phenotypes_studies')
+    tagged_variants = gwas.get_tagged_variants()
+
+    assert len(tagged_variants) > 0
+    for key, variants in tagged_variants.items():
+        assert isinstance(variants, list)
+        for variant in variants:
+            assert 'tag_chrom' in variant
+            assert 'tag_pos:long' in variant
+            assert 'tag_ref' in variant
+            assert 'tag_alt' in variant
+
+
+def test_get_genes_from_variant_to_genes_file(gwas_files):
+    gwas = GWAS(gwas_files['variants_to_ontology'], gwas_files['variants_to_genes'],
+                gwas_collection='variants_phenotypes_studies')
+    genes = gwas.get_genes_from_variant_to_genes_file()
+
+    assert len(genes) > 0
+    for variant_id, gene_data in genes.items():
+        assert isinstance(gene_data, dict)
+        for gene_id, gene_info in gene_data.items():
+            assert gene_id.startswith('genes/')
+            assert isinstance(gene_info, list)
+            for info in gene_info:
+                assert 'feature' in info
+                assert 'type_id' in info
+                assert 'source_id' in info
+
+
+def test_load_ontology_name_mapping(gwas_files):
+    gwas = GWAS(gwas_files['variants_to_ontology'], gwas_files['variants_to_genes'],
+                gwas_collection='variants_phenotypes_studies')
+    gwas.load_ontology_name_mapping()
+
+    assert hasattr(gwas, 'ontology_name_mapping')
+    assert len(gwas.ontology_name_mapping) > 0
+    for ontology_id, ontology_name in gwas.ontology_name_mapping.items():
+        assert isinstance(ontology_id, str)
+        assert isinstance(ontology_name, str)
diff --git a/data/tests/test_human_mouse_element_adapter.py b/data/tests/test_human_mouse_element_adapter.py
new file mode 100644
index 00000000..fd978358
--- /dev/null
+++ b/data/tests/test_human_mouse_element_adapter.py
@@ -0,0 +1,89 @@
+import json
+import pytest
+from adapters.human_mouse_element_adapter import HumanMouseElementAdapter
+from adapters.writer import SpyWriter
+
+
+@pytest.fixture
+def human_mouse_files():
+    return './samples/element_mapping_example.txt.gz'
+
+
+@pytest.fixture
+def spy_writer():
+    return SpyWriter()
+
+
+def test_regulatory_region(human_mouse_files, spy_writer):
+    adapter = HumanMouseElementAdapter(
+        human_mouse_files, label='regulatory_region', writer=spy_writer)
+    adapter.process_file()
+
+    assert len(spy_writer.contents) > 0
+    data = json.loads(spy_writer.contents[0])
+    assert '_key' in data
+    assert 'chr' in data
+    assert 'start' in data
+    assert 'end' in data
+    assert 'type' in data
+    assert data['type'] == 'accessible dna elements'
+    assert 'source' in data
+    assert 'source_url' in data
+
+
+def test_mm_regulatory_region(human_mouse_files, spy_writer):
+    adapter = HumanMouseElementAdapter(
+        human_mouse_files, label='mm_regulatory_region', writer=spy_writer)
+    adapter.process_file()
+
+    assert len(spy_writer.contents) > 0
+    data = json.loads(spy_writer.contents[0])
+    assert '_key' in data
+    assert 'chr' in data
+    assert 'start' in data
+    assert 'end' in data
+    assert 'type' in data
+    assert data['type'] == 'accessible dna elements (mouse)'
+    assert 'source' in data
+    assert 'source_url' in data
+
+
+def test_regulatory_region_mm_regulatory_region(human_mouse_files, spy_writer):
+    adapter = HumanMouseElementAdapter(
+        human_mouse_files, label='regulatory_region_mm_regulatory_region', writer=spy_writer)
+    adapter.process_file()
+
+    assert len(spy_writer.contents) > 0
+    data = json.loads(spy_writer.contents[0])
+    assert '_key' in data
+    assert '_from' in data
+    assert '_to' in data
+    assert 'percent_identical_bp' in data
+    assert 'phastCons4way' in data
+    assert 'phyloP4way' in data
+    assert 'cov_chromatin_accessibility' in data
+    assert 'cov_chromatin_accessibility_pval' in data
+    assert 'cov_chromatin_accessibility_fdr' in data
+    assert 'cob_chromatin_accessibility' in data
+    assert 'cob_chromatin_accessibility_pval' in data
+    assert 'cob_chromatin_accessibility_fdr' in data
+    assert 'cov_H3K27ac' in data
+    assert 'cov_H3K27ac_pval' in data
+    assert 'cov_H3K27ac_fdr' in data
+    assert 'cob_H3K27ac' in data
+    assert 'cob_H3K27ac_pval' in data
+    assert 'cob_H3K27ac_fdr' in data
+    assert 'cov_H3K4me1' in data
+    assert 'cov_H3K4me1_pval' in data
+    assert 'cov_H3K4me1_fdr' in data
+    assert 'cob_H3K4me1' in data
+    assert 'cob_H3K4me1_pval' in data
+    assert 'cob_H3K4me1_fdr' in data
+    assert 'cov_H3K4me3' in data
+    assert 'cov_H3K4me3_pval' in data
+    assert 'cov_H3K4me3_fdr' in data
+    assert 'cob_H3K4me3' in data
+    assert 'cob_H3K4me3_pval' in data
+    assert 'cob_H3K4me3_fdr' in data
+    assert 'source' in data
+    assert 'source_url' in data
diff --git a/data/tests/test_mgi_human_mouse_ortholog_adapter.py b/data/tests/test_mgi_human_mouse_ortholog_adapter.py
new file mode 100644
index 00000000..6948e3fa
--- /dev/null
+++ b/data/tests/test_mgi_human_mouse_ortholog_adapter.py
@@ -0,0 +1,44 @@
+import json
+import pytest
+from adapters.mgi_human_mouse_ortholog_adapter import MGIHumanMouseOrthologAdapter
+from adapters.writer import SpyWriter
+
+
+@pytest.fixture
+def sample_filepath():
+    return './samples/HOM_MouseHumanSequence_sample.rpt'
+
+
+@pytest.fixture
+def spy_writer():
+    return SpyWriter()
+
+
+def test_process_file(sample_filepath, spy_writer):
+    adapter = MGIHumanMouseOrthologAdapter(sample_filepath, writer=spy_writer)
+    adapter.process_file()
+
+    assert len(spy_writer.contents) > 0
+    # Check only the first item to make the test faster
+    data = json.loads(spy_writer.contents[0])
+    assert '_key' in data
+    assert '_from' in data
+    assert '_to' in data
+    assert data['_from'].startswith('genes/')
+    assert data['_to'].startswith('mm_genes/')
+    assert data['name'] == 'homologous to'
+    assert data['inverse_name'] == 'homologous to'
+    assert data['relationship'] == 'ontology_terms/NCIT_C79968'
+    assert data['source'] == 'MGI'
+    assert data['source_url'] == 'https://www.informatics.jax.org/downloads/reports/HOM_MouseHumanSequence.rpt'
+
+
+def test_load_mappings(sample_filepath, spy_writer):
+    adapter = MGIHumanMouseOrthologAdapter(sample_filepath, writer=spy_writer)
+    adapter.load_mgi_ensembl_mapping()
+    adapter.load_entrz_ensembl_mapping()
+
+    assert hasattr(adapter, 'mm_gene_mapping')
+    assert len(adapter.mm_gene_mapping) > 0
+    assert hasattr(adapter, 'gene_mapping')
+    assert len(adapter.gene_mapping) > 0
diff --git a/data/tests/test_motif_adapter.py b/data/tests/test_motif_adapter.py
new file mode 100644
index 00000000..02cba496
--- /dev/null
+++ b/data/tests/test_motif_adapter.py
@@ -0,0 +1,65 @@
+import json
+import pytest
+from adapters.motif_adapter import Motif
+from adapters.writer import SpyWriter
+
+
+@pytest.fixture
+def sample_filepath():
+    return './samples/motifs'
+
+
+@pytest.fixture
+def spy_writer():
+    return SpyWriter()
+
+
+def test_motif_node(sample_filepath, spy_writer):
+    motif = Motif(sample_filepath, label='motif', writer=spy_writer)
+    motif.process_file()
+
+    assert len(spy_writer.contents) > 0
+    data = json.loads(spy_writer.contents[0])
+    assert '_key' in data
+    assert 'name' in data
+    assert 'tf_name' in data
+    assert 'source' in data
+    assert 'source_url' in data
+    assert 'pwm' in data
+    assert 'length' in data
+    assert data['source'] == Motif.SOURCE
+    assert data['source_url'].startswith(Motif.SOURCE_URL)
+
+
+def test_motif_protein_link(sample_filepath, spy_writer):
+    motif = Motif(sample_filepath, label='motif_protein_link',
+                  writer=spy_writer)
+    motif.process_file()
+
+    assert len(spy_writer.contents) > 0
+    data = json.loads(spy_writer.contents[0])
+    assert '_key' in data
+    assert '_from' in data
+    assert '_to' in data
+    assert 'name' in data
+    assert 'inverse_name' in data
+    assert 'biological_process' in data
+    assert 'source' in data
+    assert data['name'] == 'is used by'
+    assert data['inverse_name'] == 'uses'
+    assert data['biological_process'] == 'ontology_terms/GO_0003677'
+    assert data['source'] == Motif.SOURCE
+
+
+def test_invalid_label(sample_filepath, spy_writer):
+    with pytest.raises(ValueError):
+        Motif(sample_filepath, label='invalid_label', writer=spy_writer)
+
+
+def test_load_tf_uniprot_id_mapping(sample_filepath, spy_writer):
+    motif = Motif(sample_filepath, label='motif_protein_link',
+                  writer=spy_writer)
+    motif.load_tf_uniprot_id_mapping()
+
+    assert hasattr(motif, 'tf_uniprot_id_mapping')
+    assert len(motif.tf_uniprot_id_mapping) > 0
diff --git a/data/tests/test_orphanet_disease_adapter.py b/data/tests/test_orphanet_disease_adapter.py
new file mode 100644
index 00000000..c89875ac
--- /dev/null
+++ b/data/tests/test_orphanet_disease_adapter.py
@@ -0,0 +1,38 @@
+import json
+import pytest
+from adapters.orphanet_disease_adapter import Disease
+from adapters.writer import SpyWriter
+
+
+@pytest.fixture
+def sample_filepath():
+    return './samples/orphanet_example.xml'
+
+
+@pytest.fixture
+def spy_writer():
+    return SpyWriter()
+
+
+def test_process_file(sample_filepath, spy_writer):
+    disease = Disease(sample_filepath, writer=spy_writer)
+    disease.process_file()
+
+    assert len(spy_writer.contents) > 0
+    data = json.loads(spy_writer.contents[0])
+    assert '_key' in data
+    assert '_from' in data
+    assert '_to' in data
+    assert 'name' in data
+    assert 'inverse_name' in data
+    assert 'pmid' in data
+    assert 'term_name' in data
+    assert 'gene_symbol' in data
+    assert 'association_type' in data
+    assert 'association_status' in data
+    assert 'source' in data
+    assert 'source_url' in data
+    assert data['name'] == 'associated_with'
+    assert data['inverse_name'] == 'associated_with'
+    assert data['source'] == Disease.SOURCE
+    assert data['source_url'] == Disease.SOURCE_URL
diff --git a/data/tests/test_pQTL_adapter.py b/data/tests/test_pQTL_adapter.py
new file mode 100644
index 00000000..d46de8d7
--- /dev/null
+++ b/data/tests/test_pQTL_adapter.py
@@ -0,0 +1,17 @@
+import json
+
+from adapters.pQTL_adapter import pQTL
+from adapters.writer import SpyWriter
+
+
+def test_pQTL_adapter():
+    writer = SpyWriter()
+    adapter = pQTL(filepath='./samples/pQTL_UKB_example.csv',
+                   label='pqtl', writer=writer)
+    adapter.process_file()
+    assert len(writer.contents) == 200
+    first_item = json.loads(writer.contents[0])
+    assert first_item['_key'] == '7c956a1b8ed65d87dd710feb0e7614683e8a65eb83306daee6be58cdf8b17b01_P04217_UKB'
+    assert first_item['name'] == 'associated with levels of'
+    assert first_item['label'] == 'pQTL'
+    assert first_item['log10pvalue'] == 79.2
diff --git a/data/tests/test_pharmagkb_drug_adapter.py b/data/tests/test_pharmagkb_drug_adapter.py
new file mode 100644
index 00000000..c3a6737a
--- /dev/null
+++ b/data/tests/test_pharmagkb_drug_adapter.py
@@ -0,0 +1,76 @@
+import pytest
+import json
+from adapters.pharmgkb_drug_adapter import PharmGKB
+from adapters.writer import SpyWriter
+
+
+@pytest.fixture
+def filepath():
+    return './samples/pharmGKB'
+
+
+@pytest.fixture
+def spy_writer():
+    return SpyWriter()
+
+
+def test_drug_label(filepath, spy_writer):
+    pharmgkb = PharmGKB(filepath=filepath, label='drug', writer=spy_writer)
+    assert pharmgkb.type == 'node'
+    assert pharmgkb.label == 'drug'
+
+    pharmgkb.process_file()
+
+    assert len(spy_writer.contents) > 0
+    first_item = json.loads(spy_writer.contents[0])
+    assert isinstance(first_item, dict)
+    assert set(first_item.keys()) == {
+        '_key', 'name', 'drug_ontology_terms', 'source', 'source_url'}
+    assert first_item['source'] == 'pharmGKB'
+    assert first_item['source_url'].startswith(
+        'https://www.pharmgkb.org/chemical/')
+
+
+def test_variant_drug_label(filepath, spy_writer):
+    pharmgkb = PharmGKB(filepath=filepath,
+                        label='variant_drug', writer=spy_writer)
+    assert pharmgkb.type == 'edge'
+    assert pharmgkb.label == 'variant_drug'
+
+    pharmgkb.process_file()
+
+    assert len(spy_writer.contents) > 0
+    first_item = json.loads(spy_writer.contents[0])
+    assert isinstance(first_item, dict)
+    assert set(first_item.keys()) == {'_key', '_from', '_to', 'gene_symbol', 'pmid',
+                                      'study_parameters', 'phenotype_categories', 'name', 'inverse_name', 'source', 'source_url'}
+    assert first_item['_from'].startswith('variants/')
+    assert first_item['_to'].startswith('drugs/')
+    assert first_item['source'] == 'pharmGKB'
+    assert first_item['source_url'].startswith(
+        'https://www.pharmgkb.org/variantAnnotation/')
+
+
+def test_variant_drug_gene_label(filepath, spy_writer):
+    pharmgkb = PharmGKB(filepath=filepath,
+                        label='variant_drug_gene', writer=spy_writer)
+    assert pharmgkb.type == 'edge'
+    assert pharmgkb.label == 'variant_drug_gene'
+
+    pharmgkb.process_file()
+
+    assert len(spy_writer.contents) > 0
+    first_item = json.loads(spy_writer.contents[0])
+    assert isinstance(first_item, dict)
+    assert set(first_item.keys()) == {
+        '_key', '_from', '_to', 'name', 'inverse_name', 'gene_symbol', 'source', 'source_url'}
+    assert first_item['_from'].startswith('variants_drugs/')
+    assert first_item['_to'].startswith('genes/')
+    assert first_item['source'] == 'pharmGKB'
+    assert first_item['source_url'].startswith(
+        'https://www.pharmgkb.org/variantAnnotation/')
+
+
+def test_invalid_label(filepath, spy_writer):
+    with pytest.raises(ValueError):
+        PharmGKB(filepath=filepath, label='invalid_label', writer=spy_writer)
diff --git a/data/tests/test_proteins_interaction_adapter.py b/data/tests/test_proteins_interaction_adapter.py
new file mode 100644
index 00000000..4e480947
--- /dev/null
+++ b/data/tests/test_proteins_interaction_adapter.py
@@ -0,0 +1,65 @@
+import json
+import pytest
+from adapters.proteins_interaction_adapter import ProteinsInteraction
+from adapters.writer import SpyWriter
+
+
+@pytest.fixture
+def filepath():
+    return './samples/merged_PPI.UniProt.example.csv'
+
+
+@pytest.fixture
+def spy_writer():
+    return SpyWriter()
+
+
+def test_proteins_interaction_adapter(filepath, spy_writer):
+    adapter = ProteinsInteraction(
+        filepath=filepath, label='edge', writer=spy_writer)
+    adapter.process_file()
+
+    assert len(spy_writer.contents) > 0
+    first_item = json.loads(spy_writer.contents[0])
+
+    assert '_key' in first_item
+    assert '_from' in first_item
+    assert '_to' in first_item
+    assert 'detection_method' in first_item
+    assert 'detection_method_code' in first_item
+    assert 'interaction_type' in first_item
+    assert 'interaction_type_code' in first_item
+    assert 'confidence_value_biogrid:long' in first_item
+    assert 'confidence_value_intact:long' in first_item
+    assert 'source' in first_item
+    assert 'pmids' in first_item
+    assert 'organism' in first_item
+    assert first_item['name'] == 'physically interacts with'
+    assert first_item['inverse_name'] == 'physically interacts with'
+    assert first_item['molecular_function'] == 'ontology_terms/GO_0005515'
+
+
+def test_proteins_interaction_adapter_initialization(filepath, spy_writer):
+    adapter = ProteinsInteraction(
+        filepath=filepath, label='edge', writer=spy_writer)
+    assert adapter.filepath == filepath
+    assert adapter.label == 'edge'
+    assert adapter.dataset == 'edge'
+    assert adapter.type == 'edge'
+    assert adapter.organism == 'Homo sapiens'
+
+
+def test_proteins_interaction_adapter_mouse(spy_writer):
+    mouse_filepath = './samples/merged_PPI_mouse.UniProt.csv'
+    adapter = ProteinsInteraction(
+        filepath=mouse_filepath, label='edge', writer=spy_writer)
+    assert adapter.organism == 'Mus musculus'
+
+
+def test_proteins_interaction_adapter_load_MI_code_mapping(filepath, spy_writer):
+    adapter = ProteinsInteraction(
+        filepath=filepath, label='edge', writer=spy_writer)
+    adapter.load_MI_code_mapping()
+    assert hasattr(adapter, 'MI_code_mapping')
+    assert isinstance(adapter.MI_code_mapping, dict)
+    assert len(adapter.MI_code_mapping) > 0
diff --git a/data/tests/test_reactome_adapter.py b/data/tests/test_reactome_adapter.py
new file mode 100644
index 00000000..af2c5446
--- /dev/null
+++ b/data/tests/test_reactome_adapter.py
@@ -0,0 +1,45 @@
+import json
+import pytest
+from adapters.reactome_adapter import Reactome
+from adapters.writer import SpyWriter
+
+
+@pytest.fixture
+def filepath():
+    return './samples/reactome/Ensembl2Reactome_All_Levels_sample.txt'
+
+
+@pytest.fixture
+def spy_writer():
+    return SpyWriter()
+
+
+def test_reactome_adapter_genes_pathways(filepath, spy_writer):
+    adapter = Reactome(filepath=filepath,
+                       label='genes_pathways', writer=spy_writer)
+    adapter.process_file()
+
+    assert len(spy_writer.contents) > 0
+    first_item = json.loads(spy_writer.contents[0])
+
+    assert '_key' in first_item
+    assert '_from' in first_item
+    assert '_to' in first_item
+    assert first_item['name'] == 'belongs to'
+    assert first_item['inverse_name'] == 'has part'
+    assert first_item['source'] == 'Reactome'
+    assert first_item['source_url'] == 'https://reactome.org/'
+
+
+def test_reactome_adapter_initialization(filepath, spy_writer):
+    adapter = Reactome(filepath=filepath,
+                       label='genes_pathways', writer=spy_writer)
+    assert adapter.filepath == filepath
+    assert adapter.label == 'genes_pathways'
+    assert adapter.dataset == 'genes_pathways'
+    assert adapter.type == 'edge'
+
+
+def test_reactome_adapter_invalid_label(filepath, spy_writer):
+    with pytest.raises(ValueError):
+        Reactome(filepath=filepath, label='invalid_label', writer=spy_writer)
diff --git a/data/tests/test_topld_adapter.py b/data/tests/test_topld_adapter.py
new file mode 100644
index 00000000..7e058ebd
--- /dev/null
+++ b/data/tests/test_topld_adapter.py
@@ -0,0 +1,79 @@
+import json
+from adapters.topld_adapter import TopLD
+from adapters.writer import SpyWriter
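+
+# Note: key suffixes like 'r2:long' and 'd_prime:long' asserted below appear
+# to be type annotations carried inside the field names for the downstream
+# graph importer; that reading is inferred from the surrounding adapters and
+# is not verified here.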
+
+
+def test_topld_adapter_initialization():
+    writer = SpyWriter()
+    adapter = TopLD(filepath='./samples/topld_sample.csv',
+                    annotation_filepath='./samples/topld_info_annotation.csv',
+                    chr='chr22',
+                    ancestry='SAS',
+                    writer=writer)
+
+    assert adapter.filepath == './samples/topld_sample.csv'
+    assert adapter.annotation_filepath == './samples/topld_info_annotation.csv'
+    assert adapter.chr == 'chr22'
+    assert adapter.ancestry == 'SAS'
+    assert adapter.dataset == TopLD.DATASET
+    assert adapter.label == TopLD.DATASET
+    assert adapter.dry_run == True
+    assert adapter.writer == writer
+
+
+def test_topld_adapter_process_file():
+    writer = SpyWriter()
+    adapter = TopLD(filepath='./samples/topld_sample.csv',
+                    annotation_filepath='./samples/topld_info_annotation.csv',
+                    chr='chr22',
+                    ancestry='SAS',
+                    writer=writer)
+
+    adapter.process_file()
+
+    assert len(writer.contents) > 0
+    first_item = json.loads(writer.contents[0])
+
+    assert '_from' in first_item
+    assert '_to' in first_item
+    assert 'chr' in first_item
+    assert 'negated' in first_item
+    assert 'variant_1_base_pair' in first_item
+    assert 'variant_2_base_pair' in first_item
+    assert 'variant_1_rsid' in first_item
+    assert 'variant_2_rsid' in first_item
+    assert 'r2:long' in first_item
+    assert 'd_prime:long' in first_item
+    assert 'ancestry' in first_item
+    assert 'label' in first_item
+    assert 'name' in first_item
+    assert 'inverse_name' in first_item
+    assert 'source' in first_item
+    assert 'source_url' in first_item
+
+    assert first_item['chr'] == 'chr22'
+    assert first_item['ancestry'] == 'SAS'
+    assert first_item['label'] == 'linkage disequilibrum'
+    assert first_item['name'] == 'correlated with'
+    assert first_item['inverse_name'] == 'correlated with'
+    assert first_item['source'] == 'TopLD'
+    assert first_item['source_url'] == 'http://topld.genetics.unc.edu/'
+
+
+def test_topld_adapter_process_annotations():
+    writer = SpyWriter()
+    adapter = TopLD(filepath='./samples/topld_sample.csv',
+                    annotation_filepath='./samples/topld_info_annotation.csv',
+                    chr='chr22',
+                    ancestry='SAS',
+                    writer=writer)
+
+    adapter.process_annotations()
+
+    assert len(adapter.ids) > 0
+    first_key = next(iter(adapter.ids))
+    first_value = adapter.ids[first_key]
+
+    assert 'rsid' in first_value
+    assert 'variant_id' in first_value
+    assert first_value['variant_id'].startswith('variants/')
diff --git a/data/tests/test_uniprot_adapter.py b/data/tests/test_uniprot_adapter.py
new file mode 100644
index 00000000..8cb107f0
--- /dev/null
+++ b/data/tests/test_uniprot_adapter.py
@@ -0,0 +1,101 @@
+import json
+import pytest
+from adapters.uniprot_adapter import Uniprot
+from adapters.writer import SpyWriter
+
+
+def test_uniprot_adapter_initialization():
+    writer = SpyWriter()
+    adapter = Uniprot(filepath='./samples/uniprot_sprot_human_sample.dat.gz',
+                      label='UniProtKB_Translates_To',
+                      source='UniProtKB/Swiss-Prot',
+                      writer=writer)
+
+    assert adapter.filepath == './samples/uniprot_sprot_human_sample.dat.gz'
+    assert adapter.label == 'UniProtKB_Translates_To'
+    assert adapter.source == 'UniProtKB/Swiss-Prot'
+    assert adapter.organism == 'HUMAN'
+    assert adapter.transcript_endpoint == 'transcripts/'
+    assert adapter.ensembl_prefix == 'ENST'
+    assert adapter.dataset == 'UniProtKB_Translates_To'
+    assert adapter.dry_run == True
+    assert adapter.type == 'edge'
+    assert adapter.writer == writer
+
+
+def test_uniprot_adapter_process_file():
+    writer = SpyWriter()
+    adapter = Uniprot(filepath='./samples/uniprot_sprot_human_sample.dat.gz',
+                      label='UniProtKB_Translates_To',
+                      source='UniProtKB/Swiss-Prot',
+                      writer=writer)
+
+    adapter.process_file()
+
+    assert len(writer.contents) > 0
+    first_item = json.loads(writer.contents[0])
+
+    assert '_key' in first_item
+    assert '_from' in first_item
+    assert '_to' in first_item
+    assert 'source' in first_item
+    assert 'source_url' in first_item
+    assert 'name' in first_item
+    assert 'inverse_name' in first_item
+    assert 'biological_process' in first_item
+
+    assert first_item['source'] == 'UniProtKB/Swiss-Prot'
+    assert first_item['source_url'] == 'https://www.uniprot.org/help/downloads'
+    assert first_item['name'] == 'translates to'
+    assert first_item['inverse_name'] == 'translated from'
+    assert first_item['biological_process'] == 'ontology_terms/GO_0006412'
+    assert first_item['_from'].startswith('transcripts/')
+    assert first_item['_to'].startswith('proteins/')
+
+
+def test_uniprot_adapter_translation_to():
+    writer = SpyWriter()
+    adapter = Uniprot(filepath='./samples/uniprot_sprot_human_sample.dat.gz',
+                      label='UniProtKB_Translates_To',
+                      source='UniProtKB/Swiss-Prot',
+                      writer=writer)
+
+    adapter.process_file()
+
+    assert len(writer.contents) > 0
+    first_item = json.loads(writer.contents[0])
+
+    assert first_item['_from'].startswith('transcripts/')
+    assert first_item['_to'].startswith('proteins/')
+
+
+def test_uniprot_adapter_mouse():
+    writer = SpyWriter()
+    adapter = Uniprot(filepath='./samples/uniprot_sprot_human_sample.dat.gz',
+                      label='UniProtKB_Translates_To',
+                      source='UniProtKB/Swiss-Prot',
+                      organism='MOUSE',
+                      writer=writer)
+
+    assert adapter.organism == 'MOUSE'
+    assert adapter.transcript_endpoint == 'mm_transcripts/'
+    assert adapter.ensembl_prefix == 'ENSMUST'
+
+
+def test_uniprot_adapter_invalid_label():
+    writer = SpyWriter()
+    with pytest.raises(ValueError, match='Invalid label. Allowed values: UniProtKB_Translates_To'):
+        Uniprot(filepath='./samples/uniprot_sprot_human_sample.dat.gz',
+                label='Invalid_Label',
+                source='UniProtKB/Swiss-Prot',
+                writer=writer)
+
+
+def test_uniprot_adapter_invalid_organism():
+    writer = SpyWriter()
+    with pytest.raises(ValueError, match='Invalid organism. Allowed values: HUMAN, MOUSE'):
+        Uniprot(filepath='./samples/uniprot_sprot_human_sample.dat.gz',
+                label='UniProtKB_Translates_To',
+                source='UniProtKB/Swiss-Prot',
+                organism='UNICORN',
+                writer=writer)
diff --git a/data/tests/test_uniprot_protein_adapter.py b/data/tests/test_uniprot_protein_adapter.py
new file mode 100644
index 00000000..d43bbbe7
--- /dev/null
+++ b/data/tests/test_uniprot_protein_adapter.py
@@ -0,0 +1,134 @@
+import json
+import pytest
+from adapters.uniprot_protein_adapter import UniprotProtein
+from adapters.writer import SpyWriter
+
+
+def test_uniprot_protein_adapter_initialization():
+    writer = SpyWriter()
+    adapter = UniprotProtein(filepath='./samples/uniprot_sprot_human_sample.dat.gz',
+                             source='UniProtKB/Swiss-Prot',
+                             writer=writer)
+
+    assert adapter.filepath == './samples/uniprot_sprot_human_sample.dat.gz'
+    assert adapter.dataset == 'UniProtKB_protein'
+    assert adapter.label == 'UniProtKB_protein'
+    assert adapter.source == 'UniProtKB/Swiss-Prot'
+    assert adapter.taxonomy_id == ['9606']
+    assert adapter.organism == 'Homo sapiens'
+    assert adapter.dry_run == True
+    assert adapter.writer == writer
+
+
+def test_uniprot_protein_adapter_process_file():
+    writer = SpyWriter()
+    adapter = UniprotProtein(filepath='./samples/uniprot_sprot_human_sample.dat.gz',
+                             source='UniProtKB/Swiss-Prot',
+                             writer=writer)
+
+    adapter.process_file()
+
+    assert len(writer.contents) > 0
+    first_item = json.loads(writer.contents[0])
+
+    assert '_key' in first_item
+    assert 'name' in first_item
+    assert 'organism' in first_item
+    assert 'dbxrefs' in first_item
+    assert 'source' in first_item
+    assert 'source_url' in first_item
+
+    assert first_item['organism'] == 'Homo sapiens'
+    assert first_item['source'] == 'UniProtKB/Swiss-Prot'
+    assert first_item['source_url'] == 'https://www.uniprot.org/help/downloads'
+    assert isinstance(first_item['dbxrefs'], list)
+
+
+def test_uniprot_protein_adapter_mouse():
+    writer = SpyWriter()
+    adapter = UniprotProtein(filepath='./samples/uniprot_sprot_human_sample.dat.gz',
+                             source='UniProtKB/Swiss-Prot',
+                             taxonomy_id='10090',
+                             writer=writer)
+
+    assert adapter.taxonomy_id == ['10090']
+    assert adapter.organism == 'Mus musculus'
+
+
+def test_uniprot_protein_adapter_trembl():
+    writer = SpyWriter()
+    adapter = UniprotProtein(filepath='./samples/uniprot_trembl_human_sample.dat.gz',
+                             source='UniProtKB/TrEMBL',
+                             writer=writer)
+
+    assert adapter.source == 'UniProtKB/TrEMBL'
+
+
+def test_uniprot_protein_adapter_invalid_source():
+    writer = SpyWriter()
+    with pytest.raises(ValueError, match='Invalid source. Allowed values: UniProtKB/Swiss-Prot, UniProtKB/TrEMBL'):
+        UniprotProtein(filepath='./samples/uniprot_sprot_human_sample.dat.gz',
+                       source='Invalid_Source',
+                       writer=writer)
+
+
+def test_uniprot_protein_adapter_invalid_taxonomy():
+    writer = SpyWriter()
+    with pytest.raises(ValueError, match='Invalid taxonomy id. 
Allowed values: 9606, 10090'): + UniprotProtein(filepath='./samples/uniprot_sprot_human_sample.dat.gz', + source='UniProtKB/Swiss-Prot', + taxonomy_id='12345', + writer=writer) + + +def test_uniprot_protein_adapter_dry_run(): + writer = SpyWriter() + adapter = UniprotProtein(filepath='./samples/uniprot_sprot_human_sample.dat.gz', + source='UniProtKB/Swiss-Prot', + dry_run=False, + writer=writer) + + assert adapter.dry_run == False + + +def test_uniprot_protein_adapter_get_dbxrefs(): + writer = SpyWriter() + adapter = UniprotProtein(filepath='./samples/uniprot_sprot_human_sample.dat.gz', + source='UniProtKB/Swiss-Prot', + writer=writer) + + test_cross_references = [ + ('EMBL', 'X12345', 'Y67890', '-'), + ('RefSeq', 'NP_001234.1', 'NP_005678.2'), + ('Ensembl', 'ENST00000123456', 'ENSP00000234567'), + ('MANE-Select', 'ENST00000987654.1', 'NM_001122334.2'), + ('Other', 'ID12345') + ] + + dbxrefs = adapter.get_dbxrefs(test_cross_references) + + assert len(dbxrefs) == 9 + assert {'name': 'EMBL', 'id': 'X12345'} in dbxrefs + assert {'name': 'EMBL', 'id': 'Y67890'} in dbxrefs + assert {'name': 'RefSeq', 'id': 'NP_001234.1'} in dbxrefs + assert {'name': 'RefSeq', 'id': 'NP_005678.2'} in dbxrefs + assert {'name': 'Ensembl', 'id': 'ENST00000123456'} in dbxrefs + assert {'name': 'Ensembl', 'id': 'ENSP00000234567'} in dbxrefs + assert {'name': 'MANE-Select', 'id': 'ENST00000987654.1'} in dbxrefs + + +def test_uniprot_protein_adapter_get_full_name(): + writer = SpyWriter() + adapter = UniprotProtein(filepath='./samples/uniprot_sprot_human_sample.dat.gz', + source='UniProtKB/Swiss-Prot', + writer=writer) + + test_description = 'RecName: Full=Test protein; AltName: Full=Alternative name; Short=AN' + full_name = adapter.get_full_name(test_description) + + assert full_name == 'Test protein' + + test_description_2 = 'SubName: Full=Uncharacterized protein' + full_name_2 = adapter.get_full_name(test_description_2) + + assert full_name_2 == 'Uncharacterized protein' diff --git a/data/tests/test_writer.py b/data/tests/test_writer.py index 8dc7ca6d..eaa1faac 100644 --- a/data/tests/test_writer.py +++ b/data/tests/test_writer.py @@ -47,10 +47,10 @@ def test_s3_writer_close(mocker): mock_file.close.assert_called_once() -def test_s3_writer_s3_uri(): +def test_s3_writer_destination(): session = MagicMock() writer = S3Writer(bucket='test-bucket', key='test-key', session=session) - assert writer.s3_uri == 's3://test-bucket/test-key' + assert writer.destination == 's3://test-bucket/test-key' def test_local_writer_open(mocker): @@ -85,6 +85,11 @@ def test_local_writer_close(mocker): mock_open_instance().close.assert_called_once() +def test_local_writer_destination(): + writer = LocalWriter(filepath='/path/to/file.txt') + assert writer.destination == '/path/to/file.txt' + + def test_get_writer_local(mocker): filepath = '/path/to/file.txt' writer = get_writer(filepath=filepath) From 10091c4c4341fbe1a89a6f03f6dc90047d66ff7e Mon Sep 17 00:00:00 2001 From: Pedro Assis Date: Mon, 9 Sep 2024 15:47:48 -1000 Subject: [PATCH 02/20] first version --- data/data_sources.yaml | 498 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 498 insertions(+) create mode 100644 data/data_sources.yaml diff --git a/data/data_sources.yaml b/data/data_sources.yaml new file mode 100644 index 00000000..ad66ac25 --- /dev/null +++ b/data/data_sources.yaml @@ -0,0 +1,498 @@ +# Data fetched from: +# https://data.igvf.org/multireport/?type=File&content_type=&field=%40id&field=content_type&field=href&field=submitted_file_name +# Example: 
https://data.igvf.org/multireport/?type=File&content_type=variants_variants&field=%40id&field=content_type&field=href&field=submitted_file_name + +topld in linkage disequilibrium with: + collection: variants_variants + datafiles: + - https://api.data.igvf.org/reference-files/IGVFFI4988BAVR/@@download/IGVFFI4988BAVR.csv.gz # AFR_chr1_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI6426PMAM/@@download/IGVFFI6426PMAM.csv.gz # AFR_chr1_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI6017SFTI/@@download/IGVFFI6017SFTI.csv.gz # AFR_chr10_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI1575BTNO/@@download/IGVFFI1575BTNO.csv.gz # AFR_chr10_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI3689PSHO/@@download/IGVFFI3689PSHO.csv.gz # AFR_chr11_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI8869CGUR/@@download/IGVFFI8869CGUR.csv.gz # AFR_chr11_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI2345GBSJ/@@download/IGVFFI2345GBSJ.csv.gz # AFR_chr12_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI1503EMAN/@@download/IGVFFI1503EMAN.csv.gz # AFR_chr12_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI6387MEUJ/@@download/IGVFFI6387MEUJ.csv.gz # AFR_chr13_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI1789WUOQ/@@download/IGVFFI1789WUOQ.csv.gz # AFR_chr13_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI4604UUMX/@@download/IGVFFI4604UUMX.csv.gz # AFR_chr14_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI6922SPXS/@@download/IGVFFI6922SPXS.csv.gz # AFR_chr14_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI6868KJLJ/@@download/IGVFFI6868KJLJ.csv.gz # AFR_chr15_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI1289UWHP/@@download/IGVFFI1289UWHP.csv.gz # AFR_chr15_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI8677QERA/@@download/IGVFFI8677QERA.csv.gz # AFR_chr16_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI7816QPIO/@@download/IGVFFI7816QPIO.csv.gz # AFR_chr16_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI3188ZNMS/@@download/IGVFFI3188ZNMS.csv.gz # AFR_chr17_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI2292ULMJ/@@download/IGVFFI2292ULMJ.csv.gz # AFR_chr17_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI4534XQQM/@@download/IGVFFI4534XQQM.csv.gz # AFR_chr18_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI8075AAJN/@@download/IGVFFI8075AAJN.csv.gz # AFR_chr18_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI2620GNFP/@@download/IGVFFI2620GNFP.csv.gz # AFR_chr19_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI7936FQRL/@@download/IGVFFI7936FQRL.csv.gz # AFR_chr19_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI0965IFWW/@@download/IGVFFI0965IFWW.csv.gz # 
AFR_chr2_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI0049LYPQ/@@download/IGVFFI0049LYPQ.csv.gz # AFR_chr2_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI8510ZUCB/@@download/IGVFFI8510ZUCB.csv.gz # AFR_chr20_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI9374PPQX/@@download/IGVFFI9374PPQX.csv.gz # AFR_chr20_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI0181VZRK/@@download/IGVFFI0181VZRK.csv.gz # AFR_chr21_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI7615LCWL/@@download/IGVFFI7615LCWL.csv.gz # AFR_chr21_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI5317OWWC/@@download/IGVFFI5317OWWC.csv.gz # AFR_chr22_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI9325TAJS/@@download/IGVFFI9325TAJS.csv.gz # AFR_chr22_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI0335ZNWE/@@download/IGVFFI0335ZNWE.csv.gz # AFR_chr3_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI8799TLOX/@@download/IGVFFI8799TLOX.csv.gz # AFR_chr3_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI8209BEML/@@download/IGVFFI8209BEML.csv.gz # AFR_chr4_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI7685OCDM/@@download/IGVFFI7685OCDM.csv.gz # AFR_chr4_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI1643QAFB/@@download/IGVFFI1643QAFB.csv.gz # AFR_chr5_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI9841ITGV/@@download/IGVFFI9841ITGV.csv.gz # AFR_chr5_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI8763ZXIC/@@download/IGVFFI8763ZXIC.csv.gz # AFR_chr6_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI1568NXGF/@@download/IGVFFI1568NXGF.csv.gz # AFR_chr6_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI0269VHOR/@@download/IGVFFI0269VHOR.csv.gz # AFR_chr7_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI6102BGSS/@@download/IGVFFI6102BGSS.csv.gz # AFR_chr7_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI3460LUBI/@@download/IGVFFI3460LUBI.csv.gz # AFR_chr8_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI9322KFAA/@@download/IGVFFI9322KFAA.csv.gz # AFR_chr8_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI8675OMUI/@@download/IGVFFI8675OMUI.csv.gz # AFR_chr9_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI6370WQTA/@@download/IGVFFI6370WQTA.csv.gz # AFR_chr9_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI0128BIUE/@@download/IGVFFI0128BIUE.csv.gz # AFR_chrX_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI5282ZKGG/@@download/IGVFFI5282ZKGG.csv.gz # AFR_chrX_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI6922VJRX/@@download/IGVFFI6922VJRX.csv.gz # EAS_chr1_no_filter_0.2_1000000_info_annotation.csv.gz + - 
https://api.data.igvf.org/reference-files/IGVFFI4205VZNV/@@download/IGVFFI4205VZNV.csv.gz # EAS_chr1_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI3299CJXR/@@download/IGVFFI3299CJXR.csv.gz # EAS_chr10_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI2813YYEM/@@download/IGVFFI2813YYEM.csv.gz # EAS_chr10_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI0001TKCQ/@@download/IGVFFI0001TKCQ.csv.gz # EAS_chr11_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI1675ESAE/@@download/IGVFFI1675ESAE.csv.gz # EAS_chr11_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI1681NTHN/@@download/IGVFFI1681NTHN.csv.gz # EAS_chr12_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI9250AWFY/@@download/IGVFFI9250AWFY.csv.gz # EAS_chr12_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI7982GYMX/@@download/IGVFFI7982GYMX.csv.gz # EAS_chr13_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI6644DYMA/@@download/IGVFFI6644DYMA.csv.gz # EAS_chr13_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI5971WJZL/@@download/IGVFFI5971WJZL.csv.gz # EAS_chr14_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI0443CEHK/@@download/IGVFFI0443CEHK.csv.gz # EAS_chr14_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI1388GVTD/@@download/IGVFFI1388GVTD.csv.gz # EAS_chr15_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI8903VMSH/@@download/IGVFFI8903VMSH.csv.gz # EAS_chr15_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI8253JGBS/@@download/IGVFFI8253JGBS.csv.gz # EAS_chr16_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI7819JSAV/@@download/IGVFFI7819JSAV.csv.gz # EAS_chr16_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI3716TZAE/@@download/IGVFFI3716TZAE.csv.gz # EAS_chr17_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI5125CXFC/@@download/IGVFFI5125CXFC.csv.gz # EAS_chr17_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI7959TXAE/@@download/IGVFFI7959TXAE.csv.gz # EAS_chr18_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI6273JSNT/@@download/IGVFFI6273JSNT.csv.gz # EAS_chr18_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI9810ULOS/@@download/IGVFFI9810ULOS.csv.gz # EAS_chr19_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI8398WWUU/@@download/IGVFFI8398WWUU.csv.gz # EAS_chr19_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI3236CCGQ/@@download/IGVFFI3236CCGQ.csv.gz # EAS_chr2_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI9146BMAQ/@@download/IGVFFI9146BMAQ.csv.gz # EAS_chr2_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI1215AGYG/@@download/IGVFFI1215AGYG.csv.gz # EAS_chr20_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI1000QIDI/@@download/IGVFFI1000QIDI.csv.gz 
# EAS_chr20_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI1396BWBR/@@download/IGVFFI1396BWBR.csv.gz # EAS_chr21_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI7405VBOA/@@download/IGVFFI7405VBOA.csv.gz # EAS_chr21_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI8527LSKE/@@download/IGVFFI8527LSKE.csv.gz # EAS_chr22_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI4552IBIK/@@download/IGVFFI4552IBIK.csv.gz # EAS_chr22_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI9236LAJC/@@download/IGVFFI9236LAJC.csv.gz # EAS_chr3_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI8307SIIN/@@download/IGVFFI8307SIIN.csv.gz # EAS_chr3_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI7621PGDL/@@download/IGVFFI7621PGDL.csv.gz # EAS_chr4_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI7919PYMG/@@download/IGVFFI7919PYMG.csv.gz # EAS_chr4_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI9069FNVL/@@download/IGVFFI9069FNVL.csv.gz # EAS_chr5_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI1105WFFT/@@download/IGVFFI1105WFFT.csv.gz # EAS_chr5_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI9606VGEM/@@download/IGVFFI9606VGEM.csv.gz # EAS_chr6_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI6091YUMG/@@download/IGVFFI6091YUMG.csv.gz # EAS_chr6_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI2017IBUK/@@download/IGVFFI2017IBUK.csv.gz # EAS_chr7_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI8938PDZC/@@download/IGVFFI8938PDZC.csv.gz # EAS_chr7_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI5905OOCH/@@download/IGVFFI5905OOCH.csv.gz # EAS_chr8_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI0517IUJD/@@download/IGVFFI0517IUJD.csv.gz # EAS_chr8_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI5695FIKM/@@download/IGVFFI5695FIKM.csv.gz # EAS_chr9_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI0691QMVD/@@download/IGVFFI0691QMVD.csv.gz # EAS_chr9_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI7274KIOJ/@@download/IGVFFI7274KIOJ.csv.gz # EAS_chrX_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI5524CIVU/@@download/IGVFFI5524CIVU.csv.gz # EAS_chrX_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI5747OROY/@@download/IGVFFI5747OROY.csv.gz # EUR_chr1_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI9466ROJP/@@download/IGVFFI9466ROJP.csv.gz # EUR_chr1_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI4320MFWQ/@@download/IGVFFI4320MFWQ.csv.gz # EUR_chr10_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI2802WZSX/@@download/IGVFFI2802WZSX.csv.gz # EUR_chr10_no_filter_0.2_1000000_LD.csv.gz + - 
https://api.data.igvf.org/reference-files/IGVFFI2951BQJL/@@download/IGVFFI2951BQJL.csv.gz # EUR_chr11_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI8973CELU/@@download/IGVFFI8973CELU.csv.gz # EUR_chr11_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI2572CZXG/@@download/IGVFFI2572CZXG.csv.gz # EUR_chr12_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI7343WANW/@@download/IGVFFI7343WANW.csv.gz # EUR_chr12_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI4525PWEM/@@download/IGVFFI4525PWEM.csv.gz # EUR_chr13_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI3440XURJ/@@download/IGVFFI3440XURJ.csv.gz # EUR_chr13_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI6963CFNO/@@download/IGVFFI6963CFNO.csv.gz # EUR_chr14_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI4016OWFC/@@download/IGVFFI4016OWFC.csv.gz # EUR_chr14_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI0629YVRR/@@download/IGVFFI0629YVRR.csv.gz # EUR_chr15_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI8977ELVB/@@download/IGVFFI8977ELVB.csv.gz # EUR_chr15_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI1262RQPS/@@download/IGVFFI1262RQPS.csv.gz # EUR_chr16_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI2340ZUIS/@@download/IGVFFI2340ZUIS.csv.gz # EUR_chr16_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI5776LESU/@@download/IGVFFI5776LESU.csv.gz # EUR_chr17_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI0630DFNF/@@download/IGVFFI0630DFNF.csv.gz # EUR_chr17_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI6221TIWW/@@download/IGVFFI6221TIWW.csv.gz # EUR_chr18_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI8561MJET/@@download/IGVFFI8561MJET.csv.gz # EUR_chr18_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI3930GXSL/@@download/IGVFFI3930GXSL.csv.gz # EUR_chr19_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI9140IFFB/@@download/IGVFFI9140IFFB.csv.gz # EUR_chr19_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI3348PNPF/@@download/IGVFFI3348PNPF.csv.gz # EUR_chr2_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI2638AADX/@@download/IGVFFI2638AADX.csv.gz # EUR_chr2_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI0057MIDM/@@download/IGVFFI0057MIDM.csv.gz # EUR_chr20_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI3388FIZH/@@download/IGVFFI3388FIZH.csv.gz # EUR_chr20_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI1077ZVIG/@@download/IGVFFI1077ZVIG.csv.gz # EUR_chr21_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI8131FVSK/@@download/IGVFFI8131FVSK.csv.gz # EUR_chr21_no_filter_0.2_1000000_LD.csv.gz + - 
https://api.data.igvf.org/reference-files/IGVFFI4539ZOSM/@@download/IGVFFI4539ZOSM.csv.gz # EUR_chr22_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI2927XMAG/@@download/IGVFFI2927XMAG.csv.gz # EUR_chr22_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI6537SONW/@@download/IGVFFI6537SONW.csv.gz # EUR_chr3_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI0957CFJL/@@download/IGVFFI0957CFJL.csv.gz # EUR_chr3_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI5064GTBB/@@download/IGVFFI5064GTBB.csv.gz # EUR_chr4_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI4814RFDK/@@download/IGVFFI4814RFDK.csv.gz # EUR_chr4_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI7008JCZM/@@download/IGVFFI7008JCZM.csv.gz # EUR_chr5_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI6763QCND/@@download/IGVFFI6763QCND.csv.gz # EUR_chr5_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI7406LNCF/@@download/IGVFFI7406LNCF.csv.gz # EUR_chr6_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI3840ZILD/@@download/IGVFFI3840ZILD.csv.gz # EUR_chr6_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI5166XCUK/@@download/IGVFFI5166XCUK.csv.gz # EUR_chr7_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI6781QNMF/@@download/IGVFFI6781QNMF.csv.gz # EUR_chr7_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI3042YQCU/@@download/IGVFFI3042YQCU.csv.gz # EUR_chr8_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI1492WUXS/@@download/IGVFFI1492WUXS.csv.gz # EUR_chr8_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI9925FVER/@@download/IGVFFI9925FVER.csv.gz # EUR_chr9_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI7932HPEQ/@@download/IGVFFI7932HPEQ.csv.gz # EUR_chr9_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI0751GXSN/@@download/IGVFFI0751GXSN.csv.gz # EUR_chrX_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI2708NZUX/@@download/IGVFFI2708NZUX.csv.gz # EUR_chrX_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI1964MBFM/@@download/IGVFFI1964MBFM.csv.gz # SAS_chr1_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI4069OUJK/@@download/IGVFFI4069OUJK.csv.gz # SAS_chr1_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI8464XBNQ/@@download/IGVFFI8464XBNQ.csv.gz # SAS_chr10_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI3928RLQG/@@download/IGVFFI3928RLQG.csv.gz # SAS_chr10_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI1846FZQS/@@download/IGVFFI1846FZQS.csv.gz # SAS_chr11_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI5684QZMS/@@download/IGVFFI5684QZMS.csv.gz # SAS_chr11_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI6950UNWT/@@download/IGVFFI6950UNWT.csv.gz # 
SAS_chr12_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI9611ESJS/@@download/IGVFFI9611ESJS.csv.gz # SAS_chr12_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI2980RYEV/@@download/IGVFFI2980RYEV.csv.gz # SAS_chr13_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI4404UDMA/@@download/IGVFFI4404UDMA.csv.gz # SAS_chr13_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI5689AJDD/@@download/IGVFFI5689AJDD.csv.gz # SAS_chr14_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI4923EDRM/@@download/IGVFFI4923EDRM.csv.gz # SAS_chr14_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI0801HKKO/@@download/IGVFFI0801HKKO.csv.gz # SAS_chr15_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI6829FTTM/@@download/IGVFFI6829FTTM.csv.gz # SAS_chr15_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI4532OZGU/@@download/IGVFFI4532OZGU.csv.gz # SAS_chr16_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI2893HUQC/@@download/IGVFFI2893HUQC.csv.gz # SAS_chr16_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI1996ZRWG/@@download/IGVFFI1996ZRWG.csv.gz # SAS_chr17_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI5506NDHQ/@@download/IGVFFI5506NDHQ.csv.gz # SAS_chr17_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI3873IOZR/@@download/IGVFFI3873IOZR.csv.gz # SAS_chr18_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI2572GHLF/@@download/IGVFFI2572GHLF.csv.gz # SAS_chr18_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI2273SWOJ/@@download/IGVFFI2273SWOJ.csv.gz # SAS_chr19_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI9734YTOZ/@@download/IGVFFI9734YTOZ.csv.gz # SAS_chr19_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI9301BLZL/@@download/IGVFFI9301BLZL.csv.gz # SAS_chr2_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI2811UJAX/@@download/IGVFFI2811UJAX.csv.gz # SAS_chr2_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI5658LJJR/@@download/IGVFFI5658LJJR.csv.gz # SAS_chr20_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI5801JXQW/@@download/IGVFFI5801JXQW.csv.gz # SAS_chr20_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI9219AECP/@@download/IGVFFI9219AECP.csv.gz # SAS_chr21_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI2272XSMF/@@download/IGVFFI2272XSMF.csv.gz # SAS_chr21_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI6472LNUM/@@download/IGVFFI6472LNUM.csv.gz # SAS_chr22_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI2647JZFP/@@download/IGVFFI2647JZFP.csv.gz # SAS_chr22_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI9115OEUN/@@download/IGVFFI9115OEUN.csv.gz # SAS_chr3_no_filter_0.2_1000000_info_annotation.csv.gz + - 
https://api.data.igvf.org/reference-files/IGVFFI9886EYDU/@@download/IGVFFI9886EYDU.csv.gz # SAS_chr3_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI0815UKHM/@@download/IGVFFI0815UKHM.csv.gz # SAS_chr4_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI5519UZMR/@@download/IGVFFI5519UZMR.csv.gz # SAS_chr4_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI0100CRLO/@@download/IGVFFI0100CRLO.csv.gz # SAS_chr5_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI3798YYON/@@download/IGVFFI3798YYON.csv.gz # SAS_chr5_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI1481EGLA/@@download/IGVFFI1481EGLA.csv.gz # SAS_chr6_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI1983CUUE/@@download/IGVFFI1983CUUE.csv.gz # SAS_chr6_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI1969GNAN/@@download/IGVFFI1969GNAN.csv.gz # SAS_chr7_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI5065IGEI/@@download/IGVFFI5065IGEI.csv.gz # SAS_chr7_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI1197OITD/@@download/IGVFFI1197OITD.csv.gz # SAS_chr8_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI1486IYBR/@@download/IGVFFI1486IYBR.csv.gz # SAS_chr8_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI2657WQXI/@@download/IGVFFI2657WQXI.csv.gz # SAS_chr9_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI0169SJFP/@@download/IGVFFI0169SJFP.csv.gz # SAS_chr9_no_filter_0.2_1000000_LD.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI2542ZRKC/@@download/IGVFFI2542ZRKC.csv.gz # SAS_chrX_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI4317EUQC/@@download/IGVFFI4317EUQC.csv.gz # SAS_chrX_no_filter_0.2_1000000_LD.csv.gz + +sequence variant: + collection: variants + datafiles: + - https://api.data.igvf.org/reference-files/IGVFFI0843RDRY/@@download/IGVFFI0843RDRY.vcf.gz # dbSNP155Nov2.chr1.mn.agds.vcf.gz + - https://api.data.igvf.org/reference-files/IGVFFI1416DAFY/@@download/IGVFFI1416DAFY.vcf.gz # dbSNP155Nov2.chr10.mn.agds.vcf.gz + - https://api.data.igvf.org/reference-files/IGVFFI2080RTXB/@@download/IGVFFI2080RTXB.vcf.gz # dbSNP155Nov2.chr11.mn.agds.vcf.gz + - https://api.data.igvf.org/reference-files/IGVFFI4162IYHI/@@download/IGVFFI4162IYHI.vcf.gz # dbSNP155Nov2.chr12.mn.agds.vcf.gz + - https://api.data.igvf.org/reference-files/IGVFFI5557ZASX/@@download/IGVFFI5557ZASX.vcf.gz # dbSNP155Nov2.chr13.mn.agds.vcf.gz + - https://api.data.igvf.org/reference-files/IGVFFI0796BXIT/@@download/IGVFFI0796BXIT.vcf.gz # dbSNP155Nov2.chr14.mn.agds.vcf.gz + - https://api.data.igvf.org/reference-files/IGVFFI4947WQME/@@download/IGVFFI4947WQME.vcf.gz # dbSNP155Nov2.chr15.mn.agds.vcf.gz + - https://api.data.igvf.org/reference-files/IGVFFI3090HTFJ/@@download/IGVFFI3090HTFJ.vcf.gz # dbSNP155Nov2.chr16.mn.agds.vcf.gz + - https://api.data.igvf.org/reference-files/IGVFFI1035MHRG/@@download/IGVFFI1035MHRG.vcf.gz # dbSNP155Nov2.chr17.mn.agds.vcf.gz + - https://api.data.igvf.org/reference-files/IGVFFI1796LZGF/@@download/IGVFFI1796LZGF.vcf.gz # dbSNP155Nov2.chr18.mn.agds.vcf.gz + - 
https://api.data.igvf.org/reference-files/IGVFFI2284ZDWV/@@download/IGVFFI2284ZDWV.vcf.gz # dbSNP155Nov2.chr19.mn.agds.vcf.gz
+    - https://api.data.igvf.org/reference-files/IGVFFI1938SSTV/@@download/IGVFFI1938SSTV.vcf.gz # dbSNP155Nov2.chr2.mn.agds.vcf.gz
+    - https://api.data.igvf.org/reference-files/IGVFFI3274JYIH/@@download/IGVFFI3274JYIH.vcf.gz # dbSNP155Nov2.chr20.mn.agds.vcf.gz
+    - https://api.data.igvf.org/reference-files/IGVFFI2738FFUW/@@download/IGVFFI2738FFUW.vcf.gz # dbSNP155Nov2.chr21.mn.agds.vcf.gz
+    - https://api.data.igvf.org/reference-files/IGVFFI8213FCFM/@@download/IGVFFI8213FCFM.vcf.gz # dbSNP155Nov2.chr22.mn.agds.vcf.gz
+    - https://api.data.igvf.org/reference-files/IGVFFI5318ERCL/@@download/IGVFFI5318ERCL.vcf.gz # dbSNP155Nov2.chr3.mn.agds.vcf.gz
+    - https://api.data.igvf.org/reference-files/IGVFFI5900YTWR/@@download/IGVFFI5900YTWR.vcf.gz # dbSNP155Nov2.chr4.mn.agds.vcf.gz
+    - https://api.data.igvf.org/reference-files/IGVFFI0142CSVH/@@download/IGVFFI0142CSVH.vcf.gz # dbSNP155Nov2.chr5.mn.agds.vcf.gz
+    - https://api.data.igvf.org/reference-files/IGVFFI6615YNYR/@@download/IGVFFI6615YNYR.vcf.gz # dbSNP155Nov2.chr6.mn.agds.vcf.gz
+    - https://api.data.igvf.org/reference-files/IGVFFI3237WUWL/@@download/IGVFFI3237WUWL.vcf.gz # dbSNP155Nov2.chr7.mn.agds.vcf.gz
+    - https://api.data.igvf.org/reference-files/IGVFFI1945UTYR/@@download/IGVFFI1945UTYR.vcf.gz # dbSNP155Nov2.chr8.mn.agds.vcf.gz
+    - https://api.data.igvf.org/reference-files/IGVFFI2207IKBS/@@download/IGVFFI2207IKBS.vcf.gz # dbSNP155Nov2.chr9.mn.agds.vcf.gz
+
+gene:
+  collection: genes
+  datafiles: []
+
+transcript:
+  collection: transcripts
+  datafiles: []
+
+transcribed to:
+  collection: genes_transcripts
+  datafiles: []
+
+transcribed from:
+  collection: genes_transcripts
+  datafiles: []
+
+gene structure:
+  collection: genes_structure
+  datafiles: []
+
+mouse gene structure:
+  collection: mm_genes_structure
+  datafiles: []
+
+transcript contains gene structure:
+  collection: transcripts_genes_structure
+  datafiles: []
+
+mouse transcript contains mouse gene structure:
+  collection: mm_transcripts_mm_genes_structure
+  datafiles: []
+
+AFGR variant to regulatory region:
+  collection: variants_regulatory_regions
+  datafiles: []
+
+encode variant to regulatory region:
+  collection: variants_regulatory_regions
+  datafiles: []
+
+ontology term:
+  collection: ontology_terms
+  datafiles: []
+
+ontology relationship:
+  collection: ontology_terms_ontology_terms
+  datafiles: []
+
+protein:
+  collection: proteins
+  datafiles: []
+
+translates to:
+  collection: transcripts_proteins
+  datafiles: []
+
+variant to protein association:
+  collection: variants_proteins
+  datafiles: []
+
+allele specific binding:
+  collection: variants_proteins
+  datafiles: []
+
+allele specific binding cell ontology:
+  collection: variants_proteins_terms
+  datafiles: []
+
+translation of:
+  collection: transcripts_proteins
+  datafiles: []
+
+variant to gene association:
+  collection: variants_genes
+  datafiles: []
+
+gtex splice variant to gene association:
+  collection: variants_genes
+  datafiles: []
+
+gtex splice variant to gene association to ontology term:
+  collection: variants_genes_terms
+  datafiles: []
+
+gtex variant to gene expression association:
+  collection: variants_genes
+  datafiles: []
+
+gtex variant to gene expression association to ontology term:
+  collection: variants_genes_terms
+  datafiles: []
+
+AFGR splice variant to gene association:
+  collection: variants_genes
+  datafiles: []
+
+AFGR splice variant to gene association to ontology term:
+  collection: variants_genes_terms
+  datafiles: []
+
+AFGR variant to gene expression association:
+  collection: variants_genes
+  datafiles: []
+
+AFGR variant to gene expression association to ontology term:
+  collection: variants_genes_terms
+  datafiles: []
+
+regulatory element to gene expression association:
+  collection: regulatory_regions_genes
+  datafiles: []
+
+regulatory region:
+  collection: regulatory_regions
+  datafiles: []
+
+regulatory element to gene expression association to biosample:
+  collection: regulatory_regions_genes_biosamples
+  datafiles: []
+
+regulatory element to gene expression association to biosample to CHEBI treatment:
+  collection: regulatory_regions_genes_biosamples_treatments_CHEBI
+  datafiles: []
+
+regulatory element to gene expression association to biosample to protein treatment:
+  collection: regulatory_regions_genes_biosamples_treatments_proteins
+  datafiles: []
+
+donor:
+  collection: donors
+  datafiles: []
+
+regulatory element to gene expression association to biosample to donor:
+  collection: regulatory_regions_genes_biosamples_donors
+  datafiles: []
+
+regulatory element to biosample:
+  collection: regulatory_regions_biosamples
+  datafiles: []
+
+gaf:
+  collection: gene_products_terms
+  datafiles: []
+
+motif:
+  collection: motifs
+  datafiles:
+    - https://api.data.igvf.org/reference-files/IGVFFI9678CVIS/@@download/IGVFFI9678CVIS.tar.gz # HOCOMOCOv11_core_pwm_HUMAN_mono.tar.gz
+
+motif to protein:
+  collection: motifs_proteins
+  datafiles:
+    - https://api.data.igvf.org/reference-files/IGVFFI0050HPJU/@@download/IGVFFI0050HPJU.tsv.gz # HOCOMOCOv11_core_annotation_HUMAN_mono.tsv.gz
+
+protein to protein interaction:
+  collection: proteins_proteins
+  datafiles: []
+
+mouse gene to gene interaction:
+  collection: mm_genes_mm_genes
+  datafiles: []
+
+gene to gene coexpression association:
+  collection: genes_genes
+  datafiles: []
+
+gene to gene interaction:
+  collection: genes_genes
+  datafiles: []
+
+pathway:
+  collection: pathways
+  datafiles:
+    - https://api.data.igvf.org/reference-files/IGVFFI6573JZEO/@@download/IGVFFI6573JZEO.txt.gz # ReactomePathways.txt.gz
+
+gene to pathway association:
+  collection: genes_pathways
+  datafiles:
+    - https://api.data.igvf.org/reference-files/IGVFFI5159WVTH/@@download/IGVFFI5159WVTH.txt.gz # Ensembl2Reactome_All_Levels.txt.gz
+
+parent pathway of:
+  collection: pathways_pathways
+  datafiles:
+    - https://api.data.igvf.org/reference-files/IGVFFI8863FVFN/@@download/IGVFFI8863FVFN.txt.gz # ReactomePathwaysRelation.txt.gz
+
+study:
+  collection: studies
+  datafiles: []
+
+variant to phenotype:
+  collection: variants_phenotypes
+  datafiles: []
+
+variant to phenotype to study:
+  collection: variants_phenotypes_studies
+  datafiles: []
+
+drug:
+  collection: drugs
+  datafiles:
+    - https://api.data.igvf.org/reference-files/IGVFFI2997DUKO/@@download/IGVFFI2997DUKO.tsv.gz # pharmGKB_chemicals.tsv.gz
+
+variant to drug:
+  collection: variants_drugs
+  datafiles:
+    - https://api.data.igvf.org/reference-files/IGVFFI4821BJHQ/@@download/IGVFFI4821BJHQ.tsv.gz # data_loading_support_files/pharmGKB_genes.tsv.gz
+    - https://api.data.igvf.org/reference-files/IGVFFI1149WTCK/@@download/IGVFFI1149WTCK.tsv.gz # data_loading_support_files/pharmGKB_study_parameters.tsv.gz
+
+variant drug association to gene:
+  collection: variants_drugs_genes
+  datafiles:
+    - https://api.data.igvf.org/reference-files/IGVFFI7955ICXJ/@@download/IGVFFI7955ICXJ.tsv.gz # data_loading_support_files/pharmGKB_variants.tsv.gz
+    - https://api.data.igvf.org/reference-files/IGVFFI8835SMSP/@@download/IGVFFI8835SMSP.tar.gz # variantAnnotations.tar.gz
+
+disease to gene:
+  collection: diseases_genes
+  datafiles:
+    - https://api.data.igvf.org/reference-files/IGVFFI4540ZCXZ/@@download/IGVFFI4540ZCXZ.xml.gz # en_product6.xml.gz
+
+variant to disease:
+  collection: variants_diseases
+  datafiles: []
+
+variant to disease to gene:
+  collection: variants_diseases_genes
+  datafiles: []
+
+gene to term:
+  collection: genes_biosamples
+  datafiles: []
+
+complex:
+  collection: complexes
+  datafiles:
+    - https://api.data.igvf.org/reference-files/IGVFFI1451BYVS/@@download/IGVFFI1451BYVS.txt.gz # column_definitions_readme.txt.gz
+    - https://api.data.igvf.org/reference-files/IGVFFI1444TRQL/@@download/IGVFFI1444TRQL.tsv.gz # EBI_complex_proteins_9606.tsv.gz
+
+complex to protein:
+  collection: complexes_proteins
+  datafiles: []
+
+complex to term:
+  collection: complexes_terms
+  datafiles: []
+
+regulatory region mouse:
+  collection: mm_regulatory_regions
+  datafiles: []
+
+gene mouse:
+  collection: mm_genes
+  datafiles: []
+
+transcript mouse:
+  collection: mm_transcripts
+  datafiles: []
+
+sequence variant mouse:
+  collection: mm_variants
+  datafiles: []
+
+human mouse regulatory region mapping:
+  collection: regulatory_regions_mm_regulatory_regions
+  datafiles: []
+
+human mouse genes orthology:
+  collection: genes_mm_genes
+  datafiles: []
+
+coding variant:
+  collection: coding_variants
+  datafiles: []
+
+coding variant to protein:
+  collection: coding_variants_proteins
+  datafiles: []
+
+variants to coding variant:
+  collection: variants_coding_variants
+  datafiles: []

From 0a9a66a26342e6c48e4373bb2204e8f0bd1edfd0 Mon Sep 17 00:00:00 2001
From: Pedro Assis
Date: Wed, 9 Oct 2024 16:13:35 -1000
Subject: [PATCH 03/20] topld datafiles

---
 data/data_sources.yaml | 471 +++++++++++++++++++++++++++++++++++-----------
 1 file changed, 287 insertions(+), 184 deletions(-)

diff --git a/data/data_sources.yaml b/data/data_sources.yaml
index ad66ac25..3ec95b88 100644
--- a/data/data_sources.yaml
+++ b/data/data_sources.yaml
@@ -2,193 +2,296 @@
 # https://data.igvf.org/multireport/?type=File&content_type=&field=%40id&field=content_type&field=href&field=submitted_file_name
 # Example: https://data.igvf.org/multireport/?type=File&content_type=variants_variants&field=%40id&field=content_type&field=href&field=submitted_file_name
+# Example: pypy3 data_loader.py --adapter topld --output-bucket igvf-catalog-parsed-collections --filepath ~/topld/afr/AFR_chr1_no_filter_0.2_1000000_LD.csv --output-bucket-key variants_variants/topld_afr_chr1.jsonl --chr chr1 --annotation-filepath ~/topld/afr/AFR_chr1_no_filter_0.2_1000000_info_annotation.csv --ancestry AFR
 topld in linkage disequilibrium with:
   collection: variants_variants
+  params:
+    - chr
+    - annotation-filepath
+    - ancestry
+  command: pypy3 data_loader.py --adapter topld --output-bucket igvf-catalog-parsed-collections --filepath {datafile} --output-bucket-key {collection}/topld_{ancestry}_{chr}.jsonl --chr {chr} --annotation-filepath {annotation_datafile} --ancestry {ancestry}
+  pypy3: true
   datafiles:
-    - https://api.data.igvf.org/reference-files/IGVFFI4988BAVR/@@download/IGVFFI4988BAVR.csv.gz # AFR_chr1_no_filter_0.2_1000000_info_annotation.csv.gz
-    - https://api.data.igvf.org/reference-files/IGVFFI6426PMAM/@@download/IGVFFI6426PMAM.csv.gz # AFR_chr1_no_filter_0.2_1000000_LD.csv.gz
-    - https://api.data.igvf.org/reference-files/IGVFFI6017SFTI/@@download/IGVFFI6017SFTI.csv.gz # 
AFR_chr10_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI1575BTNO/@@download/IGVFFI1575BTNO.csv.gz # AFR_chr10_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI3689PSHO/@@download/IGVFFI3689PSHO.csv.gz # AFR_chr11_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI8869CGUR/@@download/IGVFFI8869CGUR.csv.gz # AFR_chr11_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI2345GBSJ/@@download/IGVFFI2345GBSJ.csv.gz # AFR_chr12_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI1503EMAN/@@download/IGVFFI1503EMAN.csv.gz # AFR_chr12_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI6387MEUJ/@@download/IGVFFI6387MEUJ.csv.gz # AFR_chr13_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI1789WUOQ/@@download/IGVFFI1789WUOQ.csv.gz # AFR_chr13_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI4604UUMX/@@download/IGVFFI4604UUMX.csv.gz # AFR_chr14_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI6922SPXS/@@download/IGVFFI6922SPXS.csv.gz # AFR_chr14_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI6868KJLJ/@@download/IGVFFI6868KJLJ.csv.gz # AFR_chr15_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI1289UWHP/@@download/IGVFFI1289UWHP.csv.gz # AFR_chr15_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI8677QERA/@@download/IGVFFI8677QERA.csv.gz # AFR_chr16_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI7816QPIO/@@download/IGVFFI7816QPIO.csv.gz # AFR_chr16_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI3188ZNMS/@@download/IGVFFI3188ZNMS.csv.gz # AFR_chr17_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI2292ULMJ/@@download/IGVFFI2292ULMJ.csv.gz # AFR_chr17_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI4534XQQM/@@download/IGVFFI4534XQQM.csv.gz # AFR_chr18_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI8075AAJN/@@download/IGVFFI8075AAJN.csv.gz # AFR_chr18_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI2620GNFP/@@download/IGVFFI2620GNFP.csv.gz # AFR_chr19_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI7936FQRL/@@download/IGVFFI7936FQRL.csv.gz # AFR_chr19_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI0965IFWW/@@download/IGVFFI0965IFWW.csv.gz # AFR_chr2_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI0049LYPQ/@@download/IGVFFI0049LYPQ.csv.gz # AFR_chr2_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI8510ZUCB/@@download/IGVFFI8510ZUCB.csv.gz # AFR_chr20_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI9374PPQX/@@download/IGVFFI9374PPQX.csv.gz # AFR_chr20_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI0181VZRK/@@download/IGVFFI0181VZRK.csv.gz # AFR_chr21_no_filter_0.2_1000000_info_annotation.csv.gz - - 
https://api.data.igvf.org/reference-files/IGVFFI7615LCWL/@@download/IGVFFI7615LCWL.csv.gz # AFR_chr21_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI5317OWWC/@@download/IGVFFI5317OWWC.csv.gz # AFR_chr22_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI9325TAJS/@@download/IGVFFI9325TAJS.csv.gz # AFR_chr22_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI0335ZNWE/@@download/IGVFFI0335ZNWE.csv.gz # AFR_chr3_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI8799TLOX/@@download/IGVFFI8799TLOX.csv.gz # AFR_chr3_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI8209BEML/@@download/IGVFFI8209BEML.csv.gz # AFR_chr4_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI7685OCDM/@@download/IGVFFI7685OCDM.csv.gz # AFR_chr4_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI1643QAFB/@@download/IGVFFI1643QAFB.csv.gz # AFR_chr5_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI9841ITGV/@@download/IGVFFI9841ITGV.csv.gz # AFR_chr5_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI8763ZXIC/@@download/IGVFFI8763ZXIC.csv.gz # AFR_chr6_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI1568NXGF/@@download/IGVFFI1568NXGF.csv.gz # AFR_chr6_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI0269VHOR/@@download/IGVFFI0269VHOR.csv.gz # AFR_chr7_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI6102BGSS/@@download/IGVFFI6102BGSS.csv.gz # AFR_chr7_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI3460LUBI/@@download/IGVFFI3460LUBI.csv.gz # AFR_chr8_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI9322KFAA/@@download/IGVFFI9322KFAA.csv.gz # AFR_chr8_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI8675OMUI/@@download/IGVFFI8675OMUI.csv.gz # AFR_chr9_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI6370WQTA/@@download/IGVFFI6370WQTA.csv.gz # AFR_chr9_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI0128BIUE/@@download/IGVFFI0128BIUE.csv.gz # AFR_chrX_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI5282ZKGG/@@download/IGVFFI5282ZKGG.csv.gz # AFR_chrX_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI6922VJRX/@@download/IGVFFI6922VJRX.csv.gz # EAS_chr1_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI4205VZNV/@@download/IGVFFI4205VZNV.csv.gz # EAS_chr1_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI3299CJXR/@@download/IGVFFI3299CJXR.csv.gz # EAS_chr10_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI2813YYEM/@@download/IGVFFI2813YYEM.csv.gz # EAS_chr10_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI0001TKCQ/@@download/IGVFFI0001TKCQ.csv.gz # EAS_chr11_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI1675ESAE/@@download/IGVFFI1675ESAE.csv.gz # 
EAS_chr11_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI1681NTHN/@@download/IGVFFI1681NTHN.csv.gz # EAS_chr12_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI9250AWFY/@@download/IGVFFI9250AWFY.csv.gz # EAS_chr12_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI7982GYMX/@@download/IGVFFI7982GYMX.csv.gz # EAS_chr13_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI6644DYMA/@@download/IGVFFI6644DYMA.csv.gz # EAS_chr13_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI5971WJZL/@@download/IGVFFI5971WJZL.csv.gz # EAS_chr14_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI0443CEHK/@@download/IGVFFI0443CEHK.csv.gz # EAS_chr14_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI1388GVTD/@@download/IGVFFI1388GVTD.csv.gz # EAS_chr15_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI8903VMSH/@@download/IGVFFI8903VMSH.csv.gz # EAS_chr15_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI8253JGBS/@@download/IGVFFI8253JGBS.csv.gz # EAS_chr16_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI7819JSAV/@@download/IGVFFI7819JSAV.csv.gz # EAS_chr16_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI3716TZAE/@@download/IGVFFI3716TZAE.csv.gz # EAS_chr17_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI5125CXFC/@@download/IGVFFI5125CXFC.csv.gz # EAS_chr17_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI7959TXAE/@@download/IGVFFI7959TXAE.csv.gz # EAS_chr18_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI6273JSNT/@@download/IGVFFI6273JSNT.csv.gz # EAS_chr18_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI9810ULOS/@@download/IGVFFI9810ULOS.csv.gz # EAS_chr19_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI8398WWUU/@@download/IGVFFI8398WWUU.csv.gz # EAS_chr19_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI3236CCGQ/@@download/IGVFFI3236CCGQ.csv.gz # EAS_chr2_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI9146BMAQ/@@download/IGVFFI9146BMAQ.csv.gz # EAS_chr2_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI1215AGYG/@@download/IGVFFI1215AGYG.csv.gz # EAS_chr20_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI1000QIDI/@@download/IGVFFI1000QIDI.csv.gz # EAS_chr20_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI1396BWBR/@@download/IGVFFI1396BWBR.csv.gz # EAS_chr21_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI7405VBOA/@@download/IGVFFI7405VBOA.csv.gz # EAS_chr21_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI8527LSKE/@@download/IGVFFI8527LSKE.csv.gz # EAS_chr22_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI4552IBIK/@@download/IGVFFI4552IBIK.csv.gz # EAS_chr22_no_filter_0.2_1000000_LD.csv.gz - - 
https://api.data.igvf.org/reference-files/IGVFFI9236LAJC/@@download/IGVFFI9236LAJC.csv.gz # EAS_chr3_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI8307SIIN/@@download/IGVFFI8307SIIN.csv.gz # EAS_chr3_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI7621PGDL/@@download/IGVFFI7621PGDL.csv.gz # EAS_chr4_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI7919PYMG/@@download/IGVFFI7919PYMG.csv.gz # EAS_chr4_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI9069FNVL/@@download/IGVFFI9069FNVL.csv.gz # EAS_chr5_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI1105WFFT/@@download/IGVFFI1105WFFT.csv.gz # EAS_chr5_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI9606VGEM/@@download/IGVFFI9606VGEM.csv.gz # EAS_chr6_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI6091YUMG/@@download/IGVFFI6091YUMG.csv.gz # EAS_chr6_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI2017IBUK/@@download/IGVFFI2017IBUK.csv.gz # EAS_chr7_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI8938PDZC/@@download/IGVFFI8938PDZC.csv.gz # EAS_chr7_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI5905OOCH/@@download/IGVFFI5905OOCH.csv.gz # EAS_chr8_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI0517IUJD/@@download/IGVFFI0517IUJD.csv.gz # EAS_chr8_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI5695FIKM/@@download/IGVFFI5695FIKM.csv.gz # EAS_chr9_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI0691QMVD/@@download/IGVFFI0691QMVD.csv.gz # EAS_chr9_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI7274KIOJ/@@download/IGVFFI7274KIOJ.csv.gz # EAS_chrX_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI5524CIVU/@@download/IGVFFI5524CIVU.csv.gz # EAS_chrX_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI5747OROY/@@download/IGVFFI5747OROY.csv.gz # EUR_chr1_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI9466ROJP/@@download/IGVFFI9466ROJP.csv.gz # EUR_chr1_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI4320MFWQ/@@download/IGVFFI4320MFWQ.csv.gz # EUR_chr10_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI2802WZSX/@@download/IGVFFI2802WZSX.csv.gz # EUR_chr10_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI2951BQJL/@@download/IGVFFI2951BQJL.csv.gz # EUR_chr11_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI8973CELU/@@download/IGVFFI8973CELU.csv.gz # EUR_chr11_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI2572CZXG/@@download/IGVFFI2572CZXG.csv.gz # EUR_chr12_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI7343WANW/@@download/IGVFFI7343WANW.csv.gz # EUR_chr12_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI4525PWEM/@@download/IGVFFI4525PWEM.csv.gz # 
EUR_chr13_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI3440XURJ/@@download/IGVFFI3440XURJ.csv.gz # EUR_chr13_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI6963CFNO/@@download/IGVFFI6963CFNO.csv.gz # EUR_chr14_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI4016OWFC/@@download/IGVFFI4016OWFC.csv.gz # EUR_chr14_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI0629YVRR/@@download/IGVFFI0629YVRR.csv.gz # EUR_chr15_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI8977ELVB/@@download/IGVFFI8977ELVB.csv.gz # EUR_chr15_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI1262RQPS/@@download/IGVFFI1262RQPS.csv.gz # EUR_chr16_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI2340ZUIS/@@download/IGVFFI2340ZUIS.csv.gz # EUR_chr16_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI5776LESU/@@download/IGVFFI5776LESU.csv.gz # EUR_chr17_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI0630DFNF/@@download/IGVFFI0630DFNF.csv.gz # EUR_chr17_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI6221TIWW/@@download/IGVFFI6221TIWW.csv.gz # EUR_chr18_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI8561MJET/@@download/IGVFFI8561MJET.csv.gz # EUR_chr18_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI3930GXSL/@@download/IGVFFI3930GXSL.csv.gz # EUR_chr19_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI9140IFFB/@@download/IGVFFI9140IFFB.csv.gz # EUR_chr19_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI3348PNPF/@@download/IGVFFI3348PNPF.csv.gz # EUR_chr2_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI2638AADX/@@download/IGVFFI2638AADX.csv.gz # EUR_chr2_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI0057MIDM/@@download/IGVFFI0057MIDM.csv.gz # EUR_chr20_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI3388FIZH/@@download/IGVFFI3388FIZH.csv.gz # EUR_chr20_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI1077ZVIG/@@download/IGVFFI1077ZVIG.csv.gz # EUR_chr21_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI8131FVSK/@@download/IGVFFI8131FVSK.csv.gz # EUR_chr21_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI4539ZOSM/@@download/IGVFFI4539ZOSM.csv.gz # EUR_chr22_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI2927XMAG/@@download/IGVFFI2927XMAG.csv.gz # EUR_chr22_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI6537SONW/@@download/IGVFFI6537SONW.csv.gz # EUR_chr3_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI0957CFJL/@@download/IGVFFI0957CFJL.csv.gz # EUR_chr3_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI5064GTBB/@@download/IGVFFI5064GTBB.csv.gz # EUR_chr4_no_filter_0.2_1000000_info_annotation.csv.gz - - 
https://api.data.igvf.org/reference-files/IGVFFI4814RFDK/@@download/IGVFFI4814RFDK.csv.gz # EUR_chr4_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI7008JCZM/@@download/IGVFFI7008JCZM.csv.gz # EUR_chr5_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI6763QCND/@@download/IGVFFI6763QCND.csv.gz # EUR_chr5_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI7406LNCF/@@download/IGVFFI7406LNCF.csv.gz # EUR_chr6_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI3840ZILD/@@download/IGVFFI3840ZILD.csv.gz # EUR_chr6_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI5166XCUK/@@download/IGVFFI5166XCUK.csv.gz # EUR_chr7_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI6781QNMF/@@download/IGVFFI6781QNMF.csv.gz # EUR_chr7_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI3042YQCU/@@download/IGVFFI3042YQCU.csv.gz # EUR_chr8_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI1492WUXS/@@download/IGVFFI1492WUXS.csv.gz # EUR_chr8_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI9925FVER/@@download/IGVFFI9925FVER.csv.gz # EUR_chr9_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI7932HPEQ/@@download/IGVFFI7932HPEQ.csv.gz # EUR_chr9_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI0751GXSN/@@download/IGVFFI0751GXSN.csv.gz # EUR_chrX_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI2708NZUX/@@download/IGVFFI2708NZUX.csv.gz # EUR_chrX_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI1964MBFM/@@download/IGVFFI1964MBFM.csv.gz # SAS_chr1_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI4069OUJK/@@download/IGVFFI4069OUJK.csv.gz # SAS_chr1_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI8464XBNQ/@@download/IGVFFI8464XBNQ.csv.gz # SAS_chr10_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI3928RLQG/@@download/IGVFFI3928RLQG.csv.gz # SAS_chr10_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI1846FZQS/@@download/IGVFFI1846FZQS.csv.gz # SAS_chr11_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI5684QZMS/@@download/IGVFFI5684QZMS.csv.gz # SAS_chr11_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI6950UNWT/@@download/IGVFFI6950UNWT.csv.gz # SAS_chr12_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI9611ESJS/@@download/IGVFFI9611ESJS.csv.gz # SAS_chr12_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI2980RYEV/@@download/IGVFFI2980RYEV.csv.gz # SAS_chr13_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI4404UDMA/@@download/IGVFFI4404UDMA.csv.gz # SAS_chr13_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI5689AJDD/@@download/IGVFFI5689AJDD.csv.gz # SAS_chr14_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI4923EDRM/@@download/IGVFFI4923EDRM.csv.gz # 
SAS_chr14_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI0801HKKO/@@download/IGVFFI0801HKKO.csv.gz # SAS_chr15_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI6829FTTM/@@download/IGVFFI6829FTTM.csv.gz # SAS_chr15_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI4532OZGU/@@download/IGVFFI4532OZGU.csv.gz # SAS_chr16_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI2893HUQC/@@download/IGVFFI2893HUQC.csv.gz # SAS_chr16_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI1996ZRWG/@@download/IGVFFI1996ZRWG.csv.gz # SAS_chr17_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI5506NDHQ/@@download/IGVFFI5506NDHQ.csv.gz # SAS_chr17_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI3873IOZR/@@download/IGVFFI3873IOZR.csv.gz # SAS_chr18_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI2572GHLF/@@download/IGVFFI2572GHLF.csv.gz # SAS_chr18_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI2273SWOJ/@@download/IGVFFI2273SWOJ.csv.gz # SAS_chr19_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI9734YTOZ/@@download/IGVFFI9734YTOZ.csv.gz # SAS_chr19_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI9301BLZL/@@download/IGVFFI9301BLZL.csv.gz # SAS_chr2_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI2811UJAX/@@download/IGVFFI2811UJAX.csv.gz # SAS_chr2_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI5658LJJR/@@download/IGVFFI5658LJJR.csv.gz # SAS_chr20_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI5801JXQW/@@download/IGVFFI5801JXQW.csv.gz # SAS_chr20_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI9219AECP/@@download/IGVFFI9219AECP.csv.gz # SAS_chr21_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI2272XSMF/@@download/IGVFFI2272XSMF.csv.gz # SAS_chr21_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI6472LNUM/@@download/IGVFFI6472LNUM.csv.gz # SAS_chr22_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI2647JZFP/@@download/IGVFFI2647JZFP.csv.gz # SAS_chr22_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI9115OEUN/@@download/IGVFFI9115OEUN.csv.gz # SAS_chr3_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI9886EYDU/@@download/IGVFFI9886EYDU.csv.gz # SAS_chr3_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI0815UKHM/@@download/IGVFFI0815UKHM.csv.gz # SAS_chr4_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI5519UZMR/@@download/IGVFFI5519UZMR.csv.gz # SAS_chr4_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI0100CRLO/@@download/IGVFFI0100CRLO.csv.gz # SAS_chr5_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI3798YYON/@@download/IGVFFI3798YYON.csv.gz # SAS_chr5_no_filter_0.2_1000000_LD.csv.gz - - 
https://api.data.igvf.org/reference-files/IGVFFI1481EGLA/@@download/IGVFFI1481EGLA.csv.gz # SAS_chr6_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI1983CUUE/@@download/IGVFFI1983CUUE.csv.gz # SAS_chr6_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI1969GNAN/@@download/IGVFFI1969GNAN.csv.gz # SAS_chr7_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI5065IGEI/@@download/IGVFFI5065IGEI.csv.gz # SAS_chr7_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI1197OITD/@@download/IGVFFI1197OITD.csv.gz # SAS_chr8_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI1486IYBR/@@download/IGVFFI1486IYBR.csv.gz # SAS_chr8_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI2657WQXI/@@download/IGVFFI2657WQXI.csv.gz # SAS_chr9_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI0169SJFP/@@download/IGVFFI0169SJFP.csv.gz # SAS_chr9_no_filter_0.2_1000000_LD.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI2542ZRKC/@@download/IGVFFI2542ZRKC.csv.gz # SAS_chrX_no_filter_0.2_1000000_info_annotation.csv.gz - - https://api.data.igvf.org/reference-files/IGVFFI4317EUQC/@@download/IGVFFI4317EUQC.csv.gz # SAS_chrX_no_filter_0.2_1000000_LD.csv.gz + - AFR: + - chr1: + - https://api.data.igvf.org/reference-files/IGVFFI4988BAVR/@@download/IGVFFI4988BAVR.csv.gz # AFR_chr1_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI6426PMAM/@@download/IGVFFI6426PMAM.csv.gz # AFR_chr1_no_filter_0.2_1000000_LD.csv.gz + - chr2: + - https://api.data.igvf.org/reference-files/IGVFFI0965IFWW/@@download/IGVFFI0965IFWW.csv.gz # AFR_chr2_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI0049LYPQ/@@download/IGVFFI0049LYPQ.csv.gz # AFR_chr2_no_filter_0.2_1000000_LD.csv.gz + - chr3: + - https://api.data.igvf.org/reference-files/IGVFFI0335ZNWE/@@download/IGVFFI0335ZNWE.csv.gz # AFR_chr3_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI8799TLOX/@@download/IGVFFI8799TLOX.csv.gz # AFR_chr3_no_filter_0.2_1000000_LD.csv.gz + - chr4: + - https://api.data.igvf.org/reference-files/IGVFFI8209BEML/@@download/IGVFFI8209BEML.csv.gz # AFR_chr4_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI7685OCDM/@@download/IGVFFI7685OCDM.csv.gz # AFR_chr4_no_filter_0.2_1000000_LD.csv.gz + - chr5: + - https://api.data.igvf.org/reference-files/IGVFFI1643QAFB/@@download/IGVFFI1643QAFB.csv.gz # AFR_chr5_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI9841ITGV/@@download/IGVFFI9841ITGV.csv.gz # AFR_chr5_no_filter_0.2_1000000_LD.csv.gz + - chr6: + - https://api.data.igvf.org/reference-files/IGVFFI8763ZXIC/@@download/IGVFFI8763ZXIC.csv.gz # AFR_chr6_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI1568NXGF/@@download/IGVFFI1568NXGF.csv.gz # AFR_chr6_no_filter_0.2_1000000_LD.csv.gz + - chr7: + - https://api.data.igvf.org/reference-files/IGVFFI0269VHOR/@@download/IGVFFI0269VHOR.csv.gz # AFR_chr7_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI6102BGSS/@@download/IGVFFI6102BGSS.csv.gz # AFR_chr7_no_filter_0.2_1000000_LD.csv.gz + - chr8: + - 
https://api.data.igvf.org/reference-files/IGVFFI3460LUBI/@@download/IGVFFI3460LUBI.csv.gz # AFR_chr8_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI9322KFAA/@@download/IGVFFI9322KFAA.csv.gz # AFR_chr8_no_filter_0.2_1000000_LD.csv.gz + - chr9: + - https://api.data.igvf.org/reference-files/IGVFFI8675OMUI/@@download/IGVFFI8675OMUI.csv.gz # AFR_chr9_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI6370WQTA/@@download/IGVFFI6370WQTA.csv.gz # AFR_chr9_no_filter_0.2_1000000_LD.csv.gz + - chr10: + - https://api.data.igvf.org/reference-files/IGVFFI6017SFTI/@@download/IGVFFI6017SFTI.csv.gz # AFR_chr10_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI1575BTNO/@@download/IGVFFI1575BTNO.csv.gz # AFR_chr10_no_filter_0.2_1000000_LD.csv.gz + - chr11: + - https://api.data.igvf.org/reference-files/IGVFFI3689PSHO/@@download/IGVFFI3689PSHO.csv.gz # AFR_chr11_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI8869CGUR/@@download/IGVFFI8869CGUR.csv.gz # AFR_chr11_no_filter_0.2_1000000_LD.csv.gz + - chr12: + - https://api.data.igvf.org/reference-files/IGVFFI2345GBSJ/@@download/IGVFFI2345GBSJ.csv.gz # AFR_chr12_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI1503EMAN/@@download/IGVFFI1503EMAN.csv.gz # AFR_chr12_no_filter_0.2_1000000_LD.csv.gz + - chr13: + - https://api.data.igvf.org/reference-files/IGVFFI6387MEUJ/@@download/IGVFFI6387MEUJ.csv.gz # AFR_chr13_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI1789WUOQ/@@download/IGVFFI1789WUOQ.csv.gz # AFR_chr13_no_filter_0.2_1000000_LD.csv.gz + - chr14: + - https://api.data.igvf.org/reference-files/IGVFFI4604UUMX/@@download/IGVFFI4604UUMX.csv.gz # AFR_chr14_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI6922SPXS/@@download/IGVFFI6922SPXS.csv.gz # AFR_chr14_no_filter_0.2_1000000_LD.csv.gz + - chr15: + - https://api.data.igvf.org/reference-files/IGVFFI6868KJLJ/@@download/IGVFFI6868KJLJ.csv.gz # AFR_chr15_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI1289UWHP/@@download/IGVFFI1289UWHP.csv.gz # AFR_chr15_no_filter_0.2_1000000_LD.csv.gz + - chr16: + - https://api.data.igvf.org/reference-files/IGVFFI8677QERA/@@download/IGVFFI8677QERA.csv.gz # AFR_chr16_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI7816QPIO/@@download/IGVFFI7816QPIO.csv.gz # AFR_chr16_no_filter_0.2_1000000_LD.csv.gz + - chr17: + - https://api.data.igvf.org/reference-files/IGVFFI3188ZNMS/@@download/IGVFFI3188ZNMS.csv.gz # AFR_chr17_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI2292ULMJ/@@download/IGVFFI2292ULMJ.csv.gz # AFR_chr17_no_filter_0.2_1000000_LD.csv.gz + - chr18: + - https://api.data.igvf.org/reference-files/IGVFFI4534XQQM/@@download/IGVFFI4534XQQM.csv.gz # AFR_chr18_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI8075AAJN/@@download/IGVFFI8075AAJN.csv.gz # AFR_chr18_no_filter_0.2_1000000_LD.csv.gz + - chr19: + - https://api.data.igvf.org/reference-files/IGVFFI2620GNFP/@@download/IGVFFI2620GNFP.csv.gz # AFR_chr19_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI7936FQRL/@@download/IGVFFI7936FQRL.csv.gz # 
AFR_chr19_no_filter_0.2_1000000_LD.csv.gz + - chr20: + - https://api.data.igvf.org/reference-files/IGVFFI8510ZUCB/@@download/IGVFFI8510ZUCB.csv.gz # AFR_chr20_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI9374PPQX/@@download/IGVFFI9374PPQX.csv.gz # AFR_chr20_no_filter_0.2_1000000_LD.csv.gz + - chr21: + - https://api.data.igvf.org/reference-files/IGVFFI0181VZRK/@@download/IGVFFI0181VZRK.csv.gz # AFR_chr21_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI7615LCWL/@@download/IGVFFI7615LCWL.csv.gz # AFR_chr21_no_filter_0.2_1000000_LD.csv.gz + - chr22: + - https://api.data.igvf.org/reference-files/IGVFFI5317OWWC/@@download/IGVFFI5317OWWC.csv.gz # AFR_chr22_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI9325TAJS/@@download/IGVFFI9325TAJS.csv.gz # AFR_chr22_no_filter_0.2_1000000_LD.csv.gz + - chrX: + - https://api.data.igvf.org/reference-files/IGVFFI0128BIUE/@@download/IGVFFI0128BIUE.csv.gz # AFR_chrX_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI5282ZKGG/@@download/IGVFFI5282ZKGG.csv.gz # AFR_chrX_no_filter_0.2_1000000_LD.csv.gz + - EAS: + - chr1: + - https://api.data.igvf.org/reference-files/IGVFFI6922VJRX/@@download/IGVFFI6922VJRX.csv.gz # EAS_chr1_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI4205VZNV/@@download/IGVFFI4205VZNV.csv.gz # EAS_chr1_no_filter_0.2_1000000_LD.csv.gz + - chr2: + - https://api.data.igvf.org/reference-files/IGVFFI3236CCGQ/@@download/IGVFFI3236CCGQ.csv.gz # EAS_chr2_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI9146BMAQ/@@download/IGVFFI9146BMAQ.csv.gz # EAS_chr2_no_filter_0.2_1000000_LD.csv.gz + - chr3: + - https://api.data.igvf.org/reference-files/IGVFFI9236LAJC/@@download/IGVFFI9236LAJC.csv.gz # EAS_chr3_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI8307SIIN/@@download/IGVFFI8307SIIN.csv.gz # EAS_chr3_no_filter_0.2_1000000_LD.csv.gz + - chr4: + - https://api.data.igvf.org/reference-files/IGVFFI7621PGDL/@@download/IGVFFI7621PGDL.csv.gz # EAS_chr4_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI7919PYMG/@@download/IGVFFI7919PYMG.csv.gz # EAS_chr4_no_filter_0.2_1000000_LD.csv.gz + - chr5: + - https://api.data.igvf.org/reference-files/IGVFFI9069FNVL/@@download/IGVFFI9069FNVL.csv.gz # EAS_chr5_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI1105WFFT/@@download/IGVFFI1105WFFT.csv.gz # EAS_chr5_no_filter_0.2_1000000_LD.csv.gz + - chr6: + - https://api.data.igvf.org/reference-files/IGVFFI9606VGEM/@@download/IGVFFI9606VGEM.csv.gz # EAS_chr6_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI6091YUMG/@@download/IGVFFI6091YUMG.csv.gz # EAS_chr6_no_filter_0.2_1000000_LD.csv.gz + - chr7: + - https://api.data.igvf.org/reference-files/IGVFFI2017IBUK/@@download/IGVFFI2017IBUK.csv.gz # EAS_chr7_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI8938PDZC/@@download/IGVFFI8938PDZC.csv.gz # EAS_chr7_no_filter_0.2_1000000_LD.csv.gz + - chr8: + - https://api.data.igvf.org/reference-files/IGVFFI5905OOCH/@@download/IGVFFI5905OOCH.csv.gz # EAS_chr8_no_filter_0.2_1000000_info_annotation.csv.gz + - 
https://api.data.igvf.org/reference-files/IGVFFI0517IUJD/@@download/IGVFFI0517IUJD.csv.gz # EAS_chr8_no_filter_0.2_1000000_LD.csv.gz + - chr9: + - https://api.data.igvf.org/reference-files/IGVFFI5695FIKM/@@download/IGVFFI5695FIKM.csv.gz # EAS_chr9_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI0691QMVD/@@download/IGVFFI0691QMVD.csv.gz # EAS_chr9_no_filter_0.2_1000000_LD.csv.gz + - chr10: + - https://api.data.igvf.org/reference-files/IGVFFI3299CJXR/@@download/IGVFFI3299CJXR.csv.gz # EAS_chr10_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI2813YYEM/@@download/IGVFFI2813YYEM.csv.gz # EAS_chr10_no_filter_0.2_1000000_LD.csv.gz + - chr11: + - https://api.data.igvf.org/reference-files/IGVFFI0001TKCQ/@@download/IGVFFI0001TKCQ.csv.gz # EAS_chr11_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI1675ESAE/@@download/IGVFFI1675ESAE.csv.gz # EAS_chr11_no_filter_0.2_1000000_LD.csv.gz + - chr12: + - https://api.data.igvf.org/reference-files/IGVFFI1681NTHN/@@download/IGVFFI1681NTHN.csv.gz # EAS_chr12_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI9250AWFY/@@download/IGVFFI9250AWFY.csv.gz # EAS_chr12_no_filter_0.2_1000000_LD.csv.gz + - chr13: + - https://api.data.igvf.org/reference-files/IGVFFI7982GYMX/@@download/IGVFFI7982GYMX.csv.gz # EAS_chr13_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI6644DYMA/@@download/IGVFFI6644DYMA.csv.gz # EAS_chr13_no_filter_0.2_1000000_LD.csv.gz + - chr14: + - https://api.data.igvf.org/reference-files/IGVFFI5971WJZL/@@download/IGVFFI5971WJZL.csv.gz # EAS_chr14_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI0443CEHK/@@download/IGVFFI0443CEHK.csv.gz # EAS_chr14_no_filter_0.2_1000000_LD.csv.gz + - chr15: + - https://api.data.igvf.org/reference-files/IGVFFI1388GVTD/@@download/IGVFFI1388GVTD.csv.gz # EAS_chr15_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI8903VMSH/@@download/IGVFFI8903VMSH.csv.gz # EAS_chr15_no_filter_0.2_1000000_LD.csv.gz + - chr16: + - https://api.data.igvf.org/reference-files/IGVFFI8253JGBS/@@download/IGVFFI8253JGBS.csv.gz # EAS_chr16_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI7819JSAV/@@download/IGVFFI7819JSAV.csv.gz # EAS_chr16_no_filter_0.2_1000000_LD.csv.gz + - chr17: + - https://api.data.igvf.org/reference-files/IGVFFI3716TZAE/@@download/IGVFFI3716TZAE.csv.gz # EAS_chr17_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI5125CXFC/@@download/IGVFFI5125CXFC.csv.gz # EAS_chr17_no_filter_0.2_1000000_LD.csv.gz + - chr18: + - https://api.data.igvf.org/reference-files/IGVFFI7959TXAE/@@download/IGVFFI7959TXAE.csv.gz # EAS_chr18_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI6273JSNT/@@download/IGVFFI6273JSNT.csv.gz # EAS_chr18_no_filter_0.2_1000000_LD.csv.gz + - chr19: + - https://api.data.igvf.org/reference-files/IGVFFI9810ULOS/@@download/IGVFFI9810ULOS.csv.gz # EAS_chr19_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI8398WWUU/@@download/IGVFFI8398WWUU.csv.gz # EAS_chr19_no_filter_0.2_1000000_LD.csv.gz + - chr20: + - https://api.data.igvf.org/reference-files/IGVFFI1215AGYG/@@download/IGVFFI1215AGYG.csv.gz # 
EAS_chr20_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI1000QIDI/@@download/IGVFFI1000QIDI.csv.gz # EAS_chr20_no_filter_0.2_1000000_LD.csv.gz + - chr21: + - https://api.data.igvf.org/reference-files/IGVFFI1396BWBR/@@download/IGVFFI1396BWBR.csv.gz # EAS_chr21_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI7405VBOA/@@download/IGVFFI7405VBOA.csv.gz # EAS_chr21_no_filter_0.2_1000000_LD.csv.gz + - chr22: + - https://api.data.igvf.org/reference-files/IGVFFI8527LSKE/@@download/IGVFFI8527LSKE.csv.gz # EAS_chr22_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI4552IBIK/@@download/IGVFFI4552IBIK.csv.gz # EAS_chr22_no_filter_0.2_1000000_LD.csv.gz + - chrX: + - https://api.data.igvf.org/reference-files/IGVFFI7274KIOJ/@@download/IGVFFI7274KIOJ.csv.gz # EAS_chrX_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI5524CIVU/@@download/IGVFFI5524CIVU.csv.gz # EAS_chrX_no_filter_0.2_1000000_LD.csv.gz + - EUR: + - chr1: + - https://api.data.igvf.org/reference-files/IGVFFI5747OROY/@@download/IGVFFI5747OROY.csv.gz # EUR_chr1_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI9466ROJP/@@download/IGVFFI9466ROJP.csv.gz # EUR_chr1_no_filter_0.2_1000000_LD.csv.gz + - chr2: + - https://api.data.igvf.org/reference-files/IGVFFI3348PNPF/@@download/IGVFFI3348PNPF.csv.gz # EUR_chr2_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI2638AADX/@@download/IGVFFI2638AADX.csv.gz # EUR_chr2_no_filter_0.2_1000000_LD.csv.gz + - chr3: + - https://api.data.igvf.org/reference-files/IGVFFI6537SONW/@@download/IGVFFI6537SONW.csv.gz # EUR_chr3_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI0957CFJL/@@download/IGVFFI0957CFJL.csv.gz # EUR_chr3_no_filter_0.2_1000000_LD.csv.gz + - chr4: + - https://api.data.igvf.org/reference-files/IGVFFI5064GTBB/@@download/IGVFFI5064GTBB.csv.gz # EUR_chr4_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI4814RFDK/@@download/IGVFFI4814RFDK.csv.gz # EUR_chr4_no_filter_0.2_1000000_LD.csv.gz + - chr5: + - https://api.data.igvf.org/reference-files/IGVFFI7008JCZM/@@download/IGVFFI7008JCZM.csv.gz # EUR_chr5_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI6763QCND/@@download/IGVFFI6763QCND.csv.gz # EUR_chr5_no_filter_0.2_1000000_LD.csv.gz + - chr6: + - https://api.data.igvf.org/reference-files/IGVFFI7406LNCF/@@download/IGVFFI7406LNCF.csv.gz # EUR_chr6_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI3840ZILD/@@download/IGVFFI3840ZILD.csv.gz # EUR_chr6_no_filter_0.2_1000000_LD.csv.gz + - chr7: + - https://api.data.igvf.org/reference-files/IGVFFI5166XCUK/@@download/IGVFFI5166XCUK.csv.gz # EUR_chr7_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI6781QNMF/@@download/IGVFFI6781QNMF.csv.gz # EUR_chr7_no_filter_0.2_1000000_LD.csv.gz + - chr8: + - https://api.data.igvf.org/reference-files/IGVFFI3042YQCU/@@download/IGVFFI3042YQCU.csv.gz # EUR_chr8_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI1492WUXS/@@download/IGVFFI1492WUXS.csv.gz # EUR_chr8_no_filter_0.2_1000000_LD.csv.gz + - chr9: + - 
https://api.data.igvf.org/reference-files/IGVFFI9925FVER/@@download/IGVFFI9925FVER.csv.gz # EUR_chr9_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI7932HPEQ/@@download/IGVFFI7932HPEQ.csv.gz # EUR_chr9_no_filter_0.2_1000000_LD.csv.gz + - chr10: + - https://api.data.igvf.org/reference-files/IGVFFI4320MFWQ/@@download/IGVFFI4320MFWQ.csv.gz # EUR_chr10_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI2802WZSX/@@download/IGVFFI2802WZSX.csv.gz # EUR_chr10_no_filter_0.2_1000000_LD.csv.gz + - chr11: + - https://api.data.igvf.org/reference-files/IGVFFI2951BQJL/@@download/IGVFFI2951BQJL.csv.gz # EUR_chr11_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI8973CELU/@@download/IGVFFI8973CELU.csv.gz # EUR_chr11_no_filter_0.2_1000000_LD.csv.gz + - chr12: + - https://api.data.igvf.org/reference-files/IGVFFI2572CZXG/@@download/IGVFFI2572CZXG.csv.gz # EUR_chr12_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI7343WANW/@@download/IGVFFI7343WANW.csv.gz # EUR_chr12_no_filter_0.2_1000000_LD.csv.gz + - chr13: + - https://api.data.igvf.org/reference-files/IGVFFI4525PWEM/@@download/IGVFFI4525PWEM.csv.gz # EUR_chr13_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI3440XURJ/@@download/IGVFFI3440XURJ.csv.gz # EUR_chr13_no_filter_0.2_1000000_LD.csv.gz + - chr14: + - https://api.data.igvf.org/reference-files/IGVFFI6963CFNO/@@download/IGVFFI6963CFNO.csv.gz # EUR_chr14_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI4016OWFC/@@download/IGVFFI4016OWFC.csv.gz # EUR_chr14_no_filter_0.2_1000000_LD.csv.gz + - chr15: + - https://api.data.igvf.org/reference-files/IGVFFI0629YVRR/@@download/IGVFFI0629YVRR.csv.gz # EUR_chr15_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI8977ELVB/@@download/IGVFFI8977ELVB.csv.gz # EUR_chr15_no_filter_0.2_1000000_LD.csv.gz + - chr16: + - https://api.data.igvf.org/reference-files/IGVFFI1262RQPS/@@download/IGVFFI1262RQPS.csv.gz # EUR_chr16_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI2340ZUIS/@@download/IGVFFI2340ZUIS.csv.gz # EUR_chr16_no_filter_0.2_1000000_LD.csv.gz + - chr17: + - https://api.data.igvf.org/reference-files/IGVFFI5776LESU/@@download/IGVFFI5776LESU.csv.gz # EUR_chr17_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI0630DFNF/@@download/IGVFFI0630DFNF.csv.gz # EUR_chr17_no_filter_0.2_1000000_LD.csv.gz + - chr18: + - https://api.data.igvf.org/reference-files/IGVFFI6221TIWW/@@download/IGVFFI6221TIWW.csv.gz # EUR_chr18_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI8561MJET/@@download/IGVFFI8561MJET.csv.gz # EUR_chr18_no_filter_0.2_1000000_LD.csv.gz + - chr19: + - https://api.data.igvf.org/reference-files/IGVFFI3930GXSL/@@download/IGVFFI3930GXSL.csv.gz # EUR_chr19_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI9140IFFB/@@download/IGVFFI9140IFFB.csv.gz # EUR_chr19_no_filter_0.2_1000000_LD.csv.gz + - chr20: + - https://api.data.igvf.org/reference-files/IGVFFI0057MIDM/@@download/IGVFFI0057MIDM.csv.gz # EUR_chr20_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI3388FIZH/@@download/IGVFFI3388FIZH.csv.gz # 
EUR_chr20_no_filter_0.2_1000000_LD.csv.gz + - chr21: + - https://api.data.igvf.org/reference-files/IGVFFI1077ZVIG/@@download/IGVFFI1077ZVIG.csv.gz # EUR_chr21_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI8131FVSK/@@download/IGVFFI8131FVSK.csv.gz # EUR_chr21_no_filter_0.2_1000000_LD.csv.gz + - chr22: + - https://api.data.igvf.org/reference-files/IGVFFI4539ZOSM/@@download/IGVFFI4539ZOSM.csv.gz # EUR_chr22_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI2927XMAG/@@download/IGVFFI2927XMAG.csv.gz # EUR_chr22_no_filter_0.2_1000000_LD.csv.gz + - chrX: + - https://api.data.igvf.org/reference-files/IGVFFI0751GXSN/@@download/IGVFFI0751GXSN.csv.gz # EUR_chrX_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI2708NZUX/@@download/IGVFFI2708NZUX.csv.gz # EUR_chrX_no_filter_0.2_1000000_LD.csv.gz + - SAS: + - chr1: + - https://api.data.igvf.org/reference-files/IGVFFI1964MBFM/@@download/IGVFFI1964MBFM.csv.gz # SAS_chr1_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI4069OUJK/@@download/IGVFFI4069OUJK.csv.gz # SAS_chr1_no_filter_0.2_1000000_LD.csv.gz + - chr2: + - https://api.data.igvf.org/reference-files/IGVFFI9301BLZL/@@download/IGVFFI9301BLZL.csv.gz # SAS_chr2_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI2811UJAX/@@download/IGVFFI2811UJAX.csv.gz # SAS_chr2_no_filter_0.2_1000000_LD.csv.gz + - chr3: + - https://api.data.igvf.org/reference-files/IGVFFI9115OEUN/@@download/IGVFFI9115OEUN.csv.gz # SAS_chr3_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI9886EYDU/@@download/IGVFFI9886EYDU.csv.gz # SAS_chr3_no_filter_0.2_1000000_LD.csv.gz + - chr4: + - https://api.data.igvf.org/reference-files/IGVFFI0815UKHM/@@download/IGVFFI0815UKHM.csv.gz # SAS_chr4_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI5519UZMR/@@download/IGVFFI5519UZMR.csv.gz # SAS_chr4_no_filter_0.2_1000000_LD.csv.gz + - chr5: + - https://api.data.igvf.org/reference-files/IGVFFI0100CRLO/@@download/IGVFFI0100CRLO.csv.gz # SAS_chr5_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI3798YYON/@@download/IGVFFI3798YYON.csv.gz # SAS_chr5_no_filter_0.2_1000000_LD.csv.gz + - chr6: + - https://api.data.igvf.org/reference-files/IGVFFI1481EGLA/@@download/IGVFFI1481EGLA.csv.gz # SAS_chr6_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI1983CUUE/@@download/IGVFFI1983CUUE.csv.gz # SAS_chr6_no_filter_0.2_1000000_LD.csv.gz + - chr7: + - https://api.data.igvf.org/reference-files/IGVFFI1969GNAN/@@download/IGVFFI1969GNAN.csv.gz # SAS_chr7_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI5065IGEI/@@download/IGVFFI5065IGEI.csv.gz # SAS_chr7_no_filter_0.2_1000000_LD.csv.gz + - chr8: + - https://api.data.igvf.org/reference-files/IGVFFI1197OITD/@@download/IGVFFI1197OITD.csv.gz # SAS_chr8_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI1486IYBR/@@download/IGVFFI1486IYBR.csv.gz # SAS_chr8_no_filter_0.2_1000000_LD.csv.gz + - chr9: + - https://api.data.igvf.org/reference-files/IGVFFI2657WQXI/@@download/IGVFFI2657WQXI.csv.gz # SAS_chr9_no_filter_0.2_1000000_info_annotation.csv.gz + - 
https://api.data.igvf.org/reference-files/IGVFFI0169SJFP/@@download/IGVFFI0169SJFP.csv.gz # SAS_chr9_no_filter_0.2_1000000_LD.csv.gz + - chr10: + - https://api.data.igvf.org/reference-files/IGVFFI8464XBNQ/@@download/IGVFFI8464XBNQ.csv.gz # SAS_chr10_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI3928RLQG/@@download/IGVFFI3928RLQG.csv.gz # SAS_chr10_no_filter_0.2_1000000_LD.csv.gz + - chr11: + - https://api.data.igvf.org/reference-files/IGVFFI1846FZQS/@@download/IGVFFI1846FZQS.csv.gz # SAS_chr11_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI5684QZMS/@@download/IGVFFI5684QZMS.csv.gz # SAS_chr11_no_filter_0.2_1000000_LD.csv.gz + - chr12: + - https://api.data.igvf.org/reference-files/IGVFFI6950UNWT/@@download/IGVFFI6950UNWT.csv.gz # SAS_chr12_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI9611ESJS/@@download/IGVFFI9611ESJS.csv.gz # SAS_chr12_no_filter_0.2_1000000_LD.csv.gz + - chr13: + - https://api.data.igvf.org/reference-files/IGVFFI2980RYEV/@@download/IGVFFI2980RYEV.csv.gz # SAS_chr13_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI4404UDMA/@@download/IGVFFI4404UDMA.csv.gz # SAS_chr13_no_filter_0.2_1000000_LD.csv.gz + - chr14: + - https://api.data.igvf.org/reference-files/IGVFFI5689AJDD/@@download/IGVFFI5689AJDD.csv.gz # SAS_chr14_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI4923EDRM/@@download/IGVFFI4923EDRM.csv.gz # SAS_chr14_no_filter_0.2_1000000_LD.csv.gz + - chr15: + - https://api.data.igvf.org/reference-files/IGVFFI0801HKKO/@@download/IGVFFI0801HKKO.csv.gz # SAS_chr15_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI6829FTTM/@@download/IGVFFI6829FTTM.csv.gz # SAS_chr15_no_filter_0.2_1000000_LD.csv.gz + - chr16: + - https://api.data.igvf.org/reference-files/IGVFFI4532OZGU/@@download/IGVFFI4532OZGU.csv.gz # SAS_chr16_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI2893HUQC/@@download/IGVFFI2893HUQC.csv.gz # SAS_chr16_no_filter_0.2_1000000_LD.csv.gz + - chr17: + - https://api.data.igvf.org/reference-files/IGVFFI1996ZRWG/@@download/IGVFFI1996ZRWG.csv.gz # SAS_chr17_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI5506NDHQ/@@download/IGVFFI5506NDHQ.csv.gz # SAS_chr17_no_filter_0.2_1000000_LD.csv.gz + - chr18: + - https://api.data.igvf.org/reference-files/IGVFFI3873IOZR/@@download/IGVFFI3873IOZR.csv.gz # SAS_chr18_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI2572GHLF/@@download/IGVFFI2572GHLF.csv.gz # SAS_chr18_no_filter_0.2_1000000_LD.csv.gz + - chr19: + - https://api.data.igvf.org/reference-files/IGVFFI2273SWOJ/@@download/IGVFFI2273SWOJ.csv.gz # SAS_chr19_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI9734YTOZ/@@download/IGVFFI9734YTOZ.csv.gz # SAS_chr19_no_filter_0.2_1000000_LD.csv.gz + - chr20: + - https://api.data.igvf.org/reference-files/IGVFFI5658LJJR/@@download/IGVFFI5658LJJR.csv.gz # SAS_chr20_no_filter_0.2_1000000_info_annotation.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI5801JXQW/@@download/IGVFFI5801JXQW.csv.gz # SAS_chr20_no_filter_0.2_1000000_LD.csv.gz + - chr21: + - https://api.data.igvf.org/reference-files/IGVFFI9219AECP/@@download/IGVFFI9219AECP.csv.gz # 
SAS_chr21_no_filter_0.2_1000000_info_annotation.csv.gz
+      - https://api.data.igvf.org/reference-files/IGVFFI2272XSMF/@@download/IGVFFI2272XSMF.csv.gz # SAS_chr21_no_filter_0.2_1000000_LD.csv.gz
+    - chr22:
+      - https://api.data.igvf.org/reference-files/IGVFFI6472LNUM/@@download/IGVFFI6472LNUM.csv.gz # SAS_chr22_no_filter_0.2_1000000_info_annotation.csv.gz
+      - https://api.data.igvf.org/reference-files/IGVFFI2647JZFP/@@download/IGVFFI2647JZFP.csv.gz # SAS_chr22_no_filter_0.2_1000000_LD.csv.gz
+    - chrX:
+      - https://api.data.igvf.org/reference-files/IGVFFI2542ZRKC/@@download/IGVFFI2542ZRKC.csv.gz # SAS_chrX_no_filter_0.2_1000000_info_annotation.csv.gz
+      - https://api.data.igvf.org/reference-files/IGVFFI4317EUQC/@@download/IGVFFI4317EUQC.csv.gz # SAS_chrX_no_filter_0.2_1000000_LD.csv.gz
 
 sequence variant:
   collection: variants
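The hunk above replaces TopLD's flat file list with an ancestry > chromosome nesting, so each info_annotation file sits next to its LD file instead of being matched by filename. A minimal sketch of how that shape can be traversed, assuming each list item is a single-key mapping and each chromosome holds exactly the [info_annotation, LD] pair in that order (the helper name is hypothetical, not part of the patch):

import yaml  # PyYAML, assumed available alongside the loaders

def topld_file_pairs(datafiles):
    # datafiles -> [{ancestry: [{chrom: [info_url, ld_url]}, ...]}, ...]
    for ancestry_entry in datafiles:
        for ancestry, chromosomes in ancestry_entry.items():
            for chrom_entry in chromosomes:
                for chrom, urls in chrom_entry.items():
                    info_url, ld_url = urls  # info_annotation first, LD second
                    yield ancestry, chrom, info_url, ld_url

with open('data/data_sources.yaml') as f:
    sources = yaml.safe_load(f)
for row in topld_file_pairs(sources['topld in linkage disequilibrium with']['datafiles']):
    print(*row)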
From 68ee23d7417d90fe773d5c092b53ed744ed63c42 Mon Sep 17 00:00:00 2001
From: Pedro Assis
Date: Wed, 9 Oct 2024 17:01:28 -1000
Subject: [PATCH 04/20] updated favor files

---
 data/data_sources.yaml | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/data/data_sources.yaml b/data/data_sources.yaml
index 3ec95b88..34a4b8d9 100644
--- a/data/data_sources.yaml
+++ b/data/data_sources.yaml
@@ -293,10 +293,21 @@ topld in linkage disequilibrium with:
     - https://api.data.igvf.org/reference-files/IGVFFI2542ZRKC/@@download/IGVFFI2542ZRKC.csv.gz # SAS_chrX_no_filter_0.2_1000000_info_annotation.csv.gz
     - https://api.data.igvf.org/reference-files/IGVFFI4317EUQC/@@download/IGVFFI4317EUQC.csv.gz # SAS_chrX_no_filter_0.2_1000000_LD.csv.gz
 
+# Example: python3 data_loader.py --adapter favor --output-bucket igvf-catalog-parsed-collections --filepath ~/FAVORFULLDBdbSNP155/IGVFFI0843RDRY.vcf --output-bucket-key variants/favor_IGVFFI0843RDRY.vcf.jsonl
 sequence variant:
   collection: variants
+  command: pypy3 data_loader.py --adapter favor --output-bucket igvf-catalog-parsed-collections --filepath {datafile} --output-bucket-key {collection}/favor_{datafile}.jsonl
+  pypy3: false
   datafiles:
     - https://api.data.igvf.org/reference-files/IGVFFI0843RDRY/@@download/IGVFFI0843RDRY.vcf.gz # dbSNP155Nov2.chr1.mn.agds.vcf.gz
+    - https://api.data.igvf.org/reference-files/IGVFFI1938SSTV/@@download/IGVFFI1938SSTV.vcf.gz # dbSNP155Nov2.chr2.mn.agds.vcf.gz
+    - https://api.data.igvf.org/reference-files/IGVFFI5318ERCL/@@download/IGVFFI5318ERCL.vcf.gz # dbSNP155Nov2.chr3.mn.agds.vcf.gz
+    - https://api.data.igvf.org/reference-files/IGVFFI5900YTWR/@@download/IGVFFI5900YTWR.vcf.gz # dbSNP155Nov2.chr4.mn.agds.vcf.gz
+    - https://api.data.igvf.org/reference-files/IGVFFI0142CSVH/@@download/IGVFFI0142CSVH.vcf.gz # dbSNP155Nov2.chr5.mn.agds.vcf.gz
+    - https://api.data.igvf.org/reference-files/IGVFFI6615YNYR/@@download/IGVFFI6615YNYR.vcf.gz # dbSNP155Nov2.chr6.mn.agds.vcf.gz
+    - https://api.data.igvf.org/reference-files/IGVFFI3237WUWL/@@download/IGVFFI3237WUWL.vcf.gz # dbSNP155Nov2.chr7.mn.agds.vcf.gz
+    - https://api.data.igvf.org/reference-files/IGVFFI1945UTYR/@@download/IGVFFI1945UTYR.vcf.gz # dbSNP155Nov2.chr8.mn.agds.vcf.gz
+    - https://api.data.igvf.org/reference-files/IGVFFI2207IKBS/@@download/IGVFFI2207IKBS.vcf.gz # dbSNP155Nov2.chr9.mn.agds.vcf.gz
     - https://api.data.igvf.org/reference-files/IGVFFI1416DAFY/@@download/IGVFFI1416DAFY.vcf.gz # dbSNP155Nov2.chr10.mn.agds.vcf.gz
     - https://api.data.igvf.org/reference-files/IGVFFI2080RTXB/@@download/IGVFFI2080RTXB.vcf.gz # dbSNP155Nov2.chr11.mn.agds.vcf.gz
     - https://api.data.igvf.org/reference-files/IGVFFI4162IYHI/@@download/IGVFFI4162IYHI.vcf.gz # dbSNP155Nov2.chr12.mn.agds.vcf.gz
@@ -307,17 +318,12 @@ sequence variant:
     - https://api.data.igvf.org/reference-files/IGVFFI1035MHRG/@@download/IGVFFI1035MHRG.vcf.gz # dbSNP155Nov2.chr17.mn.agds.vcf.gz
     - https://api.data.igvf.org/reference-files/IGVFFI1796LZGF/@@download/IGVFFI1796LZGF.vcf.gz # dbSNP155Nov2.chr18.mn.agds.vcf.gz
     - https://api.data.igvf.org/reference-files/IGVFFI2284ZDWV/@@download/IGVFFI2284ZDWV.vcf.gz # dbSNP155Nov2.chr19.mn.agds.vcf.gz
-    - https://api.data.igvf.org/reference-files/IGVFFI1938SSTV/@@download/IGVFFI1938SSTV.vcf.gz # dbSNP155Nov2.chr2.mn.agds.vcf.gz
     - https://api.data.igvf.org/reference-files/IGVFFI3274JYIH/@@download/IGVFFI3274JYIH.vcf.gz # dbSNP155Nov2.chr20.mn.agds.vcf.gz
     - https://api.data.igvf.org/reference-files/IGVFFI2738FFUW/@@download/IGVFFI2738FFUW.vcf.gz # dbSNP155Nov2.chr21.mn.agds.vcf.gz
     - https://api.data.igvf.org/reference-files/IGVFFI8213FCFM/@@download/IGVFFI8213FCFM.vcf.gz # dbSNP155Nov2.chr22.mn.agds.vcf.gz
-    - https://api.data.igvf.org/reference-files/IGVFFI5318ERCL/@@download/IGVFFI5318ERCL.vcf.gz # dbSNP155Nov2.chr3.mn.agds.vcf.gz
-    - https://api.data.igvf.org/reference-files/IGVFFI5900YTWR/@@download/IGVFFI5900YTWR.vcf.gz # dbSNP155Nov2.chr4.mn.agds.vcf.gz
-    - https://api.data.igvf.org/reference-files/IGVFFI0142CSVH/@@download/IGVFFI0142CSVH.vcf.gz # dbSNP155Nov2.chr5.mn.agds.vcf.gz
-    - https://api.data.igvf.org/reference-files/IGVFFI6615YNYR/@@download/IGVFFI6615YNYR.vcf.gz # dbSNP155Nov2.chr6.mn.agds.vcf.gz
-    - https://api.data.igvf.org/reference-files/IGVFFI3237WUWL/@@download/IGVFFI3237WUWL.vcf.gz # dbSNP155Nov2.chr7.mn.agds.vcf.gz
-    - https://api.data.igvf.org/reference-files/IGVFFI1945UTYR/@@download/IGVFFI1945UTYR.vcf.gz # dbSNP155Nov2.chr8.mn.agds.vcf.gz
-    - https://api.data.igvf.org/reference-files/IGVFFI2207IKBS/@@download/IGVFFI2207IKBS.vcf.gz # dbSNP155Nov2.chr9.mn.agds.vcf.gz
+    - need updated chrX vcf
+    - need updated chrY vcf
+    - https://api.data.igvf.org/reference-files/IGVFFI2231RETG/@@download/IGVFFI2231RETG.vcf.gz # Y2AVECombinedVariants.rmna.mn.genome.v6.vcf.gz
 
 gene:
   collection: genes
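PATCH 04 is the first to attach a runnable command template to a collection; {datafile} and {collection} are placeholders the loader tooling fills in per file. A rough sketch of that expansion (an assumption about data_parser.py's behaviour, not its actual code):

import os

def render_command(template, url, collection):
    # Take the file name from the download URL and drop the .gz suffix,
    # matching the example above where IGVFFI0843RDRY.vcf.gz is loaded
    # as IGVFFI0843RDRY.vcf after decompression.
    datafile = os.path.basename(url)
    if datafile.endswith('.gz'):
        datafile = datafile[:-3]
    return template.format(datafile=datafile, collection=collection)

template = ('python3 data_loader.py --adapter favor '
            '--output-bucket igvf-catalog-parsed-collections '
            '--filepath {datafile} '
            '--output-bucket-key {collection}/favor_{datafile}.jsonl')
url = ('https://api.data.igvf.org/reference-files/IGVFFI0843RDRY/'
       '@@download/IGVFFI0843RDRY.vcf.gz')
print(render_command(template, url, 'variants'))
# ... --output-bucket-key variants/favor_IGVFFI0843RDRY.vcf.jsonl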
From b86c3959c5cb4977a3d5ad77951d4d4bd3e82c3f Mon Sep 17 00:00:00 2001
From: Pedro Assis
Date: Thu, 10 Oct 2024 11:28:08 -1000
Subject: [PATCH 05/20] adding genes files

---
 data/data_sources.yaml | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/data/data_sources.yaml b/data/data_sources.yaml
index 34a4b8d9..5b7e525b 100644
--- a/data/data_sources.yaml
+++ b/data/data_sources.yaml
@@ -325,9 +325,17 @@ sequence variant:
     - need updated chrX vcf
     - need updated chrY vcf
     - https://api.data.igvf.org/reference-files/IGVFFI2231RETG/@@download/IGVFFI2231RETG.vcf.gz # Y2AVECombinedVariants.rmna.mn.genome.v6.vcf.gz
 
+# Example: pypy3 data_loader.py --adapter gencode_genes --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/IGVFFI7217ZMJZ.gtf --output-bucket-key genes/genes_IGVFFI7217ZMJZ.gtf.jsonl --label gencode_gene --gene_alias_file_path samples/Homo_sapiens.gene_info.gz
 gene:
   collection: genes
-  datafiles: []
+  params:
+    - label
+    - gene_alias_file_path
+  command: pypy3 data_loader.py --adapter gencode_genes --output-bucket igvf-catalog-parsed-collections --filepath {datafile} --output-bucket-key genes/genes_{datafile}.jsonl --label gencode_gene --gene_alias_file_path samples/Homo_sapiens.gene_info.gz
+  pypy3: true
+  datafiles:
+    - https://api.data.igvf.org/reference-files/IGVFFI7217ZMJZ/@@download/IGVFFI7217ZMJZ.gtf.gz # Homo sapiens GRCh38 GENCODE v43 genome
+    - need Homo_sapiens.gene_info.gz in the data portal # gene alias support file: Homo_sapiens.gene_info.gz
 
 transcript:
   collection: transcripts
   datafiles: []

From 17c308135271e74caeedc32bc33cab80e20cc90c Mon Sep 17 00:00:00 2001
From: Pedro Assis
Date: Thu, 10 Oct 2024 13:56:53 -1000
Subject: [PATCH 06/20] removing unused param

---
 data/adapters/gencode_adapter.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/data/adapters/gencode_adapter.py b/data/adapters/gencode_adapter.py
index 0a62c186..cc8f3ba3 100644
--- a/data/adapters/gencode_adapter.py
+++ b/data/adapters/gencode_adapter.py
@@ -24,13 +24,12 @@ class Gencode:
 
     INDEX = {'chr': 0, 'type': 2, 'coord_start': 3, 'coord_end': 4, 'info': 8}
 
-    def __init__(self, filepath=None, label='gencode_transcript', organism='HUMAN', chr='all', dry_run=True, writer: Optional[Writer] = None, **kwargs):
+    def __init__(self, filepath=None, label='gencode_transcript', organism='HUMAN', dry_run=True, writer: Optional[Writer] = None, **kwargs):
         if label not in Gencode.ALLOWED_LABELS:
             raise ValueError('Invalid label. Allowed values: ' + ','.join(Gencode.ALLOWED_LABELS))
 
         self.filepath = filepath
-        self.chr = chr
         self.label = label
         self.organism = organism
         self.transcript_endpoint = 'transcripts/'
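With chr gone, the constructor above is driven entirely by filepath and label, which is exactly how the next patch's data_sources.yaml entries use it. A short usage sketch (the sample path and the process_file() entry point are assumptions, mirroring the other adapters; writer wiring omitted):

from adapters.gencode_adapter import Gencode

# One GTF drives three datasets; only the label changes what is emitted.
for label in ('gencode_transcript', 'transcribed_to', 'transcribed_from'):
    adapter = Gencode(filepath='./samples/gencode_sample.gtf', label=label)
    adapter.process_file()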
From 5008ed93961f376c957551788fbe1b342a3307da Mon Sep 17 00:00:00 2001
From: Pedro Assis
Date: Thu, 10 Oct 2024 13:57:08 -1000
Subject: [PATCH 07/20] adding gencode files

---
 data/data_sources.yaml | 29 +++++++++++++++++++++++------
 1 file changed, 23 insertions(+), 6 deletions(-)

diff --git a/data/data_sources.yaml b/data/data_sources.yaml
index 5b7e525b..edbcd929 100644
--- a/data/data_sources.yaml
+++ b/data/data_sources.yaml
@@ -325,29 +325,46 @@ sequence variant:
     - need updated chrX vcf
     - need updated chrY vcf
     - https://api.data.igvf.org/reference-files/IGVFFI2231RETG/@@download/IGVFFI2231RETG.vcf.gz # Y2AVECombinedVariants.rmna.mn.genome.v6.vcf.gz
 
-# Example: pypy3 data_loader.py --adapter gencode_genes --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/IGVFFI7217ZMJZ.gtf --output-bucket-key genes/genes_IGVFFI7217ZMJZ.gtf.jsonl --label gencode_gene --gene_alias_file_path samples/Homo_sapiens.gene_info.gz
+# Example: pypy3 data_loader.py --adapter gencode_genes --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/IGVFFI7217ZMJZ.gtf --output-bucket-key genes/genes_IGVFFI7217ZMJZ.gtf.jsonl --label gencode_gene --gene-alias-file-path ~/dataset/IGVFFI7344CFHT.tsv.gz
 gene:
   collection: genes
   params:
     - label
     - gene_alias_file_path
-  command: pypy3 data_loader.py --adapter gencode_genes --output-bucket igvf-catalog-parsed-collections --filepath {datafile} --output-bucket-key genes/genes_{datafile}.jsonl --label gencode_gene --gene_alias_file_path samples/Homo_sapiens.gene_info.gz
+  command: pypy3 data_loader.py --adapter gencode_genes --output-bucket igvf-catalog-parsed-collections --filepath {datafile} --output-bucket-key genes/genes_{datafile}.jsonl --label gencode_gene --gene-alias-file-path {gene_alias_datafile}
   pypy3: true
   datafiles:
     - https://api.data.igvf.org/reference-files/IGVFFI7217ZMJZ/@@download/IGVFFI7217ZMJZ.gtf.gz # Homo sapiens GRCh38 GENCODE v43 genome
-    - need Homo_sapiens.gene_info.gz in the data portal # gene alias support file: Homo_sapiens.gene_info.gz
+    - https://api.data.igvf.org/reference-files/IGVFFI7344CFHT/@@download/IGVFFI7344CFHT.tsv.gz # gene alias support file: Homo_sapiens.gene_info.gz
 
+# Example: pypy3 data_loader.py --adapter gencode_transcripts --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/IGVFFI7217ZMJZ.gtf --output-bucket-key transcripts/transcripts_IGVFFI7217ZMJZ.gtf.jsonl --label gencode_transcript
 transcript:
   collection: transcripts
-  datafiles: []
+  params:
+    - label
+  command: pypy3 data_loader.py --adapter gencode_transcripts --output-bucket igvf-catalog-parsed-collections --filepath {datafile} --output-bucket-key transcripts/transcripts_{datafile}.jsonl --label gencode_transcript
+  pypy3: true
+  datafiles:
+    - https://api.data.igvf.org/reference-files/IGVFFI7217ZMJZ/@@download/IGVFFI7217ZMJZ.gtf.gz # Homo sapiens GRCh38 GENCODE v43 genome
 
+# Example: pypy3 data_loader.py --adapter gencode_transcripts --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/IGVFFI7217ZMJZ.gtf --output-bucket-key genes_transcripts/genes_transcripts_IGVFFI7217ZMJZ.gtf.jsonl --label transcribed_to
 transcribed to:
   collection: genes_transcripts
-  datafiles: []
+  params:
+    - label
+  command: pypy3 data_loader.py --adapter gencode_transcripts --output-bucket igvf-catalog-parsed-collections --filepath {datafile} --output-bucket-key genes_transcripts/genes_transcripts_{datafile}.jsonl --label transcribed_to
+  pypy3: true
+  datafiles:
+    - https://api.data.igvf.org/reference-files/IGVFFI7217ZMJZ/@@download/IGVFFI7217ZMJZ.gtf.gz # Homo sapiens GRCh38 GENCODE v43 genome
 
+# Example: pypy3 data_loader.py --adapter gencode_transcripts --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/IGVFFI7217ZMJZ.gtf --output-bucket-key genes_transcripts/transcripts_genes_IGVFFI7217ZMJZ.gtf.jsonl --label transcribed_from
 transcribed from:
   collection: genes_transcripts
-  datafiles: []
+  params:
+    - label
+  pypy3: true
+  datafiles:
+    - https://api.data.igvf.org/reference-files/IGVFFI7217ZMJZ/@@download/IGVFFI7217ZMJZ.gtf.gz # Homo sapiens GRCh38 GENCODE v43 genome
 
 gene structure:
   collection: genes_structure

From 82b74744747569995f7d8498a80dec1c39c97947 Mon Sep 17 00:00:00 2001
From: Pedro Assis
Date: Fri, 11 Oct 2024 14:59:51 -1000
Subject: [PATCH 08/20] adding human protein files

---
 data/data_sources.yaml | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/data/data_sources.yaml b/data/data_sources.yaml
index edbcd929..7a58a15a 100644
--- a/data/data_sources.yaml
+++ b/data/data_sources.yaml
@@ -398,9 +398,21 @@ ontology relationship:
   collection: ontology_terms_ontology_terms
   datafiles: []
 
+# Examples:
+# - pypy3 data_loader.py --adapter UniProtKB_sprot --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/sprot.dat.gz --output-bucket-key proteins/sprot.jsonl --source "UniProtKB/Swiss-Prot"
+# - pypy3 data_loader.py --adapter UniProtKB_trembl --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/trembl.dat.gz --output-bucket-key proteins/trembl.jsonl --source "UniProtKB/TrEMBL"
 protein:
   collection: proteins
-  datafiles: []
+  params:
+    - taxonomy
+    - source
+  pypy3: true
+  command:
+    - pypy3 data_loader.py --adapter UniProtKB_sprot --output-bucket igvf-catalog-parsed-collections --filepath {datafile} --output-bucket-key proteins/sprot.jsonl --source {source}
+    - pypy3 data_loader.py --adapter UniProtKB_trembl --output-bucket igvf-catalog-parsed-collections --filepath {datafile} --output-bucket-key proteins/trembl.jsonl --source {source}
+  datafiles:
+    - https://api.data.igvf.org/reference-files/IGVFFI7936CWLG/@@download/IGVFFI7936CWLG.dat.gz # uniprot_sprot_human.dat.gz
+    - https://api.data.igvf.org/reference-files/IGVFFI6222RRHX/@@download/IGVFFI6222RRHX.dat.gz # uniprot_trembl_human.dat.gz
 
 translates to:
   collection: transcripts_proteins
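The protein block is unusual in listing two command templates for one collection, because Swiss-Prot and TrEMBL records go through different adapters. A hypothetical dispatch keyed off the sprot/trembl hint carried in each datafile's comment (the real selection logic may differ):

def uniprot_invocation(datafile_comment):
    # Map a datafile to its adapter name and --source value.
    if 'sprot' in datafile_comment:
        return 'UniProtKB_sprot', 'UniProtKB/Swiss-Prot'
    if 'trembl' in datafile_comment:
        return 'UniProtKB_trembl', 'UniProtKB/TrEMBL'
    raise ValueError('not a UniProtKB datafile: ' + datafile_comment)

print(uniprot_invocation('uniprot_sprot_human.dat.gz'))
print(uniprot_invocation('uniprot_trembl_human.dat.gz'))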
From 23592fe761d648d24b8e7588eb4948dc41ef9751 Mon Sep 17 00:00:00 2001
From: Pedro Assis
Date: Thu, 17 Oct 2024 08:27:35 -1000
Subject: [PATCH 09/20] adding coding variants and gene structure files

---
 data/active_adapters.py                         |   1 +
 data/adapters/cellosaurus_ontology_adapter.py   |   3 +-
 data/adapters/dbSNFP_adapter.py                 |   4 +-
 .../gencode_gene_structure_adapter.py           |   3 +-
 data/adapters/oncotree_adapter.py               |   3 +-
 data/adapters/pQTL_adapter.py                   |   4 +-
 data/data_sources.yaml                          | 193 ++++++++++++++++--
 7 files changed, 182 insertions(+), 29 deletions(-)

diff --git a/data/active_adapters.py b/data/active_adapters.py
index eec4199a..3200dabb 100644
--- a/data/active_adapters.py
+++ b/data/active_adapters.py
@@ -133,6 +133,7 @@
     'gencode_transcripts': Gencode,
     'transcribed_to': Gencode,
     'transcribed_from': Gencode,
+    'gencode_structure': GencodeStructure,
     'eqtl': GtexEQtl,
     'eqtl_term': GtexEQtl,
     'AFGR_eqtl': AFGREQtl,
diff --git a/data/adapters/cellosaurus_ontology_adapter.py b/data/adapters/cellosaurus_ontology_adapter.py
index 981c65bc..1d587248 100644
--- a/data/adapters/cellosaurus_ontology_adapter.py
+++ b/data/adapters/cellosaurus_ontology_adapter.py
@@ -30,11 +30,10 @@ class Cellosaurus:
     # NCBI TaxID for Human and Mouse
     SPECIES_IDS = ['NCBI_TaxID:9606', 'NCBI_TaxID:10090']
 
-    def __init__(self, filepath, type='node', species_filter=True, dry_run=True, writer: Optional[Writer] = None, **kwargs):
+    def __init__(self, filepath, type='node', species_filter=True, writer: Optional[Writer] = None, **kwargs):
         self.filepath = filepath
         self.type = type
         self.species_filter = species_filter
-        self.dry_run = dry_run
         if type == 'node':
             self.dataset = 'ontology_term'
         else:
diff --git a/data/adapters/dbSNFP_adapter.py b/data/adapters/dbSNFP_adapter.py
index ad740b0d..663e4bd4 100644
--- a/data/adapters/dbSNFP_adapter.py
+++ b/data/adapters/dbSNFP_adapter.py
@@ -13,12 +13,10 @@ class DbSNFP:
 
     LABEL = 'dbSNFP_protein_variants'
 
-    def __init__(self, filepath=None, collection='coding_variants', dry_run=True, writer: Optional[Writer] = None, **kwargs):
-
+    def __init__(self, filepath=None, collection='coding_variants', writer: Optional[Writer] = None, **kwargs):
         self.filepath = filepath
         self.label = DbSNFP.LABEL
         self.dataset = self.label
-        self.dry_run = dry_run
         self.collection_name = collection
         self.writer = writer
diff --git a/data/adapters/gencode_gene_structure_adapter.py b/data/adapters/gencode_gene_structure_adapter.py
index e5e2dfe0..f2518ac0 100644
--- a/data/adapters/gencode_gene_structure_adapter.py
+++ b/data/adapters/gencode_gene_structure_adapter.py
@@ -35,12 +35,11 @@ class GencodeStructure:
         'mm_transcript_contains_mm_gene_structure'
     ]
 
-    def __init__(self, filepath=None, chr='all', label='gene_structure', dry_run=True, writer: Optional[Writer] = None, **kwargs):
+    def __init__(self, filepath=None, label='gene_structure', dry_run=True, writer: Optional[Writer] = None, **kwargs):
        if label not in GencodeStructure.ALLOWED_LABELS:
            raise ValueError('Invalid label. Allowed values: ' + ','.join(GencodeStructure.ALLOWED_LABELS))
 
        self.filepath = filepath
-       self.chr = chr
        self.label = label
        self.dry_run = dry_run
        self.source = 'GENCODE'
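A recurring change in this patch is dropping dry_run from adapter constructors: the injected Writer alone now decides where output goes. A contrived sketch of that pattern (ListWriter and ToyAdapter are hypothetical stand-ins, not the classes in adapters/writer.py):

class ListWriter:
    """Capture rows in memory instead of writing them to a bucket."""
    def __init__(self):
        self.rows = []

    def write(self, row):
        self.rows.append(row)

class ToyAdapter:
    def __init__(self, writer):
        self.writer = writer  # destination decided entirely by the caller

    def process_file(self):
        self.writer.write('{"example": true}')

writer = ListWriter()
ToyAdapter(writer).process_file()
print(writer.rows)  # a dry run is just a different Writer, not a flag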
Allowed values: ' + ','.join(GencodeStructure.ALLOWED_LABELS)) self.filepath = filepath - self.chr = chr self.label = label self.dry_run = dry_run self.source = 'GENCODE' diff --git a/data/adapters/oncotree_adapter.py b/data/adapters/oncotree_adapter.py index babf8487..d481f9b6 100644 --- a/data/adapters/oncotree_adapter.py +++ b/data/adapters/oncotree_adapter.py @@ -27,7 +27,7 @@ class Oncotree: SOURCE_URL = 'https://oncotree.mskcc.org/' API_URL = 'https://oncotree.mskcc.org:443/api/tumorTypes' - def __init__(self, type, dry_run=True, writer: Optional[Writer] = None, **kwargs): + def __init__(self, type, writer: Optional[Writer] = None, **kwargs): self.type = type if self.type == 'node': @@ -36,7 +36,6 @@ def __init__(self, type, dry_run=True, writer: Optional[Writer] = None, **kwargs else: self.dataset = 'ontology_relationship' self.label = 'ontology_relationship' - self.dry_run = dry_run self.writer = writer def process_file(self): diff --git a/data/adapters/pQTL_adapter.py b/data/adapters/pQTL_adapter.py index de127fd9..4094dbfe 100644 --- a/data/adapters/pQTL_adapter.py +++ b/data/adapters/pQTL_adapter.py @@ -16,12 +16,10 @@ class pQTL: SOURCE_URL = 'https://metabolomips.org/ukbbpgwas/' BIOLOGICAL_CONTEXT = 'blood plasma' - def __init__(self, filepath, label, dry_run=True, writer: Optional[Writer] = None, **kwargs): - + def __init__(self, filepath, label, writer: Optional[Writer] = None, **kwargs): self.filepath = filepath self.label = label self.dataset = label - self.dry_run = dry_run self.type = 'edge' self.writer = writer diff --git a/data/data_sources.yaml b/data/data_sources.yaml index 7a58a15a..e8e9f874 100644 --- a/data/data_sources.yaml +++ b/data/data_sources.yaml @@ -296,7 +296,7 @@ topld in linkage disequilibrium with: # Example: python3 data_loader.py --adapter favor --output-bucket igvf-catalog-parsed-collections --filepath ~/FAVORFULLDBdbSNP155/IGVFFI0843RDRY.vcf --output-bucket-key variants/favor_IGVFFI0843RDRY.vcf.jsonl sequence variant: collection: variants - command: pypy3 data_loader.py --adapter favor --output-bucket igvf-catalog-parsed-collections --filepath {datafile} --output-bucket-key {collection}/favor_{datafile}.jsonl + command: python3 data_loader.py --adapter favor --output-bucket igvf-catalog-parsed-collections --filepath {datafile} --output-bucket-key {collection}/favor_{datafile}.jsonl pypy3: false datafiles: - https://api.data.igvf.org/reference-files/IGVFFI0843RDRY/@@download/IGVFFI0843RDRY.vcf.gz # dbSNP155Nov2.chr1.mn.agds.vcf.gz @@ -366,17 +366,27 @@ transcribed from: datafiles: - https://api.data.igvf.org/reference-files/IGVFFI7217ZMJZ/@@download/IGVFFI7217ZMJZ.gtf.gz # Homo sapiens GRCh38 GENCODE v43 genome +# Example: pypy3 data_loader.py --adapter gencode_structure --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/IGVFFI7217ZMJZ.gtf --output-bucket-key genes_structure/gencode_IGVFFI7217ZMJZ.vcf.jsonl --label gene_structure gene structure: collection: genes_structure - datafiles: [] + params: + - label + command: pypy3 data_loader.py --adapter gencode_structure --output-bucket igvf-catalog-parsed-collections --filepath {datafile} --output-bucket-key genes_structure/gencode_{datafile}.jsonl --label gene_structure + datafiles: + - https://api.data.igvf.org/reference-files/IGVFFI7217ZMJZ/@@download/IGVFFI7217ZMJZ.gtf.gz # Homo sapiens GRCh38 GENCODE v43 genome mouse gene structure: collection: mm_genes_structure datafiles: [] +# Example: pypy3 data_loader.py --adapter gencode_structure --output-bucket 
igvf-catalog-parsed-collections --filepath ~/dataset/IGVFFI7217ZMJZ.gtf --output-bucket-key transcripts_genes_structure/gencode_IGVFFI7217ZMJZ.vcf.jsonl --label transcript_contains_gene_structure transcript contains gene structure: collection: transcripts_genes_structure - datafiles: [] + params: + - label + command: pypy3 data_loader.py --adapter gencode_structure --output-bucket igvf-catalog-parsed-collections --filepath {datafile} --output-bucket-key transcripts_genes_structure/gencode_{datafile}.jsonl --label transcript_contains_gene_structure + datafiles: + - https://api.data.igvf.org/reference-files/IGVFFI7217ZMJZ/@@download/IGVFFI7217ZMJZ.gtf.gz # Homo sapiens GRCh38 GENCODE v43 genome mouse transcript contains mouse gene structure: collection: mm_transcripts_mm_genes_structure @@ -390,13 +400,57 @@ encode variant to regulatory region: collection: variants_regulatory_regions datafiles: [] +# Example: python3 data_loader.py --adapter ontology --output-bucket igvf-catalog-parsed-collections --filepath vario.owl --ontology vario +# for cellosaurus: python3 data_loader.py --adapter cellosaurus_terms --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/cellosaurus.obo --type node --output-bucket-key ontology_terms/cellosaurus.jsonl +# for oncotree: python3 data_loader.py --adapter oncotree_terms --output-bucket igvf-catalog-parsed-collections --output-bucket-key ontology_terms/oncotree.jsonl --filepath '' ontology term: collection: ontology_terms - datafiles: [] - + params: + - filepath + - ontology + command: + - python3 data_loader.py --adapter ontology --output-bucket igvf-catalog-parsed-collections --filepath {datafile} --ontology {ontology_name} + - python3 data_loader.py --adapter cellosaurus_terms --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/cellosaurus.obo --type node --output-bucket-key ontology_terms/cellosaurus.jsonl + - python3 data_loader.py --adapter oncotree_terms --output-bucket igvf-catalog-parsed-collections --output-bucket-key ontology_terms/oncotree.jsonl --filepath '' + datafiles: + - https://api.data.igvf.org/reference-files/IGVFFI7985BGYI/@@download/IGVFFI7985BGYI.owl.gz # uberon + - https://api.data.igvf.org/reference-files/IGVFFI7115PAJX/@@download/IGVFFI7115PAJX.owl.gz # clo + - https://api.data.igvf.org/reference-files/IGVFFI0402TNDW/@@download/IGVFFI0402TNDW.owl.gz # cl + - https://api.data.igvf.org/reference-files/IGVFFI1298JRGV/@@download/IGVFFI1298JRGV.owl.gz # hpo + - https://api.data.igvf.org/reference-files/IGVFFI5120YZYR/@@download/IGVFFI5120YZYR.owl.gz # mondo + - https://api.data.igvf.org/reference-files/IGVFFI8306RHIV/@@download/IGVFFI8306RHIV.owl.gz # go + - https://api.data.igvf.org/reference-files/IGVFFI1837PEKQ/@@download/IGVFFI1837PEKQ.owl.gz # efo + - https://api.data.igvf.org/reference-files/IGVFFI6182DQZM/@@download/IGVFFI6182DQZM.owl.gz # chebi + - https://api.data.igvf.org/reference-files/IGVFFI4219OZTA/@@download/IGVFFI4219OZTA.owl.gz # vario + - https://api.data.igvf.org/reference-files/IGVFFI8953HXRQ/@@download/IGVFFI8953HXRQ.owl.gz # orphanet + - https://api.data.igvf.org/reference-files/IGVFFI2369NSDT/@@download/IGVFFI2369NSDT.owl.gz # ncit + - https://api.data.igvf.org/reference-files/IGVFFI4854HJDG/@@download/IGVFFI4854HJDG.obo.gz # cellosaurus - adapter: cellosaurus_terms + # Oncotree needs to be run separately. It does not have datafiles (data fetched from Oncotree API). 
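+# Assumption: the .owl.gz/.obo.gz datafiles above are decompressed before loading, since the example commands take plain .owl/.obo paths. A hypothetical invocation for the first datafile, inferring the --ontology value from its trailing comment:
+# gzip -d IGVFFI7985BGYI.owl.gz && python3 data_loader.py --adapter ontology --output-bucket igvf-catalog-parsed-collections --filepath IGVFFI7985BGYI.owl --ontology uberon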
+ +# Example: python3 data_loader.py --adapter ontology --output-bucket igvf-catalog-parsed-collections --filepath vario.owl --ontology vario +# for cellosaurus: python3 data_loader.py --adapter cellosaurus_relationships --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/cellosaurus.obo --type edge --output-bucket-key ontology_terms_ontology_terms/cellosaurus.jsonl +# for oncotree: python3 data_loader.py --adapter oncotree_relationships --output-bucket igvf-catalog-parsed-collections --output-bucket-key ontology_terms_ontology_terms/oncotree.jsonl --filepath '' ontology relationship: collection: ontology_terms_ontology_terms - datafiles: [] + params: + - filepath + - ontology + command: + - python3 data_loader.py --adapter ontology --output-bucket igvf-catalog-parsed-collections --filepath {datafile} --ontology {ontology_name} + - python3 data_loader.py --adapter cellosaurus_relationships --output-bucket igvf-catalog-parsed-collections --filepath {cellosaurus_datafile} --type edge --output-bucket-key ontology_terms_ontology_terms/cellosaurus.jsonl + - python3 data_loader.py --adapter oncotree_relationships --output-bucket igvf-catalog-parsed-collections --output-bucket-key ontology_terms_ontology_terms/oncotree.jsonl --filepath '' + datafiles: + - https://api.data.igvf.org/reference-files/IGVFFI7985BGYI/@@download/IGVFFI7985BGYI.owl.gz # uberon + - https://api.data.igvf.org/reference-files/IGVFFI7115PAJX/@@download/IGVFFI7115PAJX.owl.gz # clo + - https://api.data.igvf.org/reference-files/IGVFFI0402TNDW/@@download/IGVFFI0402TNDW.owl.gz # cl + - https://api.data.igvf.org/reference-files/IGVFFI1298JRGV/@@download/IGVFFI1298JRGV.owl.gz # hpo + - https://api.data.igvf.org/reference-files/IGVFFI5120YZYR/@@download/IGVFFI5120YZYR.owl.gz # mondo + - https://api.data.igvf.org/reference-files/IGVFFI8306RHIV/@@download/IGVFFI8306RHIV.owl.gz # go + - https://api.data.igvf.org/reference-files/IGVFFI1837PEKQ/@@download/IGVFFI1837PEKQ.owl.gz # efo + - https://api.data.igvf.org/reference-files/IGVFFI6182DQZM/@@download/IGVFFI6182DQZM.owl.gz # chebi + - https://api.data.igvf.org/reference-files/IGVFFI4219OZTA/@@download/IGVFFI4219OZTA.owl.gz # vario + - https://api.data.igvf.org/reference-files/IGVFFI8953HXRQ/@@download/IGVFFI8953HXRQ.owl.gz # orphanet + - https://api.data.igvf.org/reference-files/IGVFFI2369NSDT/@@download/IGVFFI2369NSDT.owl.gz # ncit # Examples: # - pypy3 data_loader.py --adapter UniProtKB_sprot --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/sprot.dat.gz --output-bucket-key proteins/sprot.jsonl --source "UniProtKB/Swiss-Prot" @@ -414,13 +468,29 @@ protein: - https://api.data.igvf.org/reference-files/IGVFFI7936CWLG/@@download/IGVFFI7936CWLG.dat.gz # uniprot_sprot_human.dat.gz - https://api.data.igvf.org/reference-files/IGVFFI6222RRHX/@@download/IGVFFI6222RRHX.dat.gz # uniprot_trembl_human.dat.gz +# Example: +# - pypy3 data_loader.py --adapter UniProtKB_Translates_To --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/sprot.dat.gz --output-bucket-key transcripts_proteins/sprot.jsonl --label UniProtKB_Translates_To --source 'UniProtKB/Swiss-Prot' +# - pypy3 data_loader.py --adapter UniProtKB_Translates_To --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/trembl.dat.gz --output-bucket-key transcripts_proteins/trembl.jsonl --label UniProtKB_Translates_To --source 'UniProtKB/TrEMBL' translates to: collection: transcripts_proteins - datafiles: [] + params: + - label + - source + pypy3: true + command: + - pypy3 
data_loader.py --adapter UniProtKB_Translates_To --output-bucket igvf-catalog-parsed-collections --filepath {datafile} --output-bucket-key transcripts_proteins/sprot.jsonl --label UniProtKB_Translates_To --source {source} + - pypy3 data_loader.py --adapter UniProtKB_Translates_To --output-bucket igvf-catalog-parsed-collections --filepath {datafile} --output-bucket-key transcripts_proteins/trembl.jsonl --label UniProtKB_Translates_To --source {source} + datafiles: + - https://api.data.igvf.org/reference-files/IGVFFI7936CWLG/@@download/IGVFFI7936CWLG.dat.gz # uniprot_sprot_human.dat.gz + - https://api.data.igvf.org/reference-files/IGVFFI6222RRHX/@@download/IGVFFI6222RRHX.dat.gz # uniprot_trembl_human.dat.gz +# Example: python3 data_loader.py --adapter pQTL --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/IGVFFI2053GDNI.csv --output-bucket-key variants_proteins/pqtls_IGVFFI2053GDNI.csv.jsonl variant to protein association: collection: variants_proteins + pypy3: false + command: python3 data_loader.py --adapter pQTL --output-bucket igvf-catalog-parsed-collections --filepath {datafile} --output-bucket-key variants_proteins/pqtls_{datafile}.jsonl + datafiles: + - https://api.data.igvf.org/reference-files/IGVFFI2053GDNI/@@download/IGVFFI2053GDNI.csv.gz # pQTL_UKB.csv.gz allele specific binding: collection: variants_proteins @@ -430,10 +500,6 @@ allele specific binding cell ontology: collection: variants_proteins_terms datafiles: [] -translation of: - collection: transcripts_proteins - datafiles: [] - variant to gene association: collection: variants_genes datafiles: [] @@ -634,14 +700,107 @@ human mouse genes orthology: collection: genes_mm_genes datafiles: [] +# Unreleased datafiles download: curl -L -u {accesskey}:{secretkey} {url} -o {datafile} +# example: curl -L -u MSJMF5G6:ey4ac7yummfuqqk6 https://api.data.igvf.org/reference-files/IGVFFI0989JYUQ/@@download/IGVFFI0989JYUQ.vcf.gz -o IGVFFI0989JYUQ.vcf.gz + +# Example: python3 data_loader.py --adapter coding_variants --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/chr1.vcf --output-bucket-key coding_variants/chr1.jsonl --collection coding_variants coding variant: collection: coding_variants - datafiles: [] + params: + - collection + command: python3 data_loader.py --adapter coding_variants --output-bucket igvf-catalog-parsed-collections --filepath {datafile} --output-bucket-key coding_variants/{datafile}.jsonl --collection coding_variants + pypy3: false + datafiles: + - https://api.data.igvf.org/reference-files/IGVFFI0989JYUQ/@@download/IGVFFI0989JYUQ.vcf.gz #dbNSFP4.5a_variant.chr1.gz + - https://api.data.igvf.org/reference-files/IGVFFI6512LPYF/@@download/IGVFFI6512LPYF.vcf.gz #dbNSFP4.5a_variant.chr2.gz + - https://api.data.igvf.org/reference-files/IGVFFI3122NLUA/@@download/IGVFFI3122NLUA.vcf.gz #dbNSFP4.5a_variant.chr3.gz + - https://api.data.igvf.org/reference-files/IGVFFI4160XHXX/@@download/IGVFFI4160XHXX.vcf.gz #dbNSFP4.5a_variant.chr4.gz + - https://api.data.igvf.org/reference-files/IGVFFI5904SNID/@@download/IGVFFI5904SNID.vcf.gz #dbNSFP4.5a_variant.chr5.gz + - https://api.data.igvf.org/reference-files/IGVFFI3624TSFU/@@download/IGVFFI3624TSFU.vcf.gz #dbNSFP4.5a_variant.chr6.gz + - https://api.data.igvf.org/reference-files/IGVFFI6814BELY/@@download/IGVFFI6814BELY.vcf.gz #dbNSFP4.5a_variant.chr7.gz + - https://api.data.igvf.org/reference-files/IGVFFI5538GQZA/@@download/IGVFFI5538GQZA.vcf.gz #dbNSFP4.5a_variant.chr8.gz + - 
https://api.data.igvf.org/reference-files/IGVFFI6749QJPP/@@download/IGVFFI6749QJPP.vcf.gz #dbNSFP4.5a_variant.chr9.gz + - https://api.data.igvf.org/reference-files/IGVFFI4817KSEU/@@download/IGVFFI4817KSEU.vcf.gz #dbNSFP4.5a_variant.chr10.gz + - https://api.data.igvf.org/reference-files/IGVFFI6143IXIZ/@@download/IGVFFI6143IXIZ.vcf.gz #dbNSFP4.5a_variant.chr11.gz + - https://api.data.igvf.org/reference-files/IGVFFI7373WKWM/@@download/IGVFFI7373WKWM.vcf.gz #dbNSFP4.5a_variant.chr12.gz + - https://api.data.igvf.org/reference-files/IGVFFI0535LVDM/@@download/IGVFFI0535LVDM.vcf.gz #dbNSFP4.5a_variant.chr13.gz + - https://api.data.igvf.org/reference-files/IGVFFI3218BMCQ/@@download/IGVFFI3218BMCQ.vcf.gz #dbNSFP4.5a_variant.chr14.gz + - https://api.data.igvf.org/reference-files/IGVFFI2289ELCY/@@download/IGVFFI2289ELCY.vcf.gz #dbNSFP4.5a_variant.chr15.gz + - https://api.data.igvf.org/reference-files/IGVFFI4257BIMS/@@download/IGVFFI4257BIMS.vcf.gz #dbNSFP4.5a_variant.chr16.gz + - https://api.data.igvf.org/reference-files/IGVFFI6803IGZS/@@download/IGVFFI6803IGZS.vcf.gz #dbNSFP4.5a_variant.chr17.gz + - https://api.data.igvf.org/reference-files/IGVFFI7393VCBY/@@download/IGVFFI7393VCBY.vcf.gz #dbNSFP4.5a_variant.chr18.gz + - https://api.data.igvf.org/reference-files/IGVFFI5456LGEZ/@@download/IGVFFI5456LGEZ.vcf.gz #dbNSFP4.5a_variant.chr19.gz + - https://api.data.igvf.org/reference-files/IGVFFI6213QGZU/@@download/IGVFFI6213QGZU.vcf.gz #dbNSFP4.5a_variant.chr20.gz + - https://api.data.igvf.org/reference-files/IGVFFI7897OALT/@@download/IGVFFI7897OALT.vcf.gz #dbNSFP4.5a_variant.chr21.gz + - https://api.data.igvf.org/reference-files/IGVFFI6918VTWL/@@download/IGVFFI6918VTWL.vcf.gz #dbNSFP4.5a_variant.chr22.gz + - https://api.data.igvf.org/reference-files/IGVFFI8283ZCLN/@@download/IGVFFI8283ZCLN.vcf.gz #dbNSFP4.5a_variant.chrX.gz + - https://api.data.igvf.org/reference-files/IGVFFI8477SBHR/@@download/IGVFFI8477SBHR.vcf.gz #dbNSFP4.5a_variant.chrY.gz + - https://api.data.igvf.org/reference-files/IGVFFI7595PQMP/@@download/IGVFFI7595PQMP.vcf.gz #dbNSFP4.5a_variant.chrM.gz + +# Example: python3 data_loader.py --adapter coding_variants --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/chr1.vcf --output-bucket-key coding_variants_proteins/chr1.jsonl --collection coding_variants_proteins coding variant to protein: collection: coding_variants_proteins - datafiles: [] - + params: + - collection + pypy3: false + command: python3 data_loader.py --adapter coding_variants --output-bucket igvf-catalog-parsed-collections --filepath {datafile} --output-bucket-key coding_variants_proteins/{datafile}.jsonl --collection coding_variants_proteins + datafiles: + - https://api.data.igvf.org/reference-files/IGVFFI0989JYUQ/@@download/IGVFFI0989JYUQ.vcf.gz #dbNSFP4.5a_variant.chr1.gz + - https://api.data.igvf.org/reference-files/IGVFFI6512LPYF/@@download/IGVFFI6512LPYF.vcf.gz #dbNSFP4.5a_variant.chr2.gz + - https://api.data.igvf.org/reference-files/IGVFFI3122NLUA/@@download/IGVFFI3122NLUA.vcf.gz #dbNSFP4.5a_variant.chr3.gz + - https://api.data.igvf.org/reference-files/IGVFFI4160XHXX/@@download/IGVFFI4160XHXX.vcf.gz #dbNSFP4.5a_variant.chr4.gz + - https://api.data.igvf.org/reference-files/IGVFFI5904SNID/@@download/IGVFFI5904SNID.vcf.gz #dbNSFP4.5a_variant.chr5.gz + - https://api.data.igvf.org/reference-files/IGVFFI3624TSFU/@@download/IGVFFI3624TSFU.vcf.gz #dbNSFP4.5a_variant.chr6.gz + - https://api.data.igvf.org/reference-files/IGVFFI6814BELY/@@download/IGVFFI6814BELY.vcf.gz #dbNSFP4.5a_variant.chr7.gz + - 
https://api.data.igvf.org/reference-files/IGVFFI5538GQZA/@@download/IGVFFI5538GQZA.vcf.gz #dbNSFP4.5a_variant.chr8.gz + - https://api.data.igvf.org/reference-files/IGVFFI6749QJPP/@@download/IGVFFI6749QJPP.vcf.gz #dbNSFP4.5a_variant.chr9.gz + - https://api.data.igvf.org/reference-files/IGVFFI4817KSEU/@@download/IGVFFI4817KSEU.vcf.gz #dbNSFP4.5a_variant.chr10.gz + - https://api.data.igvf.org/reference-files/IGVFFI6143IXIZ/@@download/IGVFFI6143IXIZ.vcf.gz #dbNSFP4.5a_variant.chr11.gz + - https://api.data.igvf.org/reference-files/IGVFFI7373WKWM/@@download/IGVFFI7373WKWM.vcf.gz #dbNSFP4.5a_variant.chr12.gz + - https://api.data.igvf.org/reference-files/IGVFFI0535LVDM/@@download/IGVFFI0535LVDM.vcf.gz #dbNSFP4.5a_variant.chr13.gz + - https://api.data.igvf.org/reference-files/IGVFFI3218BMCQ/@@download/IGVFFI3218BMCQ.vcf.gz #dbNSFP4.5a_variant.chr14.gz + - https://api.data.igvf.org/reference-files/IGVFFI2289ELCY/@@download/IGVFFI2289ELCY.vcf.gz #dbNSFP4.5a_variant.chr15.gz + - https://api.data.igvf.org/reference-files/IGVFFI4257BIMS/@@download/IGVFFI4257BIMS.vcf.gz #dbNSFP4.5a_variant.chr16.gz + - https://api.data.igvf.org/reference-files/IGVFFI6803IGZS/@@download/IGVFFI6803IGZS.vcf.gz #dbNSFP4.5a_variant.chr17.gz + - https://api.data.igvf.org/reference-files/IGVFFI7393VCBY/@@download/IGVFFI7393VCBY.vcf.gz #dbNSFP4.5a_variant.chr18.gz + - https://api.data.igvf.org/reference-files/IGVFFI5456LGEZ/@@download/IGVFFI5456LGEZ.vcf.gz #dbNSFP4.5a_variant.chr19.gz + - https://api.data.igvf.org/reference-files/IGVFFI6213QGZU/@@download/IGVFFI6213QGZU.vcf.gz #dbNSFP4.5a_variant.chr20.gz + - https://api.data.igvf.org/reference-files/IGVFFI7897OALT/@@download/IGVFFI7897OALT.vcf.gz #dbNSFP4.5a_variant.chr21.gz + - https://api.data.igvf.org/reference-files/IGVFFI6918VTWL/@@download/IGVFFI6918VTWL.vcf.gz #dbNSFP4.5a_variant.chr22.gz + - https://api.data.igvf.org/reference-files/IGVFFI8283ZCLN/@@download/IGVFFI8283ZCLN.vcf.gz #dbNSFP4.5a_variant.chrX.gz + - https://api.data.igvf.org/reference-files/IGVFFI8477SBHR/@@download/IGVFFI8477SBHR.vcf.gz #dbNSFP4.5a_variant.chrY.gz + - https://api.data.igvf.org/reference-files/IGVFFI7595PQMP/@@download/IGVFFI7595PQMP.vcf.gz #dbNSFP4.5a_variant.chrM.gz + +# Example: python3 data_loader.py --adapter coding_variants --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/chr1.vcf --output-bucket-key variants_coding_variants/chr1.jsonl --collection variants_coding_variants variants to coding variant: collection: variants_coding_variants - datafiles: [] + params: + - collection + pypy3: false + command: python3 data_loader.py --adapter coding_variants --output-bucket igvf-catalog-parsed-collections --filepath {datafile} --output-bucket-key variants_coding_variants/{datafile}.jsonl --collection variants_coding_variants + datafiles: + - https://api.data.igvf.org/reference-files/IGVFFI0989JYUQ/@@download/IGVFFI0989JYUQ.vcf.gz #dbNSFP4.5a_variant.chr1.gz + - https://api.data.igvf.org/reference-files/IGVFFI6512LPYF/@@download/IGVFFI6512LPYF.vcf.gz #dbNSFP4.5a_variant.chr2.gz + - https://api.data.igvf.org/reference-files/IGVFFI3122NLUA/@@download/IGVFFI3122NLUA.vcf.gz #dbNSFP4.5a_variant.chr3.gz + - https://api.data.igvf.org/reference-files/IGVFFI4160XHXX/@@download/IGVFFI4160XHXX.vcf.gz #dbNSFP4.5a_variant.chr4.gz + - https://api.data.igvf.org/reference-files/IGVFFI5904SNID/@@download/IGVFFI5904SNID.vcf.gz #dbNSFP4.5a_variant.chr5.gz + - https://api.data.igvf.org/reference-files/IGVFFI3624TSFU/@@download/IGVFFI3624TSFU.vcf.gz #dbNSFP4.5a_variant.chr6.gz + - 
https://api.data.igvf.org/reference-files/IGVFFI6814BELY/@@download/IGVFFI6814BELY.vcf.gz #dbNSFP4.5a_variant.chr7.gz + - https://api.data.igvf.org/reference-files/IGVFFI5538GQZA/@@download/IGVFFI5538GQZA.vcf.gz #dbNSFP4.5a_variant.chr8.gz + - https://api.data.igvf.org/reference-files/IGVFFI6749QJPP/@@download/IGVFFI6749QJPP.vcf.gz #dbNSFP4.5a_variant.chr9.gz + - https://api.data.igvf.org/reference-files/IGVFFI4817KSEU/@@download/IGVFFI4817KSEU.vcf.gz #dbNSFP4.5a_variant.chr10.gz + - https://api.data.igvf.org/reference-files/IGVFFI6143IXIZ/@@download/IGVFFI6143IXIZ.vcf.gz #dbNSFP4.5a_variant.chr11.gz + - https://api.data.igvf.org/reference-files/IGVFFI7373WKWM/@@download/IGVFFI7373WKWM.vcf.gz #dbNSFP4.5a_variant.chr12.gz + - https://api.data.igvf.org/reference-files/IGVFFI0535LVDM/@@download/IGVFFI0535LVDM.vcf.gz #dbNSFP4.5a_variant.chr13.gz + - https://api.data.igvf.org/reference-files/IGVFFI3218BMCQ/@@download/IGVFFI3218BMCQ.vcf.gz #dbNSFP4.5a_variant.chr14.gz + - https://api.data.igvf.org/reference-files/IGVFFI2289ELCY/@@download/IGVFFI2289ELCY.vcf.gz #dbNSFP4.5a_variant.chr15.gz + - https://api.data.igvf.org/reference-files/IGVFFI4257BIMS/@@download/IGVFFI4257BIMS.vcf.gz #dbNSFP4.5a_variant.chr16.gz + - https://api.data.igvf.org/reference-files/IGVFFI6803IGZS/@@download/IGVFFI6803IGZS.vcf.gz #dbNSFP4.5a_variant.chr17.gz + - https://api.data.igvf.org/reference-files/IGVFFI7393VCBY/@@download/IGVFFI7393VCBY.vcf.gz #dbNSFP4.5a_variant.chr18.gz + - https://api.data.igvf.org/reference-files/IGVFFI5456LGEZ/@@download/IGVFFI5456LGEZ.vcf.gz #dbNSFP4.5a_variant.chr19.gz + - https://api.data.igvf.org/reference-files/IGVFFI6213QGZU/@@download/IGVFFI6213QGZU.vcf.gz #dbNSFP4.5a_variant.chr20.gz + - https://api.data.igvf.org/reference-files/IGVFFI7897OALT/@@download/IGVFFI7897OALT.vcf.gz #dbNSFP4.5a_variant.chr21.gz + - https://api.data.igvf.org/reference-files/IGVFFI6918VTWL/@@download/IGVFFI6918VTWL.vcf.gz #dbNSFP4.5a_variant.chr22.gz + - https://api.data.igvf.org/reference-files/IGVFFI8283ZCLN/@@download/IGVFFI8283ZCLN.vcf.gz #dbNSFP4.5a_variant.chrX.gz + - https://api.data.igvf.org/reference-files/IGVFFI8477SBHR/@@download/IGVFFI8477SBHR.vcf.gz #dbNSFP4.5a_variant.chrY.gz + - https://api.data.igvf.org/reference-files/IGVFFI7595PQMP/@@download/IGVFFI7595PQMP.vcf.gz #dbNSFP4.5a_variant.chrM.gz From 74bd07ed7cc520662b94835c808a241f6c5b4795 Mon Sep 17 00:00:00 2001 From: Pedro Assis Date: Mon, 4 Nov 2024 15:07:36 -0300 Subject: [PATCH 10/20] adding updated chr X and chrY for variants --- data/data_sources.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/data/data_sources.yaml b/data/data_sources.yaml index e8e9f874..44ec9519 100644 --- a/data/data_sources.yaml +++ b/data/data_sources.yaml @@ -321,8 +321,8 @@ sequence variant: - https://api.data.igvf.org/reference-files/IGVFFI3274JYIH/@@download/IGVFFI3274JYIH.vcf.gz # dbSNP155Nov2.chr20.mn.agds.vcf.gz - https://api.data.igvf.org/reference-files/IGVFFI2738FFUW/@@download/IGVFFI2738FFUW.vcf.gz # dbSNP155Nov2.chr21.mn.agds.vcf.gz - https://api.data.igvf.org/reference-files/IGVFFI8213FCFM/@@download/IGVFFI8213FCFM.vcf.gz # dbSNP155Nov2.chr22.mn.agds.vcf.gz - - need updated chrX vcf - - need updated chrY vcf + - https://api.data.igvf.org/reference-files/IGVFFI8500BHPQ/@@download/IGVFFI8500BHPQ.vcf.gz # dbSNP155Nov2.chrX.mn.agds.vcf.gz + - https://api.data.igvf.org/reference-files/IGVFFI6699ZOCP/@@download/IGVFFI6699ZOCP.vcf.gz # dbSNP155Nov2.chrY.mn.agds.vcf.gz - 
https://api.data.igvf.org/reference-files/IGVFFI2231RETG/@@download/IGVFFI2231RETG.vcf.gz # Y2AVECombinedVariants.rmna.mn.genome.v6.vcf.gz # Example: pypy3 data_loader.py --adapter gencode_genes --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/IGVFFI7217ZMJZ.gtf --output-bucket-key genes/genes_IGVFFI7217ZMJZ.vcf.jsonl --label gencode_gene --gene-alias-file-path ~/dataset/IGVFFI7344CFHT.tsv.gz From 0f70c3d1043ccfe7cd4d1ae78ad3781d4985ca4d Mon Sep 17 00:00:00 2001 From: Pedro Assis Date: Thu, 7 Nov 2024 16:28:18 -1000 Subject: [PATCH 11/20] adding ASB files --- data/data_sources.yaml | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/data/data_sources.yaml b/data/data_sources.yaml index 44ec9519..159604e5 100644 --- a/data/data_sources.yaml +++ b/data/data_sources.yaml @@ -1,7 +1,3 @@ -# Data fetched from: -# https://data.igvf.org/multireport/?type=File&content_type=&field=%40id&field=content_type&field=href&field=submitted_file_name -# Example: https://data.igvf.org/multireport/?type=File&content_type=variants_variants&field=%40id&field=content_type&field=href&field=submitted_file_name - # Example: pypy3 data_loader.py --adapter topld --output-bucket igvf-catalog-parsed-collections --filepath ~/topld/afr/AFR_chr1_no_filter_0.2_1000000_LD.csv --output-bucket-key variants_variants/topld_afr_chr1.jsonl --chr chr1 --annotation-filepath ~/topld/afr/AFR_chr1_no_filter_0.2_1000000_info_annotation.csv --ancestry AFR topld in linkage disequilibrium with: collection: variants_variants @@ -488,13 +488,23 @@ variant to protein association: datafiles: - https://api.data.igvf.org/reference-files/IGVFFI2053GDNI/@@download/IGVFFI2053GDNI.csv.gz # pQTL_UKB.csv.gz +# Example: you must 'gzip -d' + 'tar -xf' + 'unzip' the main datafile and use the extracted folder path (./release/data) in the adapter +# python3 data_loader.py --adapter allele_specific_binding --label asb --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/release/data/ --output-bucket-key variants_proteins/asb_IGVFFI5943XCOS.jsonl allele specific binding: collection: variants_proteins - datafiles: [] + pypy3: false + command: python3 data_loader.py --adapter allele_specific_binding --label asb --output-bucket igvf-catalog-parsed-collections --filepath {datafile_path} --output-bucket-key variants_proteins/asb_IGVFFI5943XCOS.jsonl + datafiles: + - https://api.data.igvf.org/reference-files/IGVFFI5943XCOS/@@download/IGVFFI5943XCOS.tar.gz # adastra.cltf.bill_cipher.zip.tar.gz +# Example: you must 'gzip -d' + 'tar -xf' + 'unzip' the main datafile and use the extracted folder path (./release/data) in the adapter +# python3 data_loader.py --adapter allele_specific_binding --label asb_cell_ontology --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/release/data/ --output-bucket-key variants_proteins_terms/asb_terms_IGVFFI5943XCOS.jsonl allele specific binding cell ontology: collection: variants_proteins_terms datafiles: [] From 42cd17517864e7b2e381706cb940e75a035974b9 Mon Sep 17 00:00:00 2001 From: Pedro Assis 
Date: Fri, 15 Nov 2024 08:09:23 -1000 Subject: [PATCH 12/20] adding several loading files --- data/adapters/biogrid_gene_gene_adapter.py | 4 +- data/adapters/coxpresdb_adapter.py | 60 ++-- data/adapters/depmap_adapter.py | 12 +- data/adapters/gaf_adapter.py | 6 +- .../adapters/mouse_genomes_project_adapter.py | 4 +- data/adapters/proteins_interaction_adapter.py | 4 +- data/data_sources.yaml | 278 ++++++++++++++---- data/schema-config.yaml | 2 + 8 files changed, 270 insertions(+), 100 deletions(-) diff --git a/data/adapters/biogrid_gene_gene_adapter.py b/data/adapters/biogrid_gene_gene_adapter.py index f8a5f15c..8f2b383d 100644 --- a/data/adapters/biogrid_gene_gene_adapter.py +++ b/data/adapters/biogrid_gene_gene_adapter.py @@ -21,10 +21,8 @@ class GeneGeneBiogrid: INTERACTION_MI_CODE_PATH = './data_loading_support_files/Biogrid_gene_gene/psi-mi.obo' - def __init__(self, filepath, label, dry_run=True, writer: Optional[Writer] = None, **kwargs): + def __init__(self, filepath, dry_run=True, writer: Optional[Writer] = None, **kwargs): self.filepath = filepath - self.dataset = label - self.label = label self.dry_run = dry_run self.type = 'edge' self.writer = writer diff --git a/data/adapters/coxpresdb_adapter.py b/data/adapters/coxpresdb_adapter.py index 8aa638c7..f14cb942 100644 --- a/data/adapters/coxpresdb_adapter.py +++ b/data/adapters/coxpresdb_adapter.py @@ -1,4 +1,4 @@ - +import os import pickle import json from typing import Optional @@ -14,7 +14,7 @@ class Coxpresdb: def __init__(self, filepath, dry_run=True, writer: Optional[Writer] = None, **kwargs): - self.file_path = filepath + self.filepath = filepath self.dataset = 'coxpresdb' self.label = 'coxpresdb' self.source = 'CoXPresdb' @@ -32,30 +32,32 @@ def process_file(self): # every gene has entrez gene id in gene_info file, every gene has ensembl id or hgcn id if available with open('./data_loading_support_files/entrez_to_ensembl.pkl', 'rb') as f: entrez_ensembl_dict = pickle.load(f) - entrez_id = self.file_path.split('/')[-1] - ensembl_id = entrez_ensembl_dict.get(entrez_id) - if ensembl_id: - with open(self.file_path, 'r') as input: - for line in input: - (co_entrez_id, score) = line.strip().split() - co_ensembl_id = entrez_ensembl_dict.get(co_entrez_id) - if co_ensembl_id: - # only keep those with logit_scores (i.e. z-scores) absolute value >= 3 - if abs(float(score)) >= 3: - _id = entrez_id + '_' + co_entrez_id + '_' + self.label - _source = 'genes/' + ensembl_id - _target = 'genes/' + co_ensembl_id - _props = { - '_key': _id, - '_from': _source, - '_to': _target, - 'z_score': score, # confirmed from their paper that logit_score is essentailly a z_score - 'source': self.source, - 'source_url': self.source_url, - 'name': 'coexpressed with', - 'inverse_name': 'coexpressed with', - 'associated process': 'ontology_terms/GO_0010467' - } - self.writer.write(json.dumps(_props)) - self.writer.write('\n') - self.writer.close() + + for filename in os.listdir(self.filepath): + entrez_id = filename.split('/')[-1] + ensembl_id = entrez_ensembl_dict.get(entrez_id) + if ensembl_id: + with open(self.filepath + '/' + filename, 'r') as input: + for line in input: + (co_entrez_id, score) = line.strip().split() + co_ensembl_id = entrez_ensembl_dict.get(co_entrez_id) + if co_ensembl_id: + # only keep those with logit_scores (i.e. 
z-scores) absolute value >= 3 + if abs(float(score)) >= 3: + _id = entrez_id + '_' + co_entrez_id + '_' + self.label + _source = 'genes/' + ensembl_id + _target = 'genes/' + co_ensembl_id + _props = { + '_key': _id, + '_from': _source, + '_to': _target, + 'z_score': score, # confirmed from their paper that logit_score is essentially a z_score + 'source': self.source, + 'source_url': self.source_url, + 'name': 'coexpressed with', + 'inverse_name': 'coexpressed with', + 'associated process': 'ontology_terms/GO_0010467' + } + self.writer.write(json.dumps(_props)) + self.writer.write('\n') + self.writer.close() diff --git a/data/adapters/depmap_adapter.py b/data/adapters/depmap_adapter.py index 61d81b8d..c4a7676b 100644 --- a/data/adapters/depmap_adapter.py +++ b/data/adapters/depmap_adapter.py @@ -35,12 +35,8 @@ class DepMap: CELL_ONTOLOGY_ID_MAPPING_PATH = './data_loading_support_files/DepMap/DepMap_model.csv' CUTOFF = 0.5 # only load genes with dependency scores greater or equal to 0.5 for each cell - def __init__(self, filepath, type, label, dry_run=True, writer: Optional[Writer] = None, **kwargs): + def __init__(self, filepath, writer: Optional[Writer] = None, **kwargs): self.filepath = filepath - self.dataset = label - self.label = label - self.type = type - self.dry_run = dry_run self.writer = writer def process_file(self): @@ -56,7 +52,8 @@ def process_file(self): for column_index, model_id in enumerate(model_ids): model_ids_column_mapping[column_index] = model_id # check CVCL id mapping for all models once first - cell_ontology_id = self.cell_ontology_id_mapping[model_id]['cell_ontology_id'] + cell_ontology_id = self.cell_ontology_id_mapping[model_id].get( + 'cell_ontology_id') if not cell_ontology_id: print('Cell ontology unavailable for model id ' + model_id) @@ -74,7 +71,8 @@ def process_file(self): # only load gene-cell pairs with values >= cutoff (0.5) elif float(value) >= DepMap.CUTOFF: gene_model_id = model_ids_column_mapping[value_index] - cell_ontology_id = self.cell_ontology_id_mapping[gene_model_id]['cell_ontology_id'] + cell_ontology_id = self.cell_ontology_id_mapping[gene_model_id].get( + 'cell_ontology_id') if not cell_ontology_id: # no CVCL id provided for this model continue diff --git a/data/adapters/gaf_adapter.py b/data/adapters/gaf_adapter.py index 001d345b..ed1b0297 100644 --- a/data/adapters/gaf_adapter.py +++ b/data/adapters/gaf_adapter.py @@ -44,15 +44,17 @@ class GAF: DATASET = 'gaf' + + # source: https://ftp.ebi.ac.uk/pub/databases/RNAcentral/current_release/id_mapping/database_mappings/ensembl_gencode.tsv RNACENTRAL_ID_MAPPING_PATH = './samples/rnacentral_ensembl_gencode.tsv.gz' + # generated from current proteins collection in the Catalog MOUSE_MGI_TO_UNIPROT_PATH = './data_loading_support_files/mgi_to_ensembl.pkl' SOURCES = { 'human': 'http://geneontology.org/gene-associations/goa_human.gaf.gz', 'human_isoform': 'http://geneontology.org/gene-associations/goa_human_isoform.gaf.gz', 'mouse': 'https://current.geneontology.org/annotations/mgi.gaf.gz', - 'rna': 'http://geneontology.org/gene-associations/goa_human_rna.gaf.gz', - 'rnacentral': 'https://ftp.ebi.ac.uk/pub/databases/RNAcentral/current_release/id_mapping/database_mappings/ensembl_gencode.tsv' + 'rna': 'http://geneontology.org/gene-associations/goa_human_rna.gaf.gz' } def __init__(self, filepath, gaf_type='human', dry_run=True, writer: Optional[Writer] = None, **kwargs): diff --git a/data/adapters/mouse_genomes_project_adapter.py b/data/adapters/mouse_genomes_project_adapter.py index 
34dbb68a..062a3e19 100644 --- a/data/adapters/mouse_genomes_project_adapter.py +++ b/data/adapters/mouse_genomes_project_adapter.py @@ -42,6 +42,8 @@ class MouseGenomesProjectAdapter: STRAINS = ['129S1_SvImJ', 'A_J', 'CAST_EiJ', 'NOD_ShiLtJ', 'NZO_HlLtJ', 'PWK_PhJ', 'WSB_EiJ'] + WRITE_THRESHOLD = 1000000 + def __init__(self, filepath=None, dry_run=True, writer: Optional[Writer] = None, **kwargs): self.filepath = filepath self.label = self.LABEL @@ -105,7 +107,7 @@ def process_file(self): to_json = { '_key': id, 'chr': 'chr' + data_line[0], - 'pos:long': int(data_line[1]) - 1, + 'pos': int(data_line[1]) - 1, 'rsid': [] if data_line[2] == '.' else [data_line[2]], 'ref': data_line[3], 'alt': alt, diff --git a/data/adapters/proteins_interaction_adapter.py b/data/adapters/proteins_interaction_adapter.py index 0d28f0cd..bc2754eb 100644 --- a/data/adapters/proteins_interaction_adapter.py +++ b/data/adapters/proteins_interaction_adapter.py @@ -15,10 +15,8 @@ class ProteinsInteraction: INTERACTION_MI_CODE_PATH = './data_loading_support_files/Biogrid_gene_gene/psi-mi.obo' - def __init__(self, filepath, label, dry_run=True, writer: Optional[Writer] = None, **kwargs): + def __init__(self, filepath, dry_run=True, writer: Optional[Writer] = None, **kwargs): self.filepath = filepath - self.dataset = label - self.label = label self.dry_run = dry_run self.type = 'edge' self.writer = writer diff --git a/data/data_sources.yaml b/data/data_sources.yaml index 159604e5..ece114c9 100644 --- a/data/data_sources.yaml +++ b/data/data_sources.yaml @@ -371,10 +371,6 @@ gene structure: datafiles: - https://api.data.igvf.org/reference-files/IGVFFI7217ZMJZ/@@download/IGVFFI7217ZMJZ.gtf.gz # Homo sapiens GRCh38 GENCODE v43 genome -mouse gene structure: - collection: mm_genes_structure - datafiles: [] - # Example: pypy3 data_loader.py --adapter gencode_structure --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/IGVFFI7217ZMJZ.gtf --output-bucket-key transcripts_genes_structure/gencode_IGVFFI7217ZMJZ.vcf.jsonl --label transcript_contains_gene_structure transcript contains gene structure: collection: transcripts_genes_structure @@ -388,14 +384,6 @@ mouse transcript contains mouse gene structure: collection: mm_transcripts_mm_genes_structure datafiles: [] -AFGR variant to regulatory region: - collection: variants_regulatory_regions - datafiles: [] - -encode variant to regulatory region: - collection: variants_regulatory_regions - datafiles: [] - # Example: python3 data_loader.py --adapter ontology --output-bucket igvf-catalog-parsed-collections --filepath vario.owl --ontology vario # for cellosaurus: python3 data_loader.py --adapter cellosaurus_terms --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/cellosaurus.obo --type node --output-bucket-key ontology_terms/cellosaurus.jsonl # for oncotree: python3 data_loader.py --adapter oncotree_terms --output-bucket igvf-catalog-parsed-collections --output-bucket-key ontology_terms/oncotree.jsonl --filepath '' @@ -502,45 +490,73 @@ allele specific binding: allele specific binding cell ontology: collection: variants_proteins_terms pypy3: false - command: python3 data_loader.py --adapter allele_specific_binding --label asb_cell_ontology --output-bucket igvf-catalog-parsed-collections --filepath {datafile_path} --output-bucket-key variants_proteins_terms/asb_terms_IGVFFI5943XCOS.jsonl + command: python3 data_loader.py --adapter allele_specific_binding --label asb_cell_ontology --output-bucket igvf-catalog-parsed-collections --filepath 
{datafiles_folder_path} --output-bucket-key variants_proteins_terms/asb_terms_IGVFFI5943XCOS.jsonl datafiles: - https://api.data.igvf.org/reference-files/IGVFFI5943XCOS/@@download/IGVFFI5943XCOS.tar.gz # adastra.cltf.bill_cipher.zip.tar.gz +# Example: python3 data_loader.py --adapter gtex_splice_qtl --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/GTEx_Analysis_v8_sQTL/ --output-bucket-key variants_genes/gtex_sqtls.jsonl --label GTEx_splice_QTL gtex splice variant to gene association: collection: variants_genes + pypy3: false + command: python3 data_loader.py --adapter gtex_splice_qtl --output-bucket igvf-catalog-parsed-collections --filepath {datafiles_folder_path} --output-bucket-key variants_genes/gtex_sqtls.jsonl --label GTEx_splice_QTL + datafiles: + - https://api.data.igvf.org/reference-files/IGVFFI0582HBMB/@@download/IGVFFI0582HBMB.tar.gz # GTEx_Analysis_v8_sQTL.tar.gz +# Example: python3 data_loader.py --adapter gtex_splice_qtl --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/GTEx_Analysis_v8_sQTL/ --output-bucket-key variants_genes_terms/gtex_sqtls_terms.jsonl --label GTEx_splice_QTL_term gtex splice variant to gene association to ontology term: collection: variants_genes_terms + pypy3: false + command: python3 data_loader.py --adapter gtex_splice_qtl --output-bucket igvf-catalog-parsed-collections --filepath {datafiles_folder_path} --output-bucket-key variants_genes_terms/gtex_sqtls_terms.jsonl --label GTEx_splice_QTL_term + datafiles: + - https://api.data.igvf.org/reference-files/IGVFFI0582HBMB/@@download/IGVFFI0582HBMB.tar.gz # GTEx_Analysis_v8_sQTL.tar.gz +# Example: python3 data_loader.py --adapter eqtl --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/GTEx_Analysis_v8_eQTL/ --output-bucket-key variants_genes/gtex_eqtls.jsonl --label GTEx_eqtl gtex variant to gene expression association: collection: variants_genes + pypy3: false + command: python3 data_loader.py --adapter eqtl --output-bucket igvf-catalog-parsed-collections --filepath {datafiles_folder_path} --output-bucket-key variants_genes/gtex_eqtls.jsonl --label GTEx_eqtl + datafiles: + - https://api.data.igvf.org/reference-files/IGVFFI5653UGOL/@@download/IGVFFI5653UGOL.tar.gz # GTEx_Analysis_v8_eQTL.tar.gz +# Example: python3 data_loader.py --adapter eqtl --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/GTEx_Analysis_v8_eQTL/ --output-bucket-key variants_genes_terms/gtex_eqtls_terms.jsonl --label GTEx_eqtl_term gtex variant to gene expression association to ontology term: collection: variants_genes_terms + pypy3: false + command: python3 data_loader.py --adapter eqtl --output-bucket igvf-catalog-parsed-collections --filepath {datafiles_folder_path} --output-bucket-key variants_genes_terms/gtex_eqtls_terms.jsonl --label GTEx_eqtl_term + datafiles: + - https://api.data.igvf.org/reference-files/IGVFFI5653UGOL/@@download/IGVFFI5653UGOL.tar.gz # GTEx_Analysis_v8_eQTL.tar.gz
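+# Assumption: the GTEx tarballs above are extracted first and {datafiles_folder_path} points at the resulting directory, as the example --filepath values suggest, e.g.:
+# tar -xzf IGVFFI5653UGOL.tar.gz && python3 data_loader.py --adapter eqtl --output-bucket igvf-catalog-parsed-collections --filepath ./GTEx_Analysis_v8_eQTL/ --output-bucket-key variants_genes/gtex_eqtls.jsonl --label GTEx_eqtl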
+# Example: python3 data_loader.py --adapter AFGR_sqtl --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/IGVFFI4560RRRS.txt.gz --output-bucket-key variants_genes/afgr_sqtls.jsonl --label AFGR_sqtl AFGR splice variant to gene association: collection: variants_genes + pypy3: false + command: python3 data_loader.py --adapter AFGR_sqtl --output-bucket igvf-catalog-parsed-collections --filepath {datafile} --output-bucket-key variants_genes/afgr_sqtls.jsonl --label AFGR_sqtl + datafiles: + - https://api.data.igvf.org/reference-files/IGVFFI4560RRRS/@@download/IGVFFI4560RRRS.txt.gz # sorted.all.AFR.Meta.sQTL.genPC.nominal.maf05.mvmeta.fe.pvalueCut.txt.gz +# Example: python3 data_loader.py --adapter AFGR_sqtl --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/IGVFFI4560RRRS.txt.gz --output-bucket-key variants_genes_terms/afgr_sqtls_terms.jsonl --label AFGR_sqtl_term AFGR splice variant to gene association to ontology term: collection: variants_genes_terms + pypy3: false + command: python3 data_loader.py --adapter AFGR_sqtl --output-bucket igvf-catalog-parsed-collections --filepath {datafile} --output-bucket-key variants_genes_terms/afgr_sqtls_terms.jsonl --label AFGR_sqtl_term + datafiles: + - https://api.data.igvf.org/reference-files/IGVFFI4560RRRS/@@download/IGVFFI4560RRRS.txt.gz # sorted.all.AFR.Meta.sQTL.genPC.nominal.maf05.mvmeta.fe.pvalueCut.txt.gz +# Example: python3 data_loader.py --adapter AFGR_eqtl --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/IGVFFI8011XYOB.txt.gz --output-bucket-key variants_genes/afgr_eqtls.jsonl --label AFGR_eqtl AFGR variant to gene expression association: collection: variants_genes + pypy3: false + command: python3 data_loader.py --adapter AFGR_eqtl --output-bucket igvf-catalog-parsed-collections --filepath {datafile} --output-bucket-key variants_genes/afgr_eqtls.jsonl --label AFGR_eqtl + datafiles: + - https://api.data.igvf.org/reference-files/IGVFFI8011XYOB/@@download/IGVFFI8011XYOB.txt.gz # sorted.dist.hwe.af.AFR_META.eQTL.nominal.hg38a.pvalueCut.txt.gz +# Example: python3 data_loader.py --adapter AFGR_eqtl --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/IGVFFI8011XYOB.txt.gz --output-bucket-key variants_genes_terms/afgr_eqtls_terms.jsonl --label AFGR_eqtl_term AFGR variant to gene expression association to ontology term: collection: variants_genes_terms + pypy3: false + command: python3 data_loader.py --adapter AFGR_eqtl --output-bucket igvf-catalog-parsed-collections --filepath {datafile} --output-bucket-key variants_genes_terms/afgr_eqtls_terms.jsonl --label AFGR_eqtl_term + datafiles: + - https://api.data.igvf.org/reference-files/IGVFFI8011XYOB/@@download/IGVFFI8011XYOB.txt.gz # sorted.dist.hwe.af.AFR_META.eQTL.nominal.hg38a.pvalueCut.txt.gz regulatory element to gene expression association: collection: regulatory_regions_genes @@ -574,137 +590,289 @@ regulatory element to biosample: collection: regulatory_regions_biosamples datafiles: [] +AFGR variant to regulatory region: + collection: variants_regulatory_regions + datafiles: [] + +encode variant to regulatory region: + collection: variants_regulatory_regions + datafiles: [] + +human mouse regulatory region mapping: + collection: regulatory_regions_mm_regulatory_regions + datafiles: [] + +regulatory region mouse: + collection: mm_regulatory_regions + datafiles: [] + +# Example: +# 1) python3 data_loader.py --adapter gaf --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/IGVFFI1490WZCV.gaf.gz --output-bucket-key gene_products_terms/go_annotations.jsonl --gaf-type human +# 2) python3 data_loader.py --adapter gaf --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/IGVFFI8870ZOTO.gaf.gz --output-bucket-key gene_products_terms/gaf_human_isoform.jsonl --gaf-type human_isoform +# 3) python3 data_loader.py --adapter gaf --output-bucket igvf-catalog-parsed-collections 
--filepath ~/dataset/IGVFFI6501YXMX.gaf.gz --output-bucket-key gene_products_terms/gaf_human_rna.jsonl --gaf-type rna +# 4) python3 data_loader.py --adapter gaf --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/IGVFFI9807JOKT.gaf.gz --output-bucket-key gene_products_terms/gaf_mouse.jsonl --gaf-type mouse gaf: collection: gene_products_terms - datafiles: [] + params: + - gaf-type + pypy3: false + command: python3 data_loader.py --adapter gaf --output-bucket igvf-catalog-parsed-collections --filepath {datafile} --output-bucket-key gene_products_terms/gaf_{gaf_type}.jsonl --gaf-type {gaf_type} + datafiles: + - https://api.data.igvf.org/reference-files/IGVFFI1490WZCV/@@download/IGVFFI1490WZCV.gaf.gz # goa_human.gaf.gz + - https://api.data.igvf.org/reference-files/IGVFFI8870ZOTO/@@download/IGVFFI8870ZOTO.gaf.gz # goa_human_isoform.gaf.gz + - https://api.data.igvf.org/reference-files/IGVFFI6501YXMX/@@download/IGVFFI6501YXMX.gaf.gz # goa_human_rna.gaf.gz + - https://api.data.igvf.org/reference-files/IGVFFI9807JOKT/@@download/IGVFFI9807JOKT.gaf.gz # mgi.gaf.gz +# Example: python3 data_loader.py --adapter motif --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/pwm/ --output-bucket-key motifs/motifs_IGVFFI9678CVIS.jsonl --label motif motif: collection: motifs + params: + - label + pypy3: false + command: python3 data_loader.py --adapter motif --output-bucket igvf-catalog-parsed-collections --filepath {datafile_path} --output-bucket-key motifs/motifs_IGVFFI9678CVIS.jsonl --label {label} datafiles: - https://api.data.igvf.org/reference-files/IGVFFI9678CVIS/@@download/IGVFFI9678CVIS.tar.gz # HOCOMOCOv11_core_pwm_HUMAN_mono.tar.gz +# Example: python3 data_loader.py --adapter motif --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/pwm/ --output-bucket-key motifs_proteins/motifs_proteins_IGVFFI9678CVIS.jsonl --label motif_protein_link motif to protein: collection: motifs_proteins + params: + - label + command: python3 data_loader.py --adapter motif --output-bucket igvf-catalog-parsed-collections --filepath {datafile_path} --output-bucket-key motifs_proteins/motifs_proteins_IGVFFI9678CVIS.jsonl --label {label} + pypy3: false datafiles: - https://api.data.igvf.org/reference-files/IGVFFI0050HPJU/@@download/IGVFFI0050HPJU.tsv.gz # HOCOMOCOv11_core_annotation_HUMAN_mono.tsv.gz
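+# Assumption: {datafile_path} in the motif command is the extracted archive — the HOCOMOCO pwm tarball is presumably unpacked first (e.g. tar -xzf IGVFFI9678CVIS.tar.gz) with --filepath pointing at the resulting pwm/ directory, as in the example; the motif-to-protein run reads the annotation tsv instead.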
+# Example: pypy3 data_loader.py --adapter protein_protein --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/merged_PPI.UniProt.csv --output-bucket-key proteins_proteins/proteins_proteins_human_IGVFFI4317VDGK.jsonl +# ATTENTION: files must be renamed to original filenames as metadata gets read from them protein to protein interaction: collection: proteins_proteins - datafiles: [] + command: pypy3 data_loader.py --adapter protein_protein --output-bucket igvf-catalog-parsed-collections --filepath {original_datafile} --output-bucket-key proteins_proteins/proteins_proteins_human_{datafile_name}.jsonl + pypy3: true + datafiles: + - https://api.data.igvf.org/reference-files/IGVFFI4317VDGK/@@download/IGVFFI4317VDGK.csv.gz # merged_PPI.UniProt.csv.gz + - https://api.data.igvf.org/reference-files/IGVFFI1165YVBA/@@download/IGVFFI1165YVBA.csv.gz # merged_PPI_mouse.UniProt.csv.gz +# Example: pypy3 data_loader.py --adapter gene_gene_biogrid --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/merged_PPI_mouse.UniProt.csv --output-bucket-key mm_genes_mm_genes/mm_genes_mm_genes_IGVFFI1165YVBA.jsonl +# ATTENTION: files must be renamed to original filenames as metadata gets read from them mouse gene to gene interaction: collection: mm_genes_mm_genes - datafiles: [] + pypy3: true + command: pypy3 data_loader.py --adapter gene_gene_biogrid --output-bucket igvf-catalog-parsed-collections --filepath {datafile} --output-bucket-key mm_genes_mm_genes/mm_genes_mm_genes_IGVFFI1165YVBA.jsonl + datafiles: + - https://api.data.igvf.org/reference-files/IGVFFI1165YVBA/@@download/IGVFFI1165YVBA.csv.gz # merged_PPI_mouse.UniProt.csv.gz +# Example: pypy3 data_loader.py --adapter coxpresdb --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/Hsa-r.v22-05.G16651-S235187.combat_pca.subagging.z.d/ --output-bucket-key genes_genes/genes_genes_coxpresdb.jsonl gene to gene coexpression association: collection: genes_genes - datafiles: [] + command: pypy3 data_loader.py --adapter coxpresdb --output-bucket igvf-catalog-parsed-collections --filepath {datafile_path} --output-bucket-key genes_genes/genes_genes_coxpresdb.jsonl + pypy3: true + datafiles: + - https://api.data.igvf.org/reference-files/IGVFFI3321YNBP/@@download/IGVFFI3321YNBP.tar.gz # Hsa-r.v22-05.G16651-S235187.combat_pca.subagging.z.d.tar.gz # CoxpresDB
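+# Per the coxpresdb adapter change in patch 12 above (it now iterates os.listdir over --filepath), {datafile_path} is presumably the untarred Hsa-r.v22-05.G16651-S235187.combat_pca.subagging.z.d directory, holding one score file per Entrez gene ID.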
+# Example: python3 data_loader.py --adapter gene_gene_biogrid --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/IGVFFI4317VDGK.csv --output-bucket-key genes_genes/genes_genes_IGVFFI4317VDGK.jsonl gene to gene interaction: collection: genes_genes - datafiles: [] + command: python3 data_loader.py --adapter gene_gene_biogrid --output-bucket igvf-catalog-parsed-collections --filepath {datafile} --output-bucket-key genes_genes/genes_genes_IGVFFI4317VDGK.jsonl + datafiles: + - https://api.data.igvf.org/reference-files/IGVFFI4317VDGK/@@download/IGVFFI4317VDGK.csv.gz # merged_PPI.UniProt.csv.gz # BioGRID +# Example: python3 data_loader.py --adapter pathway --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/IGVFFI6573JZEO.txt --output-bucket-key pathways/pathways_IGVFFI6573JZEO.jsonl pathway: collection: pathways + pypy3: false + command: python3 data_loader.py --adapter pathway --output-bucket igvf-catalog-parsed-collections --filepath {datafile} --output-bucket-key pathways/pathways_IGVFFI6573JZEO.jsonl datafiles: - https://api.data.igvf.org/reference-files/IGVFFI6573JZEO/@@download/IGVFFI6573JZEO.txt.gz # ReactomePathways.txt.gz +# Example: pypy3 data_loader.py --adapter genes_pathways --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/IGVFFI5159WVTH.txt --output-bucket-key genes_pathways/genes_pathways_IGVFFI5159WVTH.jsonl --label genes_pathways gene to pathway association: collection: genes_pathways + params: + - label + pypy3: true + command: pypy3 data_loader.py --adapter genes_pathways --output-bucket igvf-catalog-parsed-collections --filepath {datafile} --output-bucket-key genes_pathways/genes_pathways_IGVFFI5159WVTH.jsonl --label genes_pathways datafiles: - https://api.data.igvf.org/reference-files/IGVFFI5159WVTH/@@download/IGVFFI5159WVTH.txt.gz # Ensembl2Reactome_All_Levels.txt.gz +# Example: python3 data_loader.py --adapter genes_pathways --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/IGVFFI8863FVFN.txt --output-bucket-key pathways_pathways/pathways_pathways_IGVFFI8863FVFN.jsonl --label parent_pathway_of parent pathway of: collection: pathways_pathways + params: + - label + pypy3: true + command: python3 data_loader.py --adapter genes_pathways --output-bucket igvf-catalog-parsed-collections --filepath {datafile} --output-bucket-key pathways_pathways/pathways_pathways_IGVFFI8863FVFN.jsonl --label parent_pathway_of datafiles: - https://api.data.igvf.org/reference-files/IGVFFI8863FVFN/@@download/IGVFFI8863FVFN.txt.gz # ReactomePathwaysRelation.txt.gz +# Example: python3 data_loader.py --adapter gwas_studies --output-bucket igvf-catalog-parsed-collections --variants-to-ontology IGVFFI1309WDQG.tsv --variants-to-gene IGVFFI5724MMHI.tsv --output-bucket-key studies/gwas_studies.jsonl --gwas-collection studies study: collection: studies - datafiles: [] + params: + - gwas-collection + - variants-to-ontology + - variants-to-gene + pypy3: false + command: python3 data_loader.py --adapter gwas_studies --output-bucket igvf-catalog-parsed-collections --variants-to-ontology {variant_to_ontology_datafile} --variants-to-gene {variant_to_gene_datafile} --output-bucket-key studies/gwas_studies.jsonl --gwas-collection studies + datafiles: + - https://api.data.igvf.org/reference-files/IGVFFI1309WDQG/@@download/IGVFFI1309WDQG.tsv.gz # v2d_igvf.tsv.gz # variant to ontology mapping + - https://api.data.igvf.org/reference-files/IGVFFI5724MMHI/@@download/IGVFFI5724MMHI.tsv.gz # v2g_scored_igvf.tsv.gz # variant to gene scored mapping +# Example: python3 data_loader.py --adapter gwas_studies --output-bucket igvf-catalog-parsed-collections --variants-to-ontology IGVFFI1309WDQG.tsv --variants-to-gene IGVFFI5724MMHI.tsv --output-bucket-key variants_phenotypes/gwas_variants_phenotypes.jsonl --gwas-collection variants_phenotypes variant to phenotype: collection: variants_phenotypes - datafiles: [] + params: + - gwas-collection + - variants-to-ontology + - variants-to-gene + pypy3: false + command: python3 data_loader.py --adapter gwas_studies --output-bucket igvf-catalog-parsed-collections --variants-to-ontology {variant_to_ontology_datafile} --variants-to-gene {variant_to_gene_datafile} --output-bucket-key variants_phenotypes/gwas_variants_phenotypes.jsonl --gwas-collection variants_phenotypes + datafiles: + - https://api.data.igvf.org/reference-files/IGVFFI1309WDQG/@@download/IGVFFI1309WDQG.tsv.gz # v2d_igvf.tsv.gz # variant to ontology mapping + - https://api.data.igvf.org/reference-files/IGVFFI5724MMHI/@@download/IGVFFI5724MMHI.tsv.gz # v2g_scored_igvf.tsv.gz # variant to gene scored mapping +# Example: python3 data_loader.py --adapter gwas_studies --output-bucket igvf-catalog-parsed-collections --variants-to-ontology IGVFFI1309WDQG.tsv --variants-to-gene IGVFFI5724MMHI.tsv --output-bucket-key variants_phenotypes_studies/gwas_variants_phenotypes_studies.jsonl --gwas-collection variants_phenotypes_studies variant to phenotype to study: collection: variants_phenotypes_studies - datafiles: [] + params: + - gwas-collection + - variants-to-ontology + - variants-to-gene + pypy3: false + command: python3 data_loader.py --adapter gwas_studies --output-bucket igvf-catalog-parsed-collections --variants-to-ontology {variant_to_ontology_datafile} --variants-to-gene {variant_to_gene_datafile} --output-bucket-key variants_phenotypes_studies/gwas_variants_phenotypes_studies.jsonl --gwas-collection variants_phenotypes_studies + datafiles: + - https://api.data.igvf.org/reference-files/IGVFFI1309WDQG/@@download/IGVFFI1309WDQG.tsv.gz # v2d_igvf.tsv.gz # variant to ontology mapping + - https://api.data.igvf.org/reference-files/IGVFFI5724MMHI/@@download/IGVFFI5724MMHI.tsv.gz # v2g_scored_igvf.tsv.gz # variant to gene scored mapping +# Example: python3 data_loader.py --adapter drug --output-bucket igvf-catalog-parsed-collections --filepath 
~/dataset/IGVFFI2997DUKO.tsv --output-bucket-key drugs/drugs_IGVFFI2997DUKO.jsonl --label drug drug: collection: drugs + pypy3: true + params: + - label + command: python3 data_loader.py --adapter drug --output-bucket igvf-catalog-parsed-collections --filepath {datafile} --output-bucket-key drugs/drugs_IGVFFI2997DUKO.jsonl --label drug datafiles: - https://api.data.igvf.org/reference-files/IGVFFI2997DUKO/@@download/IGVFFI2997DUKO.tsv.gz # pharmGKB_chemicals.tsv.gz +## Example: python3 data_loader.py --adapter variant_drug --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/pharmGKB/ --output-bucket-key variants_drugs/variants_drugs_pharmGKB.jsonl --label variant_drug +## Untar datafile and point adapter to folder variant to drug: collection: variants_drugs + params: + - label + pypy3: false + command: python3 data_loader.py --adapter variant_drug --output-bucket igvf-catalog-parsed-collections --filepath {datafolder} --output-bucket-key variants_drugs/variants_drugs_pharmGKB.jsonl --label variant_drug datafiles: - - https://api.data.igvf.org/reference-files/IGVFFI4821BJHQ/@@download/IGVFFI4821BJHQ.tsv.gz # data_loading_support_files/pharmGKB_genes.tsv.gz - - https://api.data.igvf.org/reference-files/IGVFFI1149WTCK/@@download/IGVFFI1149WTCK.tsv.gz # data_loading_support_files/pharmGKB_study_parameters.tsv.gz + - https://api.data.igvf.org/reference-files/IGVFFI8835SMSP/@@download/IGVFFI8835SMSP.tar.gz # variantAnnotations.tar.gz +## Example: python3 data_loader.py --adapter variant_drug --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/pharmGKB/ --output-bucket-key variants_drugs_genes/variants_drugs_genes_pharmGKB.jsonl --label variant_drug_gene +## Untar datafile and point adapter to folder variant drug association to gene: collection: variants_drugs_genes + params: + - label + pypy3: false + command: python3 data_loader.py --adapter variant_drug --output-bucket igvf-catalog-parsed-collections --filepath {datafolder} --output-bucket-key variants_drugs_genes/variants_drugs_genes_pharmGKB.jsonl --label variant_drug_gene datafiles: - - https://api.data.igvf.org/reference-files/IGVFFI7955ICXJ/@@download/IGVFFI7955ICXJ.tsv.gz # data_loading_support_files/pharmGKB_variants.tsv.gz - https://api.data.igvf.org/reference-files/IGVFFI8835SMSP/@@download/IGVFFI8835SMSP.tar.gz # variantAnnotations.tar.gz +# Example: python3 data_loader.py --adapter disease_gene --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/IGVFFI4540ZCXZ.xml --output-bucket-key diseases_genes/diseases_genes_IGVFFI4540ZCXZ.jsonl disease to gene: collection: diseases_genes + pypy3: true + command: python3 data_loader.py --adapter disease_gene --output-bucket igvf-catalog-parsed-collections --filepath {datafile} --output-bucket-key diseases_genes/diseases_genes_IGVFFI4540ZCXZ.jsonl datafiles: - https://api.data.igvf.org/reference-files/IGVFFI4540ZCXZ/@@download/IGVFFI4540ZCXZ.xml.gz # en_product6.xml.gz +# Example: python3 data_loader.py --adapter variant_disease --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/IGVFFI5852GYTT.csv --output-bucket-key variants_diseases/variants_diseases_IGVFFI5852GYTT.jsonl --label variant_disease variant to disease: collection: variants_diseases - datafiles: [] + params: + - label + pypy3: true + command: python3 data_loader.py --adapter variant_disease --output-bucket igvf-catalog-parsed-collections --filepath {datafile} --output-bucket-key variants_diseases/variants_diseases_IGVFFI5852GYTT.jsonl --label variant_disease + 
datafiles: + - https://api.data.igvf.org/reference-files/IGVFFI5852GYTT/@@download/IGVFFI5852GYTT.csv.gz # variant_pathogenicity.csv.gz +# Example: python3 data_loader.py --adapter variant_disease --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/IGVFFI5852GYTT.csv --output-bucket-key variants_diseases_genes/variants_diseases_genes_IGVFFI5852GYTT.jsonl --label variant_disease_gene variant to disease to gene: collection: variants_diseases_genes - datafiles: [] - -gene to term: - collection: genes_biosamples - datafiles: [] + params: + - label + pypy3: true + command: python3 data_loader.py --adapter variant_disease --output-bucket igvf-catalog-parsed-collections --filepath {datafile} --output-bucket-key variants_diseases_genes/variants_diseases_genes_IGVFFI5852GYTT.jsonl --label variant_disease_gene + datafiles: + - https://api.data.igvf.org/reference-files/IGVFFI5852GYTT/@@download/IGVFFI5852GYTT.csv.gz # variant_pathogenicity.csv.gz +# Example: python3 data_loader.py --adapter complex --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/IGVFFI1444TRQL.tsv --output-bucket-key complexes/complexes_IGVFFI1444TRQL.jsonl --label complex complex: collection: complexes + params: + - label + pypy3: true + command: pypy3 data_loader.py --adapter complex --output-bucket igvf-catalog-parsed-collections --filepath {datafile} --output-bucket-key complexes/complexes_IGVFFI1444TRQL.jsonl --label complex datafiles: - - https://api.data.igvf.org/reference-files/IGVFFI1451BYVS/@@download/IGVFFI1451BYVS.txt.gz # column_definitions_readme.txt.gz - https://api.data.igvf.org/reference-files/IGVFFI1444TRQL/@@download/IGVFFI1444TRQL.tsv.gz # EBI_complex_proteins_9606.tsv.gz +# Example: python3 data_loader.py --adapter complex --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/IGVFFI1444TRQL.tsv --output-bucket-key complexes_proteins/complexes_proteins_IGVFFI1444TRQL.jsonl --label complex_protein complex to protein: collection: complexes_proteins - datafiles: [] + params: + - label + pypy3: true + command: pypy3 data_loader.py --adapter complex --output-bucket igvf-catalog-parsed-collections --filepath {datafile} --output-bucket-key complexes_proteins/complexes_proteins_IGVFFI1444TRQL.jsonl --label complex_protein + datafiles: + - https://api.data.igvf.org/reference-files/IGVFFI1444TRQL/@@download/IGVFFI1444TRQL.tsv.gz # EBI_complex_proteins_9606.tsv.gz +# Example: python3 data_loader.py --adapter complex --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/IGVFFI1444TRQL.tsv --output-bucket-key complexes_terms/complexes_terms_IGVFFI1444TRQL.jsonl --label complex_term complex to term: collection: complexes_terms - datafiles: [] + params: + - label + pypy3: true + command: python3 data_loader.py --adapter complex --output-bucket igvf-catalog-parsed-collections --filepath {datafile} --output-bucket-key complexes_terms/complexes_terms_IGVFFI1444TRQL.jsonl --label complex_term + datafiles: + - https://api.data.igvf.org/reference-files/IGVFFI1444TRQL/@@download/IGVFFI1444TRQL.tsv.gz # EBI_complex_proteins_9606.tsv.gz -regulatory region mouse: - collection: mm_regulatory_regions - datafiles: [] +gene to term: + collection: genes_biosamples + datafiles: + - https://api.data.igvf.org/reference-files/IGVFFI1910IIEX/@@download/IGVFFI1910IIEX.csv.gz # CRISPRGeneDependency.csv gene mouse: collection: mm_genes params: - gene_alias_file_path pypy3: true - datafiles: [] + datafiles: + - 
https://api.data.igvf.org/reference-files/IGVFFI9744VSJF/@@download/IGVFFI9744VSJF.gtf.gz # gencode.vM32.chr_patch_hapl_scaff.annotation.gtf.gz transcript mouse: collection: mm_transcripts datafiles: [] -sequence variant mouse: - collection: mm_variants +mouse gene structure: + collection: mm_genes_structure datafiles: [] -human mouse regulatory region mapping: - collection: regulatory_regions_mm_regulatory_regions - datafiles: [] +# Example: +# - python3 data_loader.py --adapter mouse_variant --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/IGVFFI4287QQKV.vcf --output-bucket-key mm_variants/mm_variants_IGVFFI4287QQKV.jsonl +# - python3 data_loader.py --adapter mouse_variant --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/IGVFFI7837ORJB.vcf --output-bucket-key mm_variants/mm_variants_IGVFFI7837ORJB.jsonl +sequence variant mouse: + collection: mm_variants + pypy3: false + command: python3 data_loader.py --adapter mouse_variant --output-bucket igvf-catalog-parsed-collections --filepath {datafile} --output-bucket-key mm_variants/mm_variants_IGVFFI4287QQKV.jsonl + datafiles: + - https://api.data.igvf.org/reference-files/IGVFFI4287QQKV/@@download/IGVFFI4287QQKV.vcf.gz # mgp_REL2021_snps.rsID.vcf.gz + - https://api.data.igvf.org/reference-files/IGVFFI7837ORJB/@@download/IGVFFI7837ORJB.vcf.gz # mgp_REL2021_indels.rsID.vcf.gz +# Example: python3 data_loader.py --adapter mm_orthologs --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/IGVFFI9177QQPS.txt --output-bucket-key genes_mm_genes/genes_mm_genes_IGVFFI9177QQPS.jsonl human mouse genes orthology: collection: genes_mm_genes - datafiles: [] + pypy3: true + command: python3 data_loader.py --adapter mm_orthologs --output-bucket igvf-catalog-parsed-collections --filepath {datafile} --output-bucket-key genes_mm_genes/genes_mm_genes_IGVFFI9177QQPS.jsonl + datafiles: + - https://api.data.igvf.org/reference-files/IGVFFI9177QQPS/@@download/IGVFFI9177QQPS.txt.gz # HOM_MouseHumanSequence.txt.gz # Unreleased datafiles download: curl -L -u {accesskey}:{secretkey} {url} -o {datafile} # example: curl -L -u MSJMF5G6:ey4ac7yummfuqqk6 https://api.data.igvf.org/reference-files/IGVFFI0989JYUQ/@@download/IGVFFI0989JYUQ.vcf.gz -o IGVFFI0989JYUQ.vcf.gz diff --git a/data/schema-config.yaml b/data/schema-config.yaml index c57a0abf..8d1fbf15 100644 --- a/data/schema-config.yaml +++ b/data/schema-config.yaml @@ -2441,6 +2441,8 @@ human mouse genes orthology: properties: source: str source_url: str + name: str + inverse_name: str coding variant: is_a: related to at instance level From 093e4cc5acc4ecee2829cbdf8c6143252d402209 Mon Sep 17 00:00:00 2001 From: Pedro Assis Date: Fri, 15 Nov 2024 11:06:08 -1000 Subject: [PATCH 13/20] removing :long from adapters --- data/adapters/SEM_prediction_adapter.py | 6 +- .../VAMP_coding_variant_scores_adapter.py | 16 ++--- data/adapters/biogrid_gene_gene_adapter.py | 6 +- data/adapters/dbSNFP_adapter.py | 42 ++++++------ data/adapters/favor_adapter.py | 8 +-- data/adapters/gencode_adapter.py | 4 +- data/adapters/gencode_gene_adapter.py | 4 +- .../gencode_gene_structure_adapter.py | 8 +-- data/adapters/gwas_adapter.py | 24 +++---- data/adapters/proteins_interaction_adapter.py | 4 +- data/adapters/topld_adapter.py | 4 +- data/schema-config.yaml | 64 +++++++++---------- data/tests/test_biogrid_gene_gene.py | 2 +- data/tests/test_gencode_gene_adapter.py | 4 +- .../test_gencode_gene_structure_adapter.py | 4 +- data/tests/test_gwas_adapter.py | 2 +- 
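Note on the data_sources.yaml entries above: each record pairs a `datafiles` list with a templated `command`, and placeholders such as {datafile} are presumably substituted with the locally downloaded file before the command runs. A minimal sketch of that expansion (the run_entry helper and its argument names are hypothetical, not part of this patch series):

    import subprocess
    import yaml

    def run_entry(entry, local_paths):
        # Substitute the {datafile} placeholder once per downloaded datafile
        # and run the resulting data_loader.py command.
        for path in local_paths:
            cmd = entry['command'].format(datafile=path)
            subprocess.run(cmd, shell=True, check=True)

    with open('data_sources.yaml') as fh:
        sources = yaml.safe_load(fh)

    # e.g. the 'drug' entry, whose command template only uses {datafile}
    run_entry(sources['drug'], ['IGVFFI2997DUKO.tsv'])

Entries flagged pypy3: true would presumably be dispatched to a pypy3 interpreter by the same runner.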
From 093e4cc5acc4ecee2829cbdf8c6143252d402209 Mon Sep 17 00:00:00 2001
From: Pedro Assis
Date: Fri, 15 Nov 2024 11:06:08 -1000
Subject: [PATCH 13/20] removing :long from adapters

---
 data/adapters/SEM_prediction_adapter.py         |  6 +-
 .../VAMP_coding_variant_scores_adapter.py       | 16 ++---
 data/adapters/biogrid_gene_gene_adapter.py      |  6 +-
 data/adapters/dbSNFP_adapter.py                 | 42 ++++++------
 data/adapters/favor_adapter.py                  |  8 +--
 data/adapters/gencode_adapter.py                |  4 +-
 data/adapters/gencode_gene_adapter.py           |  4 +-
 .../gencode_gene_structure_adapter.py           |  8 +--
 data/adapters/gwas_adapter.py                   | 24 +++----
 data/adapters/proteins_interaction_adapter.py   |  4 +-
 data/adapters/topld_adapter.py                  |  4 +-
 data/schema-config.yaml                         | 64 +++++++++----------
 data/tests/test_biogrid_gene_gene.py            |  2 +-
 data/tests/test_gencode_gene_adapter.py         |  4 +-
 .../test_gencode_gene_structure_adapter.py      |  4 +-
 data/tests/test_gwas_adapter.py                 |  2 +-
 .../test_proteins_interaction_adapter.py        |  4 +-
 data/tests/test_topld_adapter.py                |  4 +-
 18 files changed, 105 insertions(+), 105 deletions(-)

diff --git a/data/adapters/SEM_prediction_adapter.py b/data/adapters/SEM_prediction_adapter.py
index c5dff933..a56f04c1 100644
--- a/data/adapters/SEM_prediction_adapter.py
+++ b/data/adapters/SEM_prediction_adapter.py
@@ -89,9 +89,9 @@ def process_file(self):
                 'kmer_chr': row[6].split(':')[0],
                 'kmer_start': int(row[6].split(':')[-1].split('-')[0]),
                 'kmer_end': int(row[6].split(':')[-1].split('-')[1]),
-                'ref_score:long': float(row[7]),
-                'alt_score:long': float(row[8]),
-                'relative_binding_affinity:long': float(row[9]),
+                'ref_score': float(row[7]),
+                'alt_score': float(row[8]),
+                'relative_binding_affinity': float(row[9]),
                 'effect_on_binding': row[-1],
                 'name': 'modulates binding of',
                 'inverse_name': 'binding modulated by',
diff --git a/data/adapters/VAMP_coding_variant_scores_adapter.py b/data/adapters/VAMP_coding_variant_scores_adapter.py
index c09f95a3..e9d338d2 100644
--- a/data/adapters/VAMP_coding_variant_scores_adapter.py
+++ b/data/adapters/VAMP_coding_variant_scores_adapter.py
@@ -56,14 +56,14 @@ def process_file(self):
                 '_key': edge_key,
                 '_from': 'coding_variants/' + _id,
                 '_to': 'ontology_terms/' + VAMPAdapter.PHENOTYPE_TERM,
-                'abundance_score:long': float(row[1]),
-                'abundance_sd:long': float(row[2]) if row[2] else None,
-                'abundance_se:long': float(row[3]) if row[3] else None,
-                'ci_upper:long': float(row[4]) if row[4] else None,
-                'ci_lower:long': float(row[5]) if row[5] else None,
-                'abundance_Rep1:long': float(row[6]) if row[6] else None,
-                'abundance_Rep2:long': float(row[7]) if row[7] else None,
-                'abundance_Rep3:long': float(row[8]) if row[8] else None,
+                'abundance_score': float(row[1]),
+                'abundance_sd': float(row[2]) if row[2] else None,
+                'abundance_se': float(row[3]) if row[3] else None,
+                'ci_upper': float(row[4]) if row[4] else None,
+                'ci_lower': float(row[5]) if row[5] else None,
+                'abundance_Rep1': float(row[6]) if row[6] else None,
+                'abundance_Rep2': float(row[7]) if row[7] else None,
+                'abundance_Rep3': float(row[8]) if row[8] else None,
                 'source': VAMPAdapter.SOURCE,
                 'source_url': VAMPAdapter.SOURCE_URL
             }
diff --git a/data/adapters/biogrid_gene_gene_adapter.py b/data/adapters/biogrid_gene_gene_adapter.py
index 8f2b383d..2c230355 100644
--- a/data/adapters/biogrid_gene_gene_adapter.py
+++ b/data/adapters/biogrid_gene_gene_adapter.py
@@ -84,13 +84,13 @@ def process_file(self):
                 'detection_method_code': row[4],
                 'interaction_type': interaction_type,
                 'interaction_type_code': interaction_type_code,
-                'confidence_value_biogrid:long': float(row[7]) if row[7] else None,
-                'confidence_value_intact:long': float(row[-2]) if row[-2] else None,
+                'confidence_value_biogrid': float(row[7]) if row[7] else None,
+                'confidence_value_intact': float(row[-2]) if row[-2] else None,
                 # should be BioGRID for all edges loaded
                 'source': row[-1],
                 'pmids': [pmid_url + pmid for pmid in pmids],
                 # assign a fake value here to get around the indexing issue on logit_score from gene-gene coexpressdb
-                'z_score:long': 0,
+                'z_score': 0,
                 'name': 'interacts with',
                 'inverse_name': 'interacts with',
                 'molecular_function': 'ontology_terms/GO_0005515',
diff --git a/data/adapters/dbSNFP_adapter.py b/data/adapters/dbSNFP_adapter.py
index 663e4bd4..50234bba 100644
--- a/data/adapters/dbSNFP_adapter.py
+++ b/data/adapters/dbSNFP_adapter.py
@@ -125,7 +125,7 @@ def long_data(pos):
                 'name': 'codes for',
                 'inverse_name': 'encoded by',
                 'chr': data(0),
-                'pos:long': long_data(1),
+                'pos': long_data(1),
                 'ref': data(2),  # 1-based
                 'alt': data(3),
             }
@@ -145,32 +145,32 @@ def long_data(pos):
                 'name': key,
                 'ref': data(4),
                 'alt': data(5),
-                'aapos:long': long_data(11),  # 1-based
+                'aapos': long_data(11),  # 1-based
                 'gene_name': data(12),
                 'protein_name': data(17),
                 'hgvs': data(22),
                 'hgvsp': data(23),
                 'refcodon': data(29),
-                'codonpos:long': long_data(30),
+                'codonpos': long_data(30),
                 'transcript_id': data(14),
-                'SIFT_score:long': long_data(37),
-                'SIFT4G_score:long': long_data(40),
-                'Polyphen2_HDIV_score:long': long_data(43),
-                'Polyphen2_HVAR_score:long': long_data(46),
-                'VEST4_score:long': long_data(67),
-                'Mcap_score:long': long_data(79),
-                'REVEL_score:long': long_data(82),
-                'MutPred_score:long': long_data(84),
-                'BayesDel_addAF_score:long': long_data(101),
-                'BayesDel_noAF_score:long': long_data(104),
-                'VARITY_R_score:long': long_data(113),
-                'VARITY_ER_score:long': long_data(115),
-                'VARITY_R_LOO_score:long': long_data(117),
-                'VARITY_ER_LOO_score:long': long_data(119),
-                'ESM1b_score:long': long_data(121),
-                'EVE_score:long': long_data(124),
-                'AlphaMissense_score:long': long_data(137),
-                'CADD_raw_score:long': long_data(146),
+                'SIFT_score': long_data(37),
+                'SIFT4G_score': long_data(40),
+                'Polyphen2_HDIV_score': long_data(43),
+                'Polyphen2_HVAR_score': long_data(46),
+                'VEST4_score': long_data(67),
+                'Mcap_score': long_data(79),
+                'REVEL_score': long_data(82),
+                'MutPred_score': long_data(84),
+                'BayesDel_addAF_score': long_data(101),
+                'BayesDel_noAF_score': long_data(104),
+                'VARITY_R_score': long_data(113),
+                'VARITY_ER_score': long_data(115),
+                'VARITY_R_LOO_score': long_data(117),
+                'VARITY_ER_LOO_score': long_data(119),
+                'ESM1b_score': long_data(121),
+                'EVE_score': long_data(124),
+                'AlphaMissense_score': long_data(137),
+                'CADD_raw_score': long_data(146),
                 'source': 'dbSNFP 4.5a',
                 'source_url': 'http://database.liulab.science/dbNSFP'
             }
diff --git a/data/adapters/favor_adapter.py b/data/adapters/favor_adapter.py
index 81c95526..e14fe125 100644
--- a/data/adapters/favor_adapter.py
+++ b/data/adapters/favor_adapter.py
@@ -126,15 +126,15 @@ def parse_metadata(self, info):
                 values = freq_value.split(',')

                 info_obj['freq'][freq_name] = {
-                    'ref:long': self.convert_freq_value(values[0])
+                    'ref': self.convert_freq_value(values[0])
                 }

                 if len(values) > 1:
-                    info_obj['freq'][freq_name]['alt:long'] = self.convert_freq_value(
+                    info_obj['freq'][freq_name]['alt'] = self.convert_freq_value(
                         values[1])
                 else:
                     if self.convert_freq_value(values[0]) == 1.0:
-                        info_obj['freq'][freq_name]['alt:long'] = 0.0
+                        info_obj['freq'][freq_name]['alt'] = 0.0

             # e.g. FAVORFullDB/variant_annovar
             if key.startswith('FAVOR'):
@@ -212,7 +212,7 @@ def process_file(self):
             '_key': id,
             'name': spdi,
             'chr': 'chr' + data_line[0],
-            'pos:long': int(data_line[1]) - 1,
+            'pos': int(data_line[1]) - 1,
             'rsid': [data_line[2]],
             'ref': data_line[3],
             'alt': data_line[4],
diff --git a/data/adapters/gencode_adapter.py b/data/adapters/gencode_adapter.py
index cc8f3ba3..65640032 100644
--- a/data/adapters/gencode_adapter.py
+++ b/data/adapters/gencode_adapter.py
@@ -81,8 +81,8 @@ def process_file(self):
                 'transcript_type': info['transcript_type'],
                 'chr': data[Gencode.INDEX['chr']],
                 # the gtf file format is [1-based,1-based], needs to convert to BED format [0-based,1-based]
-                'start': str(int(data[Gencode.INDEX['coord_start']]) - 1),
-                'end': data[Gencode.INDEX['coord_end']],
+                'start': int(data[Gencode.INDEX['coord_start']]) - 1,
+                'end': int(data[Gencode.INDEX['coord_end']]),
                 'gene_name': info['gene_name'],
                 'source': 'GENCODE',
                 'version': self.version,
diff --git a/data/adapters/gencode_gene_adapter.py b/data/adapters/gencode_gene_adapter.py
index da8af488..13900ec0 100644
--- a/data/adapters/gencode_gene_adapter.py
+++ b/data/adapters/gencode_gene_adapter.py
@@ -150,8 +150,8 @@ def process_file(self):
                 'gene_type': info['gene_type'],
                 'chr': split_line[GencodeGene.INDEX['chr']],
                 # the gtf file format is [1-based,1-based], needs to convert to BED format [0-based,1-based]
-                'start:long': int(split_line[GencodeGene.INDEX['coord_start']]) - 1,
-                'end:long': int(split_line[GencodeGene.INDEX['coord_end']]),
+                'start': int(split_line[GencodeGene.INDEX['coord_start']]) - 1,
+                'end': int(split_line[GencodeGene.INDEX['coord_end']]),
                 'name': info['gene_name'],
                 'source': 'GENCODE',
                 'version': self.version,
diff --git a/data/adapters/gencode_gene_structure_adapter.py b/data/adapters/gencode_gene_structure_adapter.py
index f2518ac0..7449ed6c 100644
--- a/data/adapters/gencode_gene_structure_adapter.py
+++ b/data/adapters/gencode_gene_structure_adapter.py
@@ -110,8 +110,8 @@ def process_file(self):
                 'name': info['transcript_name'] + '_exon_' + info['exon_number'] + '_' + gene_structure_type,
                 'chr': split_line[GencodeStructure.INDEX['chr']],
                 # the gtf file format is [1-based,1-based], needs to convert to BED format [0-based,1-based]
-                'start:long': int(split_line[GencodeStructure.INDEX['coord_start']]) - 1,
-                'end:long': int(split_line[GencodeStructure.INDEX['coord_end']]),
+                'start': int(split_line[GencodeStructure.INDEX['coord_start']]) - 1,
+                'end': int(split_line[GencodeStructure.INDEX['coord_end']]),
                 'strand': split_line[GencodeStructure.INDEX['strand']],
                 'type': gene_structure_type,
                 'gene_id': gene_id_no_version,
@@ -157,8 +157,8 @@ def process_file(self):
                 '_key': key,
                 'name': info['transcript_name'] + '_exon_' + intron_exon_number + '_intron',
                 'chr': split_line[GencodeStructure.INDEX['chr']],
-                'start:long': intron_start,
-                'end:long': intron_end,
+                'start': intron_start,
+                'end': intron_end,
                 'strand': split_line[GencodeStructure.INDEX['strand']],
                 'type': 'intron',
                 'gene_id': gene_id_no_version,
diff --git a/data/adapters/gwas_adapter.py b/data/adapters/gwas_adapter.py
index 3ea2d30f..8a487d60 100644
--- a/data/adapters/gwas_adapter.py
+++ b/data/adapters/gwas_adapter.py
@@ -122,21 +122,21 @@ def process_variants_phenotypes_studies(self, row, edge_key, phenotype_id, tagge
             '_from': 'variants_phenotypes/' + edge_key,
             '_key': key,
             'lead_chrom': row[4],
-            'lead_pos:long': int(row[5]) - 1,
+            'lead_pos': int(row[5]) - 1,
             'lead_ref': row[6],
             'lead_alt': row[7],
             'phenotype_term': self.ontology_name_mapping.get(phenotype_id),
             'direction': row[8],
-            'beta:long': float(row[9] or 0),
-            'beta_ci_lower:long': float(row[10] or 0),
-            'beta_ci_upper:long': float(row[11] or 0),
-            'odds_ratio:long': float(row[12] or 0),
-            'oddsr_ci_lower:long': float(row[13] or 0),
-            'oddsr_ci_upper:long': float(row[14] or 0),
-            'p_val_mantissa:long': float(row[15] or 0),
-            'p_val_exponent:long': float(row[16] or 0),
-            'p_val:long': pvalue,
-            'log10pvalue:long': log_pvalue,
+            'beta': float(row[9] or 0),
+            'beta_ci_lower': float(row[10] or 0),
+            'beta_ci_upper': float(row[11] or 0),
+            'odds_ratio': float(row[12] or 0),
+            'oddsr_ci_lower': float(row[13] or 0),
+            'oddsr_ci_upper': float(row[14] or 0),
+            'p_val_mantissa': float(row[15] or 0),
+            'p_val_exponent': float(row[16] or 0),
+            'p_val': pvalue,
+            'log10pvalue': log_pvalue,
             'tagged_variants': tagged_variants[studies_variants_key],
             'genes': genes.get(row[0]),
             'source': 'OpenTargets',
@@ -271,7 +271,7 @@ def get_tagged_variants(self):
                 variant = {
                     'tag_chrom': row[34],
-                    'tag_pos:long': int(row[35]) - 1,
+                    'tag_pos': int(row[35]) - 1,
                     'tag_ref': row[36],
                     'tag_alt': row[37],
                     'overall_r2': row[38],
diff --git a/data/adapters/proteins_interaction_adapter.py b/data/adapters/proteins_interaction_adapter.py
index bc2754eb..b38e3b06 100644
--- a/data/adapters/proteins_interaction_adapter.py
+++ b/data/adapters/proteins_interaction_adapter.py
@@ -65,8 +65,8 @@ def process_file(self):
                 'detection_method_code': row[4],
                 'interaction_type': interaction_type,
                 'interaction_type_code': interaction_type_code,
-                'confidence_value_biogrid:long': float(row[7]) if row[7] else None,
-                'confidence_value_intact:long': float(row[-2]) if row[-2] else None,
+                'confidence_value_biogrid': float(row[7]) if row[7] else None,
+                'confidence_value_intact': float(row[-2]) if row[-2] else None,
                 'source': row[-1],  # BioGRID or IntAct or BioGRID; IntAct
                 'pmids': [pmid_url + pmid for pmid in pmids],
                 'organism': self.organism,
diff --git a/data/adapters/topld_adapter.py b/data/adapters/topld_adapter.py
index 3eb37c1f..b93bd81c 100644
--- a/data/adapters/topld_adapter.py
+++ b/data/adapters/topld_adapter.py
@@ -73,8 +73,8 @@ def process_file(self):
                 'variant_2_base_pair': ':'.join(row[3].split(':')[1:3]),
                 'variant_1_rsid': self.ids[row[0]]['rsid'],
                 'variant_2_rsid': self.ids[row[1]]['rsid'],
-                'r2:long': float(row[4]),
-                'd_prime:long': float(row[5]),
+                'r2': float(row[4]),
+                'd_prime': float(row[5]),
                 'ancestry': self.ancestry,
                 'label': 'linkage disequilibrium',
                 'name': 'correlated with',
diff --git a/data/schema-config.yaml b/data/schema-config.yaml
index 8d1fbf15..65ad1536 100644
--- a/data/schema-config.yaml
+++ b/data/schema-config.yaml
@@ -24,8 +24,8 @@ topld in linkage disequilibrium with:
     r2:
       type: zkd
      fields:
-        - r2:long
-        - d_prime:long
+        - r2
+        - d_prime
   accessible_via:
     name: variants_variants
     description: 'Retrieve variant correlation data. Example: r2 = gt:0.8, d_prime = lte:0.5, ancestry = SAS'
@@ -66,7 +66,7 @@ sequence variant:
     region:
       type: zkd
      fields:
-        - pos:long
+        - pos
   accessible_via:
     name: variants
     description: 'Retrieve variant data. For example: region = chr1:1157520-1158189 or funseq_description = noncoding or rsid = rs2045642915'
@@ -179,7 +179,7 @@ gene:
     coordinates:
      type: zkd
      fields:
-        - start:long, end:long
+        - start, end
   accessible_via:
     name: genes
     description: 'Retrieve gene information. For example: region = chr1:1157520-1158189 or gene_type = miRNA'
@@ -241,7 +241,7 @@ transcript:
     coordinates:
      type: zkd
      fields:
-        - start:long,end:long
+        - start,end
   accessible_via:
     name: transcripts
     description: 'Retrieve transcript information. For example: region = chr20:9537369-9839076 or transcript_type = protein_coding'
@@ -347,7 +347,7 @@ gene structure:
     coordinates:
      type: zkd
      fields:
-        - start:long,end:long
+        - start,end
   accessible_via:
     name: genes_structure
     filter_by: _id, chr, name, type
@@ -391,7 +391,7 @@ mouse gene structure:
     coordinates:
      type: zkd
      fields:
-        - start:long,end:long
+        - start,end
   accessible_via:
     name: mm_gene_structure
     filter_by: _id, chr, name, type
@@ -470,8 +470,8 @@ AFGR variant to regulatory region:
     coordinates:
      type: zkd
      fields:
-        - log10pvalue:long
-        - beta:long, log10pvalue:long
+        - log10pvalue
+        - beta, log10pvalue
     query:
      type: persistent
      fields:
@@ -680,7 +680,7 @@ variant to protein association:
     coordinates:
      type: zkd
      fields:
-        - log10pvalue:long
+        - log10pvalue
     query:
      type: persistent
      fields:
@@ -719,7 +719,7 @@ allele specific binding:
     coordinates:
      type: zkd
      fields:
-        - log10pvalue:long
+        - log10pvalue
   relationship:
     from: sequence variant
     to: protein
@@ -807,8 +807,8 @@ variant to gene association:
     coordinates:
      type: zkd
      fields:
-        - log10pvalue:long
-        - effect_size:long, log10pvalue:long
+        - log10pvalue
+        - effect_size, log10pvalue
     query:
      type: persistent
      fields:
@@ -886,7 +886,7 @@ gtex splice variant to gene association:
     intron:
      type: zkd
      fields:
-        - intron_start:long, intron_end:long
+        - intron_start, intron_end
   accessible_via:
     name: e-qtls
     description: 'Retrieve e-qtls data along with correspondent variants and genes'
@@ -968,8 +968,8 @@ gtex variant to gene expression association:
     coordinates:
      type: zkd
      fields:
-        - log10pvalue:long
-        - effect_size:long, log10pvalue:long
+        - log10pvalue
+        - effect_size, log10pvalue
     query:
      type: persistent
      fields:
@@ -1027,8 +1027,8 @@ AFGR splice variant to gene association:
     coordinates:
      type: zkd
      fields:
-        - log10pvalue:long
-        - effect_size:long, log10pvalue:long
+        - log10pvalue
+        - effect_size, log10pvalue
     query:
      type: persistent
      fields:
@@ -1103,8 +1103,8 @@ AFGR variant to gene expression association:
     coordinates:
      type: zkd
      fields:
-        - log10pvalue:long
-        - beta:long, log10pvalue:long
+        - log10pvalue
+        - beta, log10pvalue
     query:
      type: persistent
      fields:
@@ -1193,7 +1193,7 @@ regulatory element to gene expression association:
     coordinates:
      type: zkd
      fields:
-        - score:long
+        - score
     query:
      type: persistent
      fields:
@@ -1219,7 +1219,7 @@ regulatory region:
     coordinates:
      type: zkd
      fields:
-        - start:long,end:long
+        - start,end
   accessible_via:
     name: regulatory_regions
     description: 'Retrieve regulatory regions data. Example: region = chr1:1157520-1158189 or biochemical_activity = CA or source = ENCODE_SCREEN (ccREs)'
@@ -1407,7 +1407,7 @@ regulatory element to biosample:
     activity_score:
      type: zkd
      fields:
-        - activity_score:long
+        - activity_score
   accessible_via:
     name: regulatory_regions_biosamples
     return: _id, element_name, strand, activity_score, bed_score, DNA_count, RNA_count, source, source_url
@@ -1609,7 +1609,7 @@ gene to gene coexpression association:
     coordinates:
      type: zkd
      fields:
-        - z_score:long
+        - z_score
     query:
      type: persistent
      fields:
@@ -1635,7 +1635,7 @@ gene to gene interaction:
     coordinates:
      type: zkd
      fields:
-        - z_score:long
+        - z_score
   relationship:
     from: gene
     to: gene
@@ -1831,7 +1831,7 @@ variant to phenotype to study:
     pval_idx:
      type: zkd
      fields:
-        - log10pvalue:long
+        - log10pvalue
   relationship:
     from: variant to phenotype
     to: study
@@ -2136,7 +2136,7 @@ regulatory region mouse:
     coordinates:
      type: zkd
      fields:
-        - start:long,end:long
+        - start,end
   accessible_via:
     name: mm_regulatory_regions
     description: 'Retrieve mouse regulatory regions data. Example: region = chr1:1157520-1158189 or biochemical_activity = CA or source = ENCODE_SCREEN (ccREs)'
@@ -2185,7 +2185,7 @@ gene mouse:
     coordinates:
      type: zkd
      fields:
-        - start:long, end:long
+        - start, end
   accessible_via:
     name: mm_genes
     description: 'Retrieve mouse gene information. For example: region = chr1:1157520-1158189 or gene_type = miRNA'
@@ -2246,7 +2246,7 @@ transcript mouse:
     coordinates:
      type: zkd
      fields:
-        - start:long,end:long
+        - start,end
   accessible_via:
     name: mm_transcripts
     description: 'Retrieve mouse transcript information. For example: region = chr20:9537369-9839076 or transcript_type = protein_coding'
@@ -2293,7 +2293,7 @@ sequence variant mouse:
     region:
      type: zkd
      fields:
-        - pos:long
+        - pos
   accessible_via:
     name: mm_variants
     description: 'Retrieve mouse variant data. For example: region = chr1:1157520-1158189 or rsid = rs3389534405'
@@ -2459,7 +2459,7 @@ coding variant:
     coordinates:
      type: zkd
      fields:
-        - aapos:long
+        - aapos
   db_collection_name: coding_variants
   db_collection_per_chromosome: false
   accessible_via:
@@ -2563,7 +2563,7 @@ coding variant to phenotype:
     abundance_score:
      type: zkd
      fields:
-        - abundance_score:long
+        - abundance_score
   relationship:
     from: coding variant
     to: ontology term
diff --git a/data/tests/test_biogrid_gene_gene.py b/data/tests/test_biogrid_gene_gene.py
index 8b916639..5809728b 100644
--- a/data/tests/test_biogrid_gene_gene.py
+++ b/data/tests/test_biogrid_gene_gene.py
@@ -14,7 +14,7 @@ def test_biogrid_gene_gene_adapter_gene_gene_biogrid():
     assert len(writer.contents) == 4
     assert len(first_item) == 15
     assert first_item['source'] == 'BioGRID'
-    assert first_item['confidence_value_biogrid:long'] is None
+    assert first_item['confidence_value_biogrid'] is None
     assert first_item['interaction_type'] == [
         'negative genetic interaction (sensu BioGRID)']
diff --git a/data/tests/test_gencode_gene_adapter.py b/data/tests/test_gencode_gene_adapter.py
index 31a46b07..bdb3f89b 100644
--- a/data/tests/test_gencode_gene_adapter.py
+++ b/data/tests/test_gencode_gene_adapter.py
@@ -17,8 +17,8 @@ def test_gencode_gene_adapter_human():
     assert 'gene_id' in first_item
     assert 'gene_type' in first_item
     assert 'chr' in first_item
-    assert 'start:long' in first_item
-    assert 'end:long' in first_item
+    assert 'start' in first_item
+    assert 'end' in first_item
     assert 'name' in first_item
     assert first_item['source'] == 'GENCODE'
     assert first_item['version'] == 'v43'
diff --git a/data/tests/test_gencode_gene_structure_adapter.py b/data/tests/test_gencode_gene_structure_adapter.py
index 993a29d0..a1587a56 100644
--- a/data/tests/test_gencode_gene_structure_adapter.py
+++ b/data/tests/test_gencode_gene_structure_adapter.py
@@ -14,8 +14,8 @@ def test_gencode_structure_adapter_gene_structure():
     assert '_key' in first_item
     assert 'name' in first_item
     assert 'chr' in first_item
-    assert 'start:long' in first_item
-    assert 'end:long' in first_item
+    assert 'start' in first_item
+    assert 'end' in first_item
     assert 'strand' in first_item
     assert 'type' in first_item
     assert 'gene_id' in first_item
diff --git a/data/tests/test_gwas_adapter.py b/data/tests/test_gwas_adapter.py
index cfe7ae74..be5ebe40 100644
--- a/data/tests/test_gwas_adapter.py
+++ b/data/tests/test_gwas_adapter.py
@@ -43,7 +43,7 @@ def test_get_tagged_variants(gwas_files):
     assert isinstance(variants, list)
     for variant in variants:
         assert 'tag_chrom' in variant
-        assert 'tag_pos:long' in variant
+        assert 'tag_pos' in variant
         assert 'tag_ref' in variant
         assert 'tag_alt' in variant
diff --git a/data/tests/test_proteins_interaction_adapter.py b/data/tests/test_proteins_interaction_adapter.py
index 4e480947..8570c029 100644
--- a/data/tests/test_proteins_interaction_adapter.py
+++ b/data/tests/test_proteins_interaction_adapter.py
@@ -29,8 +29,8 @@ def test_proteins_interaction_adapter(filepath, spy_writer):
     assert 'detection_method_code' in first_item
     assert 'interaction_type' in first_item
     assert 'interaction_type_code' in first_item
-    assert 'confidence_value_biogrid:long' in first_item
-    assert 'confidence_value_intact:long' in first_item
+    assert 'confidence_value_biogrid' in first_item
+    assert 'confidence_value_intact' in first_item
     assert 'source' in first_item
     assert 'pmids' in first_item
     assert 'organism' in first_item
diff --git a/data/tests/test_topld_adapter.py b/data/tests/test_topld_adapter.py
index 7e058ebd..53327ffd 100644
--- a/data/tests/test_topld_adapter.py
+++ b/data/tests/test_topld_adapter.py
@@ -42,8 +42,8 @@ def test_topld_adapter_process_file():
     assert 'variant_2_base_pair' in first_item
     assert 'variant_1_rsid' in first_item
     assert 'variant_2_rsid' in first_item
-    assert 'r2:long' in first_item
-    assert 'd_prime:long' in first_item
+    assert 'r2' in first_item
+    assert 'd_prime' in first_item
     assert 'ancestry' in first_item
     assert 'label' in first_item
     assert 'name' in first_item
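The ':long' suffixes removed by this patch were type hints embedded directly in property names (in the style of typed import headers), so consumers saw keys like 'pos:long' instead of 'pos'. For JSONL files parsed before this change, a hypothetical one-off cleanup (not part of the patch series) could strip the suffix recursively:

    def strip_long_hints(record):
        # Remove a trailing ':long' from every key, recursing into nested
        # dicts such as the FAVOR 'freq' blocks.
        cleaned = {}
        for key, value in record.items():
            if isinstance(value, dict):
                value = strip_long_hints(value)
            cleaned[key[:-5] if key.endswith(':long') else key] = value
        return cleaned

    assert strip_long_hints({'pos:long': 7, 'ref': 'A'}) == {'pos': 7, 'ref': 'A'}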
From 2c42333ebbee7ef6d3ff58f850000477979c741b Mon Sep 17 00:00:00 2001
From: Pedro Assis
Date: Fri, 15 Nov 2024 14:13:25 -1000
Subject: [PATCH 14/20] adding last adapters

---
 data/data_sources.yaml | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/data/data_sources.yaml b/data/data_sources.yaml
index ece114c9..44295c74 100644
--- a/data/data_sources.yaml
+++ b/data/data_sources.yaml
@@ -362,12 +362,13 @@ transcribed from:
   datafiles:
     - https://api.data.igvf.org/reference-files/IGVFFI7217ZMJZ/@@download/IGVFFI7217ZMJZ.gtf.gz # Homo sapiens GRCh38 GENCODE v43 genome

-# Example: pypy3 data_loader.py --adapter gencode_structure --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/IGVFFI7217ZMJZ.gtf --output-bucket-key genes_structure/gencode_IGVFFI7217ZMJZ.vcf.jsonl --label gene_structure
+# Example: python3 data_loader.py --adapter gencode_structure --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/IGVFFI7217ZMJZ.gtf --output-bucket-key genes_structure/gencode_IGVFFI7217ZMJZ.vcf.jsonl --label gene_structure
 gene structure:
   collection: genes_structure
   params:
     - label
-  command: pypy3 data_loader.py --adapter gencode_structure --output-bucket igvf-catalog-parsed-collections --filepath {datafile} --output-bucket-key genes_structure/gencode_{datafile}.jsonl --label gene_structure
+  pypy3: false
+  command: python3 data_loader.py --adapter gencode_structure --output-bucket igvf-catalog-parsed-collections --filepath {datafile} --output-bucket-key genes_structure/gencode_{datafile}.jsonl --label gene_structure
   datafiles:
     - https://api.data.igvf.org/reference-files/IGVFFI7217ZMJZ/@@download/IGVFFI7217ZMJZ.gtf.gz # Homo sapiens GRCh38 GENCODE v43 genome

@@ -837,6 +838,7 @@ complex to term:
 gene to term:
   collection: genes_biosamples
   datafiles:
+    - waiting for original file \/ (needs matrix transposition) to be updated in the portal
     - https://api.data.igvf.org/reference-files/IGVFFI1910IIEX/@@download/IGVFFI1910IIEX.csv.gz # CRISPRGeneDependency.csv

 gene mouse:
   collection: mm_genes
   params:
     - gene_alias_file_path
   pypy3: true
   datafiles:
+    - waiting for file s3://igvf-catalog-datasets/gencode/Mus_musculus.gene_info.gz to be in the portal
     - https://api.data.igvf.org/reference-files/IGVFFI9744VSJF/@@download/IGVFFI9744VSJF.gtf.gz # gencode.vM32.chr_patch_hapl_scaff.annotation.gtf.gz

+# Example: python3 data_loader.py --adapter gencode_transcripts --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/IGVFFI9744VSJF.gtf --output-bucket-key mm_transcripts/mm_transcripts_IGVFFI9744VSJF.jsonl --label mm_gencode_transcript
 transcript mouse:
   collection: mm_transcripts
-  datafiles: []
+  params:
+    - label
+  pypy3: false
+  command: python3 data_loader.py --adapter gencode_transcripts --output-bucket igvf-catalog-parsed-collections --filepath {datafile} --output-bucket-key mm_transcripts/mm_transcripts_IGVFFI9744VSJF.jsonl --label mm_gencode_transcript
+  datafiles:
+    - https://api.data.igvf.org/reference-files/IGVFFI9744VSJF/@@download/IGVFFI9744VSJF.gtf.gz # gencode.vM32.chr_patch_hapl_scaff.annotation.gtf.gz

+# Example: python3 data_loader.py --adapter gencode_structure --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/IGVFFI9744VSJF.gtf --output-bucket-key mm_genes_structure/mm_genes_structure_IGVFFI9744VSJF.vcf.jsonl --label mm_gene_structure
 mouse gene structure:
   collection: mm_genes_structure
-  datafiles: []
+  pypy3: false
+  params:
+    - label
+  command: python3 data_loader.py --adapter gencode_structure --output-bucket igvf-catalog-parsed-collections --filepath {datafile} --output-bucket-key mm_genes_structure/mm_genes_structure_IGVFFI9744VSJF.vcf.jsonl --label mm_gene_structure
+  datafiles:
+    - https://api.data.igvf.org/reference-files/IGVFFI9744VSJF/@@download/IGVFFI9744VSJF.gtf.gz # gencode.vM32.chr_patch_hapl_scaff.annotation.gtf.gz

 # Example:
 #   - python3 data_loader.py --adapter mouse_variant --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/IGVFFI4287QQKV.vcf --output-bucket-key mm_variants/mm_variants_IGVFFI4287QQKV.jsonl

From b9bc0749b6025604d7cf756d5196e7049d21445e Mon Sep 17 00:00:00 2001
From: Pedro Assis
Date: Fri, 15 Nov 2024 14:53:04 -1000
Subject: [PATCH 15/20] updating genes_biosamples

---
 data/data_sources.yaml | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/data/data_sources.yaml b/data/data_sources.yaml
index 44295c74..7944e204 100644
--- a/data/data_sources.yaml
+++ b/data/data_sources.yaml
@@ -835,11 +835,13 @@ complex to term:
   datafiles:
     - https://api.data.igvf.org/reference-files/IGVFFI1444TRQL/@@download/IGVFFI1444TRQL.tsv.gz # EBI_complex_proteins_9606.tsv.gz

+# Example: python3 data_loader.py --adapter gene_term --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/IGVFFI8863BMFF.csv --output-bucket-key genes_biosamples/genes_biosamples_IGVFFI8863BMFF.jsonl
 gene to term:
   collection: genes_biosamples
+  command: python3 data_loader.py --adapter gene_term --output-bucket igvf-catalog-parsed-collections --filepath {datafile} --output-bucket-key genes_biosamples/genes_biosamples_IGVFFI8863BMFF.jsonl
+  pypy3: false
   datafiles:
-    - waiting for original file \/ (needs matrix transposition) to be updated in the portal
-    - https://api.data.igvf.org/reference-files/IGVFFI1910IIEX/@@download/IGVFFI1910IIEX.csv.gz # CRISPRGeneDependency.csv
+    - https://api.data.igvf.org/reference-files/IGVFFI8863BMFF/@@download/IGVFFI8863BMFF.csv.gz # CRISPRGeneDependency_transposed.csv

 gene mouse:
@@ -890,6 +892,5 @@ human mouse genes orthology:
     - https://api.data.igvf.org/reference-files/IGVFFI9177QQPS/@@download/IGVFFI9177QQPS.txt.gz # HOM_MouseHumanSequence.txt.gz

 # Unreleased datafiles download: curl -L -u {accesskey}:{secretkey} {url} -o {datafile}
-# example: curl -L -u MSJMF5G6:ey4ac7yummfuqqk6 https://api.data.igvf.org/reference-files/IGVFFI0989JYUQ/@@download/IGVFFI0989JYUQ.vcf.gz -o IGVFFI0989JYUQ.vcf.gz

 # Example: python3 data_loader.py --adapter coding_variants --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/chr1.vcf --output-bucket-key coding_variants/chr1.jsonl --collection coding_variants
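The CRISPRGeneDependency_transposed.csv file referenced above replaces the raw matrix that the earlier placeholder flagged as "needs matrix transposition". The transposed file was prepared upstream before being posted to the portal; a sketch of that preprocessing for a rectangular matrix that fits in memory (file names are illustrative, not the actual pipeline code):

    import csv

    def transpose_csv(src, dst):
        # Read the full matrix, then write it back with rows and columns swapped.
        with open(src, newline='') as fh:
            rows = list(csv.reader(fh))
        with open(dst, 'w', newline='') as fh:
            csv.writer(fh).writerows(zip(*rows))

    # transpose_csv('CRISPRGeneDependency.csv', 'CRISPRGeneDependency_transposed.csv')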
From 50d13647e6d7010163090d58ed5c79c2c1d2e4c1 Mon Sep 17 00:00:00 2001
From: Pedro Assis
Date: Mon, 18 Nov 2024 10:18:28 -1000
Subject: [PATCH 16/20] adding mm_genes related collections

---
 data/data_sources.yaml | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/data/data_sources.yaml b/data/data_sources.yaml
index 7944e204..ea5f742d 100644
--- a/data/data_sources.yaml
+++ b/data/data_sources.yaml
@@ -381,9 +381,15 @@ transcript contains gene structure:
   datafiles:
     - https://api.data.igvf.org/reference-files/IGVFFI7217ZMJZ/@@download/IGVFFI7217ZMJZ.gtf.gz # Homo sapiens GRCh38 GENCODE v43 genome

+# Example: python3 data_loader.py --adapter gencode_structure --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/IGVFFI9744VSJF.gtf --output-bucket-key mm_transcripts_mm_genes_structure/mm_transcripts_mm_genes_structures_IGVFFI9744VSJF.jsonl --label mm_transcript_contains_mm_gene_structure
 mouse transcript contains mouse gene structure:
   collection: mm_transcripts_mm_genes_structure
-  datafiles: []
+  params:
+    - label
+  pypy3: false
+  command: python3 data_loader.py --adapter gencode_structure --output-bucket igvf-catalog-parsed-collections --filepath {datafile} --output-bucket-key mm_transcripts_mm_genes_structure/mm_transcripts_mm_genes_structures_IGVFFI9744VSJF.jsonl --label mm_transcript_contains_mm_gene_structure
+  datafiles:
+    - https://api.data.igvf.org/reference-files/IGVFFI9744VSJF/@@download/IGVFFI9744VSJF.gtf.gz # gencode.vM32.chr_patch_hapl_scaff.annotation.gtf.gz

 # Example: python3 data_loader.py --adapter ontology --output-bucket igvf-catalog-parsed-collections --filepath vario.owl --ontology vario
 # for cellosaurus: python3 data_loader.py --adapter cellosaurus_terms --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/cellosaurus.obo --type node --output-bucket-key ontology_terms/cellosaurus.jsonl
@@ -843,13 +849,16 @@ gene to term:
   datafiles:
     - https://api.data.igvf.org/reference-files/IGVFFI8863BMFF/@@download/IGVFFI8863BMFF.csv.gz # CRISPRGeneDependency_transposed.csv

+# Example: python3 data_loader.py --adapter gencode_genes --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/IGVFFI9744VSJF.gtf --output-bucket-key mm_genes/mm_genes_IGVFFI9744VSJF.jsonl --label mm_gencode_gene --gene-alias-file-path ~/dataset/IGVFFI3741KWFZ.tsv.gz
 gene mouse:
   collection: mm_genes
   params:
+    - label
     - gene_alias_file_path
   pypy3: true
+  command: python3 data_loader.py --adapter gencode_genes --output-bucket igvf-catalog-parsed-collections --filepath {datafile} --output-bucket-key mm_genes/mm_genes_IGVFFI9744VSJF.jsonl --label mm_gencode_gene --gene-alias-file-path {gene_alias_datafile}
   datafiles:
-    - waiting for file s3://igvf-catalog-datasets/gencode/Mus_musculus.gene_info.gz to be in the portal
+    - https://api.data.igvf.org/reference-files/IGVFFI3741KWFZ/@@download/IGVFFI3741KWFZ.tsv.gz # Mus_musculus.gene_info.gz
     - https://api.data.igvf.org/reference-files/IGVFFI9744VSJF/@@download/IGVFFI9744VSJF.gtf.gz # gencode.vM32.chr_patch_hapl_scaff.annotation.gtf.gz

From 7b611ae2819428820fddb8b30d7fd430deea2a6f Mon Sep 17 00:00:00 2001
From: Pedro Assis
Date: Mon, 18 Nov 2024 11:35:25 -1000
Subject: [PATCH 17/20] converting adapter to writer method

---
 .../VAMP_coding_variant_scores_adapter.py | 39 +++++--------------
 1 file changed, 10 insertions(+), 29 deletions(-)

diff --git a/data/adapters/VAMP_coding_variant_scores_adapter.py b/data/adapters/VAMP_coding_variant_scores_adapter.py
index e9d338d2..ffecd306 100644
--- a/data/adapters/VAMP_coding_variant_scores_adapter.py
+++ b/data/adapters/VAMP_coding_variant_scores_adapter.py
@@ -1,10 +1,9 @@
 import csv
 import json
-import os
-
 import pickle
-from adapters import Adapter
-from db.arango_db import ArangoDB
+
+from typing import Optional
+from adapters.writer import Writer

 # Example line from file from CYP2C19 VAMP-seq (IGVFFI5890AHYL):
 # variant abundance_score abundance_sd abundance_se ci_upper ci_lower abundance_Rep1 abundance_Rep2 abundance_Rep3
@@ -12,7 +11,7 @@
 # ENSP00000360372.3:p.Ala103Asp 0.5857497278869870 0.0603323988117348 0.0348329266948109 0.6197118314144270 0.5517876243595460 0.5265040329858070 0.647113071129789 0.5836320795453640

-class VAMPAdapter(Adapter):
+class VAMPAdapter:
     ALLOWED_LABELS = ['vamp_coding_variants_phenotypes']
     SOURCE = 'VAMP-seq'
     SOURCE_URL = 'https://data.igvf.org/analysis-sets/IGVFDS0368ZLPX/'
@@ -22,24 +21,16 @@ class VAMPAdapter(Adapter):
     OUTPUT_PATH = './parsed-data'

-    def __init__(self, filepath, label='vamp_coding_variants_phenotypes', dry_run=True):
+    def __init__(self, filepath, label='vamp_coding_variants_phenotypes', writer: Optional[Writer] = None, **kwargs):
         if label not in VAMPAdapter.ALLOWED_LABELS:
             raise ValueError('Invalid label. Allowed values: ' +
                              ','.join(VAMPAdapter.ALLOWED_LABELS))

         self.filepath = filepath
-        self.label = label
-        self.dataset = label
-        self.type = 'edge'
-        self.dry_run = dry_run
-        self.output_filepath = '{}/{}.json'.format(
-            self.OUTPUT_PATH,
-            self.dataset
-        )
-        super().__init__()
+        self.writer = writer

     def process_file(self):
-        parsed_data_file = open(self.output_filepath, 'w')
+        self.writer.open()
         self.load_coding_variant_id()

         with open(self.filepath, 'r') as vamp_file:
@@ -68,22 +59,12 @@ def process_file(self):
                 'source_url': VAMPAdapter.SOURCE_URL
             }

-            json.dump(_props, parsed_data_file)
-            parsed_data_file.write('\n')
+            self.writer.write(json.dumps(_props))
+            self.writer.write('\n')

-        parsed_data_file.close()
-        self.save_to_arango()
+        self.writer.close()

     def load_coding_variant_id(self):
         self.coding_variant_id = {}
         with open(VAMPAdapter.CODING_VARIANTS_MAPPING_PATH, 'rb') as coding_variant_id_file:
             self.coding_variant_id = pickle.load(coding_variant_id_file)
-
-    def save_to_arango(self):
-        if self.dry_run:
-            print(self.arangodb()[0])
-        else:
-            os.system(self.arangodb()[0])
-
-    def arangodb(self):
-        return ArangoDB().generate_json_import_statement(self.output_filepath, self.collection, type=self.type)
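With this change the adapter no longer writes and imports ArangoDB files itself; it emits JSONL through an injected Writer, which is what lets the test suite substitute an in-memory spy. A minimal sketch of that test double (the real SpyWriter helper used by the tests may differ in detail, e.g. it might json.loads each payload):

    class SpyWriter:
        # In-memory stand-in for adapters.writer.Writer: records every chunk
        # passed to write() instead of touching disk or a bucket.
        def __init__(self):
            self.contents = []

        def open(self):
            pass

        def write(self, chunk):
            # adapters call write() twice per record: the JSON payload, then '\n'
            if chunk != '\n':
                self.contents.append(chunk)

        def close(self):
            pass

    # usage sketch: adapter = VAMPAdapter(filepath, writer=SpyWriter()); adapter.process_file()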
From b2e62bc2eedecc85513d07d4f338a7fdf312db14 Mon Sep 17 00:00:00 2001
From: Pedro Assis
Date: Mon, 18 Nov 2024 11:35:43 -1000
Subject: [PATCH 18/20] adding coding_variants_phenotypes

---
 data/data_sources.yaml | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/data/data_sources.yaml b/data/data_sources.yaml
index ea5f742d..559690e6 100644
--- a/data/data_sources.yaml
+++ b/data/data_sources.yaml
@@ -1003,3 +1003,13 @@ variants to coding variant:
     - https://api.data.igvf.org/reference-files/IGVFFI8283ZCLN/@@download/IGVFFI8283ZCLN.vcf.gz #dbNSFP4.5a_variant.chrX.gz
     - https://api.data.igvf.org/reference-files/IGVFFI8477SBHR/@@download/IGVFFI8477SBHR.vcf.gz #dbNSFP4.5a_variant.chrY.gz
     - https://api.data.igvf.org/reference-files/IGVFFI7595PQMP/@@download/IGVFFI7595PQMP.vcf.gz #dbNSFP4.5a_variant.chrM.gz
+
+# Example: pypy3 data_loader.py --adapter vamp_coding_variant_phenotype --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/IGVFFI5890AHYL.csv --output-bucket-key coding_variants_phenotypes/coding_variants_phenotypes_IGVFFI5890AHYL.jsonl --label vamp_coding_variants_phenotypes
+coding variant to phenotype:
+  collection: coding_variants_phenotypes
+  params:
+    - label
+  pypy3: true
+  command: pypy3 data_loader.py --adapter vamp_coding_variant_phenotype --output-bucket igvf-catalog-parsed-collections --filepath {datafile} --output-bucket-key coding_variants_phenotypes/coding_variants_phenotypes_IGVFFI5890AHYL.jsonl --label vamp_coding_variants_phenotypes
+  datafiles:
+    - https://api.data.igvf.org/reference-files/IGVFFI5890AHYL/@@download/IGVFFI5890AHYL.csv.gz # CYP2C19_DMS_scores.csv.gz

From dc1966a9934483c6ed829653f695f7f43f218c51 Mon Sep 17 00:00:00 2001
From: Pedro Assis
Date: Mon, 18 Nov 2024 13:10:09 -1000
Subject: [PATCH 19/20] fix specs

---
 data/tests/test_coexpresdb_adapter.py           | 6 +++---
 data/tests/test_depmap_adapter.py               | 4 ----
 data/tests/test_gaf_adapter.py                  | 2 +-
 .../test_proteins_interaction_adapter.py        | 2 --
 4 files changed, 4 insertions(+), 10 deletions(-)

diff --git a/data/tests/test_coexpresdb_adapter.py b/data/tests/test_coexpresdb_adapter.py
index c361747e..382036db 100644
--- a/data/tests/test_coexpresdb_adapter.py
+++ b/data/tests/test_coexpresdb_adapter.py
@@ -6,7 +6,7 @@ def test_coxpresdb_adapter():
     writer = SpyWriter()
-    adapter = Coxpresdb(filepath='./samples/coxpresdb/1', writer=writer)
+    adapter = Coxpresdb(filepath='./samples/coxpresdb/', writer=writer)
     adapter.process_file()
     assert len(writer.contents) > 0
@@ -25,7 +25,7 @@ def test_coxpresdb_adapter_z_score_filter():
     writer = SpyWriter()
-    adapter = Coxpresdb(filepath='./samples/coxpresdb/1', writer=writer)
+    adapter = Coxpresdb(filepath='./samples/coxpresdb/', writer=writer)
     adapter.process_file()

     for item in writer.contents:
@@ -36,7 +36,7 @@ def test_coxpresdb_adapter_initialization():
     adapter = Coxpresdb(filepath='foobarbaz')
-    assert adapter.file_path == 'foobarbaz'
+    assert adapter.filepath == 'foobarbaz'
     assert adapter.dataset == 'coxpresdb'
     assert adapter.label == 'coxpresdb'
     assert adapter.source == 'CoXPresdb'
diff --git a/data/tests/test_depmap_adapter.py b/data/tests/test_depmap_adapter.py
index c6031500..e7d1749f 100644
--- a/data/tests/test_depmap_adapter.py
+++ b/data/tests/test_depmap_adapter.py
@@ -41,10 +41,6 @@ def test_depmap_adapter_initialization():
         label='depmap'
     )
     assert adapter.filepath == './samples/DepMap/CRISPRGeneDependency_transposed_example.csv'
-    assert adapter.type == 'edge'
-    assert adapter.label == 'depmap'
-    assert adapter.dataset == 'depmap'
-    assert adapter.dry_run == True
     assert adapter.writer is None, 'Writer should be None by default.'
diff --git a/data/tests/test_gaf_adapter.py b/data/tests/test_gaf_adapter.py
index 45e12792..85616fe8 100644
--- a/data/tests/test_gaf_adapter.py
+++ b/data/tests/test_gaf_adapter.py
@@ -51,7 +51,7 @@ def test_gaf_adapter_rna():

 def test_gaf_adapter_invalid_type():
     writer = SpyWriter()
-    with pytest.raises(ValueError, match='Invalid type. Allowed values: human, human_isoform, mouse, rna, rnacentral'):
+    with pytest.raises(ValueError, match='Invalid type. Allowed values: human, human_isoform, mouse, rna'):
         GAF(filepath='./samples/goa_human_sample.gaf.gz',
             gaf_type='invalid_type', writer=writer)
diff --git a/data/tests/test_proteins_interaction_adapter.py b/data/tests/test_proteins_interaction_adapter.py
index 8570c029..3437e44f 100644
--- a/data/tests/test_proteins_interaction_adapter.py
+++ b/data/tests/test_proteins_interaction_adapter.py
@@ -43,8 +43,6 @@ def test_proteins_interaction_adapter_initialization(filepath, spy_writer):
     adapter = ProteinsInteraction(
         filepath=filepath, label='edge', writer=spy_writer)
     assert adapter.filepath == filepath
-    assert adapter.label == 'edge'
-    assert adapter.dataset == 'edge'
     assert adapter.type == 'edge'
     assert adapter.organism == 'Homo sapiens'

From 73e66bffa606e303bc2b6bb183a8156d7bac5624 Mon Sep 17 00:00:00 2001
From: Pedro Assis
Date: Tue, 19 Nov 2024 07:54:36 -1000
Subject: [PATCH 20/20] removing unused source

---
 data/data_sources.yaml | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/data/data_sources.yaml b/data/data_sources.yaml
index 559690e6..4e22584a 100644
--- a/data/data_sources.yaml
+++ b/data/data_sources.yaml
@@ -353,15 +353,6 @@ transcribed to:
   datafiles:
     - https://api.data.igvf.org/reference-files/IGVFFI7217ZMJZ/@@download/IGVFFI7217ZMJZ.gtf.gz # Homo sapiens GRCh38 GENCODE v43 genome

-# Example: pypy3 data_loader.py --adapter gencode_transcripts --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/IGVFFI7217ZMJZ.gtf --output-bucket-key genes_transcripts/transcripts_genes_IGVFFI7217ZMJZ.vcf.jsonl --label transcribed_from
-transcribed from:
-  collection: genes_transcripts
-  params:
-    - label
-  pypy3: true
-  datafiles:
-    - https://api.data.igvf.org/reference-files/IGVFFI7217ZMJZ/@@download/IGVFFI7217ZMJZ.gtf.gz # Homo sapiens GRCh38 GENCODE v43 genome
-
 # Example: python3 data_loader.py --adapter gencode_structure --output-bucket igvf-catalog-parsed-collections --filepath ~/dataset/IGVFFI7217ZMJZ.gtf --output-bucket-key genes_structure/gencode_IGVFFI7217ZMJZ.vcf.jsonl --label gene_structure
 gene structure:
   collection: genes_structure