From dc313d698621ae65d2d321360f33b0413ca12995 Mon Sep 17 00:00:00 2001 From: james hadfield Date: Tue, 3 Dec 2024 12:31:40 +1300 Subject: [PATCH] WIP A/H7N6 Very ad-hoc / temporary approach I used to generate FASTA files which I could then use as inputs for avian-flu builds. Avoids any use of the rethink db. See for the corresponding avian-flu work. --- vdb/avian_flu_upload.py | 45 +++++++++++++++++++++++++++++++++++++---- vdb/parse.py | 8 ++++---- vdb/upload.py | 4 +++- 3 files changed, 48 insertions(+), 9 deletions(-) diff --git a/vdb/avian_flu_upload.py b/vdb/avian_flu_upload.py index b67b7e9f..476e24af 100755 --- a/vdb/avian_flu_upload.py +++ b/vdb/avian_flu_upload.py @@ -41,6 +41,7 @@ def __init__(self, **kwargs): ('a / h7n1', ''): ('a', 'h7n1', None), ('a / h7n2', ''): ('a', 'h7n2', None), ('a / h7n3', ''): ('a', 'h7n3', None), + ('a / h7n6', ''): ('a', 'h7n6', None), ('a / h7n7', ''): ('a', 'h7n7', None), ('a / h7n9', ''): ('a', 'h7n9', None), ('a / h9n2', ''): ('a', 'h9n2', None), @@ -52,6 +53,7 @@ def __init__(self, **kwargs): ('b', 'victoria'): ('b', None, 'seasonal_vic'), ('b', 'yamagata'): ('b', None, 'seasonal_yam'), ('h5n1',''): ('a', 'h5n1', None), + ('h7n6',''): ('a', 'h7n6', None), ('h7n9',''): ('a', 'h7n9', None), ('h9n2',''): ('a', 'h9n2', None)} self.outgroups = {lineage: SeqIO.read('source-data/'+lineage+'_outgroup.gb', 'genbank') for lineage in ['H3N2', 'H1N1pdm', 'Vic', 'Yam']} @@ -215,7 +217,7 @@ def format_viruses(self, documents, data_source, **kwargs): doc['location'] = self.fix_location[doc['strain']] self.format_place(doc, determine_location=True) self.format_region(doc) - self.rethink_io.check_optional_attributes(doc, []) + # self.rethink_io.check_optional_attributes(doc, []) def format_sequences(self, documents, **kwargs): ''' @@ -232,7 +234,7 @@ def format_sequences(self, documents, **kwargs): self.format_passage(doc, 'passage', 'passage_category') self.format_passage(doc, 'virus_strain_passage', 'virus_strain_passage_category') #BP self.format_passage(doc, 'serum_antigen_passage', 'serum_antigen_passage_category') #BP - self.rethink_io.check_optional_attributes(doc, []) + # self.rethink_io.check_optional_attributes(doc, []) self.fix_casing(doc, args.data_source) print("Names that need to be fixed") for name in sorted(self.fix): @@ -675,7 +677,7 @@ def determine_group_fields(self, v, patterns, **kwargs): ('gisaid_location', 'Location'), ('originating_lab', 'Originating_Lab'), ('Host_Age', 'Host_Age'), ('Host_Age_Unit', 'Host_Age_Unit'), ('gender', 'Host_Gender'), ('submission_date', 'Submission_Date'), ('submitting_lab', 'Submitting_Lab'), ('authors','Authors'), ('domestic_status','Domestic_Status'), - ('PMID','PMID'), ('animal_health_status','Animal_Health_Status'), ('gisaid_clade','Clade')] + ('PMID','PMID'), ('animal_health_status','Animal_Health_Status'), ('gisaid_clade','Clade'), ('pathogenicity', 'Pathogenicity')] setattr(args, 'xls_fields_wanted', xls_fields_wanted) elif (args.data_source == 'ird'): virus_fasta_fields = {0:'strain', 4: 'vtype', 5: 'Subtype', 6:'collection_date', 8:'country', 10: 'host', 11:'h5_clade'} @@ -689,4 +691,39 @@ def determine_group_fields(self, v, patterns, **kwargs): if not os.path.isdir(args.path): os.makedirs(args.path) connVDB = flu_upload(**args.__dict__) - connVDB.upload(**args.__dict__) + (viruses, sequences) = connVDB.upload(**args.__dict__) + + # sequences are an array of {'accession': 'EPI1895707', 'strain': 'A/ruddyturnstone/DelawareBay/281/2020', 'isolate_id': 'EP, ... + # viruses are an array of {'strain': 'A/quail/Aichi/6/2009', ... 'sequences': ['EPI266264', 'EPI266265', 'EPI266266', 'EPI266267', 'EPI266268', 'EPI266269', 'EPI266270', 'EPI266271'], etc + sequences_by_accession = {s['accession']: s for s in sequences} # may override if dups + + handles = {locus: open(f"data/{locus}.fasta", 'w') for locus in {s['locus'] for s in sequences}} + print(f"Opened file handles for {', '.join(handles.keys())} loci") + + # Looking at avian-flu / ingest we want the following '|' separated FASTA header fields: + header = ['strain', 'virus', 'accession', 'collection_date', 'region', 'country', 'division', 'location', 'host', 'domestic_status', 'subtype', 'originating_lab', 'submitting_lab', 'authors', 'PMID', 'gisaid_clade', 'h5_clade', 'pathogenicity'] + + from collections import defaultdict + seen = defaultdict(set) + + for virus in viruses: + strain = virus['strain'] + for accession in virus['sequences']: + sequence = sequences_by_accession.get(accession, None) + if sequence is None: + print(f"WARNING: missing accession {accession} for virus {virus['strain']}") + continue + locus = sequence['locus'] + if locus in seen[strain]: + print(f"WARNING: skipping _virus_ {strain} _segment_ {locus} as already seen!") + continue + else: + seen[strain].add(locus) + fields = [str(virus[field]) if (field in virus and virus[field] is not None) else '?' for field in header] + fields[header.index('accession')] = accession + assert virus['subtype']=='h7n6' + handle = handles[sequence['locus']] + handle.write(">"+'|'.join(fields)+'\n') + handle.write(sequence['sequence'] + "\n") + + print("\n\nmkdir -p ../avian-flu/ingest/fauna/data\ncp data/*.fasta ../avian-flu/ingest/fauna/data/") diff --git a/vdb/parse.py b/vdb/parse.py index 9ea66555..6f27908e 100644 --- a/vdb/parse.py +++ b/vdb/parse.py @@ -170,8 +170,8 @@ def add_virus_fields(self, v, host, country, **kwargs): else : v['country'] = country v['virus'] = self.virus - v['timestamp'] = self.rethink_io.get_upload_timestamp() - v['virus_inclusion_date'] = self.rethink_io.get_upload_date() + # v['timestamp'] = self.rethink_io.get_upload_timestamp() + # v['virus_inclusion_date'] = self.rethink_io.get_upload_date() v['sequences'] = [] v['number_sequences'] = 0 return v @@ -208,8 +208,8 @@ def add_sequence_fields(self, v, locus, authors, title, source, url, public=True if 'public' not in v and public is not None: v['public'] = public v['virus'] = self.virus - v['timestamp'] = self.rethink_io.get_upload_timestamp() - v['sequence_inclusion_date'] = self.rethink_io.get_upload_date() + # v['timestamp'] = self.rethink_io.get_upload_timestamp() + # v['sequence_inclusion_date'] = self.rethink_io.get_upload_date() return v def get_GIs(self, accessions, n_entrez=2500, **kwargs): diff --git a/vdb/upload.py b/vdb/upload.py index 66b6e94f..e9e9dbb3 100644 --- a/vdb/upload.py +++ b/vdb/upload.py @@ -55,7 +55,7 @@ def upload(self, preview=False, **kwargs): ''' format virus information, then upload to database ''' - self.connect(**kwargs) + # self.connect(**kwargs) print("Uploading Viruses to VDB") viruses, sequences = self.parse(**kwargs) print('Formatting documents for upload') @@ -75,6 +75,7 @@ def upload(self, preview=False, **kwargs): #self.transfer_fields(viruses, sequences, self.virus_to_sequence_transfer_fields) print("") print("Upload Step") + preview = True # avoid uploading at all costs! if not preview: print("Uploading viruses to " + self.database + "." + self.viruses_table) self.upload_documents(self.viruses_table, viruses, index='strain', **kwargs) @@ -87,6 +88,7 @@ def upload(self, preview=False, **kwargs): print(json.dumps(sequences[0], indent=1)) print("Remove \"--preview\" to upload documents") print("Printed preview of viruses to be uploaded to make sure fields make sense") + return (viruses, sequences) def connect(self, **kwargs): if self.database not in self.uploadable_databases: