diff --git a/micall/blast_db/make_blast_db.py b/micall/blast_db/make_blast_db.py index bf4b55718..5c58a6f32 100644 --- a/micall/blast_db/make_blast_db.py +++ b/micall/blast_db/make_blast_db.py @@ -29,6 +29,9 @@ def make_blast_db(projects_json, refs_fasta): for name, region in projects['regions'].items(): if region['seed_group'] is None: continue + if region['seed_group'] == 'HIVGHA-seed': + # Exclude this project, because they're recombinant. + continue if name == 'HIV1-CON-XX-Consensus-seed': # Only used by G2P alignment. continue diff --git a/micall/drivers/sample.py b/micall/drivers/sample.py index 04b3da5f8..e5ac76f3e 100644 --- a/micall/drivers/sample.py +++ b/micall/drivers/sample.py @@ -12,6 +12,7 @@ from micall.core.coverage_plots import coverage_plot from micall.core.plot_contigs import plot_genome_coverage from micall.core.prelim_map import prelim_map +from micall.core.project_config import ProjectConfig from micall.core.remap import remap, map_to_contigs from micall.core.sam2aln import sam2aln from micall.core.trim_fastqs import trim @@ -22,6 +23,16 @@ logger = logging.getLogger(__name__) +def exclude_extra_seeds(excluded_seeds: typing.Sequence[str], + project_code: str = None) -> typing.Sequence[str]: + if project_code == 'HIVGHA': + return excluded_seeds + projects = ProjectConfig.loadDefault() + hivgha_seeds = projects.getProjectSeeds('HIVGHA') + hiv_seeds = projects.getProjectSeeds('HIV') + return sorted((hivgha_seeds - hiv_seeds) | set(excluded_seeds)) + + class Sample: def __init__(self, basespace_id=None, @@ -126,6 +137,8 @@ def process(self, use_gzip = force_gzip or self.fastq1.endswith('.gz') sample_info = self.load_sample_info() + excluded_seeds = exclude_extra_seeds(excluded_seeds, + sample_info.get('project')) with open(self.read_summary_csv, 'w') as read_summary: trim((self.fastq1, self.fastq2), diff --git a/micall/project_scoring.json b/micall/project_scoring.json index 2c997a34c..4c0bd69e7 100644 --- a/micall/project_scoring.json +++ b/micall/project_scoring.json @@ -6106,7 +6106,9 @@ "HIV1-O-SN-AJ302646-seed", "HIV1-O-US-JN571034-seed", "HIV1-P-CM-HQ179987-seed", - "HIV1-P-FR-GU111555-seed" + "HIV1-P-FR-GU111555-seed", + "HIV1-CRF02_AG-GH-AB286855-seed", + "HIV1-CRF06_CPX-GH-AB286851-seed" ] }, { @@ -6237,7 +6239,9 @@ "HIV1-O-SN-AJ302646-seed", "HIV1-O-US-JN571034-seed", "HIV1-P-CM-HQ179987-seed", - "HIV1-P-FR-GU111555-seed" + "HIV1-P-FR-GU111555-seed", + "HIV1-CRF02_AG-GH-AB286855-seed", + "HIV1-CRF06_CPX-GH-AB286851-seed" ] }, { @@ -6359,7 +6363,9 @@ "HIV1-O-SN-AJ302646-seed", "HIV1-O-US-JN571034-seed", "HIV1-P-CM-HQ179987-seed", - "HIV1-P-FR-GU111555-seed" + "HIV1-P-FR-GU111555-seed", + "HIV1-CRF02_AG-GH-AB286855-seed", + "HIV1-CRF06_CPX-GH-AB286851-seed" ] }, { @@ -6481,7 +6487,9 @@ "HIV1-O-SN-AJ302646-seed", "HIV1-O-US-JN571034-seed", "HIV1-P-CM-HQ179987-seed", - "HIV1-P-FR-GU111555-seed" + "HIV1-P-FR-GU111555-seed", + "HIV1-CRF02_AG-GH-AB286855-seed", + "HIV1-CRF06_CPX-GH-AB286851-seed" ] }, { @@ -6603,7 +6611,9 @@ "HIV1-O-SN-AJ302646-seed", "HIV1-O-US-JN571034-seed", "HIV1-P-CM-HQ179987-seed", - "HIV1-P-FR-GU111555-seed" + "HIV1-P-FR-GU111555-seed", + "HIV1-CRF02_AG-GH-AB286855-seed", + "HIV1-CRF06_CPX-GH-AB286851-seed" ] }, { @@ -6725,7 +6735,9 @@ "HIV1-O-SN-AJ302646-seed", "HIV1-O-US-JN571034-seed", "HIV1-P-CM-HQ179987-seed", - "HIV1-P-FR-GU111555-seed" + "HIV1-P-FR-GU111555-seed", + "HIV1-CRF02_AG-GH-AB286855-seed", + "HIV1-CRF06_CPX-GH-AB286851-seed" ] }, { @@ -6847,7 +6859,9 @@ "HIV1-O-SN-AJ302646-seed", "HIV1-O-US-JN571034-seed", "HIV1-P-CM-HQ179987-seed", - "HIV1-P-FR-GU111555-seed" + "HIV1-P-FR-GU111555-seed", + "HIV1-CRF02_AG-GH-AB286855-seed", + "HIV1-CRF06_CPX-GH-AB286851-seed" ] }, { @@ -6969,7 +6983,9 @@ "HIV1-O-SN-AJ302646-seed", "HIV1-O-US-JN571034-seed", "HIV1-P-CM-HQ179987-seed", - "HIV1-P-FR-GU111555-seed" + "HIV1-P-FR-GU111555-seed", + "HIV1-CRF02_AG-GH-AB286855-seed", + "HIV1-CRF06_CPX-GH-AB286851-seed" ] }, { @@ -7168,7 +7184,9 @@ "HIV1-O-SN-AJ302646-seed", "HIV1-O-US-JN571034-seed", "HIV1-P-CM-HQ179987-seed", - "HIV1-P-FR-GU111555-seed" + "HIV1-P-FR-GU111555-seed", + "HIV1-CRF02_AG-GH-AB286855-seed", + "HIV1-CRF06_CPX-GH-AB286851-seed" ] }, { @@ -7347,7 +7365,9 @@ "HIV1-O-SN-AJ302646-seed", "HIV1-O-US-JN571034-seed", "HIV1-P-CM-HQ179987-seed", - "HIV1-P-FR-GU111555-seed" + "HIV1-P-FR-GU111555-seed", + "HIV1-CRF02_AG-GH-AB286855-seed", + "HIV1-CRF06_CPX-GH-AB286851-seed" ] }, { @@ -7582,7 +7602,9 @@ "HIV1-O-SN-AJ302646-seed", "HIV1-O-US-JN571034-seed", "HIV1-P-CM-HQ179987-seed", - "HIV1-P-FR-GU111555-seed" + "HIV1-P-FR-GU111555-seed", + "HIV1-CRF02_AG-GH-AB286855-seed", + "HIV1-CRF06_CPX-GH-AB286851-seed" ] }, { @@ -7704,7 +7726,9 @@ "HIV1-O-SN-AJ302646-seed", "HIV1-O-US-JN571034-seed", "HIV1-P-CM-HQ179987-seed", - "HIV1-P-FR-GU111555-seed" + "HIV1-P-FR-GU111555-seed", + "HIV1-CRF02_AG-GH-AB286855-seed", + "HIV1-CRF06_CPX-GH-AB286851-seed" ] } ] diff --git a/micall/projects.json b/micall/projects.json index 9a0ce1fba..a4d1d7247 100644 --- a/micall/projects.json +++ b/micall/projects.json @@ -6830,7 +6830,9 @@ "HIV1-O-SN-AJ302646-seed", "HIV1-O-US-JN571034-seed", "HIV1-P-CM-HQ179987-seed", - "HIV1-P-FR-GU111555-seed" + "HIV1-P-FR-GU111555-seed", + "HIV1-CRF02_AG-GH-AB286855-seed", + "HIV1-CRF06_CPX-GH-AB286851-seed" ] }, { @@ -6947,7 +6949,9 @@ "HIV1-O-SN-AJ302646-seed", "HIV1-O-US-JN571034-seed", "HIV1-P-CM-HQ179987-seed", - "HIV1-P-FR-GU111555-seed" + "HIV1-P-FR-GU111555-seed", + "HIV1-CRF02_AG-GH-AB286855-seed", + "HIV1-CRF06_CPX-GH-AB286851-seed" ] }, { @@ -7064,7 +7068,9 @@ "HIV1-O-SN-AJ302646-seed", "HIV1-O-US-JN571034-seed", "HIV1-P-CM-HQ179987-seed", - "HIV1-P-FR-GU111555-seed" + "HIV1-P-FR-GU111555-seed", + "HIV1-CRF02_AG-GH-AB286855-seed", + "HIV1-CRF06_CPX-GH-AB286851-seed" ] }, { @@ -7181,7 +7187,9 @@ "HIV1-O-SN-AJ302646-seed", "HIV1-O-US-JN571034-seed", "HIV1-P-CM-HQ179987-seed", - "HIV1-P-FR-GU111555-seed" + "HIV1-P-FR-GU111555-seed", + "HIV1-CRF02_AG-GH-AB286855-seed", + "HIV1-CRF06_CPX-GH-AB286851-seed" ] }, { @@ -7298,7 +7306,9 @@ "HIV1-O-SN-AJ302646-seed", "HIV1-O-US-JN571034-seed", "HIV1-P-CM-HQ179987-seed", - "HIV1-P-FR-GU111555-seed" + "HIV1-P-FR-GU111555-seed", + "HIV1-CRF02_AG-GH-AB286855-seed", + "HIV1-CRF06_CPX-GH-AB286851-seed" ] }, { @@ -7415,7 +7425,9 @@ "HIV1-O-SN-AJ302646-seed", "HIV1-O-US-JN571034-seed", "HIV1-P-CM-HQ179987-seed", - "HIV1-P-FR-GU111555-seed" + "HIV1-P-FR-GU111555-seed", + "HIV1-CRF02_AG-GH-AB286855-seed", + "HIV1-CRF06_CPX-GH-AB286851-seed" ] }, { @@ -7532,7 +7544,9 @@ "HIV1-O-SN-AJ302646-seed", "HIV1-O-US-JN571034-seed", "HIV1-P-CM-HQ179987-seed", - "HIV1-P-FR-GU111555-seed" + "HIV1-P-FR-GU111555-seed", + "HIV1-CRF02_AG-GH-AB286855-seed", + "HIV1-CRF06_CPX-GH-AB286851-seed" ] }, { @@ -7649,7 +7663,9 @@ "HIV1-O-SN-AJ302646-seed", "HIV1-O-US-JN571034-seed", "HIV1-P-CM-HQ179987-seed", - "HIV1-P-FR-GU111555-seed" + "HIV1-P-FR-GU111555-seed", + "HIV1-CRF02_AG-GH-AB286855-seed", + "HIV1-CRF06_CPX-GH-AB286851-seed" ] }, { @@ -7766,7 +7782,9 @@ "HIV1-O-SN-AJ302646-seed", "HIV1-O-US-JN571034-seed", "HIV1-P-CM-HQ179987-seed", - "HIV1-P-FR-GU111555-seed" + "HIV1-P-FR-GU111555-seed", + "HIV1-CRF02_AG-GH-AB286855-seed", + "HIV1-CRF06_CPX-GH-AB286851-seed" ] }, { @@ -7883,7 +7901,9 @@ "HIV1-O-SN-AJ302646-seed", "HIV1-O-US-JN571034-seed", "HIV1-P-CM-HQ179987-seed", - "HIV1-P-FR-GU111555-seed" + "HIV1-P-FR-GU111555-seed", + "HIV1-CRF02_AG-GH-AB286855-seed", + "HIV1-CRF06_CPX-GH-AB286851-seed" ] }, { @@ -8000,7 +8020,9 @@ "HIV1-O-SN-AJ302646-seed", "HIV1-O-US-JN571034-seed", "HIV1-P-CM-HQ179987-seed", - "HIV1-P-FR-GU111555-seed" + "HIV1-P-FR-GU111555-seed", + "HIV1-CRF02_AG-GH-AB286855-seed", + "HIV1-CRF06_CPX-GH-AB286851-seed" ] }, { @@ -8117,7 +8139,9 @@ "HIV1-O-SN-AJ302646-seed", "HIV1-O-US-JN571034-seed", "HIV1-P-CM-HQ179987-seed", - "HIV1-P-FR-GU111555-seed" + "HIV1-P-FR-GU111555-seed", + "HIV1-CRF02_AG-GH-AB286855-seed", + "HIV1-CRF06_CPX-GH-AB286851-seed" ] }, { @@ -8234,7 +8258,9 @@ "HIV1-O-SN-AJ302646-seed", "HIV1-O-US-JN571034-seed", "HIV1-P-CM-HQ179987-seed", - "HIV1-P-FR-GU111555-seed" + "HIV1-P-FR-GU111555-seed", + "HIV1-CRF02_AG-GH-AB286855-seed", + "HIV1-CRF06_CPX-GH-AB286851-seed" ] }, { @@ -8351,7 +8377,9 @@ "HIV1-O-SN-AJ302646-seed", "HIV1-O-US-JN571034-seed", "HIV1-P-CM-HQ179987-seed", - "HIV1-P-FR-GU111555-seed" + "HIV1-P-FR-GU111555-seed", + "HIV1-CRF02_AG-GH-AB286855-seed", + "HIV1-CRF06_CPX-GH-AB286851-seed" ] }, { @@ -8468,7 +8496,9 @@ "HIV1-O-SN-AJ302646-seed", "HIV1-O-US-JN571034-seed", "HIV1-P-CM-HQ179987-seed", - "HIV1-P-FR-GU111555-seed" + "HIV1-P-FR-GU111555-seed", + "HIV1-CRF02_AG-GH-AB286855-seed", + "HIV1-CRF06_CPX-GH-AB286851-seed" ] }, { @@ -8585,7 +8615,9 @@ "HIV1-O-SN-AJ302646-seed", "HIV1-O-US-JN571034-seed", "HIV1-P-CM-HQ179987-seed", - "HIV1-P-FR-GU111555-seed" + "HIV1-P-FR-GU111555-seed", + "HIV1-CRF02_AG-GH-AB286855-seed", + "HIV1-CRF06_CPX-GH-AB286851-seed" ] }, { @@ -8702,7 +8734,9 @@ "HIV1-O-SN-AJ302646-seed", "HIV1-O-US-JN571034-seed", "HIV1-P-CM-HQ179987-seed", - "HIV1-P-FR-GU111555-seed" + "HIV1-P-FR-GU111555-seed", + "HIV1-CRF02_AG-GH-AB286855-seed", + "HIV1-CRF06_CPX-GH-AB286851-seed" ] }, { @@ -8819,7 +8853,9 @@ "HIV1-O-SN-AJ302646-seed", "HIV1-O-US-JN571034-seed", "HIV1-P-CM-HQ179987-seed", - "HIV1-P-FR-GU111555-seed" + "HIV1-P-FR-GU111555-seed", + "HIV1-CRF02_AG-GH-AB286855-seed", + "HIV1-CRF06_CPX-GH-AB286851-seed" ] }, { @@ -8936,7 +8972,9 @@ "HIV1-O-SN-AJ302646-seed", "HIV1-O-US-JN571034-seed", "HIV1-P-CM-HQ179987-seed", - "HIV1-P-FR-GU111555-seed" + "HIV1-P-FR-GU111555-seed", + "HIV1-CRF02_AG-GH-AB286855-seed", + "HIV1-CRF06_CPX-GH-AB286851-seed" ] }, { @@ -9053,7 +9091,9 @@ "HIV1-O-SN-AJ302646-seed", "HIV1-O-US-JN571034-seed", "HIV1-P-CM-HQ179987-seed", - "HIV1-P-FR-GU111555-seed" + "HIV1-P-FR-GU111555-seed", + "HIV1-CRF02_AG-GH-AB286855-seed", + "HIV1-CRF06_CPX-GH-AB286851-seed" ] }, { @@ -9170,7 +9210,9 @@ "HIV1-O-SN-AJ302646-seed", "HIV1-O-US-JN571034-seed", "HIV1-P-CM-HQ179987-seed", - "HIV1-P-FR-GU111555-seed" + "HIV1-P-FR-GU111555-seed", + "HIV1-CRF02_AG-GH-AB286855-seed", + "HIV1-CRF06_CPX-GH-AB286851-seed" ] }, { @@ -9287,7 +9329,9 @@ "HIV1-O-SN-AJ302646-seed", "HIV1-O-US-JN571034-seed", "HIV1-P-CM-HQ179987-seed", - "HIV1-P-FR-GU111555-seed" + "HIV1-P-FR-GU111555-seed", + "HIV1-CRF02_AG-GH-AB286855-seed", + "HIV1-CRF06_CPX-GH-AB286851-seed" ] } ] @@ -23586,6 +23630,409 @@ ], "seed_group": null }, + "HIV1-CRF02_AG-GH-AB286855-seed": { + "is_nucleotide": true, + "reference": [ + "TGGAAGGGTTAATTTACTCCAAGAAAAGACAAGAGATCCTTGATCTGTGG", + "GTCTATCACACACAAGGATACTTCCCTGATTGGCAGAACTACACACCAGG", + "GCCAGGGCCTAGATTCCCACTGACCTTTGGGTGGTGCTTCAAACTAGTAC", + "CAATAGATCCAGCAACAGTAGAGGAAGCCACTGAAGGAGAGAACAGCAGT", + "TTATTACACCCCATCAGTCAACATGGAATGGAGGACGAAGACAGAGAAGT", + "GCTGGTCTGGAGATTTGACAGTTACCTGGCATTTAGACACCTAGCTAGAG", + "AAAAGCACCCGGAGTTCTACAAAGACTGTTGACACAGAACTGCTGACGGG", + "GACTTTCAGAGTTGCTGACAAGGGACTTTCCACTGGGGACTTTCCGCGGG", + "GAGGTGTGGTGTGGGAGGAGTTGGGGAGTGGCTAACCCTCAGAAGCTGCA", + "TATAAGCAGCTGCTTCTCGCCTGTACTGGGTCTCTCTTGCTAGACCAGAT", + "TTGAGCCTGGGAGCTCTCTGGCTAGCGGGGGAACCCACTGCTTAAGCCTC", + "AATAAAGCTTGCCTTGAGTGCTTCAAGTAGTGTGTGCCCGTCTGTTGTGT", + "GACTCTGGTATCTAGAGATCCCTCAGACCACTGTAGACTGTGTAAAAAAT", + "CTCTAGCAGTGGCGCCCGAACAGGGACCTGAAATTAATAGGGACTCGAAA", + "GCGAAAGTTCCAGAGAAGTTCTCTCGACGCAAGGACTCGGCTTGCTGAGG", + "TGCACGCAGCAAGAGGCGAGAGCGGCGACTGGTGAGTACGCCAATTTTTG", + "ACTAGCGGAGGCTAGAAGGAGAGAGATGGGTGCGAGAGCGTCAATATTAA", + "GTGGAGGACAATTAGATGCATGGGAGAGAATTCGGTTAAGGCCAGGGGGA", + "AAGAAAAAATATAGACTAAAACATTTAGTATGGGCAAGCAGGGAGTTGGA", + "AAGATTTGCACTTAACCCTGGCCTTTTAGAAACAGCAGGAGGATGTCAGC", + "AACTAATGGAACAGTTACAGTCAACTCTCAGGACAGGATCAGAAGAACTT", + "AAATCATTATATAATACAATAGCAACCCTTTGGTGTGTACATCAAAGGAT", + "AGAAATAAAAGACACCAAGGAAGCCTTAGATAAGTTAGAGGAAGTACAAA", + "ATAAGAGCAAACAAAAGACACAGCAGGCAGCAGCTGCCACAGGAAGTAGC", + "AGCAGTCAAAATTACCCTATAGTGCAAAATGCACAAGGGCAATGGACACA", + "TCAGGCCATGTCACCTAGGACTTTGAATGCATGGGTGAAAGCAATAGAAG", + "AAAGGGCTTTTAGCCCAGAAGTAATACCCATGTTTACAGCATTATCAGAG", + "GGGGCCACCCCACAAGATTTGAATATGATGCTAAACATAGTGGGGGGACA", + "CCAGGCAGCAATGCAGATGCTAAAAGATACCATCAATGAAGAAGCTGCAG", + "ACTGGGACAGGGCACATCCAGTGCAAGCAGGGCCTATTCCACCAGGCCAG", + "ATGAGGGAACCAAGGGGAAGTGACATAGCAGGAACTACTAGTACCCTTCA", + "AGAACAAATAGGGTGGATGACCAGCAACCCACCTATCCCAGTGGGAGAAA", + "TCTATAAAAGATGGATAGTTCTGGGATTAAATAAAATAGTAAGAATGTAT", + "AGCCCTACCAGCATTTTGGACATAAGACAAGGGCCAAAAGAACCCTTTAG", + "AGATTATGTAGATAGGTTCTTTAAAACTTTGAGAGCTGAACAATCTTCGC", + "AGGAGGTAAAAAACTGGATGACAGATACCTTGCTGGTCCAAAATGCGAAT", + "CCAGACTGTAAGTCCATTTTAAGAGCATTAGGACCAGGGGCTACTCTAGA", + "AGAAATGATGACAGCATGTCAGGGAGTGGGAGGACCTAGCCATAAAGCAA", + "GAGTTTTGGCCGAGGCAATGAGTCAAGCACAACAGTCCAACATAATGATA", + "CAGAAAGGCAATTTTAGGGGCCAGAGAACAATAAAGTGTTTCAACTGTGG", + "CAAAGAAGGACACCTAGCCAGAAATTGCAAGGCCCCTAGGAAAAGGGGCT", + "GTTGGAAATGTGGAAAGGAAGGACACCAAATGAAGGACTGCACTGAGAGA", + "CAGGCTAATTTTTTAGGGAAAATTTGGCCTTCCTACAAAGGGAGGCCAGG", + "AAATTTTCCCCAGAGCAGACCGGAACCAACAGCCCCACCAGCAGAGAGCT", + "TAGGGATGGGGGAAGAGACAACCTCCTCACCGAAGCAGGAACCGAGGGAC", + "AAGGGACTATATCCTCCTTTAACTTCCCTCAAATCACTCTTTGGCAACGA", + "CCCTTAGTTACAGTAAAAATAGGGGGACAGCTAATAGAAGCTCTATTAGA", + "CACAGGAGCAGATGATACAGTATTAGAAGAAATAGATTTACCAGGAAAAT", + "GGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAAACAG", + "TATGATCATATACTTATAGAAATTTGTGGAAAAAGGGCTATAGGTTCAGT", + "ATTGGTAGGGCCTACACCTGTCAACATAATTGGGCGAAATATGTTGACTC", + "AGATTGGTTGTACTTTAAATTTTCCAATTAGCCCTATTGAAACTGTGCCA", + "GTAAAATTAAAGCCAGGAATGGATGGTCCAAAAGTTAAACAATGGCCATT", + "GACAGAAGAAAAAATAAAAGCATTGACAGAGATTTGTTTAGAAATGGAAA", + "AAGAAGGAAAAATTTCAAAAATTGGGCCTGAGAATCCATACAATACTCCA", + "ATATTTGCCATAAAGAAAAAAGATGGTACTAAATGGAGAAAATTAGTAGA", + "CTTCAGAGAACTCAATAAGAGAACTCAAGACTTCTGGGAAGTCCAATTAG", + "GAATACCTCATCCCGCAGGATTAAAAAAGAAAAAATCAGTAACAGTACTA", + "GATGTGGGGGATGCATATTTTTCAGTTCCTCTAGATAAAGACTTTAGAAA", + "GTATACAGCATTCACTATACCTAGTGTAAATAATGAAACACCAGGGATTA", + "GATATCAGTACAATGTACTTCCACAGGGATGGAAAGGATCACCAGCAATC", + "TTTCAGGCAAGCATGACAAAAATATTAGAACCCTTTAGAACAAACAATCC", + "AGAGTTGGTAATATACCAATATATGGATGATTTATATGTAGGATCAGACT", + "TAGAGATAGGGCAGCATAGAGCAAAAATAGAGGAGTTGAGAGAACATCTA", + "CTGAGATGGGGATTTACCACACCAGACAAAAAACATCAGAAAGAACCTCC", + "ATTTCTGTGGATGGGATATGAACTCCATCCTGACAAATGGACAGTCCAGC", + "CTATAACGCTGCCAGAAAAGAACAGCTGGACTGTCAATGATATACAGAAA", + "TTAGTGGGAAAACTAAATTGGGCAAGTCAGATTTATGCAGGAATTAAAGT", + "AAAGCAACTGTGTAAACTCCTCAGGGGAGCCAAAGCACTAACAGATATAG", + "TAACATTGACTGAGGAAGCAGAATTAGAATTGGCAGAGAACAGGGAAATT", + "CTAAAAGAACCTGTACATGGAGTATATTATGACCCAACAAAAGACTTAGT", + "AGCAGAAATACAGAAACAAGGGCAAGATCAATGGACATATCAGATTTATC", + "AAGAACCATTTAAAAATCTAAAGACAGGAAAATATGCAAAAAGGAGGTCT", + "GCTCACACTAATGATGTAAAGCAATTAACAGAGTTAGTACAAAAAGTGTC", + "TACTGAAAGCATAGTAATATGGGGAAAAACCCCTAAATTTAGACTACCCA", + "TACAAAAAGAAACATGGGAAGCATGGTGGATGGAGTATTGGCAGGCTACC", + "TGGATTCCCGAGTGGGAGTTTGTCAATACCCCTCCTCTAGTAAAATTATG", + "GTACCAGTTAGAGAAAGACCCCATAGCAGGAGCAGAAACTTACTATGTAG", + "ATGGAGCAGCTAATAGGGAAACTAAGCTAGGAAAGGCAGGGTATGTCACT", + "GACAAAGGAAGACAAAAGGTTGTTTCCCTAACTGAGACAACAAATCAAAA", + "GACTGAATTATATGCAATTCACCTAGCCTTGCAGGACTCAGGATCAGAAG", + "TAAATATAGTAACAGACTCCCAGTATGCATTAGGAATCATTCAGGCACAA", + "CCAGACAGAAGTGAGTCAGAGTTAGTCAATCAAATAATAGAGAAACTAAT", + "AGAAAAGGACAGAGTCTACTTATCATGGGTACCAGCACACAAAGGGATTG", + "GAGGAAACGAACAAGTAGATAAATTAGTCAGTAGTGGAATCAGGAAAGTA", + "TTATTTTTAGATGGCATAGATAAAGCCCAAGAAGAGCATGAAAGATATCA", + "CAGCAATTGGAGAGCAATGGCTAGTGATTTTAATCTACCACCTGTAGTGG", + "CAAAAGAAATAGTGGCCAGTTGTGATAAATGTCAGCAAAAAGGGGAGGCC", + "ATGCATGGACAAGTAGACTGTAGTCCAGGAATATGGCAATTAGATTGTAC", + "ACATTTAGAAGGAAAAACTATCCTGGTAGCAGTCCATGTAGCCAGTGGCT", + "ATATAGAAGCAGAAGTTATCCCAGCAGAAACAGGACAGGAAACAGCATAC", + "TTTATTTTAAAATTAGCAGGAAGATGGCCAGTAAAAGTAGTACACACAGA", + "CAATGGCAGCAATTTCACCAGTGCTGCAGTAAAGGCAGCATGTTGGTGGG", + "CAAATGTCACACAGGAATTTGGAATTCCCTACAATCCCCAAAGCCAAGGA", + "GTAGTGGAATCTATGAATAAAGAATTAAAGAAAATTATAGGGCAGGTCAG", + "GGATCAAGCTGAACACCTTAAGACAGCAGTACAGATGGCAGTATTCATTC", + "ACAATTTTAAAAGAAAAGGGGGGATTGGGGGGTACAGTGCAGGGGAAAGA", + "ATAATAGACATAATAGCATCAGACATACAAACTAAAGAACTACAAAAACA", + "AATTACAAAAATTCAAAATTTTCGGGTTTATTACAGGGACAGCAGAGACC", + "CCATTTGGAAAGGACCAGCAAAACTACTCTGGAAAGGTGAAGGGGCAGTA", + "GTAATACAGGACAATAGTGATATAAAAGTAGTACCAAGAAGAAAAGCAAA", + "AATCATTAAGGATTATGGAAAACAGATGGCAGGTGGTGATTGTGTGGCAG", + "GTAGACAGGATGAGGATTAGAACATGGAACAGTTTAGTAAAACATCATAT", + "GTACATCTCTAAGAAAGCAAAAGGTTGGTTTTATAGACATCACTATGAAA", + "GTAGGCATCCAAAAGTAAGTTCAGAAGTACACATCCCACTAGGGGATGCT", + "AAATTAGTAGTAAGAACATATTGGGGTCTGCATACAGGAGAAAGAGACTG", + "GCACTTGGGTCATGGGGTCTCCATAGAATGGTGGCAGAAAAGATATAGCA", + "CACAAATAGATCCTGACCTAGCAGACCAACTGATTCACCTGCATTATTTT", + "GACTGTTTTTCAGACTATGCCATAAGGAAAGCCATATTAGGACAATTAGT", + "TAGTCCTAGTTGTGAATATCAAGCAGGACATAATAAGGTAGGATCACTGC", + "AATATTTGGCACTGAAAGCTTTAGTAACCCCAACAAGGAAGAAGCCACCT", + "TTGCCTAGTGTTAAGAAATTAACAGAAGACAGATGGAACAAGCCCCAGAA", + "GACCAGGGGCCACAGAGGGAGCCGTTCAATGAATGGACACTAGAACTGTT", + "AGAAGAGCTTAAACATGAAGCTGTTAGACATTTTCCTAGGCCCTGGCTCC", + "ATGGCTTAGGACAATATATCTATGAAACATATGGGGATACTTGGGCAGGG", + "GTTGAAGCTATAATAAGAATTTTGCAACAACTACTGTTTGTTCATTTCAG", + "AATTGGGTGTCACCATAGCAGAATAGGCATTATTCGAGGGAGAAGAGGCA", + "GGAATGGAGCTGATAGATCCTAGCCTAGAGCCCTGGAACCACCCAGGAAG", + "TCAGCCTACCACTGCTTGTAACAAGTGTTACTGTAAAAAATGCTGCTGGC", + "ATTGCCAATTGTGCTTTCTGAACAAGGGCTTAGGCATCTCCTATGGCAGG", + "AAGAAGCGGAGACGCCGACGAGGAACTCCTCAGAGCCGTCAGGATCATCA", + "AAATCCTGTACCAAAGCAGTGAGTAATAATAATTAGTATTGTGATGCAAT", + "CTTTAGCAATAGCTGCAATAGTAGGACTAGTAGTAGCATTCATAGCAGCC", + "ATAGTTGTGTGGAGCATAGTACTTATAGAATATAGGAAAATAAGGAAACA", + "GAAGAAAATAGACCAGTTACTAGATAGAATAAGAGAAAGAGCAGAAGATA", + "GTGGCAATGAGAGTGATGGGGACACAGAAGAACTTTACACCCTGTTGGAA", + "ATGGGGTATGATAATATTTTGGATAATGATGATTTGTAAGGCTGAAGATT", + "TGTGGGTCACGGTCTACTATGGGGTACCTGTGTGGAGAGACGCAGAGACC", + "ACCCTATTTTGTGCATCAGATGCTAAAACATATGATACAGAAGTACATAA", + "TGTCTGGGCCACACATGCCTGTGTACCCACAGACCCTAACCCACAGGAAA", + "TGCATTTGGAAAATGTAACAGAACAGTTTAACATGTGGAAAAATAAAATG", + "GTAGATCAGATGCATGCAGATATAATTAGTCTATGGGATCAAAGCCTAAA", + "ACCATGTGTAAAGTTAACCCCTCTCTGTGTTACTCTAAATTGTCAAGACT", + "TTAATGTCAGCTATAGTAATACCTCTCGCGAAGTCAACACAACTATATCT", + "GAAGAAATGAAAGGGGAAATAAAAAACTGCTCTTTCAATATGACCACAGT", + "ATTAAGAGATAAGACACAGAAAATGTCTGCACTTTTTTATAAACTTGATG", + "TAGTACAAATGGGAAATGATAGTAGTCAGTACATATTAATAAATTGTAAT", + "ACCTCAGCCATTAAACAGGCTTGTCCAAAGGTAACCTTTGAACCAATTCC", + "CATACATTATTGTGCCCCAGCTGGTTTTGCAATTCTAAAGTGTAATGATA", + "AGGAGTTCAATGGAACAGGGCCATGCAAGAATGTCAGTACAGTACAATGC", + "ACACATGGAATCAAGCCAGTAGTATCAACTCAACTGCTGTTAAATGGCAG", + "TCTAGCAGAAGGAGACATAATGATTAGATCTGAAAATATCACAGACAATA", + "CCAAGAACATAATAGTACAGTTAAATGAGACTGTAAGGATTAATTGTTCC", + "AGACCTGGCAATAATACTAGACAGAGTGTACGAATAGGACCAGGGCAAAC", + "ATTCTATGCAAGAGGTGACATAATAGGGGATATAAGACAGGCACATTGTA", + "ATGTCAGTAAAAAAGAATGGAATGCCACTTTACGAAAGGTAGCTGAAAAA", + "TTAAGGCAGCACTTTAGTAACGACACAACAATAAATTTTACTAACCACTC", + "CGGAGGGGATTTAGAAATTACAACACATACTTTTAATTGTAGAGGAGAAT", + "TTTTCTATTGCTATACATCAGACCTGTTTAATAGCACTTGGCCCAATAAC", + "ACTGCAGAGTCAAATAGCACAGGGTCAATAAGGTCAAATGAGACTATAAC", + "TCTCAAATGCAGAATAAAGCAAATTATAAATATGTGGCAGAGAGTAGGAC", + "AAGCAATATATGCCCCTCCCATCAGAGGAGAAATAAGGTGTGATTCAAAC", + "ATTACAGGACTAATATTAACAAGAGATGGAGGGAATAATAGTGAGAGTGC", + "AAATGGAACTGAAATTTTCAGGCCTGGAGGAGGAGATATGAGGGACAATT", + "GGAGAAGTGAATTATATAAGTATAAAGTAGTAAAAATTGAACCACTAGGA", + "TTAGCACCCACTAGAGCAAAAAGAAGAGTGGTGCAGAGAGAAAGAAGAGC", + "AGTTGGCCTGGGAGCTTTATTCATTGGGTTCCTAGGAGCAGCAGGAAGCA", + "CTATGGGCGCGGCGTCATTAACGCTGACGGTACAGGCCAGACAATTACTG", + "TCTGGTATAGTGCAACAGCAAAGCAATTTGCTGAGGGCTATAGAGGCTCA", + "ACAACATCTGTTGAAACTCACAGTCTGGGGCATTAAACAGCTCCAGGCAA", + "GAGTCCTGGCTCTGGAAGGATACCTAAGGGATCAACAGCTCCTAGGAATT", + "TGGGGCTGCTCTGGAAAACTCATCTGCACCACTACTGTACCCTGGAACTC", + "TAGTTGGAGTAATAAAAGTTACAATAACATATGGAATAACATGACCTGGC", + "TAGAATGGGATAAAGAAATTGACAATTATACAGACATAATATATAATCTA", + "CTTGAGGAAGCGCAAAACCAGCAGGAAAATAATGAACAAGACTTATTGGC", + "ATTGGACAAGTGGGCAAGTCTGTGGAATTGGTTTGACATATCAAGATGGC", + "TATGGTATATAAAAATATTTATAATGATAGTAGGAGGTTTGATAGGTTTA", + "AGAATAATTTTTGCTGTACTTGCTATAATAAATAGAGTTAGGCAGGGATA", + "CTCACCTTTGTCTTTCCAGACCCTTCCCCATCTCCAGAGGGAACCCGACA", + "GGCCCGAAAGAATCGAAGAAGGAGGTGGCGAGCAAGACAGAGAGAGGTCC", + "GTGCGCTTAGTGAACGGATTCTTAGCACTTGCCTGGGACGACTTGAGGAA", + "CCTGTGCCTCTTCAGCTACCACCGATTGAGAGACTTCATCTTGATTGCAG", + "CGAGGATTGTGGAACTTCTGGGACACAGCATCCTCAAGGGACTGAGACTG", + "GGGTGGGAAGCCCTCAAATATCTGTGGAACCTTCTAGCATACTGGGGCCA", + "GGAACTAAAAATTAGTGCTATTAGTTTGCTTGATACAATAGCAATAGCAG", + "TAGCTAATTGGACAGATAGAGTTATAGAACTAGGACAAAGATTTGGTCGA", + "GCCATTCTCAACATACCTAGAAGAATCAGACAGGGCTTAGAAAGGGCTTT", + "GCTATAACATGGGTGGCAAGTGGTCAAAAAGCAGCATAGTAGGATGGCCT", + "AAGGTTAGAGAAAGACTAAGACAAACCCCTCCAGCTCCAGCAGCACCAGG", + "AGTAGGAGCAGCATCTCAAGATTTAGCTAGACATGGAGCAATCACAAGCA", + "GTAATACATCACATACAAATGCTGATTGTGCCTGGCTGGAAGCACAAGAG", + "GACGAGGATGTAGGCTTTCCAGTCAGGCCACAGGTACCATTGAGACCAAT", + "GACTTACAAGGCAGCTGTCGATCTCAGCCACTTTTTAAAAGAAAAGGGGG", + "GACTGGAAGGGTTAATTTACTCCAAGAAAAGACAAGAGATCCTTGATCTG", + "TGGGTCTATCACACACAAGGATACTTCCCTGATTGGCAGAACTACACACC", + "AGGGCCAGGGCCTAGATTCCCACTGACCTTTGGGTGGTGCTTCAAACTAG", + "TACCAATAGATCCAGCAACAGTAGAGGAAGCCACTGAAGGAGAGAACAGC", + "AGTTTATTACACCCCATCAGTCAACATGGAATGGAGGACGAAGACAGAGA", + "AGTGCTGGTCTGGAGATTTGACAGTTACCTGGCATTTAGACACCTAGCTA", + "GAGAAAAGCACCCGGAGTTCTACAAAGACTGTTGACACAGAACTGCTGAC", + "GGGGACTTTCAGAGTTGCTGACAAGGGACTTTCCACTGGGGACTTTCCGC", + "GGGGAGGTGTGGTGTGGGAGGAGTTGGGGAGTGGCTAACCCTCAGAAGCT", + "GCATATAAGCAGCTGCTTCTCGCCTGTACTGGGTCTCTCTTGCTAGACCA", + "GATTTGAGCCTGGGAGCTCTCTGGCTAGCGGGGGAACCCACTGCTTAAGC", + "CTCAATAAAGCTTGCCTTGAGTGC" + ], + "seed_group": "HIVGHA-seed" + }, + "HIV1-CRF06_CPX-GH-AB286851-seed": { + "is_nucleotide": true, + "reference": [ + "TGGAAGGGCTAATTTACTCCAAGAAAAGACAAGAGATCCTTGATCTGTGG", + "GTTTATAACACACAAGGTTACTTCCCTGATTGGCAAAACTACACACCAGG", + "GCCAGGGATCAGATACCCCCTGACCTTTGGATGGTGCTACAAGCTGGTAC", + "CAGTTGATCCAAAGGAAGTAGAAGAAGCTACTGAAGGAGAGAACAACTGC", + "CTGTTACACCCTCTGAACCAGCATGGAGCAGATGATGAACATGGAGAAGT", + "GTTAATGTGGAAGTTTGACAGCTCCCTTGCACGGAGGCACATAGCCCGTG", + "AGAAACATCCGGAGTTTTACAAAGACTGCTGACACAAGACTGCTGACACA", + "GAAGATTCTAACTGGGACTTTCCGCTGGGGACTTTCCAGGGGAGGTGTGG", + "ACTGGGCGGGTCCGGGGAGTGGCTAACCCTCAGAAGCTGCATAAAAGCAG", + "CCGCTTCTCGCTTGTACTGGGTCTCTCTTGTTAGACCAGATCTGAGCCTG", + "GGAGCTCTCTGGCTAGCAGGAGAACCCACTGCTTAAGCCTCAATAAAGCT", + "TGCCTTGAGTGCTTACAGTAGTGTGTGCCCGTCTGTTGTGTGACTCTGGT", + "AACTAGAGATCCCTCAGACCACTCTAGAAGGTGTAAAAATCTCTAGCAGT", + "GGCGCCCGAACAGGGACCCGAAAGTGAAAGTTAATAGGGACTCGAAAGCG", + "GAAGTTCCAGAGAAGTTCTCTCGACGCAGGACTCGGCTTGCTGAGGTGCA", + "CACAGCAAGAGGCGAGAGCGGCGACTGGTGAGTACGCCAATTTTTGACTA", + "GCAGAGGCTAGAAGGAGAGAGATGGGTGCGAGAGCGTCAGTATTAAGCGG", + "GGGAAAATTAGATGAATGGGAGAAAATTCGGTTACGGCCAGGGGGAAAGA", + "AAAAATATAGAATGAAACACTTAGTATGGGCAAGCAGGGAGCTGGACAGA", + "TTTGCACTTAACCCTGGCCTTTTAGAAACAACAGAAGGGTGCCAGCAGAT", + "AATGGAACAGTTACAACCAAGTCTCAAGACAGGATCAGAAGAAATTAAGT", + "CATTGTATAATACAGTAGCAACCCTCTATTGTGTACATCAAAGGATAAAA", + "GTAACAGACACCAAGGAAGCTCTAGATAAAATAGAGGAGATGCAAAATAA", + "GAGCAAGCAAAAGGCACAGCAAGCAGCAGCTGCCACAGGAAGCAGCAGTA", + "ATGTCAGTCAAAATTACCCTATAGTGCAAAATGCACAAGGGCAAATGGTG", + "CATCAGCCCATGTCACCAAGAACTTTAAATGCATGGGTAAAAGCAATAGA", + "AGAAAGGGCTTTCAGTCCAGAAGTAATACCCATGTTTACAGCATTATCAG", + "AAGGATGCACCCCACAAGATTTAAATATGATGCTAAACATAGTGGGGGGA", + "CACCAGGCAGCTATGCAAATGTTAAAAGATACCATCAATGAGGAAGCTGC", + "AGATTGGGATAGGACACATCCAGTACATGCAGGGCCTATTCCACCAGGCC", + "AGATGAGAGAACCAAGGGGAAGTGATATAGCAGGAACTACCAGTAACCTA", + "CAGGAGCAAATAGGGTGGATGACAGGCAACCCACCTATTCCAGTGGGAGA", + "AATCTATAGAAGATGGATAATCCTAGGATTAAATAAAATAGTAAGAATGT", + "ATAGTCCTGTCAGCATTTTAGACATAAAACAAGGGCCAAAAGAGCCCTTC", + "AGAGATTATGTAGATCGGTTCTTTAAAGTTTTAAGAGCTGAGCAAGCCAC", + "ACAGGAAGTAAAAAATTGGATGACAGACACCTTGTTGGTCCAAAATGCTA", + "ACCCAGATTGTAAGACCATTTTAAGAGGATTAGGACCAGGAGCTACACTA", + "GAAGAAATGATGACAGCATGTCAGGGAGTGGGAGGACCCAGCCATAAAGC", + "AAGAGTTTTAGCTGAGGCAATGAGCCAAGCATCAGGTGCAGCAGCAGCTA", + "TAATGATGCAGAAAAGCAATTTTAAGGGCCCGAAAAGAAATATTAAGTGT", + "TTCAACTGTGGCAAGGAAGGACATCTAGCCAGAAATTGCAGGGCCCCTAG", + "AAAAAAGGGCTGTTGGAAATGTGGAAAGGAAGGACATCAAATGAAAGACT", + "GCACCGAGAGACAGGCTAATTTTTTAGGGAGAATCTGGCCTTCCAGCAAG", + "GGGAGGCCAGGGAATTTTCTTCAGAACAGGCCAGAGCCAACAGCCCGGAC", + "AGAGCCAACAGCCCCACCAGCAGAGATCTTTGGGTTCGGAGAGGGGACAG", + "CCCCCTCCCCGAGACAGGAGCCGAGACAGGAGTCAAGGCAGGAGCCGAAG", + "GAGAAGGAGGAGCTATATCCCTTAGCTTCCCTCAAATCACTCTTTGGCAG", + "CGACCCCTAGTCACAGTAAGAATAGGGGAACAGCTAATAGAAGCTCTATT", + "AGACACAGGAGCAGATGATACAGTATTAGAAGATATAAATTTACCAGGAA", + "AATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGA", + "CAGTATGATCAAATACTTATGGAAATTTGTGGAAAAAGGGCAATAGGTAC", + "AGTATTAGTAGGACCTACACCTGTCAACATAATTGGACGGAACATGTTGA", + "CCCAGATTGGTTGTACTTTAAATTTTCCAATAAGTCCTATTGAAACTGTA", + "CCAGTAAAATTAAAGCCAGGAATGGATGGCCCAAAGGTTAAACAATGGCC", + "ATTGACAGAAGAGAAAATAAAAGCATTAACAGAGATTTGTACAGAAATGG", + "AAAAGGAAGGAAAAATTTCAAAAATTGGGCCTGAAAATCCATACAATACT", + "CCAATATTTGCCATAAAGAAAAAAGATAGCACTAAATGGAGAAAATTAGT", + "AGATTTCAGAGAACTTAATAAAAGAACTCAAGACTTTTGGGAAGTTCAAT", + "TAGGAATACCACACCCTGCTGGGTTAAAGAAGAAAAAATCAGTAACAGTA", + "TTAGATGTAGGGGATGCATATTTTTCAGTTCCCTTAGATGAAGACTTTAG", + "AAAGTATACTGCATTCACTATACCTAGTATAAATAATGAGACACCAGGGA", + "TTAGATATCAGTACAATGTACTTCCACAGGGATGGAAAGGATCACCAGCA", + "ATATTTCAGAGTAGCATGACAAAAATCTTAGAGCCCTTTAGAACAAAAAA", + "TCCAGAAATGGTGATCTACCAGTACATGGATGATTTATATGTAGGATCTG", + "ACTTAGAGATAGGGCAACACAGAGCAAAAATAGAGGAGTTAAGAGAACAT", + "CTCTTGAAGTGGGGATTTACCACACCAGATAAAAAACATCAGAAAGAACC", + "CCCATTTCTTTGGATGGGGTATGAACTCCATCCTGACAAATGGACAGTGC", + "AGCCTATACAGCTGCCAAACAAAGAGAGCTGGACTGTCAATGATATACAG", + "AAACTGGTGGGAAAACTAAATTGGGCAAGTCAGATTTATCCAGGGATTAA", + "GGTAAAGCAATTATGTAAACTCCTTAGGGGGGCAAAAGCACTAACAGACA", + "TAGTACCACTAACTGCAGAAGCAGAATTAGAATTGGCAGAGAACAGGGAA", + "ATTCTAAAAGAACCAGTGCATGGGGCATATTATGACCCATCAAAGGACTT", + "AATAGCAGAAATACAGAAGCAGGGGCAAGGCCAATGGACATATCAGATAT", + "ATCAGGAGCAGCATAAAAACTTGAAAACAGGGAAGTATGCAAGAACAAAG", + "TCTACCCACACTAATGATGTAAGACAATTAACAGAAGCAGTGCAAAAGAT", + "AGCCCTAGAAAGCATAGTAATATGGGGAAAAACTCCTAAATTTAGACTAC", + "CCATACAAAAGGAAACATGGGAGACATGGTGGACAGAATATTGGCAAGCC", + "ACCTGGATCCCTGAATGGGAATTTGTCAATACCCCTCCCCTAGTAAAACT", + "ATGGTACCAGTTAGAAACAGAGCCCATAGTAGGAGCAGAAACTTTCTATG", + "TAGATGGGGCAGCTAATAGAGAAACTAAAAAAGGAAAAGCAGGATATGTT", + "ACTGACAGAGGAAGACAAAAGGTTATCTCCTTAACTGACACAACAAATCA", + "GAAGACAGAATTACAAGCAATTAATCTAGCCTTGCAGGATTCAGGACCAG", + "AAGTAAACGTAGTAACAGACTCACAGTATGCATTAGGAATCATTCAAGCA", + "CAGCCAGATACAAGTGAATCAGAAGTAGTCAACCAAATAATAGAACAGCT", + "AATAAAAAAGGAAAAGGTCTACCTGTCATGGGTACCAGCACACAAAGGGA", + "TTGGAGGAAATGAACAAGTAGATAAATTAGTCAGTAATGGAATCAGGAAG", + "GTACTATTTTTAGATGGCATAGATAAAGCCCAAGAAGAGCATGAAAGATA", + "TCATAGCAATTGGAGAGCCATGGCTAATGATTTTAATCTGCCACCTATAG", + "TAGCAAAAGAAATAGTAGCCAGCTGTGACAAATGTCAGCTAAAAGGGGAA", + "GCCATGCATGGACAAGTAGACTGTAGTCCAGGGATATGGCAATTAGATTG", + "CACACACCTAGAGGGAAAAGTAATCCTGGTAGCAGTCCATGTAGCCAGTG", + "GCTATATAGAAGCAGAAGTTATCCCAGCAGAAACAGGACAGGAAACAGCA", + "TACTTTATACTAAAATTAGCAGGAAGATGGCCAGTAAAAGTGGTACACAC", + "AGATAATGGTAGCAATTTCACCAGTGCTGCAGTTAAAGCAGCATGTTGGT", + "GGGCAAATATCACACAAGAATTTGGAATTCCCTACAATCCCCAAAGTCAA", + "GGAGTAGTGGAATCTATGAATAAGGAATTAAAGAAAATCATAGGGCAGGT", + "AAGAGACCAAGCTGAACACCTTAAGACAGCAGTACAAATGGCAGTATTCA", + "TTCACAATTTTAAAAGAAAAGGGGGGATTGGGGGGTACAGTGCAGGGGAA", + "AGAATAATAGACATAATAGCATCAGACATACAAACTAAAGAACTACAAAA", + "ACAAATTACAAAAATTCAAAATTTTCGGGTTTATTACAGGGACAGCAGAG", + "ACCCAATTTGGAAAGGACCAGCAAAACTACTTTGGAAAGGTGAAGGGGCA", + "GTAGTAATACAAGACAACAGTGACATAAAGGTAGTACCAAGGAGAAAAGC", + "AAAGATCATCAGGGATTATGGAAAACAGATGGCAGGTGATGATTGTGTGG", + "CAGGTAGACAGGATGAGGATTAATGCATGGAAAAGTTTAGTAAAATACCA", + "TATGCATATATCAAGGAAAGCTAAAGGATGGCTTTATAGACATCACTATG", + "ACAGTTATCATCCAAAAATAAGTTCAGAAGTACACATCCCACTAGGATGT", + "GCTGAATTAGTGATAACAACATATTGGGGTCTAAATACAGGAGAAAGAGA", + "ATGGCACTTGGGCCAGGGAGTCTCCATAGAATGGAGGATGAGAAGGTATA", + "GAACACAAATAGACCCTGTTCTGGCAGACCAACTAATTCATGTGCATTAC", + "TTTGATTGTTTTTCAGAATCTGCCATAAGGAAAGCCATATTAGGACATAT", + "AGTTAGCCCTAGGTGTGAATATCAAGCAGGACATAATAAGGTAGGATCTC", + "TACAATATCTGGCACTAACAGCCTTACTGAAACCAAAGAAGAGAAAGCCA", + "CCTCTGCCTAGTGTTCAAAAACTAGTAGAGGATAGATGGAACAAGCCCCA", + "GAAGACCAGGGACCACAGAGACAGCCATACAATGCATGGGCACTAGAACT", + "TATGGAAGAGCTTAAAAGTGAAGCTGTTAGACATTTTCCTAGACCATGGC", + "TCCACGGCTTAGGACAGCATATCTATAACACTTATGGGGATACTTGGGAA", + "GGAGTTGAAGCTCTACTAAGAATGCTGCAACAACTACTGTTTATTCATTT", + "CAGAATCGGGTGTAAACATAGCAGAATAGGCATTATGCCACAGAGAAGAG", + "GGAGAGATGGAGCCAGTAGATCCTAACATAGAGCCCTGGAATCAACCAGG", + "AAGTCGACCTAAAACTGCTTGTAACTCATGCTTTTGCAAAAAGTGTTGCT", + "ATCATTGCCAGATGTGCTTTTTAAACAAAGGCTTAGGCATTTCCTATGGC", + "AGGAAGAAGCGGAGACAGCGACGCACAGCTCCTCCTGGCAGTAAGAGCCA", + "TCAAACTCCTGTACAGCAGCAGTAAGTACTAATAAATAGTATATGTAATG", + "CAGGCATTAGAGATATCTGCAATAGTAGGATTAGTAGTAGCATTCATAGC", + "AGCCATAATTGTGTGGACTATAGTGTATATACAATATAGGGAAATAAGAA", + "AACAGAGAAAAATAGAAAAGTTATATGAGAGAATAATAGAAAAAGCAGAA", + "GACAGTGGAAATGAGAGTGAGGGGGATGCACAGGAATTGGCAGCACTTAT", + "GGAATTGGGGAACTTTGATCCTTGGGTTGGTGATAATTTGTAGTGCCTCA", + "AGCAACTTGTGGGTCACAGTCTATTATGGGGTGCCTGCGTGGGAAGATGC", + "AGATACAATCCTATTCTGTGCATCTGATGCTAAAGCATATAGTAATGAAA", + "AGCATAATGTCTGGGTTACACATGCTTGTGTGCCCACAGACCCCAACCCA", + "CAAGAAATACCTCTGGAAAATATAACAGAAAATTTTAATATGTGGAAAAA", + "TAACATGGTAGAACAGATGCATGAGGATATAATCAGTTTATGGGATGAAA", + "GTCTAAAGCCATGTGTAAAGATGACCCCTCTCTGTGTTACTTTAACCTGT", + "AATAACGCAACCATCAATGTTACCAATACTAACACAACGCAGAGTAACAC", + "AAACAGCAGTACTAACGTAACTAAAACTAACAATATCACTGTAAAAGAGG", + "TAGAAGACATGAGAAACTGTTCTTTCAATGAAACCACAGAAATAAGAGAT", + "AAGACAAAGCAGGAATACGCGCTTTTCTATAAAACTGATGTAGTACAAAT", + "GGGTAAGGAAGGTAATAAGACTACTTATAAATTAATAAATTGTAATGTCT", + "CAACCATTAGACAGGCTTGTCCAAAGGTGACTTTTGAACCAATTCCCATA", + "CATTATTGTGCTCCAGCTGGTTTTGCGATTTTAAAGTGTAGGGATAAGAA", + "TTTCAATGGAACAGGACCATGTAAAAATGTCAGTACAGTACAATGTACAC", + "ATGGAATTAAGCCAGTGGTGTCAACTCAATTACTGCTGAATGGTAGTTTA", + "GCAGAAAAAGAAATAATAATTAGATCTGAAAATCTCACAGACAATACCAA", + "AAACATAATAGTGCAGCTTAATAGCACTGTACAAATTACATGCAATAGAC", + "CCAATAACAATACAAGAAGAGGTATACACCTTGGACCAGGGCAAGTGTTC", + "TTTGCAACAGGTGACATAATAGGAGATATAAGACAAGCTCATTGTAATAT", + "TAGTGGAATAAAATGGGGAGAAGTGGTACGCAATGTAAGTGCAAAACTAA", + "AAGAGATCTATAATAAAAGCATAACCTTTAGTCCACCTATAGGAGGGGAC", + "CTAGAAATCACAACACATAGTTTTAATTGTAGAGGAGAATTTTTCTATTG", + "TAATACATCAAAACTATTTAATAGTACTTTTTCTAATAATAATTCCACAA", + "ATAATGATACCTCTTTAGAAAATAATGGTACCATCACACTCCCATGTAAG", + "ATAAAACAAATTGTAAGAATGTGGCAGAGAGTGGGACAGGCAATGTATGC", + "CCCTCCCATTGAAGGAAACATTACATGTATATCAAACATTACAGGACTAA", + "TATTAACAAGAGATGGGCACAATGGGCACAATCAGAGTAATAATGAGACC", + "TTCAGACCTACAGGAGGAGATATGAGGGACAATTGGAGAAGTGAATTATA", + "TAAGTATAAAGTAGTAAGAATTAAACCACTAGGAATAGCACCCACCAAGG", + "CAAGAAGAAGAGTAGTGGGCAGAGAGAAAAGAGCAGTTGGACTGGGAGCT", + "GTTTTCCTTGGGTTCTTAGGAGCGGCAGGAAGCACTATGGGCGCAGCGTC", + "AATAACGCTGACGGTACAGGTCAGACAATTATTGTCTGGCATAGTGCAGC", + "AGCAAAGCAATTTGCTGAGAGCTATAGAAGCGCAGCAGCATCTGTTGCAG", + "CTCACAGTCTGGGGCATTAAACAGCTCCAGGCAAGAGTCCTGGCTCTGGA", + "AAGATACCTAAAGGATCAACAGCTCCTAGGAATTTGGGGCTGCTCTGGAA", + "AACTCATCTGCCCCACTAATGTGCCTTGGAACACTAGTTGGAGTAATAAA", + "ACTTATAGTGAAATTTGGGACAACATGACCTGGCTAGATTGGGATAGGGA", + "AATTAGCAATTACACAGACCTAATATACAGCCTAATTGAAGAATCGCAAA", + "CTCAGCAGGAAAAGAATGAACAAGATTTATTGGCATTGGACAAATGGGAA", + "AGTTTGTGGTCTTGGTTCAACATATCAAACTGGCTATGGTACATAAAAAT", + "ATTTGTAATGGTAGTAGGAGGCTTAATAGGTTTAAAAATAGTTTTTTCTG", + "TGCTTTCTATAGTCAATAGAGTTAGGCAGGGATACTCACCTTTGTCGTTG", + "CAGACCCTTATCCCAAGCCCAAGGGGAGCAGACAGGCCCGGAGGAATCGA", + "AGAAGGAGGTGGAGAGCAAGACAGAACCAGATCGATTCGATTAGTGAGCG", + "GATTCTTAGCACTTGCCTGGGACGATCTGAGGAGCCTGTTCCTCTCCAGC", + "TGCCACCGATTGAGAGACTTCATCTTGATTGCAGCGAGGACTGTGGAAAC", + "TCTGGGACGCAGGGGGTGGGAACTCCTCAAGTACCTGGGGAACCTGGTGT", + "GTTATTGGGGACAGGAGATAAAGAATAGTGCTATTAGTTTGCTTGATACA", + "ATAGCAATAGCAGTAGGTAACTGGACAGATAGGGTTATAGAAGTAGGACA", + "AAGAGCTGTTAGAGCTGTGCTTAACATACCTAGAAGAATAAGACAAGGAT", + "TTGAAAGAGCTTTGCTGTAAAATGGGAGCCAAGTGGTCAAAATGTAGCCT", + "GGGGGGATGGACTCAGGTAAGGGAAAGAATGAGAAGAACCCCAGTAACAG", + "AAAGAGCAACAGAAAGAGCAGCAGAGGGAGTAGGAGCAGTGTCTCAAGAT", + "TTGGATAGACATGGGGCAATCACAAGCAGCAATACAGCAGCTTCAAATAG", + "TGACTGTGCATGGGTAGAAGCACAAACAGAGGAAGAGGTAGGCTTTCCAG", + "TCAGACCTCAGGTACCTTTGAGACCAATGACTTATAAAGGTGCTTTTGAT", + "CTCAGCTTCTTTTTAAAAGAAAAGGGGGGACTGGATGGGCTAATTCACTC", + "CAAGAAAAGACAAGAGATCCTTGATCTGTGGGTTTATAACACACAAGGTT", + "ACTTCCCTGATTGGCAAAACTACACACCAGGGCCAGGGATCAGATACCCC", + "CTGACCTTTGGATGGTGCTACAAGCTGGTACCAGTTGATCCAAAGGAAGT", + "AGAAGAAGCTACTGAAGGAGAGAACAACTGCCTGTTACACCCTCTGAACC", + "AGCATGGAGCAGATGATGAACATGGAGAAGTGTTAATGTGGAAGTTTGAC", + "AGCTCCCTTGCACGGAGGCACATAGCCCGTGAGAAACATCCGGAGTTTTA", + "CAAAGACTGCTGACACAAGACTGCTGACACAGAAGATTCTAACTGGGACT", + "TTCCGCTGGGGACTTTCCAGGGGAGGTGTGGACTGGGCGGGTCCAGGGAG", + "TGGCTAACCCTCAGAAGCTGCATAAAAGCAGCCGCTTCTCGCTTGTACTG", + "GGTCTCTCTTGTTAGACCAGATCTGAGCCTGGGAGCTCTCTGGCTAGCAG", + "GAGAACCCACTGCTTAAGCCTCAATAAAGCTTGCCTTGAGTGC" + ], + "seed_group": "HIVGHA-seed" + }, "HIV1-A1-CD-AM000053-seed": { "is_nucleotide": true, "reference": [ diff --git a/micall/tests/test_denovo.py b/micall/tests/test_denovo.py index 051026c8c..a7fbff009 100644 --- a/micall/tests/test_denovo.py +++ b/micall/tests/test_denovo.py @@ -1,6 +1,7 @@ from io import StringIO from pathlib import Path +from Bio import SeqIO from pytest import fixture, mark from micall.core.denovo import write_contig_refs, denovo, DEFAULT_DATABASE, genotype @@ -11,7 +12,12 @@ def check_hcv_db(): db_path = Path(DEFAULT_DATABASE) index_path = db_path.parent / "refs.fasta.nin" - if not index_path.exists(): + build_needed = not index_path.exists() + if not build_needed: + projects_date = Path(DEFAULT_PROJECTS).stat().st_mtime + index_date = index_path.stat().st_mtime + build_needed = index_date < projects_date + if build_needed: with open(DEFAULT_PROJECTS) as projects_json, \ open(DEFAULT_DATABASE, 'w') as refs_fasta: make_blast_db(projects_json, refs_fasta) @@ -19,6 +25,14 @@ def check_hcv_db(): return db_path +def test_make_blast_db_excludes_hivgha(hcv_db): + fasta_path = Path(DEFAULT_DATABASE) + with fasta_path.open() as f: + for reference in SeqIO.parse(f, 'fasta'): + # Exclude the Ghana project, because they're recombinant. + assert reference.name != 'HIV1-CRF02_AG-GH-AB286855-seed' + + def test_write_contig_refs_two_sequences(tmpdir, hcv_db): contigs_fasta = Path(tmpdir) / "contigs.fasta" contigs_fasta.write_text("""\ diff --git a/micall/tests/test_sample.py b/micall/tests/test_sample.py index c6ebf14c5..9f79b0a0f 100644 --- a/micall/tests/test_sample.py +++ b/micall/tests/test_sample.py @@ -1,7 +1,9 @@ import pickle from unittest import TestCase -from micall.drivers.sample import Sample +import pytest + +from micall.drivers.sample import Sample, exclude_extra_seeds class SampleTest(TestCase): @@ -154,3 +156,19 @@ def test_pickle(self): data = pickle.dumps(sample) self.assertNotEqual(b'', data) + + +@pytest.mark.parametrize( + 'project_code,excluded,expected', + [('HIVGHA', (), ()), + (None, (), ["HIV1-CRF06_CPX-GH-AB286851-seed", + "HIV1-CRF30_0206-GH-AB286854-seed"]), + ('HIV', (), ["HIV1-CRF06_CPX-GH-AB286851-seed", + "HIV1-CRF30_0206-GH-AB286854-seed"]), + ('HIV', ["HLA-B-seed"], ["HIV1-CRF06_CPX-GH-AB286851-seed", + "HIV1-CRF30_0206-GH-AB286854-seed", + "HLA-B-seed"])]) +def test_exclude_extra_seeds(project_code, excluded, expected): + all_excluded = exclude_extra_seeds(excluded, project_code) + + assert all_excluded == expected diff --git a/micall/utils/fetch_sequences.py b/micall/utils/fetch_sequences.py index 980e47bdb..cf68bde9d 100644 --- a/micall/utils/fetch_sequences.py +++ b/micall/utils/fetch_sequences.py @@ -20,6 +20,7 @@ from micall.utils.translation import translate try: + # noinspection PyPackageRequirements import requests except ImportError: # Allow tests to run without requests module @@ -37,6 +38,7 @@ def main(): error_count += check_hcv_coordinates(project_config, unchecked_ref_names) error_count += check_hiv_seeds(project_config, unchecked_ref_names) error_count += check_hiv_coordinates(project_config, unchecked_ref_names) + error_count += check_hivgha_seeds(project_config, unchecked_ref_names) error_count += check_hiv_wild_types(project_config) error_count += check_hla_seeds(project_config, unchecked_ref_names) error_count += check_hla_coordinates(project_config, unchecked_ref_names) @@ -95,6 +97,27 @@ def check_hiv_seeds(project_config, unchecked_ref_names: set): return error_count +def check_hivgha_seeds(project_config, unchecked_ref_names: set): + print("HIVGHA project uses the same seeds as HIV, plus three chimeric seeds.") + ref_names = ["HIV1-CRF02_AG-GH-AB286855-seed", + "HIV1-CRF06_CPX-GH-AB286851-seed"] + source_sequences = {} + for ref_name in ref_names: + parts = ref_name.split('-') + accession_number = parts[3] + source_sequences[ref_name] = fetch_by_accession(accession_number) + + ref_names = set(project_config.getProjectSeeds('HIVGHA')) + ref_names -= set(project_config.getProjectSeeds('HIV')) + unchecked_ref_names.difference_update(ref_names) + + report, error_count = compare_config(ref_names, + project_config, + source_sequences) + print(report) + return error_count + + def check_hiv_wild_types(project_config): print("""\ HIV wild types for resistance reports are extracted from Consensus B.