Skip to content

Commit

Permalink
added ranks and rank_propagation (#207)
Browse files (browse the repository at this point in the history)
Thanks @VinzentRisch! LGTM!

- tests passed
- flake8 passed

I also ran the following and the taxonomy came out as expected:

qiime rescript get-ncbi-genomes --p-taxon 1322345 --p-assembly-source all --p-assembly-levels contig --p-no-only-reference --p-ranks phylum class subclass order family subfamily genus species --output-dir ncbi-genome-test-dl --verbose

INFO:2024-10-23 16:13:01,936:MainProcess:Downloading 1 records
INFO:2024-10-23 16:13:04,860:LokyProcess-3:got 1 records
Saved FeatureData[Sequence] to: ncbi-genome-test-dl/genome_assemblies.qza
Saved GenomeData[Loci] to: ncbi-genome-test-dl/loci.qza
Saved GenomeData[Proteins] to: ncbi-genome-test-dl/proteins.qza
Saved FeatureData[Taxonomy] to: ncbi-genome-test-dl/taxonomies.qza
  • Loading branch information
VinzentRisch authored Oct 23, 2024
1 parent 29bf18a commit 0091c79
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 6 deletions.
12 changes: 9 additions & 3 deletions rescript/ncbi_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,12 +156,14 @@ def _fetch_and_extract_dataset(
def _fetch_taxonomy(
all_acc_ids: list,
all_tax_ids: list,
accession_to_assembly: pd.Series
accession_to_assembly: pd.Series,
ranks: list,
rank_propagation: bool
):
manager = Manager()
taxa, bad_accs = get_taxonomies(
taxids={k: v for k, v in zip(all_acc_ids, all_tax_ids)},
ranks=_default_ranks, rank_propagation=True,
ranks=ranks, rank_propagation=rank_propagation,
logging_level='INFO', n_jobs=2, request_lock=manager.Lock()
)
# technically, this should never happen as the taxa accession
Expand Down Expand Up @@ -213,6 +215,8 @@ def get_ncbi_genomes(
only_genomic: bool = False,
tax_exact_match: bool = False,
page_size: int = 20,
ranks: list = _default_ranks,
rank_propagation: bool = True,
) -> (DNAFASTAFormat, LociDirectoryFormat,
ProteinsDirectoryFormat, pd.DataFrame):
# we use a deepcopy of assembly_levels because the new versions of
Expand All @@ -239,7 +243,9 @@ def get_ncbi_genomes(
taxa = _fetch_taxonomy(
assembly_to_taxon.keys(),
assembly_to_taxon.values(),
accession_map.explode()
accession_map.explode(),
ranks,
rank_propagation
)

return genomes, loci, proteins, taxa
8 changes: 7 additions & 1 deletion rescript/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -1190,7 +1190,9 @@
'assembly_levels': List[Str % Choices(
['complete_genome', 'chromosome', 'scaffold', 'contig'])],
'tax_exact_match': Bool,
'page_size': Int % Range(20, 1000, inclusive_end=True)
'page_size': Int % Range(20, 1000, inclusive_end=True),
'ranks': List[Str % Choices(_allowed_ranks)],
'rank_propagation': Bool,
},
outputs=[
('genome_assemblies', FeatureData[Sequence]),
Expand All @@ -1217,6 +1219,10 @@
'request. If number of genomes to fetch is higher than '
'this number, requests will be repeated until all '
'assemblies are fetched.',
'ranks': 'List of taxonomic ranks for building a taxonomy from the '
'NCBI Taxonomy database.',
'rank_propagation': RANK_PROPAGATE_DESCRIPTION,

},
output_descriptions={
'genome_assemblies': 'Nucleotide sequences of requested genomes.',
Expand Down
10 changes: 8 additions & 2 deletions rescript/tests/test_ncbi_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,9 @@ def test_fetch_taxonomy(self, p):
pd.Series(
{'GCF_123': ['AC_12.1'], 'GCF_234': ['AC_23.2']},
name="assembly_id"
).explode()
).explode(),
_default_ranks,
True
)

exp_taxa = pd.DataFrame(
Expand All @@ -221,7 +223,11 @@ def test_fetch_taxonomy_bad_accs(self, p):
Exception, r'Invalid taxonomy.*\: ACC1, ACC2. Please check.*'
):
_fetch_taxonomy(
self.fake_assembly_ids, self.fake_tax_ids, pd.Series()
self.fake_assembly_ids,
self.fake_tax_ids,
pd.Series(),
_default_ranks,
True
)

# just test that everything works together
Expand Down

0 comments on commit 0091c79

Please sign in to comment.