Skip to content

Commit

Permalink
patch get_unite (#166)
Browse files Browse the repository at this point in the history
LGTM! Thanks @colinbrislawn!
  • Loading branch information
colinbrislawn authored Nov 14, 2023
1 parent c75d971 commit 92ee605
Show file tree
Hide file tree
Showing 5 changed files with 57 additions and 36 deletions.
1 change: 0 additions & 1 deletion rescript/citations.bib
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,6 @@ @article{nilsson2019unite
pages = {D259-D264},
year = {2018},
month = {10},
abstract = "{UNITE (https://unite.ut.ee/) is a web-based database and sequence management environment for the molecular identification of fungi. It targets the formal fungal barcode—the nuclear ribosomal internal transcribed spacer (ITS) region—and offers all ∼1 000 000 public fungal ITS sequences for reference. These are clustered into ∼459 000 species hypotheses and assigned digital object identifiers (DOIs) to promote unambiguous reference across studies. In-house and web-based third-party sequence curation and annotation have resulted in more than 275 000 improvements to the data over the past 15 years. UNITE serves as a data provider for a range of metabarcoding software pipelines and regularly exchanges data with all major fungal sequence databases and other community resources. Recent improvements include redesigned handling of unclassifiable species hypotheses, integration with the taxonomic backbone of the Global Biodiversity Information Facility, and support for an unlimited number of parallel taxonomic classification systems.}",
issn = {0305-1048},
doi = {10.1093/nar/gky1022},
url = {https://doi.org/10.1093/nar/gky1022},
Expand Down
38 changes: 20 additions & 18 deletions rescript/get_unite.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,13 @@
import tempfile
import tarfile
import requests
from requests.exceptions import HTTPError

from pandas import DataFrame
from q2_types.feature_data import TaxonomyFormat, DNAFASTAFormat, DNAIterator
from q2_types.feature_data import (
TaxonomyFormat,
MixedCaseDNAFASTAFormat,
DNAIterator,
)

# Source: https://unite.ut.ee/repository.php
UNITE_DOIS = {
Expand Down Expand Up @@ -76,6 +79,8 @@ def _unite_get_tgz(
for retry in range(retries):
# Track downloaded size
file_size = 0
# Prepare error text
dlfail = "File incomplete on try " + str(retry + 1)
try:
response = requests.get(url, stream=True)
# Save .tgz file
Expand All @@ -89,20 +94,15 @@ def _unite_get_tgz(
if file_size == int(response.headers.get("content-length", 0)):
return unite_file_path # done!
else:
raise ValueError("File download incomplete")
except HTTPError as e:
print(
"Request failed with code "
+ str(e.response.status_code)
+ ", on try "
+ str(retry)
)
raise ValueError(dlfail)
except ValueError:
print("File incomplete, on try " + str(retry))
print(dlfail)
if retry + 1 == retries:
raise ValueError(dlfail)


def _unite_get_artifacts(
tgz_file: str = None, cluster_id: str = None
tgz_file: str = None, cluster_id: str = "99"
) -> (DataFrame, DNAIterator):
"""
Find and import files with matching cluster_id from .tgz
Expand All @@ -115,7 +115,7 @@ def _unite_get_artifacts(
# Keep only _dev files
members = [m for m in tar.getmembers() if "_dev" in m.name]
if not members:
raise ValueError("No '_dev' files found")
raise ValueError("No '_dev' files found in Unite .tgz file")
for member in members:
# Keep only base name
member.name = os.path.basename(member.name)
Expand All @@ -126,9 +126,9 @@ def _unite_get_artifacts(
filtered_files = [
f for f in files if f.split("_")[4] == cluster_id
]
if not filtered_files or len(filtered_files) > 2:
if not filtered_files or len(filtered_files) != 2:
raise ValueError(
"Found "
"Expected 2, but found "
+ str(len(filtered_files))
+ " files found with cluster_id = "
+ cluster_id
Expand All @@ -138,13 +138,15 @@ def _unite_get_artifacts(
if file.endswith(".txt"):
taxa = TaxonomyFormat(fp, mode="r").view(DataFrame)
elif file.endswith(".fasta"):
seqs = DNAFASTAFormat(fp, mode="r").view(DNAIterator)
seqs = MixedCaseDNAFASTAFormat(fp, mode="r").view(
DNAIterator
)
return taxa, seqs


def get_unite_data(
version: str = None,
taxon_group: str = None,
version: str = "9.0",
taxon_group: str = "eukaryotes",
cluster_id: str = "99",
singletons: bool = False,
) -> (DataFrame, DNAIterator):
Expand Down
9 changes: 5 additions & 4 deletions rescript/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -994,17 +994,18 @@
input_descriptions={},
parameter_descriptions={
'version': 'UNITE version to download.',
'taxon_group': 'Just \'fungi\' or all \'eukaryotes\' in '
'the database.',
'taxon_group': 'Download a database with only \'fungi\' '
'or including all \'eukaryotes\'.',
'cluster_id': 'Percent similarity at which sequences in '
'the of database were clustered.',
'singletons': 'Included global and 3 percent distance singletons.'},
'singletons': 'Include singleton clusters in the database.'},
output_descriptions={
'taxonomy': 'UNITE reference taxonomy.',
'sequences': 'UNITE reference sequences.'},
name='Download and import UNITE reference data.',
description=(
'Outputs ready-to-use sequence and taxonomy artifacts, given a '
'Download and import ITS sequences and taxonomy from the '
'UNITE database, given a '
'version number and taxon_group, with the option to select a '
'cluster_id and include singletons. '
'Downloads data directly from UNITE\'s PlutoF REST API. ' +
Expand Down
Binary file added rescript/tests/data/unitefile_no_dev.tgz
Binary file not shown.
45 changes: 32 additions & 13 deletions rescript/tests/test_get_unite.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@

import pkg_resources
import tempfile
import pandas.core.frame
import q2_types.feature_data
from qiime2.plugin.testing import TestPluginBase
from rescript.get_unite import (
UNITE_DOIS,
Expand All @@ -18,8 +20,7 @@
)

from urllib.request import urlopen
from urllib.error import HTTPError
from unittest.mock import patch
from unittest.mock import patch, Mock


class TestGetUNITE(TestPluginBase):
Expand All @@ -30,6 +31,9 @@ def setUp(self):
self.unitefile = pkg_resources.resource_filename(
"rescript.tests", "data/unitefile.tgz"
)
self.unitefile_no_dev = pkg_resources.resource_filename(
"rescript.tests", "data/unitefile_no_dev.tgz"
)

# Requires internet access
def test_unite_get_url(self):
Expand All @@ -38,18 +42,22 @@ def test_unite_get_url(self):
for tg in UNITE_DOIS[v].keys():
for s in UNITE_DOIS[v][tg].keys():
# ... try to get the URL
try:
url = _unite_get_url(v, tg, s)
urlopen(url)
except HTTPError:
raise ValueError("No URL for combo: " + v + tg + s)
url = _unite_get_url(v, tg, s)
urlopen(url)
self.assertTrue(True)

# Requires internet access
def test_unite_get_tgz(self):
# Download a single, small, unrelated file for testing
url = "https://files.plutof.ut.ee/doi/C9/F6/C9F687C997F72F674AA539CB80BF5D5BF6D1F402A2ACF840B20322850D3DFBA4.zip" # noqa E501
with tempfile.TemporaryDirectory() as tmpdirname:
_unite_get_tgz(url, tmpdirname)
# mock the response object
mock_response = Mock()
mock_response.iter_content.return_value = [b"mock"]
mock_response.headers.get.return_value = "4" # matches content
# mock successful download
with patch("requests.get", return_value=mock_response):
_unite_get_tgz("fakeURL", tmpdirname)
# real failed download
with self.assertRaisesRegex(ValueError, "File incomplete on try"):
_unite_get_tgz("https://files.plutof.ut.ee/nope", tmpdirname)

def test_unite_get_artifacts(self):
# Test on small data/unitefile.tgz with two items inside
Expand All @@ -66,6 +74,9 @@ def test_unite_get_artifacts(self):
str(type(res_two)),
"<class 'q2_types.feature_data._transformer.DNAIterator'>",
)
# test no _dev files found
with self.assertRaises(ValueError):
_unite_get_artifacts(self.unitefile_no_dev, cluster_id="97")
# test missing files or misspelled cluster_id
with self.assertRaises(ValueError):
_unite_get_artifacts(self.unitefile, "nothing")
Expand All @@ -77,5 +88,13 @@ def test_get_unite_data(self):
with patch(
"rescript.get_unite._unite_get_tgz", return_value=self.unitefile
):
get_unite_data(version="8.3", taxon_group="fungi", cluster_id="97")
self.assertTrue(True)
res = get_unite_data(
version="8.3", taxon_group="fungi", cluster_id="97"
)
self.assertEqual(len(res), 2)
self.assertTrue(isinstance(res[0], pandas.core.frame.DataFrame))
self.assertTrue(
isinstance(
res[1], q2_types.feature_data._transformer.DNAIterator
)
)

0 comments on commit 92ee605

Please sign in to comment.