diff --git a/rescript/citations.bib b/rescript/citations.bib index a483909..7061da8 100644 --- a/rescript/citations.bib +++ b/rescript/citations.bib @@ -167,7 +167,6 @@ @article{nilsson2019unite pages = {D259-D264}, year = {2018}, month = {10}, - abstract = "{UNITE (https://unite.ut.ee/) is a web-based database and sequence management environment for the molecular identification of fungi. It targets the formal fungal barcode—the nuclear ribosomal internal transcribed spacer (ITS) region—and offers all ∼1 000 000 public fungal ITS sequences for reference. These are clustered into ∼459 000 species hypotheses and assigned digital object identifiers (DOIs) to promote unambiguous reference across studies. In-house and web-based third-party sequence curation and annotation have resulted in more than 275 000 improvements to the data over the past 15 years. UNITE serves as a data provider for a range of metabarcoding software pipelines and regularly exchanges data with all major fungal sequence databases and other community resources. 
Recent improvements include redesigned handling of unclassifiable species hypotheses, integration with the taxonomic backbone of the Global Biodiversity Information Facility, and support for an unlimited number of parallel taxonomic classification systems.}", issn = {0305-1048}, doi = {10.1093/nar/gky1022}, url = {https://doi.org/10.1093/nar/gky1022}, diff --git a/rescript/get_unite.py b/rescript/get_unite.py index 9bc055b..79365e5 100644 --- a/rescript/get_unite.py +++ b/rescript/get_unite.py @@ -10,10 +10,13 @@ import tempfile import tarfile import requests -from requests.exceptions import HTTPError from pandas import DataFrame -from q2_types.feature_data import TaxonomyFormat, DNAFASTAFormat, DNAIterator +from q2_types.feature_data import ( + TaxonomyFormat, + MixedCaseDNAFASTAFormat, + DNAIterator, +) # Source: https://unite.ut.ee/repository.php UNITE_DOIS = { @@ -76,6 +79,8 @@ def _unite_get_tgz( for retry in range(retries): # Track downloaded size file_size = 0 + # Prepare error text + dlfail = "File incomplete on try " + str(retry + 1) try: response = requests.get(url, stream=True) # Save .tgz file @@ -89,20 +94,15 @@ def _unite_get_tgz( if file_size == int(response.headers.get("content-length", 0)): return unite_file_path # done! 
else: - raise ValueError("File download incomplete") - except HTTPError as e: - print( - "Request failed with code " - + str(e.response.status_code) - + ", on try " - + str(retry) - ) + raise ValueError(dlfail) except ValueError: - print("File incomplete, on try " + str(retry)) + print(dlfail) + if retry + 1 == retries: + raise ValueError(dlfail) def _unite_get_artifacts( - tgz_file: str = None, cluster_id: str = None + tgz_file: str = None, cluster_id: str = "99" ) -> (DataFrame, DNAIterator): """ Find and import files with matching cluster_id from .tgz @@ -115,7 +115,7 @@ def _unite_get_artifacts( # Keep only _dev files members = [m for m in tar.getmembers() if "_dev" in m.name] if not members: - raise ValueError("No '_dev' files found") + raise ValueError("No '_dev' files found in Unite .tgz file") for member in members: # Keep only base name member.name = os.path.basename(member.name) @@ -126,9 +126,9 @@ def _unite_get_artifacts( filtered_files = [ f for f in files if f.split("_")[4] == cluster_id ] - if not filtered_files or len(filtered_files) > 2: + if not filtered_files or len(filtered_files) != 2: raise ValueError( - "Found " + "Expected 2, but found " + str(len(filtered_files)) + " files found with cluster_id = " + cluster_id @@ -138,13 +138,15 @@ def _unite_get_artifacts( if file.endswith(".txt"): taxa = TaxonomyFormat(fp, mode="r").view(DataFrame) elif file.endswith(".fasta"): - seqs = DNAFASTAFormat(fp, mode="r").view(DNAIterator) + seqs = MixedCaseDNAFASTAFormat(fp, mode="r").view( + DNAIterator + ) return taxa, seqs def get_unite_data( - version: str = None, - taxon_group: str = None, + version: str = "9.0", + taxon_group: str = "eukaryotes", cluster_id: str = "99", singletons: bool = False, ) -> (DataFrame, DNAIterator): diff --git a/rescript/plugin_setup.py b/rescript/plugin_setup.py index 307e732..99bb282 100644 --- a/rescript/plugin_setup.py +++ b/rescript/plugin_setup.py @@ -994,17 +994,18 @@ input_descriptions={}, parameter_descriptions={ 
'version': 'UNITE version to download.', - 'taxon_group': 'Just \'fungi\' or all \'eukaryotes\' in ' - 'the database.', + 'taxon_group': 'Download a database with only \'fungi\' ' + 'or including all \'eukaryotes\'.', 'cluster_id': 'Percent similarity at which sequences in ' 'the of database were clustered.', - 'singletons': 'Included global and 3 percent distance singletons.'}, + 'singletons': 'Include singleton clusters in the database.'}, output_descriptions={ 'taxonomy': 'UNITE reference taxonomy.', 'sequences': 'UNITE reference sequences.'}, name='Download and import UNITE reference data.', description=( - 'Outputs ready-to-use sequence and taxonomy artifacts, given a ' + 'Download and import ITS sequences and taxonomy from the ' + 'UNITE database, given a ' 'version number and taxon_group, with the option to select a ' 'cluster_id and include singletons. ' 'Downloads data directly from UNITE\'s PlutoF REST API. ' + diff --git a/rescript/tests/data/unitefile_no_dev.tgz b/rescript/tests/data/unitefile_no_dev.tgz new file mode 100644 index 0000000..83546ac Binary files /dev/null and b/rescript/tests/data/unitefile_no_dev.tgz differ diff --git a/rescript/tests/test_get_unite.py b/rescript/tests/test_get_unite.py index 495b5c5..9a47d3d 100644 --- a/rescript/tests/test_get_unite.py +++ b/rescript/tests/test_get_unite.py @@ -8,6 +8,8 @@ import pkg_resources import tempfile +import pandas.core.frame +import q2_types.feature_data from qiime2.plugin.testing import TestPluginBase from rescript.get_unite import ( UNITE_DOIS, @@ -18,8 +20,7 @@ ) from urllib.request import urlopen -from urllib.error import HTTPError -from unittest.mock import patch +from unittest.mock import patch, Mock class TestGetUNITE(TestPluginBase): @@ -30,6 +31,9 @@ def setUp(self): self.unitefile = pkg_resources.resource_filename( "rescript.tests", "data/unitefile.tgz" ) + self.unitefile_no_dev = pkg_resources.resource_filename( + "rescript.tests", "data/unitefile_no_dev.tgz" + ) # Requires 
internet access def test_unite_get_url(self): @@ -38,18 +42,22 @@ def test_unite_get_url(self): for tg in UNITE_DOIS[v].keys(): for s in UNITE_DOIS[v][tg].keys(): # ... try to get the URL - try: - url = _unite_get_url(v, tg, s) - urlopen(url) - except HTTPError: - raise ValueError("No URL for combo: " + v + tg + s) + url = _unite_get_url(v, tg, s) + urlopen(url) + self.assertTrue(True) - # Requires internet access def test_unite_get_tgz(self): - # Download a single, small, unrelated file for testing - url = "https://files.plutof.ut.ee/doi/C9/F6/C9F687C997F72F674AA539CB80BF5D5BF6D1F402A2ACF840B20322850D3DFBA4.zip" # noqa E501 with tempfile.TemporaryDirectory() as tmpdirname: - _unite_get_tgz(url, tmpdirname) + # mock the response object + mock_response = Mock() + mock_response.iter_content.return_value = [b"mock"] + mock_response.headers.get.return_value = "4" # matches content + # mock successful download + with patch("requests.get", return_value=mock_response): + _unite_get_tgz("fakeURL", tmpdirname) + # real failed download + with self.assertRaisesRegex(ValueError, "File incomplete on try"): + _unite_get_tgz("https://files.plutof.ut.ee/nope", tmpdirname) def test_unite_get_artifacts(self): # Test on small data/unitefile.tgz with two items inside @@ -66,6 +74,9 @@ def test_unite_get_artifacts(self): str(type(res_two)), "", ) + # test no _dev files found + with self.assertRaises(ValueError): + _unite_get_artifacts(self.unitefile_no_dev, cluster_id="97") # test missing files or misspelled cluster_id with self.assertRaises(ValueError): _unite_get_artifacts(self.unitefile, "nothing") @@ -77,5 +88,13 @@ def test_get_unite_data(self): with patch( "rescript.get_unite._unite_get_tgz", return_value=self.unitefile ): - get_unite_data(version="8.3", taxon_group="fungi", cluster_id="97") - self.assertTrue(True) + res = get_unite_data( + version="8.3", taxon_group="fungi", cluster_id="97" + ) + self.assertEqual(len(res), 2) + self.assertTrue(isinstance(res[0], 
pandas.core.frame.DataFrame)) + self.assertTrue( + isinstance( + res[1], q2_types.feature_data._transformer.DNAIterator + ) + )