diff --git a/q2_types_genomics/reference_db/__init__.py b/q2_types_genomics/reference_db/__init__.py index f17c079..a86c545 100644 --- a/q2_types_genomics/reference_db/__init__.py +++ b/q2_types_genomics/reference_db/__init__.py @@ -19,10 +19,13 @@ DiamondDatabaseFileFmt, DiamondDatabaseDirFmt, NCBITaxonomyDirFmt, + NCBITaxonomyVersionFormat, EggnogProteinSequencesDirFmt ) __all__ = ['ReferenceDB', 'Diamond', 'Eggnog', 'DiamondDatabaseFileFmt', 'DiamondDatabaseDirFmt', 'EggnogRefDirFmt', 'EggnogRefTextFileFmt', 'EggnogRefBinFileFmt', 'NCBITaxonomyDirFmt', 'NCBITaxonomy', - 'EggnogProteinSequencesDirFmt', 'EggnogProteinSequences'] + 'EggnogProteinSequencesDirFmt', 'EggnogProteinSequences', + 'NCBITaxonomyVersionFormat' + ] diff --git a/q2_types_genomics/reference_db/_format.py b/q2_types_genomics/reference_db/_format.py index 574f331..0deb798 100644 --- a/q2_types_genomics/reference_db/_format.py +++ b/q2_types_genomics/reference_db/_format.py @@ -6,7 +6,7 @@ # The full license is in the file LICENSE, distributed with this software. 
class NCBITaxonomyVersionFormat(model.TextFileFormat):
    """Tab-separated table holding a date and a time per NCBI taxonomy file.

    The first line must be the header ``file_name<TAB>date<TAB>time``. It may
    be followed by at most three rows, each naming one of ``nodes.dmp``,
    ``names.dmp`` or ``prot.accession2taxid.gz`` (any order, no repeats).
    Dates are written ``day/month/year`` and times ``hour:minute:second``.
    """

    def _validate_header(self, lines):
        """Check that the first line holds exactly the expected column names.

        Parameters
        ----------
        lines : list of str
            Every line of the file, as returned by ``readlines()``.

        Raises
        ------
        ValidationError
            If the file is empty, has more than three columns, or the
            columns are not ``file_name``, ``date``, ``time`` in that order.
        """
        # An empty file has no header at all; report it as a validation
        # problem instead of letting ``lines[0]`` raise IndexError.
        if not lines:
            raise ValidationError(
                "Empty file.\n"
                "Expected columns:\n"
                "['file_name', 'date', 'time']"
            )

        first_line = lines[0].strip("\n").split("\t")
        if len(first_line) > 3:
            raise ValidationError(
                "Too many columns.\n"
                "Expected columns:\n"
                "['file_name', 'date', 'time']\n"
                "Columns given:\n"
                f"{first_line}"
            )

        # Comparing whole lists also covers headers with fewer than three
        # columns, which would otherwise fail on an out-of-range index.
        if first_line != ['file_name', 'date', 'time']:
            raise ValidationError(
                "Wrong columns.\n"
                "Expected columns:\n"
                "['file_name', 'date', 'time']\n"
                "Columns given:\n"
                f"{first_line}"
            )

    def _validate_body(self, lines):
        """Check every row after the header.

        Parameters
        ----------
        lines : list of str
            Every line of the file, header included; the header is skipped.

        Raises
        ------
        ValidationError
            If there are more than three rows, a file name is unknown or
            repeated, a row lacks a date or time field, or the date or time
            does not describe a real calendar date / clock time.
        """
        file_names = ['nodes.dmp', 'names.dmp', 'prot.accession2taxid.gz']
        entries = lines[1:]
        if len(entries) > 3:
            raise ValidationError(
                "Too many entries. "
                f"There should only be 3, namely: {file_names}\n"
                "Printing entires in version.tsv\n"
                f"{lines}"
            )

        for line in entries:
            fields = line.strip("\n").split("\t")

            # Raise error if file name is not valid
            if fields[0] not in file_names:
                raise ValidationError(
                    "Invalid or repeated filename found in version.tsv.\n"
                    "Printing entires in version.tsv\n"
                    f"{lines}"
                )
            # Remove the file name to ensure it is not repeated
            file_names.remove(fields[0])

            # A row without date/time columns must be rejected explicitly;
            # otherwise the field lookups below raise IndexError.
            if len(fields) < 3:
                raise ValidationError(
                    "Missing date or time in version.tsv\n"
                    "Printing invalid entry:\n"
                    f"{fields}"
                )

            # Raise error if invalid date. ValueError covers a wrong number
            # of "/"-separated parts, non-integer parts and impossible dates.
            try:
                day, month, year = fields[1].split("/")
                datetime.date(day=int(day), month=int(month), year=int(year))
            except ValueError as err:
                raise ValidationError(
                    "Invalid date found in version.tsv\n"
                    "Printing invalid date:\n"
                    f"{fields[1]}"
                ) from err

            # Raise error if invalid time (same ValueError cases as above)
            try:
                hour, minute, second = fields[2].split(":")
                datetime.time(
                    hour=int(hour), minute=int(minute), second=int(second)
                )
            except ValueError as err:
                raise ValidationError(
                    "Invalid time found in version.tsv\n"
                    "Printing invalid time:\n"
                    f"{fields[2]}\n"
                ) from err

    def _validate_(self, level):
        """Validate the header and then the rows of the file.

        ``level`` is accepted but not consulted; the same checks always run.
        """
        with open(str(self), "r") as file:
            lines = file.readlines()

        self._validate_header(lines)
        self._validate_body(lines)
10:33:51 \ No newline at end of file diff --git a/q2_types_genomics/reference_db/tests/data/ncbi/version_invalid_filename.tsv b/q2_types_genomics/reference_db/tests/data/ncbi/version_invalid_filename.tsv new file mode 100644 index 0000000..880a126 --- /dev/null +++ b/q2_types_genomics/reference_db/tests/data/ncbi/version_invalid_filename.tsv @@ -0,0 +1,4 @@ +file_name date time +names.dmp 01/12/2023 10:28:10 +nodes.dmp 01/12/2023 10:27:36 +something_else 05/12/2023 10:33:51 \ No newline at end of file diff --git a/q2_types_genomics/reference_db/tests/data/ncbi/version_invalid_time.tsv b/q2_types_genomics/reference_db/tests/data/ncbi/version_invalid_time.tsv new file mode 100644 index 0000000..12ba169 --- /dev/null +++ b/q2_types_genomics/reference_db/tests/data/ncbi/version_invalid_time.tsv @@ -0,0 +1,4 @@ +file_name date time +names.dmp 01/12/2023 10:28:10 +nodes.dmp 01/12/2023 10:27:36 +prot.accession2taxid.gz 05/12/2023 25:33:51 \ No newline at end of file diff --git a/q2_types_genomics/reference_db/tests/data/ncbi/version_repeated_filename.tsv b/q2_types_genomics/reference_db/tests/data/ncbi/version_repeated_filename.tsv new file mode 100644 index 0000000..84b5ff7 --- /dev/null +++ b/q2_types_genomics/reference_db/tests/data/ncbi/version_repeated_filename.tsv @@ -0,0 +1,4 @@ +file_name date time +names.dmp 01/12/2023 10:28:10 +nodes.dmp 01/12/2023 10:27:36 +nodes.dmp 05/12/2023 10:33:51 \ No newline at end of file diff --git a/q2_types_genomics/reference_db/tests/data/ncbi/version_too_many_cols.tsv b/q2_types_genomics/reference_db/tests/data/ncbi/version_too_many_cols.tsv new file mode 100644 index 0000000..a32a1ec --- /dev/null +++ b/q2_types_genomics/reference_db/tests/data/ncbi/version_too_many_cols.tsv @@ -0,0 +1,4 @@ +file_name date time something else +names.dmp 01/12/2023 10:28:10 +nodes.dmp 01/12/2023 10:27:36 +prot.accession2taxid.gz 05/12/2023 10:33:51 \ No newline at end of file diff --git 
def _assert_version_file_invalid(self, relative_path, message):
    """Assert that validating the given fixture fails with ``message``."""
    fixture = self.get_data_path(relative_path)
    version_fmt = NCBITaxonomyVersionFormat(fixture, mode="r")
    with self.assertRaisesRegex(ValidationError, message):
        version_fmt.validate()

def test_version_file_fmt_positive(self):
    """A well-formed version.tsv validates without raising."""
    fixture = self.get_data_path("ncbi/db-valid/version.tsv")
    version_fmt = NCBITaxonomyVersionFormat(fixture, mode="r")
    version_fmt.validate()

def test_version_file_fmt_too_many_cols(self):
    """A header with extra columns is rejected."""
    self._assert_version_file_invalid(
        "ncbi/version_too_many_cols.tsv", "Too many columns"
    )

def test_version_file_fmt_wrong_cols(self):
    """A header with unexpected column names is rejected."""
    self._assert_version_file_invalid(
        "ncbi/version_wrong_cols.tsv", "Wrong columns"
    )

def test_version_file_fmt_too_many_entries(self):
    """More than three rows after the header are rejected."""
    self._assert_version_file_invalid(
        "ncbi/version_too_many_entries.tsv", "Too many entries"
    )

def test_version_file_fmt_invalid_filename(self):
    """A row naming an unknown file is rejected."""
    self._assert_version_file_invalid(
        "ncbi/version_invalid_filename.tsv", "Invalid or repeated filename"
    )

def test_version_file_fmt_repeated_filename(self):
    """A row repeating an already listed file is rejected."""
    self._assert_version_file_invalid(
        "ncbi/version_repeated_filename.tsv", "Invalid or repeated filename"
    )

def test_version_file_fmt_invalid_date(self):
    """A row whose date is not a real calendar date is rejected."""
    self._assert_version_file_invalid(
        "ncbi/version_invalid_date.tsv", "Invalid date"
    )

def test_version_file_fmt_invalid_time(self):
    """A row whose time is not a real clock time is rejected."""
    self._assert_version_file_invalid(
        "ncbi/version_invalid_time.tsv", "Invalid time"
    )