Skip to content
This repository has been archived by the owner on May 21, 2024. It is now read-only.

ENH: Update NCBITaxonomyDirFmt to accommodate data-version file #73

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 83 additions & 2 deletions q2_types_genomics/reference_db/_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------


import datetime
import gzip
import re
from qiime2.plugin import model
Expand Down Expand Up @@ -263,8 +263,88 @@ def _validate_(self, level):
line_no += 1


class NCBITaxonomyVersionFormat(model.TextFileFormat):
    """Tab-separated ``version.tsv`` file with one timestamp per data file.

    The first line must be the header ``file_name``, ``date``, ``time``.
    Every following line names one of ``nodes.dmp``, ``names.dmp`` or
    ``prot.accession2taxid.gz`` (each at most once), followed by a date
    written as ``DD/MM/YYYY`` and a time written as ``HH:MM:SS``.

    Files with fewer than three entries are accepted; only more than
    three entries, unknown names and repeated names are rejected.
    """

    def _validate_header(self, lines):
        """Check that the first line holds exactly the expected columns.

        Parameters
        ----------
        lines : list of str
            All lines of the file, header first.

        Raises
        ------
        ValidationError
            If the header has more than three columns, or if it is not
            exactly ``file_name``, ``date``, ``time`` (this includes an
            empty file and a header with too few columns).
        """
        expected = ['file_name', 'date', 'time']
        # An empty file has no header line; treat it as an empty header so
        # it is reported as a ValidationError instead of an IndexError.
        header = lines[0].strip("\n").split("\t") if lines else []

        if len(header) > 3:
            raise ValidationError(
                "Too many columns.\n"
                "Expected columns:\n"
                "['file_name', 'date', 'time']\n"
                "Columns given:\n"
                f"{header}"
            )

        # Comparing whole lists also rejects headers with too few columns,
        # which would otherwise fail on an out-of-range index.
        if header != expected:
            raise ValidationError(
                "Wrong columns.\n"
                "Expected columns:\n"
                "['file_name', 'date', 'time']\n"
                "Columns given:\n"
                f"{header}"
            )

    @staticmethod
    def _validate_date(date_field):
        """Raise ValidationError unless ``date_field`` is DD/MM/YYYY."""
        try:
            # Unpacking, int() and datetime.date() all signal bad input
            # with ValueError, so a single handler covers every case.
            day, month, year = date_field.split("/")
            datetime.date(day=int(day), month=int(month), year=int(year))
        except ValueError as err:
            raise ValidationError(
                "Invalid date found in version.tsv\n"
                "Printing invalid date:\n"
                f"{date_field}"
            ) from err

    @staticmethod
    def _validate_time(time_field):
        """Raise ValidationError unless ``time_field`` is HH:MM:SS."""
        try:
            hour, minute, second = time_field.split(":")
            datetime.time(
                hour=int(hour), minute=int(minute), second=int(second)
            )
        except ValueError as err:
            raise ValidationError(
                "Invalid time found in version.tsv\n"
                "Printing invalid time:\n"
                f"{time_field}\n"
            ) from err

    def _validate_body(self, lines):
        """Check every entry after the header line.

        Parameters
        ----------
        lines : list of str
            All lines of the file, header first.

        Raises
        ------
        ValidationError
            If there are more than three entries, if a file name is
            unknown or repeated, if an entry lacks a date or time field,
            or if the date or time cannot be parsed.
        """
        # Names still allowed; each one is removed once it has been seen.
        remaining = ['nodes.dmp', 'names.dmp', 'prot.accession2taxid.gz']
        entries = lines[1:]

        if len(entries) > 3:
            raise ValidationError(
                "Too many entries. "
                f"There should only be 3, namely: {remaining}\n"
                "Printing entires in version.tsv\n"
                f"{lines}"
            )

        for line in entries:
            fields = line.strip("\n").split("\t")

            # Raise error if file name is not valid
            if fields[0] not in remaining:
                raise ValidationError(
                    "Invalid or repeated filename found in version.tsv.\n"
                    "Printing entires in version.tsv\n"
                    f"{lines}"
                )
            # Remove file name to ensure it is not repeated
            remaining.remove(fields[0])

            # A row without date and time columns would otherwise fail
            # with an IndexError when the fields are looked up below.
            if len(fields) < 3:
                raise ValidationError(
                    "Missing date or time in version.tsv\n"
                    "Printing invalid entry:\n"
                    f"{fields}"
                )

            self._validate_date(fields[1])
            self._validate_time(fields[2])

    def _validate_(self, level):
        """Read the whole file and validate its header and entries.

        ``level`` is accepted but not used: the same checks always run.
        """
        with open(str(self), "r") as file:
            lines = file.readlines()
            self._validate_header(lines)
            self._validate_body(lines)


plugin.register_formats(
NCBITaxonomyNodesFormat, NCBITaxonomyNamesFormat, NCBITaxonomyBinaryFileFmt
NCBITaxonomyNodesFormat, NCBITaxonomyNamesFormat,
NCBITaxonomyBinaryFileFmt, NCBITaxonomyVersionFormat
)


Expand All @@ -275,6 +355,7 @@ class NCBITaxonomyDirFmt(model.DirectoryFormat):
'prot.accession2taxid.gz',
format=NCBITaxonomyBinaryFileFmt
)
version = model.File("version.tsv", format=NCBITaxonomyVersionFormat)


plugin.register_formats(NCBITaxonomyDirFmt)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
file_name date time
names.dmp 01/12/2023 10:28:10
nodes.dmp 01/12/2023 10:27:36
prot.accession2taxid.gz 05/12/2023 10:33:51
4 changes: 4 additions & 0 deletions q2_types_genomics/reference_db/tests/data/ncbi/version.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
file_name date time
names.dmp 01/12/2023 10:28:10
nodes.dmp 01/12/2023 10:27:36
prot.accession2taxid.gz 05/12/2023 10:33:51
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
file_name date time
names.dmp 01/13/2023 10:28:10
nodes.dmp 01/12/2023 10:27:36
prot.accession2taxid.gz 05/12/2023 10:33:51
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
file_name date time
names.dmp 01/12/2023 10:28:10
nodes.dmp 01/12/2023 10:27:36
something_else 05/12/2023 10:33:51
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
file_name date time
names.dmp 01/12/2023 10:28:10
nodes.dmp 01/12/2023 10:27:36
prot.accession2taxid.gz 05/12/2023 25:33:51
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
file_name date time
names.dmp 01/12/2023 10:28:10
nodes.dmp 01/12/2023 10:27:36
nodes.dmp 05/12/2023 10:33:51
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
file_name date time something else
names.dmp 01/12/2023 10:28:10
nodes.dmp 01/12/2023 10:27:36
prot.accession2taxid.gz 05/12/2023 10:33:51
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
file_name date time
names.dmp 01/12/2023 10:28:10
nodes.dmp 01/12/2023 10:27:36
prot.accession2taxid.gz 05/12/2023 10:33:51
prot.accession2taxid.gz 05/12/2023 10:33:51
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
file_name date something else
names.dmp 01/12/2023 10:28:10
nodes.dmp 01/12/2023 10:27:36
prot.accession2taxid.gz 05/12/2023 10:33:51
71 changes: 70 additions & 1 deletion q2_types_genomics/reference_db/tests/test_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@
DiamondDatabaseFileFmt, DiamondDatabaseDirFmt, EggnogRefBinFileFmt,
EggnogRefDirFmt, NCBITaxonomyNamesFormat, NCBITaxonomyNodesFormat,
NCBITaxonomyDirFmt, NCBITaxonomyBinaryFileFmt,
EggnogProteinSequencesDirFmt, EggnogRefTextFileFmt
EggnogProteinSequencesDirFmt, EggnogRefTextFileFmt,
NCBITaxonomyVersionFormat
)
from qiime2.plugin import ValidationError

Expand Down Expand Up @@ -266,3 +267,71 @@ def test_binary_file_fmt_wrong_gi(self):
r"['A0A009IHW8', 'A0A009IHW8.1', '1310613', '1835922267s']"
):
format.validate()

def test_version_file_fmt_positive(self):
    # The db-valid fixture must pass validation without raising.
    version_fp = self.get_data_path("ncbi/db-valid/version.tsv")
    NCBITaxonomyVersionFormat(version_fp, mode="r").validate()

def test_version_file_fmt_too_many_cols(self):
    # The fixture must be rejected with the "Too many columns" error.
    version_fp = self.get_data_path("ncbi/version_too_many_cols.tsv")
    fmt = NCBITaxonomyVersionFormat(version_fp, mode="r")
    with self.assertRaisesRegex(ValidationError, "Too many columns"):
        fmt.validate()

def test_version_file_fmt_wrong_cols(self):
    # The fixture must be rejected with the "Wrong columns" error.
    version_fp = self.get_data_path("ncbi/version_wrong_cols.tsv")
    fmt = NCBITaxonomyVersionFormat(version_fp, mode="r")
    with self.assertRaisesRegex(ValidationError, "Wrong columns"):
        fmt.validate()

def test_version_file_fmt_too_many_entries(self):
    # The fixture must be rejected with the "Too many entries" error.
    version_fp = self.get_data_path("ncbi/version_too_many_entries.tsv")
    fmt = NCBITaxonomyVersionFormat(version_fp, mode="r")
    with self.assertRaisesRegex(ValidationError, "Too many entries"):
        fmt.validate()

def test_version_file_fmt_invalid_filename(self):
    # The fixture must be rejected with the filename error.
    version_fp = self.get_data_path("ncbi/version_invalid_filename.tsv")
    fmt = NCBITaxonomyVersionFormat(version_fp, mode="r")
    expected_msg = "Invalid or repeated filename"
    with self.assertRaisesRegex(ValidationError, expected_msg):
        fmt.validate()

def test_version_file_fmt_repeated_filename(self):
    # The fixture must be rejected with the filename error.
    version_fp = self.get_data_path("ncbi/version_repeated_filename.tsv")
    fmt = NCBITaxonomyVersionFormat(version_fp, mode="r")
    expected_msg = "Invalid or repeated filename"
    with self.assertRaisesRegex(ValidationError, expected_msg):
        fmt.validate()

def test_version_file_fmt_invalid_date(self):
    # The fixture must be rejected with the "Invalid date" error.
    version_fp = self.get_data_path("ncbi/version_invalid_date.tsv")
    fmt = NCBITaxonomyVersionFormat(version_fp, mode="r")
    with self.assertRaisesRegex(ValidationError, "Invalid date"):
        fmt.validate()

def test_version_file_fmt_invalid_time(self):
    # The fixture must be rejected with the "Invalid time" error.
    version_fp = self.get_data_path("ncbi/version_invalid_time.tsv")
    fmt = NCBITaxonomyVersionFormat(version_fp, mode="r")
    with self.assertRaisesRegex(ValidationError, "Invalid time"):
        fmt.validate()