From 66820b4d2141ed9aa4e3d1e1843c745cf0994d2c Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Wed, 8 May 2024 16:10:48 +0200 Subject: [PATCH 1/7] added FeatureData[SequenceCharacteristics] type with semantic validation --- q2_types/feature_data/__init__.py | 8 ++- q2_types/feature_data/_format.py | 33 +++++++++++- q2_types/feature_data/_transformer.py | 15 +++++- q2_types/feature_data/_type.py | 43 +++++++++++++-- q2_types/feature_data/tests/test_format.py | 36 ++++++++++++- .../feature_data/tests/test_transformer.py | 31 ++++++++++- q2_types/feature_data/tests/test_type.py | 54 ++++++++++++++++++- 7 files changed, 208 insertions(+), 12 deletions(-) diff --git a/q2_types/feature_data/__init__.py b/q2_types/feature_data/__init__.py index 3e6eacbd..14d131e5 100644 --- a/q2_types/feature_data/__init__.py +++ b/q2_types/feature_data/__init__.py @@ -27,11 +27,13 @@ MixedCaseAlignedDNAFASTAFormat, MixedCaseAlignedDNASequencesDirectoryFormat, MixedCaseAlignedRNAFASTAFormat, - MixedCaseAlignedRNASequencesDirectoryFormat) + MixedCaseAlignedRNASequencesDirectoryFormat, + SequenceCharacteristicsDirectoryFormat, + SequenceCharacteristicsFormat) from ._type import ( FeatureData, Taxonomy, Sequence, PairedEndSequence, AlignedSequence, Differential, ProteinSequence, AlignedProteinSequence, RNASequence, - AlignedRNASequence, PairedEndRNASequence, BLAST6) + AlignedRNASequence, PairedEndRNASequence, BLAST6, SequenceCharacteristics) # TODO remove these imports when tests are rewritten. Remove from __all__ too from ._transformer import ( @@ -67,6 +69,8 @@ 'MixedCaseAlignedProteinFASTAFormat', 'MixedCaseProteinSequencesDirectoryFormat', 'MixedCaseAlignedProteinSequencesDirectoryFormat', + 'SequenceCharacteristics', 'SequenceCharacteristicsDirectoryFormat', + 'SequenceCharacteristicsFormat' ] importlib.import_module('q2_types.feature_data._transformer') diff --git a/q2_types/feature_data/_format.py b/q2_types/feature_data/_format.py index a15a5dbb..69c18c70 100644 --- a/q2_types/feature_data/_format.py +++ b/q2_types/feature_data/_format.py @@ -470,6 +470,36 @@ def validate(self, *args): 'BLAST6DirectoryFormat', 'blast6.tsv', BLAST6Format) +class SequenceCharacteristicsFormat(model.TextFileFormat): + """ + Format for a TSV file with information about sequences like length of a + feature. The first column contains feature identifiers and is followed by + other optional columns. + + The file cannot be empty and must have at least two columns. + + Validation for additional columns can be added with a semantic validator + tied to a property. For example the + "validate_sequence_characteristics_length" validator for + "FeatureData[SequenceCharacteristics % Properties("length")]" + adds validation for a numerical column called "length". + """ + + def validate(self, n_records=None): + try: + data = pd.read_csv(str(self), sep="\t", index_col=0) + except pd.errors.EmptyDataError: + raise ValidationError('File cannot be empty.') + + if not data.columns.any(): + raise ValidationError('File needs to have at least two columns.') + + +SequenceCharacteristicsDirectoryFormat = model.SingleFileDirectoryFormat( + "SequenceCharacteristicsDirectoryFormat", + "sequence_characteristics.txt", SequenceCharacteristicsFormat +) + plugin.register_formats( TSVTaxonomyFormat, TSVTaxonomyDirectoryFormat, HeaderlessTSVTaxonomyFormat, HeaderlessTSVTaxonomyDirectoryFormat, @@ -489,5 +519,6 @@ def validate(self, *args): MixedCaseRNASequencesDirectoryFormat, MixedCaseAlignedDNAFASTAFormat, MixedCaseAlignedDNASequencesDirectoryFormat, MixedCaseAlignedRNAFASTAFormat, - MixedCaseAlignedRNASequencesDirectoryFormat + MixedCaseAlignedRNASequencesDirectoryFormat, SequenceCharacteristicsFormat, + SequenceCharacteristicsDirectoryFormat ) diff --git a/q2_types/feature_data/_transformer.py b/q2_types/feature_data/_transformer.py index c5af41ca..33ea679b 100644 --- a/q2_types/feature_data/_transformer.py +++ b/q2_types/feature_data/_transformer.py @@ -22,7 +22,8 @@ AlignedProteinFASTAFormat, RNAFASTAFormat, AlignedRNAFASTAFormat, PairedRNASequencesDirectoryFormat, BLAST6Format, MixedCaseDNAFASTAFormat, MixedCaseRNAFASTAFormat, - MixedCaseAlignedDNAFASTAFormat, MixedCaseAlignedRNAFASTAFormat) + MixedCaseAlignedDNAFASTAFormat, MixedCaseAlignedRNAFASTAFormat, + SequenceCharacteristicsFormat) # Taxonomy format transformers @@ -797,3 +798,15 @@ def _227(ff: BLAST6Format) -> qiime2.Metadata: # default int index but cast to a str and give it a name. data.index = pd.Index(data.index.astype(str), name='id') return qiime2.Metadata(data) + + +@plugin.register_transformer +def _228(ff: SequenceCharacteristicsFormat) -> pd.DataFrame: + return pd.read_csv(str(ff), sep="\t", index_col=0) + + +@plugin.register_transformer +def _229(data: pd.DataFrame) -> SequenceCharacteristicsFormat: + ff = SequenceCharacteristicsFormat() + data.to_csv(str(ff), sep='\t') + return ff diff --git a/q2_types/feature_data/_type.py b/q2_types/feature_data/_type.py index db79f87e..6484b3dc 100644 --- a/q2_types/feature_data/_type.py +++ b/q2_types/feature_data/_type.py @@ -5,7 +5,9 @@ # # The full license is in the file LICENSE, distributed with this software. # ---------------------------------------------------------------------------- - +import pandas as pd +from qiime2.core.exceptions import ValidationError +from qiime2.core.type import Properties from qiime2.plugin import SemanticType from ..plugin_setup import plugin @@ -15,7 +17,8 @@ DifferentialDirectoryFormat, ProteinSequencesDirectoryFormat, AlignedProteinSequencesDirectoryFormat, RNASequencesDirectoryFormat, AlignedRNASequencesDirectoryFormat, - PairedRNASequencesDirectoryFormat, BLAST6DirectoryFormat) + PairedRNASequencesDirectoryFormat, BLAST6DirectoryFormat, + SequenceCharacteristicsDirectoryFormat) from q2_types.sample_data import SampleData @@ -52,13 +55,40 @@ variant_of=[FeatureData.field['type'], SampleData.field['type']]) +SequenceCharacteristics = SemanticType('SequenceCharacteristics', + variant_of=FeatureData.field['type']) + + +@plugin.register_validator(FeatureData[SequenceCharacteristics % + Properties("length")]) +def validate_sequence_characteristics_length(data: pd.DataFrame, level): + """ + Semantic validator that validates a numerical column called 'length', + which cannot contain empty or negative values, for the + FeatureData[SequenceCharacteristics] type with property "length". + """ + if 'length' not in data.columns: + raise ValidationError("Column 'length' has to exist in the file.") + + if data['length'].isnull().any(): + raise ValidationError("Column 'length' cannot contain empty (NaN) " + "values.") + + if not pd.api.types.is_numeric_dtype(data['length']): + raise ValidationError("Values in column 'length' have to be " + "numerical.") + + if not (data['length'] > 0).all(): + raise ValidationError("Column 'length' cannot contain negative " + "values.") + + plugin.register_semantic_types(FeatureData, Taxonomy, Sequence, PairedEndSequence, AlignedSequence, Differential, ProteinSequence, AlignedProteinSequence, RNASequence, AlignedRNASequence, PairedEndRNASequence, - BLAST6) - + BLAST6, SequenceCharacteristics) plugin.register_artifact_class( FeatureData[Taxonomy], @@ -120,3 +150,8 @@ directory_format=BLAST6DirectoryFormat, description=("BLAST results associated with a set of feature " "identifiers.")) +plugin.register_artifact_class( + FeatureData[SequenceCharacteristics], + directory_format=SequenceCharacteristicsDirectoryFormat, + description=("Characteristics of sequences (e.g., the length of a genes " + "in basepairs).")) diff --git a/q2_types/feature_data/tests/test_format.py b/q2_types/feature_data/tests/test_format.py index 6a703137..6a356171 100644 --- a/q2_types/feature_data/tests/test_format.py +++ b/q2_types/feature_data/tests/test_format.py @@ -29,7 +29,9 @@ MixedCaseRNAFASTAFormat, MixedCaseRNASequencesDirectoryFormat, MixedCaseAlignedDNAFASTAFormat, MixedCaseAlignedDNASequencesDirectoryFormat, - MixedCaseAlignedRNAFASTAFormat, MixedCaseAlignedRNASequencesDirectoryFormat + MixedCaseAlignedRNAFASTAFormat, + MixedCaseAlignedRNASequencesDirectoryFormat, + SequenceCharacteristicsDirectoryFormat, SequenceCharacteristicsFormat ) from qiime2.plugin.testing import TestPluginBase from qiime2.plugin import ValidationError @@ -901,5 +903,37 @@ def test_blast6_format_invalid(self): BLAST6DirectoryFormat(temp_dir, mode='r').validate() +class TestSequenceCharacteristicsFormat(TestPluginBase): + package = 'q2_types.feature_data.tests' + + def test_sequence_characteristics_directory_format(self): + filepath = self.get_data_path('sequence_characteristics_length.txt') + temp_dir = self.temp_dir.name + shutil.copy(filepath, os.path.join(temp_dir, + 'sequence_characteristics.txt')) + format = SequenceCharacteristicsDirectoryFormat(temp_dir, mode='r') + format.validate() + + def test_sequence_characteristics_format(self): + filepath = self.get_data_path('sequence_characteristics_length.txt') + format = SequenceCharacteristicsFormat(filepath, mode='r') + format.validate() + + def test_sequence_characteristics_format_empty(self): + path = self.get_data_path('empty.txt') + format = SequenceCharacteristicsFormat(path, mode='r') + with self.assertRaises(ValidationError) as context: + format.validate() + self.assertEqual(str(context.exception), 'File cannot be empty.') + + def test_sequence_characteristics_format_only_index(self): + path = self.get_data_path('sequence_characteristics_only_index.txt') + format = SequenceCharacteristicsFormat(path, mode='r') + with self.assertRaises(ValidationError) as context: + format.validate() + self.assertEqual(str(context.exception), + 'File needs to have at least two columns.') + + if __name__ == '__main__': unittest.main() diff --git a/q2_types/feature_data/tests/test_transformer.py b/q2_types/feature_data/tests/test_transformer.py index 034998d1..89d27c3a 100644 --- a/q2_types/feature_data/tests/test_transformer.py +++ b/q2_types/feature_data/tests/test_transformer.py @@ -5,7 +5,7 @@ # # The full license is in the file LICENSE, distributed with this software. # ---------------------------------------------------------------------------- - +import filecmp import os.path import unittest @@ -25,7 +25,7 @@ AlignedProteinFASTAFormat, RNAFASTAFormat, AlignedRNAFASTAFormat, RNAIterator, AlignedRNAIterator, BLAST6Format, MixedCaseDNAFASTAFormat, MixedCaseRNAFASTAFormat, MixedCaseAlignedDNAFASTAFormat, - MixedCaseAlignedRNAFASTAFormat + MixedCaseAlignedRNAFASTAFormat, SequenceCharacteristicsFormat ) from q2_types.feature_data._transformer import ( _taxonomy_formats_to_dataframe, _dataframe_to_tsv_taxonomy_format, @@ -1507,5 +1507,32 @@ def test_blast6_to_metadata(self): assert_frame_equal(obs.to_dataframe(), exp) +class TestSequenceCharacteristicsTransformer(TestPluginBase): + package = 'q2_types.feature_data.tests' + + def setUp(self): + super().setUp() + self.exp_file = self.get_data_path( + "sequence_characteristics_length.txt") + self.exp_df = pd.DataFrame({'length': [876, 54]}, + index=pd.Index([1, 2], name='id')) + + def test_df_to_sequence_characteristics_format(self): + transformer = self.get_transformer(pd.DataFrame, + SequenceCharacteristicsFormat) + obs = transformer(self.exp_df) + + self.assertIsInstance(obs, SequenceCharacteristicsFormat) + assert filecmp.cmp(self.exp_file, obs.path) + + def test_sequence_characteristics_format_to_df(self): + transformer = self.get_transformer(SequenceCharacteristicsFormat, + pd.DataFrame) + format = SequenceCharacteristicsFormat(self.exp_file, mode="r") + obs = transformer(format) + + assert_frame_equal(self.exp_df, obs) + + if __name__ == '__main__': unittest.main() diff --git a/q2_types/feature_data/tests/test_type.py b/q2_types/feature_data/tests/test_type.py index 74971a7d..a4db0a60 100644 --- a/q2_types/feature_data/tests/test_type.py +++ b/q2_types/feature_data/tests/test_type.py @@ -8,6 +8,9 @@ import unittest +import pandas as pd +from qiime2.core.exceptions import ValidationError + from q2_types.feature_data import ( FeatureData, Taxonomy, Sequence, PairedEndSequence, AlignedSequence, Differential, TSVTaxonomyDirectoryFormat, DNASequencesDirectoryFormat, @@ -17,10 +20,14 @@ AlignedProteinSequence, RNASequence, RNASequencesDirectoryFormat, AlignedRNASequencesDirectoryFormat, AlignedRNASequence, PairedRNASequencesDirectoryFormat, PairedEndRNASequence, - BLAST6, BLAST6DirectoryFormat + BLAST6, BLAST6DirectoryFormat, SequenceCharacteristics, + SequenceCharacteristicsDirectoryFormat ) from qiime2.plugin.testing import TestPluginBase +from q2_types.feature_data._type import \ + validate_sequence_characteristics_length + class TestTypes(TestPluginBase): package = 'q2_types.feature_data.tests' @@ -118,6 +125,51 @@ def test_blast6_semantic_type_to_format_registration(self): self.assertSemanticTypeRegisteredToFormat( FeatureData[BLAST6], BLAST6DirectoryFormat) + def test_sequence_characteristics_semantic_type_registration(self): + self.assertRegisteredSemanticType(SequenceCharacteristics) + + def test_sequence_characteristics_semantic_type_format_registration(self): + self.assertSemanticTypeRegisteredToFormat( + FeatureData[SequenceCharacteristics], + SequenceCharacteristicsDirectoryFormat) + + def test_validate_sequence_characteristics_length(self): + data = self._setup_df() + validate_sequence_characteristics_length(data, None) + + def test_validate_sequence_characteristics_length_no_length_column(self): + data = self._setup_df() + data.drop(columns=['length'], inplace=True) + self._assert_validation_error(data, "Column 'length' has to exist in " + "the file.") + + def test_validate_sequence_characteristics_length_not_numerical(self): + data = self._setup_df() + data.loc[1, 'length'] = 'a' + self._assert_validation_error(data, "Values in column 'length' have " + "to be numerical.") + + def test_validate_sequence_characteristics_length_empty_values(self): + data = self._setup_df() + data.loc[1, 'length'] = None + self._assert_validation_error(data, "Column 'length' cannot contain " + "empty (NaN) values.") + + def test_validate_sequence_characteristics_length_negative_values(self): + data = self._setup_df() + data.loc[1, 'length'] = -1 + self._assert_validation_error(data, "Column 'length' cannot contain " + "negative values.") + + def _setup_df(self): + data_path = self.get_data_path("sequence_characteristics_length.txt") + return pd.read_csv(data_path, sep="\t", index_col=0) + + def _assert_validation_error(self, data, error_message): + with self.assertRaises(ValidationError) as context: + validate_sequence_characteristics_length(data, None) + self.assertEqual(str(context.exception), error_message) + if __name__ == "__main__": unittest.main() From 5d57bb2089f1d5bc761c7e484b49425285390be9 Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Wed, 8 May 2024 16:14:26 +0200 Subject: [PATCH 2/7] added testing files --- q2_types/feature_data/tests/data/empty.txt | 0 .../tests/data/sequence_characteristics_length.txt | 3 +++ .../tests/data/sequence_characteristics_only_index.txt | 3 +++ 3 files changed, 6 insertions(+) create mode 100644 q2_types/feature_data/tests/data/empty.txt create mode 100644 q2_types/feature_data/tests/data/sequence_characteristics_length.txt create mode 100644 q2_types/feature_data/tests/data/sequence_characteristics_only_index.txt diff --git a/q2_types/feature_data/tests/data/empty.txt b/q2_types/feature_data/tests/data/empty.txt new file mode 100644 index 00000000..e69de29b diff --git a/q2_types/feature_data/tests/data/sequence_characteristics_length.txt b/q2_types/feature_data/tests/data/sequence_characteristics_length.txt new file mode 100644 index 00000000..e088b9b8 --- /dev/null +++ b/q2_types/feature_data/tests/data/sequence_characteristics_length.txt @@ -0,0 +1,3 @@ +id length +1 876 +2 54 diff --git a/q2_types/feature_data/tests/data/sequence_characteristics_only_index.txt b/q2_types/feature_data/tests/data/sequence_characteristics_only_index.txt new file mode 100644 index 00000000..60ec7060 --- /dev/null +++ b/q2_types/feature_data/tests/data/sequence_characteristics_only_index.txt @@ -0,0 +1,3 @@ +id +1 +2 \ No newline at end of file From 9efd0556e91f3297bf38ab97469a6c9db2f57aa6 Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Tue, 14 May 2024 11:04:00 +0200 Subject: [PATCH 3/7] added transformer to metadata --- q2_types/feature_data/_format.py | 3 +-- q2_types/feature_data/_transformer.py | 7 +++++++ q2_types/feature_data/_type.py | 2 +- q2_types/feature_data/tests/test_transformer.py | 11 +++++++++++ q2_types/feature_data/tests/test_type.py | 6 +++--- 5 files changed, 23 insertions(+), 6 deletions(-) diff --git a/q2_types/feature_data/_format.py b/q2_types/feature_data/_format.py index 69c18c70..1d465d92 100644 --- a/q2_types/feature_data/_format.py +++ b/q2_types/feature_data/_format.py @@ -479,8 +479,7 @@ class SequenceCharacteristicsFormat(model.TextFileFormat): The file cannot be empty and must have at least two columns. Validation for additional columns can be added with a semantic validator - tied to a property. For example the - "validate_sequence_characteristics_length" validator for + tied to a property. For example the "validate_seq_char_len" validator for "FeatureData[SequenceCharacteristics % Properties("length")]" adds validation for a numerical column called "length". """ diff --git a/q2_types/feature_data/_transformer.py b/q2_types/feature_data/_transformer.py index 33ea679b..f3142284 100644 --- a/q2_types/feature_data/_transformer.py +++ b/q2_types/feature_data/_transformer.py @@ -810,3 +810,10 @@ def _229(data: pd.DataFrame) -> SequenceCharacteristicsFormat: ff = SequenceCharacteristicsFormat() data.to_csv(str(ff), sep='\t') return ff + + +@plugin.register_transformer +def _230(ff: SequenceCharacteristicsFormat) -> qiime2.Metadata: + df = pd.read_csv(str(ff), sep="\t", index_col=0) + df.index = pd.Index(df.index.astype(str), name='id') + return qiime2.Metadata(df) diff --git a/q2_types/feature_data/_type.py b/q2_types/feature_data/_type.py index 6484b3dc..dd60945b 100644 --- a/q2_types/feature_data/_type.py +++ b/q2_types/feature_data/_type.py @@ -61,7 +61,7 @@ @plugin.register_validator(FeatureData[SequenceCharacteristics % Properties("length")]) -def validate_sequence_characteristics_length(data: pd.DataFrame, level): +def validate_seq_char_len(data: pd.DataFrame, level): """ Semantic validator that validates a numerical column called 'length', which cannot contain empty or negative values, for the diff --git a/q2_types/feature_data/tests/test_transformer.py b/q2_types/feature_data/tests/test_transformer.py index 89d27c3a..d91f9464 100644 --- a/q2_types/feature_data/tests/test_transformer.py +++ b/q2_types/feature_data/tests/test_transformer.py @@ -1533,6 +1533,17 @@ def test_sequence_characteristics_format_to_df(self): assert_frame_equal(self.exp_df, obs) + def test_sequence_characteristics_format_to_metadata(self): + transformer = self.get_transformer(SequenceCharacteristicsFormat, + qiime2.Metadata) + format = SequenceCharacteristicsFormat(self.exp_file, mode="r") + obs = transformer(format) + + self.exp_df.index = pd.Index(self.exp_df.index.astype(str)) + self.exp_df['length'] = self.exp_df['length'].astype('float64') + + assert_frame_equal(obs.to_dataframe(), self.exp_df) + if __name__ == '__main__': unittest.main() diff --git a/q2_types/feature_data/tests/test_type.py b/q2_types/feature_data/tests/test_type.py index a4db0a60..40e08073 100644 --- a/q2_types/feature_data/tests/test_type.py +++ b/q2_types/feature_data/tests/test_type.py @@ -26,7 +26,7 @@ from qiime2.plugin.testing import TestPluginBase from q2_types.feature_data._type import \ - validate_sequence_characteristics_length + validate_seq_char_len class TestTypes(TestPluginBase): @@ -135,7 +135,7 @@ def test_sequence_characteristics_semantic_type_format_registration(self): def test_validate_sequence_characteristics_length(self): data = self._setup_df() - validate_sequence_characteristics_length(data, None) + validate_seq_char_len(data, None) def test_validate_sequence_characteristics_length_no_length_column(self): data = self._setup_df() @@ -167,7 +167,7 @@ def _setup_df(self): def _assert_validation_error(self, data, error_message): with self.assertRaises(ValidationError) as context: - validate_sequence_characteristics_length(data, None) + validate_seq_char_len(data, None) self.assertEqual(str(context.exception), error_message) From d37df0ca0460ba2a65b559a2a49dfd307ca02dba Mon Sep 17 00:00:00 2001 From: Liz Gehret <54517601+lizgehret@users.noreply.github.com> Date: Wed, 15 May 2024 09:09:53 -0700 Subject: [PATCH 4/7] Update q2_types/feature_data/_format.py Co-authored-by: Michal Ziemski --- q2_types/feature_data/_format.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/q2_types/feature_data/_format.py b/q2_types/feature_data/_format.py index 1d465d92..1be5bced 100644 --- a/q2_types/feature_data/_format.py +++ b/q2_types/feature_data/_format.py @@ -496,7 +496,7 @@ def validate(self, n_records=None): SequenceCharacteristicsDirectoryFormat = model.SingleFileDirectoryFormat( "SequenceCharacteristicsDirectoryFormat", - "sequence_characteristics.txt", SequenceCharacteristicsFormat + "sequence_characteristics.tsv", SequenceCharacteristicsFormat ) plugin.register_formats( From 9916d1090b7805b9e8fe4e13986ac85e6a265aa1 Mon Sep 17 00:00:00 2001 From: VinzentRisch <100149044+VinzentRisch@users.noreply.github.com> Date: Thu, 16 May 2024 10:09:55 +0200 Subject: [PATCH 5/7] Update q2_types/feature_data/_transformer.py Co-authored-by: Liz Gehret <54517601+lizgehret@users.noreply.github.com> --- q2_types/feature_data/_transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/q2_types/feature_data/_transformer.py b/q2_types/feature_data/_transformer.py index f3142284..6cf852b3 100644 --- a/q2_types/feature_data/_transformer.py +++ b/q2_types/feature_data/_transformer.py @@ -802,7 +802,7 @@ def _227(ff: BLAST6Format) -> qiime2.Metadata: @plugin.register_transformer def _228(ff: SequenceCharacteristicsFormat) -> pd.DataFrame: - return pd.read_csv(str(ff), sep="\t", index_col=0) + return pd.read_csv(str(ff), sep='\t', index_col=0) @plugin.register_transformer From 6eea912bed7b3a31097456be6aeb50e7467acbfa Mon Sep 17 00:00:00 2001 From: VinzentRisch <100149044+VinzentRisch@users.noreply.github.com> Date: Thu, 16 May 2024 10:11:22 +0200 Subject: [PATCH 6/7] Apply suggestions from code review Co-authored-by: Liz Gehret <54517601+lizgehret@users.noreply.github.com> --- q2_types/feature_data/_transformer.py | 2 +- q2_types/feature_data/_type.py | 2 +- q2_types/feature_data/tests/test_format.py | 10 +++++----- q2_types/feature_data/tests/test_transformer.py | 2 +- q2_types/feature_data/tests/test_type.py | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/q2_types/feature_data/_transformer.py b/q2_types/feature_data/_transformer.py index 6cf852b3..387a2428 100644 --- a/q2_types/feature_data/_transformer.py +++ b/q2_types/feature_data/_transformer.py @@ -814,6 +814,6 @@ def _229(data: pd.DataFrame) -> SequenceCharacteristicsFormat: @plugin.register_transformer def _230(ff: SequenceCharacteristicsFormat) -> qiime2.Metadata: - df = pd.read_csv(str(ff), sep="\t", index_col=0) + df = pd.read_csv(str(ff), sep='\t', index_col=0) df.index = pd.Index(df.index.astype(str), name='id') return qiime2.Metadata(df) diff --git a/q2_types/feature_data/_type.py b/q2_types/feature_data/_type.py index dd60945b..ab0379f4 100644 --- a/q2_types/feature_data/_type.py +++ b/q2_types/feature_data/_type.py @@ -153,5 +153,5 @@ def validate_seq_char_len(data: pd.DataFrame, level): plugin.register_artifact_class( FeatureData[SequenceCharacteristics], directory_format=SequenceCharacteristicsDirectoryFormat, - description=("Characteristics of sequences (e.g., the length of a genes " + description=("Characteristics of sequences (e.g., the length of a gene " "in basepairs).")) diff --git a/q2_types/feature_data/tests/test_format.py b/q2_types/feature_data/tests/test_format.py index 6a356171..f61073aa 100644 --- a/q2_types/feature_data/tests/test_format.py +++ b/q2_types/feature_data/tests/test_format.py @@ -907,27 +907,27 @@ class TestSequenceCharacteristicsFormat(TestPluginBase): package = 'q2_types.feature_data.tests' def test_sequence_characteristics_directory_format(self): - filepath = self.get_data_path('sequence_characteristics_length.txt') + filepath = self.get_data_path('sequence_characteristics_length.tsv') temp_dir = self.temp_dir.name shutil.copy(filepath, os.path.join(temp_dir, - 'sequence_characteristics.txt')) + 'sequence_characteristics.tsv')) format = SequenceCharacteristicsDirectoryFormat(temp_dir, mode='r') format.validate() def test_sequence_characteristics_format(self): - filepath = self.get_data_path('sequence_characteristics_length.txt') + filepath = self.get_data_path('sequence_characteristics_length.tsv') format = SequenceCharacteristicsFormat(filepath, mode='r') format.validate() def test_sequence_characteristics_format_empty(self): - path = self.get_data_path('empty.txt') + path = self.get_data_path('empty.tsv') format = SequenceCharacteristicsFormat(path, mode='r') with self.assertRaises(ValidationError) as context: format.validate() self.assertEqual(str(context.exception), 'File cannot be empty.') def test_sequence_characteristics_format_only_index(self): - path = self.get_data_path('sequence_characteristics_only_index.txt') + path = self.get_data_path('sequence_characteristics_only_index.tsv') format = SequenceCharacteristicsFormat(path, mode='r') with self.assertRaises(ValidationError) as context: format.validate() diff --git a/q2_types/feature_data/tests/test_transformer.py b/q2_types/feature_data/tests/test_transformer.py index d91f9464..48b60f7d 100644 --- a/q2_types/feature_data/tests/test_transformer.py +++ b/q2_types/feature_data/tests/test_transformer.py @@ -1513,7 +1513,7 @@ class TestSequenceCharacteristicsTransformer(TestPluginBase): def setUp(self): super().setUp() self.exp_file = self.get_data_path( - "sequence_characteristics_length.txt") + "sequence_characteristics_length.tsv") self.exp_df = pd.DataFrame({'length': [876, 54]}, index=pd.Index([1, 2], name='id')) diff --git a/q2_types/feature_data/tests/test_type.py b/q2_types/feature_data/tests/test_type.py index 40e08073..11693f7a 100644 --- a/q2_types/feature_data/tests/test_type.py +++ b/q2_types/feature_data/tests/test_type.py @@ -162,7 +162,7 @@ def test_validate_sequence_characteristics_length_negative_values(self): "negative values.") def _setup_df(self): - data_path = self.get_data_path("sequence_characteristics_length.txt") + data_path = self.get_data_path("sequence_characteristics_length.tsv") return pd.read_csv(data_path, sep="\t", index_col=0) def _assert_validation_error(self, data, error_message): From 19e305348401f549e44e400620b7f5e39b722536 Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Thu, 16 May 2024 10:27:00 +0200 Subject: [PATCH 7/7] changed names to tsv, cleanup --- q2_types/feature_data/_format.py | 2 +- q2_types/feature_data/_type.py | 14 +++++++------- .../tests/data/{empty.txt => empty.tsv} | 0 ...txt => sequence_characteristics_length.tsv} | 0 ...=> sequence_characteristics_only_index.tsv} | 0 .../feature_data/tests/test_transformer.py | 6 +++--- q2_types/feature_data/tests/test_type.py | 18 +++++++++--------- 7 files changed, 20 insertions(+), 20 deletions(-) rename q2_types/feature_data/tests/data/{empty.txt => empty.tsv} (100%) rename q2_types/feature_data/tests/data/{sequence_characteristics_length.txt => sequence_characteristics_length.tsv} (100%) rename q2_types/feature_data/tests/data/{sequence_characteristics_only_index.txt => sequence_characteristics_only_index.tsv} (100%) diff --git a/q2_types/feature_data/_format.py b/q2_types/feature_data/_format.py index 1be5bced..aefb9e3f 100644 --- a/q2_types/feature_data/_format.py +++ b/q2_types/feature_data/_format.py @@ -486,7 +486,7 @@ class SequenceCharacteristicsFormat(model.TextFileFormat): def validate(self, n_records=None): try: - data = pd.read_csv(str(self), sep="\t", index_col=0) + data = pd.read_csv(str(self), sep='\t', index_col=0) except pd.errors.EmptyDataError: raise ValidationError('File cannot be empty.') diff --git a/q2_types/feature_data/_type.py b/q2_types/feature_data/_type.py index ab0379f4..7d65baa3 100644 --- a/q2_types/feature_data/_type.py +++ b/q2_types/feature_data/_type.py @@ -68,19 +68,19 @@ def validate_seq_char_len(data: pd.DataFrame, level): FeatureData[SequenceCharacteristics] type with property "length". """ if 'length' not in data.columns: - raise ValidationError("Column 'length' has to exist in the file.") + raise ValidationError('Column "length" has to exist in the file.') if data['length'].isnull().any(): - raise ValidationError("Column 'length' cannot contain empty (NaN) " - "values.") + raise ValidationError('Column "length" cannot contain empty (NaN) ' + 'values.') if not pd.api.types.is_numeric_dtype(data['length']): - raise ValidationError("Values in column 'length' have to be " - "numerical.") + raise ValidationError('Values in column "length" have to be ' + 'numerical.') if not (data['length'] > 0).all(): - raise ValidationError("Column 'length' cannot contain negative " - "values.") + raise ValidationError('Column "length" cannot contain negative ' + 'values.') plugin.register_semantic_types(FeatureData, Taxonomy, Sequence, diff --git a/q2_types/feature_data/tests/data/empty.txt b/q2_types/feature_data/tests/data/empty.tsv similarity index 100% rename from q2_types/feature_data/tests/data/empty.txt rename to q2_types/feature_data/tests/data/empty.tsv diff --git a/q2_types/feature_data/tests/data/sequence_characteristics_length.txt b/q2_types/feature_data/tests/data/sequence_characteristics_length.tsv similarity index 100% rename from q2_types/feature_data/tests/data/sequence_characteristics_length.txt rename to q2_types/feature_data/tests/data/sequence_characteristics_length.tsv diff --git a/q2_types/feature_data/tests/data/sequence_characteristics_only_index.txt b/q2_types/feature_data/tests/data/sequence_characteristics_only_index.tsv similarity index 100% rename from q2_types/feature_data/tests/data/sequence_characteristics_only_index.txt rename to q2_types/feature_data/tests/data/sequence_characteristics_only_index.tsv diff --git a/q2_types/feature_data/tests/test_transformer.py b/q2_types/feature_data/tests/test_transformer.py index 48b60f7d..3b19b48c 100644 --- a/q2_types/feature_data/tests/test_transformer.py +++ b/q2_types/feature_data/tests/test_transformer.py @@ -1513,7 +1513,7 @@ class TestSequenceCharacteristicsTransformer(TestPluginBase): def setUp(self): super().setUp() self.exp_file = self.get_data_path( - "sequence_characteristics_length.tsv") + 'sequence_characteristics_length.tsv') self.exp_df = pd.DataFrame({'length': [876, 54]}, index=pd.Index([1, 2], name='id')) @@ -1528,7 +1528,7 @@ def test_df_to_sequence_characteristics_format(self): def test_sequence_characteristics_format_to_df(self): transformer = self.get_transformer(SequenceCharacteristicsFormat, pd.DataFrame) - format = SequenceCharacteristicsFormat(self.exp_file, mode="r") + format = SequenceCharacteristicsFormat(self.exp_file, mode='r') obs = transformer(format) assert_frame_equal(self.exp_df, obs) @@ -1536,7 +1536,7 @@ def test_sequence_characteristics_format_to_df(self): def test_sequence_characteristics_format_to_metadata(self): transformer = self.get_transformer(SequenceCharacteristicsFormat, qiime2.Metadata) - format = SequenceCharacteristicsFormat(self.exp_file, mode="r") + format = SequenceCharacteristicsFormat(self.exp_file, mode='r') obs = transformer(format) self.exp_df.index = pd.Index(self.exp_df.index.astype(str)) diff --git a/q2_types/feature_data/tests/test_type.py b/q2_types/feature_data/tests/test_type.py index 11693f7a..6af42d30 100644 --- a/q2_types/feature_data/tests/test_type.py +++ b/q2_types/feature_data/tests/test_type.py @@ -140,29 +140,29 @@ def test_validate_sequence_characteristics_length(self): def test_validate_sequence_characteristics_length_no_length_column(self): data = self._setup_df() data.drop(columns=['length'], inplace=True) - self._assert_validation_error(data, "Column 'length' has to exist in " - "the file.") + self._assert_validation_error(data, 'Column "length" has to exist in ' + 'the file.') def test_validate_sequence_characteristics_length_not_numerical(self): data = self._setup_df() data.loc[1, 'length'] = 'a' - self._assert_validation_error(data, "Values in column 'length' have " - "to be numerical.") + self._assert_validation_error(data, 'Values in column "length" have ' + 'to be numerical.') def test_validate_sequence_characteristics_length_empty_values(self): data = self._setup_df() data.loc[1, 'length'] = None - self._assert_validation_error(data, "Column 'length' cannot contain " - "empty (NaN) values.") + self._assert_validation_error(data, 'Column "length" cannot contain ' + 'empty (NaN) values.') def test_validate_sequence_characteristics_length_negative_values(self): data = self._setup_df() data.loc[1, 'length'] = -1 - self._assert_validation_error(data, "Column 'length' cannot contain " - "negative values.") + self._assert_validation_error(data, 'Column "length" cannot contain ' + 'negative values.') def _setup_df(self): - data_path = self.get_data_path("sequence_characteristics_length.tsv") + data_path = self.get_data_path('sequence_characteristics_length.tsv') return pd.read_csv(data_path, sep="\t", index_col=0) def _assert_validation_error(self, data, error_message):