From 0a3825412fb8c45a3075ce1943c1ac6770e3ebe9 Mon Sep 17 00:00:00 2001 From: "Dbhasin@1" Date: Fri, 2 Jul 2021 18:56:10 +0530 Subject: [PATCH 1/9] functionality to accept compressed files as input to predict --- allennlp/commands/predict.py | 32 ++++++++++++++++++++++++++------ allennlp/common/file_utils.py | 22 ++++++++++++++-------- 2 files changed, 40 insertions(+), 14 deletions(-) diff --git a/allennlp/commands/predict.py b/allennlp/commands/predict.py index bdf0a0b7635..e14d2df2f0e 100644 --- a/allennlp/commands/predict.py +++ b/allennlp/commands/predict.py @@ -13,7 +13,7 @@ from allennlp.commands.subcommand import Subcommand from allennlp.common import logging as common_logging from allennlp.common.checks import check_for_gpu, ConfigurationError -from allennlp.common.file_utils import cached_path +from allennlp.common.file_utils import cached_path, open_compressed from allennlp.common.util import lazy_groups_of from allennlp.data.dataset_readers import MultiTaskDatasetReader from allennlp.models.archival import load_archive @@ -73,6 +73,14 @@ def add_subparser(self, parser: argparse._SubParsersAction) -> argparse.Argument "flag is set.", ) + subparser.add_argument( + "--compression-type", + type=str, + choices=["gz", "bz2", "lzma"], + default=None, + help="Indicates the compressed format of the input file.", + ) + subparser.add_argument( "--multitask-head", type=str, @@ -152,6 +160,7 @@ def __init__( batch_size: int, print_to_console: bool, has_dataset_reader: bool, + compression_type: str = None, multitask_head: Optional[str] = None, ) -> None: self._predictor = predictor @@ -160,7 +169,7 @@ def __init__( self._batch_size = batch_size self._print_to_console = print_to_console self._dataset_reader = None if not has_dataset_reader else predictor._dataset_reader - + self.compression_type = compression_type self._multitask_head = multitask_head if self._multitask_head is not None: if self._dataset_reader is None: @@ -212,10 +221,21 @@ def _get_json_data(self) -> Iterator[JsonDict]: yield self._predictor.load_line(line) else: input_file = cached_path(self._input_file) - with open(input_file, "r") as file_input: - for line in file_input: - if not line.isspace(): - yield self._predictor.load_line(line) + try: + with open_compressed(input_file) as file_input: + for line in file_input: + if not line.isspace(): + yield self._predictor.load_line(line) + except OSError: + if self.compression_type: + with open_compressed(input_file, self.compression_type) as file_input: + for line in file_input: + if not line.isspace(): + yield self._predictor.load_line(line) + else: + print( + "Automatic detection of compression type failed, please specify the compression type argument" + ) def _get_instance_data(self) -> Iterator[Instance]: if self._input_file == "-": diff --git a/allennlp/common/file_utils.py b/allennlp/common/file_utils.py index 0acd91b2257..dffd7fcb45d 100644 --- a/allennlp/common/file_utils.py +++ b/allennlp/common/file_utils.py @@ -1085,20 +1085,26 @@ def get_file_extension(path: str, dot=True, lower: bool = True): def open_compressed( - filename: Union[str, PathLike], mode: str = "rt", encoding: Optional[str] = "UTF-8", **kwargs + filename: Union[str, PathLike], + compression_type: str = None, + mode: str = "rt", + encoding: Optional[str] = "UTF-8", + **kwargs, ): if not isinstance(filename, str): filename = str(filename) open_fn: Callable = open - if filename.endswith(".gz"): - import gzip - - open_fn = gzip.open - elif filename.endswith(".bz2"): - import bz2 + compression_modules = {"gz": "gzip", "bz2": "bz2", "lzma": "lzma"} + if not compression_type: + for extension in compression_modules: + if filename.endswith(extension): + module = __import__(compression_modules[extension]) + open_fn = module.open + else: + module = __import__(compression_modules[extension]) + open_fn = module.open - open_fn = bz2.open return open_fn(cached_path(filename), mode=mode, encoding=encoding, **kwargs) From 316b15e0f39861a22b384ee9d98dbb7112f209b6 Mon Sep 17 00:00:00 2001 From: "Dbhasin@1" Date: Fri, 2 Jul 2021 19:25:48 +0530 Subject: [PATCH 2/9] test for lzma format included --- tests/common/file_utils_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/common/file_utils_test.py b/tests/common/file_utils_test.py index cf773842dab..b525d4ddf78 100644 --- a/tests/common/file_utils_test.py +++ b/tests/common/file_utils_test.py @@ -347,7 +347,7 @@ def test_open_compressed(self): with open_compressed(uncompressed_file) as f: uncompressed_lines = [line.strip() for line in f] - for suffix in ["bz2", "gz"]: + for suffix in ["bz2", "gz", "lzma"]: compressed_file = f"{uncompressed_file}.{suffix}" with open_compressed(compressed_file) as f: compressed_lines = [line.strip() for line in f] From 47349d4a7b7c70a48a2dfc7a0b8c21681b3517b2 Mon Sep 17 00:00:00 2001 From: "Dbhasin@1" Date: Fri, 2 Jul 2021 19:29:48 +0530 Subject: [PATCH 3/9] minor logical error --- allennlp/common/file_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/allennlp/common/file_utils.py b/allennlp/common/file_utils.py index dffd7fcb45d..791f47af905 100644 --- a/allennlp/common/file_utils.py +++ b/allennlp/common/file_utils.py @@ -1101,6 +1101,7 @@ def open_compressed( if filename.endswith(extension): module = __import__(compression_modules[extension]) open_fn = module.open + break else: module = __import__(compression_modules[extension]) open_fn = module.open From 0ac8f67cd1d33609e941481cb96e487f018d9d78 Mon Sep 17 00:00:00 2001 From: "Dbhasin@1" Date: Fri, 9 Jul 2021 03:54:09 +0530 Subject: [PATCH 4/9] suggested changes incorporated --- allennlp/commands/predict.py | 24 ++++++++++++++---------- allennlp/common/file_utils.py | 12 ++++++------ 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/allennlp/commands/predict.py b/allennlp/commands/predict.py index e14d2df2f0e..565bc5cfa0d 100644 --- a/allennlp/commands/predict.py +++ b/allennlp/commands/predict.py @@ -221,22 +221,26 @@ def _get_json_data(self) -> Iterator[JsonDict]: yield self._predictor.load_line(line) else: input_file = cached_path(self._input_file) - try: - with open_compressed(input_file) as file_input: - for line in file_input: - if not line.isspace(): - yield self._predictor.load_line(line) - except OSError: - if self.compression_type: - with open_compressed(input_file, self.compression_type) as file_input: + if self.compression_type is None: + try: + with open_compressed(input_file) as file_input: for line in file_input: if not line.isspace(): yield self._predictor.load_line(line) - else: + except OSError: print( - "Automatic detection of compression type failed, please specify the compression type argument" + "Automatic detection failed, please specify the compression type argument." ) + else: + try: + with open_compressed(input_file, compression_type=self.compression_type) as file_input: + for line in file_input: + if not line.isspace(): + yield self._predictor.load_line(line) + except OSError: + print("please specify the correct compression type argument.") + def _get_instance_data(self) -> Iterator[Instance]: if self._input_file == "-": raise ConfigurationError("stdin is not an option when using a DatasetReader.") diff --git a/allennlp/common/file_utils.py b/allennlp/common/file_utils.py index 791f47af905..64d9e845f4b 100644 --- a/allennlp/common/file_utils.py +++ b/allennlp/common/file_utils.py @@ -1086,9 +1086,9 @@ def get_file_extension(path: str, dot=True, lower: bool = True): def open_compressed( filename: Union[str, PathLike], - compression_type: str = None, mode: str = "rt", encoding: Optional[str] = "UTF-8", + compression_type: Optional[str] = None, **kwargs, ): if not isinstance(filename, str): @@ -1096,15 +1096,15 @@ def open_compressed( open_fn: Callable = open compression_modules = {"gz": "gzip", "bz2": "bz2", "lzma": "lzma"} - if not compression_type: + if compression_type in compression_modules: + module = __import__(compression_modules[compression_type]) + open_fn = module.open + else: for extension in compression_modules: if filename.endswith(extension): module = __import__(compression_modules[extension]) open_fn = module.open - break - else: - module = __import__(compression_modules[extension]) - open_fn = module.open + break return open_fn(cached_path(filename), mode=mode, encoding=encoding, **kwargs) From 217387f518eedc0972cba68ef0e7c6959a6d94e8 Mon Sep 17 00:00:00 2001 From: Dirk Groeneveld Date: Wed, 23 Feb 2022 10:56:15 -0800 Subject: [PATCH 5/9] Compression is always auto-detected --- allennlp/commands/predict.py | 33 ++++----------------------------- 1 file changed, 4 insertions(+), 29 deletions(-) diff --git a/allennlp/commands/predict.py b/allennlp/commands/predict.py index 97fd788f46c..26ee819ff63 100644 --- a/allennlp/commands/predict.py +++ b/allennlp/commands/predict.py @@ -71,14 +71,6 @@ def add_subparser(self, parser: argparse._SubParsersAction) -> argparse.Argument "flag is set.", ) - subparser.add_argument( - "--compression-type", - type=str, - choices=["gz", "bz2", "lzma"], - default=None, - help="Indicates the compressed format of the input file.", - ) - subparser.add_argument( "--multitask-head", type=str, @@ -158,7 +150,6 @@ def __init__( batch_size: int, print_to_console: bool, has_dataset_reader: bool, - compression_type: str = None, multitask_head: Optional[str] = None, ) -> None: self._predictor = predictor @@ -167,7 +158,6 @@ def __init__( self._batch_size = batch_size self._print_to_console = print_to_console self._dataset_reader = None if not has_dataset_reader else predictor._dataset_reader - self.compression_type = compression_type self._multitask_head = multitask_head if self._multitask_head is not None: if self._dataset_reader is None: @@ -219,25 +209,10 @@ def _get_json_data(self) -> Iterator[JsonDict]: yield self._predictor.load_line(line) else: input_file = cached_path(self._input_file) - if self.compression_type is None: - try: - with open_compressed(input_file) as file_input: - for line in file_input: - if not line.isspace(): - yield self._predictor.load_line(line) - except OSError: - print( - "Automatic detection failed, please specify the compression type argument." - ) - - else: - try: - with open_compressed(input_file, compression_type=self.compression_type) as file_input: - for line in file_input: - if not line.isspace(): - yield self._predictor.load_line(line) - except OSError: - print("please specify the correct compression type argument.") + with open_compressed(input_file) as file_input: + for line in file_input: + if not line.isspace(): + yield self._predictor.load_line(line) def _get_instance_data(self) -> Iterator[Instance]: if self._input_file == "-": From b13fd60675f7ad06160849de44634cada037dc01 Mon Sep 17 00:00:00 2001 From: Dirk Groeneveld Date: Wed, 23 Feb 2022 10:56:31 -0800 Subject: [PATCH 6/9] Import the open_compressed() function from Tango --- allennlp/common/file_utils.py | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/allennlp/common/file_utils.py b/allennlp/common/file_utils.py index 052ad24c62b..a541f500f43 100644 --- a/allennlp/common/file_utils.py +++ b/allennlp/common/file_utils.py @@ -1,6 +1,9 @@ """ Utilities for working with the local dataset cache. """ +import bz2 +import gzip +import lzma import weakref from contextlib import contextmanager import glob @@ -443,27 +446,30 @@ def get_file_extension(path: str, dot=True, lower: bool = True): return ext.lower() if lower else ext +_SUFFIXES: Dict[Callable, str] = { + open: "", + gzip.open: ".gz", + bz2.open: ".bz2", + lzma.open: ".xz", +} + + def open_compressed( filename: Union[str, PathLike], mode: str = "rt", encoding: Optional[str] = "UTF-8", - compression_type: Optional[str] = None, **kwargs, ): if not isinstance(filename, str): filename = str(filename) - open_fn: Callable = open - compression_modules = {"gz": "gzip", "bz2": "bz2", "lzma": "lzma"} - if compression_type in compression_modules: - module = __import__(compression_modules[compression_type]) - open_fn = module.open + open_fn: Callable + filename = str(filename) + for open_fn, suffix in _SUFFIXES.items(): + if len(suffix) > 0 and filename.endswith(suffix): + break else: - for extension in compression_modules: - if filename.endswith(extension): - module = __import__(compression_modules[extension]) - open_fn = module.open - break + open_fn = open return open_fn(cached_path(filename), mode=mode, encoding=encoding, **kwargs) From b18dfec82bcc7a9c62b666e6d9fb507b61662b9b Mon Sep 17 00:00:00 2001 From: Dirk Groeneveld Date: Wed, 23 Feb 2022 11:01:46 -0800 Subject: [PATCH 7/9] Changelog --- CHANGELOG.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index e6d2e047951..40c92f38b25 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Running the test suite out-of-tree (e.g. after installation) is now possible by pointing the environment variable `ALLENNLP_SRC_DIR` to the sources. - Silenced a warning that happens when you inappropriately clone a tensor. +### Added + +- We can now transparently read compressed input files during prediction. +- LZMA compression is now supported. + + ## [v2.9.0](https://github.com/allenai/allennlp/releases/tag/v2.9.0) - 2022-01-27 ### Added From bc3c750669d801be5688dd4aaf12778685df6bd5 Mon Sep 17 00:00:00 2001 From: Dirk Groeneveld Date: Wed, 23 Feb 2022 16:13:23 -0800 Subject: [PATCH 8/9] The canonical extension for lzma is xz --- allennlp/modules/token_embedders/embedding.py | 2 +- ...gs.5d.txt.lzma => fake_embeddings.5d.txt.xz} | Bin 316 -> 360 bytes tests/common/file_utils_test.py | 12 ++++++------ tests/data/vocabulary_test.py | 2 +- tests/modules/token_embedders/embedding_test.py | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) rename test_fixtures/embeddings/{fake_embeddings.5d.txt.lzma => fake_embeddings.5d.txt.xz} (53%) diff --git a/allennlp/modules/token_embedders/embedding.py b/allennlp/modules/token_embedders/embedding.py index 3c16526b47d..a841ccdd793 100644 --- a/allennlp/modules/token_embedders/embedding.py +++ b/allennlp/modules/token_embedders/embedding.py @@ -550,7 +550,7 @@ def __init__( import bz2 package = bz2 - elif extension == ".lzma": + elif extension == ".xz": import lzma package = lzma diff --git a/test_fixtures/embeddings/fake_embeddings.5d.txt.lzma b/test_fixtures/embeddings/fake_embeddings.5d.txt.xz similarity index 53% rename from test_fixtures/embeddings/fake_embeddings.5d.txt.lzma rename to test_fixtures/embeddings/fake_embeddings.5d.txt.xz index f8370841c3212f765e45e527a72d8087afdef1a7..6d4fb0df44c3fc18c9ec842fe0a13516c0befa3e 100644 GIT binary patch delta 77 zcmV-T0J8tQ0_XxB{Wp48S^xk9=GL@E0stWa761SMbT8$j-~tT+C|$7)B>^S%pvTw% jmTgpkx&)WW00G1TjsySz%d%rRvBYQl0ssI200dcD{mB{c delta 33 jcmaFCw1XJB>=Fh56j2a% diff --git a/tests/common/file_utils_test.py b/tests/common/file_utils_test.py index de8a9c4f75d..a567825c63c 100644 --- a/tests/common/file_utils_test.py +++ b/tests/common/file_utils_test.py @@ -221,16 +221,16 @@ def test_extract_with_external_symlink(self): with pytest.raises(ValueError): cached_path(dangerous_file, extract_archive=True) - def test_open_compressed(self): + @pytest.mark.parametrize("suffix", ["bz2", "gz", "xz"]) + def test_open_compressed(self, suffix: str): uncompressed_file = self.FIXTURES_ROOT / "embeddings/fake_embeddings.5d.txt" with open_compressed(uncompressed_file) as f: uncompressed_lines = [line.strip() for line in f] - for suffix in ["bz2", "gz", "lzma"]: - compressed_file = f"{uncompressed_file}.{suffix}" - with open_compressed(compressed_file) as f: - compressed_lines = [line.strip() for line in f] - assert compressed_lines == uncompressed_lines + compressed_file = f"{uncompressed_file}.{suffix}" + with open_compressed(compressed_file) as f: + compressed_lines = [line.strip() for line in f] + assert compressed_lines == uncompressed_lines def test_meta_backwards_compatible(self): url = "http://fake.datastore.com/glove.txt.gz" diff --git a/tests/data/vocabulary_test.py b/tests/data/vocabulary_test.py index b586867202e..bcac9078b68 100644 --- a/tests/data/vocabulary_test.py +++ b/tests/data/vocabulary_test.py @@ -677,7 +677,7 @@ def test_read_pretrained_words(self): # Reading from a single (compressed) file or a single-file archive base_path = str(self.FIXTURES_ROOT / "embeddings/fake_embeddings.5d.txt") - for ext in ["", ".gz", ".lzma", ".bz2", ".zip", ".tar.gz"]: + for ext in ["", ".gz", ".xz", ".bz2", ".zip", ".tar.gz"]: file_path = base_path + ext words_read = set(_read_pretrained_tokens(file_path)) assert words_read == words, ( diff --git a/tests/modules/token_embedders/embedding_test.py b/tests/modules/token_embedders/embedding_test.py index fac0ff32a0d..c217edf5f6b 100644 --- a/tests/modules/token_embedders/embedding_test.py +++ b/tests/modules/token_embedders/embedding_test.py @@ -164,7 +164,7 @@ def test_embeddings_text_file(self): assert text == correct_text, "Test failed for file: " + path # Check for a file contained inside an archive with multiple files - for ext in [".zip", ".tar.gz", ".tar.bz2", ".tar.lzma"]: + for ext in [".zip", ".tar.gz", ".tar.bz2", ".tar.xz"]: archive_path = str(self.FIXTURES_ROOT / "utf-8_sample/archives/utf-8") + ext file_uri = format_embeddings_file_uri(archive_path, "folder/utf-8_sample.txt") with EmbeddingsTextFile(file_uri) as f: From 68098b49e4ccc822a1fb924f54712612f8aa1dc9 Mon Sep 17 00:00:00 2001 From: Dirk Groeneveld Date: Wed, 23 Feb 2022 16:39:34 -0800 Subject: [PATCH 9/9] Updated file name --- .../archives/{utf-8.tar.lzma => utf-8.tar.xz} | Bin 1 file changed, 0 insertions(+), 0 deletions(-) rename test_fixtures/utf-8_sample/archives/{utf-8.tar.lzma => utf-8.tar.xz} (100%) diff --git a/test_fixtures/utf-8_sample/archives/utf-8.tar.lzma b/test_fixtures/utf-8_sample/archives/utf-8.tar.xz similarity index 100% rename from test_fixtures/utf-8_sample/archives/utf-8.tar.lzma rename to test_fixtures/utf-8_sample/archives/utf-8.tar.xz