From 0a3825412fb8c45a3075ce1943c1ac6770e3ebe9 Mon Sep 17 00:00:00 2001
From: "Dbhasin@1" <drishti_b@me.iitr.ac.in>
Date: Fri, 2 Jul 2021 18:56:10 +0530
Subject: [PATCH 1/9] Add functionality to accept compressed files as input to
 predict

---
 allennlp/commands/predict.py  | 32 ++++++++++++++++++++++++++------
 allennlp/common/file_utils.py | 22 ++++++++++++++--------
 2 files changed, 40 insertions(+), 14 deletions(-)

diff --git a/allennlp/commands/predict.py b/allennlp/commands/predict.py
index bdf0a0b7635..e14d2df2f0e 100644
--- a/allennlp/commands/predict.py
+++ b/allennlp/commands/predict.py
@@ -13,7 +13,7 @@
 from allennlp.commands.subcommand import Subcommand
 from allennlp.common import logging as common_logging
 from allennlp.common.checks import check_for_gpu, ConfigurationError
-from allennlp.common.file_utils import cached_path
+from allennlp.common.file_utils import cached_path, open_compressed
 from allennlp.common.util import lazy_groups_of
 from allennlp.data.dataset_readers import MultiTaskDatasetReader
 from allennlp.models.archival import load_archive
@@ -73,6 +73,14 @@ def add_subparser(self, parser: argparse._SubParsersAction) -> argparse.Argument
             "flag is set.",
         )
 
+        subparser.add_argument(
+            "--compression-type",
+            type=str,
+            choices=["gz", "bz2", "lzma"],
+            default=None,
+            help="Indicates the compressed format of the input file.",
+        )
+
         subparser.add_argument(
             "--multitask-head",
             type=str,
@@ -152,6 +160,7 @@ def __init__(
         batch_size: int,
         print_to_console: bool,
         has_dataset_reader: bool,
+        compression_type: str = None,
         multitask_head: Optional[str] = None,
     ) -> None:
         self._predictor = predictor
@@ -160,7 +169,7 @@ def __init__(
         self._batch_size = batch_size
         self._print_to_console = print_to_console
         self._dataset_reader = None if not has_dataset_reader else predictor._dataset_reader
-
+        self.compression_type = compression_type
         self._multitask_head = multitask_head
         if self._multitask_head is not None:
             if self._dataset_reader is None:
@@ -212,10 +221,21 @@ def _get_json_data(self) -> Iterator[JsonDict]:
                     yield self._predictor.load_line(line)
         else:
             input_file = cached_path(self._input_file)
-            with open(input_file, "r") as file_input:
-                for line in file_input:
-                    if not line.isspace():
-                        yield self._predictor.load_line(line)
+            try:
+                with open_compressed(input_file) as file_input:
+                    for line in file_input:
+                        if not line.isspace():
+                            yield self._predictor.load_line(line)
+            except OSError:
+                if self.compression_type:
+                    with open_compressed(input_file, self.compression_type) as file_input:
+                        for line in file_input:
+                            if not line.isspace():
+                                yield self._predictor.load_line(line)
+                else:
+                    print(
+                        "Automatic detection of compression type failed, please specify the compression type argument"
+                    )
 
     def _get_instance_data(self) -> Iterator[Instance]:
         if self._input_file == "-":
diff --git a/allennlp/common/file_utils.py b/allennlp/common/file_utils.py
index 0acd91b2257..dffd7fcb45d 100644
--- a/allennlp/common/file_utils.py
+++ b/allennlp/common/file_utils.py
@@ -1085,20 +1085,26 @@ def get_file_extension(path: str, dot=True, lower: bool = True):
 
 
 def open_compressed(
-    filename: Union[str, PathLike], mode: str = "rt", encoding: Optional[str] = "UTF-8", **kwargs
+    filename: Union[str, PathLike],
+    compression_type: str = None,
+    mode: str = "rt",
+    encoding: Optional[str] = "UTF-8",
+    **kwargs,
 ):
     if not isinstance(filename, str):
         filename = str(filename)
     open_fn: Callable = open
 
-    if filename.endswith(".gz"):
-        import gzip
-
-        open_fn = gzip.open
-    elif filename.endswith(".bz2"):
-        import bz2
+    compression_modules = {"gz": "gzip", "bz2": "bz2", "lzma": "lzma"}
+    if not compression_type:
+        for extension in compression_modules:
+            if filename.endswith(extension):
+                module = __import__(compression_modules[extension])
+                open_fn = module.open
+    else:
+        module = __import__(compression_modules[extension])
+        open_fn = module.open
 
-        open_fn = bz2.open
     return open_fn(cached_path(filename), mode=mode, encoding=encoding, **kwargs)
 
 

From 316b15e0f39861a22b384ee9d98dbb7112f209b6 Mon Sep 17 00:00:00 2001
From: "Dbhasin@1" <drishti_b@me.iitr.ac.in>
Date: Fri, 2 Jul 2021 19:25:48 +0530
Subject: [PATCH 2/9] test for lzma format included

---
 tests/common/file_utils_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/common/file_utils_test.py b/tests/common/file_utils_test.py
index cf773842dab..b525d4ddf78 100644
--- a/tests/common/file_utils_test.py
+++ b/tests/common/file_utils_test.py
@@ -347,7 +347,7 @@ def test_open_compressed(self):
         with open_compressed(uncompressed_file) as f:
             uncompressed_lines = [line.strip() for line in f]
 
-        for suffix in ["bz2", "gz"]:
+        for suffix in ["bz2", "gz", "lzma"]:
             compressed_file = f"{uncompressed_file}.{suffix}"
             with open_compressed(compressed_file) as f:
                 compressed_lines = [line.strip() for line in f]

From 47349d4a7b7c70a48a2dfc7a0b8c21681b3517b2 Mon Sep 17 00:00:00 2001
From: "Dbhasin@1" <drishti_b@me.iitr.ac.in>
Date: Fri, 2 Jul 2021 19:29:48 +0530
Subject: [PATCH 3/9] Fix minor logical error: break after first matching extension

---
 allennlp/common/file_utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/allennlp/common/file_utils.py b/allennlp/common/file_utils.py
index dffd7fcb45d..791f47af905 100644
--- a/allennlp/common/file_utils.py
+++ b/allennlp/common/file_utils.py
@@ -1101,6 +1101,7 @@ def open_compressed(
             if filename.endswith(extension):
                 module = __import__(compression_modules[extension])
                 open_fn = module.open
+                break 
     else:
         module = __import__(compression_modules[extension])
         open_fn = module.open

From 0ac8f67cd1d33609e941481cb96e487f018d9d78 Mon Sep 17 00:00:00 2001
From: "Dbhasin@1" <drishti_b@me.iitr.ac.in>
Date: Fri, 9 Jul 2021 03:54:09 +0530
Subject: [PATCH 4/9] Incorporate review suggestions for compression_type handling

---
 allennlp/commands/predict.py  | 24 ++++++++++++++----------
 allennlp/common/file_utils.py | 12 ++++++------
 2 files changed, 20 insertions(+), 16 deletions(-)

diff --git a/allennlp/commands/predict.py b/allennlp/commands/predict.py
index e14d2df2f0e..565bc5cfa0d 100644
--- a/allennlp/commands/predict.py
+++ b/allennlp/commands/predict.py
@@ -221,22 +221,26 @@ def _get_json_data(self) -> Iterator[JsonDict]:
                     yield self._predictor.load_line(line)
         else:
             input_file = cached_path(self._input_file)
-            try:
-                with open_compressed(input_file) as file_input:
-                    for line in file_input:
-                        if not line.isspace():
-                            yield self._predictor.load_line(line)
-            except OSError:
-                if self.compression_type:
-                    with open_compressed(input_file, self.compression_type) as file_input:
+            if self.compression_type is None:
+                try:
+                    with open_compressed(input_file) as file_input:
                         for line in file_input:
                             if not line.isspace():
                                 yield self._predictor.load_line(line)
-                else:
+                except OSError:
                     print(
-                        "Automatic detection of compression type failed, please specify the compression type argument"
+                        "Automatic detection failed, please specify the compression type argument."
                     )
 
+            else:
+                try:
+                    with open_compressed(input_file, compression_type=self.compression_type) as file_input:
+                        for line in file_input:
+                            if not line.isspace():
+                                yield self._predictor.load_line(line)
+                except OSError:
+                    print("please specify the correct compression type argument.")
+
     def _get_instance_data(self) -> Iterator[Instance]:
         if self._input_file == "-":
             raise ConfigurationError("stdin is not an option when using a DatasetReader.")
diff --git a/allennlp/common/file_utils.py b/allennlp/common/file_utils.py
index 791f47af905..64d9e845f4b 100644
--- a/allennlp/common/file_utils.py
+++ b/allennlp/common/file_utils.py
@@ -1086,9 +1086,9 @@ def get_file_extension(path: str, dot=True, lower: bool = True):
 
 def open_compressed(
     filename: Union[str, PathLike],
-    compression_type: str = None,
     mode: str = "rt",
     encoding: Optional[str] = "UTF-8",
+    compression_type: Optional[str] = None,
     **kwargs,
 ):
     if not isinstance(filename, str):
@@ -1096,15 +1096,15 @@ def open_compressed(
     open_fn: Callable = open
 
     compression_modules = {"gz": "gzip", "bz2": "bz2", "lzma": "lzma"}
-    if not compression_type:
+    if compression_type in compression_modules:
+        module = __import__(compression_modules[compression_type])
+        open_fn = module.open
+    else:
         for extension in compression_modules:
             if filename.endswith(extension):
                 module = __import__(compression_modules[extension])
                 open_fn = module.open
-                break 
-    else:
-        module = __import__(compression_modules[extension])
-        open_fn = module.open
+                break
 
     return open_fn(cached_path(filename), mode=mode, encoding=encoding, **kwargs)
 

From 217387f518eedc0972cba68ef0e7c6959a6d94e8 Mon Sep 17 00:00:00 2001
From: Dirk Groeneveld <dirkg@allenai.org>
Date: Wed, 23 Feb 2022 10:56:15 -0800
Subject: [PATCH 5/9] Compression is always auto-detected

---
 allennlp/commands/predict.py | 33 ++++-----------------------------
 1 file changed, 4 insertions(+), 29 deletions(-)

diff --git a/allennlp/commands/predict.py b/allennlp/commands/predict.py
index 97fd788f46c..26ee819ff63 100644
--- a/allennlp/commands/predict.py
+++ b/allennlp/commands/predict.py
@@ -71,14 +71,6 @@ def add_subparser(self, parser: argparse._SubParsersAction) -> argparse.Argument
             "flag is set.",
         )
 
-        subparser.add_argument(
-            "--compression-type",
-            type=str,
-            choices=["gz", "bz2", "lzma"],
-            default=None,
-            help="Indicates the compressed format of the input file.",
-        )
-
         subparser.add_argument(
             "--multitask-head",
             type=str,
@@ -158,7 +150,6 @@ def __init__(
         batch_size: int,
         print_to_console: bool,
         has_dataset_reader: bool,
-        compression_type: str = None,
         multitask_head: Optional[str] = None,
     ) -> None:
         self._predictor = predictor
@@ -167,7 +158,6 @@ def __init__(
         self._batch_size = batch_size
         self._print_to_console = print_to_console
         self._dataset_reader = None if not has_dataset_reader else predictor._dataset_reader
-        self.compression_type = compression_type
         self._multitask_head = multitask_head
         if self._multitask_head is not None:
             if self._dataset_reader is None:
@@ -219,25 +209,10 @@ def _get_json_data(self) -> Iterator[JsonDict]:
                     yield self._predictor.load_line(line)
         else:
             input_file = cached_path(self._input_file)
-            if self.compression_type is None:
-                try:
-                    with open_compressed(input_file) as file_input:
-                        for line in file_input:
-                            if not line.isspace():
-                                yield self._predictor.load_line(line)
-                except OSError:
-                    print(
-                        "Automatic detection failed, please specify the compression type argument."
-                    )
-
-            else:
-                try:
-                    with open_compressed(input_file, compression_type=self.compression_type) as file_input:
-                        for line in file_input:
-                            if not line.isspace():
-                                yield self._predictor.load_line(line)
-                except OSError:
-                    print("please specify the correct compression type argument.")
+            with open_compressed(input_file) as file_input:
+                for line in file_input:
+                    if not line.isspace():
+                        yield self._predictor.load_line(line)
 
     def _get_instance_data(self) -> Iterator[Instance]:
         if self._input_file == "-":

From b13fd60675f7ad06160849de44634cada037dc01 Mon Sep 17 00:00:00 2001
From: Dirk Groeneveld <dirkg@allenai.org>
Date: Wed, 23 Feb 2022 10:56:31 -0800
Subject: [PATCH 6/9] Import the open_compressed() function from Tango

---
 allennlp/common/file_utils.py | 28 +++++++++++++++++-----------
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/allennlp/common/file_utils.py b/allennlp/common/file_utils.py
index 052ad24c62b..a541f500f43 100644
--- a/allennlp/common/file_utils.py
+++ b/allennlp/common/file_utils.py
@@ -1,6 +1,9 @@
 """
 Utilities for working with the local dataset cache.
 """
+import bz2
+import gzip
+import lzma
 import weakref
 from contextlib import contextmanager
 import glob
@@ -443,27 +446,30 @@ def get_file_extension(path: str, dot=True, lower: bool = True):
     return ext.lower() if lower else ext
 
 
+_SUFFIXES: Dict[Callable, str] = {
+    open: "",
+    gzip.open: ".gz",
+    bz2.open: ".bz2",
+    lzma.open: ".xz",
+}
+
+
 def open_compressed(
     filename: Union[str, PathLike],
     mode: str = "rt",
     encoding: Optional[str] = "UTF-8",
-    compression_type: Optional[str] = None,
     **kwargs,
 ):
     if not isinstance(filename, str):
         filename = str(filename)
-    open_fn: Callable = open
 
-    compression_modules = {"gz": "gzip", "bz2": "bz2", "lzma": "lzma"}
-    if compression_type in compression_modules:
-        module = __import__(compression_modules[compression_type])
-        open_fn = module.open
+    open_fn: Callable
+    filename = str(filename)
+    for open_fn, suffix in _SUFFIXES.items():
+        if len(suffix) > 0 and filename.endswith(suffix):
+            break
     else:
-        for extension in compression_modules:
-            if filename.endswith(extension):
-                module = __import__(compression_modules[extension])
-                open_fn = module.open
-                break
+        open_fn = open
 
     return open_fn(cached_path(filename), mode=mode, encoding=encoding, **kwargs)
 

From b18dfec82bcc7a9c62b666e6d9fb507b61662b9b Mon Sep 17 00:00:00 2001
From: Dirk Groeneveld <dirkg@allenai.org>
Date: Wed, 23 Feb 2022 11:01:46 -0800
Subject: [PATCH 7/9] Changelog

---
 CHANGELOG.md | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e6d2e047951..40c92f38b25 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -13,6 +13,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Running the test suite out-of-tree (e.g. after installation) is now possible by pointing the environment variable `ALLENNLP_SRC_DIR` to the sources.
 - Silenced a warning that happens when you inappropriately clone a tensor.
 
+### Added
+
+- We can now transparently read compressed input files during prediction.
+- LZMA compression is now supported.
+
+
 ## [v2.9.0](https://github.com/allenai/allennlp/releases/tag/v2.9.0) - 2022-01-27
 
 ### Added

From bc3c750669d801be5688dd4aaf12778685df6bd5 Mon Sep 17 00:00:00 2001
From: Dirk Groeneveld <dirkg@allenai.org>
Date: Wed, 23 Feb 2022 16:13:23 -0800
Subject: [PATCH 8/9] The canonical extension for lzma is xz

---
 allennlp/modules/token_embedders/embedding.py   |   2 +-
 ...gs.5d.txt.lzma => fake_embeddings.5d.txt.xz} | Bin 316 -> 360 bytes
 tests/common/file_utils_test.py                 |  12 ++++++------
 tests/data/vocabulary_test.py                   |   2 +-
 tests/modules/token_embedders/embedding_test.py |   2 +-
 5 files changed, 9 insertions(+), 9 deletions(-)
 rename test_fixtures/embeddings/{fake_embeddings.5d.txt.lzma => fake_embeddings.5d.txt.xz} (53%)

diff --git a/allennlp/modules/token_embedders/embedding.py b/allennlp/modules/token_embedders/embedding.py
index 3c16526b47d..a841ccdd793 100644
--- a/allennlp/modules/token_embedders/embedding.py
+++ b/allennlp/modules/token_embedders/embedding.py
@@ -550,7 +550,7 @@ def __init__(
                 import bz2
 
                 package = bz2
-            elif extension == ".lzma":
+            elif extension == ".xz":
                 import lzma
 
                 package = lzma
diff --git a/test_fixtures/embeddings/fake_embeddings.5d.txt.lzma b/test_fixtures/embeddings/fake_embeddings.5d.txt.xz
similarity index 53%
rename from test_fixtures/embeddings/fake_embeddings.5d.txt.lzma
rename to test_fixtures/embeddings/fake_embeddings.5d.txt.xz
index f8370841c3212f765e45e527a72d8087afdef1a7..6d4fb0df44c3fc18c9ec842fe0a13516c0befa3e 100644
GIT binary patch
delta 77
zcmV-T0J8tQ0_XxB{Wp48S^xk9=GL@E0stWa761SMbT8$j-~tT+C|$7)B>^S%pvTw%
jmTgpkx&)WW00G1TjsySz%d%rRvBYQl0ssI200dcD{mB{c

delta 33
jcmaFCw1<f|mVu#x;Xf2?lv8Eo`Yxu^u>XJB>=Fh56j2a%

diff --git a/tests/common/file_utils_test.py b/tests/common/file_utils_test.py
index de8a9c4f75d..a567825c63c 100644
--- a/tests/common/file_utils_test.py
+++ b/tests/common/file_utils_test.py
@@ -221,16 +221,16 @@ def test_extract_with_external_symlink(self):
         with pytest.raises(ValueError):
             cached_path(dangerous_file, extract_archive=True)
 
-    def test_open_compressed(self):
+    @pytest.mark.parametrize("suffix", ["bz2", "gz", "xz"])
+    def test_open_compressed(self, suffix: str):
         uncompressed_file = self.FIXTURES_ROOT / "embeddings/fake_embeddings.5d.txt"
         with open_compressed(uncompressed_file) as f:
             uncompressed_lines = [line.strip() for line in f]
 
-        for suffix in ["bz2", "gz", "lzma"]:
-            compressed_file = f"{uncompressed_file}.{suffix}"
-            with open_compressed(compressed_file) as f:
-                compressed_lines = [line.strip() for line in f]
-            assert compressed_lines == uncompressed_lines
+        compressed_file = f"{uncompressed_file}.{suffix}"
+        with open_compressed(compressed_file) as f:
+            compressed_lines = [line.strip() for line in f]
+        assert compressed_lines == uncompressed_lines
 
     def test_meta_backwards_compatible(self):
         url = "http://fake.datastore.com/glove.txt.gz"
diff --git a/tests/data/vocabulary_test.py b/tests/data/vocabulary_test.py
index b586867202e..bcac9078b68 100644
--- a/tests/data/vocabulary_test.py
+++ b/tests/data/vocabulary_test.py
@@ -677,7 +677,7 @@ def test_read_pretrained_words(self):
 
         # Reading from a single (compressed) file or a single-file archive
         base_path = str(self.FIXTURES_ROOT / "embeddings/fake_embeddings.5d.txt")
-        for ext in ["", ".gz", ".lzma", ".bz2", ".zip", ".tar.gz"]:
+        for ext in ["", ".gz", ".xz", ".bz2", ".zip", ".tar.gz"]:
             file_path = base_path + ext
             words_read = set(_read_pretrained_tokens(file_path))
             assert words_read == words, (
diff --git a/tests/modules/token_embedders/embedding_test.py b/tests/modules/token_embedders/embedding_test.py
index fac0ff32a0d..c217edf5f6b 100644
--- a/tests/modules/token_embedders/embedding_test.py
+++ b/tests/modules/token_embedders/embedding_test.py
@@ -164,7 +164,7 @@ def test_embeddings_text_file(self):
             assert text == correct_text, "Test failed for file: " + path
 
         # Check for a file contained inside an archive with multiple files
-        for ext in [".zip", ".tar.gz", ".tar.bz2", ".tar.lzma"]:
+        for ext in [".zip", ".tar.gz", ".tar.bz2", ".tar.xz"]:
             archive_path = str(self.FIXTURES_ROOT / "utf-8_sample/archives/utf-8") + ext
             file_uri = format_embeddings_file_uri(archive_path, "folder/utf-8_sample.txt")
             with EmbeddingsTextFile(file_uri) as f:

From 68098b49e4ccc822a1fb924f54712612f8aa1dc9 Mon Sep 17 00:00:00 2001
From: Dirk Groeneveld <dirkg@allenai.org>
Date: Wed, 23 Feb 2022 16:39:34 -0800
Subject: [PATCH 9/9] Rename utf-8.tar.lzma test fixture to utf-8.tar.xz

---
 .../archives/{utf-8.tar.lzma => utf-8.tar.xz}       | Bin
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename test_fixtures/utf-8_sample/archives/{utf-8.tar.lzma => utf-8.tar.xz} (100%)

diff --git a/test_fixtures/utf-8_sample/archives/utf-8.tar.lzma b/test_fixtures/utf-8_sample/archives/utf-8.tar.xz
similarity index 100%
rename from test_fixtures/utf-8_sample/archives/utf-8.tar.lzma
rename to test_fixtures/utf-8_sample/archives/utf-8.tar.xz