This repository has been archived by the owner on Dec 16, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 2.3k
Functionality to accept compressed files as input to predict when using a Predictor #5299
Closed
Closed
Changes from all commits
Commits
Show all changes
9 commits
Select commit
Hold shift + click to select a range
0a38254
functionality to accept compressed files as input to predict
Dbhasin1 316b15e
test for lzma format included
Dbhasin1 47349d4
minor logical error
Dbhasin1 79d7e96
Merge branch 'main' into zipped-file-handler
dirkgr 0ac8f67
suggested changes incorporated
Dbhasin1 85bddc7
Merge branch 'main' into zipped-file-handler
dirkgr ef821b9
Merge branch 'main' into zipped-file-handler
dirkgr 026e42a
Merge branch 'main' into zipped-file-handler
dirkgr 23c50c7
Merge branch 'main' into zipped-file-handler
dirkgr File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -12,7 +12,7 @@ | |
from allennlp.commands.subcommand import Subcommand | ||
from allennlp.common import logging as common_logging | ||
from allennlp.common.checks import check_for_gpu, ConfigurationError | ||
from allennlp.common.file_utils import cached_path | ||
from allennlp.common.file_utils import cached_path, open_compressed | ||
from allennlp.common.util import lazy_groups_of | ||
from allennlp.data.dataset_readers import MultiTaskDatasetReader | ||
from allennlp.models.archival import load_archive | ||
|
@@ -71,6 +71,14 @@ def add_subparser(self, parser: argparse._SubParsersAction) -> argparse.Argument | |
"flag is set.", | ||
) | ||
|
||
subparser.add_argument( | ||
"--compression-type", | ||
type=str, | ||
choices=["gz", "bz2", "lzma"], | ||
default=None, | ||
help="Indicates the compressed format of the input file.", | ||
) | ||
|
||
subparser.add_argument( | ||
"--multitask-head", | ||
type=str, | ||
|
@@ -150,6 +158,7 @@ def __init__( | |
batch_size: int, | ||
print_to_console: bool, | ||
has_dataset_reader: bool, | ||
compression_type: str = None, | ||
multitask_head: Optional[str] = None, | ||
) -> None: | ||
self._predictor = predictor | ||
|
@@ -158,7 +167,7 @@ def __init__( | |
self._batch_size = batch_size | ||
self._print_to_console = print_to_console | ||
self._dataset_reader = None if not has_dataset_reader else predictor._dataset_reader | ||
|
||
self.compression_type = compression_type | ||
self._multitask_head = multitask_head | ||
if self._multitask_head is not None: | ||
if self._dataset_reader is None: | ||
|
@@ -210,10 +219,25 @@ def _get_json_data(self) -> Iterator[JsonDict]: | |
yield self._predictor.load_line(line) | ||
else: | ||
input_file = cached_path(self._input_file) | ||
with open(input_file, "r") as file_input: | ||
for line in file_input: | ||
if not line.isspace(): | ||
yield self._predictor.load_line(line) | ||
if self.compression_type is None: | ||
try: | ||
with open_compressed(input_file) as file_input: | ||
for line in file_input: | ||
if not line.isspace(): | ||
yield self._predictor.load_line(line) | ||
except OSError: | ||
print( | ||
"Automatic detection failed, please specify the compression type argument." | ||
) | ||
|
||
else: | ||
try: | ||
with open_compressed(input_file, compression_type=self.compression_type) as file_input: | ||
for line in file_input: | ||
if not line.isspace(): | ||
yield self._predictor.load_line(line) | ||
except OSError: | ||
print("please specify the correct compression type argument.") | ||
Comment on lines
+239
to
+240
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why |
||
|
||
def _get_instance_data(self) -> Iterator[Instance]: | ||
if self._input_file == "-": | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -444,20 +444,27 @@ def get_file_extension(path: str, dot=True, lower: bool = True): | |
|
||
|
||
def open_compressed( | ||
filename: Union[str, PathLike], mode: str = "rt", encoding: Optional[str] = "UTF-8", **kwargs | ||
filename: Union[str, PathLike], | ||
mode: str = "rt", | ||
encoding: Optional[str] = "UTF-8", | ||
compression_type: Optional[str] = None, | ||
**kwargs, | ||
): | ||
if not isinstance(filename, str): | ||
filename = str(filename) | ||
open_fn: Callable = open | ||
|
||
if filename.endswith(".gz"): | ||
import gzip | ||
|
||
open_fn = gzip.open | ||
elif filename.endswith(".bz2"): | ||
import bz2 | ||
compression_modules = {"gz": "gzip", "bz2": "bz2", "lzma": "lzma"} | ||
if compression_type in compression_modules: | ||
module = __import__(compression_modules[compression_type]) | ||
open_fn = module.open | ||
else: | ||
for extension in compression_modules: | ||
if filename.endswith(extension): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you use |
||
module = __import__(compression_modules[extension]) | ||
open_fn = module.open | ||
break | ||
|
||
open_fn = bz2.open | ||
return open_fn(cached_path(filename), mode=mode, encoding=encoding, **kwargs) | ||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The type should be `Optional[str]`.