From 01814207e4fcaace14c338bd7907c3e765861363 Mon Sep 17 00:00:00 2001 From: Sergii Dymchenko Date: Fri, 9 Dec 2022 16:37:46 -0800 Subject: [PATCH] Fix 'overwite' typo in parameter name --- torchtext/transforms.py | 6 +++--- torchtext/utils.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/torchtext/transforms.py b/torchtext/transforms.py index e34ec17bf..4684d5808 100644 --- a/torchtext/transforms.py +++ b/torchtext/transforms.py @@ -830,7 +830,7 @@ def __init__( if never_split is None: never_split = [] self.bert_model = BERTEncoderPyBind( - get_asset_local_path(vocab_path, overwite=True), do_lower_case, strip_accents, never_split + get_asset_local_path(vocab_path, overwrite=True), do_lower_case, strip_accents, never_split ) self._return_tokens = return_tokens self._vocab_path = vocab_path @@ -929,7 +929,7 @@ class RegexTokenizer(Module): Caveats - The RE2 library does not support arbitrary lookahead or lookbehind assertions, nor does it support backreferences. Look at the `docs `_ here for more info. - - The final tokenization step always uses spaces as seperators. To split strings based on a specific regex pattern, similar to Python's `re.split `_, a tuple of ``('', ' ')`` can be provided. + - The final tokenization step always uses spaces as separators. To split strings based on a specific regex pattern, similar to Python's `re.split `_, a tuple of ``('', ' ')`` can be provided. Example Regex tokenization based on ``(patterns, replacements)`` list. @@ -998,7 +998,7 @@ def bytes_to_unicode(): The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. - This is a signficant percentage of your normal, say, 32K bpe vocab. + This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup tables between utf-8 bytes and unicode strings. And avoids mapping to whitespace/control characters the bpe code barfs on. """ diff --git a/torchtext/utils.py b/torchtext/utils.py index 9ed1379b1..fb5dde7c3 100644 --- a/torchtext/utils.py +++ b/torchtext/utils.py @@ -207,8 +207,8 @@ def _log_class_usage(klass): torch._C._log_api_usage_once(identifier) -def get_asset_local_path(asset_path: str, overwite=False) -> str: - """Get local path for assets. Download if path does not exost locally +def get_asset_local_path(asset_path: str, overwrite=False) -> str: + """Get local path for assets. Download if path does not exist locally Args: asset_path: Local path to asset or remote URL overwrite: Indicate whether to overwrite the file when downloading from URL (default: False) @@ -224,5 +224,5 @@ def get_asset_local_path(asset_path: str, overwite=False) -> str: if os.path.exists(asset_path): local_path = asset_path else: - local_path = download_from_url(url=asset_path, root=_CACHE_DIR, overwrite=overwite) + local_path = download_from_url(url=asset_path, root=_CACHE_DIR, overwrite=overwrite) return local_path