Skip to content

Commit

Permalink
Fix 'overwite' typo in parameter name (#2006)
Browse files Browse the repository at this point in the history
  • Loading branch information
kit1980 authored Dec 13, 2022
1 parent 651a033 commit 7c7b640
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 6 deletions.
6 changes: 3 additions & 3 deletions torchtext/transforms.py
Original file line number Diff line number Diff line change
Expand Up @@ -830,7 +830,7 @@ def __init__(
if never_split is None:
never_split = []
self.bert_model = BERTEncoderPyBind(
- get_asset_local_path(vocab_path, overwite=True), do_lower_case, strip_accents, never_split
+ get_asset_local_path(vocab_path, overwrite=True), do_lower_case, strip_accents, never_split
)
self._return_tokens = return_tokens
self._vocab_path = vocab_path
Expand Down Expand Up @@ -929,7 +929,7 @@ class RegexTokenizer(Module):
Caveats
- The RE2 library does not support arbitrary lookahead or lookbehind assertions, nor does it support backreferences. Look at the `docs <https://swtch.com/~rsc/regexp/regexp3.html#caveats>`_ here for more info.
- - The final tokenization step always uses spaces as seperators. To split strings based on a specific regex pattern, similar to Python's `re.split <https://docs.python.org/3/library/re.html#re.split>`_, a tuple of ``('<regex_pattern>', ' ')`` can be provided.
+ - The final tokenization step always uses spaces as separators. To split strings based on a specific regex pattern, similar to Python's `re.split <https://docs.python.org/3/library/re.html#re.split>`_, a tuple of ``('<regex_pattern>', ' ')`` can be provided.
Example
Regex tokenization based on ``(patterns, replacements)`` list.
Expand Down Expand Up @@ -998,7 +998,7 @@ def bytes_to_unicode():
The reversible bpe codes work on unicode strings.
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
- This is a signficant percentage of your normal, say, 32K bpe vocab.
+ This is a significant percentage of your normal, say, 32K bpe vocab.
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
And avoids mapping to whitespace/control characters the bpe code barfs on.
"""
Expand Down
6 changes: 3 additions & 3 deletions torchtext/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,8 +207,8 @@ def _log_class_usage(klass):
torch._C._log_api_usage_once(identifier)


- def get_asset_local_path(asset_path: str, overwite=False) -> str:
- """Get local path for assets. Download if path does not exost locally
+ def get_asset_local_path(asset_path: str, overwrite=False) -> str:
+ """Get local path for assets. Download if path does not exist locally
Args:
asset_path: Local path to asset or remote URL
overwrite: Indicate whether to overwrite the file when downloading from URL (default: False)
Expand All @@ -224,5 +224,5 @@ def get_asset_local_path(asset_path: str, overwite=False) -> str:
if os.path.exists(asset_path):
local_path = asset_path
else:
- local_path = download_from_url(url=asset_path, root=_CACHE_DIR, overwrite=overwite)
+ local_path = download_from_url(url=asset_path, root=_CACHE_DIR, overwrite=overwrite)
return local_path

0 comments on commit 7c7b640

Please sign in to comment.