Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fixes for spellmapper #6994

Merged
merged 2 commits into from
Jul 9, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@
print("Size of customization vocabulary:", len(custom_phrases))

# Load n-gram mappings vocabulary
ngram_mapping_vocab, ban_ngram = load_ngram_mappings(args.ngram_mappings, max_misspelled_freq=125000)
ngram_mapping_vocab, ban_ngram = load_ngram_mappings(args.ngram_mappings, max_misspelled_freq=args.max_misspelled_freq)

# Generate index of custom phrases
phrases, ngram2phrases = get_index(
Expand Down
2 changes: 1 addition & 1 deletion examples/nlp/spellchecking_asr_customization/run_infer.sh
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ BIG_SAMPLE=spellmapper_asr_customization_en/big_sample.txt
## File with input nemo ASR manifest
INPUT_MANIFEST=spellmapper_en_evaluation/medical_manifest_ctc.json
## File containing custom words and phrases (plain text)
CUSTOM_VOCAB=spellmapper_en_evaluation/medical_custom_vocab.json
CUSTOM_VOCAB=spellmapper_en_evaluation/medical_custom_vocab.txt

## Other files will be created
## File with index of custom vocabulary
Expand Down
84 changes: 84 additions & 0 deletions nemo/collections/nlp/data/spellchecking_asr_customization/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -764,12 +764,30 @@ def check_banned_replacements(src: str, dst: str) -> bool:
# anticipated => anticipate
if src.endswith("ed") and dst.endswith("e") and src[0:-2] == dst[0:-1]:
return True
# blocks => blocked
if src.endswith("s") and dst.endswith("ed") and src[0:-1] == dst[0:-2]:
return True
# blocked => blocks
if src.endswith("ed") and dst.endswith("s") and src[0:-2] == dst[0:-1]:
return True
# lives => lived
if src.endswith("es") and dst.endswith("ed") and src[0:-2] == dst[0:-2]:
return True
# lived => lives
if src.endswith("ed") and dst.endswith("es") and src[0:-2] == dst[0:-2]:
return True
# regarded => regard
if src.endswith("ed") and src[0:-2] == dst:
return True
# regard => regarded
if dst.endswith("ed") and dst[0:-2] == src:
return True
# regardeding => regard
if src.endswith("ing") and src[0:-3] == dst:
return True
# regard => regarding
if dst.endswith("ing") and dst[0:-3] == src:
return True
# longer => long
if src.endswith("er") and src[0:-2] == dst:
return True
Expand All @@ -782,48 +800,102 @@ def check_banned_replacements(src: str, dst: str) -> bool:
# discussing => discussed
if src.endswith("ing") and dst.endswith("ed") and src[0:-3] == dst[0:-2]:
return True
# live => living
if src.endswith("e") and dst.endswith("ing") and src[0:-1] == dst[0:-3]:
return True
# living => live
if src.endswith("ing") and dst.endswith("e") and src[0:-3] == dst[0:-1]:
return True
# discussion => discussing
if src.endswith("ion") and dst.endswith("ing") and src[0:-3] == dst[0:-3]:
return True
# discussing => discussion
if src.endswith("ing") and dst.endswith("ion") and src[0:-3] == dst[0:-3]:
return True
# alignment => aligning
if src.endswith("ment") and dst.endswith("ing") and src[0:-4] == dst[0:-3]:
return True
# aligning => alignment
if src.endswith("ing") and dst.endswith("ment") and src[0:-3] == dst[0:-4]:
return True
# dispensers => dispensing
if src.endswith("ers") and dst.endswith("ing") and src[0:-3] == dst[0:-3]:
return True
# dispensing => dispensers
if src.endswith("ing") and dst.endswith("ers") and src[0:-3] == dst[0:-3]:
return True
# integrate => integrity
if src.endswith("ate") and dst.endswith("ity") and src[0:-3] == dst[0:-3]:
return True
# integrity => integrate
if src.endswith("ity") and dst.endswith("ate") and src[0:-3] == dst[0:-3]:
return True
# discussion => discussed
if src.endswith("ion") and dst.endswith("ed") and src[0:-3] == dst[0:-2]:
return True
# discussed => discussion
if src.endswith("ed") and dst.endswith("ion") and src[0:-2] == dst[0:-3]:
return True
# anticipation => anticipate
if src.endswith("ion") and dst.endswith("e") and src[0:-3] == dst[0:-1]:
return True
# anticipate => anticipation
if src.endswith("e") and dst.endswith("ion") and src[0:-1] == dst[0:-3]:
return True
# incremental => increment
if src.endswith("ntal") and dst.endswith("nt") and src[0:-4] == dst[0:-2]:
return True
# increment => incremental
if src.endswith("nt") and dst.endswith("ntal") and src[0:-2] == dst[0:-4]:
return True
# national => nation
if src.endswith("nal") and dst.endswith("n") and src[0:-3] == dst[0:-1]:
return True
# nation => national
if src.endswith("n") and dst.endswith("nal") and src[0:-1] == dst[0:-3]:
return True
# significantly => significant
if src.endswith("ntly") and dst.endswith("nt") and src[0:-4] == dst[0:-2]:
return True
# significant => significantly
if src.endswith("nt") and dst.endswith("ntly") and src[0:-2] == dst[0:-4]:
return True
# delivery => deliverer
if src.endswith("ery") and dst.endswith("erer") and src[0:-3] == dst[0:-4]:
return True
# deliverer => delivery
if src.endswith("erer") and dst.endswith("ery") and src[0:-4] == dst[0:-3]:
return True
# deliver => deliverer
if src.endswith("er") and dst.endswith("erer") and src[0:-2] == dst[0:-4]:
return True
# deliverer => deliver
if src.endswith("erer") and dst.endswith("er") and src[0:-4] == dst[0:-2]:
return True
# comparably => comparable
if src.endswith("bly") and dst.endswith("ble") and src[0:-3] == dst[0:-3]:
return True
# comparable => comparably
if src.endswith("ble") and dst.endswith("bly") and src[0:-3] == dst[0:-3]:
return True
# comparably => comparability
if src.endswith("bly") and dst.endswith("bility") and src[0:-3] == dst[0:-6]:
return True
# comparability => comparably
if src.endswith("bility") and dst.endswith("bly") and src[0:-6] == dst[0:-3]:
return True
# beautiful => beautifully
if src.endswith("l") and dst.endswith("lly") and src[0:-1] == dst[0:-3]:
return True
# beautifully => beautiful
if src.endswith("lly") and dst.endswith("l") and src[0:-3] == dst[0:-1]:
return True
# active => actively
if src.endswith("e") and dst.endswith("ely") and src[0:-1] == dst[0:-3]:
return True
# actively => active
if src.endswith("ely") and dst.endswith("e") and src[0:-3] == dst[0:-1]:
return True
# america => american
if src.endswith("a") and dst.endswith("an") and src[0:-1] == dst[0:-2]:
return True
Expand All @@ -836,6 +908,18 @@ def check_banned_replacements(src: str, dst: str) -> bool:
# investing => reinvesting
if dst.startswith("re") and dst[2:] == src:
return True
# unchanged => changed
if src.startswith("un") and src[2:] == dst:
return True
# changed => unchanged
if dst.startswith("un") and dst[2:] == src:
return True
# disrespected => respected
if src.startswith("dis") and src[3:] == dst:
return True
# respected => disrespected
if dst.startswith("dis") and dst[3:] == src:
return True
# outperformance => performance
if src.startswith("out") and src[3:] == dst:
return True
Expand Down