Skip to content

Commit

Permalink
Update preprocess_commonvoice.py
Browse files (browse the repository at this point in the history)
  • Loading branch information
JinZr committed Mar 13, 2024
1 parent a39aa8a commit 09a358a
Showing 1 changed file with 17 additions and 2 deletions.
19 changes: 17 additions & 2 deletions egs/commonvoice/ASR/local/preprocess_commonvoice.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -56,15 +56,30 @@ def normalize_text(utt: str, language: str) -> str:
# Mozilla Common Voice uses both "yue" and "zh-HK" for Cantonese
# Not sure why they decided to do this...
return (
utt.replace(" ", "")
.replace(",", "")
utt.replace(",", "")
.replace("。", " ")
.replace("?", "")
.replace("!", "")
.replace("?", "")
.replace("!", "")
.replace("‘", "")
.replace("、", "")
.replace(",", "")
.replace(".", "")
.replace(":", "")
.replace(";", "")
.replace("「", "")
.replace("」", "")
.replace("“", "")
.replace("”", "")
.replace("\\", "")
.replace("~", "")
.replace("—", "")
.replace("ㄧ", "")
.replace("《", "")
.replace("》", "")
.replace("…", "")
.replace("⋯", "")
.upper()
)
else:
Expand Down

0 comments on commit 09a358a

Please sign in to comment.