Skip to content

Commit

Permalink
scripts updated
Browse files Browse the repository at this point in the history
  • Loading branch information
JinZr committed Mar 13, 2024
1 parent 750e2ac commit a39aa8a
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 4 deletions.
5 changes: 4 additions & 1 deletion egs/commonvoice/ASR/local/preprocess_commonvoice.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,14 +52,17 @@ def normalize_text(utt: str, language: str) -> str:
return re.sub(r"[^A-ZÀÂÆÇÉÈÊËÎÏÔŒÙÛÜ' ]", "", utt).upper()
elif language == "pl":
return re.sub(r"[^a-ząćęłńóśźżA-ZĄĆĘŁŃÓŚŹŻ' ]", "", utt).upper()
elif language == "yue":
elif language in ["yue", "zh-HK"]:
# Mozilla Common Voice uses both "yue" and "zh-HK" for Cantonese
# (it is not clear why two codes exist), so both are normalized the same way.
return (
utt.replace(" ", "")
.replace(",", "")
.replace("。", " ")
.replace("?", "")
.replace("!", "")
.replace("?", "")
.replace("!", "")
.replace("‘", "")
.replace("、", "")
.upper()
Expand Down
6 changes: 3 additions & 3 deletions egs/commonvoice/ASR/prepare.sh
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then
fi

if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then
if [ $lang == "yue" ] || [ $lang == "zh_TW" ] || [ $lang == "zh_CN" ] || [ $lang == "zh_HK" ]; then
if [ $lang == "yue" ] || [ $lang == "zh-TW" ] || [ $lang == "zh-CN" ] || [ $lang == "zh-HK" ]; then
log "Stage 9: Prepare Char based lang"
lang_dir=data/${lang}/lang_char/
mkdir -p $lang_dir
Expand All @@ -190,7 +190,7 @@ if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then
gunzip -c data/${lang}/manifests/cv-${lang}_supervisions_train.jsonl.gz \
| jq '.text' | sed 's/"//g' > $lang_dir/text

if [ $lang == "yue" ]; then
if [ $lang == "yue" ] || [ $lang == "zh-HK" ]; then
# Get words.txt and words_no_ids.txt
./local/word_segment_yue.py \
--input-file $lang_dir/text \
Expand Down Expand Up @@ -299,7 +299,7 @@ if [ $stage -le 10 ] && [ $stop_stage -ge 10 ]; then
# We assume you have installed kaldilm; if not, please install
# it using: pip install kaldilm

if [ $lang == "yue" ] || [ $lang == "zh_TW" ] || [ $lang == "zh_CN" ] || [ $lang == "zh_HK" ]; then
if [ $lang == "yue" ] || [ $lang == "zh-TW" ] || [ $lang == "zh-CN" ] || [ $lang == "zh-HK" ]; then
lang_dir=data/${lang}/lang_char
mkdir -p $lang_dir/lm

Expand Down

0 comments on commit a39aa8a

Please sign in to comment.