diff --git a/egs/commonvoice/ASR/local/preprocess_commonvoice.py b/egs/commonvoice/ASR/local/preprocess_commonvoice.py index c0be499c4f..ae1a2c09a8 100755 --- a/egs/commonvoice/ASR/local/preprocess_commonvoice.py +++ b/egs/commonvoice/ASR/local/preprocess_commonvoice.py @@ -21,7 +21,7 @@ from pathlib import Path from typing import Optional -from lhotse import CutSet, SupervisionSegment +from lhotse import CutSet from lhotse.recipes.utils import read_manifests_if_cached @@ -82,6 +82,17 @@ def normalize_text(utt: str, language: str) -> str: .replace("…", "") .replace("⋯", "") .replace("·", "") + .replace("﹒", "") + .replace(".", "") + .replace(":", "") + .replace("︰", "") + .replace("﹖", "") + .replace("(", "") + .replace(")", "") + .replace("-", "") + .replace("~", "") + .replace(";", "") + .replace("", "") .upper() ) else: