Fix failure of ljspeech's get_data.py (NVIDIA#7430)
* Fix failure of ljspeech's get_data.py

Signed-off-by: Robin Dong <robin.k.dong@gmail.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Signed-off-by: Robin Dong <robin.k.dong@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Signed-off-by: Sasha Meister <sasha.meister.work@gmail.com>
2 people authored and ssh-meister committed Oct 5, 2023
1 parent 569dabc commit edb95cd
Showing 1 changed file with 3 additions and 20 deletions.
scripts/dataset_processing/tts/ljspeech/get_data.py (3 additions, 20 deletions)

@@ -27,11 +27,6 @@
 def get_args():
     parser = argparse.ArgumentParser(description='Download LJSpeech and create manifests with predefined split')
     parser.add_argument("--data-root", required=True, type=Path)
-    parser.add_argument(
-        '--whitelist-path',
-        type=str,
-        default="lj_speech.tsv extracted from the readme file in the dataset. You can also download the file from https://github.com/NVIDIA/NeMo-text-processing/blob/main/nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv",
-    )
 
     args = parser.parse_args()
     return args
@@ -57,20 +52,9 @@ def __extract_file(filepath, data_dir):
         print(f"Error while extracting {filepath}. Already extracted?")
 
 
-def __process_data(data_root, whitelist_path):
-    if whitelist_path is None:
-        wget.download(
-            "https://raw.githubusercontent.com/NVIDIA/NeMo-text-processing/main/nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv",
-            out=str(data_root),
-        )
-        whitelist_path = data_root / "lj_speech.tsv"
-
+def __process_data(data_root):
     text_normalizer = Normalizer(
-        lang="en",
-        input_case="cased",
-        whitelist=whitelist_path,
-        overwrite_cache=True,
-        cache_dir=data_root / "cache_dir",
+        lang="en", input_case="cased", overwrite_cache=True, cache_dir=data_root / "cache_dir",
     )
     text_normalizer_call_kwargs = {"punct_pre_process": True, "punct_post_process": True}
     normalizer_call = lambda x: text_normalizer.normalize(x, **text_normalizer_call_kwargs)
@@ -117,9 +101,8 @@ def main():
     __extract_file(str(tarred_data_path), str(args.data_root))
 
     data_root = args.data_root / "LJSpeech-1.1"
-    whitelist_path = args.whitelist_path
 
-    __process_data(data_root, whitelist_path)
+    __process_data(data_root)
 
 
 if __name__ == '__main__':
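
For anyone adapting downstream code to this change, the sketch below shows how the normalizer is now constructed without the lj_speech.tsv whitelist. It is a minimal, hypothetical example, not the script itself: it assumes nemo_text_processing is installed, that Normalizer is imported from nemo_text_processing.text_normalization.normalize as in the rest of the script, and the data_root path and sample sentence are placeholders.

from pathlib import Path

# Assumed import path; this commit does not touch the script's own import of Normalizer.
from nemo_text_processing.text_normalization.normalize import Normalizer

data_root = Path("./LJSpeech-1.1")  # hypothetical location of the extracted dataset

# After this fix, the normalizer uses only its built-in English rules;
# no whitelist file is downloaded or passed in.
text_normalizer = Normalizer(
    lang="en", input_case="cased", overwrite_cache=True, cache_dir=data_root / "cache_dir",
)
text_normalizer_call_kwargs = {"punct_pre_process": True, "punct_post_process": True}

# Normalize one sample line the same way __process_data normalizes manifest text.
print(text_normalizer.normalize("Dr. Jones arrived at 10 a.m.", **text_normalizer_call_kwargs))

With the --whitelist-path flag removed, the script is expected to be invoked with only --data-root, e.g. python scripts/dataset_processing/tts/ljspeech/get_data.py --data-root <path>.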
