From ec518ccc74b85e3b50304ab70ae5a1f069df0038 Mon Sep 17 00:00:00 2001 From: Gunnar Thor Date: Wed, 23 Feb 2022 11:31:56 +0000 Subject: [PATCH 1/4] Add progress bar to phonemization --- .../pyscripts/utils/convert_text_to_phn.py | 26 ++++++++++++++++--- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/egs2/TEMPLATE/asr1/pyscripts/utils/convert_text_to_phn.py b/egs2/TEMPLATE/asr1/pyscripts/utils/convert_text_to_phn.py index 21f8f4daf46..bb8be8b861b 100755 --- a/egs2/TEMPLATE/asr1/pyscripts/utils/convert_text_to_phn.py +++ b/egs2/TEMPLATE/asr1/pyscripts/utils/convert_text_to_phn.py @@ -7,9 +7,11 @@ import argparse import codecs +from tqdm import tqdm +import contextlib from joblib import delayed -from joblib import Parallel +from joblib import Parallel, parallel from espnet2.text.cleaner import TextCleaner from espnet2.text.phoneme_tokenizer import PhonemeTokenizer @@ -34,13 +36,29 @@ def main(): text = {line.split()[0]: " ".join(line.split()[1:]) for line in lines} if cleaner is not None: text = {k: cleaner(v) for k, v in text.items()} - phns_list = Parallel(n_jobs=args.nj)( - [delayed(phoneme_tokenizer.text2tokens)(sentence) for sentence in text.values()] - ) + with tqdm_joblib(tqdm(total=len(text.values()), desc="Phonemizing")) as progress_bar: + phns_list = Parallel(n_jobs=args.nj)( + [delayed(phoneme_tokenizer.text2tokens)(sentence) for sentence in text.values()] + ) with codecs.open(args.out_text, "w", encoding="utf8") as g: for utt_id, phns in zip(text.keys(), phns_list): g.write(f"{utt_id} " + " ".join(phns) + "\n") +@contextlib.contextmanager +def tqdm_joblib(tqdm_object): + """Context manager to patch joblib to report into tqdm progress bar given as argument""" + class TqdmBatchCompletionCallback(parallel.BatchCompletionCallBack): + def __call__(self, *args, **kwargs): + tqdm_object.update(n=self.batch_size) + return super().__call__(*args, **kwargs) + + old_batch_callback = parallel.BatchCompletionCallBack + parallel.BatchCompletionCallBack = TqdmBatchCompletionCallback + try: + yield tqdm_object + finally: + parallel.BatchCompletionCallBack = old_batch_callback + tqdm_object.close() if __name__ == "__main__": main() From 91d48d920c229af3902fc05c361ba1b5f1636c67 Mon Sep 17 00:00:00 2001 From: Gunnar Thor Date: Tue, 26 Apr 2022 22:21:13 +0000 Subject: [PATCH 2/4] applied black --- .../asr1/pyscripts/utils/convert_text_to_phn.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/egs2/TEMPLATE/asr1/pyscripts/utils/convert_text_to_phn.py b/egs2/TEMPLATE/asr1/pyscripts/utils/convert_text_to_phn.py index bb8be8b861b..5686129211d 100755 --- a/egs2/TEMPLATE/asr1/pyscripts/utils/convert_text_to_phn.py +++ b/egs2/TEMPLATE/asr1/pyscripts/utils/convert_text_to_phn.py @@ -36,17 +36,24 @@ def main(): text = {line.split()[0]: " ".join(line.split()[1:]) for line in lines} if cleaner is not None: text = {k: cleaner(v) for k, v in text.items()} - with tqdm_joblib(tqdm(total=len(text.values()), desc="Phonemizing")) as progress_bar: + with tqdm_joblib( + tqdm(total=len(text.values()), desc="Phonemizing") + ) as progress_bar: phns_list = Parallel(n_jobs=args.nj)( - [delayed(phoneme_tokenizer.text2tokens)(sentence) for sentence in text.values()] + [ + delayed(phoneme_tokenizer.text2tokens)(sentence) + for sentence in text.values() + ] ) with codecs.open(args.out_text, "w", encoding="utf8") as g: for utt_id, phns in zip(text.keys(), phns_list): g.write(f"{utt_id} " + " ".join(phns) + "\n") + @contextlib.contextmanager def tqdm_joblib(tqdm_object): """Context manager to patch joblib to report into tqdm progress bar given as argument""" + class TqdmBatchCompletionCallback(parallel.BatchCompletionCallBack): def __call__(self, *args, **kwargs): tqdm_object.update(n=self.batch_size) @@ -60,5 +67,6 @@ def __call__(self, *args, **kwargs): parallel.BatchCompletionCallBack = old_batch_callback tqdm_object.close() + if __name__ == "__main__": main() From 10e6c7ea2e5783442631213dfc20dd7b9543839d Mon Sep 17 00:00:00 2001 From: Gunnar Thor Date: Wed, 27 Apr 2022 09:30:47 +0000 Subject: [PATCH 3/4] split docstring to conform with linter --- egs2/TEMPLATE/asr1/pyscripts/utils/convert_text_to_phn.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/egs2/TEMPLATE/asr1/pyscripts/utils/convert_text_to_phn.py b/egs2/TEMPLATE/asr1/pyscripts/utils/convert_text_to_phn.py index 5686129211d..67628f6ad00 100755 --- a/egs2/TEMPLATE/asr1/pyscripts/utils/convert_text_to_phn.py +++ b/egs2/TEMPLATE/asr1/pyscripts/utils/convert_text_to_phn.py @@ -52,7 +52,10 @@ def main(): @contextlib.contextmanager def tqdm_joblib(tqdm_object): - """Context manager to patch joblib to report into tqdm progress bar given as argument""" + """ + Context manager to patch joblib to report + into tqdm progress bar given as argument + """ class TqdmBatchCompletionCallback(parallel.BatchCompletionCallBack): def __call__(self, *args, **kwargs): From 664414c8f27d5148377ffa733c7f8369eaf7ebd4 Mon Sep 17 00:00:00 2001 From: kan-bayashi Date: Thu, 28 Apr 2022 21:31:45 +0900 Subject: [PATCH 4/4] fixed flake8 --- .../pyscripts/utils/convert_text_to_phn.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/egs2/TEMPLATE/asr1/pyscripts/utils/convert_text_to_phn.py b/egs2/TEMPLATE/asr1/pyscripts/utils/convert_text_to_phn.py index 67628f6ad00..a6605409f15 100755 --- a/egs2/TEMPLATE/asr1/pyscripts/utils/convert_text_to_phn.py +++ b/egs2/TEMPLATE/asr1/pyscripts/utils/convert_text_to_phn.py @@ -1,17 +1,18 @@ #!/usr/bin/env python3 -# Copyright 2021 Tomoki Hayashi +# Copyright 2021 Tomoki Hayashi and Gunnar Thor # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """Convert kaldi-style text into phonemized sentences.""" import argparse import codecs -from tqdm import tqdm import contextlib from joblib import delayed -from joblib import Parallel, parallel +from joblib import Parallel +from joblib import parallel +from tqdm import tqdm from espnet2.text.cleaner import TextCleaner from espnet2.text.phoneme_tokenizer import PhonemeTokenizer @@ -36,9 +37,7 @@ def main(): text = {line.split()[0]: " ".join(line.split()[1:]) for line in lines} if cleaner is not None: text = {k: cleaner(v) for k, v in text.items()} - with tqdm_joblib( - tqdm(total=len(text.values()), desc="Phonemizing") - ) as progress_bar: + with tqdm_joblib(tqdm(total=len(text.values()), desc="Phonemizing")): phns_list = Parallel(n_jobs=args.nj)( [ delayed(phoneme_tokenizer.text2tokens)(sentence) @@ -52,9 +51,11 @@ def main(): @contextlib.contextmanager def tqdm_joblib(tqdm_object): - """ - Context manager to patch joblib to report - into tqdm progress bar given as argument + """Patch joblib to report into tqdm progress bar given as argument. + + Reference: + https://stackoverflow.com/questions/24983493 + """ class TqdmBatchCompletionCallback(parallel.BatchCompletionCallBack):