From b986ef5ddc5c29a310674c0ae6173bcba9fb1503 Mon Sep 17 00:00:00 2001 From: WeijiZhuang Date: Tue, 2 Aug 2022 23:38:41 +0800 Subject: [PATCH 1/3] add multiprocessing for wenetspeech text segmentation --- egs/wenetspeech/ASR/local/text2segments.py | 58 +++++++++++++++------- 1 file changed, 41 insertions(+), 17 deletions(-) diff --git a/egs/wenetspeech/ASR/local/text2segments.py b/egs/wenetspeech/ASR/local/text2segments.py index 3df727c67d..c98bfc0229 100644 --- a/egs/wenetspeech/ASR/local/text2segments.py +++ b/egs/wenetspeech/ASR/local/text2segments.py @@ -2,6 +2,7 @@ # -*- coding: utf-8 -*- # Copyright 2021 Xiaomi Corp. (authors: Mingshuang Luo) +# 2022 Xiaomi Corp. (authors: Weiji Zhuang) # # See ../../../../LICENSE for clarification regarding multiple authors # @@ -29,10 +30,17 @@ import argparse - -import jieba from tqdm import tqdm +from multiprocessing import Pool +import paddle +import jieba +# In PaddlePaddle 2.x, dynamic graph mode is turned on by default, +# and 'data()' is only supported in static graph mode. So if you +# want to use this api, should call 'paddle.enable_static()' before +# this api to enter static graph mode. +paddle.enable_static() +paddle.disable_signal_handler() jieba.enable_paddle() @@ -41,42 +49,58 @@ def get_parser(): description="Chinese Word Segmentation for text", formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) + parser.add_argument( + "--num-process", + "-n", + default=20, + type=int, + help="the number of processes" + ) parser.add_argument( "--input-file", + "-i", default="data/lang_char/text", type=str, - help="the input text file for WenetSpeech", + help="the input text file for WenetSpeech" ) parser.add_argument( "--output-file", + "-o", default="data/lang_char/text_words_segmentation", type=str, - help="the text implemented with words segmenting for WenetSpeech", + help="the text implemented with words segmenting for WenetSpeech" ) return parser +def cut(lines): + if lines != None: + cut_lines = jieba.cut(lines, use_paddle=True) + return [i for i in cut_lines] + else: + return None + + def main(): parser = get_parser() args = parser.parse_args() + num_process = args.num_process input_file = args.input_file output_file = args.output_file + # parallel mode does not support use_paddle + # jieba.enable_parallel(num_process) + + with open(input_file, "r", encoding="utf-8") as fr: + lines = fr.readlines() + + with Pool(processes=num_process) as p: + new_lines = list(tqdm(p.imap(cut, lines), total=len(lines))) - f = open(input_file, "r", encoding="utf-8") - lines = f.readlines() - new_lines = [] - for i in tqdm(range(len(lines))): - x = lines[i].rstrip() - seg_list = jieba.cut(x, use_paddle=True) - new_line = " ".join(seg_list) - new_lines.append(new_line) - - f_new = open(output_file, "w", encoding="utf-8") - for line in new_lines: - f_new.write(line) - f_new.write("\n") + with open(output_file, "w", encoding="utf-8") as fw: + for line in new_lines: + fw.write(' '.join(line) + "\n") if __name__ == "__main__": From fffd3bbd9e57833151e4ab8ab956dae0b8f7d6d4 Mon Sep 17 00:00:00 2001 From: WeijiZhuang Date: Tue, 2 Aug 2022 23:40:27 +0800 Subject: [PATCH 2/3] Fix preparing char based lang for wenetspeech --- egs/wenetspeech/ASR/prepare.sh | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/egs/wenetspeech/ASR/prepare.sh b/egs/wenetspeech/ASR/prepare.sh index 6573a94ade..755fbb2d70 100755 --- a/egs/wenetspeech/ASR/prepare.sh +++ b/egs/wenetspeech/ASR/prepare.sh @@ -28,6 +28,7 @@ num_splits=1000 # - speech dl_dir=$PWD/download +lang_char_dir=data/lang_char . shared/parse_options.sh || exit 1 @@ -186,24 +187,27 @@ fi if [ $stage -le 15 ] && [ $stop_stage -ge 15 ]; then log "Stage 15: Prepare char based lang" - lang_char_dir=data/lang_char mkdir -p $lang_char_dir - # Prepare text. - # Note: in Linux, you can install jq with the following command: - # 1. wget -O jq https://github.com/stedolan/jq/releases/download/jq-1.6/jq-linux64 - # 2. chmod +x ./jq - # 3. cp jq /usr/bin - if [ ! -f $lang_char_dir/text ]; then - gunzip -c data/manifests/supervisions_L.jsonl.gz \ - | jq 'text' | sed 's/"//g' \ + if ! which jq; then + echo "This script is intended to be used with jq but you have not installed jq + Note: in Linux, you can install jq with the following command: + 1. wget -O jq https://github.com/stedolan/jq/releases/download/jq-1.6/jq-linux64 + 2. chmod +x ./jq + 3. cp jq /usr/bin" && exit 1 + fi + if [ ! -f $lang_char_dir/text ] || [ ! -s $lang_char_dir/text ]; then + log "Prepare text." + gunzip -c data/manifests/wenetspeech_supervisions_L.jsonl.gz \ + | jq '.text' | sed 's/"//g' \ | ./local/text2token.py -t "char" > $lang_char_dir/text fi # The implementation of chinese word segmentation for text, # and it will take about 15 minutes. if [ ! -f $lang_char_dir/text_words_segmentation ]; then - python ./local/text2segments.py \ + python3 ./local/text2segments.py \ + --num-process $nj \ --input-file $lang_char_dir/text \ --output-file $lang_char_dir/text_words_segmentation fi @@ -212,7 +216,7 @@ if [ $stage -le 15 ] && [ $stop_stage -ge 15 ]; then | sort -u | sed '/^$/d' | uniq > $lang_char_dir/words_no_ids.txt if [ ! -f $lang_char_dir/words.txt ]; then - python ./local/prepare_words.py \ + python3 ./local/prepare_words.py \ --input-file $lang_char_dir/words_no_ids.txt \ --output-file $lang_char_dir/words.txt fi @@ -221,7 +225,7 @@ fi if [ $stage -le 16 ] && [ $stop_stage -ge 16 ]; then log "Stage 16: Prepare char based L_disambig.pt" if [ ! -f data/lang_char/L_disambig.pt ]; then - python ./local/prepare_char.py \ + python3 ./local/prepare_char.py \ --lang-dir data/lang_char fi fi @@ -232,9 +236,8 @@ if [ $stage -le 17 ] && [ $stop_stage -ge 17 ]; then # It will take about 20 minutes. # We assume you have install kaldilm, if not, please install # it using: pip install kaldilm - lang_char_dir=data/lang_char if [ ! -f $lang_char_dir/3-gram.unpruned.arpa ]; then - python ./shared/make_kn_lm.py \ + python3 ./shared/make_kn_lm.py \ -ngram-order 3 \ -text $lang_char_dir/text_words_segmentation \ -lm $lang_char_dir/3-gram.unpruned.arpa @@ -253,6 +256,5 @@ fi if [ $stage -le 18 ] && [ $stop_stage -ge 18 ]; then log "Stage 18: Compile LG" - lang_char_dir=data/lang_char python ./local/compile_lg.py --lang-dir $lang_char_dir fi From 71871d5614dc918798b52ba15abc26ec59535d7d Mon Sep 17 00:00:00 2001 From: WeijiZhuang Date: Wed, 3 Aug 2022 18:24:07 +0800 Subject: [PATCH 3/3] fix style --- egs/wenetspeech/ASR/local/text2segments.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/egs/wenetspeech/ASR/local/text2segments.py b/egs/wenetspeech/ASR/local/text2segments.py index c98bfc0229..df5b3c1195 100644 --- a/egs/wenetspeech/ASR/local/text2segments.py +++ b/egs/wenetspeech/ASR/local/text2segments.py @@ -30,11 +30,12 @@ import argparse -from tqdm import tqdm from multiprocessing import Pool -import paddle import jieba +import paddle +from tqdm import tqdm + # In PaddlePaddle 2.x, dynamic graph mode is turned on by default, # and 'data()' is only supported in static graph mode. So if you # want to use this api, should call 'paddle.enable_static()' before @@ -54,29 +55,29 @@ def get_parser(): "-n", default=20, type=int, - help="the number of processes" + help="the number of processes", ) parser.add_argument( "--input-file", "-i", default="data/lang_char/text", type=str, - help="the input text file for WenetSpeech" + help="the input text file for WenetSpeech", ) parser.add_argument( "--output-file", "-o", default="data/lang_char/text_words_segmentation", type=str, - help="the text implemented with words segmenting for WenetSpeech" + help="the text implemented with words segmenting for WenetSpeech", ) return parser def cut(lines): - if lines != None: - cut_lines = jieba.cut(lines, use_paddle=True) + if lines is not None: + cut_lines = jieba.cut(lines, use_paddle=True) return [i for i in cut_lines] else: return None @@ -100,7 +101,7 @@ def main(): with open(output_file, "w", encoding="utf-8") as fw: for line in new_lines: - fw.write(' '.join(line) + "\n") + fw.write(" ".join(line) + "\n") if __name__ == "__main__":