Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix preparing char based lang and add multiprocessing for wenetspeech text segmentation #513

Merged
merged 3 commits into from
Aug 3, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 38 additions & 13 deletions egs/wenetspeech/ASR/local/text2segments.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# -*- coding: utf-8 -*-

# Copyright 2021 Xiaomi Corp. (authors: Mingshuang Luo)
# 2022 Xiaomi Corp. (authors: Weiji Zhuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
Expand Down Expand Up @@ -29,10 +30,18 @@


import argparse
from multiprocessing import Pool

import jieba
import paddle
from tqdm import tqdm

# In PaddlePaddle 2.x, dynamic graph mode is turned on by default,
# and 'data()' is only supported in static graph mode. To use that
# API, 'paddle.enable_static()' must be called first so that paddle
# enters static graph mode.
paddle.enable_static()
# NOTE(review): presumably prevents paddle from installing its own signal
# handlers in this process and in the Pool workers -- confirm against the
# paddle documentation.
paddle.disable_signal_handler()
# Switch jieba to paddle mode; this file calls jieba.cut(..., use_paddle=True).
jieba.enable_paddle()


Expand All @@ -41,14 +50,23 @@ def get_parser():
description="Chinese Word Segmentation for text",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
parser.add_argument(
"--num-process",
"-n",
default=20,
type=int,
help="the number of processes",
)
parser.add_argument(
"--input-file",
"-i",
default="data/lang_char/text",
type=str,
help="the input text file for WenetSpeech",
)
parser.add_argument(
"--output-file",
"-o",
default="data/lang_char/text_words_segmentation",
type=str,
help="the text implemented with words segmenting for WenetSpeech",
Expand All @@ -57,26 +75,33 @@ def get_parser():
return parser


def cut(lines):
    """Segment a piece of text into words with jieba in paddle mode.

    Args:
      lines:
        The text to segment. Despite the plural name, ``main`` passes a
        single line per call. May be None.

    Returns:
      A list holding every segment yielded by ``jieba.cut`` for the text,
      or None when ``lines`` is None.
    """
    if lines is None:
        return None
    # jieba.cut returns a lazy generator; materialize it so the result can
    # be sent back from a worker process.
    return list(jieba.cut(lines, use_paddle=True))


def main():
    """Segment every line of the input file into words and write the result.

    Reads the command line options built by ``get_parser``, loads all lines
    of ``--input-file``, segments them with ``cut`` in a pool of
    ``--num-process`` worker processes, and writes one space-joined line of
    segments per input line to ``--output-file``.
    """
    parser = get_parser()
    args = parser.parse_args()

    num_process = args.num_process
    input_file = args.input_file
    output_file = args.output_file

    with open(input_file, "r", encoding="utf-8") as fr:
        # Drop the trailing newline/whitespace so it is not handed to the
        # segmenter as part of the text.
        lines = [line.rstrip() for line in fr]

    # jieba's own parallel mode (jieba.enable_parallel) does not support
    # use_paddle, so the lines are spread over a multiprocessing pool.
    # imap keeps the results in input order.
    with Pool(processes=num_process) as p:
        new_lines = list(tqdm(p.imap(cut, lines), total=len(lines)))

    with open(output_file, "w", encoding="utf-8") as fw:
        for line in new_lines:
            fw.write(" ".join(line) + "\n")


if __name__ == "__main__":
Expand Down
32 changes: 17 additions & 15 deletions egs/wenetspeech/ASR/prepare.sh
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ num_splits=1000
# - speech

dl_dir=$PWD/download
lang_char_dir=data/lang_char

. shared/parse_options.sh || exit 1

Expand Down Expand Up @@ -186,24 +187,27 @@ fi

if [ $stage -le 15 ] && [ $stop_stage -ge 15 ]; then
log "Stage 15: Prepare char based lang"
lang_char_dir=data/lang_char
mkdir -p $lang_char_dir

# Prepare text.
# Note: in Linux, you can install jq with the following command:
# 1. wget -O jq https://github.com/stedolan/jq/releases/download/jq-1.6/jq-linux64
# 2. chmod +x ./jq
# 3. cp jq /usr/bin
if [ ! -f $lang_char_dir/text ]; then
gunzip -c data/manifests/supervisions_L.jsonl.gz \
| jq 'text' | sed 's/"//g' \
if ! which jq; then
echo "This script is intended to be used with jq but you have not installed jq
Note: in Linux, you can install jq with the following command:
1. wget -O jq https://github.com/stedolan/jq/releases/download/jq-1.6/jq-linux64
2. chmod +x ./jq
3. cp jq /usr/bin" && exit 1
fi
if [ ! -f $lang_char_dir/text ] || [ ! -s $lang_char_dir/text ]; then
log "Prepare text."
gunzip -c data/manifests/wenetspeech_supervisions_L.jsonl.gz \
| jq '.text' | sed 's/"//g' \
| ./local/text2token.py -t "char" > $lang_char_dir/text
fi

# The implementation of chinese word segmentation for text,
# and it will take about 15 minutes.
if [ ! -f $lang_char_dir/text_words_segmentation ]; then
python ./local/text2segments.py \
python3 ./local/text2segments.py \
--num-process $nj \
--input-file $lang_char_dir/text \
--output-file $lang_char_dir/text_words_segmentation
fi
Expand All @@ -212,7 +216,7 @@ if [ $stage -le 15 ] && [ $stop_stage -ge 15 ]; then
| sort -u | sed '/^$/d' | uniq > $lang_char_dir/words_no_ids.txt

if [ ! -f $lang_char_dir/words.txt ]; then
python ./local/prepare_words.py \
python3 ./local/prepare_words.py \
--input-file $lang_char_dir/words_no_ids.txt \
--output-file $lang_char_dir/words.txt
fi
Expand All @@ -221,7 +225,7 @@ fi
# Stage 16: build the char based L_disambig.pt with local/prepare_char.py.
# Skipped when data/lang_char/L_disambig.pt already exists.
if [ $stage -le 16 ] && [ $stop_stage -ge 16 ]; then
  log "Stage 16: Prepare char based L_disambig.pt"
  if [ ! -f data/lang_char/L_disambig.pt ]; then
    python3 ./local/prepare_char.py \
      --lang-dir data/lang_char
  fi
fi
Expand All @@ -232,9 +236,8 @@ if [ $stage -le 17 ] && [ $stop_stage -ge 17 ]; then
# It will take about 20 minutes.
# We assume you have installed kaldilm; if not, please install
# it using: pip install kaldilm
lang_char_dir=data/lang_char
if [ ! -f $lang_char_dir/3-gram.unpruned.arpa ]; then
python ./shared/make_kn_lm.py \
python3 ./shared/make_kn_lm.py \
-ngram-order 3 \
-text $lang_char_dir/text_words_segmentation \
-lm $lang_char_dir/3-gram.unpruned.arpa
Expand All @@ -253,6 +256,5 @@ fi

# Stage 18: compile LG with local/compile_lg.py.
# Uses the script-level $lang_char_dir (set before parse_options.sh is
# sourced) instead of re-assigning it here, so an overridden value is kept.
if [ $stage -le 18 ] && [ $stop_stage -ge 18 ]; then
  log "Stage 18: Compile LG"
  python3 ./local/compile_lg.py --lang-dir $lang_char_dir
fi