Skip to content

Commit

Permalink
Keep the custom fields in libriheavy manifest (#1719)
Browse files Browse the repository at this point in the history
  • Loading branch information
marcoyang1998 committed Aug 17, 2024
1 parent 6ac3343 commit 5952972
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 4 deletions.
10 changes: 7 additions & 3 deletions egs/libriheavy/ASR/local/prepare_manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,17 +29,21 @@ def simple_cleanup(text: str) -> str:

# Assign text of the supervisions and remove unnecessary entries.
def main():
    """Clean up a Libriheavy cuts manifest and write it into OUTPUT_DIR.

    Arguments are read from ``sys.argv``:

      1. INPUT: path to a gzip-compressed manifest with one JSON cut per line.
      2. OUTPUT_DIR: directory that receives the processed manifest; the
         output file has the same name as INPUT.
      3. KEEP_CUSTOM_FIELDS: a boolean given as text, e.g. ``True`` or
         ``False`` (case-insensitive; ``1``/``0``, ``yes``/``no`` and
         ``t``/``f``/``y``/``n`` are accepted too).

    For every cut, the text of the first supervision is replaced by the
    cleaned-up first entry of that supervision's ``custom["texts"]``.  Unless
    KEEP_CUSTOM_FIELDS is true, the ``custom`` entries of the first
    supervision and of the cut itself are then removed.

    Raises:
      AssertionError: if the number of command-line arguments is wrong.
      ValueError: if KEEP_CUSTOM_FIELDS is not a recognized boolean string.
    """
    assert (
        len(sys.argv) == 4
    ), "Usage: ./local/prepare_manifest.py INPUT OUTPUT_DIR KEEP_CUSTOM_FIELDS"
    fname = Path(sys.argv[1]).name
    oname = Path(sys.argv[2]) / fname

    # The flag arrives as text.  bool() of any non-empty string is True, so
    # bool("False") would wrongly keep the custom fields; parse it explicitly.
    flag = sys.argv[3].strip().lower()
    if flag in ("true", "t", "yes", "y", "1"):
        keep_custom_fields = True
    elif flag in ("false", "f", "no", "n", "0", ""):
        # The empty string is treated as false, matching bool("").
        keep_custom_fields = False
    else:
        raise ValueError(
            f"KEEP_CUSTOM_FIELDS must be True or False, got: {sys.argv[3]!r}"
        )

    with gzip.open(sys.argv[1], "r") as fin, gzip.open(oname, "w") as fout:
        for line in fin:
            cut = json.loads(line)
            supervision = cut["supervisions"][0]
            supervision["text"] = simple_cleanup(supervision["custom"]["texts"][0])
            if not keep_custom_fields:
                del supervision["custom"]
                del cut["custom"]
            # The gzip stream is opened in binary mode, so encode each line.
            fout.write((json.dumps(cut) + "\n").encode())


Expand Down
7 changes: 6 additions & 1 deletion egs/libriheavy/ASR/prepare.sh
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,11 @@ export CUDA_VISIBLE_DEVICES=""
# - speech
dl_dir=$PWD/download

# If you want to do PromptASR experiments, please set keep_custom_fields to
# True: this keeps the texts and pre_text information required for
# the training of PromptASR.
keep_custom_fields=False

. shared/parse_options.sh || exit 1

# vocab size for sentence piece models.
Expand Down Expand Up @@ -134,7 +139,7 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
for subset in small medium large dev test_clean test_other; do
if [ ! -e $manifests_dir/libriheavy_cuts_${subset}.jsonl.gz ]; then
log "Prepare manifest for subset : ${subset}"
./local/prepare_manifest.py $dl_dir/libriheavy/libriheavy_cuts_${subset}.jsonl.gz $manifests_dir
./local/prepare_manifest.py $dl_dir/libriheavy/libriheavy_cuts_${subset}.jsonl.gz $manifests_dir $keep_custom_fields
fi
done
fi
Expand Down

0 comments on commit 5952972

Please sign in to comment.