wenet-e2e · czy97 · Aug 20, 2024 · Aug 11, 2024 · Aug 11, 2024 · Aug 11, 2024
diff --git a/examples/voxconverse/v3/README.md b/examples/voxconverse/v3/README.md
@@ -0,0 +1,34 @@
+## Overview
+
+* We suggest running this recipe on a machine with a GPU and with onnxruntime-gpu installed.
+* Dataset: voxconverse_dev that consists of 216 utterances
+* Speaker model: ResNet34 model pretrained by wespeaker
+  * Refer to [voxceleb sv recipe](https://github.com/wenet-e2e/wespeaker/tree/master/examples/voxceleb/v2)
+  * [pretrained model path](https://wespeaker-1256283475.cos.ap-shanghai.myqcloud.com/models/voxceleb/voxceleb_resnet34_LM.onnx)
+* Speaker activity detection model: oracle SAD (from ground truth annotation) or system SAD (VAD model pretrained by silero, https://github.com/snakers4/silero-vad)
+* Clustering method: umap clustering (see `wespeaker/diar/umap_clusterer.py`)
+* Metric: DER = MISS + FALSE ALARM + SPEAKER CONFUSION (%)
+
+## Results
+
+* Dev set
+
+    | system | MISS | FA | SC | DER |
+    |:---|:---:|:---:|:---:|:---:|
+    | This repo (with oracle SAD) | 2.3 | 0.0 | 1.3 | 3.6 |
+    | This repo (with system SAD) | 3.4 | 0.6 | 1.4 | 5.4 |
+    | DIHARD 2019 baseline [^1] | 11.1 | 1.4 | 11.3 | 23.8 |
+    | DIHARD 2019 baseline w/ SE [^1] | 9.3 | 1.3 | 9.7 | 20.2 |
+    | (SyncNet ASD only) [^1] | 2.2 | 4.1 | 4.0 | 10.4 |
+    | (AVSE ASD only) [^1] | 2.0 | 5.9 | 4.6 | 12.4 |
+    | (proposed) [^1] | 2.4 | 2.3 | 3.0 | 7.7 |
+
+* Test set
+
+    | system | MISS | FA | SC | DER |
+    |:---|:---:|:---:|:---:|:---:|
+    | This repo (with oracle SAD) | 1.6 | 0.0 | 1.9 | 3.5 |
+    | This repo (with system SAD) | 3.8 | 1.7 | 1.8 | 7.4 |
+
+
+[^1]: Spot the conversation: speaker diarisation in the wild, https://arxiv.org/pdf/2007.01216.pdf
diff --git a/examples/voxconverse/v3/local/extract_emb.sh b/examples/voxconverse/v3/local/extract_emb.sh
@@ -0,0 +1,99 @@
+#!/bin/bash
+# Copyright (c) 2022 Zhengyang Chen (chenzhengyang117@gmail.com)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Extract speaker embeddings for the entries of an scp file, optionally with
+# several background processes.
+#
+# Options (handled by tools/parse_options.sh):
+#   --scp               input scp file (required)
+#   --pretrained_model  forwarded to extract_emb.py as --source (required)
+#   --store_dir         output directory (required)
+#   --nj                maximum number of parallel extraction processes
+#   --device, --batch_size, --frame_shift, --window_secs, --period_secs,
+#   --subseg_cmn        forwarded unchanged to wespeaker/diar/extract_emb.py
+#
+# Output: $store_dir/emb_XXX.ark per split, logs under $store_dir/log, and
+# $store_dir/emb.scp merged from the per-split emb_XXX.scp files (presumably
+# written by extract_emb.py next to each ark -- confirm in that script).
+
+. ./path.sh || exit 1
+
+scp=''
+pretrained_model=''
+device=cuda
+store_dir=''
+subseg_cmn=true
+nj=1
+
+batch_size=96
+frame_shift=10
+window_secs=1.5
+period_secs=0.75
+
+. tools/parse_options.sh
+
+if [ -z "$scp" ] || [ -z "$pretrained_model" ] || [ -z "$store_dir" ]; then
+    echo "$0: --scp, --pretrained_model and --store_dir are required." >&2
+    exit 1
+fi
+
+split_dir=$store_dir/split_scp
+log_dir=$store_dir/log
+mkdir -p "$split_dir" "$log_dir"
+
+# Split the scp file into at most $nj sub-files (ceiling division) so that the
+# embeddings can be extracted by multiple processes.
+file_len=$(wc -l < "$scp")
+subfile_len=$(( (file_len + nj - 1) / nj ))
+[ "$subfile_len" -gt 0 ] || subfile_len=1
+prefix='split'
+# Drop leftovers of a previous run (possibly with a larger nj); otherwise they
+# would be processed again or merged into emb.scp below.
+rm -f "${split_dir}/${prefix}_scp_"* "$store_dir"/emb_*.scp
+split -l "$subfile_len" -d -a 3 "$scp" "${split_dir}/${prefix}_scp_"
+
+# split may create fewer than $nj files, so iterate over the files that exist
+# instead of counting from 0 to nj-1.
+pids=()
+for scp_subfile in "${split_dir}/${prefix}_scp_"*; do
+    [ -f "$scp_subfile" ] || continue
+    suffix=${scp_subfile##*_}
+    write_ark=$store_dir/emb_${suffix}.ark
+    python3 wespeaker/diar/extract_emb.py \
+            --scp "${scp_subfile}" \
+            --ark-path "${write_ark}" \
+            --source "${pretrained_model}" \
+            --device "${device}" \
+            --batch-size "${batch_size}" \
+            --frame-shift "${frame_shift}" \
+            --window-secs "${window_secs}" \
+            --period-secs "${period_secs}" \
+            --subseg-cmn "${subseg_cmn}" \
+            > "${log_dir}/${prefix}.${suffix}.log" 2>&1 &
+    pids+=($!)
+done
+
+# Wait for every job and remember whether any of them failed.
+failed=0
+for pid in "${pids[@]}"; do
+    wait "$pid" || failed=1
+done
+if [ "$failed" -ne 0 ]; then
+    echo "$0: embedding extraction failed, see logs in ${log_dir}" >&2
+    exit 1
+fi
+
+cat "$store_dir"/emb_*.scp > "$store_dir/emb.scp"
+echo "Finish extract embedding."
diff --git a/examples/voxconverse/v3/local/make_fbank.sh b/examples/voxconverse/v3/local/make_fbank.sh
@@ -0,0 +1,87 @@
+#!/bin/bash
+# Copyright (c) 2022 Zhengyang Chen (chenzhengyang117@gmail.com)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Compute Fbank features for the entries of an scp file, optionally with
+# several background processes.
+#
+# Options (handled by tools/parse_options.sh):
+#   --scp         input scp file (required)
+#   --segments    segments file forwarded to make_fbank.py (required)
+#   --store_dir   output directory (required)
+#   --subseg_cmn  forwarded unchanged to wespeaker/diar/make_fbank.py
+#   --nj          maximum number of parallel feature-extraction processes
+#
+# Output: $store_dir/fbank_XXX.ark per split, logs under $store_dir/log, and
+# $store_dir/fbank.scp merged from the per-split fbank_XXX.scp files
+# (presumably written by make_fbank.py next to each ark -- confirm there).
+
+. ./path.sh || exit 1
+
+scp=''
+segments=''
+store_dir=''
+subseg_cmn=true
+nj=1
+
+. tools/parse_options.sh
+
+if [ -z "$scp" ] || [ -z "$segments" ] || [ -z "$store_dir" ]; then
+    echo "$0: --scp, --segments and --store_dir are required." >&2
+    exit 1
+fi
+
+split_dir=$store_dir/split_scp
+log_dir=$store_dir/log
+mkdir -p "$split_dir" "$log_dir"
+
+# Split the scp file into at most $nj sub-files (ceiling division) so that the
+# Fbank features can be extracted by multiple processes.
+file_len=$(wc -l < "$scp")
+subfile_len=$(( (file_len + nj - 1) / nj ))
+[ "$subfile_len" -gt 0 ] || subfile_len=1
+prefix='split'
+# Drop leftovers of a previous run (possibly with a larger nj); otherwise they
+# would be processed again or merged into fbank.scp below.
+rm -f "${split_dir}/${prefix}_scp_"* "$store_dir"/fbank_*.scp
+split -l "$subfile_len" -d -a 3 "$scp" "${split_dir}/${prefix}_scp_"
+
+# split may create fewer than $nj files, so iterate over the files that exist
+# instead of counting from 0 to nj-1.
+pids=()
+for scp_subfile in "${split_dir}/${prefix}_scp_"*; do
+    [ -f "$scp_subfile" ] || continue
+    suffix=${scp_subfile##*_}
+    write_ark=$store_dir/fbank_${suffix}.ark
+    python3 wespeaker/diar/make_fbank.py \
+            --scp "${scp_subfile}" \
+            --segments "${segments}" \
+            --ark-path "${write_ark}" \
+            --subseg-cmn "${subseg_cmn}" \
+            > "${log_dir}/${prefix}.${suffix}.log" 2>&1 &
+    pids+=($!)
+done
+
+# Wait for every job and remember whether any of them failed.
+failed=0
+for pid in "${pids[@]}"; do
+    wait "$pid" || failed=1
+done
+if [ "$failed" -ne 0 ]; then
+    echo "$0: Fbank extraction failed, see logs in ${log_dir}" >&2
+    exit 1
+fi
+
+cat "$store_dir"/fbank_*.scp > "$store_dir/fbank.scp"
+echo "Finish make Fbank."
diff --git a/examples/voxconverse/v3/path.sh b/examples/voxconverse/v3/path.sh
@@ -0,0 +1,7 @@
+# Put the current working directory first on the command search path.
+export PATH="${PWD}:${PATH}"
+
+# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+# Put ../../../ (where the recipe's wespeaker symlink points) on Python's module path.
+export PYTHONPATH="../../../:${PYTHONPATH}"
diff --git a/examples/voxconverse/v3/run.sh b/examples/voxconverse/v3/run.sh
@@ -0,0 +1,186 @@
+#!/bin/bash
+# Copyright (c) 2022-2023 Xu Xiang
+#               2022 Zhengyang Chen (chenzhengyang117@gmail.com)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+. ./path.sh || exit 1
+
+stage=-1  # first stage to run; stage N (1-8) runs only when stage <= N <= stop_stage
+stop_stage=-1  # last stage to run; with both defaults at -1 no stage is executed
+sad_type="oracle"  # "oracle" or "system": selects the SAD branch in stage 3 and the paths used later
+partition="dev"  # data partition; used to build the data/ and exp/ paths below
+
+# do cmn on the sub-segment or on the vad segment
+subseg_cmn=true
+# whether print the evaluation result for each file
+get_each_file_res=1
+
+. tools/parse_options.sh
+
+# Prerequisite: fetch the scoring toolkit and the pretrained speaker model
+if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
+    # [1] Download evaluation toolkit (SCTK; its md-eval.pl is used in stage 8)
+    sctk_zip=external_tools/SCTK-v2.4.12.zip
+    mkdir -p external_tools
+    wget -c https://github.com/usnistgov/SCTK/archive/refs/tags/v2.4.12.zip -O "${sctk_zip}"
+    unzip -o "${sctk_zip}" -d external_tools
+
+    # [2] Download ResNet34 speaker model pretrained by WeSpeaker Team
+    onnx_model=pretrained_models/voxceleb_resnet34_LM.onnx
+    mkdir -p pretrained_models
+    wget -c https://wespeaker-1256283475.cos.ap-shanghai.myqcloud.com/models/voxceleb/voxceleb_resnet34_LM.onnx -O "${onnx_model}"
+fi
+
+
+# Fetch the VoxConverse annotations and the dev/test audios, then build wav.scp files
+if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
+    mkdir -p data
+
+    # Annotations for dev and test sets (version 0.0.3)
+    wget -c "https://github.com/joonson/voxconverse/archive/refs/heads/master.zip" -O data/voxconverse_master.zip
+    unzip -o data/voxconverse_master.zip -d data
+
+    # Download annotations from VoxSRC-23 validation toolkit (looks like version 0.0.2)
+    # cd data && git clone https://github.com/JaesungHuh/VoxSRC2023.git --recursive && cd -
+
+    # Dev audios
+    mkdir -p data/dev
+
+    #wget --no-check-certificate -c https://mm.kaist.ac.kr/datasets/voxconverse/data/voxconverse_dev_wav.zip -O data/voxconverse_dev_wav.zip
+    # The above url may not be reachable, you can try the link below.
+    # This url is from https://github.com/joonson/voxconverse/blob/master/README.md
+    wget --no-check-certificate -c "https://www.robots.ox.ac.uk/~vgg/data/voxconverse/data/voxconverse_dev_wav.zip" -O data/voxconverse_dev_wav.zip
+    unzip -o data/voxconverse_dev_wav.zip -d data/dev
+
+    # wav.scp for dev audios: "<file name without .wav> <absolute path>" per line
+    ls "$(pwd)"/data/dev/audio/*.wav | awk -F/ '{ utt = $NF; sub(/\.wav$/, "", utt); print utt, $0 }' > data/dev/wav.scp
+
+    # Test audios
+    mkdir -p data/test
+
+    #wget --no-check-certificate -c https://mm.kaist.ac.kr/datasets/voxconverse/data/voxconverse_test_wav.zip -O data/voxconverse_test_wav.zip
+    # The above url may not be reachable, you can try the link below.
+    # This url is from https://github.com/joonson/voxconverse/blob/master/README.md
+    wget --no-check-certificate -c "https://www.robots.ox.ac.uk/~vgg/data/voxconverse/data/voxconverse_test_wav.zip" -O data/voxconverse_test_wav.zip
+    unzip -o data/voxconverse_test_wav.zip -d data/test
+
+    # wav.scp for test audios, same format as for dev
+    ls "$(pwd)"/data/test/voxconverse_test_wav/*.wav | awk -F/ '{ utt = $NF; sub(/\.wav$/, "", utt); print utt, $0 }' > data/test/wav.scp
+fi
+
+
+# Voice activity detection: writes data/${partition}/${sad_type}_sad
+if [ "${stage}" -le 3 ] && [ "${stop_stage}" -ge 3 ]; then
+    # Minimum duration handed to both SAD scripts
+    min_duration=0.255
+    case "${sad_type}" in
+        oracle)
+            # Oracle SAD: handling overlapping or too short regions in ground truth RTTM
+            while read -r utt _; do
+                python3 wespeaker/diar/make_oracle_sad.py \
+                        --rttm "data/voxconverse-master/${partition}/${utt}.rttm" \
+                        --min-duration "${min_duration}"
+            done < "data/${partition}/wav.scp" > "data/${partition}/oracle_sad"
+            ;;
+        system)
+            # System SAD: applying 'silero' VAD
+            python3 wespeaker/diar/make_system_sad.py \
+                    --scp "data/${partition}/wav.scp" \
+                    --min-duration "${min_duration}" > "data/${partition}/system_sad"
+            ;;
+    esac
+fi
+
+
+# Extract fbank features
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+    # Clean the very directory make_fbank.sh writes to (its name includes ${partition}).
+    fbank_dir=exp/${partition}_${sad_type}_sad_fbank
+    [ -d "${fbank_dir}" ] && rm -r "${fbank_dir}"
+    echo "Make Fbank features and store it under ${fbank_dir}"
+    echo "..."
+    bash local/make_fbank.sh \
+            --scp data/${partition}/wav.scp \
+            --segments data/${partition}/${sad_type}_sad \
+            --store_dir ${fbank_dir} \
+            --subseg_cmn ${subseg_cmn} \
+            --nj 24
+fi
+
+# Extract embeddings
+if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
+    # Clean the very directory extract_emb.sh writes to (its name includes ${partition}).
+    emb_dir=exp/${partition}_${sad_type}_sad_embedding
+    [ -d "${emb_dir}" ] && rm -r "${emb_dir}"
+    echo "Extract embeddings and store it under ${emb_dir}"
+    echo "..."
+    bash local/extract_emb.sh \
+            --scp exp/${partition}_${sad_type}_sad_fbank/fbank.scp \
+            --pretrained_model pretrained_models/voxceleb_resnet34_LM.onnx \
+            --device cuda \
+            --store_dir ${emb_dir} \
+            --batch_size 96 \
+            --frame_shift 10 \
+            --window_secs 1.5 \
+            --period_secs 0.75 \
+            --subseg_cmn ${subseg_cmn} \
+            --nj 1
+fi
+
+
+# Cluster the embeddings with the umap clusterer
+if [ "${stage}" -le 6 ] && [ "${stop_stage}" -ge 6 ]; then
+    labels_file=exp/umap_cluster/${partition}_${sad_type}_sad_labels
+    # Remove a labels file left by an earlier run before clustering again
+    [ -f "${labels_file}" ] && rm "${labels_file}"
+    echo "Doing umap clustering and store the result in ${labels_file}"
+    echo "..."
+    python3 wespeaker/diar/umap_clusterer.py \
+            --scp "exp/${partition}_${sad_type}_sad_embedding/emb.scp" \
+            --output "${labels_file}"
+fi
+
+
+# Turn the cluster labels into a single RTTM file
+if [ "${stage}" -le 7 ] && [ "${stop_stage}" -ge 7 ]; then
+    cluster_prefix=exp/umap_cluster/${partition}_${sad_type}_sad
+    python3 wespeaker/diar/make_rttm.py \
+            --labels "${cluster_prefix}_labels" --channel 1 > "${cluster_prefix}_rttm"
+fi
+
+
+# Evaluate the result
+if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then
+    ref_dir=data/voxconverse-master/
+    #ref_dir=data/VoxSRC2023/voxconverse/
+    echo -e "Get the DER results\n..."
+    perl external_tools/SCTK-2.4.12/src/md-eval/md-eval.pl \
+         -c 0.25 \
+         -r <(cat ${ref_dir}/${partition}/*.rttm) \
+         -s exp/umap_cluster/${partition}_${sad_type}_sad_rttm 2>&1 | tee exp/umap_cluster/${partition}_${sad_type}_sad_res
+
+    if [ ${get_each_file_res} -eq 1 ];then
+        single_file_res_dir=exp/umap_cluster/${partition}_${sad_type}_single_file_res
+        mkdir -p $single_file_res_dir
+        echo -e "\nGet the DER results for each file and the results will be stored under ${single_file_res_dir}\n..."
+        # Select each file's lines by exact match on field 2 (a grep would also hit substrings/regex matches).
+        awk '{print $2}' exp/umap_cluster/${partition}_${sad_type}_sad_rttm | sort -u  | while read -r file_name; do
+            perl external_tools/SCTK-2.4.12/src/md-eval/md-eval.pl \
+                 -c 0.25 \
+                 -r <(cat ${ref_dir}/${partition}/${file_name}.rttm) \
+                 -s <(awk -v f="${file_name}" '$2 == f' exp/umap_cluster/${partition}_${sad_type}_sad_rttm) > ${single_file_res_dir}/${partition}_${file_name}_res
+        done
+        echo "Done!"
+    fi
+fi
diff --git a/examples/voxconverse/v3/tools b/examples/voxconverse/v3/tools
@@ -0,0 +1 @@
+../../../tools
diff --git a/examples/voxconverse/v3/wespeaker b/examples/voxconverse/v3/wespeaker
@@ -0,0 +1 @@
+../../../wespeaker
diff --git a/requirements.txt b/requirements.txt
@@ -23,3 +23,5 @@ soundfile==0.10.3.post1
 pypeln==0.4.9
 silero-vad
 pre-commit==3.5.0
+hdbscan==0.8.37
+umap-learn==0.5.6