wenet-e2e · JiJiJiang · Jun 21, 2023 · Jun 6, 2023 · Jun 21, 2023 · Jun 21, 2023
diff --git a/examples/voxconverse/v1/README.md b/examples/voxconverse/v1/README.md
@@ -1,4 +1,5 @@
-## Results
+## Overview
+
 * We suggest to run this recipe on a gpu-available machine, with onnxruntime-gpu supported.
 * Dataset: voxconverse_dev that consists of 216 utterances
 * Speaker model: ResNet34 model pretrained by wespeaker
@@ -8,17 +9,22 @@
 * Clustering method: spectral clustering
 * Metric: DER = MISS + FALSE ALARM + SPEAKER CONFUSION (%)
 
-| system | MISS | FA | SC | DER |
-|:---|:---:|:---:|:---:|:---:
-| This repo (with oracle SAD) | 2.3 | 0.0 | 1.9 | 4.2 |
-| This repo (with system SAD) | 4.4 | 0.6 | 2.1 | 7.1 |
-| [1] DIHARD 2019 baseline | 11.1 | 1.4 | 11.3 | 23.8 |
-| [1] DIHARD 2019 baseline w/ SE | 9.3 | 1.3 | 9.7 | 20.2 |
-| [1] (SyncNet ASD only) | 2.2 | 4.1 | 4.0 | 10.4 |
-| [1] (AVSE ASD only) | 2.0 | 5.9 | 4.6 | 12.4 |
-| [1] (proposed) | 2.4 | 2.3 | 3.0 | 7.7 |
+## Results
+
+* Dev set
+
+    | system | MISS | FA | SC | DER |
+    |:---|:---:|:---:|:---:|:---:|
+    | This repo (with oracle SAD) | 2.3 | 0.0 | 1.9 | 4.2 |
+    | This repo (with system SAD) | 3.7 | 0.8 | 2.0 | 6.5 |
+    | DIHARD 2019 baseline [^1] | 11.1 | 1.4 | 11.3 | 23.8 |
+    | DIHARD 2019 baseline w/ SE [^1] | 9.3 | 1.3 | 9.7 | 20.2 |
+    | (SyncNet ASD only) [^1] | 2.2 | 4.1 | 4.0 | 10.4 |
+    | (AVSE ASD only) [^1] | 2.0 | 5.9 | 4.6 | 12.4 |
+    | (proposed) [^1] | 2.4 | 2.3 | 3.0 | 7.7 |
+
 
-[1] Spot the conversation: speaker diarisation in the wild, https://arxiv.org/pdf/2007.01216.pdf
+[^1]: Spot the conversation: speaker diarisation in the wild, https://arxiv.org/pdf/2007.01216.pdf
 
 ## Update 09/2022 : GPU Clustering
 * You can use diar/clusterer\_gpu.py to run GPU Clustering

diff --git a/examples/voxconverse/v1/sad/make_system_sad.py b/examples/voxconverse/v1/sad/make_system_sad.py
@@ -51,7 +51,7 @@ def read_scp(scp):
 
 
 def silero_vad(utt_wav_pair, repo_path, min_duration,
-               sampling_rate=16000, threshold=0.36):
+               sampling_rate=16000, threshold=0.25):
 
     def module_from_file(module_name, file_path):
         spec = importlib.util.spec_from_file_location(module_name, file_path)

diff --git a/examples/voxconverse/v2/README.md b/examples/voxconverse/v2/README.md
@@ -1,4 +1,5 @@
-## Results
+## Overview
+
 * Compared with [v1](https://github.com/wenet-e2e/wespeaker/tree/master/examples/voxconverse/v1) version, here we split the Fbank extraction, embedding extraction and clustering modules to different stages.
 * We suggest to run this recipe on a gpu-available machine, with onnxruntime-gpu supported.
 * Dataset: voxconverse_dev that consists of 216 utterances
@@ -9,14 +10,25 @@
 * Clustering method: spectral clustering
 * Metric: DER = MISS + FALSE ALARM + SPEAKER CONFUSION (%)
 
-| system | MISS | FA | SC | DER |
-|:---|:---:|:---:|:---:|:---:
-| This repo (with oracle SAD) | 2.3 | 0.0 | 1.9 | 4.4 |
-| This repo (with system SAD) | 4.4 | 0.6 | 2.1 | 7.0 |
-| [1] DIHARD 2019 baseline | 11.1 | 1.4 | 11.3 | 23.8 |
-| [1] DIHARD 2019 baseline w/ SE | 9.3 | 1.3 | 9.7 | 20.2 |
-| [1] (SyncNet ASD only) | 2.2 | 4.1 | 4.0 | 10.4 |
-| [1] (AVSE ASD only) | 2.0 | 5.9 | 4.6 | 12.4 |
-| [1] (proposed) | 2.4 | 2.3 | 3.0 | 7.7 |
-
-[1] Spot the conversation: speaker diarisation in the wild, https://arxiv.org/pdf/2007.01216.pdf
+## Results
+
+* Dev set
+
+    | system | MISS | FA | SC | DER |
+    |:---|:---:|:---:|:---:|:---:|
+    | This repo (with oracle SAD) | 2.3 | 0.0 | 2.1 | 4.4 |
+    | This repo (with system SAD) | 3.7 | 0.8 | 2.2 | 6.8 |
+    | DIHARD 2019 baseline [^1] | 11.1 | 1.4 | 11.3 | 23.8 |
+    | DIHARD 2019 baseline w/ SE [^1] | 9.3 | 1.3 | 9.7 | 20.2 |
+    | (SyncNet ASD only) [^1] | 2.2 | 4.1 | 4.0 | 10.4 |
+    | (AVSE ASD only) [^1] | 2.0 | 5.9 | 4.6 | 12.4 |
+    | (proposed) [^1] | 2.4 | 2.3 | 3.0 | 7.7 |
+
+* Test set
+
+    | system | MISS | FA | SC | DER |
+    |:---|:---:|:---:|:---:|:---:|
+    | This repo (with system SAD) | 4.0 | 2.4 | 3.5 | 9.8 |
+
+
+[^1]: Spot the conversation: speaker diarisation in the wild, https://arxiv.org/pdf/2007.01216.pdf
diff --git a/examples/voxconverse/v2/diar/extract_emb.sh b/examples/voxconverse/v2/diar/extract_emb.sh
@@ -44,7 +44,7 @@ for suffix in `seq 0 $[$nj-1]`;do
     suffix=`printf '%03d' $suffix`
     scp_subfile=${split_dir}/${prefix}_scp_${suffix}
     write_ark=$store_dir/emb_${suffix}.ark
-    python diar/extract_emb.py \
+    python3 diar/extract_emb.py \
             --scp ${scp_subfile} \
             --ark-path ${write_ark} \
             --source ${pretrained_model} \

diff --git a/examples/voxconverse/v2/diar/make_fbank.sh b/examples/voxconverse/v2/diar/make_fbank.sh
@@ -38,7 +38,7 @@ for suffix in `seq 0 $[$nj-1]`;do
     suffix=`printf '%03d' $suffix`
     scp_subfile=${split_dir}/${prefix}_scp_${suffix}
     write_ark=$store_dir/fbank_${suffix}.ark
-    python diar/make_fbank.py \
+    python3 diar/make_fbank.py \
             --scp ${scp_subfile} \
             --segments ${segments} \
             --ark-path ${write_ark} \

diff --git a/examples/voxconverse/v2/diar/spectral_clusterer.py b/examples/voxconverse/v2/diar/spectral_clusterer.py
@@ -27,7 +27,6 @@
 
 import numpy as np
 import scipy.linalg
-
 from sklearn.cluster._kmeans import k_means
 from wespeaker.utils.utils import validate_path
 
@@ -132,6 +131,4 @@ def main():
 
 
 if __name__ == '__main__':
-    scipy.random.seed(1)
-
     main()
diff --git a/examples/voxconverse/v2/run.sh b/examples/voxconverse/v2/run.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2022 Xu Xiang
+# Copyright (c) 2022-2023 Xu Xiang
 #               2022 Zhengyang Chen (chenzhengyang117@gmail.com)
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -19,6 +19,8 @@
 stage=-1
 stop_stage=-1
 sad_type="system"
+partition="test"
+
 # do cmn on the sub-segment or on the vad segment
 subseg_cmn=true
 # whether print the evaluation result for each file
@@ -49,10 +51,13 @@ fi
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     mkdir -p data
 
-    # Download annotations for dev and test sets
+    # Download annotations for dev and test sets (version 0.0.3)
     wget -c https://github.com/joonson/voxconverse/archive/refs/heads/master.zip -O data/voxconverse_master.zip
     unzip -o data/voxconverse_master.zip -d data
 
+    # Download annotations from the VoxSRC-23 validation toolkit (looks like version 0.0.2); skip if already cloned, and never change the working directory
+    [ -d data/VoxSRC2023/.git ] || git clone --recursive https://github.com/JaesungHuh/VoxSRC2023.git data/VoxSRC2023
+
     # Download dev audios
     mkdir -p data/dev
     wget -c https://mm.kaist.ac.kr/datasets/voxconverse/data/voxconverse_dev_wav.zip -O data/voxconverse_dev_wav.zip
@@ -62,9 +67,12 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     ls `pwd`/data/dev/audio/*.wav | awk -F/ '{print substr($NF, 1, length($NF)-4), $0}' > data/dev/wav.scp
 
     # Test audios
-    # mkdir -p data/test
-    # wget -c https://mm.kaist.ac.kr/datasets/voxconverse/data/voxconverse_test_wav.zip -O data/voxconverse_test_wav.zip
-    # unzip -o data/voxconverse_test_wav.zip -d data/test
+    mkdir -p data/test
+    wget -c https://mm.kaist.ac.kr/datasets/voxconverse/data/voxconverse_test_wav.zip -O data/voxconverse_test_wav.zip
+    unzip -o data/voxconverse_test_wav.zip -d data/test
+
+    # Create wav.scp for test audios
+    ls `pwd`/data/test/voxconverse_test_wav/*.wav | awk -F/ '{print substr($NF, 1, length($NF)-4), $0}' > data/test/wav.scp
 fi
 
 
@@ -77,17 +85,17 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
         # Oracle SAD: handling overlapping or too short regions in ground truth RTTM
         while read -r utt wav_path; do
             python3 sad/make_oracle_sad.py \
-                    --rttm data/voxconverse-master/dev/${utt}.rttm \
+                    --rttm data/voxconverse-master/${partition}/${utt}.rttm \
                     --min-duration $min_duration
-        done < data/dev/wav.scp > data/dev/oracle_sad
+        done < data/${partition}/wav.scp > data/${partition}/oracle_sad
     fi
 
     if [[ "x${sad_type}" == "xsystem" ]]; then
        # System SAD: applying 'silero' VAD
        python3 sad/make_system_sad.py \
                --repo-path external_tools/silero-vad-3.1 \
-               --scp data/dev/wav.scp \
-               --min-duration $min_duration > data/dev/system_sad
+               --scp data/${partition}/wav.scp \
+               --min-duration $min_duration > data/${partition}/system_sad
     fi
 fi
 
@@ -100,11 +108,11 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
     echo "Make Fbank features and store it under exp/${sad_type}_sad_fbank"
     echo "..."
     bash diar/make_fbank.sh \
-            --scp data/dev/wav.scp \
-            --segments data/dev/${sad_type}_sad \
-            --store_dir exp/${sad_type}_sad_fbank \
+            --scp data/${partition}/wav.scp \
+            --segments data/${partition}/${sad_type}_sad \
+            --store_dir exp/${partition}_${sad_type}_sad_fbank \
             --subseg_cmn ${subseg_cmn} \
-            --nj 20
+            --nj 24
 fi
 
 # Extract embeddings
@@ -115,59 +123,61 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
     echo "Extract embeddings and store it under exp/${sad_type}_sad_embedding"
     echo "..."
     bash diar/extract_emb.sh \
-            --scp exp/${sad_type}_sad_fbank/fbank.scp \
+            --scp exp/${partition}_${sad_type}_sad_fbank/fbank.scp \
             --pretrained_model pretrained_models/voxceleb_resnet34_LM.onnx \
             --device cuda \
-            --store_dir exp/${sad_type}_sad_embedding \
+            --store_dir exp/${partition}_${sad_type}_sad_embedding \
             --batch_size 96 \
             --frame_shift 10 \
             --window_secs 1.5 \
             --period_secs 0.75 \
             --subseg_cmn ${subseg_cmn} \
-            --nj 4
+            --nj 1
 fi
 
 
 # Applying spectral clustering algorithm
 if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
 
-    [ -f "exp/spectral_cluster/${sad_type}_sad_labels" ] && rm exp/spectral_cluster/${sad_type}_sad_labels
+    [ -f "exp/spectral_cluster/${partition}_${sad_type}_sad_labels" ] && rm exp/spectral_cluster/${partition}_${sad_type}_sad_labels
 
-    echo "Doing spectral clustering and store the result in exp/spectral_cluster/${sad_type}_sad_labels"
+    echo "Doing spectral clustering and store the result in exp/spectral_cluster/${partition}_${sad_type}_sad_labels"
     echo "..."
     python3 diar/spectral_clusterer.py \
-            --scp exp/${sad_type}_sad_embedding/emb.scp \
-            --output exp/spectral_cluster/${sad_type}_sad_labels
+            --scp exp/${partition}_${sad_type}_sad_embedding/emb.scp \
+            --output exp/spectral_cluster/${partition}_${sad_type}_sad_labels
 fi
 
 
 # Convert labels to RTTMs
 if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
     python3 diar/make_rttm.py \
-            --labels exp/spectral_cluster/${sad_type}_sad_labels \
-            --channel 1 > exp/spectral_cluster/${sad_type}_sad_rttm
+            --labels exp/spectral_cluster/${partition}_${sad_type}_sad_labels \
+            --channel 1 > exp/spectral_cluster/${partition}_${sad_type}_sad_rttm
 fi
 
 
 # Evaluate the result
 if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then
+    # ref_dir=data/voxconverse-master/
+    ref_dir=data/VoxSRC2023/voxconverse/
 
     echo -e "Get the DER results\n..."
     perl external_tools/SCTK-2.4.12/src/md-eval/md-eval.pl \
          -c 0.25 \
-         -r <(cat data/voxconverse-master/dev/*.rttm) \
-         -s exp/spectral_cluster/${sad_type}_sad_rttm 2>&1 | tee exp/spectral_cluster/${sad_type}_sad_res
+         -r <(cat ${ref_dir}/${partition}/*.rttm) \
+         -s exp/spectral_cluster/${partition}_${sad_type}_sad_rttm 2>&1 | tee exp/spectral_cluster/${partition}_${sad_type}_sad_res
 
     if [ ${get_each_file_res} -eq 1 ];then
-        single_file_res_dir=exp/spectral_cluster/${sad_type}_single_file_res
+        single_file_res_dir=exp/spectral_cluster/${partition}_${sad_type}_single_file_res
         mkdir -p $single_file_res_dir
         echo -e "\nGet the DER results for each file and the results will be stored underd ${single_file_res_dir}\n..."
 
-        awk '{print $2}' exp/spectral_cluster/${sad_type}_sad_rttm | sort -u  | while read file_name; do
+        awk '{print $2}' exp/spectral_cluster/${partition}_${sad_type}_sad_rttm | sort -u  | while read file_name; do
             perl external_tools/SCTK-2.4.12/src/md-eval/md-eval.pl \
                  -c 0.25 \
-                 -r <(cat data/voxconverse-master/dev/${file_name}.rttm) \
-                 -s <(grep "${file_name}" exp/spectral_cluster/${sad_type}_sad_rttm) > ${single_file_res_dir}/${file_name}_res
+                 -r <(cat ${ref_dir}/${partition}/${file_name}.rttm) \
+                 -s <(grep "${file_name}" exp/spectral_cluster/${partition}_${sad_type}_sad_rttm) > ${single_file_res_dir}/${partition}_${file_name}_res
         done
         echo "Done!"
     fi