From bd84525d8e0dedeb57c56dc1714027dce23f1ab1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Piotr=20=C5=BBelasko?= <petezor@gmail.com>
Date: Wed, 4 Aug 2021 13:15:26 -0400
Subject: [PATCH] Fix import of Kaldi data dirs with pipes in wav.scp

---
 lhotse/kaldi.py | 23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/lhotse/kaldi.py b/lhotse/kaldi.py
index 10ccbfbcb..f37df8598 100644
--- a/lhotse/kaldi.py
+++ b/lhotse/kaldi.py
@@ -16,16 +16,27 @@ def get_duration(
     """
     Read a audio file, it supports pipeline style wave path and real waveform.
     
-    :param path: Path to an audio file supported by libsoundfile (pysoundfile).
-    :return: duration of wav it is float.
-    """ 
+    :param path: Path to an audio file, or a Kaldi-style pipe command (a string ending with '|').
+    :return: float duration of the recording, in seconds.
+    """
+    path = str(path)
+    if path.strip().endswith('|'):
+        if not is_module_available('kaldiio'):
+            raise ValueError("To read Kaldi's data dir where wav.scp has 'pipe' inputs, "
+                             "please 'pip install kaldiio' first.")
+        from kaldiio import load_mat
+        # Note: kaldiio.load_mat returns (sampling_rate: int, samples: 1-D np.array[int])
+        sampling_rate, samples = load_mat(path)
+        assert len(samples.shape) == 1
+        duration = samples.shape[0] / sampling_rate
+        return duration
     try:
         # Try to parse the file using pysoundfile first.
         import soundfile
-        info = soundfile.info(str(path))
+        info = soundfile.info(path)
     except:
         # Try to parse the file using audioread as a fallback.
-        info = audioread_info(str(path))
+        info = audioread_info(path)
     return info.duration
 
 
@@ -47,7 +58,7 @@ def load_kaldi_data_dir(
     # must exist for RecordingSet
     recordings = load_kaldi_text_mapping(path / 'wav.scp', must_exist=True)
 
-    durations = defaultdict(float)
+    durations = {}
     for recording_id, path_or_cmd in recordings.items():
         duration = get_duration(path_or_cmd)
         durations[recording_id] = duration