Handle is_final and endpointing together with utterance end
DamienDeepgram committed Apr 16, 2024
1 parent d2ade7b commit 773460c
Showing 1 changed file with 42 additions and 10 deletions.
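In short: interim results stream continuously while someone speaks; segments flagged is_final=true are buffered; a segment flagged speech_final=true means endpointing detected silence, so the buffer is flushed as one complete utterance; and UtteranceEnd is the fallback flush for cases where no speech_final arrives. A distilled sketch of that buffering pattern follows (illustrative only, not code from this commit):

// Illustrative sketch of the pattern this commit introduces; not part of the diff.
// Buffer is_final segments, flush on speech_final, fall back to UtteranceEnd.
let buffered = [];

function handleTranscript(data) {
  const sentence = data.channel.alternatives[0].transcript;
  if (!sentence) return;
  if (!data.is_final) return; // interim result, useful only for live caption updates
  buffered.push(sentence);
  if (data.speech_final) {
    // Endpointing detected enough silence: emit the full utterance
    console.log(`Utterance: ${buffered.join(" ")}`);
    buffered = [];
  }
}

function handleUtteranceEnd() {
  // Fallback flush when speech_final never fired (e.g. background noise kept
  // endpointing from triggering) but word timings show the utterance ended
  if (buffered.length > 0) {
    console.log(`Utterance (via UtteranceEnd): ${buffered.join(" ")}`);
    buffered = [];
  }
}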
examples/node-live/index.js
@@ -1,19 +1,25 @@
-const {
-  createClient,
-  LiveTranscriptionEvents,
-  LiveTranscriptionEvent,
-} = require("../../dist/main/index");
+const { createClient, LiveTranscriptionEvents } = require("../../dist/main/index");
const fetch = require("cross-fetch");

const live = async () => {
  const url = "http://stream.live.vc.bbcmedia.co.uk/bbc_world_service";

  const deepgram = createClient(process.env.DEEPGRAM_API_KEY);

+  // We will collect the is_final=true messages here so we can use them when the person finishes speaking
+  let is_finals = [];

  const connection = deepgram.listen.live({
    model: "nova-2",
-    utterance_end_ms: 1500,
    language: "en-US",
+    // Apply smart formatting to the output
    smart_format: true,
+    // To get UtteranceEnd, the following must be set:
+    interim_results: true,
+    utterance_end_ms: 1000,
+    vad_events: true,
+    // Time in milliseconds of silence to wait for before finalizing speech
+    endpointing: 300,
  });

  connection.on(LiveTranscriptionEvents.Open, () => {
@@ -22,19 +28,45 @@ const live = async () => {
  });

  connection.on(LiveTranscriptionEvents.Metadata, (data) => {
-    console.log(data);
+    console.log(`Deepgram Metadata: ${data}`);
  });

  connection.on(LiveTranscriptionEvents.Transcript, (data) => {
-    console.log(data.channel);
+    const sentence = data.channel.alternatives[0].transcript;
+
+    // Ignore empty transcripts
+    if (sentence.length == 0) {
+      return;
+    }
+    if (data.is_final) {
+      // We need to collect these and concatenate them together when we get a speech_final=true
+      // See docs: https://developers.deepgram.com/docs/understand-endpointing-interim-results
+      is_finals.push(sentence);
+
+      // Speech final means we have detected sufficient silence to consider this the end of speech
+      // Speech final is the lowest latency result as it triggers as soon as the endpointing value has triggered
+      if (data.speech_final) {
+        const utterance = is_finals.join(" ");
+        console.log(`Speech Final: ${utterance}`);
+        is_finals = [];
+      } else {
+        // These are useful if you need real time captioning and update what the Interim Results produced
+        console.log(`Is Final: ${sentence}`);
+      }
+    } else {
+      // These are useful if you need real time captioning of what is being spoken
+      console.log(`Interim Results: ${sentence}`);
+    }
  });

  connection.on(LiveTranscriptionEvents.UtteranceEnd, (data) => {
-    console.log(data);
+    const utterance = is_finals.join(" ");
+    console.log(`Deepgram UtteranceEnd: ${utterance}`);
+    is_finals = [];
  });

  connection.on(LiveTranscriptionEvents.SpeechStarted, (data) => {
-    console.log(data);
+    // console.log("Deepgram SpeechStarted");
  });

  connection.on(LiveTranscriptionEvents.Error, (err) => {
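The remainder of the file is collapsed in this diff view. For anyone running the example: the connection still needs to be fed audio. A minimal, hypothetical sketch of piping the BBC stream into it, assuming the SDK connection's send() method and the cross-fetch import at the top of the file (the actual collapsed code may differ):

// Hypothetical sketch, not shown in this diff: stream the radio audio into the
// live connection once the socket is open.
connection.on(LiveTranscriptionEvents.Open, () => {
  fetch(url)
    .then((response) => response.body)
    .then((body) => {
      // With cross-fetch in Node, response.body is a Node Readable stream
      body.on("readable", () => {
        const chunk = body.read();
        if (chunk) connection.send(chunk);
      });
    });
});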
