From 773460c477aadcb23208d9b9c1a94c85d0c1a380 Mon Sep 17 00:00:00 2001
From: Damien Murphy
Date: Mon, 15 Apr 2024 17:34:45 -0700
Subject: [PATCH] Handle is_final and endpointing together with utterance end

---
 examples/node-live/index.js | 52 ++++++++++++++++++++++++++++++-------
 1 file changed, 42 insertions(+), 10 deletions(-)

diff --git a/examples/node-live/index.js b/examples/node-live/index.js
index bcbefe83..bd84405c 100644
--- a/examples/node-live/index.js
+++ b/examples/node-live/index.js
@@ -1,8 +1,4 @@
-const {
-  createClient,
-  LiveTranscriptionEvents,
-  LiveTranscriptionEvent,
-} = require("../../dist/main/index");
+const { createClient, LiveTranscriptionEvents } = require("../../dist/main/index");
 const fetch = require("cross-fetch");
 
 const live = async () => {
@@ -10,10 +6,20 @@ const live = async () => {
 
   const deepgram = createClient(process.env.DEEPGRAM_API_KEY);
 
+  // We will collect the is_final=true messages here so we can use them when the person finishes speaking
+  let is_finals = [];
+
   const connection = deepgram.listen.live({
     model: "nova-2",
-    utterance_end_ms: 1500,
+    language: "en-US",
+    // Apply smart formatting to the output
+    smart_format: true,
+    // To get UtteranceEnd, the following must be set:
     interim_results: true,
+    utterance_end_ms: 1000,
+    vad_events: true,
+    // Time in milliseconds of silence to wait for before finalizing speech
+    endpointing: 300,
   });
 
   connection.on(LiveTranscriptionEvents.Open, () => {
@@ -22,19 +28,45 @@ const live = async () => {
   });
 
   connection.on(LiveTranscriptionEvents.Metadata, (data) => {
-    console.log(data);
+    console.log(`Deepgram Metadata: ${data}`);
   });
 
   connection.on(LiveTranscriptionEvents.Transcript, (data) => {
-    console.log(data.channel);
+    const sentence = data.channel.alternatives[0].transcript;
+
+    // Ignore empty transcripts
+    if (sentence.length == 0) {
+      return;
+    }
+    if (data.is_final) {
+      // We need to collect these and concatenate them together when we get a speech_final=true
+      // See docs: https://developers.deepgram.com/docs/understand-endpointing-interim-results
+      is_finals.push(sentence);
+
+      // Speech final means we have detected sufficient silence to consider this the end of speech
+      // Speech final is the lowest-latency result, as it triggers as soon as the endpointing value has been reached
+      if (data.speech_final) {
+        const utterance = is_finals.join(" ");
+        console.log(`Speech Final: ${utterance}`);
+        is_finals = [];
+      } else {
+        // These are useful if you need real-time captioning and want to update what the interim results produced
+        console.log(`Is Final: ${sentence}`);
+      }
+    } else {
+      // These are useful if you need real-time captioning of what is being spoken
+      console.log(`Interim Results: ${sentence}`);
+    }
   });
 
   connection.on(LiveTranscriptionEvents.UtteranceEnd, (data) => {
-    console.log(data);
+    const utterance = is_finals.join(" ");
+    console.log(`Deepgram UtteranceEnd: ${utterance}`);
+    is_finals = [];
   });
 
   connection.on(LiveTranscriptionEvents.SpeechStarted, (data) => {
-    console.log(data);
+    // console.log("Deepgram SpeechStarted");
   });
 
   connection.on(LiveTranscriptionEvents.Error, (err) => {