Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle is_final and endpointing together with utterance end + clean u… #364

Merged
merged 7 commits into from
Apr 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 45 additions & 13 deletions examples/streaming/async_microphone/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@

load_dotenv()

# We will collect the is_final=true messages here so we can use them when the person finishes speaking
is_finals = []

async def main():
try:
Expand All @@ -42,31 +44,52 @@ async def main():
dg_connection = deepgram.listen.asynclive.v("1")

async def on_open(self, open, **kwargs):
print(f"\n\n{open}\n\n")
print(f"Deepgram Connection Open")

async def on_message(self, result, **kwargs):
global is_finals
sentence = result.channel.alternatives[0].transcript
if len(sentence) == 0:
return
print(f"speaker: {sentence}")
if result.is_final:
# We need to collect these and concatenate them together when we get a speech_final=true
# See docs: https://developers.deepgram.com/docs/understand-endpointing-interim-results
is_finals.append(sentence)

# Speech Final means we have detected sufficent silence to consider this end of speech
# Speech final is the lowest latency result as it triggers as soon an the endpointing value has triggered
if result.speech_final:
utterance = ' '.join(is_finals)
print(f"Speech Final: {utterance}")
is_finals = []
else:
# These are useful if you need real time captioning and update what the Interim Results produced
print(f"Is Final: {sentence}")
else:
# These are useful if you need real time captioning of what is being spoken
print(f"Interim Results: {sentence}")

async def on_metadata(self, metadata, **kwargs):
print(f"\n\n{metadata}\n\n")
print(f"Deepgram Metadata: {metadata}")

async def on_speech_started(self, speech_started, **kwargs):
print(f"\n\n{speech_started}\n\n")
print(f"Deepgram Speech Started")

async def on_utterance_end(self, utterance_end, **kwargs):
print(f"\n\n{utterance_end}\n\n")
global is_finals
if len(is_finals) > 0:
utterance = ' '.join(is_finals)
print(f"Deepgram Utterance End: {utterance}")
is_finals = []

def on_close(self, close, **kwargs):
print(f"\n\n{close}\n\n")
async def on_close(self, close, **kwargs):
print(f"Deepgram Connection Closed")

def on_error(self, error, **kwargs):
print(f"\n\n{error}\n\n")
async def on_error(self, error, **kwargs):
print(f"Deepgram Handled Error: {error}")

def on_unhandled(self, unhandled, **kwargs):
print(f"\n\n{unhandled}\n\n")
async def on_unhandled(self, unhandled, **kwargs):
print(f"Deepgram Unhandled Websocket Message: {unhandled}")

dg_connection.on(LiveTranscriptionEvents.Open, on_open)
dg_connection.on(LiveTranscriptionEvents.Transcript, on_message)
Expand All @@ -80,19 +103,28 @@ def on_unhandled(self, unhandled, **kwargs):
# connect to websocket
options: LiveOptions = LiveOptions(
model="nova-2",
punctuate=True,
language="en-US",
# Apply smart formatting to the output
smart_format=True,
# Raw audio format deatils
encoding="linear16",
channels=1,
sample_rate=16000,
# To get UtteranceEnd, the following must be set:
interim_results=True,
utterance_end_ms="1000",
vad_events=True,
# Time in milliseconds of silence to wait for before finalizing speech
endpointing=300
)

addons = {
# Prevent waiting for additional numbers
"no_delay": "true"
}

print("\n\nStart talking! Press Ctrl+C to stop...\n")
if await dg_connection.start(options) is False:
if await dg_connection.start(options, addons=addons) is False:
print("Failed to connect to Deepgram")
return

Expand Down
52 changes: 42 additions & 10 deletions examples/streaming/microphone/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@

load_dotenv()

# We will collect the is_final=true messages here so we can use them when the person finishes speaking
is_finals = []

def main():
try:
Expand All @@ -30,31 +32,52 @@ def main():
dg_connection = deepgram.listen.live.v("1")

def on_open(self, open, **kwargs):
print(f"\n\n{open}\n\n")
print(f"Deepgram Connection Open")

def on_message(self, result, **kwargs):
global is_finals
sentence = result.channel.alternatives[0].transcript
if len(sentence) == 0:
return
print(f"speaker: {sentence}")
if result.is_final:
# We need to collect these and concatenate them together when we get a speech_final=true
# See docs: https://developers.deepgram.com/docs/understand-endpointing-interim-results
is_finals.append(sentence)

# Speech Final means we have detected sufficent silence to consider this end of speech
# Speech final is the lowest latency result as it triggers as soon an the endpointing value has triggered
if result.speech_final:
utterance = ' '.join(is_finals)
print(f"Speech Final: {utterance}")
is_finals = []
else:
# These are useful if you need real time captioning and update what the Interim Results produced
print(f"Is Final: {sentence}")
else:
# These are useful if you need real time captioning of what is being spoken
print(f"Interim Results: {sentence}")

def on_metadata(self, metadata, **kwargs):
print(f"\n\n{metadata}\n\n")
print(f"Deepgram Metadata: {metadata}")

def on_speech_started(self, speech_started, **kwargs):
print(f"\n\n{speech_started}\n\n")
print(f"Deepgram Speech Started")

def on_utterance_end(self, utterance_end, **kwargs):
print(f"\n\n{utterance_end}\n\n")
global is_finals
if len(is_finals) > 0:
utterance = ' '.join(is_finals)
print(f"Deepgram Utterance End: {utterance}")
is_finals = []

def on_close(self, close, **kwargs):
print(f"\n\n{close}\n\n")
print(f"Deepgram Connection Closed")

def on_error(self, error, **kwargs):
print(f"\n\n{error}\n\n")
print(f"Deepgram Handled Error: {error}")

def on_unhandled(self, unhandled, **kwargs):
print(f"\n\n{unhandled}\n\n")
print(f"Deepgram Unhandled Websocket Message: {unhandled}")

dg_connection.on(LiveTranscriptionEvents.Open, on_open)
dg_connection.on(LiveTranscriptionEvents.Transcript, on_message)
Expand All @@ -67,19 +90,28 @@ def on_unhandled(self, unhandled, **kwargs):

options: LiveOptions = LiveOptions(
model="nova-2",
punctuate=True,
language="en-US",
# Apply smart formatting to the output
smart_format=True,
# Raw audio format deatils
encoding="linear16",
channels=1,
sample_rate=16000,
# To get UtteranceEnd, the following must be set:
interim_results=True,
utterance_end_ms="1000",
vad_events=True,
# Time in milliseconds of silence to wait for before finalizing speech
endpointing=300
)

addons = {
# Prevent waiting for additional numbers
"no_delay": "true"
}

print("\n\nPress Enter to stop recording...\n\n")
if dg_connection.start(options) is False:
if dg_connection.start(options, addons=addons) is False:
print("Failed to connect to Deepgram")
return

Expand Down
Loading