-
Notifications
You must be signed in to change notification settings - Fork 2.8k
fix streaming tts retries #4410
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
f32ace8
5fd20f8
6ab2b6c
ce3e57e
2eba83a
a3dcbb5
3463c65
56daf5f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -260,7 +260,7 @@ async def current_connection(self) -> _Connection: | |
| if ( | ||
| self._current_connection | ||
| and self._current_connection.is_current | ||
| and not self._current_connection._closed | ||
| and self._current_connection.is_open | ||
| ): | ||
| return self._current_connection | ||
|
|
||
|
|
@@ -363,17 +363,20 @@ def __init__(self, *, tts: TTS, conn_options: APIConnectOptions): | |
| self._tts: TTS = tts | ||
| self._opts = replace(tts._opts) | ||
| self._context_id = utils.shortuuid() | ||
| self._sent_tokenizer_stream = self._opts.word_tokenizer.stream() | ||
| self._text_buffer = "" | ||
| self._start_times_ms: list[int] = [] | ||
| self._durations_ms: list[int] = [] | ||
| self._connection: _Connection | None = None | ||
|
|
||
| async def aclose(self) -> None: | ||
| await self._sent_tokenizer_stream.aclose() | ||
| await super().aclose() | ||
|
|
||
| async def _run(self, output_emitter: tts.AudioEmitter) -> None: | ||
| self._context_id = utils.shortuuid() | ||
| self._text_buffer = "" | ||
| self._start_times_ms.clear() | ||
| self._durations_ms.clear() | ||
|
|
||
| output_emitter.initialize( | ||
| request_id=self._context_id, | ||
| sample_rate=self._opts.sample_rate, | ||
|
|
@@ -383,6 +386,8 @@ async def _run(self, output_emitter: tts.AudioEmitter) -> None: | |
| ) | ||
| output_emitter.start_segment(segment_id=self._context_id) | ||
|
|
||
| sent_tokenizer_stream = self._opts.word_tokenizer.stream() | ||
|
|
||
| connection: _Connection | ||
| try: | ||
| connection = await asyncio.wait_for( | ||
|
|
@@ -399,10 +404,10 @@ async def _run(self, output_emitter: tts.AudioEmitter) -> None: | |
| async def _input_task() -> None: | ||
| async for data in self._input_ch: | ||
| if isinstance(data, self._FlushSentinel): | ||
| self._sent_tokenizer_stream.flush() | ||
| sent_tokenizer_stream.flush() | ||
| continue | ||
| self._sent_tokenizer_stream.push_text(data) | ||
| self._sent_tokenizer_stream.end_input() | ||
| sent_tokenizer_stream.push_text(data) | ||
| sent_tokenizer_stream.end_input() | ||
|
|
||
| async def _sentence_stream_task() -> None: | ||
| flush_on_chunk = ( | ||
|
|
@@ -411,7 +416,7 @@ async def _sentence_stream_task() -> None: | |
| and self._opts.auto_mode | ||
| ) | ||
| xml_content: list[str] = [] | ||
| async for data in self._sent_tokenizer_stream: | ||
| async for data in sent_tokenizer_stream: | ||
| text = data.token | ||
| # send xml tags fully formed | ||
| xml_start_tokens = ["<phoneme", "<break"] | ||
|
|
@@ -455,12 +460,13 @@ async def _sentence_stream_task() -> None: | |
| except asyncio.TimeoutError as e: | ||
| raise APITimeoutError() from e | ||
| except Exception as e: | ||
| if isinstance(e, APIStatusError): | ||
| if isinstance(e, (APIStatusError, APIError, APIConnectionError, APITimeoutError)): | ||
| raise e | ||
| raise APIStatusError("Could not synthesize") from e | ||
| finally: | ||
| output_emitter.end_segment() | ||
| await utils.aio.gracefully_cancel(input_t, stream_t) | ||
| await sent_tokenizer_stream.aclose() | ||
|
|
||
|
|
||
| @dataclass | ||
|
|
@@ -504,6 +510,8 @@ class _StreamData: | |
| stream: SynthesizeStream | ||
| waiter: asyncio.Future[None] | ||
| timeout_timer: asyncio.TimerHandle | None = None | ||
| received_audio: bool = False | ||
| sent_text: bool = False | ||
|
|
||
|
|
||
| class _Connection: | ||
|
|
@@ -531,6 +539,10 @@ def voice_id(self) -> str: | |
| def is_current(self) -> bool: | ||
| return self._is_current | ||
|
|
||
| @property | ||
| def is_open(self) -> bool: | ||
| return self._ws is not None and not self._ws.closed and not self._closed | ||
|
|
||
| def mark_non_current(self) -> None: | ||
| """Mark this connection as no longer current - it will shut down when drained""" | ||
| self._is_current = False | ||
|
|
@@ -560,6 +572,10 @@ def send_content(self, content: _SynthesizeContent) -> None: | |
| """Send synthesis content to the connection""" | ||
| if self._closed or not self._ws or self._ws.closed: | ||
| raise APIConnectionError("WebSocket connection is closed") | ||
| if content.text.strip(): | ||
| ctx = self._context_data.get(content.context_id) | ||
| if ctx: | ||
| ctx.sent_text = True | ||
| self._input_queue.send_nowait(content) | ||
|
|
||
| def close_context(self, context_id: str) -> None: | ||
|
|
@@ -709,10 +725,25 @@ async def _recv_loop(self) -> None: | |
| if data.get("audio"): | ||
| b64data = base64.b64decode(data["audio"]) | ||
| emitter.push(b64data) | ||
| ctx.received_audio = True | ||
| if ctx.timeout_timer: | ||
| ctx.timeout_timer.cancel() | ||
|
|
||
| if data.get("isFinal"): | ||
| if not ctx.received_audio and ctx.sent_text and not ctx.waiter.done(): | ||
| # ElevenLabs sometimes returns `isFinal` with an empty `audio` payload. | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is there a valid case where ElevenLabs returns final without audio, like when the pushed text is empty?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, empty/whitespace input returns `isFinal` with `audio: null`. I added a `sent_text` guard so we only error when real text was sent. |
||
| # Empty input can return isFinal without audio, so only treat | ||
| # it as a retryable failure when we actually sent text. | ||
| ctx.waiter.set_exception( | ||
| APIError("11labs stream ended without audio", retryable=True) | ||
| ) | ||
| self.mark_non_current() | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It just ensures the next attempt gets a fresh connection. |
||
| self._cleanup_context(context_id) | ||
| if not self._is_current and not self._active_contexts: | ||
| logger.debug("no active contexts, shutting down connection") | ||
| break | ||
| continue | ||
|
|
||
| if stream is not None: | ||
| timed_words, _ = _to_timed_words( | ||
| stream._text_buffer, | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`push_text` is a sync method; is there any reason a lock is needed? I don't think we are going to push text from different threads, right?