Skip to content

Commit ca1f49f

Browse files
author
Gunpal Jain
committed
Add session resumption, transcription, and server events support
Enhanced multimodal live client with session resumption and transcription handling. Added server-side events (`GoAway`, `SessionResumableUpdate`) and extended functionality in WPF sample for better UI and transcription display. Updated types to support session resumption, context compression, and URL metadata.
1 parent 46b358a commit ca1f49f

14 files changed

+448
-9
lines changed
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
<wpf:ResourceDictionary xml:space="preserve" xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml" xmlns:s="clr-namespace:System;assembly=mscorlib" xmlns:ss="urn:shemas-jetbrains-com:settings-storage-xaml" xmlns:wpf="http://schemas.microsoft.com/winfx/2006/xaml/presentation">
2+
<s:Boolean x:Key="/Default/CodeEditing/SuppressUninitializedWarningFix/Enabled/@EntryValue">False</s:Boolean></wpf:ResourceDictionary>

samples/TwoWayAudioCommunicationWpf/MainWindow.xaml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,14 @@
5353
</StackPanel>
5454

5555
</ui:Card>
56+
<StackPanel DockPanel.Dock="Bottom">
57+
<TextBlock Text="Input Transcript" Margin="0,0,0,8"></TextBlock>
58+
<TextBox TextWrapping="Wrap" Text="{Binding InputTranscript}" Height="100"></TextBox>
59+
</StackPanel>
60+
<!-- Output transcription panel: the text box is bound to the OutputTranscript property. -->
<StackPanel DockPanel.Dock="Bottom">
    <TextBlock Text="Output Transcript" Margin="0,0,0,8"></TextBlock>
    <TextBox TextWrapping="Wrap" Text="{Binding OutputTranscript}" Height="100"></TextBox>
</StackPanel>
5664
<ListBox x:Name="Responses" ItemsSource="{Binding ModelResponses, ElementName=Window}"
5765
VerticalAlignment="Stretch"
5866
BorderThickness="0"

samples/TwoWayAudioCommunicationWpf/MainWindow.xaml.cs

Lines changed: 55 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,9 @@
33
using System.Diagnostics;
44
using System.IO;
55
using System.Runtime.CompilerServices;
6+
using System.Text.Json;
67
using System.Windows;
8+
using System.Windows.Forms.VisualStyles;
79
using GenerativeAI;
810
using GenerativeAI.Live;
911
using GenerativeAI.Types;
@@ -26,6 +28,28 @@ public partial class MainWindow : FluentWindow, INotifyPropertyChanged
2628
private ModelResponse? _currentResponse;
2729
private readonly NAudioHelper _audioHelper = new();
2830

31+
/// <summary>
/// Gets or sets the input transcript text shown in the window.
/// Assigning a value different from the current one stores it and raises a
/// property-change notification; assigning the same value does nothing.
/// </summary>
public string InputTranscript
{
    get
    {
        return _inputTranscript;
    }
    set
    {
        if (_inputTranscript != value)
        {
            _inputTranscript = value;
            OnPropertyChanged();
        }
    }
}
41+
42+
/// <summary>
/// Gets or sets the output transcript text shown in the window.
/// Assigning a value different from the current one stores it and raises a
/// property-change notification; assigning the same value does nothing.
/// </summary>
public string OutputTranscript
{
    get
    {
        return _outputTranscript;
    }
    set
    {
        if (_outputTranscript != value)
        {
            _outputTranscript = value;
            OnPropertyChanged();
        }
    }
}
52+
2953
public bool IsRecording
3054
{
3155
get => _isRecording;
@@ -37,8 +61,12 @@ public bool IsRecording
3761
}
3862
}
3963

64+
private const string Model = "gemini-2.0-flash-live-001";
65+
4066
private CancellationTokenSource? _cancellationTokenSource;
4167
private bool _isRecording = false;
68+
private string _inputTranscript;
69+
private string _outputTranscript;
4270

4371
/// <summary>
4472
/// Gets or sets the collection of model responses displayed in the UI.
@@ -161,7 +189,14 @@ private async Task InitializeChat()
161189

162190
RegisterClientEvents();
163191

164-
await _multiModalLiveClient.ConnectAsync(true, cancellationToken);
192+
await _multiModalLiveClient.ConnectAsync(false, cancellationToken);
193+
await _multiModalLiveClient.SendSetupAsync(new BidiGenerateContentSetup()
194+
{
195+
Model = Model.ToModelId(),
196+
GenerationConfig = config,
197+
OutputAudioTranscription = new AudioTranscriptionConfig(),
198+
InputAudioTranscription = new AudioTranscriptionConfig()
199+
}, cancellationToken);
165200
StartRecording(device); //No need to pass cancellation token, as it's handled within StartRecording
166201
btnStartChat.Content = AppConstants.StopChatText;
167202
}
@@ -212,6 +247,24 @@ private void RegisterClientEvents()
212247
}
213248
};
214249

250+
_multiModalLiveClient.TextChunkReceived += (sender, args) =>
251+
{
252+
Console.WriteLine(args.Text);
253+
};
254+
_multiModalLiveClient.InputTranscriptionReceived += (sender, args) =>
255+
{
256+
InputTranscript = $"{InputTranscript}{args.Text}";
257+
};
258+
259+
_multiModalLiveClient.OutputTranscriptionReceived += (sender, args) =>
260+
{
261+
OutputTranscript = $"{OutputTranscript}{args.Text}";
262+
};
263+
264+
_multiModalLiveClient.SessionResumableUpdateReceived += (sender, args) =>
265+
{
266+
Console.WriteLine(JsonSerializer.Serialize(args));
267+
};
215268
_multiModalLiveClient.AudioChunkReceived += (sender, args) =>
216269
{
217270
Dispatcher.Invoke(() => // Batch UI updates for better performance
@@ -237,6 +290,7 @@ private void RegisterClientEvents()
237290
_audioHelper.ClearPlayback(); //Consider if this needs to be inside the Dispatcher.Invoke
238291
var tmpFile = Path.GetTempFileName() + ".wav";
239292

293+
240294
//Consider handling IOException
241295
try
242296
{

samples/TwoWayAudioCommunicationWpf/TwoWayAudioCommunicationWpf.csproj

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
</PropertyGroup>
1212

1313
<ItemGroup>
14-
<PackageReference Include="Google_GenerativeAI.Auth" Version="2.5.6" />
1514
<PackageReference Include="MdXaml.Full" Version="1.27.0" />
1615
<PackageReference Include="NAudio" Version="2.2.0" />
1716
<PackageReference Include="PvRecorder" Version="1.2.10" />

src/GenerativeAI.Live/Events/AudioBufferReceivedEventArgs.cs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
using GenerativeAI.Types;
2+
13
namespace GenerativeAI.Live;
24

35
/// <summary>
@@ -14,6 +16,9 @@ public class AudioBufferReceivedEventArgs : EventArgs
1416
/// Gets or sets the header information for the audio data.
1517
/// </summary>
1618
public AudioHeaderInfo HeaderInfo { get; set; }
19+
20+
public Transcription? InputTranscription { get; set; }
21+
public Transcription? OutputTranscription { get; set; }
1722

1823
public AudioBufferReceivedEventArgs(byte[] buffer, AudioHeaderInfo audioHeaderInfo)
1924
{
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
<wpf:ResourceDictionary xml:space="preserve" xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml" xmlns:s="clr-namespace:System;assembly=mscorlib" xmlns:ss="urn:shemas-jetbrains-com:settings-storage-xaml" xmlns:wpf="http://schemas.microsoft.com/winfx/2006/xaml/presentation">
2+
<s:Boolean x:Key="/Default/CodeInspection/NamespaceProvider/NamespaceFoldersToSkip/=classes/@EntryIndexedValue">True</s:Boolean>
3+
<s:Boolean x:Key="/Default/CodeInspection/NamespaceProvider/NamespaceFoldersToSkip/=events/@EntryIndexedValue">True</s:Boolean>
4+
<s:Boolean x:Key="/Default/CodeInspection/NamespaceProvider/NamespaceFoldersToSkip/=models/@EntryIndexedValue">True</s:Boolean>
5+
<s:Boolean x:Key="/Default/CodeInspection/NamespaceProvider/NamespaceFoldersToSkip/=types/@EntryIndexedValue">True</s:Boolean></wpf:ResourceDictionary>

src/GenerativeAI.Live/Models/MultiModalLiveClient.cs

Lines changed: 73 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -169,8 +169,36 @@ public MultiModalLiveClient(IPlatformAdapter platformAdapter, string modelName,
169169
/// </summary>
170170
public event EventHandler<ErrorEventArgs>? ErrorOccurred;
171171

172+
/// <summary>
173+
/// Event triggered when a chunk of text is received from the server during the live API session.
174+
/// </summary>
172175
public event EventHandler<TextChunkReceivedArgs>? TextChunkReceived;
173176

177+
/// <summary>
178+
/// Event triggered upon receiving input transcription data.
179+
/// </summary>
180+
public event EventHandler<Transcription>? InputTranscriptionReceived;
181+
182+
/// <summary>
183+
/// An event triggered when an output transcription is received from the system.
184+
/// </summary>
185+
public event EventHandler<Transcription>? OutputTranscriptionReceived;
186+
187+
/// <summary>
188+
/// Message sent by the server to indicate that the current connection should be terminated
189+
/// and the client should cease sending further requests on this stream.
190+
/// This is often used for graceful shutdown or when the server is no longer able to
191+
/// process requests on the current stream.
192+
/// </summary>
193+
public event EventHandler<LiveServerGoAway>? GoAwayReceived;
194+
195+
/// <summary>
196+
/// Occurs when the server sends an update that allows the current session to be resumed.
197+
/// This event provides information related to session resumption, enabling the client to continue
198+
/// an existing session without starting over.
199+
/// </summary>
200+
public event EventHandler<LiveServerSessionResumptionUpdate>? SessionResumableUpdateReceived;
201+
174202
#endregion
175203

176204
#region Private Methods
@@ -202,6 +230,10 @@ private void ProcessReceivedMessage(ResponseMessage msg)
202230
}
203231
ProcessTextChunk(responsePayload);
204232
ProcessAudioChunk(responsePayload);
233+
ProcessInputTranscription(responsePayload);
234+
ProcessOutputTranscription(responsePayload);
235+
ProcessSessionResumableUpdate(responsePayload);
236+
ProcessGoAway(responsePayload);
205237

206238
MessageReceived?.Invoke(this, new MessageReceivedEventArgs(responsePayload));
207239
}
@@ -255,12 +287,15 @@ private void ProcessAudioChunk(BidiResponsePayload responsePayload)
255287
{
256288
var audioBlobs = responsePayload.ServerContent.ModelTurn.Parts
257289
.Where(p => p.InlineData?.MimeType?.Contains("audio") == true)
258-
.Select(p => p.InlineData!)
290+
259291
.ToList();
260292

261293
foreach (var blob in audioBlobs)
262294
{
263-
ProcessAudioBlob(blob);
295+
if (blob.InlineData != null)
296+
{
297+
ProcessAudioBlob(blob.InlineData);
298+
}
264299
}
265300
}
266301

@@ -293,7 +328,8 @@ private void ProcessAudioBlob(Blob blob)
293328
this._lastHeaderInfo = headerInfo;
294329

295330
var bufferReceived = new AudioBufferReceivedEventArgs(audioBuffer, headerInfo);
296-
331+
332+
297333
_audioBuffer.AddRange(audioBuffer);
298334
_logger?.LogAudioChunkReceived(sampleRate, hasHeader, bufferReceived.Buffer.Length);
299335
AudioChunkReceived?.Invoke(this, bufferReceived);
@@ -347,6 +383,38 @@ private void HandleInterruption()
347383
_audioBuffer.Clear();
348384
}
349385

386+
/// <summary>
/// Raises <see cref="InputTranscriptionReceived"/> when the payload's server content
/// carries an input transcription; otherwise does nothing.
/// </summary>
/// <param name="responsePayload">The payload received from the server.</param>
private void ProcessInputTranscription(BidiResponsePayload responsePayload)
{
    var transcription = responsePayload.ServerContent?.InputTranscription;
    if (transcription == null)
    {
        return;
    }

    InputTranscriptionReceived?.Invoke(this, transcription);
}
393+
394+
/// <summary>
/// Raises <see cref="OutputTranscriptionReceived"/> when the payload's server content
/// carries an output transcription; otherwise does nothing.
/// </summary>
/// <param name="responsePayload">The payload received from the server.</param>
private void ProcessOutputTranscription(BidiResponsePayload responsePayload)
{
    var transcription = responsePayload.ServerContent?.OutputTranscription;
    if (transcription == null)
    {
        return;
    }

    OutputTranscriptionReceived?.Invoke(this, transcription);
}
401+
402+
/// <summary>
/// Raises <see cref="SessionResumableUpdateReceived"/> when the payload contains a
/// session resumption update; otherwise does nothing.
/// </summary>
/// <param name="responsePayload">The payload received from the server.</param>
private void ProcessSessionResumableUpdate(BidiResponsePayload responsePayload)
{
    var resumptionUpdate = responsePayload.SessionResumptionUpdate;
    if (resumptionUpdate == null)
    {
        return;
    }

    SessionResumableUpdateReceived?.Invoke(this, resumptionUpdate);
}
409+
410+
/// <summary>
/// Raises <see cref="GoAwayReceived"/> when the payload contains a GoAway message;
/// otherwise does nothing.
/// </summary>
/// <param name="responsePayload">The payload received from the server.</param>
private void ProcessGoAway(BidiResponsePayload responsePayload)
{
    var goAway = responsePayload.GoAway;
    if (goAway == null)
    {
        return;
    }

    GoAwayReceived?.Invoke(this, goAway);
}
417+
350418
private async Task CallFunctions(BidiGenerateContentToolCall responsePayloadToolCall,
351419
CancellationToken cancellationToken = default)
352420
{
@@ -531,6 +599,8 @@ public async Task DisconnectAsync(CancellationToken cancellationToken = default)
531599
/// </returns>
532600
public async Task SendSetupAsync(BidiGenerateContentSetup setup, CancellationToken cancellationToken = default)
{
    // Fail fast with a descriptive exception instead of a NullReferenceException
    // when no setup object is supplied.
    if (setup == null)
    {
        throw new ArgumentNullException(nameof(setup));
    }

    // The model must be given as a path containing '/', e.g. "models/<name>".
    // A null model is rejected here too, since it would otherwise throw a
    // NullReferenceException on the Contains call.
    var model = setup.Model;
    if (model == null || !model.Contains("/"))
    {
        throw new ArgumentException(
            "Please provide a valid model name such as 'models/gemini-2.0-flash-live-001'.",
            nameof(setup));
    }

    var payload = new BidiClientPayload { Setup = setup };
    await SendAsync(payload, cancellationToken).ConfigureAwait(false);
}

src/GenerativeAI/Types/MultimodalLive/BidiGenerateContentServerContent.cs

Lines changed: 92 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@ public class BidiGenerateContentServerContent
1414
/// </summary>
1515
[JsonPropertyName("turnComplete")]
1616
public bool? TurnComplete { get; set; }
17-
1817
/// <summary>
1918
/// Output only. If true, indicates that a client message has interrupted current model generation. If the client is playing out the content in real time, this is a good signal to stop and empty the current playback queue.
2019
/// </summary>
@@ -32,4 +31,96 @@ public class BidiGenerateContentServerContent
3231
/// </summary>
3332
[JsonPropertyName("modelTurn")]
3433
public Content? ModelTurn { get; set; }
34+
35+
/// <summary>
36+
/// Output only. If true, indicates that the model has completed generating all content for the current request.
37+
/// </summary>
38+
[JsonPropertyName("generationComplete")]
39+
public bool? GenerationComplete { get; set; }
40+
41+
/// <summary>
42+
/// Output only. Transcription of the input content.
43+
/// </summary>
44+
[JsonPropertyName("inputTranscription")]
45+
public Transcription? InputTranscription { get; set; }
46+
47+
/// <summary>
48+
/// Output only. Transcription of the output content.
49+
/// </summary>
50+
[JsonPropertyName("outputTranscription")]
51+
public Transcription? OutputTranscription { get; set; }
52+
53+
/// <summary>
54+
/// Output only. Metadata for URL context.
55+
/// </summary>
56+
[JsonPropertyName("urlContextMetadata")]
57+
public UrlContextMetadata? UrlContextMetadata { get; set; }
58+
}
59+
60+
/// <summary>
61+
/// Audio transcription in Server Content.
62+
/// </summary>
63+
public class Transcription
64+
{
65+
/// <summary>
66+
/// Indicates whether the transcription has ended.
67+
/// </summary>
68+
[JsonPropertyName("finished")]
69+
public bool? Finished { get; set; }
70+
71+
/// <summary>
72+
/// Transcription text.
73+
/// </summary>
74+
[JsonPropertyName("text")]
75+
public string? Text { get; set; }
76+
}
77+
/// <summary>
78+
/// Status of the url retrieval.
79+
/// </summary>
80+
public enum UrlRetrievalStatus
81+
{
82+
/// <summary>
83+
/// Default value. This value is unused.
84+
/// </summary>
85+
URL_RETRIEVAL_STATUS_UNSPECIFIED,
86+
87+
/// <summary>
88+
/// Url retrieval is successful.
89+
/// </summary>
90+
URL_RETRIEVAL_STATUS_SUCCESS,
91+
92+
/// <summary>
93+
/// Url retrieval failed due to an error.
94+
/// </summary>
95+
URL_RETRIEVAL_STATUS_ERROR
96+
}
97+
98+
/// <summary>
99+
/// Context for a single url retrieval.
100+
/// </summary>
101+
public class UrlMetadata
102+
{
103+
/// <summary>
104+
/// The URL retrieved by the tool.
105+
/// </summary>
106+
[JsonPropertyName("retrievedUrl")]
107+
public string? RetrievedUrl { get; set; }
108+
109+
/// <summary>
110+
/// Status of the url retrieval.
111+
/// </summary>
112+
[JsonPropertyName("urlRetrievalStatus")]
113+
public UrlRetrievalStatus? UrlRetrievalStatus { get; set; }
114+
}
115+
116+
/// <summary>
117+
/// Metadata related to url context retrieval tool.
118+
/// </summary>
119+
public class UrlContextMetadata
120+
{
121+
/// <summary>
122+
/// List of url context.
123+
/// </summary>
124+
[JsonPropertyName("urlMetadata")]
125+
public List<UrlMetadata>? UrlMetadata { get; set; }
35126
}

0 commit comments

Comments
 (0)