Skip to content

Commit 385b65e

Browse files
authored
Merge pull request #964 from hchen2020/master
IAudioSynthesis
2 parents 23cbe41 + 2d710d4 commit 385b65e

File tree

35 files changed

+307
-244
lines changed

35 files changed

+307
-244
lines changed

src/Infrastructure/BotSharp.Abstraction/MLTasks/IAudioCompletion.cs

Lines changed: 0 additions & 15 deletions
This file was deleted.
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
namespace BotSharp.Abstraction.MLTasks;
2+
3+
/// <summary>
4+
/// Text to speech synthesis
5+
/// </summary>
/// <remarks>
/// Implementations are resolved by CompletionProvider.GetAudioSynthesizer, which picks the
/// registered implementation whose <see cref="Provider"/> equals the requested provider name.
/// </remarks>
6+
public interface IAudioSynthesis
7+
{
8+
/// <summary>
/// Provider name of this implementation. CompletionProvider.GetAudioSynthesizer compares it
/// with the requested provider, falling back to "openai" when no provider is requested.
/// </summary>
string Provider { get; }
9+
10+
/// <summary>
/// Name of the model in use.
/// NOTE(review): presumably the value last passed to <see cref="SetModelName"/> - confirm against implementations.
/// </summary>
string Model { get; }
11+
12+
/// <summary>
/// Sets the model name. CompletionProvider.GetAudioSynthesizer calls this with the requested
/// model, or with "gpt-4o-mini-tts" when no model is requested.
/// </summary>
/// <param name="model">Model name to use.</param>
void SetModelName(string model);
13+
14+
/// <summary>
/// Asynchronously generates audio for the given text.
/// </summary>
/// <param name="text">Text to synthesize.</param>
/// <param name="voice">Voice identifier; defaults to "alloy".</param>
/// <param name="format">Output format identifier; defaults to "mp3".</param>
/// <param name="instructions">
/// Optional; defaults to null.
/// NOTE(review): presumably extra guidance on how the speech should sound - confirm against implementations.
/// </param>
/// <returns>A task whose result is the generated output as <see cref="BinaryData"/>.</returns>
Task<BinaryData> GenerateAudioAsync(string text, string? voice = "alloy", string? format = "mp3", string? instructions = null);
15+
}
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
using System.IO;
2+
3+
namespace BotSharp.Abstraction.MLTasks;
4+
5+
/// <summary>
6+
/// Audio transcription service
7+
/// </summary>
/// <remarks>
/// Implementations are resolved by CompletionProvider.GetAudioTranscriber, which picks the
/// registered implementation whose <see cref="Provider"/> equals the requested provider name.
/// </remarks>
8+
public interface IAudioTranscription
9+
{
10+
/// <summary>
/// Provider name of this implementation. CompletionProvider.GetAudioTranscriber compares it
/// with the requested provider, falling back to "openai" when no provider is requested.
/// </summary>
string Provider { get; }
11+
12+
/// <summary>
/// Name of the model in use.
/// NOTE(review): presumably the value last passed to <see cref="SetModelName"/> - confirm against implementations.
/// </summary>
string Model { get; }
13+
14+
/// <summary>
/// Asynchronously produces text from an audio stream.
/// </summary>
/// <param name="audio">Stream containing the audio content.</param>
/// <param name="audioFileName">
/// File name for the audio; FileInstructService.SpeechToText passes "name.extension".
/// </param>
/// <param name="text">
/// Optional; defaults to null.
/// NOTE(review): presumably a prompt or hint that guides the transcription - confirm against implementations.
/// </param>
/// <returns>A task whose result is the resulting text.</returns>
Task<string> TranscriptTextAsync(Stream audio, string audioFileName, string? text = null);
15+
16+
/// <summary>
/// Sets the model name. CompletionProvider.GetAudioTranscriber calls this with the requested
/// model, or with "gpt-4o-mini-transcribe" when no model is requested.
/// </summary>
/// <param name="model">Model name to use.</param>
void SetModelName(string model);
17+
}

src/Infrastructure/BotSharp.Abstraction/MLTasks/Settings/LlmModelSetting.cs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,14 @@ namespace BotSharp.Abstraction.MLTasks.Settings;
33
public class LlmModelSetting
44
{
55
/// <summary>
6-
/// Model Id, like "gpt-3.5" and "gpt-4".
6+
/// Model Id, like "gpt-4", "gpt-4o", "o1".
77
/// </summary>
8-
public string? Id { get; set; }
8+
public string Id { get; set; } = null!;
99

1010
/// <summary>
1111
/// Deployment model name
1212
/// </summary>
13-
public string Name { get; set; }
13+
public string Name { get; set; } = null!;
1414

1515
/// <summary>
1616
/// Model version
@@ -28,8 +28,8 @@ public class LlmModelSetting
2828
/// </summary>
2929
public string? Group { get; set; }
3030

31-
public string ApiKey { get; set; }
32-
public string Endpoint { get; set; }
31+
public string ApiKey { get; set; } = null!;
32+
public string? Endpoint { get; set; }
3333
public LlmModelType Type { get; set; } = LlmModelType.Chat;
3434

3535
/// <summary>

src/Infrastructure/BotSharp.Abstraction/Realtime/Models/ModelTurnDetection.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,6 @@ public class ModelTurnDetection
1111

1212
/// <summary>
/// Audio transcription settings: the model name and an optional language.
/// </summary>
public class AudioTranscription
1313
{
14-
public string Model { get; set; } = "whisper-1";
15-
public string Language { get; set; } = "en";
14+
/// <summary>
/// Transcription model name. This commit changes the default from "whisper-1" to "gpt-4o-mini-transcribe".
/// </summary>
public string Model { get; set; } = "gpt-4o-mini-transcribe";
15+
/// <summary>
/// Optional language. This commit makes it nullable and drops the former "en" default, so it is null unless set.
/// NOTE(review): presumably null leaves language selection to the model - confirm with the consumer of this setting.
/// </summary>
public string? Language { get; set; }
1616
}

src/Infrastructure/BotSharp.Core.Realtime/Services/RealtimeHub.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ private async Task ConnectToModel(WebSocket userWebSocket)
7474
if (!model.Contains("-realtime-"))
7575
{
7676
var llmProviderService = _services.GetRequiredService<ILlmProviderService>();
77-
model = llmProviderService.GetProviderModel("openai", "gpt-4", realTime: true).Name;
77+
model = llmProviderService.GetProviderModel("openai", "gpt-4o", realTime: true).Name;
7878
}
7979

8080
_completer.SetModelName(model);

src/Infrastructure/BotSharp.Core/Files/Services/Instruct/FileInstructService.Audio.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,14 @@ public partial class FileInstructService
66
{
77
/// <summary>
/// Downloads the given audio file and transcribes it to text with the resolved audio transcriber.
/// </summary>
/// <param name="provider">Provider name; forwarded as-is (null lets CompletionProvider.GetAudioTranscriber fall back to "openai").</param>
/// <param name="model">Model name; forwarded as-is (null lets CompletionProvider.GetAudioTranscriber fall back to "gpt-4o-mini-transcribe").</param>
/// <param name="audio">Audio file to download; its FileName and FileExtension build the file name sent to the transcriber.</param>
/// <param name="text">Optional text forwarded unchanged to TranscriptTextAsync.</param>
/// <returns>The text returned by the transcriber.</returns>
public async Task<string> SpeechToText(string? provider, string? model, InstructFileModel audio, string? text = null)
88
{
9-
var completion = CompletionProvider.GetAudioCompletion(_services, provider: provider ?? "openai", model: model ?? "whisper-1");
9+
// Provider/model defaults are no longer applied here; GetAudioTranscriber supplies them.
// NOTE(review): GetAudioTranscriber returns null (via default!) when no provider matches,
// so the TranscriptTextAsync call below would throw NullReferenceException - consider a guard.
var completion = CompletionProvider.GetAudioTranscriber(_services, provider: provider, model: model);
1010
var audioBytes = await DownloadFile(audio);
1111
using var stream = new MemoryStream();
1212
stream.Write(audioBytes, 0, audioBytes.Length);
1313
// Rewind so the transcriber reads from the start of the written bytes.
stream.Position = 0;
1414

1515
// Falls back to "audio" / "wav" when the file name or extension is missing.
var fileName = $"{audio.FileName ?? "audio"}.{audio.FileExtension ?? "wav"}";
16-
var content = await completion.GenerateTextFromAudioAsync(stream, fileName, text);
16+
var content = await completion.TranscriptTextAsync(stream, fileName, text);
1717
// The using declaration above also disposes the stream at scope exit, so this Close is redundant.
stream.Close();
1818
return content;
1919
}

src/Infrastructure/BotSharp.Core/Files/Services/Instruct/FileInstructService.Pdf.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ public async Task<string> ReadPdf(string? provider, string? model, string? model
2727

2828
var innerAgentId = agentId ?? Guid.Empty.ToString();
2929
var completion = CompletionProvider.GetChatCompletion(_services, provider: provider ?? "openai",
30-
model: model, modelId: modelId ?? "gpt-4", multiModal: true);
30+
model: model, modelId: modelId ?? "gpt-4o", multiModal: true);
3131
var message = await completion.GetChatCompletions(new Agent()
3232
{
3333
Id = innerAgentId,

src/Infrastructure/BotSharp.Core/Files/Services/Instruct/FileInstructService.SelectFile.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ private async Task<IEnumerable<MessageFileModel>> SelectFiles(IEnumerable<Messag
9393
}
9494

9595
var providerName = options.Provider ?? "openai";
96-
var modelId = options?.ModelId ?? "gpt-4";
96+
var modelId = options?.ModelId ?? "gpt-4o";
9797
var provider = llmProviderService.GetProviders().FirstOrDefault(x => x == providerName);
9898
var model = llmProviderService.GetProviderModel(provider: provider, id: modelId);
9999
var completion = CompletionProvider.GetChatCompletion(_services, provider: provider, model: model.Name);

src/Infrastructure/BotSharp.Core/Infrastructures/CompletionProvider.cs

Lines changed: 27 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ public static object GetCompletion(IServiceProvider services,
3030
}
3131
else if (settings.Type == LlmModelType.Audio)
3232
{
33-
return GetAudioCompletion(services, provider: provider, model: model);
33+
return GetAudioTranscriber(services, provider: provider, model: model);
3434
}
3535
else
3636
{
@@ -126,20 +126,39 @@ public static ITextEmbedding GetTextEmbedding(IServiceProvider services,
126126
return completer;
127127
}
128128

129-
public static IAudioCompletion GetAudioCompletion(
129+
/// <summary>
/// Resolves the registered <c>IAudioTranscription</c> whose Provider matches the requested
/// provider and sets its model name.
/// </summary>
/// <param name="services">Service provider used to enumerate the registered transcribers and to get a logger.</param>
/// <param name="provider">Provider name; "openai" is used for the lookup when null.</param>
/// <param name="model">Model name; "gpt-4o-mini-transcribe" is used when null.</param>
/// <returns>
/// The matching transcriber, or null (returned through <c>default!</c>) when none matches;
/// callers must handle the null case despite the non-nullable return type.
/// </returns>
public static IAudioTranscription GetAudioTranscriber(
130130
IServiceProvider services,
131-
string provider,
132-
string model)
131+
string? provider = null,
132+
string? model = null)
133133
{
134-
var completions = services.GetServices<IAudioCompletion>();
135-
var completer = completions.FirstOrDefault(x => x.Provider == provider);
134+
var completions = services.GetServices<IAudioTranscription>();
135+
var completer = completions.FirstOrDefault(x => x.Provider == (provider ?? "openai"));
136136
if (completer == null)
137137
{
138138
var logger = services.GetRequiredService<ILogger<CompletionProvider>>();
139-
logger.LogError($"Can't resolve audio-completion provider by {provider}");
139+
// NOTE(review): the message interpolates the raw provider argument, so it is empty when the
// lookup actually used the "openai" fallback; a message template with the effective name would log better.
logger.LogError($"Can't resolve audio-transcriber provider by {provider}");
140+
return default!;
140141
}
141142

142-
completer.SetModelName(model);
143+
// NOTE(review): this mutates the resolved instance; if it is registered with a shared lifetime,
// the model choice would carry over to other callers - confirm the registration.
completer.SetModelName(model ?? "gpt-4o-mini-transcribe");
144+
return completer;
145+
}
146+
147+
/// <summary>
/// Resolves the registered <c>IAudioSynthesis</c> whose Provider matches the requested
/// provider and sets its model name.
/// </summary>
/// <param name="services">Service provider used to enumerate the registered synthesizers and to get a logger.</param>
/// <param name="provider">Provider name; "openai" is used for the lookup when null.</param>
/// <param name="model">Model name; "gpt-4o-mini-tts" is used when null.</param>
/// <returns>
/// The matching synthesizer, or null (returned through <c>default!</c>) when none matches;
/// callers must handle the null case despite the non-nullable return type.
/// </returns>
public static IAudioSynthesis GetAudioSynthesizer(
148+
IServiceProvider services,
149+
string? provider = null,
150+
string? model = null)
151+
{
152+
var completions = services.GetServices<IAudioSynthesis>();
153+
var completer = completions.FirstOrDefault(x => x.Provider == (provider ?? "openai"));
154+
if (completer == null)
155+
{
156+
var logger = services.GetRequiredService<ILogger<CompletionProvider>>();
157+
// NOTE(review): the message interpolates the raw provider argument, so it is empty when the
// lookup actually used the "openai" fallback; a message template with the effective name would log better.
logger.LogError($"Can't resolve audio-synthesizer provider by {provider}");
158+
return default!;
159+
}
160+
161+
// NOTE(review): this mutates the resolved instance; if it is registered with a shared lifetime,
// the model choice would carry over to other callers - confirm the registration.
completer.SetModelName(model ?? "gpt-4o-mini-tts");
143162
return completer;
144163
}
145164

0 commit comments

Comments
 (0)