-
Notifications
You must be signed in to change notification settings - Fork 463
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #566 from evan-cao-wb/features/add-audio-handler
add audio service to transcribe local mp3/wav file
- Loading branch information
Showing
12 changed files
with
350 additions
and
0 deletions.
There are no files selected for viewing
14 changes: 14 additions & 0 deletions
14
src/Infrastructure/BotSharp.Abstraction/MLTasks/ISpeechToText.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
using System; | ||
using System.Collections.Generic; | ||
using System.IO; | ||
using System.Linq; | ||
using System.Text; | ||
using System.Threading.Tasks; | ||
|
||
namespace BotSharp.Abstraction.MLTasks; | ||
|
||
/// <summary>
/// Speech-to-text contract: turns an audio file into a text transcript.
/// </summary>
public interface ISpeechToText | ||
{ | ||
/// <summary>
/// Produces a text transcript for the audio file at <paramref name="filePath"/>.
/// </summary>
/// <param name="filePath">Path of the audio file to transcribe.</param>
/// <returns>A task whose result is the transcript text.</returns>
Task<string> AudioToTextTranscript(string filePath); | ||
// Task<string> AudioToTextTranscript(Stream stream); | ||
} |
29 changes: 29 additions & 0 deletions
29
src/Plugins/BotSharp.Plugin.AudioHandler/AudioHandlerPlugin.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
using BotSharp.Plugin.AudioHandler.Settings; | ||
using BotSharp.Plugin.AudioHandler.Provider; | ||
using BotSharp.Abstraction.Settings; | ||
|
||
namespace BotSharp.Plugin.AudioHandler | ||
{ | ||
/// <summary>
/// BotSharp plugin that registers the audio-to-text services with the DI container.
/// </summary>
public class AudioHandlerPlugin : IBotSharpPlugin
{
    /// <summary>Fixed identifier of this plugin.</summary>
    public string Id
    {
        get { return "9d22014c-4f45-466a-9e82-a74e67983df8"; }
    }

    /// <summary>Display name of this plugin.</summary>
    public string Name
    {
        get { return "Audio Handler"; }
    }

    /// <summary>Short description of what this plugin does.</summary>
    public string Description
    {
        get { return "Process audio input and transform it into text output."; }
    }

    /// <summary>
    /// Adds the plugin's settings, speech-to-text provider and audio utilities as scoped services.
    /// </summary>
    /// <param name="services">Service collection to register into.</param>
    /// <param name="config">Application configuration (not read directly here).</param>
    public void RegisterDI(IServiceCollection services, IConfiguration config)
    {
        // AudioHandlerSettings is obtained through ISettingService using the "AudioHandler" key.
        services.AddScoped(sp =>
            sp.GetRequiredService<ISettingService>().Bind<AudioHandlerSettings>("AudioHandler"));

        services.AddScoped<ISpeechToText, NativeWhisperProvider>();
        services.AddScoped<IAudioProcessUtilities, AudioProcessUtilities>();
    }
}
} | ||
|
26 changes: 26 additions & 0 deletions
26
src/Plugins/BotSharp.Plugin.AudioHandler/BotSharp.Plugin.AudioHandler.csproj
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
<Project Sdk="Microsoft.NET.Sdk"> | ||
|
||
<!-- Build properties; the $(...) values are expected to be supplied from outside this file. -->
<PropertyGroup> | ||
<TargetFramework>$(TargetFramework)</TargetFramework> | ||
<ImplicitUsings>enable</ImplicitUsings> | ||
<Nullable>enable</Nullable> | ||
<LangVersion>$(LangVersion)</LangVersion> | ||
<VersionPrefix>$(BotSharpVersion)</VersionPrefix> | ||
<GeneratePackageOnBuild>$(GeneratePackageOnBuild)</GeneratePackageOnBuild> | ||
<GenerateDocumentationFile>$(GenerateDocumentationFile)</GenerateDocumentationFile> | ||
<OutputPath>$(SolutionDir)packages</OutputPath> | ||
</PropertyGroup> | ||
|
||
<!-- NuGet dependencies: NAudio for decoding/resampling audio, Whisper.net for transcription. -->
<!-- NOTE(review): Microsoft.AspNetCore.Mvc 2.2.0 is the legacy standalone MVC package; confirm it is still wanted for the target framework in use. -->
<!-- NOTE(review): NAudio and NAudio.Core are both referenced at 2.2.1; confirm the explicit NAudio.Core reference is needed. -->
<ItemGroup> | ||
<PackageReference Include="Microsoft.AspNetCore.Mvc" Version="2.2.0" /> | ||
<PackageReference Include="NAudio" Version="2.2.1" /> | ||
<PackageReference Include="NAudio.Core" Version="2.2.1" /> | ||
<PackageReference Include="Whisper.net" Version="1.5.0" /> | ||
<PackageReference Include="Whisper.net.Runtime" Version="1.5.0" /> | ||
</ItemGroup> | ||
|
||
<!-- Project dependency on BotSharp.Core. -->
<ItemGroup> | ||
<ProjectReference Include="..\..\Infrastructure\BotSharp.Core\BotSharp.Core.csproj" /> | ||
</ItemGroup> | ||
|
||
</Project> |
44 changes: 44 additions & 0 deletions
44
src/Plugins/BotSharp.Plugin.AudioHandler/Controllers/AudioController.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
using System; | ||
using System.Collections.Generic; | ||
using System.Diagnostics; | ||
using System.Linq; | ||
using System.Text; | ||
using System.Threading.Tasks; | ||
using BotSharp.Plugin.AudioHandler.Models; | ||
using BotSharp.Plugin.AudioHandler.Provider; | ||
|
||
namespace BotSharp.Plugin.AudioHandler.Controllers | ||
{ | ||
/// <summary>
/// HTTP endpoint that returns the transcript of an audio file.
/// Anonymous access is allowed only in DEBUG builds.
/// </summary>
#if DEBUG
[AllowAnonymous]
#endif
[ApiController]
public class AudioController : ControllerBase
{
    private readonly ISpeechToText _speechToText;

    /// <summary>Creates the controller with the speech-to-text service to delegate to.</summary>
    /// <param name="audioService">Service used to transcribe audio.</param>
    public AudioController(ISpeechToText audioService)
    {
        _speechToText = audioService;
    }

    /// <summary>
    /// Transcribes the audio identified by <paramref name="audioInputString"/> and returns the text.
    /// In DEBUG builds the elapsed time is also written to the console.
    /// </summary>
    /// <param name="audioInputString">Value handed to the speech-to-text service as the file path.</param>
    /// <returns>200 OK with the transcript text.</returns>
    // NOTE(review): the query value is passed straight through as a server-side file path — confirm this is intended.
    [HttpGet("audio/transcript")]
    public async Task<IActionResult> GetTextFromAudioController(string audioInputString)
    {
#if DEBUG
        var timer = Stopwatch.StartNew();
#endif
        string transcript = await _speechToText.AudioToTextTranscript(audioInputString);
#if DEBUG
        timer.Stop();
        TimeSpan elapsed = timer.Elapsed;
        int hundredths = elapsed.Milliseconds / 10;
        // Same hh:mm:ss.ff layout as before, built with interpolation.
        string elapsedTime = $"{elapsed.Hours:00}:{elapsed.Minutes:00}:{elapsed.Seconds:00}.{hundredths:00}";
        Console.WriteLine("RunTime " + elapsedTime);
#endif
        return Ok(transcript);
    }
}
} |
22 changes: 22 additions & 0 deletions
22
src/Plugins/BotSharp.Plugin.AudioHandler/Enums/AudioType.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
using System; | ||
using System.Collections.Generic; | ||
using System.Linq; | ||
using System.Runtime.CompilerServices; | ||
using System.Text; | ||
using System.Threading.Tasks; | ||
using Whisper.net.Wave; | ||
|
||
namespace BotSharp.Plugin.AudioHandler.Enums
{
    /// <summary>
    /// Audio formats recognised by the plugin. Member names are lowercase because
    /// they are used verbatim as file extensions.
    /// </summary>
    public enum AudioType
    {
        wav,
        mp3,
    }

    /// <summary>
    /// Helper methods for <see cref="AudioType"/>.
    /// </summary>
    public static class AudioTypeExtensions
    {
        /// <summary>
        /// Returns the member name prefixed with a dot, e.g. <c>.wav</c>.
        /// </summary>
        /// <param name="audioType">Audio type to convert.</param>
        /// <returns>The dotted file extension.</returns>
        public static string ToFileExtension(this AudioType audioType)
        {
            return "." + audioType.ToString();
        }
    }
}
|
68 changes: 68 additions & 0 deletions
68
src/Plugins/BotSharp.Plugin.AudioHandler/Functions/AudioProcessUtilities.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
using BotSharp.Plugin.AudioHandler.Enums; | ||
using NAudio; | ||
using NAudio.Wave; | ||
using NAudio.Wave.SampleProviders; | ||
|
||
namespace BotSharp.Plugin.AudioHandler.Functions; | ||
|
||
/// <summary>
/// Opens local audio files and returns them as WAV streams for transcription.
/// Audio that is not already 16 kHz / 16-bit PCM WAV is converted in memory.
/// </summary>
public class AudioProcessUtilities : IAudioProcessUtilities
{
    /// <summary>Sample rate, in Hz, that converted audio is resampled to.</summary>
    private const int TargetSampleRate = 16000;

    public AudioProcessUtilities()
    {
    }

    /// <summary>
    /// Decodes an MP3 file into an in-memory 16-bit WAV stream at 16 kHz.
    /// </summary>
    /// <param name="mp3FileName">Path of the MP3 file to decode.</param>
    /// <returns>A <see cref="MemoryStream"/> positioned at the start of the WAV data.</returns>
    /// <remarks>
    /// The MP3 is always re-encoded: returning the raw file stream (as was previously done
    /// for MP3s that were already 16 kHz) would hand MP3 bytes to a consumer that is fed WAV.
    /// </remarks>
    public Stream ConvertMp3ToStream(string mp3FileName)
    {
        // The source file is only needed while decoding, so it is closed before returning.
        using var fileStream = File.OpenRead(mp3FileName);
        using var reader = new Mp3FileReader(fileStream);
        return WriteWav16(reader);
    }

    /// <summary>
    /// Returns a WAV stream for the given file. A file that is already 16 kHz / 16-bit PCM
    /// is returned as-is (rewound to the start); anything else is converted in memory.
    /// </summary>
    /// <param name="wavFileName">Path of the WAV file to open.</param>
    /// <returns>
    /// Either the opened file stream or a <see cref="MemoryStream"/> with the converted audio.
    /// The caller owns and must dispose the returned stream.
    /// </returns>
    public Stream ConvertWavToStream(string wavFileName)
    {
        var fileStream = File.OpenRead(wavFileName);
        var fileStreamReturned = false;
        try
        {
            using (var reader = new WaveFileReader(fileStream))
            {
                var format = reader.WaveFormat;
                var usableAsIs = format.SampleRate == TargetSampleRate
                    && format.Encoding == WaveFormatEncoding.Pcm
                    && format.BitsPerSample == 16;

                if (!usableAsIs)
                {
                    return WriteWav16(reader);
                }
            }

            // The reader only inspected the header; rewind so the consumer sees the whole file.
            fileStream.Seek(0, SeekOrigin.Begin);
            fileStreamReturned = true;
            return fileStream;
        }
        finally
        {
            // Close the file unless ownership was handed to the caller
            // (covers the conversion path and any exception while reading).
            if (!fileStreamReturned)
            {
                fileStream.Dispose();
            }
        }
    }

    /// <summary>
    /// Opens an audio file, choosing the conversion from its file extension.
    /// </summary>
    /// <param name="fileName">Path of a <c>.mp3</c> or <c>.wav</c> file.</param>
    /// <returns>A WAV stream for the file; the caller must dispose it.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="fileName"/> is null or empty.</exception>
    /// <exception cref="NotSupportedException">The file extension is not a known <see cref="AudioType"/>.</exception>
    public Stream ConvertToStream(string fileName)
    {
        if (string.IsNullOrEmpty(fileName))
        {
            // The first constructor argument is the parameter name, not the message.
            throw new ArgumentNullException(nameof(fileName), "fileName is Null");
        }

        string fileExtension = Path.GetExtension(fileName).ToLowerInvariant().TrimStart('.');
        if (!Enum.TryParse<AudioType>(fileExtension, out AudioType fileType))
        {
            throw new NotSupportedException($"File extension: '{fileExtension}' not supported");
        }

        return fileType switch
        {
            AudioType.mp3 => ConvertMp3ToStream(fileName),
            AudioType.wav => ConvertWavToStream(fileName),
            _ => throw new NotSupportedException("File extension not supported"),
        };
    }

    /// <summary>
    /// Writes the reader's audio as 16-bit WAV into a new memory stream,
    /// resampling to <see cref="TargetSampleRate"/> when the source rate differs.
    /// </summary>
    private static MemoryStream WriteWav16(WaveStream reader)
    {
        ISampleProvider samples = reader.ToSampleProvider();
        if (samples.WaveFormat.SampleRate != TargetSampleRate)
        {
            samples = new WdlResamplingSampleProvider(samples, TargetSampleRate);
        }

        var wavStream = new MemoryStream();
        WaveFileWriter.WriteWavFileToStream(wavStream, samples.ToWaveProvider16());
        wavStream.Seek(0, SeekOrigin.Begin);
        return wavStream;
    }
}
10 changes: 10 additions & 0 deletions
10
src/Plugins/BotSharp.Plugin.AudioHandler/Functions/IAudioProcessUtilities.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
|
||
namespace BotSharp.Plugin.AudioHandler.Functions | ||
{ | ||
/// <summary>
/// Helpers that open audio files and return their content as a <see cref="Stream"/>.
/// </summary>
public interface IAudioProcessUtilities | ||
{ | ||
/// <summary>Opens the MP3 file at <paramref name="mp3FileName"/> and returns its audio as a stream.</summary>
Stream ConvertMp3ToStream(string mp3FileName); | ||
/// <summary>Opens the WAV file at <paramref name="wavFileName"/> and returns its audio as a stream.</summary>
Stream ConvertWavToStream(string wavFileName); | ||
/// <summary>Returns an audio stream for <paramref name="fileName"/>; the format is identified from the file name.</summary>
Stream ConvertToStream(string fileName); | ||
} | ||
} |
19 changes: 19 additions & 0 deletions
19
src/Plugins/BotSharp.Plugin.AudioHandler/Models/AudioOutput.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
using System; | ||
using System.Collections.Generic; | ||
using System.Linq; | ||
using System.Text; | ||
using System.Threading.Tasks; | ||
using Whisper.net; | ||
|
||
namespace BotSharp.Plugin.AudioHandler.Models | ||
{ | ||
/// <summary>
/// Holds the segments produced by a transcription run.
/// </summary>
public class AudioOutput
{
    /// <summary>
    /// Transcribed segments. Starts as an empty list so a fresh instance is safe to use.
    /// </summary>
    public List<SegmentData> Segments { get; set; } = new();

    /// <summary>
    /// Joins the text of all segments with a single space.
    /// </summary>
    /// <returns>The combined text, or an empty string when there are no segments.</returns>
    public override string ToString()
    {
        // Segments has a public setter, so tolerate it having been set to null.
        if (Segments is null || Segments.Count == 0)
        {
            return string.Empty;
        }

        return string.Join(" ", Segments.Select(segment => segment.Text));
    }
}
} |
82 changes: 82 additions & 0 deletions
82
src/Plugins/BotSharp.Plugin.AudioHandler/Provider/NativeWhisperProvider.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
using Whisper.net; | ||
using Whisper.net.Ggml; | ||
|
||
namespace BotSharp.Plugin.AudioHandler.Provider; | ||
|
||
/// <summary>
/// Native Whisper provider for speech to text conversion
/// </summary>
public class NativeWhisperProvider : ISpeechToText
{
    // Serializes the one-time model download and processor creation across concurrent callers.
    private static readonly SemaphoreSlim _initLock = new(1, 1);

    // Static, so it is created once and then reused by every provider instance.
    private static WhisperProcessor? _processor;

    private readonly IAudioProcessUtilities _audioProcessUtilities;

    private string? _modelName;

    public NativeWhisperProvider(IAudioProcessUtilities audioProcessUtilities)
    {
        _audioProcessUtilities = audioProcessUtilities;
    }

    /// <summary>
    /// Transcribes a local audio file to text.
    /// </summary>
    /// <param name="filePath">Path of the audio file; its extension must name an <see cref="AudioType"/>.</param>
    /// <returns>The text of all transcribed segments joined by spaces.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="filePath"/> is null.</exception>
    /// <exception cref="NotSupportedException">The file extension is not a supported audio type.</exception>
    /// <exception cref="InvalidOperationException">The model could not be loaded or the file could not be opened as a stream.</exception>
    public async Task<string> AudioToTextTranscript(string filePath)
    {
        if (filePath is null)
        {
            throw new ArgumentNullException(nameof(filePath));
        }

        string fileExtension = Path.GetExtension(filePath);
        if (!Enum.TryParse<AudioType>(fileExtension.TrimStart('.').ToLowerInvariant(), out _))
        {
            throw new NotSupportedException($"Unsupported audio type: {fileExtension}");
        }

        var processor = await InitModel();

        using var stream = _audioProcessUtilities.ConvertToStream(filePath);
        if (stream == null)
        {
            throw new InvalidOperationException($"Failed to convert {fileExtension} to stream");
        }

        var textResult = new List<SegmentData>();

        // NOTE(review): the processor is shared between requests; confirm whether
        // overlapping ProcessAsync calls on one processor are supported.
        await foreach (var result in processor.ProcessAsync(stream).ConfigureAwait(false))
        {
            textResult.Add(result);
        }

        var audioOutput = new AudioOutput
        {
            Segments = textResult
        };
        return audioOutput.ToString();
    }

    /// <summary>
    /// Makes sure the model file for <paramref name="modelType"/> exists locally, downloading it if needed.
    /// </summary>
    /// <returns>The model file name.</returns>
    private static async Task<string> LoadWhisperModel(GgmlType modelType)
    {
        var modelName = $"ggml-{modelType}.bin";
        try
        {
            if (!File.Exists(modelName))
            {
                // Download to a temporary name first so an interrupted download never
                // leaves a truncated file that a later File.Exists check would accept.
                var tempName = $"{modelName}.{Guid.NewGuid():N}.tmp";
                try
                {
                    // Download the model that was asked for, not a hard-coded one.
                    using (var modelStream = await WhisperGgmlDownloader.GetGgmlModelAsync(modelType))
                    using (var fileWriter = File.Create(tempName))
                    {
                        await modelStream.CopyToAsync(fileWriter);
                    }

                    File.Move(tempName, modelName, overwrite: true);
                }
                finally
                {
                    if (File.Exists(tempName))
                    {
                        File.Delete(tempName);
                    }
                }
            }
        }
        catch (Exception ex)
        {
            // Keep the original exception as InnerException so the root cause is not lost.
            throw new InvalidOperationException($"Failed to load whisper model: {ex.Message}", ex);
        }

        return modelName;
    }

    /// <summary>
    /// Returns the shared processor, creating it on first use.
    /// </summary>
    /// <param name="modelType">Model to load when the processor does not exist yet.</param>
    private async Task<WhisperProcessor> InitModel(GgmlType modelType = GgmlType.TinyEn)
    {
        var existing = _processor;
        if (existing is not null)
        {
            return existing;
        }

        await _initLock.WaitAsync();
        try
        {
            // Another caller may have finished initialization while we were waiting.
            existing = _processor;
            if (existing is not null)
            {
                return existing;
            }

            _modelName = await LoadWhisperModel(modelType);
            var created = WhisperFactory
                .FromPath(_modelName)
                .CreateBuilder()
                .WithLanguage("en")
                .Build();

            _processor = created;
            return created;
        }
        finally
        {
            _initLock.Release();
        }
    }
}
6 changes: 6 additions & 0 deletions
6
src/Plugins/BotSharp.Plugin.AudioHandler/Settings/AudioHandlerSettings.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
namespace BotSharp.Plugin.AudioHandler.Settings | ||
{ | ||
/// <summary>
/// Settings type for the audio handler plugin; it currently declares no members.
/// </summary>
public class AudioHandlerSettings | ||
{ | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
global using System; | ||
global using System.Collections.Generic; | ||
global using System.Text; | ||
global using System.Linq; | ||
global using System.Text.Json; | ||
global using System.Linq; | ||
global using System.Text; | ||
global using System.Threading.Tasks; | ||
global using System.Threading.Tasks; | ||
|
||
global using BotSharp.Abstraction.Plugins; | ||
global using BotSharp.Abstraction.MLTasks; | ||
global using BotSharp.Plugin.AudioHandler.Enums; | ||
global using BotSharp.Plugin.AudioHandler.Functions; | ||
global using BotSharp.Plugin.AudioHandler.Models; | ||
|
||
global using Microsoft.Extensions.Configuration; | ||
global using Microsoft.Extensions.DependencyInjection; | ||
global using Microsoft.AspNetCore.Http; | ||
global using Microsoft.AspNetCore.Authorization; | ||
global using Microsoft.AspNetCore.Mvc; |
9 changes: 9 additions & 0 deletions
9
src/Plugins/BotSharp.Plugin.OpenAI/Providers/Audio/SpeechToTextProvider.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
namespace BotSharp.Plugin.OpenAI.Providers.Audio; | ||
|
||
/// <summary>
/// Placeholder <see cref="ISpeechToText"/> implementation; transcription is not implemented yet.
/// </summary>
public class SpeechToTextProvider : ISpeechToText
{
    /// <summary>
    /// Not implemented.
    /// </summary>
    /// <param name="filePath">Path of the audio file to transcribe (unused).</param>
    /// <exception cref="NotImplementedException">Always thrown.</exception>
    public Task<string> AudioToTextTranscript(string filePath)
        => throw new NotImplementedException();
}