diff --git a/src/Infrastructure/BotSharp.Abstraction/MLTasks/ISpeechToText.cs b/src/Infrastructure/BotSharp.Abstraction/MLTasks/ISpeechToText.cs
new file mode 100644
index 000000000..42ee4ddd9
--- /dev/null
+++ b/src/Infrastructure/BotSharp.Abstraction/MLTasks/ISpeechToText.cs
@@ -0,0 +1,26 @@
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+
+namespace BotSharp.Abstraction.MLTasks;
+
+/// <summary>
+/// Contract for providers that turn an audio file into a text transcript.
+/// </summary>
+public interface ISpeechToText
+{
+    /// <summary>
+    /// Transcribes the audio file located at <paramref name="filePath"/>.
+    /// </summary>
+    /// <param name="filePath">Path of the audio file to transcribe.</param>
+    /// <returns>A task that completes when the transcription has finished.</returns>
+    // NOTE(review): generic type arguments look stripped throughout this paste; the
+    // return type is presumably Task<string> (callers use the awaited value) - confirm.
+    Task AudioToTextTranscript(string filePath);
+
+    // TODO: planned overload that accepts the audio as a stream:
+    // Task AudioToTextTranscript(Stream stream);
+}
diff --git a/src/Plugins/BotSharp.Plugin.AudioHandler/AudioHandlerPlugin.cs b/src/Plugins/BotSharp.Plugin.AudioHandler/AudioHandlerPlugin.cs
new file mode 100644
index 000000000..442728876
--- /dev/null
+++ b/src/Plugins/BotSharp.Plugin.AudioHandler/AudioHandlerPlugin.cs
@@ -0,0 +1,36 @@
+using BotSharp.Plugin.AudioHandler.Settings;
+using BotSharp.Plugin.AudioHandler.Provider;
+using BotSharp.Abstraction.Settings;
+
+namespace BotSharp.Plugin.AudioHandler;
+
+/// <summary>
+/// Plugin descriptor that wires the audio-handling services into the DI container.
+/// </summary>
+public class AudioHandlerPlugin : IBotSharpPlugin
+{
+    public string Id => "9d22014c-4f45-466a-9e82-a74e67983df8";
+    public string Name => "Audio Handler";
+    public string Description => "Process audio input and transform it into text output.";
+
+    /// <summary>
+    /// Registers the plugin's settings object and its services, all with scoped lifetime.
+    /// </summary>
+    /// <param name="services">Service collection the registrations are added to.</param>
+    /// <param name="config">Application configuration; not read by this method.</param>
+    public void RegisterDI(IServiceCollection services, IConfiguration config)
+    {
+        // NOTE(review): the generic type arguments of the calls below are not visible in
+        // this paste (they look stripped). Presumably the setting service binds
+        // AudioHandlerSettings and the two registrations map ISpeechToText and
+        // IAudioProcessUtilities to their implementations - confirm.
+
+        // The "AudioHandler" section is resolved through the setting service once per
+        // scope instead of being bound from config a single time as a singleton.
+        services.AddScoped(provider => provider.GetRequiredService().Bind("AudioHandler"));
+
+        services.AddScoped();
+        services.AddScoped();
+    }
+}
+
diff --git a/src/Plugins/BotSharp.Plugin.AudioHandler/BotSharp.Plugin.AudioHandler.csproj b/src/Plugins/BotSharp.Plugin.AudioHandler/BotSharp.Plugin.AudioHandler.csproj
new file mode 100644
index 000000000..7218d40c1
--- /dev/null
+++ 
b/src/Plugins/BotSharp.Plugin.AudioHandler/BotSharp.Plugin.AudioHandler.csproj
@@ -0,0 +1,26 @@
+
+
+
+    $(TargetFramework)
+    enable
+    enable
+    $(LangVersion)
+    $(BotSharpVersion)
+    $(GeneratePackageOnBuild)
+    $(GenerateDocumentationFile)
+    $(SolutionDir)packages
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/src/Plugins/BotSharp.Plugin.AudioHandler/Controllers/AudioController.cs b/src/Plugins/BotSharp.Plugin.AudioHandler/Controllers/AudioController.cs
new file mode 100644
index 000000000..cc2b10f10
--- /dev/null
+++ b/src/Plugins/BotSharp.Plugin.AudioHandler/Controllers/AudioController.cs
@@ -0,0 +1,60 @@
+using System;
+using System.Collections.Generic;
+using System.Diagnostics;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+using BotSharp.Plugin.AudioHandler.Models;
+using BotSharp.Plugin.AudioHandler.Provider;
+
+namespace BotSharp.Plugin.AudioHandler.Controllers
+{
+    /// <summary>
+    /// HTTP endpoint that transcribes an audio file through the injected
+    /// <see cref="ISpeechToText"/> provider.
+    /// </summary>
+    /// <remarks>Anonymous access is only allowed in DEBUG builds.</remarks>
+#if DEBUG
+    [AllowAnonymous]
+#endif
+    [ApiController]
+    public class AudioController : ControllerBase
+    {
+        private readonly ISpeechToText _nativeWhisperProvider;
+
+        public AudioController(ISpeechToText audioService)
+        {
+            _nativeWhisperProvider = audioService;
+        }
+
+        // NOTE(review): the return type's generic argument looks stripped from this paste
+        // (presumably Task<IActionResult>) - confirm against the original commit.
+        // SECURITY(review): the value comes straight from the query string and is handed
+        // to the provider as a file path; consider restricting it to a known directory.
+        /// <summary>
+        /// Transcribes the audio file identified by <paramref name="audioInputString"/>.
+        /// </summary>
+        /// <param name="audioInputString">Path of the audio file, passed to the provider as-is.</param>
+        /// <returns>400 when no input is supplied; otherwise 200 with the provider's result.</returns>
+        [HttpGet("audio/transcript")]
+        public async Task GetTextFromAudioController(string audioInputString)
+        {
+            // Reject missing input up front instead of letting the provider fail on it.
+            if (string.IsNullOrWhiteSpace(audioInputString))
+            {
+                return BadRequest("audioInputString is required.");
+            }
+
+#if DEBUG
+            var stopWatch = Stopwatch.StartNew();
+#endif
+            var result = await _nativeWhisperProvider.AudioToTextTranscript(audioInputString);
+#if DEBUG
+            stopWatch.Stop();
+            // hh:mm:ss.ff - hours, minutes, seconds and hundredths of a second.
+            Console.WriteLine("RunTime " + stopWatch.Elapsed.ToString(@"hh\:mm\:ss\.ff"));
+#endif
+            return Ok(result);
+        }
+    }
+}
diff --git a/src/Plugins/BotSharp.Plugin.AudioHandler/Enums/AudioType.cs b/src/Plugins/BotSharp.Plugin.AudioHandler/Enums/AudioType.cs
new file mode 100644
index 000000000..356adfaa1
--- /dev/null
+++ b/src/Plugins/BotSharp.Plugin.AudioHandler/Enums/AudioType.cs
@@ 
-0,0 +1,57 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Runtime.CompilerServices;
+using System.Text;
+using System.Threading.Tasks;
+using Whisper.net.Wave;
+
+namespace BotSharp.Plugin.AudioHandler.Enums
+{
+    /// <summary>
+    /// Audio file types the plugin accepts. Member names double as file extensions.
+    /// </summary>
+    public enum AudioType
+    {
+        wav,
+        mp3,
+    }
+
+    public static class AudioTypeExtensions
+    {
+        /// <summary>Returns the file extension, including the leading dot (e.g. ".wav").</summary>
+        public static string ToFileExtension(this AudioType audioType) => $".{audioType}";
+
+        /// <summary>
+        /// Maps the extension of <paramref name="fileName"/> to an <see cref="AudioType"/>.
+        /// </summary>
+        /// <remarks>
+        /// Only exact member names match (ignoring case). <c>Enum.TryParse</c> is avoided on
+        /// purpose: it also accepts numeric text, so a file ending in ".0" would be read as wav.
+        /// </remarks>
+        /// <param name="fileName">File name or path; may be null or empty.</param>
+        /// <param name="audioType">The matched type when the method returns true.</param>
+        /// <returns>True when the extension names a supported audio type.</returns>
+        public static bool TryParseFileName(string fileName, out AudioType audioType)
+        {
+            audioType = default;
+            if (string.IsNullOrEmpty(fileName))
+            {
+                return false;
+            }
+
+            var extension = System.IO.Path.GetExtension(fileName).TrimStart('.');
+            foreach (AudioType candidate in Enum.GetValues(typeof(AudioType)))
+            {
+                if (string.Equals(candidate.ToString(), extension, StringComparison.OrdinalIgnoreCase))
+                {
+                    audioType = candidate;
+                    return true;
+                }
+            }
+
+            return false;
+        }
+    }
+}
+
diff --git a/src/Plugins/BotSharp.Plugin.AudioHandler/Functions/AudioProcessUtilities.cs b/src/Plugins/BotSharp.Plugin.AudioHandler/Functions/AudioProcessUtilities.cs
new file mode 100644
index 000000000..a65443596
--- /dev/null
+++ b/src/Plugins/BotSharp.Plugin.AudioHandler/Functions/AudioProcessUtilities.cs
@@ -0,0 +1,115 @@
+using BotSharp.Plugin.AudioHandler.Enums;
+using NAudio;
+using NAudio.Wave;
+using NAudio.Wave.SampleProviders;
+
+namespace BotSharp.Plugin.AudioHandler.Functions;
+
+/// <summary>
+/// Opens audio files and returns them as streams, converting to 16 kHz WAV where needed.
+/// </summary>
+public class AudioProcessUtilities : IAudioProcessUtilities
+{
+    // Sample rate the returned audio is normalised to.
+    private const int TargetSampleRate = 16000;
+
+    public AudioProcessUtilities()
+    {
+    }
+
+    /// <summary>
+    /// Decodes an MP3 file into an in-memory 16 kHz, 16-bit WAV stream.
+    /// </summary>
+    /// <param name="mp3FileName">Path of the MP3 file.</param>
+    /// <returns>A WAV stream positioned at its start; the caller disposes it.</returns>
+    public Stream ConvertMp3ToStream(string mp3FileName)
+    {
+        // Always decode: handing back the raw file stream for an MP3 that is already
+        // 16 kHz would return MP3 frames where every other path returns WAV data.
+        // The file is closed here because the decoded copy lives in memory.
+        using var fileStream = File.OpenRead(mp3FileName);
+        using var reader = new Mp3FileReader(fileStream);
+        return WriteWav(reader);
+    }
+
+    /// <summary>
+    /// Opens a WAV file and resamples it to 16 kHz when its sample rate differs.
+    /// </summary>
+    /// <param name="wavFileName">Path of the WAV file.</param>
+    /// <returns>
+    /// The file stream itself when the file is already 16 kHz, otherwise an in-memory
+    /// resampled copy. The stream is positioned at its start; the caller disposes it.
+    /// </returns>
+    public Stream ConvertWavToStream(string wavFileName)
+    {
+        var fileStream = File.OpenRead(wavFileName);
+        var handedToCaller = false;
+        try
+        {
+            using var reader = new WaveFileReader(fileStream);
+            if (reader.WaveFormat.SampleRate != TargetSampleRate)
+            {
+                return WriteWav(reader);
+            }
+
+            // Already at the target rate: rewind past the header the reader consumed
+            // and pass the file stream through unchanged.
+            fileStream.Seek(0, SeekOrigin.Begin);
+            handedToCaller = true;
+            return fileStream;
+        }
+        finally
+        {
+            // Close the file after resampling or on failure; it used to stay open.
+            if (!handedToCaller)
+            {
+                fileStream.Dispose();
+            }
+        }
+    }
+
+    /// <summary>
+    /// Picks the converter that matches the extension of <paramref name="fileName"/>.
+    /// </summary>
+    /// <param name="fileName">Path of a .wav or .mp3 file.</param>
+    /// <returns>The stream produced by the matching converter.</returns>
+    /// <exception cref="ArgumentNullException"><paramref name="fileName"/> is null or empty.</exception>
+    /// <exception cref="NotSupportedException">The extension is not a supported audio type.</exception>
+    public Stream ConvertToStream(string fileName)
+    {
+        if (string.IsNullOrEmpty(fileName))
+        {
+            // The first constructor argument is the parameter name, not the message.
+            throw new ArgumentNullException(nameof(fileName), "fileName is Null");
+        }
+
+        if (!AudioTypeExtensions.TryParseFileName(fileName, out var fileType))
+        {
+            var fileExtension = Path.GetExtension(fileName).ToLowerInvariant().TrimStart('.');
+            throw new NotSupportedException($"File extension: '{fileExtension}' not supported");
+        }
+
+        return fileType switch
+        {
+            AudioType.mp3 => ConvertMp3ToStream(fileName),
+            AudioType.wav => ConvertWavToStream(fileName),
+            _ => throw new NotSupportedException("File extension not supported"),
+        };
+    }
+
+    // Writes the audio of the given reader into memory as 16-bit WAV, resampling
+    // only when the source is not already at the target rate.
+    private static MemoryStream WriteWav(WaveStream source)
+    {
+        ISampleProvider samples = source.ToSampleProvider();
+        if (source.WaveFormat.SampleRate != TargetSampleRate)
+        {
+            samples = new WdlResamplingSampleProvider(samples, TargetSampleRate);
+        }
+
+        var wavStream = new MemoryStream();
+        WaveFileWriter.WriteWavFileToStream(wavStream, samples.ToWaveProvider16());
+        wavStream.Seek(0, SeekOrigin.Begin);
+        return wavStream;
+    }
+}
diff --git a/src/Plugins/BotSharp.Plugin.AudioHandler/Functions/IAudioProcessUtilities.cs b/src/Plugins/BotSharp.Plugin.AudioHandler/Functions/IAudioProcessUtilities.cs
new file mode 100644
index 000000000..a3c8243b3
--- /dev/null
+++ b/src/Plugins/BotSharp.Plugin.AudioHandler/Functions/IAudioProcessUtilities.cs
@@ -0,0 +1,18 @@
+
+namespace BotSharp.Plugin.AudioHandler.Functions
+{
+    /// <summary>
+    /// Helpers that open an audio file and return its content as a stream.
+    /// </summary>
+    public interface IAudioProcessUtilities
+    {
+        /// <summary>Returns the audio of an MP3 file as a stream.</summary>
+        Stream ConvertMp3ToStream(string mp3FileName);
+
+        /// <summary>Returns the audio of a WAV file as a stream.</summary>
+        Stream ConvertWavToStream(string wavFileName);
+
+        /// <summary>Returns the audio of a file whose type is taken from its extension.</summary>
+        Stream ConvertToStream(string fileName);
+    }
+}
\ No newline at end of file
diff --git a/src/Plugins/BotSharp.Plugin.AudioHandler/Models/AudioOutput.cs b/src/Plugins/BotSharp.Plugin.AudioHandler/Models/AudioOutput.cs
new file mode 100644
index 000000000..1b58f455d
--- /dev/null
+++ b/src/Plugins/BotSharp.Plugin.AudioHandler/Models/AudioOutput.cs
@@ -0,0 +1,34 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+using Whisper.net;
+
+namespace BotSharp.Plugin.AudioHandler.Models
+{
+    /// <summary>
+    /// Result of a transcription: the recognised segments in the order they were added.
+    /// </summary>
+    public class AudioOutput
+    {
+        // NOTE(review): the list's element type argument is not visible in this paste
+        // (presumably Whisper.net's SegmentData) - confirm against the original commit.
+        public List Segments { get; set; } = new();
+
+        /// <summary>
+        /// Joins the text of all segments with single spaces.
+        /// </summary>
+        /// <returns>The joined text, or an empty string when there are no segments.</returns>
+        public override string ToString()
+        {
+            // The setter is public, so guard against null as well as empty.
+            if (Segments == null || Segments.Count == 0)
+            {
+                return string.Empty;
+            }
+
+            return string.Join(" ", Segments.Select(x => x.Text));
+        }
+    }
+}
diff --git a/src/Plugins/BotSharp.Plugin.AudioHandler/Provider/NativeWhisperProvider.cs b/src/Plugins/BotSharp.Plugin.AudioHandler/Provider/NativeWhisperProvider.cs
new file mode 100644
index 000000000..1ec23e1db
--- /dev/null
+++ b/src/Plugins/BotSharp.Plugin.AudioHandler/Provider/NativeWhisperProvider.cs
@@ -0,0 +1,122 @@
+using Whisper.net;
+using Whisper.net.Ggml;
+
+namespace BotSharp.Plugin.AudioHandler.Provider;
+
+/// <summary>
+/// Native Whisper provider for speech to text conversion
+/// </summary>
+public class NativeWhisperProvider : ISpeechToText
+{
+    // One processor is shared by all instances; this lock serialises its creation so
+    // concurrent first calls cannot download the model file or build it twice.
+    private static readonly System.Threading.SemaphoreSlim _initLock = new(1, 1);
+    private static WhisperProcessor _processor;
+
+    private readonly IAudioProcessUtilities _audioProcessUtilities;
+
+    public NativeWhisperProvider(IAudioProcessUtilities audioProcessUtilities)
+    {
+        _audioProcessUtilities = audioProcessUtilities;
+    }
+
+    // NOTE(review): generic type arguments look stripped from this paste; the return
+    // type is presumably Task<string> (the method returns a string) - confirm.
+    // NOTE(review): the shared processor may be used by several requests at once;
+    // whether WhisperProcessor tolerates that is not visible here - confirm.
+    /// <summary>
+    /// Transcribes the .wav or .mp3 file at <paramref name="filePath"/>.
+    /// </summary>
+    /// <param name="filePath">Path of the audio file.</param>
+    /// <returns>The text of all recognised segments, joined by single spaces.</returns>
+    /// <exception cref="NotSupportedException">The file extension is not a supported audio type.</exception>
+    public async Task AudioToTextTranscript(string filePath)
+    {
+        // Validate first so an unsupported file never triggers the model download.
+        if (!AudioTypeExtensions.TryParseFileName(filePath, out _))
+        {
+            throw new NotSupportedException($"Unsupported audio type: {Path.GetExtension(filePath)}");
+        }
+
+        await InitModel();
+
+        using var stream = _audioProcessUtilities.ConvertToStream(filePath);
+        if (stream == null)
+        {
+            throw new InvalidOperationException($"Failed to convert {Path.GetExtension(filePath)} to stream");
+        }
+
+        var audioOutput = new AudioOutput();
+        await foreach (var segment in _processor.ProcessAsync(stream).ConfigureAwait(false))
+        {
+            audioOutput.Segments.Add(segment);
+        }
+
+        return audioOutput.ToString();
+    }
+
+    /// <summary>
+    /// Makes sure the ggml model file for <paramref name="modelType"/> exists locally,
+    /// downloading it when it is missing.
+    /// </summary>
+    /// <returns>The name of the model file.</returns>
+    private static async Task<string> LoadWhisperModel(GgmlType modelType)
+    {
+        var modelName = $"ggml-{modelType}.bin";
+        if (File.Exists(modelName))
+        {
+            return modelName;
+        }
+
+        // Download to a temporary name and move it into place afterwards, so an
+        // interrupted download never leaves a truncated file under the final name.
+        var tempName = modelName + ".download";
+        try
+        {
+            // Fetch the requested model type; the file name is derived from it as well.
+            using (var modelStream = await WhisperGgmlDownloader.GetGgmlModelAsync(modelType))
+            using (var fileWriter = File.Create(tempName))
+            {
+                await modelStream.CopyToAsync(fileWriter);
+            }
+
+            File.Move(tempName, modelName, overwrite: true);
+            return modelName;
+        }
+        catch (Exception ex)
+        {
+            // Keep the original failure as the inner exception so its details survive.
+            throw new InvalidOperationException($"Failed to load whisper model: {ex.Message}", ex);
+        }
+    }
+
+    /// <summary>
+    /// Creates the shared processor on first use.
+    /// </summary>
+    private static async Task InitModel(GgmlType modelType = GgmlType.TinyEn)
+    {
+        if (_processor != null)
+        {
+            return;
+        }
+
+        await _initLock.WaitAsync();
+        try
+        {
+            // Check again: another caller may have finished while this one waited.
+            if (_processor == null)
+            {
+                var modelName = await LoadWhisperModel(modelType);
+                _processor = WhisperFactory
+                    .FromPath(modelName)
+                    .CreateBuilder()
+                    .WithLanguage("en")
+                    .Build();
+            }
+        }
+        finally
+        {
+            _initLock.Release();
+        }
+    }
+}
diff --git a/src/Plugins/BotSharp.Plugin.AudioHandler/Settings/AudioHandlerSettings.cs b/src/Plugins/BotSharp.Plugin.AudioHandler/Settings/AudioHandlerSettings.cs
new file mode 100644
index 000000000..4ace63db4
--- /dev/null
+++ b/src/Plugins/BotSharp.Plugin.AudioHandler/Settings/AudioHandlerSettings.cs
@@ -0,0 +1,6 @@
+namespace BotSharp.Plugin.AudioHandler.Settings
+{
+    public class AudioHandlerSettings
+    {
+    }
+}
diff --git a/src/Plugins/BotSharp.Plugin.AudioHandler/Using.cs b/src/Plugins/BotSharp.Plugin.AudioHandler/Using.cs
new file mode 100644
index 000000000..25be4aa66
--- /dev/null
+++ b/src/Plugins/BotSharp.Plugin.AudioHandler/Using.cs
@@ -0,0 +1,21 @@
+global using System;
+global using System.Collections.Generic;
+global using System.Text;
+global using System.Linq;
+global using System.Text.Json;
+global using System.Linq;
+global using System.Text;
+global using System.Threading.Tasks;
+global using System.Threading.Tasks;
+
+global using BotSharp.Abstraction.Plugins;
+global using BotSharp.Abstraction.MLTasks;
+global using BotSharp.Plugin.AudioHandler.Enums;
+global using BotSharp.Plugin.AudioHandler.Functions;
+global using BotSharp.Plugin.AudioHandler.Models;
+
+global using Microsoft.Extensions.Configuration;
+global using Microsoft.Extensions.DependencyInjection;
+global using Microsoft.AspNetCore.Http;
+global using Microsoft.AspNetCore.Authorization;
+global using Microsoft.AspNetCore.Mvc;
\ No newline at end of file
diff --git a/src/Plugins/BotSharp.Plugin.OpenAI/Providers/Audio/SpeechToTextProvider.cs 
b/src/Plugins/BotSharp.Plugin.OpenAI/Providers/Audio/SpeechToTextProvider.cs
new file mode 100644
index 000000000..0a8cbb0af
--- /dev/null
+++ b/src/Plugins/BotSharp.Plugin.OpenAI/Providers/Audio/SpeechToTextProvider.cs
@@ -0,0 +1,15 @@
+namespace BotSharp.Plugin.OpenAI.Providers.Audio;
+
+/// <summary>
+/// Placeholder <see cref="ISpeechToText"/> implementation in the OpenAI plugin.
+/// </summary>
+public class SpeechToTextProvider : ISpeechToText
+{
+    /// <summary>
+    /// Not implemented yet.
+    /// </summary>
+    /// <param name="filePath">Path of the audio file; currently unused.</param>
+    /// <exception cref="NotImplementedException">Always thrown.</exception>
+    public Task AudioToTextTranscript(string filePath)
+        => throw new NotImplementedException();
+}