Merge pull request #566 from evan-cao-wb/features/add-audio-handler

add audio service to transcribe local mp3/wav file
SciSharp · Jul 30, 2024 · e497233 · e497233
2 parents d17574a + 8e097d1
commit e497233
Show file tree

Hide file tree

Showing 12 changed files with 350 additions and 0 deletions.
diff --git a/src/Infrastructure/BotSharp.Abstraction/MLTasks/ISpeechToText.cs b/src/Infrastructure/BotSharp.Abstraction/MLTasks/ISpeechToText.cs
@@ -0,0 +1,14 @@
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+
+namespace BotSharp.Abstraction.MLTasks;
+
+/// <summary>
+/// Contract for components that turn the speech in an audio file into text.
+/// </summary>
+public interface ISpeechToText
+{
+    /// <summary>
+    /// Produces a text transcript for the audio file at the given path.
+    /// </summary>
+    /// <param name="filePath">Path of the audio file to transcribe.</param>
+    /// <returns>A task whose result is the transcript text.</returns>
+    Task<string> AudioToTextTranscript(string filePath);
+
+    // A Stream-based overload has been sketched but is not part of the contract yet:
+    // Task<string> AudioToTextTranscript(Stream stream);
+}
diff --git a/src/Plugins/BotSharp.Plugin.AudioHandler/AudioHandlerPlugin.cs b/src/Plugins/BotSharp.Plugin.AudioHandler/AudioHandlerPlugin.cs
@@ -0,0 +1,29 @@
+using BotSharp.Plugin.AudioHandler.Settings;
+using BotSharp.Plugin.AudioHandler.Provider;
+using BotSharp.Abstraction.Settings;
+
+namespace BotSharp.Plugin.AudioHandler
+{
+    /// <summary>
+    /// Plugin entry point that registers the audio handling services with the DI container.
+    /// </summary>
+    public class AudioHandlerPlugin : IBotSharpPlugin
+    {
+        public string Id => "9d22014c-4f45-466a-9e82-a74e67983df8";
+
+        public string Name => "Audio Handler";
+
+        public string Description => "Process audio input and transform it into text output.";
+
+        /// <summary>
+        /// Adds the plugin's settings, speech-to-text provider and audio utilities as scoped services.
+        /// </summary>
+        /// <param name="services">Service collection the registrations are added to.</param>
+        /// <param name="config">Application configuration (not read directly by this method).</param>
+        public void RegisterDI(IServiceCollection services, IConfiguration config)
+        {
+            // Settings are resolved through ISettingService for each scope instead of being
+            // bound once from IConfiguration and registered as a singleton.
+            services.AddScoped(sp =>
+                sp.GetRequiredService<ISettingService>().Bind<AudioHandlerSettings>("AudioHandler"));
+
+            services.AddScoped<ISpeechToText, NativeWhisperProvider>();
+            services.AddScoped<IAudioProcessUtilities, AudioProcessUtilities>();
+        }
+    }
+}
+
diff --git a/src/Plugins/BotSharp.Plugin.AudioHandler/BotSharp.Plugin.AudioHandler.csproj b/src/Plugins/BotSharp.Plugin.AudioHandler/BotSharp.Plugin.AudioHandler.csproj
@@ -0,0 +1,26 @@
+<Project Sdk="Microsoft.NET.Sdk">
+
+  <PropertyGroup> <!-- Build settings; the $(...) values are not defined in this file and must be supplied from outside it. -->
+    <TargetFramework>$(TargetFramework)</TargetFramework>
+    <ImplicitUsings>enable</ImplicitUsings>
+    <Nullable>enable</Nullable>
+    <LangVersion>$(LangVersion)</LangVersion>
+    <VersionPrefix>$(BotSharpVersion)</VersionPrefix>
+    <GeneratePackageOnBuild>$(GeneratePackageOnBuild)</GeneratePackageOnBuild>
+    <GenerateDocumentationFile>$(GenerateDocumentationFile)</GenerateDocumentationFile>
+    <OutputPath>$(SolutionDir)packages</OutputPath>
+  </PropertyGroup>
+
+  <ItemGroup> <!-- NuGet dependencies of the plugin. -->
+    <PackageReference Include="Microsoft.AspNetCore.Mvc" Version="2.2.0" /> <!-- NOTE(review): 2.2.0 is an ASP.NET Core 2.x-era package; confirm this pin is intended for the target framework. -->
+    <PackageReference Include="NAudio" Version="2.2.1" /> <!-- MP3/WAV reading and resampling, used by AudioProcessUtilities. -->
+    <PackageReference Include="NAudio.Core" Version="2.2.1" /> <!-- NOTE(review): possibly already pulled in transitively by NAudio; verify before removing. -->
+    <PackageReference Include="Whisper.net" Version="1.5.0" /> <!-- Speech-to-text engine used by NativeWhisperProvider. -->
+    <PackageReference Include="Whisper.net.Runtime" Version="1.5.0" /> <!-- Presumably the native runtime for Whisper.net; keep the version in step with it. -->
+  </ItemGroup>
+
+  <ItemGroup> <!-- Project-to-project references. -->
+    <ProjectReference Include="..\..\Infrastructure\BotSharp.Core\BotSharp.Core.csproj" />
+  </ItemGroup>
+
+</Project>
diff --git a/src/Plugins/BotSharp.Plugin.AudioHandler/Controllers/AudioController.cs b/src/Plugins/BotSharp.Plugin.AudioHandler/Controllers/AudioController.cs
@@ -0,0 +1,44 @@
+using System;
+using System.Collections.Generic;
+using System.Diagnostics;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+using BotSharp.Plugin.AudioHandler.Models;
+using BotSharp.Plugin.AudioHandler.Provider;
+
+namespace BotSharp.Plugin.AudioHandler.Controllers
+{
+    /// <summary>
+    /// HTTP endpoint that returns the transcript of an audio file.
+    /// Anonymous access is only allowed in DEBUG builds.
+    /// </summary>
+#if DEBUG
+    [AllowAnonymous]
+#endif
+    [ApiController]
+    public class AudioController : ControllerBase
+    {
+        private readonly ISpeechToText _speechToText;
+
+        public AudioController(ISpeechToText audioService)
+        {
+            _speechToText = audioService;
+        }
+
+        /// <summary>
+        /// Transcribes the audio file identified by <paramref name="audioInputString"/>.
+        /// </summary>
+        /// <param name="audioInputString">
+        /// Value handed to <see cref="ISpeechToText.AudioToTextTranscript(string)"/> as the file path.
+        /// </param>
+        /// <returns>
+        /// 200 with the transcript text, or 400 when <paramref name="audioInputString"/> is missing or blank.
+        /// </returns>
+        [HttpGet("audio/transcript")]
+        public async Task<IActionResult> GetTextFromAudioController(string audioInputString)
+        {
+            // Reject missing input here; otherwise it surfaces further down as an
+            // unhandled exception and the client only sees a 500.
+            if (string.IsNullOrWhiteSpace(audioInputString))
+            {
+                return BadRequest("audioInputString is required.");
+            }
+
+            // NOTE(review): the query value is passed on unchanged as a server-side file path.
+            // Consider restricting it to an allowed directory before exposing this endpoint.
+#if DEBUG
+            var stopWatch = Stopwatch.StartNew();
+#endif
+            var result = await _speechToText.AudioToTextTranscript(audioInputString);
+#if DEBUG
+            stopWatch.Stop();
+            TimeSpan ts = stopWatch.Elapsed;
+            // Same hh:mm:ss.ff layout as before (hundredths of a second, truncated).
+            Console.WriteLine($"RunTime {ts.Hours:00}:{ts.Minutes:00}:{ts.Seconds:00}.{ts.Milliseconds / 10:00}");
+#endif
+            return Ok(result);
+        }
+    }
+}
diff --git a/src/Plugins/BotSharp.Plugin.AudioHandler/Enums/AudioType.cs b/src/Plugins/BotSharp.Plugin.AudioHandler/Enums/AudioType.cs
@@ -0,0 +1,22 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Runtime.CompilerServices;
+using System.Text;
+using System.Threading.Tasks;
+using Whisper.net.Wave;
+
+namespace BotSharp.Plugin.AudioHandler.Enums
+{
+    /// <summary>
+    /// Audio formats handled by the plugin. Member names are lower-case so that they can be
+    /// matched against a file extension with its leading dot removed.
+    /// </summary>
+    public enum AudioType
+    {
+        wav,
+        mp3,
+    }
+
+    /// <summary>
+    /// Helpers for <see cref="AudioType"/>.
+    /// </summary>
+    public static class AudioTypeExtensions
+    {
+        /// <summary>
+        /// Returns the member name prefixed with a dot, e.g. <c>.wav</c>.
+        /// </summary>
+        /// <param name="audioType">Audio type to format.</param>
+        /// <returns>The dotted file extension.</returns>
+        public static string ToFileExtension(this AudioType audioType)
+        {
+            return "." + audioType.ToString();
+        }
+    }
+}
+
diff --git a/src/Plugins/BotSharp.Plugin.AudioHandler/Functions/AudioProcessUtilities.cs b/src/Plugins/BotSharp.Plugin.AudioHandler/Functions/AudioProcessUtilities.cs
@@ -0,0 +1,68 @@
+using BotSharp.Plugin.AudioHandler.Enums;
+using NAudio;
+using NAudio.Wave;
+using NAudio.Wave.SampleProviders;
+
+namespace BotSharp.Plugin.AudioHandler.Functions;
+
+/// <summary>
+/// Opens local MP3/WAV files and exposes their audio as WAV streams sampled at 16 kHz.
+/// </summary>
+public class AudioProcessUtilities : IAudioProcessUtilities
+{
+    // Sample rate every returned stream is normalised to.
+    private const int TargetSampleRate = 16000;
+
+    public AudioProcessUtilities()
+    {
+    }
+
+    /// <summary>
+    /// Decodes an MP3 file into an in-memory 16-bit PCM WAV stream at 16 kHz.
+    /// </summary>
+    /// <param name="mp3FileName">Path of the MP3 file.</param>
+    /// <returns>A stream positioned at the start of the WAV data; the caller disposes it.</returns>
+    public Stream ConvertMp3ToStream(string mp3FileName)
+    {
+        // The reader is handed a stream it does not own, so the file handle is released here.
+        using var fileStream = File.OpenRead(mp3FileName);
+        using var reader = new Mp3FileReader(fileStream);
+
+        // Always re-encode: returning the file stream itself would hand raw MP3 bytes
+        // to consumers that expect WAV data, even when the sample rate already matches.
+        return WriteWav(reader);
+    }
+
+    /// <summary>
+    /// Opens a WAV file and returns its audio at 16 kHz.
+    /// </summary>
+    /// <param name="wavFileName">Path of the WAV file.</param>
+    /// <returns>
+    /// The file stream itself when the file is already at 16 kHz, otherwise an in-memory
+    /// resampled copy. Either way the stream is positioned at the start and the caller disposes it.
+    /// </returns>
+    public Stream ConvertWavToStream(string wavFileName)
+    {
+        var fileStream = File.OpenRead(wavFileName);
+        var handedToCaller = false;
+        try
+        {
+            using var reader = new WaveFileReader(fileStream);
+            if (reader.WaveFormat.SampleRate == TargetSampleRate)
+            {
+                fileStream.Seek(0, SeekOrigin.Begin);
+                handedToCaller = true;
+                return fileStream;
+            }
+
+            return WriteWav(reader);
+        }
+        finally
+        {
+            // The file handle is only kept open when it is the stream being returned.
+            if (!handedToCaller)
+            {
+                fileStream.Dispose();
+            }
+        }
+    }
+
+    /// <summary>
+    /// Picks the converter matching the file's extension and returns the resulting stream.
+    /// </summary>
+    /// <param name="fileName">Path of an <c>.mp3</c> or <c>.wav</c> file.</param>
+    /// <returns>A stream positioned at the start of the audio data; the caller disposes it.</returns>
+    /// <exception cref="ArgumentNullException"><paramref name="fileName"/> is null or empty.</exception>
+    /// <exception cref="NotSupportedException">The extension is not a supported <see cref="AudioType"/>.</exception>
+    public Stream ConvertToStream(string fileName)
+    {
+        if (string.IsNullOrEmpty(fileName))
+        {
+            throw new ArgumentNullException(nameof(fileName));
+        }
+
+        string fileExtension = Path.GetExtension(fileName).ToLowerInvariant().TrimStart('.');
+
+        // Match by member name only: Enum.TryParse would also accept numeric text,
+        // so a file called "clip.0" would be treated as a wav file.
+        if (!Enum.IsDefined(typeof(AudioType), fileExtension))
+        {
+            throw new NotSupportedException($"File extension: '{fileExtension}' not supported");
+        }
+
+        return Enum.Parse<AudioType>(fileExtension) switch
+        {
+            AudioType.mp3 => ConvertMp3ToStream(fileName),
+            AudioType.wav => ConvertWavToStream(fileName),
+            _ => throw new NotSupportedException("File extension not supported"),
+        };
+    }
+
+    // Writes the source as 16-bit PCM WAV at the target rate into a rewound memory stream,
+    // resampling only when the source rate differs.
+    private static MemoryStream WriteWav(IWaveProvider source)
+    {
+        ISampleProvider samples = source.ToSampleProvider();
+        if (source.WaveFormat.SampleRate != TargetSampleRate)
+        {
+            samples = new WdlResamplingSampleProvider(samples, TargetSampleRate);
+        }
+
+        var wavStream = new MemoryStream();
+        WaveFileWriter.WriteWavFileToStream(wavStream, samples.ToWaveProvider16());
+        wavStream.Seek(0, SeekOrigin.Begin);
+        return wavStream;
+    }
+}
diff --git a/src/Plugins/BotSharp.Plugin.AudioHandler/Functions/IAudioProcessUtilities.cs b/src/Plugins/BotSharp.Plugin.AudioHandler/Functions/IAudioProcessUtilities.cs
@@ -0,0 +1,10 @@
+
+namespace BotSharp.Plugin.AudioHandler.Functions
+{
+    /// <summary>
+    /// Turns local audio files into readable streams.
+    /// </summary>
+    public interface IAudioProcessUtilities
+    {
+        /// <summary>
+        /// Opens an MP3 file and returns its audio as a stream.
+        /// </summary>
+        /// <param name="mp3FileName">Path of the MP3 file.</param>
+        /// <returns>A readable stream; the caller is responsible for disposing it.</returns>
+        Stream ConvertMp3ToStream(string mp3FileName);
+
+        /// <summary>
+        /// Opens a WAV file and returns its audio as a stream.
+        /// </summary>
+        /// <param name="wavFileName">Path of the WAV file.</param>
+        /// <returns>A readable stream; the caller is responsible for disposing it.</returns>
+        Stream ConvertWavToStream(string wavFileName);
+
+        /// <summary>
+        /// Opens an audio file, choosing the conversion from its file extension.
+        /// </summary>
+        /// <param name="fileName">Path of the audio file.</param>
+        /// <returns>A readable stream; the caller is responsible for disposing it.</returns>
+        Stream ConvertToStream(string fileName);
+    }
+}
diff --git a/src/Plugins/BotSharp.Plugin.AudioHandler/Models/AudioOutput.cs b/src/Plugins/BotSharp.Plugin.AudioHandler/Models/AudioOutput.cs
@@ -0,0 +1,19 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+using Whisper.net;
+
+namespace BotSharp.Plugin.AudioHandler.Models
+{
+    /// <summary>
+    /// Transcription result made up of the segments produced for one audio input.
+    /// </summary>
+    public class AudioOutput
+    {
+        /// <summary>
+        /// Segments in the order they were produced. Starts out empty so that a freshly
+        /// constructed instance is usable without further initialisation.
+        /// </summary>
+        public List<SegmentData> Segments { get; set; } = new();
+
+        /// <summary>
+        /// Joins the text of all segments with single spaces.
+        /// </summary>
+        /// <returns>The combined text, or an empty string when there are no segments.</returns>
+        public override string ToString()
+        {
+            // The setter is public, so guard against an explicit null assignment as well.
+            if (Segments is null)
+            {
+                return string.Empty;
+            }
+
+            // Joining an empty sequence already yields an empty string.
+            return string.Join(" ", Segments.Select(x => x.Text));
+        }
+    }
+}
diff --git a/src/Plugins/BotSharp.Plugin.AudioHandler/Provider/NativeWhisperProvider.cs b/src/Plugins/BotSharp.Plugin.AudioHandler/Provider/NativeWhisperProvider.cs
@@ -0,0 +1,82 @@
+using Whisper.net;
+using Whisper.net.Ggml;
+
+namespace BotSharp.Plugin.AudioHandler.Provider;
+
+/// <summary>
+/// Native Whisper provider for speech to text conversion
+/// </summary>
+public class NativeWhisperProvider : ISpeechToText
+{
+    // The processor is shared by every instance of this class, so its one-time creation
+    // is serialised; otherwise two concurrent first calls could both download the model
+    // and build a processor.
+    private static readonly SemaphoreSlim _initLock = new(1, 1);
+    private static WhisperProcessor? _processor;
+
+    private readonly IAudioProcessUtilities _audioProcessUtilities;
+
+    public NativeWhisperProvider(IAudioProcessUtilities audioProcessUtilities)
+    {
+        _audioProcessUtilities = audioProcessUtilities;
+    }
+
+    /// <summary>
+    /// Transcribes a local audio file to text.
+    /// </summary>
+    /// <param name="filePath">Path of the audio file; its extension must name an <see cref="AudioType"/>.</param>
+    /// <returns>The text of all recognised segments joined by spaces.</returns>
+    /// <exception cref="ArgumentException"><paramref name="filePath"/> is null or blank.</exception>
+    /// <exception cref="NotSupportedException">The file extension is not a supported audio type.</exception>
+    public async Task<string> AudioToTextTranscript(string filePath)
+    {
+        if (string.IsNullOrWhiteSpace(filePath))
+        {
+            throw new ArgumentException("Audio file path must not be empty.", nameof(filePath));
+        }
+
+        string fileExtension = Path.GetExtension(filePath);
+        if (!Enum.TryParse<AudioType>(fileExtension.TrimStart('.').ToLowerInvariant(), out _))
+        {
+            throw new NotSupportedException($"Unsupported audio type: {fileExtension}");
+        }
+
+        var processor = await InitModel();
+        using var stream = _audioProcessUtilities.ConvertToStream(filePath);
+
+        if (stream == null)
+        {
+            throw new InvalidOperationException($"Failed to convert {fileExtension} to stream");
+        }
+
+        // NOTE(review): the processor is shared across requests; confirm that it tolerates
+        // overlapping ProcessAsync calls or serialise access to it.
+        var segments = new List<SegmentData>();
+        await foreach (var segment in processor.ProcessAsync(stream).ConfigureAwait(false))
+        {
+            segments.Add(segment);
+        }
+
+        return new AudioOutput { Segments = segments }.ToString();
+    }
+
+    /// <summary>
+    /// Makes sure the model file for <paramref name="modelType"/> exists locally, downloading it when missing.
+    /// </summary>
+    /// <param name="modelType">Model variant to load.</param>
+    /// <returns>Path of the model file.</returns>
+    private static async Task<string> LoadWhisperModel(GgmlType modelType)
+    {
+        var modelPath = $"ggml-{modelType}.bin";
+        if (File.Exists(modelPath))
+        {
+            return modelPath;
+        }
+
+        // Download next to the target and rename at the end, so that an interrupted
+        // download never leaves a truncated file that later passes the File.Exists check.
+        var tempPath = $"{modelPath}.{Guid.NewGuid():N}.tmp";
+        try
+        {
+            using (var modelStream = await WhisperGgmlDownloader.GetGgmlModelAsync(modelType))
+            using (var fileWriter = File.Create(tempPath))
+            {
+                await modelStream.CopyToAsync(fileWriter);
+            }
+
+            File.Move(tempPath, modelPath, overwrite: true);
+            return modelPath;
+        }
+        catch (Exception ex)
+        {
+            try
+            {
+                File.Delete(tempPath);
+            }
+            catch (Exception cleanupError) when (cleanupError is IOException or UnauthorizedAccessException)
+            {
+                // Best effort only: the download failure below is the error worth reporting.
+            }
+
+            // Keep the original exception as InnerException so its stack trace is not lost.
+            throw new Exception($"Failed to load whisper model: {ex.Message}", ex);
+        }
+    }
+
+    /// <summary>
+    /// Returns the shared processor, creating it on first use.
+    /// </summary>
+    /// <param name="modelType">Model variant used when the processor has to be created.</param>
+    private async Task<WhisperProcessor> InitModel(GgmlType modelType = GgmlType.TinyEn)
+    {
+        if (_processor is not null)
+        {
+            return _processor;
+        }
+
+        await _initLock.WaitAsync();
+        try
+        {
+            // Re-check inside the lock: another caller may have finished initialising meanwhile.
+            if (_processor is null)
+            {
+                var modelPath = await LoadWhisperModel(modelType);
+                _processor = WhisperFactory
+                    .FromPath(modelPath)
+                    .CreateBuilder()
+                    .WithLanguage("en")
+                    .Build();
+            }
+
+            return _processor;
+        }
+        finally
+        {
+            _initLock.Release();
+        }
+    }
+}
diff --git a/src/Plugins/BotSharp.Plugin.AudioHandler/Settings/AudioHandlerSettings.cs b/src/Plugins/BotSharp.Plugin.AudioHandler/Settings/AudioHandlerSettings.cs
@@ -0,0 +1,6 @@
+namespace BotSharp.Plugin.AudioHandler.Settings
+{
+    /// <summary>
+    /// Settings object for the audio handler plugin, bound from the "AudioHandler" section.
+    /// It currently declares no options.
+    /// </summary>
+    public class AudioHandlerSettings
+    {
+    }
+}
diff --git a/src/Plugins/BotSharp.Plugin.AudioHandler/Using.cs b/src/Plugins/BotSharp.Plugin.AudioHandler/Using.cs
@@ -0,0 +1,21 @@
+global using System;
+global using System.Collections.Generic;
+global using System.Text;
+global using System.Linq;
+global using System.Text.Json;
+global using System.Linq;
+global using System.Text;
+global using System.Threading.Tasks;
+global using System.Threading.Tasks;
+
+global using BotSharp.Abstraction.Plugins;
+global using BotSharp.Abstraction.MLTasks;
+global using BotSharp.Plugin.AudioHandler.Enums;
+global using BotSharp.Plugin.AudioHandler.Functions;
+global using BotSharp.Plugin.AudioHandler.Models;
+
+global using Microsoft.Extensions.Configuration;
+global using Microsoft.Extensions.DependencyInjection;
+global using Microsoft.AspNetCore.Http;
+global using Microsoft.AspNetCore.Authorization;
+global using Microsoft.AspNetCore.Mvc;
diff --git a/src/Plugins/BotSharp.Plugin.OpenAI/Providers/Audio/SpeechToTextProvider.cs b/src/Plugins/BotSharp.Plugin.OpenAI/Providers/Audio/SpeechToTextProvider.cs
@@ -0,0 +1,9 @@
+namespace BotSharp.Plugin.OpenAI.Providers.Audio;
+
+/// <summary>
+/// Placeholder speech-to-text provider for the OpenAI plugin; transcription is not implemented yet.
+/// </summary>
+public class SpeechToTextProvider : ISpeechToText
+{
+    /// <summary>
+    /// Not implemented.
+    /// </summary>
+    /// <param name="filePath">Path of the audio file to transcribe.</param>
+    /// <exception cref="NotImplementedException">Always thrown.</exception>
+    public Task<string> AudioToTextTranscript(string filePath)
+        => throw new NotImplementedException();
+}