Skip to content

Commit

Permalink
Merge pull request #566 from evan-cao-wb/features/add-audio-handler
Browse files Browse the repository at this point in the history
add audio service to transcribe local mp3/wav file
  • Loading branch information
Oceania2018 authored Jul 30, 2024
2 parents d17574a + 8e097d1 commit e497233
Show file tree
Hide file tree
Showing 12 changed files with 350 additions and 0 deletions.
14 changes: 14 additions & 0 deletions src/Infrastructure/BotSharp.Abstraction/MLTasks/ISpeechToText.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace BotSharp.Abstraction.MLTasks;

/// <summary>
/// Contract for turning an audio file into a text transcript.
/// </summary>
public interface ISpeechToText
{
/// <summary>
/// Asynchronously produces the transcript for the audio file at <paramref name="filePath"/>.
/// Which audio formats are accepted is decided by the implementing provider.
/// </summary>
/// <param name="filePath">Path of the audio file to transcribe.</param>
/// <returns>A task whose result is the transcribed text.</returns>
Task<string> AudioToTextTranscript(string filePath);
// Stream-based overload, currently disabled:
// Task<string> AudioToTextTranscript(Stream stream);
}
29 changes: 29 additions & 0 deletions src/Plugins/BotSharp.Plugin.AudioHandler/AudioHandlerPlugin.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
using BotSharp.Plugin.AudioHandler.Settings;
using BotSharp.Plugin.AudioHandler.Provider;
using BotSharp.Abstraction.Settings;

namespace BotSharp.Plugin.AudioHandler
{
    /// <summary>
    /// Plugin entry point that wires the audio handler's services into the DI container.
    /// </summary>
    public class AudioHandlerPlugin : IBotSharpPlugin
    {
        /// <summary>Fixed identifier of this plugin.</summary>
        public string Id { get; } = "9d22014c-4f45-466a-9e82-a74e67983df8";

        /// <summary>Display name of this plugin.</summary>
        public string Name { get; } = "Audio Handler";

        /// <summary>Short description of what this plugin does.</summary>
        public string Description { get; } = "Process audio input and transform it into text output.";

        /// <summary>
        /// Registers the plugin settings, the speech-to-text provider and the audio
        /// conversion utilities, all with scoped lifetime.
        /// </summary>
        /// <param name="services">Service collection that receives the registrations.</param>
        /// <param name="config">Application configuration (not read directly here).</param>
        public void RegisterDI(IServiceCollection services, IConfiguration config)
        {
            // Settings are resolved per scope through the setting service,
            // bound from the "AudioHandler" section.
            services.AddScoped(sp =>
                sp.GetRequiredService<ISettingService>().Bind<AudioHandlerSettings>("AudioHandler"));

            services.AddScoped<ISpeechToText, NativeWhisperProvider>();
            services.AddScoped<IAudioProcessUtilities, AudioProcessUtilities>();
        }
    }
}

Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<TargetFramework>$(TargetFramework)</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
<LangVersion>$(LangVersion)</LangVersion>
<VersionPrefix>$(BotSharpVersion)</VersionPrefix>
<GeneratePackageOnBuild>$(GeneratePackageOnBuild)</GeneratePackageOnBuild>
<GenerateDocumentationFile>$(GenerateDocumentationFile)</GenerateDocumentationFile>
<OutputPath>$(SolutionDir)packages</OutputPath>
</PropertyGroup>

<ItemGroup>
<PackageReference Include="Microsoft.AspNetCore.Mvc" Version="2.2.0" />
<PackageReference Include="NAudio" Version="2.2.1" />
<PackageReference Include="NAudio.Core" Version="2.2.1" />
<PackageReference Include="Whisper.net" Version="1.5.0" />
<PackageReference Include="Whisper.net.Runtime" Version="1.5.0" />
</ItemGroup>

<ItemGroup>
<ProjectReference Include="..\..\Infrastructure\BotSharp.Core\BotSharp.Core.csproj" />
</ItemGroup>

</Project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using BotSharp.Plugin.AudioHandler.Models;
using BotSharp.Plugin.AudioHandler.Provider;

namespace BotSharp.Plugin.AudioHandler.Controllers
{
    /// <summary>
    /// HTTP endpoint that returns the transcript of an audio file.
    /// Anonymous access is allowed only in DEBUG builds.
    /// </summary>
#if DEBUG
    [AllowAnonymous]
#endif
    [ApiController]
    public class AudioController : ControllerBase
    {
        // Speech-to-text service resolved from DI.
        private readonly ISpeechToText _speechToText;

        /// <summary>Creates the controller with the speech-to-text service to delegate to.</summary>
        /// <param name="audioService">Service that performs the transcription.</param>
        public AudioController(ISpeechToText audioService)
        {
            _speechToText = audioService;
        }

        /// <summary>
        /// Transcribes the audio file identified by <paramref name="audioInputString"/>.
        /// In DEBUG builds the elapsed time is written to the console.
        /// </summary>
        /// <param name="audioInputString">Value forwarded unchanged to the speech-to-text service as the file path.</param>
        /// <returns>A 200 response whose body is the transcript.</returns>
        [HttpGet("audio/transcript")]
        public async Task<IActionResult> GetTextFromAudioController(string audioInputString)
        {
#if DEBUG
            var timer = Stopwatch.StartNew();
#endif
            string transcript = await _speechToText.AudioToTextTranscript(audioInputString);
#if DEBUG
            timer.Stop();
            TimeSpan elapsed = timer.Elapsed;
            // hh:mm:ss.cc — the last field is hundredths of a second.
            string formatted = String.Format(
                "{0:00}:{1:00}:{2:00}.{3:00}",
                elapsed.Hours,
                elapsed.Minutes,
                elapsed.Seconds,
                elapsed.Milliseconds / 10);
            Console.WriteLine("RunTime " + formatted);
#endif
            return Ok(transcript);
        }
    }
}
22 changes: 22 additions & 0 deletions src/Plugins/BotSharp.Plugin.AudioHandler/Enums/AudioType.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Runtime.CompilerServices;
using System.Text;
using System.Threading.Tasks;
using Whisper.net.Wave;

namespace BotSharp.Plugin.AudioHandler.Enums
{
    /// <summary>
    /// Audio file kinds known to the plugin. Member names are lower-case on purpose:
    /// they are matched by name against file extensions and used to build them.
    /// </summary>
    public enum AudioType
    {
        wav = 0,
        mp3 = 1,
    }

    /// <summary>
    /// Helpers for <see cref="AudioType"/>.
    /// </summary>
    public static class AudioTypeExtensions
    {
        /// <summary>
        /// Builds the file extension for an audio type: a dot followed by the member name.
        /// </summary>
        /// <param name="audioType">Audio type to convert.</param>
        /// <returns>The extension including the leading dot, e.g. ".mp3".</returns>
        public static string ToFileExtension(this AudioType audioType)
        {
            return "." + audioType.ToString();
        }
    }
}

Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
using BotSharp.Plugin.AudioHandler.Enums;
using NAudio;
using NAudio.Wave;
using NAudio.Wave.SampleProviders;

namespace BotSharp.Plugin.AudioHandler.Functions;

/// <summary>
/// Opens local MP3/WAV files with NAudio and exposes their audio as a stream
/// with a 16 kHz sample rate.
/// </summary>
public class AudioProcessUtilities : IAudioProcessUtilities
{
    // Sample rate every returned stream is normalised to.
    private const int TargetSampleRate = 16000;

    public AudioProcessUtilities()
    {
    }

    /// <summary>
    /// Decodes an MP3 file into an in-memory 16-bit PCM WAV stream at 16 kHz.
    /// </summary>
    /// <remarks>
    /// The file is always decoded, even when it is already at 16 kHz, so that this
    /// method returns the same format (WAV) on every path instead of handing back
    /// raw MP3 bytes in one of them.
    /// </remarks>
    /// <param name="mp3FileName">Path of the MP3 file to read.</param>
    /// <returns>A stream positioned at its start; the caller owns and disposes it.</returns>
    public Stream ConvertMp3ToStream(string mp3FileName)
    {
        // The reader does not own a stream passed to it, so both are disposed here
        // (reader first, then the file) once the audio has been copied to memory.
        using var fileStream = File.OpenRead(mp3FileName);
        using var reader = new Mp3FileReader(fileStream);
        return WriteAsWav(reader);
    }

    /// <summary>
    /// Returns the audio of a WAV file at 16 kHz: the file stream itself when the
    /// file already has that rate, otherwise an in-memory resampled 16-bit copy.
    /// </summary>
    /// <param name="wavFileName">Path of the WAV file to read.</param>
    /// <returns>A stream positioned at its start; the caller owns and disposes it.</returns>
    public Stream ConvertWavToStream(string wavFileName)
    {
        var fileStream = File.OpenRead(wavFileName);
        var handedToCaller = false;
        try
        {
            using var reader = new WaveFileReader(fileStream);
            if (reader.WaveFormat.SampleRate != TargetSampleRate)
            {
                return WriteAsWav(reader);
            }

            // Parsing the header advanced the stream; rewind before handing it out.
            fileStream.Seek(0, SeekOrigin.Begin);
            handedToCaller = true;
            return fileStream;
        }
        finally
        {
            // Close the file on the resample path and on failure; only the
            // pass-through path transfers ownership to the caller.
            if (!handedToCaller)
            {
                fileStream.Dispose();
            }
        }
    }

    /// <summary>
    /// Picks the converter matching the file's extension and returns its stream.
    /// </summary>
    /// <param name="fileName">Path of an audio file whose extension names an <see cref="AudioType"/> member.</param>
    /// <returns>The stream produced by the matching converter.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="fileName"/> is null or empty.</exception>
    /// <exception cref="NotSupportedException">The extension is not a supported audio type.</exception>
    public Stream ConvertToStream(string fileName)
    {
        if (string.IsNullOrEmpty(fileName))
        {
            throw new ArgumentNullException(nameof(fileName), "fileName is Null");
        }

        string fileExtension = Path.GetExtension(fileName).ToLowerInvariant().TrimStart('.');

        // Match by member NAME only. Enum.TryParse would also accept numeric text,
        // so a file such as "clip.0" would be silently treated as a WAV file.
        if (!Enum.IsDefined(typeof(AudioType), fileExtension))
        {
            throw new NotSupportedException($"File extension: '{fileExtension}' not supported");
        }

        return Enum.Parse<AudioType>(fileExtension) switch
        {
            AudioType.mp3 => ConvertMp3ToStream(fileName),
            AudioType.wav => ConvertWavToStream(fileName),
            _ => throw new NotSupportedException("File extension not supported"),
        };
    }

    // Copies the reader's audio into a new in-memory 16-bit WAV stream,
    // resampling to the target rate when the source rate differs.
    private static Stream WriteAsWav(WaveStream reader)
    {
        ISampleProvider samples = reader.ToSampleProvider();
        if (reader.WaveFormat.SampleRate != TargetSampleRate)
        {
            samples = new WdlResamplingSampleProvider(samples, TargetSampleRate);
        }

        var wavStream = new MemoryStream();
        WaveFileWriter.WriteWavFileToStream(wavStream, samples.ToWaveProvider16());
        wavStream.Seek(0, SeekOrigin.Begin);
        return wavStream;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@

namespace BotSharp.Plugin.AudioHandler.Functions
{
/// <summary>
/// Converts local audio files into readable streams.
/// </summary>
public interface IAudioProcessUtilities
{
/// <summary>Opens the MP3 file at <paramref name="mp3FileName"/> and returns its audio as a stream.</summary>
/// <param name="mp3FileName">Path of the MP3 file.</param>
/// <returns>A readable stream; the exact output format is defined by the implementation.</returns>
Stream ConvertMp3ToStream(string mp3FileName);
/// <summary>Opens the WAV file at <paramref name="wavFileName"/> and returns its audio as a stream.</summary>
/// <param name="wavFileName">Path of the WAV file.</param>
/// <returns>A readable stream; the exact output format is defined by the implementation.</returns>
Stream ConvertWavToStream(string wavFileName);
/// <summary>Returns the audio stream for a file of any supported type.</summary>
/// <param name="fileName">Path of the audio file.</param>
/// <returns>A readable stream; the exact output format is defined by the implementation.</returns>
Stream ConvertToStream(string fileName);
}
}
19 changes: 19 additions & 0 deletions src/Plugins/BotSharp.Plugin.AudioHandler/Models/AudioOutput.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using Whisper.net;

namespace BotSharp.Plugin.AudioHandler.Models
{
    /// <summary>
    /// Result of a transcription: the list of recognised segments.
    /// </summary>
    public class AudioOutput
    {
        /// <summary>
        /// Recognised segments in the order they were added. Starts as an empty
        /// list so a freshly created instance is safe to use.
        /// </summary>
        public List<SegmentData> Segments { get; set; } = new();

        /// <summary>
        /// Joins the text of all segments with single spaces.
        /// </summary>
        /// <returns>The combined text, or an empty string when there are no segments.</returns>
        public override string ToString()
        {
            // The setter is public, so tolerate a caller assigning null.
            if (Segments is null || Segments.Count == 0)
            {
                return string.Empty;
            }

            return string.Join(" ", Segments.Select(segment => segment.Text));
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
using Whisper.net;
using Whisper.net.Ggml;

namespace BotSharp.Plugin.AudioHandler.Provider;

/// <summary>
/// Native Whisper provider for speech to text conversion
/// </summary>
public class NativeWhisperProvider : ISpeechToText
{
    private readonly IAudioProcessUtilities _audioProcessUtilities;

    // The processor is shared by every instance of this class, so its one-time
    // creation is serialised through this lock.
    private static readonly System.Threading.SemaphoreSlim InitLock = new(1, 1);
    private static WhisperProcessor? _processor;

    public NativeWhisperProvider(IAudioProcessUtilities audioProcessUtilities)
    {
        _audioProcessUtilities = audioProcessUtilities;
    }

    /// <summary>
    /// Transcribes the audio file at <paramref name="filePath"/> and returns the
    /// recognised text, with segments separated by single spaces.
    /// </summary>
    /// <param name="filePath">Path of an audio file whose extension names an <see cref="AudioType"/> member.</param>
    /// <returns>The transcript; empty when no segment was recognised.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="filePath"/> is null or empty.</exception>
    /// <exception cref="NotSupportedException">The file extension is not a supported audio type.</exception>
    public async Task<string> AudioToTextTranscript(string filePath)
    {
        if (string.IsNullOrEmpty(filePath))
        {
            throw new ArgumentNullException(nameof(filePath));
        }

        string fileExtension = Path.GetExtension(filePath);

        // Match by member name only; Enum.TryParse would also accept numeric
        // text such as "0" and treat it as a valid audio type.
        if (!Enum.IsDefined(typeof(AudioType), fileExtension.TrimStart('.').ToLowerInvariant()))
        {
            throw new NotSupportedException($"Unsupported audio type: {fileExtension}");
        }

        var processor = await InitModel();

        using var stream = _audioProcessUtilities.ConvertToStream(filePath);
        if (stream == null)
        {
            throw new InvalidOperationException($"Failed to convert {fileExtension} to stream");
        }

        var segments = new List<SegmentData>();

        // NOTE(review): the processor is shared across requests; confirm that
        // overlapping ProcessAsync calls on one processor are supported.
        await foreach (var segment in processor.ProcessAsync(stream).ConfigureAwait(false))
        {
            segments.Add(segment);
        }

        return new AudioOutput { Segments = segments }.ToString();
    }

    /// <summary>
    /// Makes sure the model file for <paramref name="modelType"/> exists in the
    /// working directory, downloading it when missing.
    /// </summary>
    /// <returns>The path of the model file.</returns>
    private static async Task<string> LoadWhisperModel(GgmlType modelType)
    {
        try
        {
            var modelName = $"ggml-{modelType}.bin";
            if (File.Exists(modelName))
            {
                return modelName;
            }

            // Download to a temporary name and move it into place afterwards, so an
            // interrupted download never leaves a truncated file under the final name.
            var tempName = $"{modelName}.download";
            using (var modelStream = await WhisperGgmlDownloader.GetGgmlModelAsync(modelType))
            using (var fileWriter = File.Create(tempName))
            {
                await modelStream.CopyToAsync(fileWriter);
            }

            File.Move(tempName, modelName, overwrite: true);
            return modelName;
        }
        catch (Exception ex)
        {
            // Keep the original exception as InnerException for diagnostics.
            throw new InvalidOperationException($"Failed to load whisper model: {ex.Message}", ex);
        }
    }

    /// <summary>
    /// Returns the shared processor, creating it on first use from the model
    /// for <paramref name="modelType"/> with the language set to "en".
    /// </summary>
    private static async Task<WhisperProcessor> InitModel(GgmlType modelType = GgmlType.TinyEn)
    {
        await InitLock.WaitAsync();
        try
        {
            if (_processor is null)
            {
                var modelPath = await LoadWhisperModel(modelType);
                _processor = WhisperFactory
                    .FromPath(modelPath)
                    .CreateBuilder()
                    .WithLanguage("en")
                    .Build();
            }

            return _processor;
        }
        finally
        {
            InitLock.Release();
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
namespace BotSharp.Plugin.AudioHandler.Settings
{
/// <summary>
/// Settings type for the audio handler plugin. It currently declares no members.
/// </summary>
public class AudioHandlerSettings
{
}
}
21 changes: 21 additions & 0 deletions src/Plugins/BotSharp.Plugin.AudioHandler/Using.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
global using System;
global using System.Collections.Generic;
global using System.Text;
global using System.Linq;
global using System.Text.Json;
global using System.Linq;
global using System.Text;
global using System.Threading.Tasks;
global using System.Threading.Tasks;

global using BotSharp.Abstraction.Plugins;
global using BotSharp.Abstraction.MLTasks;
global using BotSharp.Plugin.AudioHandler.Enums;
global using BotSharp.Plugin.AudioHandler.Functions;
global using BotSharp.Plugin.AudioHandler.Models;

global using Microsoft.Extensions.Configuration;
global using Microsoft.Extensions.DependencyInjection;
global using Microsoft.AspNetCore.Http;
global using Microsoft.AspNetCore.Authorization;
global using Microsoft.AspNetCore.Mvc;
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
namespace BotSharp.Plugin.OpenAI.Providers.Audio;

/// <summary>
/// Placeholder speech-to-text provider: the transcription call is not implemented yet.
/// </summary>
public class SpeechToTextProvider : ISpeechToText
{
    /// <summary>Not implemented.</summary>
    /// <param name="filePath">Path of the audio file (unused).</param>
    /// <exception cref="NotImplementedException">Always thrown.</exception>
    public Task<string> AudioToTextTranscript(string filePath)
        => throw new NotImplementedException();
}

0 comments on commit e497233

Please sign in to comment.