Skip to content
This repository was archived by the owner on Jan 7, 2025. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file modified speechly-client.unitypackage
Binary file not shown.
158 changes: 94 additions & 64 deletions speechly-unity/Assets/Speechly/MicToSpeechly.cs
Original file line number Diff line number Diff line change
Expand Up @@ -26,42 +26,57 @@ public static MicToSpeechly Instance

[Tooltip("Speechly App Id")]
public string AppId = "ef84e8ba-c5a7-46c2-856e-8b853e2c77b1"; // Speechly Client Demos / speech-to-text only configuration
[Tooltip("Capture device name or null for default")]
[Tooltip("Capture device name or null for default.")]
public string CaptureDeviceName = null;
public int MicSampleRate = 16000;
public int MicBufferLengthMillis = 1000;
[Tooltip("Milliseconds of history data to send upon StartContext to capture lead of the utterance.")]
public int SendHistoryMillis = 200;
public int FrameMillis = 30;
[Range(1, 32)]
[Tooltip("Number of frames to keep in memory. When listening is started, history frames are sent to capture the lead-in audio.")]
public int HistoryFrames = 8;
public bool CalcAudioPeaks = true;
public bool CalcEnergy = true;
public bool VADUseEnergyGate = false;
public float Peak {get; private set; } = 0f;
public float Energy {get; private set; } = 0f;
public float BaselineEnergy {get; private set; } = -1f;
public int VADAnalysisWindowMillis = 30;
[Tooltip("Voice Activity Detection (VAD) using adaptive energy tresholding. Automatically controls listening based on audio 'loudness'.")]
public bool EnergyTresholdVAD = false;
[Range(0.0f, 1.0f)]
[Tooltip("Energy treshold - below this won't trigger activation")]
public float VADEnergyTreshold = 0.005f;
public float VADMinimumEnergy = 0.005f;
[Range(1.0f, 10.0f)]
[Tooltip("Signal-to-noise energy ratio needed for activation")]
public float VADActivationRatio = 2.0f;
public int VADActivationMillis = 150;
public int VADReleaseMillis = 300;
[Tooltip("Signal-to-noise energy ratio needed for frame to be 'loud'")]
public float VADSignalToNoise = 2.0f;
[Range(1, 32)]
[Tooltip("Number of past frames analyzed for energy treshold VAD. Should be <= than HistoryFrames.")]
public int VADFrames = 5;
[Range(.0f, 1.0f)]
[Tooltip("Minimum 'loud' to 'silent' frame ratio in history to activate 'IsSignalDetected'")]
public float VADActivation = 0.7f;
[Range(.0f, 1.0f)]
[Tooltip("Maximum 'loud' to 'silent' frame ratio in history to inactivate 'IsSignalDetected'. Only evaluated when the sustain period is over.")]
public float VADRelease = 0.2f;
[Range(0, 8000)]
[Tooltip("Duration to keep 'IsSignalDetected' active. Renewed as long as VADActivation is holds true.")]
public int VADSustainMillis = 3000;
public bool PrintDebug = false;
public bool IsSpeechDetected {get; private set; }
[Range(0, 5000)]
[Tooltip("Rate of background noise learn. Defined as duration in which background noise energy is moved halfway towards current frame's energy.")]
public int VADNoiseHalftimeMillis = 400;
[Tooltip("When checked, VAD listening control is disabled but IsSignalDetected is updated.")]
public bool DebugSimulateVAD = false;
public bool DebugPrint = false;
public float Peak {get; private set; } = 0f;
public float Energy {get; private set; } = 0f;
public float BaselineEnergy {get; private set; } = -1f;
public bool IsSignalDetected {get; private set; }
private int loudFrameBits = 0;
public SpeechlyClient SpeechlyClient { get; private set; }
private AudioClip clip;
private float[] waveData;
private int oldCaptureRingbufferPos;
private int loops;

private int historySamples;
private float vadNoiseGateHeat = 0f;
private int vadAnalysisWindowSamples;
private int vadAnalysisWindowSamplesLeft;
private int historySizeSamples;
private int frameSamples;
private int frameSamplesLeft;
private float vadSum = 0f;
private float vadSustainContextMillis = 0;
private float vadSustainMillisLeft = 0;

private void Awake()
{
Expand All @@ -82,7 +97,7 @@ private void Awake()
deviceId = SystemInfo.deviceUniqueIdentifier
},
manualUpdate: true,
debug: PrintDebug
debug: DebugPrint
);

_instance = this;
Expand All @@ -96,9 +111,12 @@ void Start()
// Microphone.GetDeviceCaps(CaptureDeviceName, out minFreq, out maxFreq);
// Debug.Log($"minFreq {minFreq} maxFreq {maxFreq}");

int capturedAudioBufferMillis = 500;
int micBufferMillis = FrameMillis * HistoryFrames + capturedAudioBufferMillis;
int micBufferSecs = (micBufferMillis / 1000) + 1;
// Start audio capture
clip = Microphone.Start(CaptureDeviceName, true, MicBufferLengthMillis / 1000, MicSampleRate);
clip = Microphone.Start(CaptureDeviceName, true, micBufferSecs, MicSampleRate);

if (clip != null)
{
waveData = new float[clip.samples * clip.channels];
Expand All @@ -109,9 +127,9 @@ void Start()
throw new Exception($"Could not open microphone {CaptureDeviceName}");
}

historySamples = MicSampleRate * SendHistoryMillis / 1000;
vadAnalysisWindowSamples = MicSampleRate * VADAnalysisWindowMillis / 1000;
vadAnalysisWindowSamplesLeft = vadAnalysisWindowSamples;
frameSamples = MicSampleRate * FrameMillis / 1000;
frameSamplesLeft = frameSamples;
historySizeSamples = frameSamples * HistoryFrames;

StartCoroutine(RunSpeechly());
}
Expand Down Expand Up @@ -141,11 +159,11 @@ private IEnumerator RunSpeechly()
} else {
samples = captureRingbufferPos - oldCaptureRingbufferPos;
}
samples = Math.Min(samples, waveData.Length - historySamples);
samples = Math.Min(samples, waveData.Length - historySizeSamples);

if (samples > 0) {
if (loop) loops++;
int effectiveHistorySamples = loops > 0 ? historySamples : Math.Min(captureRingbufferPos, historySamples);
int effectiveHistorySamples = loops > 0 ? historySizeSamples : Math.Min(captureRingbufferPos, historySizeSamples);
int effectiveCapturePos = (oldCaptureRingbufferPos + (waveData.Length - effectiveHistorySamples)) % waveData.Length;

// Always captures full buffer length (waveData.Length samples), starting from offset
Expand All @@ -161,60 +179,55 @@ private IEnumerator RunSpeechly()
}
}

if (CalcEnergy) {
if (EnergyTresholdVAD) {
int capturedSamplesLeft = samples;

while (capturedSamplesLeft > 0) {
int summedSamples = Math.Min(capturedSamplesLeft, vadAnalysisWindowSamplesLeft);
int summedSamples = Math.Min(capturedSamplesLeft, frameSamplesLeft);
int s = summedSamples;
while (s > 0)
{
vadSum += waveData[s + effectiveHistorySamples] * waveData[s + effectiveHistorySamples];
s--;
}
vadAnalysisWindowSamplesLeft -= summedSamples;
if (vadAnalysisWindowSamplesLeft == 0) {
vadAnalysisWindowSamplesLeft = vadAnalysisWindowSamples;
Energy = (float)Math.Sqrt(vadSum / vadAnalysisWindowSamples);
frameSamplesLeft -= summedSamples;
if (frameSamplesLeft == 0) {
frameSamplesLeft = frameSamples;
Energy = (float)Math.Sqrt(vadSum / frameSamples);
if (BaselineEnergy < 0f) {
BaselineEnergy = Energy;
}
if (Energy > Math.Max(VADEnergyTreshold, BaselineEnergy * VADActivationRatio)) {
vadNoiseGateHeat = (float)Math.Min(vadNoiseGateHeat + (1f * VADAnalysisWindowMillis / VADActivationMillis), 1f);
} else {
vadNoiseGateHeat = (float)Math.Max(vadNoiseGateHeat - (1f * VADAnalysisWindowMillis / VADReleaseMillis), 0f);
}
bool isLoudFrame = Energy > Math.Max(VADMinimumEnergy, BaselineEnergy * VADSignalToNoise);
PushToFrameHistory(isLoudFrame);

if (vadNoiseGateHeat == 1f) {
if (!IsSpeechDetected) {
IsSpeechDetected = true;
if (VADUseEnergyGate) {
StartContext();
}
}
}
int loudFrames = CountLoudFrames(VADFrames);
float loudFrameRatio = (1f * loudFrames) / VADFrames;

if (vadNoiseGateHeat > 0.5f) {
vadSustainContextMillis = VADSustainMillis;
if (loudFrameRatio >= VADActivation) {
vadSustainMillisLeft = VADSustainMillis;
if (!IsSignalDetected) {
IsSignalDetected = true;
StartContext();
}
}

if (vadNoiseGateHeat == 0f && vadSustainContextMillis == 0) {
if (IsSpeechDetected) {
IsSpeechDetected = false;
vadNoiseGateHeat = 0f;
if (VADUseEnergyGate) {
StopContext();
}
if (loudFrameRatio < VADRelease && vadSustainMillisLeft == 0) {
if (IsSignalDetected) {
IsSignalDetected = false;
StopContext();
}
}

// Learn background noise level
if (!IsSpeechDetected) {
BaselineEnergy = (BaselineEnergy * 0.95f) + (Energy * 0.05f);
// Gradually learn background noise level
if (!IsSignalDetected) {
if (VADNoiseHalftimeMillis > 0f) {
var decay = (float)Math.Pow(2.0, -FrameMillis / (double)VADNoiseHalftimeMillis);
BaselineEnergy = (BaselineEnergy * decay) + (Energy * (1f - decay));
}
}

vadSum = 0f;
vadSustainContextMillis = Math.Max(vadSustainContextMillis - VADAnalysisWindowMillis, 0);
vadSustainMillisLeft = Math.Max(vadSustainMillisLeft - FrameMillis, 0);
}
capturedSamplesLeft -= summedSamples;
}
Expand All @@ -234,19 +247,36 @@ private IEnumerator RunSpeechly()
if (!audioSent) {
yield return null;
}

}
}

// Records the newest frame's loudness in the loudFrameBits history:
// existing entries move one bit position up and the new flag lands in bit 0.
private void PushToFrameHistory(bool isLoud) {
    loudFrameBits <<= 1;
    if (isLoud) {
        loudFrameBits |= 1;
    }
}

// Counts how many of the most recent numHistoryFrames frames were flagged loud.
// Bit 0 of loudFrameBits is the newest frame; each higher bit is one frame older.
// Returns 0 when numHistoryFrames is zero or negative.
private int CountLoudFrames(int numHistoryFrames) {
    // Work on an unsigned copy so the right shift is logical: with a signed int, a set
    // bit 31 would be sign-extended and counted again for every frame requested beyond 32.
    uint remainingBits = unchecked((uint)loudFrameBits);
    int numActiveFrames = 0;
    // Stop early once no set bits remain; this also bounds the loop to 32 iterations.
    while (numHistoryFrames > 0 && remainingBits != 0) {
        numActiveFrames += (int)(remainingBits & 1u);
        remainingBits >>= 1;
        numHistoryFrames--;
    }
    return numActiveFrames;
}

// Drop-and-forget wrapper for async StartContext.
// The returned task is discarded, so callers do not wait for the context to start.
// Does nothing while DebugSimulateVAD is set, so the VAD state can be observed
// without actually controlling listening.
public void StartContext() {
    // Single guarded call: an additional unguarded call would start the context twice
    // and ignore the DebugSimulateVAD setting.
    if (!DebugSimulateVAD) {
        _ = SpeechlyClient.StartContext();
    }
}

// Drop-and-forget wrapper for async StopContext.
// The returned task is discarded, so callers do not wait for the context to stop.
// Does nothing while DebugSimulateVAD is set, so the VAD state can be observed
// without actually controlling listening.
public void StopContext() {
    // Single guarded call: an additional unguarded call would stop the context twice
    // and ignore the DebugSimulateVAD setting.
    if (!DebugSimulateVAD) {
        _ = SpeechlyClient.StopContext();
    }
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -1475,15 +1475,14 @@ MonoBehaviour:
AppId: ef84e8ba-c5a7-46c2-856e-8b853e2c77b1
CaptureDeviceName:
MicSampleRate: 16000
MicBufferLengthMillis: 1000
SendHistoryMillis: 200
CalcAudioPeaks: 1
CalcEnergy: 1
VADUseEnergyGate: 0
VADAnalysisWindowMillis: 30
FrameMillis: 30
HistoryFrames: 5
VADEnergyTreshold: 0.005
VADActivationRatio: 2
VADActivationMillis: 150
VADReleaseMillis: 300
VADSignalToNoise: 2
VADActivationRatio: 0.7
VADReleaseRatio: 0.2
VADSustainMillis: 3000
PrintDebug: 0
DebugVAD: 0
DebugPrint: 0
Loading