Pronunciation assessment with stream for C++ language (Azure-Samples#1838)
wangyuantao · Feb 22, 2023 · 1d87a37 · 1d87a37
1 parent ded1cc5
commit 1d87a37
Show file tree

Hide file tree

Showing 3 changed files with 78 additions and 0 deletions.
diff --git a/samples/cpp/windows/console/samples/main.cpp b/samples/cpp/windows/console/samples/main.cpp
@@ -16,6 +16,7 @@ extern void SpeechContinuousRecognitionWithPullStream();
 extern void SpeechContinuousRecognitionWithPushStream();
 extern void KeywordTriggeredSpeechRecognitionWithMicrophone();
 extern void PronunciationAssessmentWithMicrophone();
+extern void PronunciationAssessmentWithStream();
 extern void SpeechContinuousRecognitionFromDefaultMicrophoneWithMASEnabled();
 extern void SpeechRecognitionFromMicrophoneWithMASEnabledAndPresetGeometrySpecified();
 extern void SpeechContinuousRecognitionFromMultiChannelFileWithMASEnabledAndCustomGeometrySpecified();
@@ -99,6 +100,7 @@ void SpeechSamples()
                 "    Microsoft Audio Stack enabled.\n";
         cout << "d.) Speech recognition from push stream with Microsoft Audio Stack enabled and\n"
                 "    beam-forming angles specified.\n";
+        cout << "e.) Pronunciation assessment with stream.\n";
         cout << "\nChoice (0 for MAIN MENU): ";
         cout.flush();
 
@@ -150,6 +152,10 @@ void SpeechSamples()
         case 'd':
             SpeechContinuousRecognitionFromPushStreamWithMASEnabledAndBeamformingAnglesSpecified();
             break;
+        case 'E':
+        case 'e':
+            PronunciationAssessmentWithStream();
+            break;
         case '0':
             break;
         }

diff --git a/samples/cpp/windows/console/samples/samples.vcxproj b/samples/cpp/windows/console/samples/samples.vcxproj
@@ -107,11 +107,14 @@
       <PreprocessorDefinitions>_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <ConformanceMode>true</ConformanceMode>
       <AdditionalIncludeDirectories>$(MSBuildThisFileDirectory)include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalUsingDirectories>
+      </AdditionalUsingDirectories>
     </ClCompile>
     <Link>
       <SubSystem>Console</SubSystem>
       <GenerateDebugInformation>true</GenerateDebugInformation>
       <AdditionalDependencies>$(SpeechNativeLibDir)\Microsoft.CognitiveServices.Speech.core.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalLibraryDirectories>%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
     </Link>
   </ItemDefinitionGroup>
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">

diff --git a/samples/cpp/windows/console/samples/speech_recognition_samples.cpp b/samples/cpp/windows/console/samples/speech_recognition_samples.cpp
@@ -10,6 +10,8 @@
 #include <nlohmann/json.hpp>
 #include <fstream>
 #include "wav_file_reader.h"
+#include <vector>
+#include <future>
+#include <exception>
 
 using namespace std;
 using namespace Microsoft::CognitiveServices::Speech;
@@ -613,6 +615,73 @@ void PronunciationAssessmentWithMicrophone()
     }
 }
 
+// Worker for the stream-based pronunciation assessment sample: pushes an in-memory audio buffer
+// into a push stream, runs a single recognition with pronunciation assessment applied, and
+// reports the outcome.
+//
+//   speechConfig    - speech configuration used to create the recognizer.
+//   referenceText   - reference text handed to the pronunciation assessment configuration.
+//   audioData       - audio bytes written to the push stream; they must match the stream format
+//                     declared below.
+//   resultReceived  - fulfilled with 1 once processing has finished, or with the exception that
+//                     interrupted it, so the waiting side sees the real error rather than a
+//                     broken promise.
+//   resultContainer - receives one formatted score summary; nothing is appended when the
+//                     recognition was canceled. The caller must keep it alive until
+//                     resultReceived has been fulfilled.
+void PronunciationAssessmentWithStreamInternalAsync(shared_ptr<SpeechConfig> speechConfig, std::string referenceText, std::vector<uint8_t> audioData, std::promise<int> resultReceived, std::vector<std::string>& resultContainer)
+{
+    try
+    {
+        // 16 kHz, 16-bit, mono PCM. This needs to be set based on the format of the given audio data.
+        auto audioFormat = AudioStreamFormat::GetWaveFormatPCM(16000, 16, 1);
+        auto audioInputStream = AudioInputStream::CreatePushStream(audioFormat);
+        auto audioConfig = AudioConfig::FromStreamInput(audioInputStream);
+        auto speechRecognizer = SpeechRecognizer::FromConfig(speechConfig, audioConfig);
+
+        // Create the pronunciation assessment config; set the grading system, the granularity and
+        // whether miscue is enabled based on your requirements.
+        auto pronAssessmentConfig = PronunciationAssessmentConfig::Create(referenceText, PronunciationAssessmentGradingSystem::HundredMark, PronunciationAssessmentGranularity::Phoneme, false);
+        pronAssessmentConfig->ApplyTo(speechRecognizer);
+
+        // The whole buffer is written before recognition starts.
+        audioInputStream->Write(audioData.data(), static_cast<uint32_t>(audioData.size()));
+        audioInputStream->Write(nullptr, 0); // send a zero-size chunk to signal the end of stream
+
+        auto result = speechRecognizer->RecognizeOnceAsync().get();
+        if (result->Reason == ResultReason::Canceled)
+        {
+            // Nothing is appended to resultContainer in this case; only the error details are printed.
+            auto cancellationDetails = CancellationDetails::FromResult(result);
+            std::cout << cancellationDetails->ErrorDetails << std::endl;
+        }
+        else
+        {
+            auto responsePA = PronunciationAssessmentResult::FromResult(result);
+            std::string responseResult = "PRONUNCIATION ASSESSMENT RESULTS : \n";
+            responseResult += "  Accuracy score: " + std::to_string(responsePA->AccuracyScore);
+            responseResult += ", Pronunciation score: " + std::to_string(responsePA->PronunciationScore);
+            responseResult += ", Completeness score : " + std::to_string(responsePA->CompletenessScore);
+            responseResult += ", FluencyScore: " + std::to_string(responsePA->FluencyScore);
+
+            resultContainer.push_back(responseResult);
+        }
+    }
+    catch (...)
+    {
+        // Forward the failure to whoever waits on the future instead of leaving the promise unfulfilled.
+        resultReceived.set_exception(std::current_exception());
+        return;
+    }
+
+    resultReceived.set_value(1);
+}
+
+// Pronunciation assessment with audio stream input.
+// See more information at https://aka.ms/csspeech/pa
+//
+// Reads "whatstheweatherlike.wav" into memory, runs
+// PronunciationAssessmentWithStreamInternalAsync on a separate thread via std::async, waits for
+// it to signal completion, then prints the assessment summary (when one was produced) and the
+// elapsed time in milliseconds.
+void PronunciationAssessmentWithStream()
+{
+    // Creates an instance of a speech config with specified subscription key and service region.
+    // Replace with your own subscription key and service region (e.g., "westus").
+    auto config = SpeechConfig::FromSubscription("YourSubscriptionKey", "YourServiceRegion");
+
+    // Number of leading bytes skipped before the audio samples.
+    // NOTE(review): presumably the WAV header length of the sample file - confirm for other files.
+    const std::streamoff wavHeaderSize = 46;
+
+    // Read audio data from file. In a real scenario this can come from memory or the network.
+    std::ifstream file("whatstheweatherlike.wav", std::ios::binary | std::ios::ate);
+    if (!file)
+    {
+        std::cout << "Failed to open whatstheweatherlike.wav" << std::endl;
+        return;
+    }
+
+    // The stream was opened at its end, so the current position is the total file size.
+    const std::streamoff audioDataWithHeaderSize = file.tellg();
+    if (audioDataWithHeaderSize <= wavHeaderSize)
+    {
+        std::cout << "whatstheweatherlike.wav is too small to contain audio data" << std::endl;
+        return;
+    }
+
+    file.seekg(wavHeaderSize);
+    std::vector<uint8_t> audioData(static_cast<size_t>(audioDataWithHeaderSize - wavHeaderSize));
+    if (!file.read(reinterpret_cast<char*>(audioData.data()), static_cast<std::streamsize>(audioData.size())))
+    {
+        std::cout << "Failed to read audio data from whatstheweatherlike.wav" << std::endl;
+        return;
+    }
+
+    std::promise<int> resultReceived;
+    std::future<int> futureResult = resultReceived.get_future();
+    std::vector<std::string> resultContainer;
+
+    auto startTime = std::chrono::high_resolution_clock::now();
+
+    // The buffer is not needed here afterwards, so it is moved into the task rather than copied.
+    auto task = std::async(std::launch::async, PronunciationAssessmentWithStreamInternalAsync, config, "what's the weather like", std::move(audioData), std::move(resultReceived), std::ref(resultContainer));
+
+    // Blocks until the worker fulfils the promise; resultContainer is only read after that point.
+    futureResult.get();
+
+    auto endTime = std::chrono::high_resolution_clock::now();
+
+    if (resultContainer.empty())
+    {
+        // The worker appends nothing when the recognition was canceled.
+        std::cout << "No pronunciation assessment result was received." << std::endl;
+    }
+    else
+    {
+        std::cout << resultContainer.front() << std::endl;
+    }
+
+    auto timeCost = std::chrono::duration_cast<std::chrono::milliseconds>(endTime - startTime).count();
+    std::cout << "Time cost: " << timeCost << "ms" << std::endl;
+}
+
 #pragma region Language Detection related samples
 
 void SpeechRecognitionAndLanguageIdWithMicrophone()