Added ISpeechTranscription device

The device works, but no test is available at the moment; one still needs to be implemented.
robotology · Oct 25, 2023 · 32480ef · 32480ef
1 parent ea6274e
commit 32480ef
Show file tree

Hide file tree

Showing 8 changed files with 360 additions and 0 deletions.
diff --git a/src/devices/CMakeLists.txt b/src/devices/CMakeLists.txt
@@ -2,4 +2,5 @@
 # SPDX-License-Identifier: BSD-3-Clause
 
 add_subdirectory(googleSpeechSynthesizer)
+add_subdirectory(googleSpeechTranscription)
 add_subdirectory(googleDialogflowCxChatBot)
diff --git a/src/devices/googleSpeechTranscription/CMakeLists.txt b/src/devices/googleSpeechTranscription/CMakeLists.txt
@@ -0,0 +1,45 @@
+# SPDX-FileCopyrightText: 2023 Istituto Italiano di Tecnologia (IIT)
+# SPDX-License-Identifier: BSD-3-Clause
+
+# Declare the googleSpeechTranscription device plugin.
+yarp_prepare_plugin(googleSpeechTranscription
+  CATEGORY device
+  TYPE GoogleSpeechTranscription
+  INCLUDE GoogleSpeechTranscription.h
+  INTERNAL ON
+)
+
+if(NOT SKIP_googleSpeechTranscription)
+  # The Google Cloud Speech client library is only needed when the plugin is
+  # actually built, so it is required here rather than unconditionally:
+  # a missing dependency must not break the configuration of a skipped plugin.
+  find_package(google_cloud_cpp_speech REQUIRED)
+
+  yarp_add_plugin(yarp_googleSpeechTranscription)
+
+  target_sources(yarp_googleSpeechTranscription
+    PRIVATE
+      GoogleSpeechTranscription.cpp
+      GoogleSpeechTranscription.h
+  )
+
+  target_link_libraries(yarp_googleSpeechTranscription
+    PRIVATE
+      YARP::YARP_os
+      YARP::YARP_sig
+      YARP::YARP_dev
+      google-cloud-cpp::speech
+  )
+
+  yarp_install(
+    TARGETS yarp_googleSpeechTranscription
+    EXPORT yarp-device-googleSpeechTranscription
+    COMPONENT yarp-device-googleSpeechTranscription
+    LIBRARY DESTINATION ${YARP_DYNAMIC_PLUGINS_INSTALL_DIR}
+    ARCHIVE DESTINATION ${YARP_STATIC_PLUGINS_INSTALL_DIR}
+    YARP_INI DESTINATION ${YARP_PLUGIN_MANIFESTS_INSTALL_DIR}
+  )
+
+  # Tests and demo resources are only added when tests are compiled.
+  if(YARP_COMPILE_TESTS)
+    add_subdirectory(tests)
+    add_subdirectory(demos)
+  endif()
+
+  set_property(TARGET yarp_googleSpeechTranscription PROPERTY FOLDER "Plugins/Device")
+endif()
diff --git a/src/devices/googleSpeechTranscription/GoogleSpeechTranscription.cpp b/src/devices/googleSpeechTranscription/GoogleSpeechTranscription.cpp
@@ -0,0 +1,123 @@
+/*
+ * SPDX-FileCopyrightText: 2023 Istituto Italiano di Tecnologia (IIT)
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+#ifndef _USE_MATH_DEFINES
+#define _USE_MATH_DEFINES
+#endif
+
+#include "GoogleSpeechTranscription.h"
+
+#include <yarp/sig/SoundFile.h>
+
+#include <yarp/os/LogComponent.h>
+#include <yarp/os/LogStream.h>
+#include <fstream>
+
+#include <cmath>
+
+using namespace yarp::os;
+using namespace yarp::dev;
+
+
+YARP_LOG_COMPONENT(GOOGLESPEECHTRANSCR, "yarp.googleSpeechTranscription", yarp::os::Log::TraceType);
+
+
+// Construction performs no work: the device is configured later by open().
+GoogleSpeechTranscription::GoogleSpeechTranscription() = default;
+
+/**
+ * Configures the device from the given options and creates the speech client.
+ *
+ * Options read from \a config:
+ *  - "__offline"         (optional) m_offline is set to true only when the value equals 1;
+ *  - "language_code"     (required) language passed to the recognition configuration;
+ *  - "sample_rate_hertz" (optional, default 16000) sample rate passed to the recognition configuration.
+ *
+ * The audio encoding is always set to LINEAR16.
+ *
+ * @param config the device configuration
+ * @return false if "language_code" is missing, true otherwise
+ */
+bool GoogleSpeechTranscription::open(yarp::os::Searchable &config)
+{
+    if (config.check("__offline"))
+    {
+        const int offlineFlag = config.find("__offline").asInt32();
+        m_offline = (offlineFlag == 1);
+    }
+
+    // The language code is mandatory: bail out before touching any other state.
+    const bool hasLanguageCode = config.check("language_code");
+    if (!hasLanguageCode)
+    {
+        yCError(GOOGLESPEECHTRANSCR) << "No language code specified";
+        return false;
+    }
+
+    m_sampleRate = config.check("sample_rate_hertz", yarp::os::Value(16000), "sample rate (int)").asInt32();
+    m_languageCode = config.find("language_code").asString();
+
+    // Fill in the recognition configuration reused by every transcribe() call.
+    m_audioConfig.set_language_code(m_languageCode);
+    m_audioConfig.set_encoding(google::cloud::speech::v1::RecognitionConfig::LINEAR16);
+    m_audioConfig.set_sample_rate_hertz(m_sampleRate);
+
+    m_client = std::make_shared<google::cloud::speech_v1::SpeechClient>(
+        google::cloud::speech_v1::MakeSpeechConnection());
+
+    return true;
+}
+
+/**
+ * Closes the device. Nothing is released explicitly here.
+ *
+ * @return always true
+ */
+bool GoogleSpeechTranscription::close()
+{
+    return true;
+}
+
+/**
+ * Sets the language used for the following transcriptions.
+ *
+ * @param language the language code to use; the special value "auto" is rejected
+ * @return false if \a language is "auto", true otherwise
+ */
+bool GoogleSpeechTranscription::setLanguage(const std::string& language)
+{
+    if (language == "auto")
+    {
+        yCError(GOOGLESPEECHTRANSCR) << "The \"auto\" option is not supported by this device";
+
+        return false;
+    }
+
+    // Keep the cached language code aligned with the recognition configuration,
+    // otherwise m_languageCode would still hold the value read in open().
+    m_languageCode = language;
+    m_audioConfig.set_language_code(m_languageCode);
+    return true;
+}
+
+/**
+ * Retrieves the language code currently stored in the recognition configuration.
+ *
+ * @param language output, filled with the current language code
+ * @return always true
+ */
+bool GoogleSpeechTranscription::getLanguage(std::string& language)
+{
+    const std::string& currentCode = m_audioConfig.language_code();
+    language = currentCode;
+    return true;
+}
+
+/**
+ * Sends a sound to the speech recognition service and returns the best alternative.
+ *
+ * Among all the alternatives of all the returned results, the one with the highest
+ * confidence is selected. An alternative whose confidence is 0.0 is still
+ * accepted when no other candidate has been selected yet, so that a valid
+ * transcript is not discarded just because its confidence is zero.
+ *
+ * @param sound the sound to transcribe; it must contain at least one sample and one channel
+ * @param transcription output, the selected transcript (empty if no alternative is returned)
+ * @param score output, the confidence of the selected transcript (0.0 if none)
+ * @return false if the device has no client, the sound is empty or the request fails; true otherwise
+ */
+bool GoogleSpeechTranscription::transcribe(const yarp::sig::Sound& sound, std::string& transcription, double& score)
+{
+    transcription.clear();
+    score = 0.0;
+
+    // The client is created by open(): without it the request cannot be sent.
+    if (!m_client)
+    {
+        yCError(GOOGLESPEECHTRANSCR) << "Device not opened: no speech client available";
+        return false;
+    }
+
+    if (sound.getSamples() == 0 ||
+        sound.getChannels() == 0)
+    {
+        yCError(GOOGLESPEECHTRANSCR) << "Invalid Sound sample received";
+        return false;
+    }
+
+    // Copy the samples into a contiguous buffer of 16 bit values and hand it
+    // over as raw bytes (the configuration set in open() declares LINEAR16).
+    auto rawSamples = sound.getNonInterleavedAudioRawData();
+    const std::vector<short> pcmBuffer(rawSamples.begin(), rawSamples.end());
+    google::cloud::speech::v1::RecognitionAudio audio;
+    audio.set_content(reinterpret_cast<const char*>(pcmBuffer.data()), pcmBuffer.size() * sizeof(short));
+
+    auto response = m_client->Recognize(m_audioConfig,audio);
+
+    if(!response)
+    {
+        yCError(GOOGLESPEECHTRANSCR) << "Could not perform audio transcription:" << response.status().message();
+        return false;
+    }
+
+    yCDebug(GOOGLESPEECHTRANSCR) << "Results size:" << response->results_size();
+    bool candidateFound = false;
+    for(int i=0; i<response->results_size(); i++)
+    {
+        // Bind by reference: copying the protobuf messages is not needed.
+        const auto& result = response->results(i);
+        yCDebug(GOOGLESPEECHTRANSCR) << i << "Alternative size:" << result.alternatives_size();
+        for(int j=0; j<result.alternatives_size(); j++)
+        {
+            const auto& alternative = result.alternatives(j);
+            const float tempConf = alternative.confidence();
+            yCDebug(GOOGLESPEECHTRANSCR) << "Alternative:" << alternative.SerializeAsString() << "Confidence:" << tempConf;
+            if(!candidateFound || tempConf > score)
+            {
+                candidateFound = true;
+                score = tempConf;
+                transcription = alternative.transcript();
+            }
+        }
+    }
+
+    return true;
+}
diff --git a/src/devices/googleSpeechTranscription/GoogleSpeechTranscription.h b/src/devices/googleSpeechTranscription/GoogleSpeechTranscription.h
@@ -0,0 +1,91 @@
+/*
+ * SPDX-FileCopyrightText: 2023 Istituto Italiano di Tecnologia (IIT)
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+#ifndef YARP_GOOGLESPEECHTRANSCR_H
+#define YARP_GOOGLESPEECHTRANSCR_H
+
+#include <yarp/dev/DeviceDriver.h>
+#include <yarp/dev/ISpeechTranscription.h>
+#include <yarp/sig/Sound.h>
+#include <yarp/os/Network.h>
+#include <algorithm>
+#include <memory>
+#include <vector>
+
+#include "google/cloud/speech/v1/speech_client.h"
+#include "google/protobuf/repeated_ptr_field.h"
+
+
+/**
+ *  @ingroup dev_impl_other
+ *
+ * \section googleSpeechTranscription
+ *
+ * \brief `googleSpeechTranscription`: A yarp device for speech transcription using google cloud cpp libraries
+ *
+ *  Parameters required by this device are:
+ * | Parameter name | SubParameter | Type    | Units          | Default Value | Required | Description                                        | Notes |
+ * |:--------------:|:------------:|:-------:|:--------------:|:-------------:|:--------:|:--------------------------------------------------:|:-----:|
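+ * | language_code  | -            | string  | -              | -             | Yes      | Language for speech transcription (e.g. "it-IT", "en-GB") |       |
+ * | sample_rate_hertz | -         | int     | Hz             | 16000         | No       | Sample rate of the audio sent for transcription    |       |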
+ *
+ *
+ * Example of a yarprobotinterface xml file using this device
+ *
+ * \code{.unparsed}
+ * <?xml version="1.0" encoding="UTF-8"?>
+ * <!DOCTYPE robot PUBLIC "-//YARP//DTD yarprobotinterface 3.0//EN" "http://www.yarp.it/DTD/yarprobotinterfaceV3.0.dtd">
+ * <robot name="googleTest" build="2" portprefix="/googleTranscr" xmlns:xi="http://www.w3.org/2001/XInclude">
+ *     <devices>
+ *         <device name="googleTranscr" type="googleSpeechTranscription">
+ *             <param name="language_code">
+ *                 it-IT
+ *             </param>
+ *         </device>
+ *
+ *         <device name="synthWrap" type="speechTranscription_nws_yarp">
+ *             <action phase="startup" level="5" type="attach">
+ *                 <paramlist name="networks">
+ *                     <elem name="subdeviceGoogle">
+ *                         googleTranscr
+ *                     </elem>
+ *                 </paramlist>
+ *             </action>
+ *             <action phase="shutdown" level="5" type="detach" />
+ *         </device>
+ *     </devices>
+ * </robot>
+ * \endcode
+ */
+
+class GoogleSpeechTranscription :
+        public yarp::dev::DeviceDriver,
+        public yarp::dev::ISpeechTranscription
+{
+public:
+    GoogleSpeechTranscription();
+    // The device is neither copyable nor movable.
+    GoogleSpeechTranscription(const GoogleSpeechTranscription&) = delete;
+    GoogleSpeechTranscription(GoogleSpeechTranscription&&) noexcept = delete;
+    GoogleSpeechTranscription& operator=(const GoogleSpeechTranscription&) = delete;
+    GoogleSpeechTranscription& operator=(GoogleSpeechTranscription&&) noexcept = delete;
+    ~GoogleSpeechTranscription() override = default;
+
+    // DeviceDriver
+    /// Reads the configuration, fills the recognition config and creates the client.
+    bool open(yarp::os::Searchable& config) override;
+    /// Closes the device; always succeeds.
+    bool close() override;
+
+    // yarp::dev::ISpeechTranscription
+    /// Sets the language code used for transcription; "auto" is rejected.
+    bool setLanguage(const std::string& language="auto") override;
+    /// Returns the language code currently stored in the recognition config.
+    bool getLanguage(std::string& language) override;
+    /// Transcribes \a sound, returning the alternative with the highest confidence.
+    bool transcribe(const yarp::sig::Sound& sound, std::string& transcription, double& score) override;
+
+private:
+    /// Sample rate passed to the recognition config; same default as the "sample_rate_hertz" option.
+    int          m_sampleRate{16000};
+    /// Set from the "__offline" option in open().
+    bool         m_offline{false};
+    /// Language code read from the "language_code" option.
+    std::string  m_languageCode;
+    /// Recognition configuration sent along with every request.
+    google::cloud::speech::v1::RecognitionConfig             m_audioConfig;
+    /// Client used to send the recognition requests; created in open().
+    std::shared_ptr<google::cloud::speech_v1::SpeechClient>  m_client{nullptr};
+};
+
+#endif // YARP_GOOGLESPEECHTRANSCR_H
diff --git a/src/devices/googleSpeechTranscription/demos/CMakeLists.txt b/src/devices/googleSpeechTranscription/demos/CMakeLists.txt
@@ -0,0 +1,12 @@
+# SPDX-FileCopyrightText: 2023-2023 Istituto Italiano di Tecnologia (IIT)
+# SPDX-License-Identifier: BSD-3-Clause
+
+# Name of the context directory that receives the demo files.
+# NOTE(review): this file belongs to the googleSpeechTranscription device but the
+# context is named after the synthesizer; looks copy-pasted, confirm before renaming
+# (the test code looks up the same context name).
+set(appname googleSpeechSynthesizer_demo)
+
+# Collect the files to install: robotinterface configurations, audio samples
+# and application descriptions.
+file(GLOB conf      ${CMAKE_CURRENT_SOURCE_DIR}/yarprobotinterface_xml/*.xml)
+file(GLOB data      ${CMAKE_CURRENT_SOURCE_DIR}/*.wav)
+file(GLOB apps      ${CMAKE_CURRENT_SOURCE_DIR}/*.xml)
+
+# Configurations and audio data go into the context directory,
+# application descriptions into the applications directory.
+yarp_install(FILES ${conf}    DESTINATION ${YARP_CONTEXTS_INSTALL_DIR}/${appname})
+yarp_install(FILES ${data}    DESTINATION ${YARP_CONTEXTS_INSTALL_DIR}/${appname})
+yarp_install(FILES ${apps}    DESTINATION ${YARP_APPLICATIONS_INSTALL_DIR})
diff --git a/src/devices/googleSpeechTranscription/demos/test_audio.wav b/src/devices/googleSpeechTranscription/demos/test_audio.wav
diff --git a/src/devices/googleSpeechTranscription/tests/CMakeLists.txt b/src/devices/googleSpeechTranscription/tests/CMakeLists.txt
@@ -0,0 +1,4 @@
+# SPDX-FileCopyrightText: 2023 Istituto Italiano di Tecnologia (IIT)
+# SPDX-License-Identifier: BSD-3-Clause
+
+# NOTE(review): the test is named after the synthesizer device although this
+# directory belongs to googleSpeechTranscription; presumably a placeholder, confirm.
+create_device_test (googleSpeechSynthesizer)
diff --git a/src/devices/googleSpeechTranscription/tests/googleSpeechSynthesizer_test.cpp b/src/devices/googleSpeechTranscription/tests/googleSpeechSynthesizer_test.cpp
@@ -0,0 +1,84 @@
+/*
+ * SPDX-FileCopyrightText: 2023 Istituto Italiano di Tecnologia (IIT)
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+#include <yarp/dev/ISpeechSynthesizer.h>
+#include <yarp/os/Network.h>
+#include <yarp/os/LogStream.h>
+#include <yarp/os/ResourceFinder.h>
+#include <yarp/dev/PolyDriver.h>
+#include <yarp/dev/WrapperSingle.h>
+
+#include <yarp/sig/Sound.h>
+#include <yarp/sig/SoundFile.h>
+
+#include <catch2/catch_amalgamated.hpp>
+#include <harness.h>
+
+using namespace yarp::dev;
+using namespace yarp::os;
+
+// Opens the googleSpeechSynthesizer device in offline mode and checks that the
+// language and voice setters/getters are coherent. No request is sent to the service.
+// NOTE(review): this test exercises the synthesizer device, not the transcription
+// one it is shipped with; presumably a placeholder until a real test is written.
+TEST_CASE("dev::googleSpeechSynthesizer_test", "[yarp::dev]")
+{
+    YARP_REQUIRE_PLUGIN("googleSpeechSynthesizer", "device");
+
+    Network::setLocalMode(true);
+
+    SECTION("Checking googleSpeechSynthesizer device")
+    {
+        ISpeechSynthesizer* iSynth{nullptr};
+        PolyDriver dd;
+
+        //read a test sound file from disk
+        yarp::sig::Sound snd;
+        yarp::os::ResourceFinder rf;
+
+        rf.setQuiet(false);
+        rf.setVerbose(true);
+
+        // The audio file is looked up in the demo context.
+        rf.setDefaultContext("googleSpeechSynthesizer_demo");
+        std::string ss = rf.findFile("test_audio.wav");
+        CHECK(!ss.empty());
+        yarp::sig::file::read(snd,ss.c_str());
+        CHECK(snd.getSamples()>0);
+
+        //"Checking opening device"
+        {
+            Property pcfg;
+            const std::string init_lang{"it-IT"};
+            const std::string init_voice{"it-IT-Wavenet-A"};
+            pcfg.put("device", "googleSpeechSynthesizer");
+            pcfg.put("language_code",init_lang);
+            pcfg.put("voice_name",init_voice);
+            pcfg.put("__offline", 1);
+            REQUIRE(dd.open(pcfg));
+            REQUIRE(dd.view(iSynth));
+        }
+
+        const std::string lang_to_set{"en-GB"};
+        const std::string voice_to_set{"en-GB-Neural2-C"};
+        std::string lang_code;
+        std::string voice_name;
+
+        // toSynthesize and outputSound are only used by the disabled checks below.
+        std::string toSynthesize{"This is a text to speech test"};
+        yarp::sig::Sound outputSound;
+        // Values written through the setters must be read back by the getters.
+        CHECK(iSynth->setLanguage(lang_to_set));
+        CHECK(iSynth->setVoice(voice_to_set));
+        CHECK(iSynth->getLanguage(lang_code));
+        CHECK(lang_code == lang_to_set);
+        CHECK(iSynth->getVoice(voice_name));
+        CHECK(voice_name == voice_to_set);
+
+        // For the moment being, actually using the API for test only purposes doesn't seem like a good idea since it will waste money
+        //CHECK(iSynth->synthesize(toSynthesize,outputSound));
+        //CHECK(outputSound == snd);
+
+        //"Close all polydrivers and check"
+        {
+            CHECK(dd.close());
+        }
+    }
+
+    Network::setLocalMode(false);
+}