diff --git a/src/devices/CMakeLists.txt b/src/devices/CMakeLists.txt index ef268ee..1f04a64 100644 --- a/src/devices/CMakeLists.txt +++ b/src/devices/CMakeLists.txt @@ -2,4 +2,5 @@ # SPDX-License-Identifier: BSD-3-Clause add_subdirectory(googleSpeechSynthesizer) +add_subdirectory(googleSpeechTranscription) add_subdirectory(googleDialogflowCxChatBot) diff --git a/src/devices/googleSpeechTranscription/CMakeLists.txt b/src/devices/googleSpeechTranscription/CMakeLists.txt new file mode 100644 index 0000000..96dbca1 --- /dev/null +++ b/src/devices/googleSpeechTranscription/CMakeLists.txt @@ -0,0 +1,45 @@ +# SPDX-FileCopyrightText: 2023 Istituto Italiano di Tecnologia (IIT) +# SPDX-License-Identifier: BSD-3-Clause + +yarp_prepare_plugin(googleSpeechTranscription + CATEGORY device + TYPE GoogleSpeechTranscription + INCLUDE GoogleSpeechTranscription.h + INTERNAL ON +) + +if(NOT SKIP_googleSpeechTranscription) + find_package(google_cloud_cpp_speech REQUIRED) # looked up only when this plugin is built, so a disabled plugin does not need the dependency + + yarp_add_plugin(yarp_googleSpeechTranscription) + + target_sources(yarp_googleSpeechTranscription + PRIVATE + GoogleSpeechTranscription.cpp + GoogleSpeechTranscription.h + ) + + target_link_libraries(yarp_googleSpeechTranscription + PRIVATE + YARP::YARP_os + YARP::YARP_sig + YARP::YARP_dev + google-cloud-cpp::speech + ) + + yarp_install( + TARGETS yarp_googleSpeechTranscription + EXPORT yarp-device-googleSpeechTranscription + COMPONENT yarp-device-googleSpeechTranscription + LIBRARY DESTINATION ${YARP_DYNAMIC_PLUGINS_INSTALL_DIR} + ARCHIVE DESTINATION ${YARP_STATIC_PLUGINS_INSTALL_DIR} + YARP_INI DESTINATION ${YARP_PLUGIN_MANIFESTS_INSTALL_DIR} + ) + + if(YARP_COMPILE_TESTS) + add_subdirectory(tests) + add_subdirectory(demos) + endif() + + set_property(TARGET yarp_googleSpeechTranscription PROPERTY FOLDER "Plugins/Device") +endif() diff --git a/src/devices/googleSpeechTranscription/GoogleSpeechTranscription.cpp b/src/devices/googleSpeechTranscription/GoogleSpeechTranscription.cpp new file mode 100644 index 
0000000..995f40e --- /dev/null +++ b/src/devices/googleSpeechTranscription/GoogleSpeechTranscription.cpp @@ -0,0 +1,123 @@ +/* + * SPDX-FileCopyrightText: 2023 Istituto Italiano di Tecnologia (IIT) + * SPDX-License-Identifier: BSD-3-Clause + */ + +#ifndef _USE_MATH_DEFINES +#define _USE_MATH_DEFINES +#endif + +#include "GoogleSpeechTranscription.h" + +#include + +#include +#include +#include + +#include + +using namespace yarp::os; +using namespace yarp::dev; + + +YARP_LOG_COMPONENT(GOOGLESPEECHTRANSCR, "yarp.googleSpeechTranscription", yarp::os::Log::TraceType); + + +GoogleSpeechTranscription::GoogleSpeechTranscription() +{ + +} + +bool GoogleSpeechTranscription::open(yarp::os::Searchable &config) +{ + if(config.check("__offline")) + { + m_offline = config.find("__offline").asInt32() == 1; + } + if(!config.check("language_code")) + { + yCError(GOOGLESPEECHTRANSCR) << "No language code specified"; + + return false; + } + m_sampleRate = config.check("sample_rate_hertz", yarp::os::Value(16000), "sample rate (int)").asInt32(); + m_languageCode = config.find("language_code").asString(); + m_audioConfig.set_language_code(m_languageCode); + m_audioConfig.set_encoding(google::cloud::speech::v1::RecognitionConfig::LINEAR16); + m_audioConfig.set_sample_rate_hertz(m_sampleRate); + m_client = std::make_shared(google::cloud::speech_v1::MakeSpeechConnection()); + + return true; +} + +bool GoogleSpeechTranscription::close() +{ + return true; +} + +bool GoogleSpeechTranscription::setLanguage(const std::string& language) +{ + if(language == "auto") + { + yCError(GOOGLESPEECHTRANSCR) << "The \"auto\" option is not supported by this device"; + + return false; + } + + m_audioConfig.set_language_code(language); + return true; +} + +bool GoogleSpeechTranscription::getLanguage(std::string& language) +{ + language = m_audioConfig.language_code(); + + return true; +} + +bool GoogleSpeechTranscription::transcribe(const yarp::sig::Sound& sound, std::string& transcription, double& score) +{ 
+ transcription=""; + score = 0.0; + + if (sound.getSamples() == 0 || + sound.getChannels() == 0) + { + yCError(GOOGLESPEECHTRANSCR) << "Invalid Sound sample received"; + return false; + } + + google::cloud::speech::v1::RecognitionAudio audio; + auto rawData_tmp = sound.getNonInterleavedAudioRawData(); + auto rawData = std::vector(rawData_tmp.begin(), rawData_tmp.end()); + audio.set_content((char*)rawData.data(),rawData.size()*2); + + auto response = m_client->Recognize(m_audioConfig,audio); + + if(!response) + { + yCError(GOOGLESPEECHTRANSCR) << "Could not perform audio transcription:" << response.status().message(); + return false; + } + + yCDebug(GOOGLESPEECHTRANSCR) << "Results size:" << response->results_size(); + for(int i=0; iresults_size(); i++) + { + auto result = response->results(i); + yCDebug(GOOGLESPEECHTRANSCR) << i << "Alternative size:" << result.alternatives_size(); + for(int j=0; j score) + { + score = tempConf; + transcription = alternative.transcript(); + } + } + } + + return true; +} diff --git a/src/devices/googleSpeechTranscription/GoogleSpeechTranscription.h b/src/devices/googleSpeechTranscription/GoogleSpeechTranscription.h new file mode 100644 index 0000000..493ecb2 --- /dev/null +++ b/src/devices/googleSpeechTranscription/GoogleSpeechTranscription.h @@ -0,0 +1,91 @@ +/* + * SPDX-FileCopyrightText: 2023 Istituto Italiano di Tecnologia (IIT) + * SPDX-License-Identifier: BSD-3-Clause + */ + +#ifndef YARP_GOOGLESPEECHTRANSCR_H +#define YARP_GOOGLESPEECHTRANSCR_H + +#include +#include +#include +#include +#include +#include +#include + +#include "google/cloud/speech/v1/speech_client.h" +#include "google/protobuf/repeated_ptr_field.h" + + +/** + * @ingroup dev_impl_other + * + * \section googleSpeechTranscription + * + * \brief `googleSpeechTranscription`: A yarp device for speech transcription using google cloud cpp libraries + * + * Parameters required by this device are: + * | Parameter name | SubParameter | Type | Units | Default Value | 
Required | Description | Notes | + * |:--------------:|:------------:|:-------:|:--------------:|:-------------:|:--------:|:--------------------------------------------------:|:-----:| + * | language_code | - | string | - | - | Yes | Language of the audio to transcribe (e.g. "it-IT", "en-GB") | | + * + * + * Example of an xml configuration file that uses this device + * + * \code{.unparsed} + * + * + * + * + * + * + * it-IT + * + * + * + * + * + * + * + * googleTranscr + * + * + * + * + * + * + * + * \endcode + */ + +class GoogleSpeechTranscription : + public yarp::dev::DeviceDriver, + public yarp::dev::ISpeechTranscription +{ +public: + GoogleSpeechTranscription(); + GoogleSpeechTranscription(const GoogleSpeechTranscription&) = delete; + GoogleSpeechTranscription(GoogleSpeechTranscription&&) noexcept = delete; + GoogleSpeechTranscription& operator=(const GoogleSpeechTranscription&) = delete; + GoogleSpeechTranscription& operator=(GoogleSpeechTranscription&&) noexcept = delete; + ~GoogleSpeechTranscription() override = default; + + // DeviceDriver + bool open(yarp::os::Searchable& config) override; + bool close() override; + + // yarp::dev::ISpeechTranscription + bool setLanguage(const std::string& language="auto") override; + bool getLanguage(std::string& language) override; + bool transcribe(const yarp::sig::Sound& sound, std::string& transcription, double& score) override; + +private: + int m_sampleRate; // NOTE(review): no in-class initializer; only assigned in open() + bool m_offline{false}; // set from the "__offline" option in open(); not read by any method shown in this diff + std::string m_languageCode; + google::cloud::speech::v1::RecognitionConfig m_audioConfig; + std::shared_ptr m_client{nullptr}; +}; + +#endif // YARP_GOOGLESPEECHTRANSCR_H diff --git a/src/devices/googleSpeechTranscription/demos/CMakeLists.txt b/src/devices/googleSpeechTranscription/demos/CMakeLists.txt new file mode 100644 index 0000000..16336fd --- /dev/null +++ b/src/devices/googleSpeechTranscription/demos/CMakeLists.txt @@ -0,0 +1,12 @@ +# SPDX-FileCopyrightText: 2023-2023 Istituto Italiano di Tecnologia (IIT) +# SPDX-License-Identifier: BSD-3-Clause + 
+set(appname googleSpeechSynthesizer_demo) + +file(GLOB conf ${CMAKE_CURRENT_SOURCE_DIR}/yarprobotinterface_xml/*.xml) +file(GLOB data ${CMAKE_CURRENT_SOURCE_DIR}/*.wav) +file(GLOB apps ${CMAKE_CURRENT_SOURCE_DIR}/*.xml) + +yarp_install(FILES ${conf} DESTINATION ${YARP_CONTEXTS_INSTALL_DIR}/${appname}) +yarp_install(FILES ${data} DESTINATION ${YARP_CONTEXTS_INSTALL_DIR}/${appname}) +yarp_install(FILES ${apps} DESTINATION ${YARP_APPLICATIONS_INSTALL_DIR}) diff --git a/src/devices/googleSpeechTranscription/demos/test_audio.wav b/src/devices/googleSpeechTranscription/demos/test_audio.wav new file mode 100644 index 0000000..1b1e7ab Binary files /dev/null and b/src/devices/googleSpeechTranscription/demos/test_audio.wav differ diff --git a/src/devices/googleSpeechTranscription/tests/CMakeLists.txt b/src/devices/googleSpeechTranscription/tests/CMakeLists.txt new file mode 100644 index 0000000..f5030b5 --- /dev/null +++ b/src/devices/googleSpeechTranscription/tests/CMakeLists.txt @@ -0,0 +1,4 @@ +# SPDX-FileCopyrightText: 2023 Istituto Italiano di Tecnologia (IIT) +# SPDX-License-Identifier: BSD-3-Clause + +create_device_test (googleSpeechSynthesizer) diff --git a/src/devices/googleSpeechTranscription/tests/googleSpeechSynthesizer_test.cpp b/src/devices/googleSpeechTranscription/tests/googleSpeechSynthesizer_test.cpp new file mode 100644 index 0000000..dd484bb --- /dev/null +++ b/src/devices/googleSpeechTranscription/tests/googleSpeechSynthesizer_test.cpp @@ -0,0 +1,84 @@ +/* + * SPDX-FileCopyrightText: 2023 Istituto Italiano di Tecnologia (IIT) + * SPDX-License-Identifier: BSD-3-Clause + */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +using namespace yarp::dev; +using namespace yarp::os; + +TEST_CASE("dev::googleSpeechSynthesizer_test", "[yarp::dev]") +{ + YARP_REQUIRE_PLUGIN("googleSpeechSynthesizer", "device"); + + Network::setLocalMode(true); + + SECTION("Checking googleSpeechSynthesizer device") + { + 
ISpeechSynthesizer* iSynth{nullptr}; + PolyDriver dd; + + // Load test_audio.wav, located through the ResourceFinder in the "googleSpeechSynthesizer_demo" context + yarp::sig::Sound snd; + yarp::os::ResourceFinder rf; + + rf.setQuiet(false); + rf.setVerbose(true); + + rf.setDefaultContext("googleSpeechSynthesizer_demo"); + std::string ss = rf.findFile("test_audio.wav"); + CHECK(!ss.empty()); + yarp::sig::file::read(snd,ss.c_str()); + CHECK(snd.getSamples()>0); + + // Open the device in offline mode. NOTE(review): this opens "googleSpeechSynthesizer" and views ISpeechSynthesizer, not the googleSpeechTranscription device added by this diff; presumably carried over from the synthesizer tests, confirm and retarget + { + Property pcfg; + const std::string init_lang{"it-IT"}; + const std::string init_voice{"it-IT-Wavenet-A"}; + pcfg.put("device", "googleSpeechSynthesizer"); + pcfg.put("language_code",init_lang); + pcfg.put("voice_name",init_voice); + pcfg.put("__offline", 1); + REQUIRE(dd.open(pcfg)); + REQUIRE(dd.view(iSynth)); + } + + const std::string lang_to_set{"en-GB"}; + const std::string voice_to_set{"en-GB-Neural2-C"}; + std::string lang_code; + std::string voice_name; + + std::string toSynthesize{"This is a text to speech test"}; + yarp::sig::Sound outputSound; + CHECK(iSynth->setLanguage(lang_to_set)); + CHECK(iSynth->setVoice(voice_to_set)); + CHECK(iSynth->getLanguage(lang_code)); + CHECK(lang_code == lang_to_set); + CHECK(iSynth->getVoice(voice_name)); + CHECK(voice_name == voice_to_set); + + // The two checks below stay disabled for now: calling the real API purely for testing would cost money on every run + //CHECK(iSynth->synthesize(toSynthesize,outputSound)); + //CHECK(outputSound == snd); + + // Close the PolyDriver and check that it succeeds + { + CHECK(dd.close()); + } + } + + Network::setLocalMode(false); +}