Skip to content

Commit

Permalink
Added ISpeechTranscription device
Browse files Browse the repository at this point in the history
The device works, but no test is available for it at the moment; a test still needs to be implemented.
  • Loading branch information
elandini84 committed Oct 25, 2023
1 parent ea6274e commit 32480ef
Show file tree
Hide file tree
Showing 8 changed files with 360 additions and 0 deletions.
1 change: 1 addition & 0 deletions src/devices/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@
# SPDX-License-Identifier: BSD-3-Clause

add_subdirectory(googleSpeechSynthesizer)
add_subdirectory(googleSpeechTranscription)
add_subdirectory(googleDialogflowCxChatBot)
45 changes: 45 additions & 0 deletions src/devices/googleSpeechTranscription/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# SPDX-FileCopyrightText: 2023 Istituto Italiano di Tecnologia (IIT)
# SPDX-License-Identifier: BSD-3-Clause

# Declare the googleSpeechTranscription device plugin: the device class is
# GoogleSpeechTranscription, declared in GoogleSpeechTranscription.h.
yarp_prepare_plugin(googleSpeechTranscription
CATEGORY device
TYPE GoogleSpeechTranscription
INCLUDE GoogleSpeechTranscription.h
INTERNAL ON
)

# The Google Cloud C++ speech client library is a hard requirement of this device.
# NOTE(review): this lookup sits outside the SKIP_googleSpeechTranscription guard below, so it
# runs (and, being REQUIRED, can fail the configure step) even when the plugin is skipped —
# confirm this is intended.
find_package(google_cloud_cpp_speech REQUIRED)

# Build and install the plugin only when it has not been skipped.
if(NOT SKIP_googleSpeechTranscription)
yarp_add_plugin(yarp_googleSpeechTranscription)

target_sources(yarp_googleSpeechTranscription
PRIVATE
GoogleSpeechTranscription.cpp
GoogleSpeechTranscription.h
)

# YARP core libraries plus the Google Cloud speech client used by the device.
target_link_libraries(yarp_googleSpeechTranscription
PRIVATE
YARP::YARP_os
YARP::YARP_sig
YARP::YARP_dev
google-cloud-cpp::speech
)

yarp_install(
TARGETS yarp_googleSpeechTranscription
EXPORT yarp-device-googleSpeechTranscription
COMPONENT yarp-device-googleSpeechTranscription
LIBRARY DESTINATION ${YARP_DYNAMIC_PLUGINS_INSTALL_DIR}
ARCHIVE DESTINATION ${YARP_STATIC_PLUGINS_INSTALL_DIR}
YARP_INI DESTINATION ${YARP_PLUGIN_MANIFESTS_INSTALL_DIR}
)

# The tests and demos sub-directories are added only when tests are being compiled.
if(YARP_COMPILE_TESTS)
add_subdirectory(tests)
add_subdirectory(demos)
endif()

# Group the target under "Plugins/Device" in IDE project views.
set_property(TARGET yarp_googleSpeechTranscription PROPERTY FOLDER "Plugins/Device")
endif()
123 changes: 123 additions & 0 deletions src/devices/googleSpeechTranscription/GoogleSpeechTranscription.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
/*
* SPDX-FileCopyrightText: 2023 Istituto Italiano di Tecnologia (IIT)
* SPDX-License-Identifier: BSD-3-Clause
*/

#ifndef _USE_MATH_DEFINES
#define _USE_MATH_DEFINES
#endif

#include "GoogleSpeechTranscription.h"

#include <yarp/sig/SoundFile.h>

#include <yarp/os/LogComponent.h>
#include <yarp/os/LogStream.h>
#include <fstream>

#include <cmath>

using namespace yarp::os;
using namespace yarp::dev;


// Log component used by the yCError/yCDebug calls of this device.
YARP_LOG_COMPONENT(GOOGLESPEECHTRANSCR, "yarp.googleSpeechTranscription", yarp::os::Log::TraceType);


/**
 * Default constructor. It performs no work of its own: the device is
 * configured later, in open().
 */
GoogleSpeechTranscription::GoogleSpeechTranscription() = default;

/**
 * Configures the device and creates the Google speech client.
 *
 * Recognised parameters:
 *  - language_code     (string, required): language stored in the recognition config.
 *  - sample_rate_hertz (int, optional, default 16000): sample rate stored in the recognition config.
 *  - __offline         (int, optional): the value 1 sets the internal offline flag.
 *
 * All parameters are validated before any member is modified, so a failed
 * open() leaves the previous configuration untouched.
 *
 * @param config the device configuration.
 * @return true on success; false when language_code is missing, empty or
 *         "auto", or when sample_rate_hertz is not a positive number.
 */
bool GoogleSpeechTranscription::open(yarp::os::Searchable &config)
{
    if (config.check("__offline"))
    {
        // NOTE(review): the flag is stored but not consulted anywhere in the code visible here.
        m_offline = config.find("__offline").asInt32() == 1;
    }

    if (!config.check("language_code"))
    {
        yCError(GOOGLESPEECHTRANSCR) << "No language code specified";
        return false;
    }

    const std::string languageCode = config.find("language_code").asString();
    if (languageCode.empty())
    {
        // The key is present but carries no usable value.
        yCError(GOOGLESPEECHTRANSCR) << "No language code specified";
        return false;
    }
    if (languageCode == "auto")
    {
        // Same restriction (and message) enforced by setLanguage().
        yCError(GOOGLESPEECHTRANSCR) << "The \"auto\" option is not supported by this device";
        return false;
    }

    const int sampleRate = config.check("sample_rate_hertz", yarp::os::Value(16000), "sample rate (int)").asInt32();
    if (sampleRate <= 0)
    {
        // Fail early instead of letting every later recognition request be rejected.
        yCError(GOOGLESPEECHTRANSCR) << "Invalid sample_rate_hertz:" << sampleRate;
        return false;
    }

    m_sampleRate = sampleRate;
    m_languageCode = languageCode;

    // The audio is always sent as 16 bit linear PCM (see transcribe()).
    m_audioConfig.set_language_code(m_languageCode);
    m_audioConfig.set_encoding(google::cloud::speech::v1::RecognitionConfig::LINEAR16);
    m_audioConfig.set_sample_rate_hertz(m_sampleRate);

    m_client = std::make_shared<google::cloud::speech_v1::SpeechClient>(google::cloud::speech_v1::MakeSpeechConnection());

    return true;
}

/**
 * Closes the device. No action is performed here, so the call always
 * reports success.
 *
 * @return always true.
 */
bool GoogleSpeechTranscription::close()
{
    constexpr bool closedSuccessfully = true;
    return closedSuccessfully;
}

bool GoogleSpeechTranscription::setLanguage(const std::string& language)
{
if(language == "auto")
{
yCError(GOOGLESPEECHTRANSCR) << "The \"auto\" option is not supported by this device";

return false;
}

m_audioConfig.set_language_code(language);
return true;
}

/**
 * Retrieves the language code currently stored in the recognition config.
 *
 * @param language filled with the current language code.
 * @return always true.
 */
bool GoogleSpeechTranscription::getLanguage(std::string& language)
{
    const std::string& currentCode = m_audioConfig.language_code();
    language = currentCode;
    return true;
}

/**
 * Sends a sound to the Google speech service and returns the best transcription.
 *
 * Among all the alternatives of all the results in the response, the one with
 * the highest confidence is selected. The first alternative found is always
 * taken as the initial candidate, so a transcription is still returned when
 * every confidence is reported as 0.0 (the value read when the field is unset).
 *
 * NOTE(review): the results of a response may refer to consecutive portions of
 * the audio; only a single alternative is returned here, as before — confirm
 * whether the per-result transcripts should be concatenated instead.
 *
 * @param sound         the audio to transcribe; it must contain at least one sample and one channel.
 * @param transcription filled with the selected transcript (empty when nothing was recognised).
 * @param score         filled with the confidence of the selected transcript (0.0 when nothing was recognised).
 * @return false when the device has not been opened, the sound is empty or the
 *         request fails; true otherwise (also when the response holds no result).
 */
bool GoogleSpeechTranscription::transcribe(const yarp::sig::Sound& sound, std::string& transcription, double& score)
{
    transcription.clear();
    score = 0.0;

    // m_client is created by open(); without it there is nothing to call.
    if (!m_client)
    {
        yCError(GOOGLESPEECHTRANSCR) << "The speech client is not available: the device has not been opened";
        return false;
    }

    if (sound.getSamples() == 0 ||
        sound.getChannels() == 0)
    {
        yCError(GOOGLESPEECHTRANSCR) << "Invalid Sound sample received";
        return false;
    }

    // Copy the samples into a contiguous buffer of 16 bit values, matching the
    // LINEAR16 encoding set in open().
    const auto rawSamples = sound.getNonInterleavedAudioRawData();
    const std::vector<short> pcmSamples(rawSamples.begin(), rawSamples.end());

    google::cloud::speech::v1::RecognitionAudio audio;
    audio.set_content(reinterpret_cast<const char*>(pcmSamples.data()), pcmSamples.size() * sizeof(short));

    auto response = m_client->Recognize(m_audioConfig, audio);

    if (!response)
    {
        yCError(GOOGLESPEECHTRANSCR) << "Could not perform audio transcription:" << response.status().message();
        return false;
    }

    yCDebug(GOOGLESPEECHTRANSCR) << "Results size:" << response->results_size();

    bool candidateFound = false;
    for (int i = 0; i < response->results_size(); i++)
    {
        // Bind by reference: copying protobuf messages at every iteration is not needed.
        const auto& result = response->results(i);
        yCDebug(GOOGLESPEECHTRANSCR) << i << "Alternative size:" << result.alternatives_size();
        for (int j = 0; j < result.alternatives_size(); j++)
        {
            const auto& alternative = result.alternatives(j);
            const float confidence = alternative.confidence();
            // Log the readable transcript rather than the binary serialization of the message.
            yCDebug(GOOGLESPEECHTRANSCR) << "Alternative:" << alternative.transcript() << "Confidence:" << confidence;
            if (!candidateFound || confidence > score)
            {
                candidateFound = true;
                score = confidence;
                transcription = alternative.transcript();
            }
        }
    }

    return true;
}
91 changes: 91 additions & 0 deletions src/devices/googleSpeechTranscription/GoogleSpeechTranscription.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
/*
* SPDX-FileCopyrightText: 2023 Istituto Italiano di Tecnologia (IIT)
* SPDX-License-Identifier: BSD-3-Clause
*/

#ifndef YARP_GOOGLESPEECHTRANSCR_H
#define YARP_GOOGLESPEECHTRANSCR_H

#include <yarp/dev/DeviceDriver.h>
#include <yarp/dev/ISpeechTranscription.h>
#include <yarp/sig/Sound.h>
#include <yarp/os/Network.h>
#include <algorithm>
#include <memory>
#include <vector>

#include "google/cloud/speech/v1/speech_client.h"
#include "google/protobuf/repeated_ptr_field.h"


/**
* @ingroup dev_impl_other
*
* \section googleSpeechTranscription
*
* \brief `googleSpeechTranscription`: A yarp device for speech transcription using google cloud cpp libraries
*
* Parameters required by this device are:
* | Parameter name | SubParameter | Type | Units | Default Value | Required | Description | Notes |
* |:--------------:|:------------:|:-------:|:--------------:|:-------------:|:--------:|:--------------------------------------------------:|:-----:|
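* | language_code | - | string | - | - | Yes | Language of the speech to transcribe (e.g. "it-IT", "en-GB") | |
* | sample_rate_hertz | - | int | Hz | 16000 | No | Sample rate of the audio sent for transcription | |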
*
*
* Example of an xml configuration file using this device:
*
* \code{.unparsed}
* <?xml version="1.0" encoding="UTF-8"?>
* <!DOCTYPE robot PUBLIC "-//YARP//DTD yarprobotinterface 3.0//EN" "http://www.yarp.it/DTD/yarprobotinterfaceV3.0.dtd">
* <robot name="googleTest" build="2" portprefix="/googleTranscr" xmlns:xi="http://www.w3.org/2001/XInclude">
* <devices>
* <device name="googleTranscr" type="googleSpeechTranscription">
* <param name="language_code">
* it-IT
* </param>
* </device>
*
* <device name="synthWrap" type="speechTranscription_nws_yarp">
* <action phase="startup" level="5" type="attach">
* <paramlist name="networks">
* <elem name="subdeviceGoogle">
* googleTranscr
* </elem>
* </paramlist>
* </action>
* <action phase="shutdown" level="5" type="detach" />
* </device>
* </devices>
* </robot>
* \endcode
*/

class GoogleSpeechTranscription :
        public yarp::dev::DeviceDriver,
        public yarp::dev::ISpeechTranscription
{
public:
    GoogleSpeechTranscription();
    // The device is neither copyable nor movable.
    GoogleSpeechTranscription(const GoogleSpeechTranscription&) = delete;
    GoogleSpeechTranscription(GoogleSpeechTranscription&&) noexcept = delete;
    GoogleSpeechTranscription& operator=(const GoogleSpeechTranscription&) = delete;
    GoogleSpeechTranscription& operator=(GoogleSpeechTranscription&&) noexcept = delete;
    ~GoogleSpeechTranscription() override = default;

    // DeviceDriver
    bool open(yarp::os::Searchable& config) override;
    bool close() override;

    // yarp::dev::ISpeechTranscription
    bool setLanguage(const std::string& language="auto") override;
    bool getLanguage(std::string& language) override;
    bool transcribe(const yarp::sig::Sound& sound, std::string& transcription, double& score) override;

private:
    // Sample rate (Hz) written into the recognition config. Initialised to the
    // same default used by open(), so it never holds an indeterminate value.
    int m_sampleRate{16000};
    // Set by the "__offline" configuration parameter.
    bool m_offline{false};
    // Language code read from the configuration.
    std::string m_languageCode;
    // Recognition settings (language, encoding, sample rate) sent with every request.
    google::cloud::speech::v1::RecognitionConfig m_audioConfig;
    // Google speech client; it stays null until open() creates it.
    std::shared_ptr<google::cloud::speech_v1::SpeechClient> m_client{nullptr};
};

#endif // YARP_GOOGLESPEECHTRANSCR_H
12 changes: 12 additions & 0 deletions src/devices/googleSpeechTranscription/demos/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# SPDX-FileCopyrightText: 2023-2023 Istituto Italiano di Tecnologia (IIT)
# SPDX-License-Identifier: BSD-3-Clause

# Name of the context in which the demo files of this device are installed.
# It is specific to the transcription device, so that its files do not end up
# in the context of the googleSpeechSynthesizer demos.
set(appname googleSpeechTranscription_demo)

# Collect the files to install: yarprobotinterface configurations, audio
# samples and application descriptions.
file(GLOB conf ${CMAKE_CURRENT_SOURCE_DIR}/yarprobotinterface_xml/*.xml)
file(GLOB data ${CMAKE_CURRENT_SOURCE_DIR}/*.wav)
file(GLOB apps ${CMAKE_CURRENT_SOURCE_DIR}/*.xml)

# Configurations and audio samples go into the demo context, applications
# into the applications directory.
yarp_install(FILES ${conf} DESTINATION ${YARP_CONTEXTS_INSTALL_DIR}/${appname})
yarp_install(FILES ${data} DESTINATION ${YARP_CONTEXTS_INSTALL_DIR}/${appname})
yarp_install(FILES ${apps} DESTINATION ${YARP_APPLICATIONS_INSTALL_DIR})
Binary file not shown.
4 changes: 4 additions & 0 deletions src/devices/googleSpeechTranscription/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# SPDX-FileCopyrightText: 2023 Istituto Italiano di Tecnologia (IIT)
# SPDX-License-Identifier: BSD-3-Clause

# NOTE(review): this directory belongs to the googleSpeechTranscription device, yet the test is
# registered under the googleSpeechSynthesizer name — it looks like a copy-paste placeholder
# that may clash with the synthesizer's own test; confirm and rename once a transcription test exists.
create_device_test (googleSpeechSynthesizer)
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
/*
* SPDX-FileCopyrightText: 2023 Istituto Italiano di Tecnologia (IIT)
* SPDX-License-Identifier: BSD-3-Clause
*/

#include <yarp/dev/ISpeechSynthesizer.h>
#include <yarp/os/Network.h>
#include <yarp/os/LogStream.h>
#include <yarp/os/ResourceFinder.h>
#include <yarp/dev/PolyDriver.h>
#include <yarp/dev/WrapperSingle.h>

#include <yarp/sig/Sound.h>
#include <yarp/sig/SoundFile.h>

#include <catch2/catch_amalgamated.hpp>
#include <harness.h>

using namespace yarp::dev;
using namespace yarp::os;

// Opens the googleSpeechSynthesizer device with the "__offline" flag and checks its
// language/voice setters and getters; the actual synthesis call is left commented out.
// NOTE(review): this test exercises the synthesizer device, not googleSpeechTranscription,
// although it was added together with the transcription device — presumably a placeholder
// copy to be replaced by a real transcription test; confirm.
TEST_CASE("dev::googleSpeechSynthesizer_test", "[yarp::dev]")
{
// The test depends on the googleSpeechSynthesizer device plugin.
YARP_REQUIRE_PLUGIN("googleSpeechSynthesizer", "device");

// Local mode is enabled for the duration of the test and disabled again at the end.
Network::setLocalMode(true);

SECTION("Checking googleSpeechSynthesizer device")
{
ISpeechSynthesizer* iSynth{nullptr};
PolyDriver dd;

//read a test sound file from disk
yarp::sig::Sound snd;
yarp::os::ResourceFinder rf;

rf.setQuiet(false);
rf.setVerbose(true);

// The reference audio file is looked up in the googleSpeechSynthesizer_demo context.
rf.setDefaultContext("googleSpeechSynthesizer_demo");
std::string ss = rf.findFile("test_audio.wav");
CHECK(!ss.empty());
yarp::sig::file::read(snd,ss.c_str());
CHECK(snd.getSamples()>0);

//"Checking opening device"
{
Property pcfg;
const std::string init_lang{"it-IT"};
const std::string init_voice{"it-IT-Wavenet-A"};
pcfg.put("device", "googleSpeechSynthesizer");
pcfg.put("language_code",init_lang);
pcfg.put("voice_name",init_voice);
// "__offline" is passed as 1 for this test.
pcfg.put("__offline", 1);
REQUIRE(dd.open(pcfg));
REQUIRE(dd.view(iSynth));
}

const std::string lang_to_set{"en-GB"};
const std::string voice_to_set{"en-GB-Neural2-C"};
std::string lang_code;
std::string voice_name;

// toSynthesize and outputSound are only referenced by the commented-out synthesis check below.
std::string toSynthesize{"This is a text to speech test"};
yarp::sig::Sound outputSound;
// Set language and voice, then read them back and compare.
CHECK(iSynth->setLanguage(lang_to_set));
CHECK(iSynth->setVoice(voice_to_set));
CHECK(iSynth->getLanguage(lang_code));
CHECK(lang_code == lang_to_set);
CHECK(iSynth->getVoice(voice_name));
CHECK(voice_name == voice_to_set);

// For the moment being, actually using the API for test only purposes doesn't seem like a good idea since it will waste money
//CHECK(iSynth->synthesize(toSynthesize,outputSound));
//CHECK(outputSound == snd);

//"Close all polydrivers and check"
{
CHECK(dd.close());
}
}

Network::setLocalMode(false);
}

0 comments on commit 32480ef

Please sign in to comment.