YARP
Yet Another Robot Platform
 
Loading...
Searching...
No Matches
TtsDevice.cpp
Go to the documentation of this file.
1/*
2 * SPDX-FileCopyrightText: 2023 Istituto Italiano di Tecnologia (IIT)
3 * SPDX-License-Identifier: BSD-3-Clause
4 */
5
6#ifndef _USE_MATH_DEFINES
7#define _USE_MATH_DEFINES
8#endif
9
10#include "TtsDevice.h"
11
13#include <yarp/os/LogStream.h>
14
15// Include dr_mp3 for decoding MP3
16#define DR_MP3_IMPLEMENTATION
17#include "dr_mp3.h"
18
19#include <cmath>
20
21using namespace yarp::os;
22using namespace yarp::dev;
23
24
26
27
32
34{
35 if (!parseParams(config)) { return false; }
36 if(std::getenv(m_ENVS_api_key_name.c_str()) == nullptr)
37 {
38 yCError(TTSDEVICE) << "Environment variable" << m_ENVS_api_key_name << "not set";
39 return false;
40 }
41 if(std::getenv(m_ENVS_end_point_name.c_str()) == nullptr)
42 {
43 yCError(TTSDEVICE) << "Environment variable" << m_ENVS_end_point_name << "not set";
44 return false;
45 }
46 if(std::getenv(m_ENVS_deployment_id_name.c_str()) == nullptr)
47 {
48 yCError(TTSDEVICE) << "Environment variable" << m_ENVS_deployment_id_name << "not set";
49 return false;
50 }
51 if(std::getenv(m_ENVS_api_version_name.c_str()) == nullptr)
52 {
53 yCError(TTSDEVICE) << "Environment variable" << m_ENVS_api_version_name << "not set";
54 return false;
55 }
56 m_apiKey = std::getenv(m_ENVS_api_key_name.c_str());
57 std::string endpoint = std::getenv(m_ENVS_end_point_name.c_str());
58 std::string deployment_id = std::getenv(m_ENVS_deployment_id_name.c_str());
59 std::string api_version = std::getenv(m_ENVS_api_version_name.c_str());
60 m_url = endpoint + "/openai/deployments/" + deployment_id + "/audio/speech?api-version=" + api_version;
61
62 yCInfo(TTSDEVICE) << "Open";
63 return true;
64}
65
67{
68 yCInfo(TTSDEVICE) << "Close";
69 return true;
70}
71
72ReturnValue TtsDevice::setLanguage(const std::string& language)
73{
74 yCWarning(TTSDEVICE) << "setLanguage not implemented";
76}
77
78ReturnValue TtsDevice::getLanguage(std::string& language)
79{
80 yCWarning(TTSDEVICE) << "getLanguage not implemented";
82}
83
85{
86 if(voice_name.empty())
87 {
88 yCError(TTSDEVICE) << "setVoice not implemented";
89 return ReturnValue::return_code::return_value_error_generic;
90 }
91 if (!_voiceNameIsValid(voice_name)) {
92 yCError(TTSDEVICE) << "Invalid voice name" << voice_name;
93 return ReturnValue::return_code::return_value_error_generic;
94 }
95 m_voiceName = voice_name;
96
97 return ReturnValue_ok;
98}
99
101{
102 voice_name = m_voiceName;
103 return ReturnValue_ok;
104}
105
107{
108 yCWarning(TTSDEVICE) << "setSpeed not implemented";
110}
111
113{
114 yCWarning(TTSDEVICE) << "getSpeed not implemented";
116}
117
119{
120 yCWarning(TTSDEVICE) << "setPitch not implemented";
122}
123
125{
126 yCWarning(TTSDEVICE) << "getPitch not implemented";
128}
129
130ReturnValue TtsDevice::synthesize(const std::string& text, yarp::sig::Sound& sound)
131{
133 if (!curl) {
134 yCError(TTSDEVICE) << "Failed to initialize cURL";
135 return ReturnValue::return_code::return_value_error_generic;
136 }
137
138 std::string payload = "{\"model\": \"tts-1\", \"input\": \"" + _escapeJsonString(text) + "\", \"voice\": \""+ m_voiceName + "\"}";
139
140 std::vector<uint8_t> audioData;
141
142 struct curl_slist *headers = NULL;
143 headers = curl_slist_append(headers, ("api-key: " + m_apiKey).c_str());
144 headers = curl_slist_append(headers, "Content-Type: application/json");
145
146 curl_easy_setopt(curl, CURLOPT_URL, m_url.c_str());
152
154 curl_slist_free_all(headers);
156
157 if (res != CURLE_OK) {
158 yCError(TTSDEVICE) << "cURL request failed: " << curl_easy_strerror(res);
159 return ReturnValue::return_code::return_value_error_generic;
160 }
161
162 yCInfo(TTSDEVICE) << "Downloaded MP3 data: " << audioData.size() << " bytes";
163
164 // Decode MP3 using dr_mp3
165 drmp3 mp3;
166 if (!drmp3_init_memory(&mp3, audioData.data(), audioData.size(), NULL)) {
167 yCError(TTSDEVICE) << "Failed to decode MP3";
168 return ReturnValue::return_code::return_value_error_generic;
169 }
170
172 std::vector<int16_t> pcmData(totalFrames * mp3.channels);
173
176
177 pcmData.resize(samplesRead * mp3.channels);
178
179 yCInfo(TTSDEVICE) << "Decoded " << samplesRead << " frames, channels: " << mp3.channels;
180
181 sound.clear();
182 sound.resize(samplesRead, mp3.channels);
183 sound.setFrequency(mp3.sampleRate);
184
185 for (size_t i = 0; i < samplesRead; ++i) {
186 for (uint32_t ch = 0; ch < mp3.channels; ++ch) {
187 sound.set(pcmData[i * mp3.channels + ch], i, ch);
188 }
189 }
190
191 return ReturnValue_ok;
192}
193
194size_t TtsDevice::_writeCallback(void *contents, size_t size, size_t nmemb, std::vector<uint8_t> *output) {
195 size_t totalSize = size * nmemb;
196 output->insert(output->end(), (uint8_t *)contents, (uint8_t *)contents + totalSize);
197 return totalSize;
198}
199
200std::string TtsDevice::_escapeJsonString(const std::string &input) {
201 std::ostringstream ss;
202 for (const auto &c : input) {
203 switch (c) {
204 case '\"': ss << "\\\""; break;
205 case '\\': ss << "\\\\"; break;
206 case '\b': ss << "\\b"; break;
207 case '\f': ss << "\\f"; break;
208 case '\n': ss << "\\n"; break;
209 case '\r': ss << "\\r"; break;
210 case '\t': ss << "\\t"; break;
211 default:
212 if (static_cast<unsigned char>(c) < 0x20) {
213 ss << "\\u" << std::hex << std::setw(4) << std::setfill('0') << (int)c;
214 } else {
215 ss << c;
216 }
217 }
218 }
219 return ss.str();
220}
221
222bool TtsDevice::_voiceNameIsValid(const std::string& voice_name)
223{
224 if (std::find(VOICES.begin(), VOICES.end(), voice_name) != VOICES.end()) {
225 return true;
226 } else {
227 yCError(TTSDEVICE) << "Invalid voice name" << voice_name;
228 return false;
229 }
230}
#define ReturnValue_ok
Definition ReturnValue.h:77
#define YARP_METHOD_NOT_YET_IMPLEMENTED()
Definition ReturnValue.h:96
const yarp::os::LogComponent & TTSDEVICE()
Definition TtsDevice.cpp:25
bool parseParams(const yarp::os::Searchable &config) override
Parse the DeviceDriver parameters.
yarp::dev::ReturnValue setVoice(const std::string &voice_name="auto") override
Sets the voice set for speech synthesis.
Definition TtsDevice.cpp:84
bool close() override
Close the DeviceDriver.
Definition TtsDevice.cpp:66
yarp::dev::ReturnValue synthesize(const std::string &text, yarp::sig::Sound &sound) override
Performs the speech synthesis.
bool open(yarp::os::Searchable &config) override
Open the DeviceDriver.
Definition TtsDevice.cpp:33
yarp::dev::ReturnValue setLanguage(const std::string &language="auto") override
Sets the language for speech synthesis.
Definition TtsDevice.cpp:72
yarp::dev::ReturnValue setPitch(const double pitch) override
Sets the pitch for speech synthesis.
yarp::dev::ReturnValue getVoice(std::string &voice_name) override
Gets the current voice set for speech synthesis.
yarp::dev::ReturnValue getLanguage(std::string &language) override
Gets the current language set for speech synthesis.
Definition TtsDevice.cpp:78
yarp::dev::ReturnValue getSpeed(double &speed) override
Gets the current voice speed.
yarp::dev::ReturnValue setSpeed(const double speed=0) override
Sets the voice speed for speech synthesis.
yarp::dev::ReturnValue getPitch(double &pitch) override
Gets the current pitch set for speech synthesis.
A mini-server for performing network communication in the background.
@ TraceType
Definition Log.h:92
A base class for nested structures that can be searched.
Definition Searchable.h:31
Class for storing sounds. See Audio in YARP for additional documentation on YARP audio.
Definition Sound.h:25
void setFrequency(int freq)
Set the frequency of the sound (i.e.
Definition Sound.cpp:361
void clear()
set all the samples to zero (silence)
Definition Sound.cpp:315
void resize(size_t samples, size_t channels=1)
Set the sound size.
Definition Sound.cpp:270
void set(audio_sample value, size_t sample, size_t channel=0)
Definition Sound.cpp:334
drmp3_bool32 drmp3_init_memory(drmp3 *pMP3, const void *pData, size_t dataSize, const drmp3_allocation_callbacks *pAllocationCallbacks)
void drmp3_uninit(drmp3 *pMP3)
drmp3_uint64 drmp3_get_pcm_frame_count(drmp3 *pMP3)
drmp3_uint64 drmp3_read_pcm_frames_s16(drmp3 *pMP3, drmp3_uint64 framesToRead, drmp3_int16 *pBufferOut)
const std::vector< std::string > VOICES
ttsDevice: A YARP device for speech synthesis using Azure OpenAI APIs
Definition TtsDevice.h:31
#define yCInfo(component,...)
#define yCError(component,...)
#define yCWarning(component,...)
#define YARP_LOG_COMPONENT(name,...)
For streams capable of holding different kinds of content, check what they actually have.
An interface to the operating system, including Port based communication.