YARP
Yet Another Robot Platform
 
Loading...
Searching...
No Matches
WhisperDevice.cpp
Go to the documentation of this file.
1/*
2 * SPDX-FileCopyrightText: 2023 Istituto Italiano di Tecnologia (IIT)
3 * SPDX-License-Identifier: BSD-3-Clause
4 */
5
6#ifndef _USE_MATH_DEFINES
7#define _USE_MATH_DEFINES
8#endif
9
10#include "WhisperDevice.h"
11
13#include <yarp/os/LogStream.h>
14
15
16#include <cmath>
17
18using namespace yarp::os;
19using namespace yarp::dev;
20
21
23
24
29
// Body of the device open routine (its signature line is not part of this listing).
// Returns false if parameter parsing fails or if any required environment variable is
// unset; otherwise stores the API key, builds the request URL and returns true.
31{
32 if (!parseParams(config)) { return false; }
33
// The four environment variables whose names are held in the m_ENVS_* members must all
// be set. Each missing one is reported through yCError and aborts the open.
34 if(std::getenv(m_ENVS_api_key_name.c_str()) == nullptr)
35 {
36 yCError(WHISPERDEVICE) << "Environment variable" << m_ENVS_api_key_name << "not set";
37 return false;
38 }
39 if(std::getenv(m_ENVS_end_point_name.c_str()) == nullptr)
40 {
41 yCError(WHISPERDEVICE) << "Environment variable" << m_ENVS_end_point_name << "not set";
42 return false;
43 }
44 if(std::getenv(m_ENVS_deployment_id_name.c_str()) == nullptr)
45 {
46 yCError(WHISPERDEVICE) << "Environment variable" << m_ENVS_deployment_id_name << "not set";
47 return false;
48 }
49 if(std::getenv(m_ENVS_api_version_name.c_str()) == nullptr)
50 {
51 yCError(WHISPERDEVICE) << "Environment variable" << m_ENVS_api_version_name << "not set";
52 return false;
53 }
// All variables are known to be non-null here, so constructing std::string from them is safe.
54 m_apiKey = std::getenv(m_ENVS_api_key_name.c_str());
55 std::string endpoint = std::getenv(m_ENVS_end_point_name.c_str());
56 std::string deployment_id = std::getenv(m_ENVS_deployment_id_name.c_str());
57 std::string api_version = std::getenv(m_ENVS_api_version_name.c_str());
// Request URL layout: <endpoint>/openai/deployments/<deployment id>/audio/transcriptions?api-version=<version>
58 m_url = endpoint + "/openai/deployments/" + deployment_id + "/audio/transcriptions?api-version=" + api_version;
59
60 yCInfo(WHISPERDEVICE) << "Open";
61 return true;
62}
63
// Body of the device close routine (its signature line is not part of this listing).
// The visible code only logs "Close" and reports success; no resources are released here.
65{
66 yCInfo(WHISPERDEVICE) << "Close";
67 return true;
68}
69
// Language selection is not supported: the visible body ignores `language` and only
// logs a warning.
// NOTE(review): no return statement is visible in this listing (original line 73 is
// absent) - confirm the returned value against the real file.
70ReturnValue WhisperDevice::setLanguage(const std::string& language)
71{
72 yCWarning(WHISPERDEVICE) << "setLanguage not implemented";
74}
75
// Body of the language getter (its signature line is not part of this listing).
// The visible code only logs a warning that the operation is not implemented.
// NOTE(review): no return statement is visible in this listing (original line 79 is
// absent) - confirm the returned value against the real file.
77{
78 yCWarning(WHISPERDEVICE) << "getLanguage not implemented";
80}
81
// Sends `sound` as an in-memory WAV file to the URL stored in m_url and extracts the
// "text" field of the JSON reply into `transcription`.
// Returns return_value_ok on success and return_value_error_generic when the reply is
// not valid JSON or has no "text" field.
// NOTE(review): several original lines are absent from this listing (cURL handle
// creation, the rest of the form/option setup, the perform call, handle cleanup), and
// `score` is not assigned in any visible line - confirm against the real file.
82ReturnValue WhisperDevice::transcribe(const yarp::sig::Sound& sound, std::string& transcription, double& score)
83{
85 if (!curl) {
86 std::cerr << "Failed to initialize cURL" << std::endl;
88 }
// Request headers: the API key plus a multipart content type.
89 struct curl_slist *headers = NULL;
90 headers = curl_slist_append(headers, ("api-key: " + m_apiKey).c_str());
91 headers = curl_slist_append(headers, "Content-Type: multipart/form-data");
92
// Build the WAV payload in memory: header first, then the samples.
93 int sampleRate = sound.getFrequency();
94 std::vector<uint8_t> wavHeader = _createWavHeader(sampleRate, sound.getSamples());
95 std::vector<uint8_t> audioData(wavHeader.begin(), wavHeader.end());
96
// Each sample is narrowed to 16 bits and appended low byte first. sound.get(i) is
// called without a channel argument, so only its default channel is read.
97 for (size_t i = 0; i < sound.getSamples(); ++i) {
98 int16_t sample = static_cast<int16_t>(sound.get(i));
99 audioData.push_back(sample & 0xFF);
100 audioData.push_back((sample >> 8) & 0xFF);
101 }
102
103 struct curl_httppost *post = NULL;
104 struct curl_httppost *last = NULL;
105 curl_formadd(&post, &last,
106 CURLFORM_COPYNAME, "file",
107 CURLFORM_BUFFER, "audio.wav",
110 CURLFORM_CONTENTTYPE, "audio/wav",
112
113 std::string response;
114 curl_easy_setopt(curl, CURLOPT_URL, m_url.c_str());
119
// NOTE(review): a failed request is only logged here; execution still continues to the
// JSON parsing below with whatever `response` holds.
121 if (res != CURLE_OK) {
122 std::cerr << "cURL request failed: " << curl_easy_strerror(res) << std::endl;
123 } else {
124 std::cout << "Transcription response: " << response << std::endl;
125 }
126
// NOTE(review): both error returns in this try/catch happen before the
// curl_formfree / curl_slist_free_all calls further down, so `post` and `headers`
// are not freed on those paths.
127 // Parse the JSON response
128 try {
129 auto jsonResponse = nlohmann::json::parse(response);
130 if (jsonResponse.contains("text")) {
131 transcription = jsonResponse["text"].get<std::string>();
132 } else {
133 std::cerr << "No 'text' field in the response" << std::endl;
134 return ReturnValue::return_code::return_value_error_generic;
135 }
136 } catch (const nlohmann::json::parse_error& e) {
137 std::cerr << "JSON parse error: " << e.what() << std::endl;
138 return ReturnValue::return_code::return_value_error_generic;
139 }
140
// Release the multipart form and the header list on the success path.
142 curl_formfree(post);
143 curl_slist_free_all(headers);
144
145 return ReturnValue::return_code::return_value_ok;
146}
147
148std::vector<uint8_t> WhisperDevice::_createWavHeader(int sampleRate, int numSamples)
149{
150 int byteRate = sampleRate * 2; // 16-bit mono
151 int blockAlign = 2;
152 int subChunk2Size = numSamples * blockAlign;
153 int chunkSize = 36 + subChunk2Size;
154
155 std::vector<uint8_t> header(44);
156 std::memcpy(header.data(), "RIFF", 4);
157 std::memcpy(header.data() + 4, &chunkSize, 4);
158 std::memcpy(header.data() + 8, "WAVE", 4);
159 std::memcpy(header.data() + 12, "fmt ", 4);
160 int subChunk1Size = 16;
161 short audioFormat = 1;
162 short numChannels = 1;
163 std::memcpy(header.data() + 16, &subChunk1Size, 4);
164 std::memcpy(header.data() + 20, &audioFormat, 2);
165 std::memcpy(header.data() + 22, &numChannels, 2);
166 std::memcpy(header.data() + 24, &sampleRate, 4);
167 std::memcpy(header.data() + 28, &byteRate, 4);
168 std::memcpy(header.data() + 32, &blockAlign, 2);
169 short bitsPerSample = 16;
170 std::memcpy(header.data() + 34, &bitsPerSample, 2);
171 std::memcpy(header.data() + 36, "data", 4);
172 std::memcpy(header.data() + 40, &subChunk2Size, 4);
173
174 return header;
175}
176
177size_t WhisperDevice::_writeCallback(void *contents, size_t size, size_t nmemb, std::string *output) {
178 size_t totalSize = size * nmemb;
179 output->append((char*)contents, totalSize);
180 return totalSize;
181}
#define YARP_METHOD_NOT_YET_IMPLEMENTED()
Definition ReturnValue.h:96
const yarp::os::LogComponent & WHISPERDEVICE()
bool parseParams(const yarp::os::Searchable &config) override
Parse the DeviceDriver parameters.
yarp::dev::ReturnValue getLanguage(std::string &language) override
Gets the current language set for speech transcription.
yarp::dev::ReturnValue transcribe(const yarp::sig::Sound &sound, std::string &transcription, double &score) override
Performs the speech transcription.
yarp::dev::ReturnValue setLanguage(const std::string &language="auto") override
Sets the language for speech transcription.
bool close() override
Close the DeviceDriver.
bool open(yarp::os::Searchable &config) override
Open the DeviceDriver.
@ return_value_error_generic
Method failed with a generic error.
A mini-server for performing network communication in the background.
@ TraceType
Definition Log.h:92
A base class for nested structures that can be searched.
Definition Searchable.h:31
Class for storing sounds See Audio in YARP for additional documentation on YARP audio.
Definition Sound.h:25
int getFrequency() const
Get the frequency of the sound (i.e.
Definition Sound.cpp:356
audio_sample get(size_t sample, size_t channel=0) const
Definition Sound.cpp:294
size_t getSamples() const
Get the number of samples contained in the sound.
Definition Sound.cpp:598
#define yCInfo(component,...)
#define yCError(component,...)
#define yCWarning(component,...)
#define YARP_LOG_COMPONENT(name,...)
For streams capable of holding different kinds of content, check what they actually have.
An interface to the operating system, including Port based communication.