git-master/WhisperDevice_8cpp_source.html

/*

 * SPDX-FileCopyrightText: 2023 Istituto Italiano di Tecnologia (IIT)

 * SPDX-License-Identifier: BSD-3-Clause

 */


#ifndef _USE_MATH_DEFINES

#define _USE_MATH_DEFINES

#endif


#include "WhisperDevice.h"

#include <yarp/os/LogComponent.h>
#include <yarp/os/LogStream.h>

#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <memory>
#include <string>
#include <vector>


using namespace yarp::os;

using namespace yarp::dev;


YARP_LOG_COMPONENT(WHISPERDEVICE, "yarp.whisperDevice", yarp::os::Log::TraceType);


/// Default constructor. It performs no work of its own: the API key and the
/// request URL are only filled in by open().
WhisperDevice::WhisperDevice() = default;


/**
 * Open the device.
 *
 * Parses the device parameters, then reads four environment variables (whose
 * names are held in the m_ENVS_* parameters): the API key, the service
 * endpoint, the deployment id and the API version. The API key is stored in
 * m_apiKey and the other three are combined into the transcription URL m_url.
 *
 * @param config the device configuration handed to parseParams().
 * @return true on success; false if the parameters cannot be parsed or if any
 *         of the four environment variables is not set (the first missing one
 *         is reported through the log).
 */
bool WhisperDevice::open(yarp::os::Searchable &config)
{
    if (!parseParams(config))  { return false; }

    // Each variable is looked up exactly once: the pointer that is checked
    // against nullptr is the very same one that is copied into the string.
    const auto readEnv = [](const std::string& name, std::string& value) -> bool
    {
        const char* raw = std::getenv(name.c_str());
        if (raw == nullptr)
        {
            yCError(WHISPERDEVICE) << "Environment variable" << name << "not set";
            return false;
        }
        value = raw;
        return true;
    };

    std::string apiKey;
    std::string endpoint;
    std::string deploymentId;
    std::string apiVersion;

    // Short-circuit evaluation: stop at (and report only) the first missing variable.
    if (!readEnv(m_ENVS_api_key_name, apiKey) ||
        !readEnv(m_ENVS_end_point_name, endpoint) ||
        !readEnv(m_ENVS_deployment_id_name, deploymentId) ||
        !readEnv(m_ENVS_api_version_name, apiVersion))
    {
        return false;
    }

    // Members are only modified once every lookup has succeeded.
    m_apiKey = apiKey;
    m_url = endpoint + "/openai/deployments/" + deploymentId + "/audio/transcriptions?api-version=" + apiVersion;

    yCInfo(WHISPERDEVICE) << "Open";
    return true;
}


/**
 * Close the device.
 *
 * Only emits a log line; nothing else is done here.
 *
 * @return always true.
 */
bool WhisperDevice::close()
{
    constexpr bool closedOk = true;
    yCInfo(WHISPERDEVICE) << "Close";
    return closedOk;
}


/**
 * Set the transcription language (not implemented by this device).
 *
 * @param language requested language; it is ignored.
 * @return the value produced by YARP_METHOD_NOT_YET_IMPLEMENTED().
 */
ReturnValue WhisperDevice::setLanguage(const std::string& language)
{
    static_cast<void>(language); // intentionally unused
    yCWarning(WHISPERDEVICE) << "setLanguage not implemented";
    return YARP_METHOD_NOT_YET_IMPLEMENTED();
}


/**
 * Get the transcription language (not implemented by this device).
 *
 * @param language output argument; it is left untouched.
 * @return the value produced by YARP_METHOD_NOT_YET_IMPLEMENTED().
 */
ReturnValue WhisperDevice::getLanguage(std::string& language)
{
    static_cast<void>(language); // intentionally unused
    yCWarning(WHISPERDEVICE) << "getLanguage not implemented";
    return YARP_METHOD_NOT_YET_IMPLEMENTED();
}


/**
 * Transcribe a sound through the remote transcription endpoint.
 *
 * The samples of channel 0 of @p sound are packed as 16-bit little-endian
 * values behind the header produced by _createWavHeader(), and the resulting
 * in-memory WAV file is sent as a multipart form POST ("file" field) to m_url,
 * authenticated with the "api-key" header. The "text" field of the JSON reply
 * becomes the transcription.
 *
 * All libcurl resources are held by RAII owners, so they are released on every
 * exit path, including the error returns.
 *
 * @param sound the audio to transcribe (only channel 0 is read).
 * @param transcription receives the recognised text on success.
 * @param score not modified by this implementation.
 * @return return_value_ok on success; return_value_error_generic if libcurl
 *         cannot be set up, the transfer fails, the reply is not valid JSON,
 *         or it carries no string "text" field.
 */
ReturnValue WhisperDevice::transcribe(const yarp::sig::Sound& sound, std::string& transcription, double& score)
{
    using CurlHandle = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;
    using HeaderList = std::unique_ptr<curl_slist, decltype(&curl_slist_free_all)>;
    using FormPost   = std::unique_ptr<curl_httppost, decltype(&curl_formfree)>;

    static_cast<void>(score); // no score is computed here

    // Everything the easy handle refers to is declared BEFORE the handle:
    // locals are destroyed in reverse order, so the handle is cleaned up first
    // and the form, the headers and the buffers are released afterwards.
    std::vector<uint8_t> audioData;
    std::string response;
    HeaderList headers(nullptr, &curl_slist_free_all);
    FormPost form(nullptr, &curl_formfree);

    CurlHandle curl(curl_easy_init(), &curl_easy_cleanup);
    if (!curl) {
        yCError(WHISPERDEVICE) << "Failed to initialize cURL";
        return ReturnValue::return_code::return_value_error_generic;
    }

    // curl_slist_append() returns nullptr on failure and leaves the existing
    // list untouched, so ownership is only replaced when the append succeeded.
    const auto addHeader = [&headers](const std::string& line) -> bool {
        curl_slist* grown = curl_slist_append(headers.get(), line.c_str());
        if (grown == nullptr) {
            return false;
        }
        headers.release();
        headers.reset(grown);
        return true;
    };
    if (!addHeader("api-key: " + m_apiKey) || !addHeader("Content-Type: multipart/form-data")) {
        yCError(WHISPERDEVICE) << "Failed to build the HTTP header list";
        return ReturnValue::return_code::return_value_error_generic;
    }

    // Build the in-memory WAV file: header followed by 16-bit little-endian samples.
    const size_t numSamples = sound.getSamples();
    const std::vector<uint8_t> wavHeader = _createWavHeader(sound.getFrequency(), static_cast<int>(numSamples));
    audioData.reserve(wavHeader.size() + 2 * numSamples);
    audioData.insert(audioData.end(), wavHeader.begin(), wavHeader.end());
    for (size_t i = 0; i < numSamples; ++i) {
        const auto sample = static_cast<int16_t>(sound.get(i));
        const auto bits = static_cast<uint16_t>(sample);
        audioData.push_back(static_cast<uint8_t>(bits & 0xFFu));
        audioData.push_back(static_cast<uint8_t>((bits >> 8) & 0xFFu));
    }

    curl_httppost* formHead = nullptr;
    curl_httppost* formTail = nullptr;
    // CURLFORM_BUFFERLENGTH is read from the variadic list as a long, so the
    // size must be passed as a long and not as a size_t.
    const CURLFORMcode formStatus = curl_formadd(&formHead, &formTail,
                 CURLFORM_COPYNAME, "file",
                 CURLFORM_BUFFER, "audio.wav",
                 CURLFORM_BUFFERPTR, audioData.data(),
                 CURLFORM_BUFFERLENGTH, static_cast<long>(audioData.size()),
                 CURLFORM_CONTENTTYPE, "audio/wav",
                 CURLFORM_END);
    form.reset(formHead); // take ownership even on failure, so partial forms are freed
    if (formStatus != CURL_FORMADD_OK) {
        yCError(WHISPERDEVICE) << "Failed to build the multipart form";
        return ReturnValue::return_code::return_value_error_generic;
    }

    curl_easy_setopt(curl.get(), CURLOPT_URL, m_url.c_str());
    curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, headers.get());
    curl_easy_setopt(curl.get(), CURLOPT_HTTPPOST, form.get());
    curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, _writeCallback);
    curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &response);

    const CURLcode res = curl_easy_perform(curl.get());
    if (res != CURLE_OK) {
        // There is no reply to parse: report the transfer error and stop here.
        yCError(WHISPERDEVICE) << "cURL request failed:" << curl_easy_strerror(res);
        return ReturnValue::return_code::return_value_error_generic;
    }
    yCDebug(WHISPERDEVICE) << "Transcription response:" << response;

    // Parse the JSON response
    try {
        const auto jsonResponse = nlohmann::json::parse(response);
        if (!jsonResponse.contains("text")) {
            yCError(WHISPERDEVICE) << "No 'text' field in the response";
            return ReturnValue::return_code::return_value_error_generic;
        }
        transcription = jsonResponse.at("text").get<std::string>();
    } catch (const nlohmann::json::exception& e) {
        // Base class of parse_error and type_error: also covers a "text"
        // field that is present but is not a string.
        yCError(WHISPERDEVICE) << "JSON parse error:" << e.what();
        return ReturnValue::return_code::return_value_error_generic;
    }

    return ReturnValue::return_code::return_value_ok;
}


/**
 * Build the 44-byte RIFF/WAVE header for 16-bit mono PCM audio.
 *
 * Every multi-byte field is emitted explicitly in little-endian byte order, so
 * the header does not depend on the byte order of the host. Sizes are computed
 * with unsigned 32-bit arithmetic to avoid signed overflow.
 *
 * @param sampleRate sampling rate written in the header.
 * @param numSamples number of 16-bit samples that will follow the header.
 * @return the 44 header bytes.
 */
std::vector<uint8_t> WhisperDevice::_createWavHeader(int sampleRate, int numSamples)
{
    constexpr uint16_t kAudioFormat   = 1;   // format tag 1: uncompressed PCM
    constexpr uint16_t kNumChannels   = 1;   // mono
    constexpr uint16_t kBitsPerSample = 16;
    constexpr uint16_t kBlockAlign    = 2;   // bytes per sample frame (16-bit mono)
    constexpr uint32_t kFmtChunkSize  = 16;
    constexpr size_t   kHeaderSize    = 44;

    const uint32_t rate     = static_cast<uint32_t>(sampleRate);
    const uint32_t byteRate = rate * kBlockAlign;
    const uint32_t dataSize = static_cast<uint32_t>(numSamples) * kBlockAlign;
    const uint32_t riffSize = 36u + dataSize; // header bytes after the RIFF size field + data

    std::vector<uint8_t> header;
    header.reserve(kHeaderSize);

    // Append a four-character chunk tag.
    const auto putTag = [&header](const char* tag) {
        header.insert(header.end(), tag, tag + 4);
    };
    // Append a 16-bit value, least significant byte first.
    const auto putU16 = [&header](uint16_t value) {
        header.push_back(static_cast<uint8_t>(value & 0xFFu));
        header.push_back(static_cast<uint8_t>((value >> 8) & 0xFFu));
    };
    // Append a 32-bit value, least significant byte first.
    const auto putU32 = [&header](uint32_t value) {
        for (int shift = 0; shift < 32; shift += 8) {
            header.push_back(static_cast<uint8_t>((value >> shift) & 0xFFu));
        }
    };

    putTag("RIFF");
    putU32(riffSize);
    putTag("WAVE");

    putTag("fmt ");
    putU32(kFmtChunkSize);
    putU16(kAudioFormat);
    putU16(kNumChannels);
    putU32(rate);
    putU32(byteRate);
    putU16(kBlockAlign);
    putU16(kBitsPerSample);

    putTag("data");
    putU32(dataSize);

    return header;
}


/**
 * Write callback handed to libcurl: appends the received chunk to a string.
 *
 * @param contents pointer to the received bytes.
 * @param size size of one element.
 * @param nmemb number of elements.
 * @param output string the chunk is appended to.
 * @return the number of bytes appended (size * nmemb).
 */
size_t WhisperDevice::_writeCallback(void *contents, size_t size, size_t nmemb, std::string *output) {
    const size_t chunkBytes = size * nmemb;
    const char* chunk = static_cast<const char*>(contents);
    output->append(chunk, chunkBytes);
    return chunkBytes;
}

LogStream.h

YARP_METHOD_NOT_YET_IMPLEMENTED
#define YARP_METHOD_NOT_YET_IMPLEMENTED()
Definition ReturnValue.h:96

WHISPERDEVICE
const yarp::os::LogComponent & WHISPERDEVICE()
Definition WhisperDevice.cpp:22

WhisperDevice.h

WhisperDevice_ParamsParser::m_ENVS_end_point_name
std::string m_ENVS_end_point_name
Definition WhisperDevice_ParamsParser.h:66

WhisperDevice_ParamsParser::m_ENVS_api_key_name
std::string m_ENVS_api_key_name
Definition WhisperDevice_ParamsParser.h:68

WhisperDevice_ParamsParser::parseParams
bool parseParams(const yarp::os::Searchable &config) override
Parse the DeviceDriver parameters.
Definition WhisperDevice_ParamsParser.cpp:39

WhisperDevice_ParamsParser::m_ENVS_api_version_name
std::string m_ENVS_api_version_name
Definition WhisperDevice_ParamsParser.h:69

WhisperDevice_ParamsParser::m_ENVS_deployment_id_name
std::string m_ENVS_deployment_id_name
Definition WhisperDevice_ParamsParser.h:67

WhisperDevice::WhisperDevice
WhisperDevice()
Definition WhisperDevice.cpp:25

WhisperDevice::getLanguage
yarp::dev::ReturnValue getLanguage(std::string &language) override
Gets the current language set for speech transcription.
Definition WhisperDevice.cpp:76

WhisperDevice::transcribe
yarp::dev::ReturnValue transcribe(const yarp::sig::Sound &sound, std::string &transcription, double &score) override
Performs the speech transcription.
Definition WhisperDevice.cpp:82

WhisperDevice::setLanguage
yarp::dev::ReturnValue setLanguage(const std::string &language="auto") override
Sets the language for speech transcription.
Definition WhisperDevice.cpp:70

WhisperDevice::close
bool close() override
Close the DeviceDriver.
Definition WhisperDevice.cpp:64

WhisperDevice::open
bool open(yarp::os::Searchable &config) override
Open the DeviceDriver.
Definition WhisperDevice.cpp:30

yarp::dev::ReturnValue
Definition ReturnValue.h:33

yarp::dev::ReturnValue::return_code::return_value_error_generic
@ return_value_error_generic
Method failed with a generic error.

yarp::os::BufferedPort
A mini-server for performing network communication in the background.
Definition BufferedPort.h:60

yarp::os::Log::TraceType
@ TraceType
Definition Log.h:92

yarp::os::Searchable
A base class for nested structures that can be searched.
Definition Searchable.h:31

yarp::sig::Sound
Class for storing sounds See Audio in YARP for additional documentation on YARP audio.
Definition Sound.h:25

yarp::sig::Sound::getFrequency
int getFrequency() const
Get the frequency of the sound (i.e. its sampling rate).
Definition Sound.cpp:356

yarp::sig::Sound::get
audio_sample get(size_t sample, size_t channel=0) const
Definition Sound.cpp:294

yarp::sig::Sound::getSamples
size_t getSamples() const
Get the number of samples contained in the sound.
Definition Sound.cpp:598

LogComponent.h

yCInfo
#define yCInfo(component,...)
Definition LogComponent.h:171

yCError
#define yCError(component,...)
Definition LogComponent.h:213

yCWarning
#define yCWarning(component,...)
Definition LogComponent.h:192

YARP_LOG_COMPONENT
#define YARP_LOG_COMPONENT(name,...)
Definition LogComponent.h:76

yarp::dev
For streams capable of holding different kinds of content, check what they actually have.
Definition BatteryData.cpp:13

yarp::os
An interface to the operating system, including Port based communication.
Definition AbstractCarrier.h:13