// RetroShare/plugins/VOIP/gui/SpeexProcessor.cpp
//
// Source-viewer residue kept for reference: 527 lines, 19 KiB, C++
// (Raw / Normal View / History)

#include "SpeexProcessor.h"
#include <speex/speex.h>
#include <speex/speex_preprocess.h>
#include <sstream>
#include <iostream>
#include <fstream>
#include <fcntl.h>
#include <math.h>
#include <cstdlib>
#include <QDateTime>
#include <limits.h>
#include "interface/rsvoip.h"
//#include "gui/settings/rsharesettings.h"
// NOTE: despite its name, iroundf() does not round to nearest: the static_cast
// truncates toward zero. The caller in this file passes an already-floored value.
#define iroundf(x) ( static_cast<int>(x) )
// Makes names from the QtSpeex namespace usable unqualified in this file.
using namespace QtSpeex;
/**
 * Creates the capture-side processor and its wide-band Speex encoder.
 *
 * The encoder is configured with VAD and DTX disabled, VBR quality 9, a VBR
 * bitrate cap of iMaxBitRate, complexity 10 and quality 9. The preprocessor
 * and the echo canceller are NOT created here: they start out null and are
 * built later (see writeData() and addEchoFrame()).
 *
 * @param parent Qt parent object, forwarded to QIODevice.
 */
SpeexInputProcessor::SpeexInputProcessor(QObject *parent) : QIODevice(parent),
    iMaxBitRate(16800),
    lastEchoFrame(NULL),
    enc_state(0),
    enc_bits(),
    send_timestamp(0),
    bResetProcessor(true),
    preprocessor(0),
    echo_state(0),
    inputBuffer()
{
    // Bit-packing buffer reused for every encoded frame.
    enc_bits = new SpeexBits;
    speex_bits_init(enc_bits);
    speex_bits_reset(enc_bits);

    enc_state = speex_encoder_init(&speex_wb_mode);

    // Codec-level VAD/DTX stay off.
    int iArg = 0;
    speex_encoder_ctl(enc_state, SPEEX_SET_VAD, &iArg);
    speex_encoder_ctl(enc_state, SPEEX_SET_DTX, &iArg);

    float fArg = 9.0;
    speex_encoder_ctl(enc_state, SPEEX_SET_VBR_QUALITY, &fArg);

    //TODO : get the maxbitrate from a rs service or a dynamic code
    iArg = iMaxBitRate;
    speex_encoder_ctl(enc_state, SPEEX_SET_VBR_MAX_BITRATE, &iArg);

    iArg = 10;
    speex_encoder_ctl(enc_state, SPEEX_SET_COMPLEXITY, &iArg);

    iArg = 9;
    speex_encoder_ctl(enc_state, SPEEX_SET_QUALITY, &iArg);

    // bResetProcessor, preprocessor and echo_state are already set by the
    // initializer list above, so they are not assigned again here.
    iSilentFrames = 0;
    iHoldFrames = 0;

    // Scratch buffer receiving the echo-cancelled frame; released in the destructor.
    psClean = new short[SAMPLING_RATE];

    bPreviousVoice = false;
    iRealTimeBitrate = 0;
    dPeakSignal = dPeakSpeaker = dPeakMic = dPeakCleanMic = dVoiceAcivityLevel = 0.0;

    // These two are otherwise only written by writeData(); give them a defined
    // value so they are never read indeterminate before the first frame.
    dMaxMic = 0;
    fSpeechProb = 0.0f;
}
/** Tears down the Speex state objects and buffers allocated by this processor. */
SpeexInputProcessor::~SpeexInputProcessor()
{
    // Neither of these is created by the constructor, so either may still be null.
    if (preprocessor != NULL)
        speex_preprocess_state_destroy(preprocessor);
    if (echo_state != NULL)
        speex_echo_state_destroy(echo_state);

    speex_encoder_destroy(enc_state);

    // enc_bits was allocated with new in the constructor: release the Speex
    // internals first, then the struct itself.
    speex_bits_destroy(enc_bits);
    delete enc_bits;

    delete[] psClean;
}
/**
 * Removes and returns the oldest encoded packet waiting to be sent.
 *
 * @return The first queued packet, or an empty QByteArray when nothing is
 *         queued (calling takeFirst() on an empty list is not allowed).
 */
QByteArray SpeexInputProcessor::getNetworkPacket() {
    if (outputNetworkBuffer.empty())
        return QByteArray();
    return outputNetworkBuffer.takeFirst();
}
/** @return true while at least one encoded packet is queued for sending. */
bool SpeexInputProcessor::hasPendingPackets()
{
    const bool queueIsEmpty = outputNetworkBuffer.empty();
    return !queueIsEmpty;
}
/**
 * QIODevice write hook: accepts raw 16-bit microphone samples, and for every
 * complete frame of FRAME_SIZE samples runs level metering, Speex
 * preprocessing (optionally with echo cancellation), the voice-activity
 * decision and, when the frame counts as speech, Speex encoding.
 *
 * Each encoded frame is queued in outputNetworkBuffer as
 * [4-byte send_timestamp | Speex payload] and networkPacketReady() is emitted.
 * Samples that do not yet fill a whole frame stay in inputBuffer.
 *
 * @param data    Pointer to the incoming PCM bytes.
 * @param maxSize Number of bytes available at @p data.
 * @return Always @p maxSize (all input is buffered).
 */
qint64 SpeexInputProcessor::writeData(const char *data, qint64 maxSize) {
    int iArg;
    int i;
    float sum;
    short max;

    inputBuffer += QByteArray(data, maxSize);

    // ">=" (not ">"): a buffer holding exactly one frame must be processed now
    // instead of waiting for the next write.
    while ((size_t)inputBuffer.size() >= FRAME_SIZE * sizeof(qint16)) {
        QByteArray source_frame = inputBuffer.left(FRAME_SIZE * sizeof(qint16));
        short* psMic = (short *)source_frame.data();

        // Raw microphone level: RMS in dB relative to full scale, floored at -96.
        // sum starts at 1 so the logarithm never receives 0.
        sum = 1.0f;
        for (i = 0; i < FRAME_SIZE; i++) {
            sum += static_cast<float>(psMic[i] * psMic[i]);
        }
        dPeakMic = qMax(20.0f*log10f(sqrtf(sum / static_cast<float>(FRAME_SIZE)) / 32768.0f), -96.0f);

        // Largest absolute sample of the frame. The magnitude is computed in int
        // and clamped because abs(-32768) does not fit in a short.
        max = 1;
        for (i = 0; i < FRAME_SIZE; i++) {
            const int amplitude = qMin(std::abs(static_cast<int>(psMic[i])), static_cast<int>(SHRT_MAX));
            if (amplitude > max)
                max = static_cast<short>(amplitude);
        }
        dMaxMic = max;

        dPeakSpeaker = 0.0;

        // Held until the end of this loop iteration; addEchoFrame() takes the same mutex.
        QMutexLocker l(&qmSpeex);

        if (bResetProcessor) {
            // (Re)build the preprocessor, e.g. after the echo canceller was created.
            if (preprocessor)
                speex_preprocess_state_destroy(preprocessor);
            preprocessor = speex_preprocess_state_init(FRAME_SIZE, SAMPLING_RATE);

            iArg = 1;
            speex_preprocess_ctl(preprocessor, SPEEX_PREPROCESS_SET_VAD, &iArg);
            speex_preprocess_ctl(preprocessor, SPEEX_PREPROCESS_SET_AGC, &iArg);
            speex_preprocess_ctl(preprocessor, SPEEX_PREPROCESS_SET_DENOISE, &iArg);
            speex_preprocess_ctl(preprocessor, SPEEX_PREPROCESS_SET_DEREVERB, &iArg);

            iArg = 30000;
            speex_preprocess_ctl(preprocessor, SPEEX_PREPROCESS_SET_AGC_TARGET, &iArg);

            iArg = -60;
            speex_preprocess_ctl(preprocessor, SPEEX_PREPROCESS_SET_AGC_DECREMENT, &iArg);

            iArg = rsVoip->getVoipiNoiseSuppress();
            speex_preprocess_ctl(preprocessor, SPEEX_PREPROCESS_SET_NOISE_SUPPRESS, &iArg);

            if (echo_state) {
                iArg = SAMPLING_RATE;
                speex_echo_ctl(echo_state, SPEEX_ECHO_SET_SAMPLING_RATE, &iArg);
                speex_preprocess_ctl(preprocessor, SPEEX_PREPROCESS_SET_ECHO_STATE, echo_state);
            }

            bResetProcessor = false;
        }

        // Cap the AGC gain from the configured minimum loudness.
        // NOTE(review): a minimum loudness of 0 would divide by zero here - confirm the setting's range.
        float v = 30000.0f / static_cast<float>(rsVoip->getVoipiMinLoudness());
        iArg = iroundf(floorf(20.0f * log10f(v)));
        speex_preprocess_ctl(preprocessor, SPEEX_PREPROCESS_SET_AGC_MAX_GAIN, &iArg);

        // Offset the noise-suppression setting by the gain the AGC currently applies.
        speex_preprocess_ctl(preprocessor, SPEEX_PREPROCESS_GET_AGC_GAIN, &iArg);
        float gainValue = static_cast<float>(iArg);
        iArg = rsVoip->getVoipiNoiseSuppress() - iArg;
        speex_preprocess_ctl(preprocessor, SPEEX_PREPROCESS_SET_NOISE_SUPPRESS, &iArg);

        // With echo cancellation active, the cleaned frame (psClean) replaces the raw one.
        short * psSource = psMic;
        if (echo_state && rsVoip->getVoipEchoCancel()) {
            speex_echo_playback(echo_state, (short*)lastEchoFrame->data());
            speex_echo_capture(echo_state,psMic,psClean);
            psSource = psClean;
        }

        speex_preprocess_run(preprocessor, psSource);

        // Level of the processed signal, same dB scale as above.
        sum = 1.0f;
        for (i = 0; i < FRAME_SIZE; i++)
            sum += static_cast<float>(psSource[i] * psSource[i]);
        float micLevel = sqrtf(sum / static_cast<float>(FRAME_SIZE));
        dPeakSignal = qMax(20.0f*log10f(micLevel / 32768.0f), -96.0f);

        spx_int32_t prob = 0;
        speex_preprocess_ctl(preprocessor, SPEEX_PREPROCESS_GET_PROB, &prob);//speech probability
        fSpeechProb = static_cast<float>(prob) / 100.0f;

        // clean microphone level: peak of filtered signal attenuated by AGC gain
        dPeakCleanMic = qMax(dPeakSignal - gainValue, -96.0f);
        // Weighted mix of speech probability (40%) and normalised amplitude (60%).
        dVoiceAcivityLevel = 0.4f * fSpeechProb + 0.6f * (1.0f + dPeakCleanMic / 96.0f);

        // Hysteresis: the upper threshold starts speech, the lower one keeps it going.
        bool bIsSpeech = false;
        if (dVoiceAcivityLevel > (static_cast<float>(rsVoip->getVoipfVADmax()) / 32767))
            bIsSpeech = true;
        else if (dVoiceAcivityLevel > (static_cast<float>(rsVoip->getVoipfVADmin()) / 32767) && bPreviousVoice)
            bIsSpeech = true;

        // Keep transmitting for a configurable number of frames after speech stops.
        if (! bIsSpeech) {
            iHoldFrames++;
            if (iHoldFrames < rsVoip->getVoipVoiceHold())
                bIsSpeech = true;
        } else {
            iHoldFrames = 0;
        }

        // The transmit mode overrides the detector. Push-to-talk has no key
        // handling here, so in that mode no frame is ever treated as speech.
        if (rsVoip->getVoipATransmit() == RsVoip::AudioTransmitContinous) {
            bIsSpeech = true;
        }
        else if (rsVoip->getVoipATransmit() == RsVoip::AudioTransmitPushToTalk)
            bIsSpeech = false;

        if (bIsSpeech) {
            iSilentFrames = 0;
        } else {
            iSilentFrames++;
        }

        // Freeze AGC gain growth during sustained silence, allow it otherwise.
        if (! bIsSpeech && ! bPreviousVoice) {
            iRealTimeBitrate = 0;
            spx_int32_t increment = 0;
            speex_preprocess_ctl(preprocessor, SPEEX_PREPROCESS_SET_AGC_INCREMENT, &increment);
        } else {
            spx_int32_t increment = 12;
            speex_preprocess_ctl(preprocessor, SPEEX_PREPROCESS_SET_AGC_INCREMENT, &increment);
        }

        // VBR is currently enabled for every transmit mode.
        // Caveat: encryption of VBR-encoded speech may not ensure complete privacy,
        // as phrases can still be identified, at least in a controlled setting with
        // a small dictionary of phrases, by analysing the pattern of variation of
        // the bit rate.
        int vbr_on = 1;
        speex_encoder_ctl(enc_state,SPEEX_SET_VBR, &vbr_on);

        // Re-apply the bitrate limits only when the encoder reports something else.
        int br = 0;
        speex_encoder_ctl(enc_state, SPEEX_GET_VBR_MAX_BITRATE, &br);
        if (br != iMaxBitRate) {
            br = iMaxBitRate;
            speex_encoder_ctl(enc_state, SPEEX_SET_VBR_MAX_BITRATE, &br);
        }
        speex_encoder_ctl(enc_state, SPEEX_GET_BITRATE, &br);
        if (br != iMaxBitRate) {
            br = iMaxBitRate;
            speex_encoder_ctl(enc_state, SPEEX_SET_BITRATE, &br);
        }

        // Start from a clean encoder state when the previous frame was not sent.
        if (! bPreviousVoice)
            speex_encoder_ctl(enc_state, SPEEX_RESET_STATE, NULL);

        if (bIsSpeech) {
            speex_bits_reset(enc_bits);
            speex_encode_int(enc_state, psSource, enc_bits);

            QByteArray networkFrame;
            networkFrame.resize(speex_bits_nbytes(enc_bits)+4);//add 4 for the frame timestamp for the jitter buffer
            int packetSize = speex_bits_write(enc_bits, networkFrame.data()+4, networkFrame.size()-4);
            ((int*)networkFrame.data())[0] = send_timestamp;

            outputNetworkBuffer.append(networkFrame);
            emit networkPacketReady();

            iRealTimeBitrate = packetSize * SAMPLING_RATE / FRAME_SIZE * 8;
        } else {
            iRealTimeBitrate = 0;
        }
        bPreviousVoice = bIsSpeech;

        // Advance the timestamp by one frame. The wrap test is done BEFORE the
        // addition so the counter can never overflow; it restarts at 0 in exactly
        // the cases where the sum would have reached INT_MAX or more.
        if (send_timestamp >= INT_MAX - FRAME_SIZE)
            send_timestamp = 0;
        else
            send_timestamp += FRAME_SIZE;

        // Drop the frame that was just consumed.
        inputBuffer = inputBuffer.right(inputBuffer.size() - FRAME_SIZE * sizeof(qint16));
    }

    return maxSize;
}
/**
 * Creates the playback-side processor with an empty output buffer.
 *
 * @param parent Qt parent object, forwarded to QIODevice.
 */
SpeexOutputProcessor::SpeexOutputProcessor(QObject *parent)
    : QIODevice(parent)
    , outputBuffer()
{
    // Nothing else to set up at construction time.
}
/** Destroys every jitter state stored in userJitterHash and frees its storage. */
SpeexOutputProcessor::~SpeexOutputProcessor()
{
    QHashIterator<QString, SpeexJitter*> peers(userJitterHash);
    while (peers.hasNext()) {
        SpeexJitter* peerJitter = peers.next().value();
        speex_jitter_destroy(*peerJitter);
        // The struct itself was obtained with malloc() in putNetworkPacket().
        free (peerJitter);
    }
}
/**
 * Hands a received packet to the jitter buffer of the sending peer.
 *
 * Packet layout: [ 4-byte timestamp | Speex-encoded payload ].
 * Packets of 4 bytes or fewer are ignored. A jitter state (with its own
 * wide-band decoder, enhancement on) is created the first time a name is seen.
 *
 * @param name   Key identifying the sender in userJitterHash.
 * @param packet Raw packet as received from the network.
 */
void SpeexOutputProcessor::putNetworkPacket(QString name, QByteArray packet) {
    if (packet.size() <= 4)
        return;

    SpeexJitter* peerJitter = NULL;
    if (!userJitterHash.contains(name)) {
        peerJitter = (SpeexJitter*)malloc(sizeof(SpeexJitter));
        speex_jitter_init(peerJitter, speex_decoder_init(&speex_wb_mode), SAMPLING_RATE);
        int enable = 1;
        speex_decoder_ctl(peerJitter->dec, SPEEX_SET_ENH, &enable);
        userJitterHash.insert(name, peerJitter);
    } else {
        peerJitter = userJitterHash.value(name);
    }

    const int recv_timestamp = ((int*)packet.data())[0];
    peerJitter->mostUpdatedTSatPut = recv_timestamp;

    // Until readData() has pulled from this peer's buffer at least once, the
    // payload is dropped; only the latest timestamp is remembered.
    if (!peerJitter->firsttimecalling_get)
        speex_jitter_put(*peerJitter, (char *)packet.data()+4, packet.size()-4, recv_timestamp);
}
/** Reports this device as sequential. */
bool SpeexInputProcessor::isSequential() const
{
    const bool sequential = true;
    return sequential;
}
/**
 * Records the most recently played frame as the echo-cancellation reference.
 *
 * Does nothing when echo cancellation is disabled or @p echo_frame is null.
 * The echo canceller is created on the first accepted frame, which also
 * flags the preprocessor for a rebuild.
 *
 * @param echo_frame Frame that was just played; only the pointer is stored.
 */
void SpeexInputProcessor::addEchoFrame(QByteArray* echo_frame) {
    if (!rsVoip->getVoipEchoCancel() || !echo_frame)
        return;

    QMutexLocker l(&qmSpeex);

    if (echo_state == NULL) {
        echo_state = speex_echo_state_init(FRAME_SIZE, ECHOTAILSIZE*FRAME_SIZE);
        int rate = SAMPLING_RATE;
        speex_echo_ctl(echo_state, SPEEX_ECHO_SET_SAMPLING_RATE, &rate);
        bResetProcessor = true;
    }

    lastEchoFrame = echo_frame;
}
/**
 * QIODevice read hook: produces mixed playback audio.
 *
 * While fewer than @p maxSize bytes are buffered, one frame is decoded from
 * every peer's jitter buffer, the frames are mixed (each peer added at 0.8
 * gain, hard-clipped), appended to outputBuffer and announced through
 * playingFrame(). Up to @p maxSize bytes are then copied to @p data.
 *
 * @param data    Destination buffer.
 * @param maxSize Maximum number of bytes to copy.
 * @return Number of bytes written to @p data.
 */
qint64 SpeexOutputProcessor::readData(char *data, qint64 maxSize) {
    int ts = 0; //time stamp for the jitter call

    while (outputBuffer.size() < maxSize) {
        // NOTE(review): this heap frame is passed to playingFrame() and is not
        // deleted in this function - confirm that a receiver takes ownership.
        QByteArray* result_frame = new QByteArray();
        result_frame->resize(FRAME_SIZE * sizeof(qint16));
        result_frame->fill(0,FRAME_SIZE * sizeof(qint16));

        QHashIterator<QString, SpeexJitter*> i(userJitterHash);
        while (i.hasNext()) {
            i.next();
            SpeexJitter* jitter = i.value();

            QByteArray intermediate_frame;
            intermediate_frame.resize(FRAME_SIZE * sizeof(qint16));

            // From the first pull onwards putNetworkPacket() starts queueing
            // this peer's packets.
            if (jitter->firsttimecalling_get)
            {
                jitter->firsttimecalling_get = false;
            }

            speex_jitter_get(*jitter, (spx_int16_t*)intermediate_frame.data(), &ts);

            spx_int16_t* mixBuffer = (spx_int16_t*)result_frame->data();
            const short* peerSamples = (const short*)intermediate_frame.data();
            for (int j = 0; j < FRAME_SIZE; j++) {
                float samplef1 = mixBuffer[j] / 32768.0f;
                float samplef2 = peerSamples[j] / 32768.0f;
                float mixed = samplef1 + 0.8f * samplef2;

                // hard clipping
                if (mixed > 1.0f) mixed = 1.0f;
                if (mixed < -1.0f) mixed = -1.0f;

                // +1.0 scales to 32768, which does not fit in a 16-bit sample;
                // cap at 32767 so a clipped peak cannot wrap to the negative rail.
                float scaled = mixed * 32768.0f;
                if (scaled > 32767.0f) scaled = 32767.0f;
                mixBuffer[j] = (short)scaled;
            }
        }

        outputBuffer += *result_frame;
        emit playingFrame(result_frame);
    }

    QByteArray resultBuffer = outputBuffer.left(maxSize);
    memcpy(data, resultBuffer.data(), resultBuffer.size());
    outputBuffer = outputBuffer.right(outputBuffer.size() - resultBuffer.size());
    return resultBuffer.size();
}
/** Reports this device as sequential. */
bool SpeexOutputProcessor::isSequential() const
{
    const bool sequential = true;
    return sequential;
}
/**
 * Fills in a freshly allocated SpeexJitter.
 *
 * @param jit     State to initialise.
 * @param decoder Speex decoder to attach; its frame size is queried and used
 *                to create the jitter buffer.
 * The third (sampling rate) parameter is unused.
 */
void SpeexOutputProcessor::speex_jitter_init(SpeexJitter *jit, void *decoder, int /*sampling_rate*/)
{
    // Plain bookkeeping fields.
    jit->valid_bits = 0;
    jit->firsttimecalling_get = true;
    jit->mostUpdatedTSatPut = 0;

    // Decoder and the jitter buffer sized from its frame length.
    jit->dec = decoder;
    speex_decoder_ctl(decoder, SPEEX_GET_FRAME_SIZE, &jit->frame_size);
    jit->packets = jitter_buffer_init(jit->frame_size);

    // Bit buffer holding the packet currently being decoded.
    jit->current_packet = new SpeexBits;
    speex_bits_init(jit->current_packet);
}
/**
 * Releases everything speex_jitter_init() attached to a jitter state: the
 * decoder (if any), the jitter buffer and the SpeexBits object.
 *
 * The SpeexJitter struct itself is not freed here; the caller owns it.
 *
 * @param jitter Copy of the state whose resources are released.
 */
void SpeexOutputProcessor::speex_jitter_destroy(SpeexJitter jitter)
{
    if (jitter.dec) {
        speex_decoder_destroy(jitter.dec);
    }
    jitter_buffer_destroy(jitter.packets);

    // speex_bits_destroy() only releases the internals; the SpeexBits object was
    // allocated with new in speex_jitter_init() and must be deleted as well.
    speex_bits_destroy(jitter.current_packet);
    delete jitter.current_packet;
}
/**
 * Queues one encoded payload in a peer's jitter buffer.
 *
 * @param jitter    Jitter state (passed by value; only its handles are used).
 * @param packet    Pointer to the encoded payload bytes.
 * @param len       Payload length in bytes.
 * @param timestamp Sender timestamp of the payload.
 */
void SpeexOutputProcessor::speex_jitter_put(SpeexJitter jitter, char *packet, int len, int timestamp)
{
    // Value-initialise so that any field not set below is zero instead of
    // indeterminate when the jitter buffer reads the descriptor.
    JitterBufferPacket p = JitterBufferPacket();
    p.data = packet;
    p.len = len;
    p.timestamp = timestamp;
    p.span = jitter.frame_size;
    jitter_buffer_put(jitter.packets, &p);
}
/**
 * Produces one decoded frame for a peer.
 *
 * A packet is fetched from the jitter buffer and decoded into @p out. When no
 * packet is available the decoder is called with no input, and a decode
 * error yields a frame of zeros. The jitter buffer is ticked once per call.
 *
 * NOTE(review): @p jitter is received by value, so the valid_bits updates
 * below modify only this local copy.
 *
 * @param jitter            Jitter state of the peer.
 * @param out               Receives frame_size decoded samples.
 * @param current_timestamp Forwarded unchanged as the last argument of
 *                          jitter_buffer_get(); may be NULL.
 */
void SpeexOutputProcessor::speex_jitter_get(SpeexJitter jitter, spx_int16_t *out, int *current_timestamp)
{
    char payload[FRAME_SIZE * ECHOTAILSIZE * 10];
    JitterBufferPacket packet;
    packet.data = payload;
    packet.len = FRAME_SIZE * ECHOTAILSIZE * 10;

    if (jitter.valid_bits)
    {
        /* Try decoding the remainder of the last received packet */
        if (speex_decode_int(jitter.dec, jitter.current_packet, out) == 0)
        {
            jitter_buffer_tick(jitter.packets);
            return;
        }
        jitter.valid_bits = 0;
    }

    // A NULL current_timestamp is simply passed through as NULL.
    const int fetchStatus = jitter_buffer_get(jitter.packets, &packet, jitter.frame_size, current_timestamp);

    if (fetchStatus == JITTER_BUFFER_OK)
    {
        speex_bits_read_from(jitter.current_packet, packet.data, packet.len);
        /* Decode packet */
        if (speex_decode_int(jitter.dec, jitter.current_packet, out) == 0)
        {
            jitter.valid_bits = 1;
        }
        else
        {
            /* Error while decoding: output silence */
            for (int k = 0; k < jitter.frame_size; k++)
                out[k] = 0;
        }
    }
    else
    {
        /* No packet found: decode with no input */
        speex_decode_int(jitter.dec, NULL, out);
    }

    // Adjust the buffering delay only while the decoder reports low activity.
    spx_int32_t activity;
    speex_decoder_ctl(jitter.dec, SPEEX_GET_ACTIVITY, &activity);
    if (activity < 30)
    {
        jitter_buffer_update_delay(jitter.packets, &packet, NULL);
    }

    jitter_buffer_tick(jitter.packets);
}
/** @return The pointer timestamp reported by the peer's jitter buffer. */
int SpeexOutputProcessor::speex_jitter_get_pointer_timestamp(SpeexJitter jitter)
{
    const int pointerTimestamp = jitter_buffer_get_pointer_timestamp(jitter.packets);
    return pointerTimestamp;
}