// www.pudn.com > speechVAD.rar > VAD.cpp
// VAD.cpp: implementation of the CVAD class.
//
//////////////////////////////////////////////////////////////////////
#include "stdafx.h"
#include "VAD.h"
#include "DllUseLib.h"
//////////////////////////////////////////////////////////////////////
// Construction/Destruction
//////////////////////////////////////////////////////////////////////
// Coefficient table passed to own_ippsIIR_BiQuadDirect_16s() by
// PER_BandPassAndDownSample(), together with NUMBIQUAD.
// NOTE(review): twelve values laid out as two rows of six -- presumably one
// row per biquad section with NUMBIQUAD == 2; the meaning of each column is
// defined by the IIR helper, which is not visible in this file. Confirm
// before editing.
short pTaps[] = {
    /* row 0 */  500, 1001, 500, 14, -22366,  7824,
    /* row 1 */  500, 1001, 500, 14, -26406, 12198
};
// Default constructor: every member is set up by VADInit().
CVAD::CVAD()
{
    this->VADInit();
}
// Destructor: releases the three heap buffers allocated in VADInit().
CVAD::~CVAD()
{
    // VADInit() obtains the state block with `new BYTE[cStateBytes]` and only
    // then casts it to VADStateStruct*. It therefore has to be released
    // through a BYTE pointer; delete[] on the VADStateStruct* alias would not
    // match the type the array was allocated with.
    delete[] reinterpret_cast<BYTE*>(pState);
    pState = NULL;

    // delete[] on a null pointer is a no-op, so no explicit checks are needed.
    delete[] m_pReadBuffer;
    m_pReadBuffer = NULL;

    delete[] m_pDecResultBuf;
    m_pDecResultBuf = NULL;
}
void CVAD::VADInit()
{
cWinSamps = WINDOW_LEN_SAMPS; //samples in one frame
cFrameShiftSamps = FRAME_SHIFT_SAMPS; //samples shift
m_nAllDateLenInSamps = VAD_ALL_SAMPS; //samples in the buffer to be processed
cReuseSamps = cWinSamps - cFrameShiftSamps;//reuse samps
m_pReadBuffer = NULL;
m_pDecResultBuf = NULL;
pState = NULL;
//alloc DecResultBuf
int nDecNumInFirstVoice = (m_nAllDateLenInSamps-cWinSamps)/cFrameShiftSamps + 1;
int nDecNumInCommVoice = m_nAllDateLenInSamps / cFrameShiftSamps;
m_pDecResultBuf = new int[nDecNumInCommVoice];
VAD_GetStateSizeBytes(cWinSamps, &cStateBytes); /* compute the size in bytes required for the VAD Internal State */
pState = (VADStateStruct*)(new BYTE[cStateBytes]);
m_pReadBuffer = new short[cWinSamps];
VAD_Init(pState, FRAME_SHIFT_MSEC, cWinSamps, SAMPLING_FREQUENCY_HZ);
prevDecisionState = INACTIVE;
prevInputSample = 0;
m_bIsFirstSegment = TRUE;//when init VAD first voise segment
}
/////////////////////////////
//VAD process frame
/*
// in:
// pFrameBuffer: pointer to this buffer
// bufferLenInSams: buffer len in samps // VAD_ALL_SAMPS
// isFistVoise: first voise segment
// out: VAD descion
*////////////////////////////
int CVAD::VADProcessFrame(short * pFrameBuffer, int bufferLenInSmps)
{
int nDecNumInFirstVoice = (bufferLenInSmps-cWinSamps)/cFrameShiftSamps + 1;
int nDecNumInCommVoice = bufferLenInSmps / cFrameShiftSamps;
int nReadSampsNum;
int nSmpsLeftInBuf = bufferLenInSmps;
int nIndex = 0;
int VAD_FinalDec;
VADDecisionState curDecisionState = NODECISION;
if (m_bIsFirstSegment == TRUE)
{
memset(m_pDecResultBuf, -1, nDecNumInCommVoice*sizeof(int));
//read the first frame to process
nReadSampsNum = cWinSamps;
memcpy(m_pReadBuffer, pFrameBuffer, nReadSampsNum* sizeof(short)); //copy the first frame into process buffer
nSmpsLeftInBuf -= nReadSampsNum;
//remove DC
prevInputSample = 0;
own_ippsCompensateOffsetQ15_16s_I(m_pReadBuffer, nReadSampsNum, &prevInputSample, 0, DC_RMV_COEF);
prevOutputSample = m_pReadBuffer[nReadSampsNum-1];
while (nSmpsLeftInBuf >= cFrameShiftSamps)
{
/* perform VAD */
VAD_ProcessFrame(pState, m_pReadBuffer, cWinSamps, &curDecisionState, &decisionFrameNum);
/*put this dec in dec buffer*/
if ((nIndex >= 0) && (nIndex < nDecNumInFirstVoice))
{
m_pDecResultBuf[nIndex] = curDecisionState;
}
/* save the samples to be re-used from the recently processed frame and read another (lookahead) frame shift of data. */
//ippsCopy_16s(m_pReadBuffer+cFrameShiftSamps, m_pReadBuffer, cReuseSamps);
memcpy(m_pReadBuffer, m_pReadBuffer+cFrameShiftSamps, cReuseSamps*sizeof(short));
memcpy(m_pReadBuffer+cReuseSamps, pFrameBuffer+cWinSamps+nIndex*cFrameShiftSamps, cFrameShiftSamps*sizeof(short));
nSmpsLeftInBuf -= cFrameShiftSamps;
nIndex++;
/* remove DC */
own_ippsCompensateOffsetQ15_16s_I(m_pReadBuffer+cReuseSamps, cFrameShiftSamps, &prevInputSample, prevOutputSample, DC_RMV_COEF);
prevOutputSample = m_pReadBuffer[cReuseSamps+cFrameShiftSamps-1];
}
/*process the dec buffer*/
VAD_FinalDec = VADProcessDecBuf(&prevDecisionState, nDecNumInFirstVoice, m_pDecResultBuf);//prevDecisionState changed in the func
m_bIsFirstSegment = FALSE;
}
else//in the process of qq chat
{
memset(m_pDecResultBuf, -1, nDecNumInCommVoice*sizeof(int));
//copy the first shift frame to buffer to process
//ippsCopy_16s(m_pReadBuffer+cFrameShiftSamps, m_pReadBuffer, cReuseSamps);
memcpy(m_pReadBuffer, m_pReadBuffer+cFrameShiftSamps, cReuseSamps);
memcpy(m_pReadBuffer+cReuseSamps, pFrameBuffer, cFrameShiftSamps*sizeof(short));
/* remove DC */
own_ippsCompensateOffsetQ15_16s_I(m_pReadBuffer+cReuseSamps, cFrameShiftSamps, &prevInputSample, prevOutputSample, DC_RMV_COEF);
prevOutputSample = m_pReadBuffer[cReuseSamps+cFrameShiftSamps-1];
nSmpsLeftInBuf -= cFrameShiftSamps;
while (nSmpsLeftInBuf >= cFrameShiftSamps)
{
/* perform VAD */
VAD_ProcessFrame(pState, m_pReadBuffer, cWinSamps, &curDecisionState, &decisionFrameNum);
/*put this dec in dec buffer*/
if ((nIndex >= 0) && (nIndex < nDecNumInCommVoice))
{
m_pDecResultBuf[nIndex] = curDecisionState;
}
nIndex++;
/* save the samples to be re-used from the recently processed frame and read another (lookahead) frame shift of data. */
//ippsCopy_16s(m_pReadBuffer+cFrameShiftSamps, m_pReadBuffer, cReuseSamps);
memcpy(m_pReadBuffer, m_pReadBuffer+cFrameShiftSamps, cReuseSamps);
memcpy(m_pReadBuffer+cReuseSamps, pFrameBuffer+nIndex*cFrameShiftSamps, cFrameShiftSamps*sizeof(short));
nSmpsLeftInBuf -= cFrameShiftSamps;
/* remove DC */
own_ippsCompensateOffsetQ15_16s_I(m_pReadBuffer+cReuseSamps, cFrameShiftSamps, &prevInputSample, prevOutputSample, DC_RMV_COEF);
prevOutputSample = m_pReadBuffer[cReuseSamps+cFrameShiftSamps-1];
}
/*process the dec buffer*/
VAD_FinalDec = VADProcessDecBuf(&prevDecisionState, nDecNumInFirstVoice, m_pDecResultBuf);//prevDecisionState changed in the func
}
return VAD_FinalDec;
}
// NOTE(review): the source text of this region is damaged. The `for` header a
// few lines below runs straight into `frameNum >= 2147483647)`; whatever stood
// between `i` and `frameNum` is missing, and the code as written cannot
// compile. The statements that follow the damaged line use pInFrame, len,
// pDecisionState and pDecisionFrame, none of which are parameters of
// VADProcessDecBuf, so they presumably belong to VAD_ProcessFrame (which
// VADProcessFrame calls with a state pointer, a frame, a length and two
// output pointers). TODO: restore both functions from an intact copy.
//
//preDec is &prevDecisionState, changed in func
int CVAD::VADProcessDecBuf(VADDecisionState *preDec, int bufLen, int *pDecBuffer)
{
int i;
int return_Value;
BOOL decExist = FALSE;
if (*preDec == INACTIVE)
{
/* NOTE(review): damaged line -- text lost between `i` and `frameNum`. */
for (i=0; iframeNum >= 2147483647)
{
/* frame counter reached INT_MAX: restart it at 1000 instead of overflowing */
pState->frameNum = 1000;
}
else
{
pState->frameNum++;
}
/* Energy-Based state update */
E_UpdateEnergyState(pInFrame, pState->frameNum, pState->pEState);
/* Periodicity-Based state update */
PER_UpdatePerState(pInFrame, pState->frameNum, len, pState->pPerState);
/* State Machine update */
SM_UpdateState(pState);
/* Update the output decision variables based on the VAD internal state */
if (pState->pSM->uttHasStartedFlag)
{
/* if utterance start detected */
*pDecisionState = ACTIVE;
*pDecisionFrame = pState->pSM->uttBegFrameNum;
/* reset the start flag for the next utterance */
pState->pSM->uttHasStartedFlag=0;
}
else if (pState->pSM->uttHasEndedFlag)
{
/* if utterance end detected */
*pDecisionState = INACTIVE;
*pDecisionFrame = pState->pSM->uttEndFrameNum;
/* reset the end flag for the next utterance */
pState->pSM->uttHasEndedFlag = 0;
}
else
{
/* if neither start or end of utterance was detected */
*pDecisionState = NODECISION;
*pDecisionFrame = -1;
}
}
/********************************************************************************
// Name: VAD_GetStateSizeBytes
// Description: Calculate and return the size in bytes required by the VAD
// internal state structure based on the input parameters
// Input Arguments:
// cWinSamps - size of an input data frame in samples
// Output Arguments:
// pNumStateBytes - pointer to output variable containing the
// calculated state size
// Returns: None
// Notes:
********************************************************************************/
void CVAD::VAD_GetStateSizeBytes(int cWinSamps, int* pNumStateBytes)
{
    /* Total = VADStateStruct itself + energy + periodicity + state-machine
       sub-states, in the order in which VAD_Init() lays them out. */
    int partBytes; /* receives the size of each sub-state in turn */
    int totalBytes = sizeof(VADStateStruct);

    E_GetStateSizeBytes(&partBytes);
    totalBytes += partBytes;

    PER_GetStateSizeBytes(cWinSamps, &partBytes);
    totalBytes += partBytes;

    SM_GetStateSizeBytes(&partBytes);
    totalBytes += partBytes;

    *pNumStateBytes = totalBytes;
}
/********************************************************************************
// Name: VAD_Init
// Description: Initialize the VAD state structure with initial values. Also,
// assign the externally allocated memory to the internal variables
// of the VADStateStruct.
//
// Input Arguments:
// frameShiftMsec - frame shift for overlapping frames in msec
// cWinSamps - size of an input data frame in samples
// sampFreqHz - sampling frequency of the input data in Hz
// Input/Output Arguments:
// pState - pointer to an VADState structure
//
// Returns: None
********************************************************************************/
void CVAD::VAD_Init(VADStateStruct* pState, int frameShiftMsec, int cWinSamps, int sampFreqHz)
{
    int blockBytes; /* size of the sub-state that was just placed */

    /* no frame has been processed yet */
    pState->frameNum = 0;

    /* the sub-states are carved out of the memory that starts directly
       behind the pMemoryBlock member of the VADStateStruct */
    char* cursor = (char*)(&pState->pMemoryBlock + 1);

    /* energy sub-state */
    pState->pEState = (EStateStruct *) cursor;
    E_Init(pState->pEState, cWinSamps, frameShiftMsec);
    E_GetStateSizeBytes(&blockBytes);
    cursor += blockBytes;

    /* periodicity sub-state */
    pState->pPerState = (PERStateStruct *) cursor;
    PER_Init(pState->pPerState, frameShiftMsec, cWinSamps, sampFreqHz);
    PER_GetStateSizeBytes(cWinSamps, &blockBytes);
    cursor += blockBytes;

    /* state-machine sub-state */
    pState->pSM = (SMStruct *) cursor;
    SM_Init(pState->pSM, frameShiftMsec);
    SM_GetStateSizeBytes(&blockBytes);
    cursor += blockBytes; /* cursor now points one past the last sub-state */
}
/********************************************************************************
// Name: VAD_ProcessEndOfInput
// Description: This function is called at the end of input data stream to check
// if the VAD state machine was already in the HANG state when the stream
// ended. If so, utterance endpoint is flagged accordingly.
//
// Input/Output Arguments:
// pState - pointer to the VAD state structure
// outputArguments:
// pDecisionState - pointer to output variable that contains the decision
// made by VAD
// pDecisionFrame - pointer to output variable that contains the frame number
// (counted from zero) of the determined endpoint
// Returns: None
********************************************************************************/
void CVAD::VAD_ProcessEndOfInput(
    VADStateStruct* pState,
    VADDecisionState* pDecisionState,
    int* pDecisionFrame)
{
    /* No pending end-of-utterance: report end of stream with no frame number. */
    if (!pState->pSM->uttHasEndedFlag)
    {
        *pDecisionState = END_OF_STREAM;
        *pDecisionFrame = -1;
        return;
    }

    /* An utterance end is pending: report it and consume the flag so that it
       is not reported a second time. */
    *pDecisionState = INACTIVE;
    *pDecisionFrame = pState->pSM->uttEndFrameNum;
    pState->pSM->uttHasEndedFlag = 0;
}
/* EOF */
/********************************************************************************
// Name: E_GetStateSizeBytes
// Description: Calculate and return the size in bytes required by the Energy
// state structure.
// Input Arguments:
// None
// Output Arguments:
// pNumStateBytes - pointer to output variable containing the
// calculated state size
// Returns: None
// Notes:
********************************************************************************/
void CVAD::E_GetStateSizeBytes(int* pNumStateBytes)
{
    /* the energy state is just the EStateStruct; nothing is appended to it here */
    const int structBytes = sizeof(EStateStruct);
    *pNumStateBytes = structBytes;
}
/********************************************************************************
// Name: E_Init
// Description: Initialize the Energy state structure with initial values. Also,
// assign the externally allocated memory to the internal variables
// of the EStateStruct.
//
// Input Arguments:
// cWinSamps - size of an input data frame in samples
// frameShiftMsec - frame shift for overlapping frames in msec
// Input/Output Arguments:
// pEState - pointer to an EState structure
// Returns: None
********************************************************************************/
void CVAD::E_Init(EStateStruct* pEState, int cWinSamps, int frameShiftMsec)
{
    int halfFrameShiftMsec;
    int i;

    /* number of samples in a frame */
    pEState->cFrameSamples = cWinSamps;

    /* Number of right shifts applied to each term of the energy summation:
       the count of significant bits in cWinSamps. The field is always
       assigned, so a non-positive cWinSamps yields 0 instead of leaving the
       shift count uninitialised for E_ComputeEnergyDB(). */
    pEState->cScaleFactor = 0;
    for (i = cWinSamps; i > 0; i >>= 1)
    {
        pEState->cScaleFactor++;
    }

    /* half the frame shift, used to round the msec -> frames conversion */
    halfFrameShiftMsec = frameShiftMsec/2;

    /* number of leading frames over which the noise floor is averaged */
    pEState->cInitNoiseEstFrames = (E_INIT_NOISE_ESTIMATE_MSEC + halfFrameShiftMsec) / frameShiftMsec;

    /* starting value of the running average that initialises the noise floor */
    pEState->noiseFloorDB = E_MIN_ENERGY_DB;

    /* Give the remaining per-frame fields defined starting values as well:
       the energy uses the same floor value E_ComputeEnergyDB() falls back to,
       and the threshold uses the same formula as
       E_ComputeNoiseEnergyThreshDB(). */
    pEState->energyDB = E_MIN_ENERGY_DB;
    pEState->noiseThreshDB = pEState->noiseFloorDB + E_THRESH_OFFSET_DB_Q15;
}
/********************************************************************************
// Name: E_UpdateEnergyState
// Description: Update the energy state based on the current input frame.
//
// Input Arguments:
// pFrame - input frame
// frameNum - current frame number used in initializing the noise floor
// Input/Output Arguments:
// pEState - pointer to an EState structure
//
// Returns: None
********************************************************************************/
void CVAD::E_UpdateEnergyState(const short* pFrame, int frameNum, EStateStruct* pEState)
{
/* compute frame energy in DB */
E_ComputeEnergyDB(pFrame, pEState);
/* compute threshold for noise for the current frame */
E_ComputeNoiseEnergyThreshDB(pEState, frameNum);
}
/********************************************************************************
// Name: E_ComputeEnergyDB
// Description: Compute the energy of the input frame in DB. The energy is computed
// as the variance of the samples in the input frame. The variance is
// subsequently converted into DB (10log10).
//
// Input Arguments:
// pFrame - input frame
// Input/Output Arguments:
// pEState - pointer to an EState structure
//
// Returns: None
********************************************************************************/
void CVAD::E_ComputeEnergyDB(const short* pFrame, EStateStruct* pEState)
{
int sumSqr; /* sum square of the input samples */
int i;
sumSqr = 0;
/* compute sum-square and sum of the input samples */
for (i=0; icFrameSamples; i++)
{
sumSqr += (pFrame[i]*pFrame[i]) >> pEState->cScaleFactor;
}
/* convert to DB using Intel(R) IPP call */
if (sumSqr > 0)
{
own_ipps10Log10_32s_Sfs(&sumSqr, &(pEState->energyDB), 1, -Q15);
}
else
{
pEState->energyDB = E_MIN_ENERGY_DB;
}
}
/********************************************************************************
// Name: E_ComputeNoiseEnergyThreshDB
// Description: Compute the noise energy threshold of the input frame in DB.
// The noise threshold is computed as the sum of a noise floor and
// a correction term.
// The noise floor is initialized as the average
// energy over the initial E_INIT_NOISE_ESTIMATE_MSEC of input data.
// The noise floor is then updated in hypothesized non-speech regions
// using either a slow or fast adaptation factor.
//
// Input Arguments:
// frameNum - current frame number used in initializing the noise floor
// Input/Output Arguments:
// pEState - pointer to an EState structure
//
// Returns: None
********************************************************************************/
void CVAD::E_ComputeNoiseEnergyThreshDB(EStateStruct* pEState, int frameNum)
{
    const bool stillInitialising = (frameNum <= pEState->cInitNoiseEstFrames);

    if (!stillInitialising)
    {
        /* Track the floor: fast adaptation while the frame is at or below the
           previous threshold, slow adaptation while it is above it. */
        if (pEState->energyDB <= pEState->noiseThreshDB)
        {
            WEIGHTED_AVG_Q15(pEState->energyDB, pEState->noiseFloorDB, E_FAST_ADAPT_ALPHA_Q15, pEState->noiseFloorDB)
        }
        else
        {
            WEIGHTED_AVG_Q15(pEState->energyDB, pEState->noiseFloorDB, E_SLOW_ADAPT_ALPHA_Q15, pEState->noiseFloorDB)
        }
    }
    else
    {
        /* Start-up phase: running mean of the frame energies seen so far. */
        pEState->noiseFloorDB = pEState->noiseFloorDB * (frameNum-1) + pEState->energyDB;
        pEState->noiseFloorDB /= frameNum;
    }

    /* threshold = floor + fixed offset */
    pEState->noiseThreshDB = pEState->noiseFloorDB + E_THRESH_OFFSET_DB_Q15;
}
/* EOF */
/********************************************************************************
// Name: PER_GetStateSizeBytes
// Description: Calculate and return the size in bytes required by the
// periodicity internal state structure based on the input parameters
// Input Arguments:
// cWinSamps - size of an input data frame in samples
// Output Arguments:
// pNumStateBytes - pointer to output variable that contains the
// calculated state size
// Returns: None
// Notes:
********************************************************************************/
void CVAD::PER_GetStateSizeBytes(int cWinSamps, int* pNumStateBytes)
{
    /* fixed part: the PERStateStruct itself */
    int totalBytes = sizeof(PERStateStruct);

    /* band-pass filtered frame (pBPFrame) plus its padding */
    const int bpBytes = cWinSamps *sizeof(short);
    const int bpPadBytes = bpBytes % sizeof(int);
    totalBytes += bpBytes;
    totalBytes += bpPadBytes;

    /* downsampled frame (pDSFrame) plus its padding */
    int dsSamps;
    own_ippsDownSampleSize(cWinSamps, PER_DOWNSAMPLE_FACTOR, PER_DOWNSAMPLE_PHASE, &dsSamps);
    const int dsBytes = dsSamps * sizeof(short);
    const int dsPadBytes = dsBytes % sizeof(int);
    totalBytes += dsBytes;
    totalBytes += dsPadBytes;

    /* IIR delay line: two ints per biquad section */
    const int delayBytes = (NUMBIQUAD*2) * sizeof(int);
    totalBytes += delayBytes;

    *pNumStateBytes = totalBytes;
}
/********************************************************************************
// Name: PER_Init
// Description: Initialize the Periodicity state structure with initial values. Also,
// assign the externally allocated memory to the internal variables
// of the PERStateStruct.
//
// Input Arguments:
// frameShiftMsec - frame shift for overlapping frames in msec
// cWinSamps - size of an input data frame in samples
// sampFreqHz - sampling frequency of the input data in Hz
// Input/Output Arguments:
// pPerState - pointer to an PERState structure to be initialized
// Returns: None
********************************************************************************/
void CVAD::PER_Init(PERStateStruct* pPerState, int frameShiftMsec, int cWinSamps, int sampFreqHz)
{
    /* half the frame shift, used to round the msec -> frames conversion */
    const int roundingMsec = frameShiftMsec/2;

    /* scalar parameters */
    pPerState->cInitPerEstFrames = (PER_INIT_PER_ESTIMATE_MSEC + roundingMsec) / frameShiftMsec;
    pPerState->minPeriodSamps = sampFreqHz / (PER_DOWNSAMPLE_FACTOR * PER_MAX_PITCH_FREQ_HZ);
    pPerState->maxPeriodSamps = sampFreqHz / (PER_DOWNSAMPLE_FACTOR * PER_MIN_PITCH_FREQ_HZ);
    pPerState->smoothPeriodicityQ15 = 0;

    /* the buffers live directly behind the pMemoryBlock member of the struct,
       in the same order and with the same padding that
       PER_GetStateSizeBytes() accounts for */
    char* cursor = (char*)(&pPerState->pMemoryBlock + 1);
    int spanBytes;

    /* band-pass filtered frame */
    pPerState->pBPFrame = (short *) cursor;
    spanBytes = cWinSamps * sizeof(short);
    spanBytes += (spanBytes % sizeof(int));
    cursor += spanBytes;

    /* downsampled frame; its length is reported by the helper */
    own_ippsDownSampleSize(cWinSamps, PER_DOWNSAMPLE_FACTOR, PER_DOWNSAMPLE_PHASE, &pPerState->cDSFrameSamps);
    pPerState->pDSFrame = (short *) cursor;
    spanBytes = pPerState->cDSFrameSamps * sizeof(short);
    spanBytes += (spanBytes % sizeof(int));
    cursor += spanBytes;

    /* IIR delay line, cleared to zero */
    pPerState->pDelayLine = (int *) cursor;
    for (int k = 0; k < (NUMBIQUAD*2); ++k)
    {
        pPerState->pDelayLine[k] = 0;
    }
}
/********************************************************************************
// Name: PER_UpdatePerState
// Description: Update the periodicity state based on the current input frame.
//
// Input Arguments:
// pFrame - input frame
// frameNum - current frame number used in initializing the noise floor
// len - number of samples in the input frame
// Input/Output Arguments:
// pPerState - pointer to an periodicity state structure
//
// Returns: None
********************************************************************************/
void CVAD::PER_UpdatePerState(const short* pFrame, int frameNum, int len, PERStateStruct* pPerState)
{
/* bandpass filter the input data */
PER_BandPassAndDownSample(pFrame, len, pPerState);
/* compute periodicity on the bandpass filtered data */
PER_ComputePeriodicity(pPerState->pDSFrame, pPerState->cDSFrameSamps, pPerState);
/* smooth the periodicity using history */
PER_SmoothPeriodicity(pPerState, frameNum);
}
/********************************************************************************
// Name: PER_BandPassAndDownSample
// Description: Band-pass filter (70-1000Hz) the input data using Intel(R) IPP data
//
// Input Arguments:
// pSrc - input data
// srcLen - number of samples in the input and output buffers
// Output Arguments:
// pPerState - output band pass filtered data
// Returns: None
********************************************************************************/
void CVAD::PER_BandPassAndDownSample(const short* pSrc, int srcLen, PERStateStruct* pPerState)
{
    /* IIR biquad filter: pSrc -> pBPFrame, using the pTaps table and the
       delay line kept in the state */
    own_ippsIIR_BiQuadDirect_16s(pSrc, pPerState->pBPFrame, srcLen, pTaps, NUMBIQUAD, pPerState->pDelayLine);

    /* downsample: pBPFrame -> pDSFrame; the phase is handed over by address,
       so a fresh local copy of the constant is used on every call */
    int dsPhase = PER_DOWNSAMPLE_PHASE;
    own_ippsDownSample_16s(pPerState->pBPFrame, srcLen, &dsPhase, pPerState->pDSFrame, PER_DOWNSAMPLE_FACTOR);
}
/********************************************************************************
// Name: PER_ComputePeriodicity
// Description: Compute periodicity using Intel(R) IPP function
//
// Input Arguments:
// pInFrame - input data
// len - number of samples in the input buffer
// Input/Output Arguments:
// pPerState - pointer to an periodicity state structure
//
// Returns: None
********************************************************************************/
void CVAD::PER_ComputePeriodicity(const short* pInFrame, int len, PERStateStruct* pPerState)
{
    /* Thin wrapper: the helper writes its results into the periodicityQ15 and
       period fields, searching within the period bounds set by PER_Init(). */
    own_ippsPeriodicityLSPE_16s(
        pInFrame,
        len,
        &(pPerState->periodicityQ15),
        &(pPerState->period),
        pPerState->maxPeriodSamps,
        pPerState->minPeriodSamps);
}
/********************************************************************************
// Name: PER_SmoothPeriodicity
// Description: Smooth the computed periodicity by summing over periodicity history.
// The average noise periodicity value is removed before smoothing.
//
// Input Arguments:
// frameNum - current frame number used in initializing the noise floor
// Input/Output Arguments:
// pPerState - pointer to an periodicity state structure
//
// Returns: None
********************************************************************************/
void CVAD::PER_SmoothPeriodicity(PERStateStruct* pPerState, int frameNum)
{
    if (frameNum > pPerState->cInitPerEstFrames)
    {
        /* steady state: weighted average of the new value and the history */
        WEIGHTED_AVG_Q15(pPerState->periodicityQ15, pPerState->smoothPeriodicityQ15, PER_ADAPT_ALPHA_Q15,
        pPerState->smoothPeriodicityQ15)
    }
    else
    {
        /* start-up phase: running mean over the frames seen so far */
        pPerState->smoothPeriodicityQ15 = pPerState->smoothPeriodicityQ15 * (frameNum-1) + pPerState->periodicityQ15;
        pPerState->smoothPeriodicityQ15 /= frameNum;
    }
}
/* EOF */
/********************************************************************************
// Name: SM_GetStateSizeBytes
// Description: Calculate and return the size in bytes required by the State
// Machine (SM) structure.
// Output Arguments:
// pNumStateBytes - return variable holding the calculated state size
//
// Returns: None
// Notes:
********************************************************************************/
void CVAD::SM_GetStateSizeBytes(int* pNumStateBytes)
{
    /* the state machine needs exactly one SMStruct */
    const int structBytes = sizeof(SMStruct);
    *pNumStateBytes = structBytes;
}
/********************************************************************************
// Name: SM_Init
// Description: Initialize the state machine structure with initial values.
//
// Input Arguments:
// frameShiftMsec - frame shift for overlapping frames in msec
// Input/Output Arguments:
// pSMState - pointer to an SM structure to be initialized
//
// Returns: None
********************************************************************************/
void CVAD::SM_Init(SMStruct* pSMState, int frameShiftMsec)
{
    /* adding half a frame shift before dividing rounds msec -> frames to nearest */
    const int roundingMsec = frameShiftMsec/2;

    /* the machine starts in SILENCE with all counters, flags and frame marks cleared */
    pSMState->state = SILENCE;
    pSMState->cOnsetFrames = 0;
    pSMState->cEnergyHangFrames = 0;
    pSMState->cPerHangFrames = 0;
    pSMState->uttHasStartedFlag = 0;
    pSMState->uttHasEndedFlag = 0;
    pSMState->uttBegFrameNum = 0;
    pSMState->uttEndFrameNum = 0;
    pSMState->prevUttEndFrameNum = 0;

    /* duration thresholds, converted from msec to frames */
    pSMState->cMinOnsetFrames = (ONSET_THRESHOLD_MSEC + roundingMsec) / frameShiftMsec;
    pSMState->cMinEnergyHangFrames = (ENERGY_HANG_THRESH_MSEC + roundingMsec) / frameShiftMsec;
    pSMState->cMinPerHangFrames = (PER_HANG_THRESH_MSEC + roundingMsec) / frameShiftMsec;
    pSMState->cUttBegAdjustFrames = (UTT_BEG_ADJUSTMENT_MSEC + roundingMsec) / frameShiftMsec;
}
/********************************************************************************
// Name: SM_UpdateState
// Description: Update the state machine based on energy and periodicity
//
// Input/Output Arguments:
// pState - pointer to the VAD state structure
//
// Returns: None
********************************************************************************/
void CVAD::SM_UpdateState(VADStateStruct* pState)
{
    /* refresh the energy/periodicity flags, then advance the state machine */
    SM_UpdateSMParams(pState);
    SM_UpdateSMState(pState->pSM, pState->frameNum);

    SMStruct* const sm = pState->pSM;

    if (sm->uttHasStartedFlag)
    {
        /* move the reported start back by a fixed number of frames, but never
           to or before the end of the previous utterance */
        sm->uttBegFrameNum -= sm->cUttBegAdjustFrames;
        if (sm->uttBegFrameNum <= sm->prevUttEndFrameNum)
        {
            sm->uttBegFrameNum = sm->prevUttEndFrameNum + 1;
        }
        return;
    }

    if (sm->uttHasEndedFlag)
    {
        /* remember where this utterance ended; it limits the start
           adjustment of the next one */
        sm->prevUttEndFrameNum = pState->frameNum;
    }
}
/********************************************************************************
// Name: SM_UpdateSMParams
// Description: Update the energy and periodicity related flags based on the
// their values with respect to their thresholds.
//
// Input/Output Arguments:
// pState - pointer to the VADState structure.
//
// Returns: None
********************************************************************************/
void CVAD::SM_UpdateSMParams(VADStateStruct* pState)
{
    /* energy flag: set when the frame energy is above the noise threshold */
    pState->pSM->energySpeechIsActiveFlag =
        (pState->pEState->energyDB > pState->pEState->noiseThreshDB) ? 1 : 0;

    /* periodicity flag: set when the smoothed periodicity is above its fixed threshold */
    pState->pSM->perSpeechIsActiveFlag =
        (pState->pPerState->smoothPeriodicityQ15 > PER_SPEECH_THRESHOLD_Q15) ? 1 : 0;
}
/********************************************************************************
// Name: SM_UpdateSMState
// Description: Update the state of the VAD state machine based on onset and
// hang times. The VAD state machine can be in one of the following states
// 1) SILENCE - No speech is present.
// Allowed transitions -
// SILENCE -> SILENCE : atleast one of the energy/periodicity
// flags indicates speech inactivity.
// SILENCE -> ONSET : both energy/periodicity flag indicate
// speech activity
//
// 2) ONSET - Start of an utterance may have been detected.
// Allowed transitions -
// ONSET -> SPEECH : both the energy/periodicity flags indicate
// speech activity in consecutive ONSET_THRESHOLD_MSEC
// frames. This transition sets the utterance start flag.
// ONSET -> SILENCE : both the energy/periodicity flags are false (do not
// indicate speech activity) in consecutive
// ONSET_THRESHOLD_MSEC frames
// ONSET -> ONSET : both energy/periodicity flag indicate
// speech activity but ONSET_THRESHOLD_MSEC
// has not been reached
// 3) SPEECH - Speech is present.
// Allowed transitions -
// SPEECH -> HANG : at least one of energy/periodicity
// flags do not indicate speech activity
// SPEECH -> SPEECH : both energy/periodicity flag indicate
// speech activity
//
// 4) HANG - End of an utterance may have been detected.
// Allowed transitions -
// HANG -> SPEECH : if both energy/periodicity flags indicate
// speech activity before hang times
// ENERGY_HANG_THRESH_MSEC or PER_HANG_THRESH_MSEC
// are completed
// HANG -> HANG : at least one of energy/periodicity
// flags do not indicate speech activity in consecutive
// frames but hang time ENERGY_HANG_THRESH_MSEC or
// PER_HANG_THRESH_MSEC are not yet completed
// HANG -> SILENCE : at least one of energy/periodicity flags do not indicate
// speech activity in consecutive frames and hang times
// ENERGY_HANG_THRESH_MSEC or PER_HANG_THRESH_MSEC are
// is completed. This transition sets the utterance end flag.
// Input Arguments:
// frameNum - current frame number used for setting the start/end frame number.
// Input/Output ARguments:
// pSM - pointer to the SM structure.
//
// Returns: None
********************************************************************************/
void CVAD::SM_UpdateSMState(SMStruct* pSM, int frameNum)
{
    /* snapshot of the two per-frame flags set by SM_UpdateSMParams() */
    const bool energyActive = (pSM->energySpeechIsActiveFlag != 0);
    const bool perActive = (pSM->perSpeechIsActiveFlag != 0);
    const bool bothActive = energyActive && perActive;

    switch (pSM->state)
    {
    case SILENCE:
        /* SILENCE -> ONSET as soon as both measures report speech */
        if (bothActive)
        {
            pSM->state = ONSET;
            pSM->cOnsetFrames = 1;
        }
        break;

    case ONSET:
        if (!bothActive)
        {
            /* false onset: ONSET -> SILENCE, counter reset */
            pSM->state = SILENCE;
            pSM->cOnsetFrames = 0;
            break;
        }
        /* one more consecutive onset frame */
        ++(pSM->cOnsetFrames);
        if (pSM->cOnsetFrames >= pSM->cMinOnsetFrames)
        {
            /* onset lasted long enough: ONSET -> SPEECH, flag utterance start */
            pSM->uttHasStartedFlag = 1;
            pSM->uttBegFrameNum = frameNum - pSM->cMinOnsetFrames;
            pSM->state = SPEECH;
        }
        break;

    case SPEECH:
        /* either measure dropping out starts its own hang counter and moves
           SPEECH -> HANG; both checks run independently */
        if (!energyActive)
        {
            pSM->cEnergyHangFrames = 1;
            pSM->state = HANG;
        }
        if (!perActive)
        {
            pSM->cPerHangFrames = 1;
            pSM->state = HANG;
        }
        break;

    case HANG:
        /* consecutive inactive frames per measure; an active frame resets the count */
        if (energyActive)
        {
            pSM->cEnergyHangFrames = 0;
        }
        else
        {
            ++(pSM->cEnergyHangFrames);
        }

        if (perActive)
        {
            pSM->cPerHangFrames = 0;
        }
        else
        {
            ++(pSM->cPerHangFrames);
        }

        if ( (pSM->cEnergyHangFrames >= pSM->cMinEnergyHangFrames)
          || (pSM->cPerHangFrames >= pSM->cMinPerHangFrames) )
        {
            /* a hang time elapsed: HANG -> SILENCE, flag utterance end */
            pSM->uttEndFrameNum = frameNum - 1;
            pSM->uttHasEndedFlag = 1;
            pSM->cEnergyHangFrames = 0;
            pSM->cPerHangFrames = 0;
            pSM->state = SILENCE;
        }
        else if ((0 == pSM->cEnergyHangFrames)
              && (0 == pSM->cPerHangFrames))
        {
            /* both counters are back at zero, i.e. both measures are active
               again: false hang, HANG -> SPEECH */
            pSM->state = SPEECH;
        }
        break;

    default:
        /* unknown state value: leave everything untouched */
        break;
    }
}
/* EOF */
//void CVAD::SetLength(int nLen)
//{
// m_nAllDateLenInSamps = nLen;
//}