// Origin: www.pudn.com > speechVAD.rar > VAD.h
// VAD.h: interface for the CVAD class.
//
//////////////////////////////////////////////////////////////////////
#if !defined(AFX_VAD_H__7F330C4E_49D3_49D7_8296_35BA78D26446__INCLUDED_)
#define AFX_VAD_H__7F330C4E_49D3_49D7_8296_35BA78D26446__INCLUDED_
#if _MSC_VER > 1000
#pragma once
#endif // _MSC_VER > 1000
#include
#include
//#ifndef _WIN32_WCE
//#include
//#include
//#include
//#else
//#include "ippdefs.h"
//#include "ippSP.h"
//#include "ippSR.h"
//#endif
#include
/* Number of biquad sections -- presumably used by the band-pass filter; TODO confirm in the implementation. */
#define NUMBIQUAD                     2

/* ------------------------------------------------------------------ */
/* Constants used in energy-related computations                       */
/* ------------------------------------------------------------------ */
/* Q15 values below are the real number multiplied by 32768 (1 << 15). */
/* NOTE(review): earlier comments labelled the fast weight "during speech (0.001)" and the  */
/* slow weight "during pause (0.005)", which contradicts the numeric values; which phase    */
/* uses which weight should be confirmed against the implementation.                        */
#define E_INIT_NOISE_ESTIMATE_MSEC    250     /* msec of input used to estimate the initial noise energy */
#define E_FAST_ADAPT_ALPHA_Q15        163     /* fast noise-energy adaptation weight: 163/32768 ~= 0.005 */
#define E_SLOW_ADAPT_ALPHA_Q15        32      /* slow noise-energy adaptation weight: 32/32768 ~= 0.001 */
#define E_THRESH_OFFSET_DB_Q15        98304   /* offset above the noise energy for the threshold: 3 dB in Q15 (3 * 32768) */
#define E_MIN_ENERGY_DB               0       /* minimum energy in dB */

/* ------------------------------------------------------------------ */
/* Constants used in periodicity-related computations                  */
/* ------------------------------------------------------------------ */
#define PER_INIT_PER_ESTIMATE_MSEC    250     /* msec of input used to estimate the initial periodicity */
#define PER_ADAPT_ALPHA_Q15           4915    /* periodicity adaptation weight: 4915/32768 ~= 0.15 */
#define PER_MIN_PITCH_FREQ_HZ         100     /* minimum pitch frequency in Hz */
#define PER_MAX_PITCH_FREQ_HZ         200     /* maximum pitch frequency in Hz */
#define PER_SPEECH_THRESHOLD_Q15      10485   /* minimum periodicity for speech: 10485/32768 ~= 0.32 */
#define PER_DOWNSAMPLE_FACTOR         4       /* down-sampling factor for the band-pass filtered signal (the original line also carried a commented-out 8) */
#define PER_DOWNSAMPLE_PHASE          0       /* relative position of the input vector in the down-sampled output */

/* ------------------------------------------------------------------ */
/* Onset and hang times used by the state machine                      */
/* ------------------------------------------------------------------ */
#define ONSET_THRESHOLD_MSEC          50      /* minimum onset duration in msec */
#define ENERGY_HANG_THRESH_MSEC       300     /* minimum energy hang duration in msec */
#define PER_HANG_THRESH_MSEC          500     /* minimum periodicity hang duration in msec */
#define UTT_BEG_ADJUSTMENT_MSEC       225     /* adjustment applied to the utterance start frame number, in msec */
/* The four states of the VAD state machine. */
typedef enum
{
    SILENCE = 0,
    ONSET   = 1,
    SPEECH  = 2,
    HANG    = 3
} VADState;
/* State kept for the energy-based measure. */
typedef struct _EStateStruct
{
    /* number of input frames used to estimate an initial noise floor */
    int cInitNoiseEstFrames;
    /* energy of the current frame, in dB */
    int energyDB;
    /* noise floor estimate, in dB */
    int noiseFloorDB;
    /* noise energy threshold, in dB (noiseFloorDB + noiseEstCorrectionDB) */
    int noiseThreshDB;
    /* number of samples per frame */
    int cFrameSamples;
    /* number of right shifts to use in the energy calculation */
    int cScaleFactor;
} EStateStruct;
/* State kept for the periodicity-based measure. */
typedef struct _PERStateStruct
{
    /* number of input frames used to estimate an initial periodicity */
    int    cInitPerEstFrames;
    /* minimum pitch period in samples, corresponding to PER_MAX_PITCH_FREQ_HZ */
    int    minPeriodSamps;
    /* maximum pitch period in samples, corresponding to PER_MIN_PITCH_FREQ_HZ */
    int    maxPeriodSamps;
    /* period of the current frame */
    int    period;
    /* Q15 periodicity value for the current frame */
    short  periodicityQ15;
    /* padding so the struct size stays a multiple of the word size (4 bytes) */
    short  dummy;
    /* smoothed Q15 value of the periodicity */
    int    smoothPeriodicityQ15;
    /* number of samples in the down-sampled frame */
    int    cDSFrameSamps;
    /* buffer holding the band-pass filtered input data */
    short *pBPFrame;
    /* buffer holding the down-sampled data */
    short *pDSFrame;
    /* delay-line buffer -- presumably filter state; TODO confirm in the implementation */
    int   *pDelayLine;
    /* pointer to easily compute the size of the structure */
    void  *pMemoryBlock;
} PERStateStruct;
/* State kept by the VAD state machine (SM). */
typedef struct _SMStruct
{
    /* --- configured durations, expressed in frames --- */
    int      cMinOnsetFrames;          /* minimum duration for speech onset */
    int      cMinEnergyHangFrames;     /* minimum energy hang duration at speech end */
    int      cMinPerHangFrames;        /* minimum periodicity hang duration at speech end */
    int      cUttBegAdjustFrames;      /* number of frames used to adjust the start frame number */

    /* --- running counters --- */
    int      cOnsetFrames;             /* consecutive onset frames detected */
    int      cEnergyHangFrames;        /* consecutive hang frames detected by the energy measure */
    int      cPerHangFrames;           /* consecutive hang frames detected by the periodicity measure */

    /* --- flags --- */
    int      uttHasStartedFlag;        /* utterance start has been detected */
    int      uttHasEndedFlag;          /* utterance end has been detected */
    int      energySpeechIsActiveFlag; /* speech activity detected by the energy measure */
    int      perSpeechIsActiveFlag;    /* speech activity detected by the periodicity measure */

    /* --- utterance boundaries, as frame numbers --- */
    int      uttBegFrameNum;           /* start of the utterance */
    int      uttEndFrameNum;           /* end of the utterance */
    int      prevUttEndFrameNum;       /* end of the previous utterance (including hang time) */

    VADState state;                    /* current state of the VAD state machine */
    void    *pMemoryBlock;             /* pointer to easily compute the size of the structure */
} SMStruct;
/* Top-level VAD state: pointers to the three sub-states plus the frame counter. */
typedef struct _VADStateStruct
{
    /* energy-based state structure */
    EStateStruct   *pEState;
    /* periodicity-based state structure */
    PERStateStruct *pPerState;
    /* state machine structure */
    SMStruct       *pSM;
    /* absolute frame number (starting from 0) of the number of frames read */
    int             frameNum;
    /* pointer to easily compute the size of the structure */
    void           *pMemoryBlock;
} VADStateStruct;
/*
 * Weighted average of two values using a Q15 weight.
 *
 *   result = ((val1 * wgt + 1) + (val2 * ((1 << 15) - wgt) + 1)) >> 15
 *
 * val1, val2 : values to blend (converted to LONG before multiplying)
 * wgt        : Q15 weight applied to val1; val2 gets (1 << 15) - wgt
 * result     : lvalue that receives the blended value, cast to int
 *
 * Every argument is parenthesised inside the expansion so that expression
 * arguments such as `a + b` are used as a whole. Note that `wgt` is
 * evaluated twice, so it must not have side effects.
 *
 * The expansion is a brace-enclosed block (not do/while(0)) so that existing
 * call sites written with or without a trailing semicolon keep compiling;
 * avoid using it as the unbraced body of an if that has an else.
 */
#define WEIGHTED_AVG_Q15(val1, val2, wgt, result) \
{ \
    LONG term1, term2; \
    term1 = (LONG) (val1) * (wgt) + 1; \
    term2 = (LONG) (val2) * ((1<<15) - (wgt)) + 1; \
    (result) = (int)((term1 + term2) >> 15); \
}
///////////////////////////////////////////////////////
/* parameters */
#define DC_RMV_COEF 32735 /* dc removal filter coefficient (32735/32768 ~= 0.999 in Q15) */
#define SAMPLING_FREQUENCY_HZ 8000 /* sampling frequency of the input data in Hz */
#define WINDOW_LEN_MSEC 30 /* length of a window/frame of speech in msec */
#define FRAME_SHIFT_MSEC 15 /* frame shift in msec for overlapping frames */
#define CONV_FACTOR_SEC_MSEC 1000 /* number of msec in one second */
/* conversion of lengths defined above from msec to samples */
/* The replacement lists are fully parenthesised so the macros behave as a   */
/* single value inside larger expressions (e.g. `x / WINDOW_LEN_SAMPS`).     */
#define WINDOW_LEN_SAMPS (WINDOW_LEN_MSEC * SAMPLING_FREQUENCY_HZ / CONV_FACTOR_SEC_MSEC)
#define FRAME_SHIFT_SAMPS (FRAME_SHIFT_MSEC * SAMPLING_FREQUENCY_HZ / CONV_FACTOR_SEC_MSEC)
#define Q15 15 /* N value of a Q15 (QM.N format) number */
/* Decision values reported by the VAD (see VAD_ProcessFrame / VAD_ProcessEndOfInput). */
typedef enum
{
    NODECISION    = -1,
    ACTIVE        = 0,
    INACTIVE      = 1,
    END_OF_STREAM = 2
} VADDecisionState;
/* Final result codes -- presumably the per-buffer silence/voice outcome; TODO confirm in the implementation. */
#define FINAL_SILENCE 0x00
#define FINAL_VOICE 0x01
/* Length in msec of the whole buffer handled at once; may be changed to match the recorder buffer. */
#define VAD_ALL_SAMPS_IN_MSEC 180
/* Same length in samples. Parenthesised so it acts as a single value inside larger expressions. */
#define VAD_ALL_SAMPS (VAD_ALL_SAMPS_IN_MSEC * SAMPLING_FREQUENCY_HZ / CONV_FACTOR_SEC_MSEC)
/*
 * CVAD: the class whose interface this header declares.
 *
 * Only declarations are visible here; the behaviour of each member function
 * is defined in the implementation file. Member functions are grouped by
 * their name prefix (E_, PER_, SM_, VAD_), which matches the energy,
 * periodicity, state-machine and top-level state structures declared above.
 */
class CVAD
{
public:
    /* ---- buffer-level entry points ---- */
    void VADInit();
    int  VADProcessFrame(short *pFrameBuffer, int bufferLenInSmps);
    int  VADProcessDecBuf(VADDecisionState *preDec, int bufLen, int *pDecBuffer);

    /* ---- energy measure (operate on EStateStruct) ---- */
    void E_ComputeEnergyDB(const short *pFrame, EStateStruct *pEState);
    void E_ComputeNoiseEnergyThreshDB(EStateStruct *pEState, int frameNum);
    void E_UpdateEnergyState(const short *pFrame, int frameNum, EStateStruct *pEState);
    void E_Init(EStateStruct *pEState, int cWinSamps, int frameShiftMsec);
    void E_GetStateSizeBytes(int *pNumStateBytes);

    /* ---- periodicity measure (operate on PERStateStruct) ---- */
    void PER_BandPassAndDownSample(const short *pSrc, int srcLen, PERStateStruct *pPerState);
    void PER_ComputePeriodicity(const short *pInFrame, int len, PERStateStruct *pPerState);
    void PER_SmoothPeriodicity(PERStateStruct *pPerState, int frameNum);
    void PER_UpdatePerState(const short *pFrame, int frameNum, int len, PERStateStruct *pPerState);
    void PER_Init(PERStateStruct *pPerState, int frameShiftMsec, int cWinSamps, int sampFreqHz);
    void PER_GetStateSizeBytes(int cWinSamps, int *pNumStateBytes);

    /* ---- state machine (operate on SMStruct / VADStateStruct) ---- */
    void SM_UpdateSMParams(VADStateStruct *pState);
    void SM_UpdateSMState(SMStruct *pState, int frameNum);
    void SM_UpdateState(VADStateStruct *pState);
    void SM_Init(SMStruct *pSMState, int frameShiftMsec);
    void SM_GetStateSizeBytes(int *pNumStateBytes);

    /* ---- top-level VAD state (operate on VADStateStruct) ---- */
    void VAD_GetStateSizeBytes(int cWinSamps, int *pNumStateBytes);
    void VAD_Init(VADStateStruct *pState, int frameShiftMsec, int cWinSamps, int sampFreqHz);
    void VAD_ProcessFrame(VADStateStruct *pState,
                          const short *pInFrame,
                          int len,
                          VADDecisionState *pDecisionState,
                          int *pDecisionFrame);
    void VAD_ProcessEndOfInput(VADStateStruct *pState,
                               VADDecisionState *pDecisionState,
                               int *pDecisionFrame);

    /* ---- construction / destruction ---- */
    CVAD();
    virtual ~CVAD();

public:
    /* data in buffer that will be processed */
    short *m_pReadBuffer;
    /* decision result buffer -- presumably one int per decision; TODO confirm in the implementation */
    int   *m_pDecResultBuf;
    /* length in samples -- presumably of all data in m_pReadBuffer; TODO confirm */
    int    m_nAllDateLenInSamps;
    /* first-segment flag -- exact meaning to be confirmed in the implementation */
    BOOL   m_bIsFirstSegment;

private:
    /* window/frame size in samples */
    int              cWinSamps;
    /* frame shift in samples for overlapping frames */
    int              cFrameShiftSamps;
    /* number of samples from the previous frame to be reused for the next frame */
    int              cReuseSamps;
    /* pointer to the internal VAD state structure */
    VADStateStruct  *pState;
    /* size in bytes of the VAD state */
    int              cStateBytes;
    /* decision state of VAD from previous frames */
    VADDecisionState prevDecisionState;
    /* the frame number of the endpoint determined by VAD */
    int              decisionFrameNum;
    /* previous input sample for the DC removal filter */
    short            prevInputSample;
    /* previous output sample for the DC removal filter */
    short            prevOutputSample;
};
#endif // !defined(AFX_VAD_H__7F330C4E_49D3_49D7_8296_35BA78D26446__INCLUDED_)