// Source: www.pudn.com > speechVAD.rar > VAD.h


     // VAD.h: interface for the CVAD class. 
// 
////////////////////////////////////////////////////////////////////// 
 
#if !defined(AFX_VAD_H__7F330C4E_49D3_49D7_8296_35BA78D26446__INCLUDED_) 
#define AFX_VAD_H__7F330C4E_49D3_49D7_8296_35BA78D26446__INCLUDED_ 
 
#if _MSC_VER > 1000 
#pragma once 
#endif // _MSC_VER > 1000 
#include  
#include  
 
//#ifndef _WIN32_WCE 
//#include  
//#include  
//#include  
//#else 
//#include "ippdefs.h" 
//#include "ippSP.h" 
//#include "ippSR.h" 
//#endif 
 
#include  
 
/* NOTE(review): presumably the number of biquad sections used by the band-pass
 * filter in the periodicity code -- not used in this header; confirm in VAD.cpp. */
#define NUMBIQUAD  2
 
 
//////////////////////////////////////// 
/* Constants used in Energy-related computations.
 * Q15 values are real numbers scaled by 2^15 (32768).
 * NOTE(review): the original comments listed the two adaptation values the
 * wrong way round (163/32768 ~= 0.005, 32/32768 ~= 0.001); the numeric notes
 * below are corrected. The speech/pause wording is kept as found -- confirm
 * it against the code that uses these constants. */
#define E_INIT_NOISE_ESTIMATE_MSEC 250   /* input length in msec used to estimate initial noise energy */
#define E_FAST_ADAPT_ALPHA_Q15     163   /* noise energy adaptation parameter during speech (~0.005 in Q15) */
#define E_SLOW_ADAPT_ALPHA_Q15     32    /* noise energy adaptation parameter during pause (~0.001 in Q15) */
#define E_THRESH_OFFSET_DB_Q15     98304 /* amount above noise energy to set threshold (3 dB in Q15: 3 * 32768) */
#define E_MIN_ENERGY_DB            0     /* minimum energy dB */
 
/* Constants used in Periodicity-related computations */
#define PER_INIT_PER_ESTIMATE_MSEC 250   /* input length in msec used to estimate initial periodicity */
#define PER_ADAPT_ALPHA_Q15        4915  /* periodicity adaptation parameter (0.15 in Q15) */
#define PER_MIN_PITCH_FREQ_HZ      100   /* minimum pitch frequency in Hz */
#define PER_MAX_PITCH_FREQ_HZ      200   /* maximum pitch frequency in Hz */
#define PER_SPEECH_THRESHOLD_Q15   10485 /* minimum periodicity threshold for speech (0.32 in Q15) */
#define PER_DOWNSAMPLE_FACTOR      4     /* factor used to downsample the band-pass filtered signal (an alternative value of 8 had been left commented out) */
#define PER_DOWNSAMPLE_PHASE       0     /* relative position of the input vector in the downsampled output */
 
/* Onset and hang times used by the State Machine (all in milliseconds) */
#define ONSET_THRESHOLD_MSEC       50    /* minimum speech-onset duration in msec */
#define ENERGY_HANG_THRESH_MSEC    300   /* minimum energy hang duration in msec */
#define PER_HANG_THRESH_MSEC       500   /* minimum periodicity hang duration in msec */
#define UTT_BEG_ADJUSTMENT_MSEC    225   /* adjustment applied to the utterance start frame number, in msec */
 
/* States of the VAD state machine (four in total). */
typedef enum
{
    SILENCE = 0,
    ONSET   = 1,
    SPEECH  = 2,
    HANG    = 3
} VADState;
 
/* Energy-related VAD state. */
typedef struct _EStateStruct
{
    int cInitNoiseEstFrames;  /* number of input frames used for the initial noise-floor estimate */
    int energyDB;             /* energy of the current frame, in dB */
    int noiseFloorDB;         /* estimated noise floor, in dB */
    int noiseThreshDB;        /* noise energy threshold, in dB (noiseFloorDB + noiseEstCorrectionDB) */
    int cFrameSamples;        /* samples per frame */
    int cScaleFactor;         /* right-shift count applied in the energy calculation */
} EStateStruct;
 
/* Periodicity-related VAD state. */
typedef struct _PERStateStruct
{
    int    cInitPerEstFrames;     /* number of input frames used to estimate the initial periodicity */
    int    minPeriodSamps;        /* shortest pitch period in samples (corresponds to PER_MAX_PITCH_FREQ_HZ) */
    int    maxPeriodSamps;        /* longest pitch period in samples (corresponds to PER_MIN_PITCH_FREQ_HZ) */
    int    period;                /* period of the current frame */
    short  periodicityQ15;        /* periodicity of the current frame, Q15 */
    short  dummy;                 /* padding so the struct size stays a multiple of the word size (4 bytes) */
    int    smoothPeriodicityQ15;  /* smoothed periodicity, Q15 */
    int    cDSFrameSamps;         /* sample count of the down-sampled frame */
    short* pBPFrame;              /* buffer holding the band-pass filtered input data */
    short* pDSFrame;              /* buffer holding the down-sampled data */
    int*   pDelayLine;            /* NOTE(review): undocumented in the original; presumably filter delay-line state -- confirm */
    void*  pMemoryBlock;          /* pointer to easily compute the size of the structure */
} PERStateStruct;
 
/* State of the VAD State Machine (SM). */
typedef struct _SMStruct
{
    /* configured minimum durations, in frames */
    int cMinOnsetFrames;           /* minimum speech-onset duration */
    int cMinEnergyHangFrames;      /* minimum energy hang duration at speech end */
    int cMinPerHangFrames;         /* minimum periodicity hang duration at speech end */
    int cUttBegAdjustFrames;       /* frames used to adjust the start frame number */

    /* running counters of consecutive frames */
    int cOnsetFrames;              /* consecutive onset frames detected */
    int cEnergyHangFrames;         /* consecutive hang frames detected by the energy measure */
    int cPerHangFrames;            /* consecutive hang frames detected by the periodicity measure */

    /* flags */
    int uttHasStartedFlag;         /* utterance start has been detected */
    int uttHasEndedFlag;           /* utterance end has been detected */
    int energySpeechIsActiveFlag;  /* speech activity detected by the energy measure */
    int perSpeechIsActiveFlag;     /* speech activity detected by the periodicity measure */

    /* frame numbers */
    int uttBegFrameNum;            /* start of utterance */
    int uttEndFrameNum;            /* end of utterance */
    int prevUttEndFrameNum;        /* end of the previous utterance (including hang time) */

    VADState state;                /* current state of the VAD state machine */
    void*    pMemoryBlock;         /* pointer to easily compute the size of the structure */
} SMStruct;
 
/* Top-level VAD state: ties together the energy, periodicity and
 * state-machine sub-states plus the running frame counter. */
typedef struct _VADStateStruct
{
    EStateStruct*   pEState;       /* energy-based state structure */
    PERStateStruct* pPerState;     /* periodicity-based state structure */
    SMStruct*       pSM;           /* state machine structure */
    int             frameNum;      /* absolute number (starting from 0) of frames read so far */
    void*           pMemoryBlock;  /* pointer to easily compute the size of the structure */
} VADStateStruct;
 
/*
 * Weighted average using a Q15 weight:
 *
 *     result = ((val1 * wgt + 1) + (val2 * (2^15 - wgt) + 1)) >> 15
 *
 *   val1, val2 - integer values to blend
 *   wgt        - Q15 weight given to val1 (val2 gets 2^15 - wgt)
 *   result     - int lvalue that receives the average
 *
 * Every argument is parenthesized in the expansion so that expression
 * arguments (e.g. "a + b" or "1 << 14") are cast and multiplied as a whole.
 * wgt is expanded twice, so do not pass an argument with side effects.
 *
 * The macro expands to a braced block, not a single statement; take care when
 * using it as the unbraced body of an if/else.
 *
 * NOTE(review): LONG is not declared in this header (presumably it comes from
 * the Windows headers). If LONG is 32 bits wide the products can overflow for
 * large Q15 operands -- confirm the value ranges at the call sites.
 */
#define WEIGHTED_AVG_Q15(val1, val2, wgt, result)   \
{                                                   \
  LONG term1, term2;                                \
  term1 = (LONG)(val1) * (wgt) + 1;                 \
  term2 = (LONG)(val2) * ((1 << 15) - (wgt)) + 1;   \
  (result) = (int)((term1 + term2) >> 15);          \
}
/////////////////////////////////////////////////////// 
 
 
/* parameters */
#define DC_RMV_COEF             32735                 /* dc removal filter coefficient (32735/32768 ~= 0.999 in Q15) */
#define SAMPLING_FREQUENCY_HZ   8000                  /* sampling frequency of the input data in Hz */
#define WINDOW_LEN_MSEC         30                    /* length of a window/frame of speech in msec */
#define FRAME_SHIFT_MSEC        15                    /* frame shift in msec for overlapping frames */
#define CONV_FACTOR_SEC_MSEC    1000                  /* milliseconds per second */

/* Conversion of the lengths defined above from msec to samples.
 * The expansions are fully parenthesized so the macros behave as single
 * values inside larger expressions (e.g. "n / WINDOW_LEN_SAMPS"). */
#define WINDOW_LEN_SAMPS        (WINDOW_LEN_MSEC * SAMPLING_FREQUENCY_HZ / CONV_FACTOR_SEC_MSEC)
#define FRAME_SHIFT_SAMPS       (FRAME_SHIFT_MSEC * SAMPLING_FREQUENCY_HZ / CONV_FACTOR_SEC_MSEC)

#define   Q15    15  /* N value of a Q15 (QM.N format) number */
 
/* Decision values reported by the VAD for processed frames. */
typedef enum
{
    NODECISION    = -1,
    ACTIVE        =  0,
    INACTIVE      =  1,
    END_OF_STREAM =  2
}
VADDecisionState;
 
/* NOTE(review): presumably the final silence/voice result codes produced by the
 * decision-buffer processing -- not used in this header; confirm in VAD.cpp. */
#define FINAL_SILENCE 0x00
#define FINAL_VOICE 0x01
 
/* Total input length in msec; can be changed to match the recorder buffer. */
#define VAD_ALL_SAMPS_IN_MSEC 180
/* The same length in samples. The expansion is parenthesized so the macro
 * behaves as a single value inside larger expressions (e.g. "n / VAD_ALL_SAMPS"). */
#define VAD_ALL_SAMPS    (VAD_ALL_SAMPS_IN_MSEC * SAMPLING_FREQUENCY_HZ / CONV_FACTOR_SEC_MSEC)
 
 
/*
 * CVAD: interface of the voice activity detector.
 *
 * Only declarations live here; the method bodies are in the implementation
 * file. The E_*, PER_* and SM_* methods take the energy, periodicity and
 * state-machine structures declared above; the VAD_* methods take the
 * combined VADStateStruct.
 */
class CVAD
{
public:
    CVAD();
    virtual ~CVAD();

    /* top-level entry points */
    void VADInit();
    int  VADProcessFrame(short * pFrameBuffer, int bufferLenInSmps);
    int  VADProcessDecBuf(VADDecisionState *preDec, int bufLen, int *pDecBuffer);

    /* energy-related routines */
    void E_ComputeEnergyDB(const short* pFrame, EStateStruct* pEState);
    void E_ComputeNoiseEnergyThreshDB(EStateStruct* pEState, int frameNum);
    void E_UpdateEnergyState(const short* pFrame, int frameNum, EStateStruct* pEState);
    void E_Init(EStateStruct* pEState, int cWinSamps, int frameShiftMsec);
    void E_GetStateSizeBytes(int* pNumStateBytes);

    /* periodicity-related routines */
    void PER_BandPassAndDownSample(const short* pSrc, int srcLen, PERStateStruct* pPerState);
    void PER_ComputePeriodicity(const short* pInFrame, int len, PERStateStruct* pPerState);
    void PER_SmoothPeriodicity(PERStateStruct* pPerState, int frameNum);
    void PER_UpdatePerState(const short* pFrame, int frameNum, int len, PERStateStruct* pPerState);
    void PER_Init(PERStateStruct* pPerState, int frameShiftMsec, int cWinSamps, int sampFreqHz);
    void PER_GetStateSizeBytes(int cWinSamps, int* pNumStateBytes);

    /* state-machine routines */
    void SM_UpdateSMParams(VADStateStruct* pState);
    void SM_UpdateSMState(SMStruct* pState, int frameNum);
    void SM_UpdateState(VADStateStruct* pState);
    void SM_Init(SMStruct* pSMState, int frameShiftMsec);
    void SM_GetStateSizeBytes(int* pNumStateBytes);

    /* routines working on the combined VAD state */
    void VAD_GetStateSizeBytes(int cWinSamps, int* pNumStateBytes);
    void VAD_Init(VADStateStruct* pState, int frameShiftMsec, int cWinSamps, int sampFreqHz);
    void VAD_ProcessFrame(VADStateStruct*   pState,
                          const short*      pInFrame,
                          int               len,
                          VADDecisionState* pDecisionState,
                          int*              pDecisionFrame);
    void VAD_ProcessEndOfInput(VADStateStruct*   pState,
                               VADDecisionState* pDecisionState,
                               int*              pDecisionFrame);

public:
    short* m_pReadBuffer;          /* buffered input data that will be processed */
    int*   m_pDecResultBuf;        /* NOTE(review): presumably the buffer of decision results -- confirm */
    int    m_nAllDateLenInSamps;   /* NOTE(review): "Date" looks like a typo for "Data"; name kept for compatibility */
    BOOL   m_bIsFirstSegment;      /* NOTE(review): presumably set while the first segment is being handled -- confirm */

private:
    int              cWinSamps;          /* window/frame size in samples */
    int              cFrameShiftSamps;   /* frame shift in samples for overlapping frames */
    int              cReuseSamps;        /* samples of the previous frame reused for the next frame */
    VADStateStruct*  pState;             /* internal VAD state structure */
    int              cStateBytes;        /* size of the VAD state in bytes */
    VADDecisionState prevDecisionState;  /* VAD decision state from previous frames */
    int              decisionFrameNum;   /* frame number of the endpoint determined by the VAD */
    short            prevInputSample;    /* previous input sample for the DC removal filter */
    short            prevOutputSample;   /* previous output sample for the DC removal filter */
};
 
#endif // !defined(AFX_VAD_H__7F330C4E_49D3_49D7_8296_35BA78D26446__INCLUDED_)