/* Source: www.pudn.com > ETSI_ES_202_212_software.rar > dsrAfeVad.c */
/*=============================================================================== * ETSI ES 202 212 Distributed Speech Recognition * Extended Advanced Front-End Feature Extraction Algorithm & Compression Algorithm * Speech Reconstruction Algorithm. * C-language software implementation * Version 1.1.1 October, 2003 *===============================================================================*/ /*------------------------------------------------------------------------------- * * FILE NAME: dsrAfeVad.c * PURPOSE: Implementation of Voice Activity Detector used for the DSR Extension. * *-------------------------------------------------------------------------------*/ #include#include #include #include #include #include "ParmInterface.h" #include "dsrAfeVad.h" /* ======================================================================== CONSTANTS ======================================================================= */ /* * Define TRUE and FALSE */ #ifndef FALSE #define FALSE 0 #endif #ifndef TRUE #define TRUE (!FALSE) #endif /* * Number of channels */ #define NUM_CHAN 23 /* * Minimum channel energy and Initial signal energy * (Corresponds roughly to a signal at -36 dB) */ #define MIN_CH_ENRG_8K 5000.0 #define MIN_CH_ENRG_11K 6400.0 #define MIN_CH_ENRG_16K 10000.0 #define INIT_SIG_ENRG_8K (1.00e+09) #define INIT_SIG_ENRG_11K (1.67e+09) #define INIT_SIG_ENRG_16K (3.00e+09) /* * Channel energy and channel noise energy smoothing factors */ #define CE_SM_FAC 0.55 #define CNE_SM_FAC 0.1 /* * Low and high values of "gamma" used in filtering * "long term log-spectral energy" */ #define LO_GAMMA 0.7 #define HI_GAMMA 0.9 /* * Low and high values of "beta" used in filtering "snr" */ #define LO_BETA 0.950 #define HI_BETA 0.998 /* * Number of initial frames, which are assumed to be non-speech */ #define INIT_FRAMES 10 /* * Sine start channel and peak-to-average threshold - Used in * "sine wave detection" */ #define SINE_START_CHAN_8K 4 #define SINE_START_CHAN_11K 3 #define 
SINE_START_CHAN_16K 3 #define PEAK_TO_AVE_THLD 10.0 /* * Deviation threshold, hysteresis count threshold, and forced * update count threshold - Used in "forced update" of channel * noise energy estimates */ #define DEV_THLD 70.0 #define HYSTER_CNT_THLD 9 #define F_UPDATE_CNT_THLD 500 /* * Non-speech threshold */ #define NON_SPEECH_THLD 32 /* ======================================================================== MACROS ======================================================================= */ #define min(a,b) ((a)<(b)?(a):(b)) #define max(a,b) ((a)>(b)?(a):(b)) #define square(a) ((a)*(a)) /* ======================================================================== INTERNAL FUNCTIONS ======================================================================= */ /*---------------------------------------------------------------------------- * FUNCTION NAME: get_vm * * PURPOSE: Measure speech quality ("voice metric") * * INPUT: * pfMFBOutArray[0:NUM_CHAN-1] - Array of Mel-Filter bank outputs * * OUTPUT * pfSnr - Current estimate of the SNR (filtered) * * RETURN VALUE * iVoiceMetric is returned * *---------------------------------------------------------------------------*/ static X_INT16 get_vm(X_FLOAT32 *pfMFBOutArray, X_FLOAT32 *pfSnr) { /* * The voice metric table is defined below. It is a non- * linear table that maps the SNR index (quantized SNR value) * to a number that is a measure of voice quality. 
*/ static X_INT16 piVMTable[90] = { 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 7, 8, 8, 9, 9, 10, 10, 11, 12, 12, 13, 13, 14, 15, 15, 16, 17, 17, 18, 19, 20, 20, 21, 22, 23, 24, 24, 25, 26, 27, 28, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50 }; /* * The different thresholds (as a function of SNR) are * defined below */ static X_INT16 piSigThld[20] = {36, 43, 52, 62, 73, 86, 101, 117, 134, 153, 173, 194, 217, 242, 268, 295, 295, 295, 295, 295}; static X_INT16 piUpdateThld[20] = {31, 32, 33, 34, 35, 36, 37, 37, 37, 37, 37, 37, 37, 37, 37, 38, 38, 38, 38, 38}; static X_INT32 iFrameCount = 0; static X_INT16 iFUpdateFlag = FALSE; static X_INT16 iHysterCount = 0; static X_INT16 iLastUpdateCount = 0; static X_INT16 iSigThld = 217; static X_INT16 iUpdateCount = 0; /* * The shape table is defined below. It is used to correct * the spectral shaping caused by the different channel * widths used in DSR-AFE standard. 
*/ static X_FLOAT32 pfShapeTable_8K[NUM_CHAN] = { 0.3333, 0.3333, 0.2857, 0.2857, 0.2857, 0.2500, 0.2500, 0.2222, 0.2000, 0.2000, 0.2000, 0.1818, 0.1667, 0.1538, 0.1429, 0.1429, 0.1333, 0.1176, 0.1111, 0.1111, 0.1000, 0.0909, 0.0870 }; static X_FLOAT32 pfChanEnrg[NUM_CHAN]; static X_FLOAT32 pfChanNoiseEnrg[NUM_CHAN]; static X_FLOAT32 pfLogSpecEnrgLong[NUM_CHAN]; static X_FLOAT32 fBeta = LO_BETA; static X_FLOAT32 fSnr; static X_FLOAT32 fNoiseEnrg; X_INT16 piChanSnr[NUM_CHAN]; X_INT16 i,j; X_INT16 iUpdateThld; X_INT16 iSineStartChan; X_INT16 iUpdateFlag; X_INT16 iQSnr; X_INT16 iVoiceMetric; X_FLOAT32 pfLogSpecEnrg[NUM_CHAN]; X_FLOAT32 *pfShapeTable; X_FLOAT32 fAve; X_FLOAT32 fPeak; X_FLOAT32 fPeak2Ave; X_FLOAT32 fAlpha; X_FLOAT32 fGamma; X_FLOAT32 fEnrg; X_FLOAT32 fLogSpecEnrgDev; X_FLOAT32 fMinChEnrg; X_FLOAT32 fInitSigEnrg; X_FLOAT32 fSigEnrgInst; X_FLOAT32 fSnrInst; X_FLOAT32 fTemp; pfShapeTable = pfShapeTable_8K; fMinChEnrg = MIN_CH_ENRG_8K; fInitSigEnrg = INIT_SIG_ENRG_8K; iSineStartChan = SINE_START_CHAN_8K; /* * Increment the frame counter */ iFrameCount++; if (iFrameCount > (INT_MAX-1)) { iFrameCount = INT_MAX-1; } /* * Estimate the energy in each channel */ for (i = 0; i < NUM_CHAN; i++) { fEnrg = pfMFBOutArray[i] * pfShapeTable[i]; fAlpha = (iFrameCount == 1)? 
1.0 : CE_SM_FAC; pfChanEnrg[i] = (1-fAlpha)*pfChanEnrg[i] + fAlpha*fEnrg; pfChanEnrg[i] = max(pfChanEnrg[i],fMinChEnrg); } /* * Calculate the Peak-to-Average ratio */ fPeak = 0.0; fAve = 0.0; for (i = 0; i < NUM_CHAN; i++) { if ((i >= iSineStartChan) && (pfChanEnrg[i] > fPeak)) { fPeak = pfChanEnrg[i]; } fAve += pfChanEnrg[i]; } fAve /= (X_FLOAT32)NUM_CHAN; fPeak2Ave = 10.0*log10(fPeak/fAve); /* * Estimate the channel noise energies from the first * INIT_FRAMES frames */ if ((iFrameCount <= INIT_FRAMES) || (iFUpdateFlag == TRUE)) { if (fPeak2Ave < PEAK_TO_AVE_THLD) { for (i = 0; i < NUM_CHAN; i++) { if (iFrameCount == 1) { pfChanNoiseEnrg[i] = pfChanEnrg[i]; } else { pfChanNoiseEnrg[i] = 0.7*pfChanNoiseEnrg[i] + 0.3*pfChanEnrg[i]; } } } else { for (i = 0; i < NUM_CHAN; i++) { pfChanNoiseEnrg[i] = fMinChEnrg; } } } /* * Compute the channel SNR indices */ for (i = 0; i < NUM_CHAN; i++) { fTemp = 10.0 * log10((double)(pfChanEnrg[i]/pfChanNoiseEnrg[i])); fTemp = max(fTemp,0.0); piChanSnr[i] = (fTemp+0.1875) / 0.375; } /* * Compute the Voice Metric */ iVoiceMetric = 0; for (i = 0; i < NUM_CHAN; i++) { j = min(piChanSnr[i],89); iVoiceMetric += piVMTable[j]; } /* * Estimate the log spectral energy deviation */ /* * First, compute the log-spectral energy */ for (i = 0; i < NUM_CHAN; i++) { pfLogSpecEnrg[i] = 10.0*log10(pfChanEnrg[i]); } if ((iFrameCount <= INIT_FRAMES) || (iFUpdateFlag == TRUE)) { for (i = 0; i < NUM_CHAN; i++) { pfLogSpecEnrgLong[i] = pfLogSpecEnrg[i]; } } /* * Next, compute the log-spectral energy deviation */ fLogSpecEnrgDev = 0.0; for (i = 0; i < NUM_CHAN; i++) { fLogSpecEnrgDev += fabs(pfLogSpecEnrgLong[i]-pfLogSpecEnrg[i]); } /* * Update the long term log-spectral energy */ fGamma = (iVoiceMetric > iSigThld)? 
HI_GAMMA : LO_GAMMA; for (i = 0; i < NUM_CHAN; i++) { pfLogSpecEnrgLong[i] = fGamma*pfLogSpecEnrgLong[i] + (1.0-fGamma)*pfLogSpecEnrg[i]; } /* * Estimate the SNR of the speech input */ /* * First, estimate the noise energy */ fNoiseEnrg = 0.0; for (i = 0; i < NUM_CHAN; i++) { fNoiseEnrg += pfChanNoiseEnrg[i]; } /* * Next, estimate the signal energy */ if ((iFrameCount <= INIT_FRAMES) || (iFUpdateFlag == TRUE)) { fSigEnrgInst = fInitSigEnrg; } else { if (iVoiceMetric > iSigThld) { fSigEnrgInst = 0.0; for (i = 0; i < NUM_CHAN; i++) { if (pfChanEnrg[i] > pfChanNoiseEnrg[i]) { fSigEnrgInst += pfChanEnrg[i]; } else { fSigEnrgInst += pfChanNoiseEnrg[i]; } } } else { fSigEnrgInst = fNoiseEnrg; } } /* * Compute the speech SNR */ fSnrInst = 10.0 * log10(fSigEnrgInst/fNoiseEnrg); fSnrInst = max(fSnrInst,0.0); if ((iFrameCount <= INIT_FRAMES) || (iFUpdateFlag == TRUE)) { fSnr = fSnrInst; } else { if (iVoiceMetric > iSigThld) { fSnr = fBeta*fSnr + (1.0-fBeta)*fSnrInst; fBeta = fBeta + 0.003; fBeta = min(fBeta,HI_BETA); } else { fBeta = fBeta - 0.003; fBeta = max(fBeta,LO_BETA); } } /* * Quantize the SNR and select the different thresholds * based on this value */ iQSnr = max(0,min((fSnr/1.5),19)); iSigThld = piSigThld[iQSnr]; iUpdateThld = piUpdateThld[iQSnr]; /* * Set or reset the update flag and the forced update flag */ iUpdateFlag = FALSE; iFUpdateFlag = FALSE; if ((iVoiceMetric < iUpdateThld) && (fPeak2Ave < PEAK_TO_AVE_THLD) && (iFrameCount > INIT_FRAMES)) { iUpdateFlag = TRUE; iUpdateCount = 0; } else if ((fPeak2Ave < PEAK_TO_AVE_THLD) && (fLogSpecEnrgDev < DEV_THLD)) { iUpdateCount++; if (iUpdateCount >= F_UPDATE_CNT_THLD) { iUpdateFlag = TRUE; iFUpdateFlag = TRUE; } } else { ; } if (iUpdateCount == iLastUpdateCount) { iHysterCount++; } else { iHysterCount = 0; } iLastUpdateCount = iUpdateCount; if (iHysterCount > HYSTER_CNT_THLD) { iUpdateCount = 0; } /* * Update the channel noise estimates */ if (iUpdateFlag == TRUE) { for (i = 0; i < NUM_CHAN; i++) { 
pfChanNoiseEnrg[i] = (1.0-CNE_SM_FAC)*pfChanNoiseEnrg[i] + CNE_SM_FAC*pfChanEnrg[i]; pfChanNoiseEnrg[i] = max(pfChanNoiseEnrg[i],fMinChEnrg); } } /* * Save the output values and return */ *pfSnr = fSnr; if (iFrameCount <= INIT_FRAMES) { iVoiceMetric = NON_SPEECH_THLD; } return(iVoiceMetric); } /* ======================================================================== EXTERNAL FUNCTIONS ======================================================================= */ /*---------------------------------------------------------------------------- * FUNCTION NAME: dsr_afe_vad * * PURPOSE: Detects the Voice Activity at the DSR Front-End * * INPUT: * pfMFBOutArray[0:NUM_CHAN-1] - Array of Mel-Filter bank outputs * * OUTPUT * piHangOverFlag - This flag is set if the current frame is * a hang-over frame * pfSnr - Current estimate of the SNR (filtered) * * RETURN VALUE * iVad - The VAD value is returned * *---------------------------------------------------------------------------*/ X_INT16 dsr_afe_vad(X_FLOAT32 *pfMFBOutArray, X_INT16 *piHangOverFlag, X_FLOAT32 *pfSnr) { static X_INT16 piBurstConst[20] = {2, 2, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6}; static X_INT16 piHangConst[20] = {54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26, 24, 22, 20, 18, 16}; static X_INT16 piVADThld[20] = {32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 55, 56, 57, 57, 58, 58, 58, 58}; static X_INT16 iBurstConst = 6; static X_INT16 iBurstCount = 0; static X_INT16 iHangConst = 28; static X_INT16 iHangCount = -1; static X_INT16 iVADThld = 56; X_INT16 iQSnr; X_INT16 iVad; X_INT16 iVoiceMetric; X_INT16 iVvad; /* * Get the "voice metric" for the frame */ iVoiceMetric = get_vm(pfMFBOutArray,pfSnr); /* * Estimate "voice activity" for the frame */ iVvad = (iVoiceMetric > iVADThld)? 
TRUE : FALSE; /* * Add Hangover */ if (iVvad == TRUE) { iBurstCount++; } else { iBurstCount = 0; } if (iBurstCount >= iBurstConst) { iHangCount = iHangConst; iBurstCount = iBurstConst; } /* * Make the "vad" decision for the frame */ iVad = FALSE; if ((iVvad == TRUE) || (iHangCount >= 0)) { iVad = TRUE; } *piHangOverFlag = FALSE; if ((iVvad == FALSE) && (iHangCount >= 0)) { *piHangOverFlag = TRUE; } if (iHangCount >= 0) { iHangCount--; } /* * Update the thresholds and return */ iQSnr = max(0,min((*pfSnr/1.5),19)); iVADThld = piVADThld[iQSnr]; iBurstConst = piBurstConst[iQSnr]; iHangConst = piHangConst[iQSnr]; return(iVad); }