/* www.pudn.com > ETSI_ES_202_212_software.rar > dsrAfeVad.c */


/*===============================================================================
 *      ETSI ES 202 212   Distributed Speech Recognition
 *      Extended Advanced Front-End Feature Extraction Algorithm & Compression Algorithm
 *      Speech Reconstruction Algorithm.
 *      C-language software implementation                                      
 *      Version 1.1.1   October, 2003                                            
 *===============================================================================*/
/*-------------------------------------------------------------------------------
 *
 * FILE NAME: dsrAfeVad.c
 * PURPOSE:   Implementation of Voice Activity Detector used for the DSR Extension.
 *
 *-------------------------------------------------------------------------------*/

#include <limits.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "ParmInterface.h"
#include "dsrAfeVad.h"



/* ========================================================================
                                  CONSTANTS
   ======================================================================= */

/*
* Define TRUE and FALSE
*/

#ifndef FALSE
#define FALSE 0
#endif

#ifndef TRUE
#define TRUE (!FALSE)
#endif


/* Number of channels in the Mel-filter bank output array */
#define NUM_CHAN 23


/*
 * Floor applied to every channel energy, and the signal energy assumed
 * while the estimator is (re)initialising.  One value per sampling rate.
 * (Corresponds roughly to a signal at -36 dB)
 */
#define MIN_CH_ENRG_8K  5000.0
#define MIN_CH_ENRG_11K 6400.0
#define MIN_CH_ENRG_16K 10000.0

#define INIT_SIG_ENRG_8K  (1.00e+09)
#define INIT_SIG_ENRG_11K (1.67e+09)
#define INIT_SIG_ENRG_16K (3.00e+09)


/* Smoothing factors: channel energy (CE) and channel noise energy (CNE) */
#define CE_SM_FAC  0.55
#define CNE_SM_FAC 0.1


/*
 * "gamma" - filter coefficient for the long term log-spectral energy.
 * The high value is used when the voice metric exceeds the signal
 * threshold, the low value otherwise.
 */
#define LO_GAMMA 0.7
#define HI_GAMMA 0.9


/* "beta" - lower and upper bounds of the SNR filter coefficient */
#define LO_BETA 0.950
#define HI_BETA 0.998


/* Leading frames that are treated as non-speech */
#define INIT_FRAMES 10


/*
 * Sine wave detection: first channel considered when searching for the
 * spectral peak, and the peak-to-average threshold.
 */
#define SINE_START_CHAN_8K  4
#define SINE_START_CHAN_11K 3
#define SINE_START_CHAN_16K 3
#define PEAK_TO_AVE_THLD    10.0


/*
 * Forced update of the channel noise energy estimates: spectral deviation
 * threshold, hysteresis count threshold and forced update count threshold.
 */
#define DEV_THLD          70.0
#define HYSTER_CNT_THLD   9
#define F_UPDATE_CNT_THLD 500


/* Voice metric value reported for frames assumed to be non-speech */
#define NON_SPEECH_THLD 32




/* ========================================================================
                                  MACROS
   ======================================================================= */



/*
 * Small arithmetic helpers.
 *
 * The names are very common, so any definition inherited from a previously
 * included header is dropped first; this file then always uses exactly the
 * definitions below and never triggers a conflicting-redefinition
 * diagnostic.
 *
 * NOTE: every argument may be evaluated more than once - do not pass
 * expressions with side effects (e.g. min(i++, n)).
 */
#undef min
#undef max
#undef square

#define			min(a,b)		((a)<(b)?(a):(b))
#define			max(a,b)		((a)>(b)?(a):(b))
#define			square(a)		((a)*(a))




/* ========================================================================
                                INTERNAL FUNCTIONS
   ======================================================================= */


/*----------------------------------------------------------------------------
 * FUNCTION NAME:  get_vm
 *
 * PURPOSE:       Measure speech quality ("voice metric") of one frame and
 *                update the internal channel energy, channel noise energy
 *                and SNR estimates.
 *
 * INPUT:
 *   pfMFBOutArray[0:NUM_CHAN-1] - Array of Mel-Filter bank outputs
 *
 * OUTPUT
 *   pfSnr - Current estimate of the SNR (filtered)
 *
 * RETURN VALUE
 *   iVoiceMetric is returned.  For the first INIT_FRAMES frames the
 *   returned value is forced to NON_SPEECH_THLD.
 *
 * NOTES
 *   - All estimator state lives in function-local static variables, so
 *     the function is not reentrant and the state cannot be reset.
 *   - Only the 8 kHz parameter set (shape table, minimum channel energy,
 *     initial signal energy, sine start channel) is selected here.
 *
 *---------------------------------------------------------------------------*/
static X_INT16 get_vm(X_FLOAT32 *pfMFBOutArray, X_FLOAT32 *pfSnr)
{

  /*
   * The voice metric table is defined below.  It is a non-
   * linear table that maps the SNR index (quantized SNR value)
   * to a number that is a measure of voice quality.
   */

  static const X_INT16 piVMTable[90] =
  {
    1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3,
    4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 7, 8, 8, 9, 9, 10,
    10, 11, 12, 12, 13, 13, 14, 15, 15, 16, 17, 17,
    18, 19, 20, 20, 21, 22, 23, 24, 24, 25, 26, 27,
    28, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 37,
    38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
    50, 50, 50, 50, 50, 50, 50, 50, 50, 50
  };


  /*
   * The different thresholds (as a function of the quantized SNR)
   * are defined below
   */

  static const X_INT16 piSigThld[20] =
  {
    36, 43, 52, 62, 73, 86, 101, 117, 134, 153,
    173, 194, 217, 242, 268, 295, 295, 295, 295, 295
  };

  static const X_INT16 piUpdateThld[20] =
  {
    31, 32, 33, 34, 35, 36, 37, 37, 37, 37,
    37, 37, 37, 37, 37, 38, 38, 38, 38, 38
  };


  /*
   * The shape table is defined below.  It is used to correct
   * the spectral shaping caused by the different channel
   * widths used in DSR-AFE standard.
   */

  static const X_FLOAT32 pfShapeTable_8K[NUM_CHAN] =
  {
    0.3333, 0.3333, 0.2857, 0.2857, 0.2857, 0.2500, 0.2500, 0.2222,
    0.2000, 0.2000, 0.2000, 0.1818, 0.1667, 0.1538, 0.1429, 0.1429,
    0.1333, 0.1176, 0.1111, 0.1111, 0.1000, 0.0909, 0.0870
  };


  /*
   * Persistent state (kept from frame to frame)
   */

  static X_INT32 iFrameCount = 0;
  static X_INT16 iFUpdateFlag = FALSE;
  static X_INT16 iHysterCount = 0;
  static X_INT16 iLastUpdateCount = 0;
  static X_INT16 iSigThld = 217;
  static X_INT16 iUpdateCount = 0;

  static X_FLOAT32 pfChanEnrg[NUM_CHAN];
  static X_FLOAT32 pfChanNoiseEnrg[NUM_CHAN];
  static X_FLOAT32 pfLogSpecEnrgLong[NUM_CHAN];

  static X_FLOAT32 fBeta = LO_BETA;

  static X_FLOAT32 fSnr;
  static X_FLOAT32 fNoiseEnrg;


  /*
   * Per-frame working variables
   */

  X_INT16 i;
  X_INT16 iSnrIdx;
  X_INT16 iInitOrForced;
  X_INT16 iUpdateThld;
  X_INT16 iSineStartChan;
  X_INT16 iUpdateFlag;
  X_INT16 iQSnr;
  X_INT16 iVoiceMetric;
  X_FLOAT32 pfLogSpecEnrg[NUM_CHAN];
  const X_FLOAT32 *pfShapeTable;

  X_FLOAT32 fAve;
  X_FLOAT32 fPeak;
  X_FLOAT32 fPeak2Ave;

  X_FLOAT32 fAlpha;
  X_FLOAT32 fGamma;
  X_FLOAT32 fEnrg;
  X_FLOAT32 fLogSpecEnrgDev;

  X_FLOAT32 fMinChEnrg;
  X_FLOAT32 fInitSigEnrg;

  X_FLOAT32 fSigEnrgInst;
  X_FLOAT32 fSnrInst;
  X_FLOAT32 fTemp;

  double dSnrIdx;


  /*
   * Select the 8 kHz parameter set
   */

  pfShapeTable = pfShapeTable_8K;
  fMinChEnrg = MIN_CH_ENRG_8K;
  fInitSigEnrg = INIT_SIG_ENRG_8K;
  iSineStartChan = SINE_START_CHAN_8K;


  /*
   * Increment the frame counter (saturating just below INT_MAX)
   */

  iFrameCount++;

  if (iFrameCount > (INT_MAX-1))
    {
      iFrameCount = INT_MAX-1;
    }


  /*
   * The estimators are (re)initialised during the first INIT_FRAMES
   * frames and on the frame following a forced update.  Neither
   * iFrameCount nor iFUpdateFlag changes before the flags are
   * recomputed further down, so the condition is evaluated once.
   */

  iInitOrForced =
    ((iFrameCount <= INIT_FRAMES) || (iFUpdateFlag == TRUE))? TRUE : FALSE;


  /*
   * Estimate the energy in each channel.  On the very first frame
   * the smoothing factor is 1, i.e. the estimate is simply loaded
   * with the current (shape corrected) filter bank output.
   */

  fAlpha = (iFrameCount == 1)? 1.0 : CE_SM_FAC;

  for (i = 0; i < NUM_CHAN; i++)
    {
      fEnrg = pfMFBOutArray[i] * pfShapeTable[i];
      pfChanEnrg[i] = (1-fAlpha)*pfChanEnrg[i] + fAlpha*fEnrg;
      pfChanEnrg[i] = max(pfChanEnrg[i],fMinChEnrg);
    }


  /*
   * Calculate the Peak-to-Average ratio.  The peak is searched from
   * iSineStartChan upwards, the average is taken over all channels.
   */

  fPeak = 0.0;
  fAve = 0.0;

  for (i = 0; i < NUM_CHAN; i++)
    {
      if ((i >= iSineStartChan) && (pfChanEnrg[i] > fPeak))
	{
	  fPeak = pfChanEnrg[i];
	}

      fAve += pfChanEnrg[i];
    }

  fAve /= (X_FLOAT32)NUM_CHAN;

  fPeak2Ave = 10.0*log10(fPeak/fAve);


  /*
   * Estimate the channel noise energies from the first
   * INIT_FRAMES frames (or again after a forced update).  A frame
   * whose peak-to-average ratio reaches the threshold is not used
   * as a noise sample; the noise estimate is set to the floor instead.
   */

  if (iInitOrForced == TRUE)
    {
      if (fPeak2Ave < PEAK_TO_AVE_THLD)
	{
	  for (i = 0; i < NUM_CHAN; i++)
	    {
	      if (iFrameCount == 1)
		{
		  pfChanNoiseEnrg[i] = pfChanEnrg[i];
		}
	      else
		{
		  pfChanNoiseEnrg[i] = 0.7*pfChanNoiseEnrg[i] + 0.3*pfChanEnrg[i];
		}
	    }
	}
      else
	{
	  for (i = 0; i < NUM_CHAN; i++)
	    {
	      pfChanNoiseEnrg[i] = fMinChEnrg;
	    }
	}
    }


  /*
   * Compute the channel SNR indices (0.375 dB steps, rounded) and
   * accumulate the Voice Metric.
   *
   * The index is limited to the last table entry (89) while it is
   * still a floating point value, so the conversion to an integer
   * is always in range.  For every value that fits an X_INT16 this
   * yields the same index as converting first and limiting afterwards.
   */

  iVoiceMetric = 0;
  for (i = 0; i < NUM_CHAN; i++)
    {
      fTemp = 10.0 * log10((double)(pfChanEnrg[i]/pfChanNoiseEnrg[i]));
      fTemp = max(fTemp,0.0);

      dSnrIdx = (fTemp+0.1875) / 0.375;
      iSnrIdx = (dSnrIdx >= 89.0)? 89 : (X_INT16)dSnrIdx;

      iVoiceMetric += piVMTable[iSnrIdx];
    }


  /*
   * Estimate the log spectral energy deviation
   */

  /*
   * First, compute the log-spectral energy
   */

  for (i = 0; i < NUM_CHAN; i++)
    {
      pfLogSpecEnrg[i] = 10.0*log10(pfChanEnrg[i]);
    }

  if (iInitOrForced == TRUE)
    {
      for (i = 0; i < NUM_CHAN; i++)
	{
	  pfLogSpecEnrgLong[i] = pfLogSpecEnrg[i];
	}
    }


  /*
   * Next, compute the log-spectral energy deviation
   */

  fLogSpecEnrgDev = 0.0;
  for (i = 0; i < NUM_CHAN; i++)
    {
      fLogSpecEnrgDev += fabs(pfLogSpecEnrgLong[i]-pfLogSpecEnrg[i]);
    }


  /*
   * Update the long term log-spectral energy.  iSigThld still holds
   * the value selected at the end of the previous frame.
   */

  fGamma = (iVoiceMetric > iSigThld)? HI_GAMMA : LO_GAMMA;

  for (i = 0; i < NUM_CHAN; i++)
    {
      pfLogSpecEnrgLong[i] = fGamma*pfLogSpecEnrgLong[i] +
	(1.0-fGamma)*pfLogSpecEnrg[i];
    }


  /*
   * Estimate the SNR of the speech input
   */

  /*
   * First, estimate the noise energy
   */

  fNoiseEnrg = 0.0;
  for (i = 0; i < NUM_CHAN; i++)
    {
      fNoiseEnrg += pfChanNoiseEnrg[i];
    }


  /*
   * Next, estimate the signal energy.  In each channel the larger
   * of the channel energy and the channel noise energy is counted.
   */

  if (iInitOrForced == TRUE)
    {
      fSigEnrgInst = fInitSigEnrg;
    }
  else if (iVoiceMetric > iSigThld)
    {
      fSigEnrgInst = 0.0;
      for (i = 0; i < NUM_CHAN; i++)
	{
	  fSigEnrgInst += max(pfChanEnrg[i],pfChanNoiseEnrg[i]);
	}
    }
  else
    {
      fSigEnrgInst = fNoiseEnrg;
    }


  /*
   * Compute the speech SNR.  The filtered SNR is only updated on
   * frames whose voice metric exceeds the signal threshold; fBeta
   * is stepped towards HI_BETA on such frames and back towards
   * LO_BETA on all others.
   */

  fSnrInst = 10.0 * log10(fSigEnrgInst/fNoiseEnrg);
  fSnrInst = max(fSnrInst,0.0);

  if (iInitOrForced == TRUE)
    {
      fSnr = fSnrInst;
    }
  else if (iVoiceMetric > iSigThld)
    {
      fSnr = fBeta*fSnr + (1.0-fBeta)*fSnrInst;
      fBeta = fBeta + 0.003;
      fBeta = min(fBeta,HI_BETA);
    }
  else
    {
      fBeta = fBeta - 0.003;
      fBeta = max(fBeta,LO_BETA);
    }


  /*
   * Quantize the SNR (1.5 dB steps, limited to 0..19) and select
   * the different thresholds based on this value
   */

  iQSnr = (X_INT16)max(0,min((fSnr/1.5),19));

  iSigThld = piSigThld[iQSnr];
  iUpdateThld = piUpdateThld[iQSnr];


  /*
   * Set or reset the update flag and the forced update flag
   */

  iUpdateFlag = FALSE;
  iFUpdateFlag = FALSE;

  if ((iVoiceMetric < iUpdateThld) && (fPeak2Ave < PEAK_TO_AVE_THLD) &&
      (iFrameCount > INIT_FRAMES))
    {
      iUpdateFlag = TRUE;
      iUpdateCount = 0;
    }
  else if ((fPeak2Ave < PEAK_TO_AVE_THLD) && (fLogSpecEnrgDev < DEV_THLD))
    {
      iUpdateCount++;
      if (iUpdateCount >= F_UPDATE_CNT_THLD)
	{
	  iUpdateFlag = TRUE;
	  iFUpdateFlag = TRUE;
	}
    }


  /*
   * Hysteresis: count the consecutive frames on which the forced
   * update counter did not change.
   *
   * The counter saturates one above HYSTER_CNT_THLD.  Only the
   * comparison against HYSTER_CNT_THLD below reads it, so saturation
   * does not alter any decision, but it keeps the counter from
   * running past the range of X_INT16 (presumably a 16-bit type)
   * during very long runs of such frames, where it would turn
   * negative and suppress the reset below.
   */

  if (iUpdateCount == iLastUpdateCount)
    {
      if (iHysterCount <= HYSTER_CNT_THLD)
	{
	  iHysterCount++;
	}
    }
  else
    {
      iHysterCount = 0;
    }
  iLastUpdateCount = iUpdateCount;

  if (iHysterCount > HYSTER_CNT_THLD)
    {
      iUpdateCount = 0;
    }


  /*
   * Update the channel noise estimates
   */

  if (iUpdateFlag == TRUE)
    {
      for (i = 0; i < NUM_CHAN; i++)
	{
	  pfChanNoiseEnrg[i] = (1.0-CNE_SM_FAC)*pfChanNoiseEnrg[i] +
	    CNE_SM_FAC*pfChanEnrg[i];
	  pfChanNoiseEnrg[i] = max(pfChanNoiseEnrg[i],fMinChEnrg);
	}
    }


  /*
   * Save the output values and return
   */

  *pfSnr = fSnr;

  if (iFrameCount <= INIT_FRAMES)
    {
      iVoiceMetric = NON_SPEECH_THLD;
    }

  return(iVoiceMetric);

}





/* ========================================================================
                                EXTERNAL FUNCTIONS
   ======================================================================= */


/*----------------------------------------------------------------------------
 * FUNCTION NAME:  dsr_afe_vad
 *
 * PURPOSE:       Frame-by-frame Voice Activity decision at the DSR
 *                Front-End: thresholds the voice metric of the frame and
 *                extends speech segments with a hang-over period.
 *
 * INPUT:
 *   pfMFBOutArray[0:NUM_CHAN-1] - Array of Mel-Filter bank outputs
 *
 * OUTPUT
 *   piHangOverFlag - Set to TRUE if the current frame is a hang-over
 *                    frame (below the threshold, but hang-over still
 *                    running), FALSE otherwise
 *   pfSnr - Current estimate of the SNR (filtered), as written by get_vm
 *
 * RETURN VALUE
 *   TRUE if the frame is declared active (above the threshold or within
 *   the hang-over period), FALSE otherwise
 *
 * NOTES
 *   Burst/hang-over counters and the SNR dependent parameters are kept
 *   in static variables between calls.
 *
 *---------------------------------------------------------------------------*/
X_INT16 dsr_afe_vad(X_FLOAT32 *pfMFBOutArray,
	       X_INT16 *piHangOverFlag, X_FLOAT32 *pfSnr)
{

  /*
   * Parameter tables, indexed by the quantized SNR (0..19)
   */

  static X_INT16 piBurstBySnr[20] =
  {
    2, 2, 3, 3, 4, 4, 4, 4, 5, 5,
    5, 5, 5, 5, 6, 6, 6, 6, 6, 6
  };

  static X_INT16 piHangBySnr[20] =
  {
    54, 52, 50, 48, 46, 44, 42, 40, 38, 36,
    34, 32, 30, 28, 26, 24, 22, 20, 18, 16
  };

  static X_INT16 piThldBySnr[20] =
  {
    32, 34, 36, 38, 40, 42, 44, 46, 48, 50,
    52, 54, 55, 56, 57, 57, 58, 58, 58, 58
  };


  /*
   * State carried over from the previous frame
   */

  static X_INT16 iBurstLimit = 6;    /* burst length that arms the hang-over */
  static X_INT16 iBurstRun = 0;      /* consecutive frames above threshold   */
  static X_INT16 iHangLimit = 28;    /* value loaded into the hang-over count */
  static X_INT16 iHangLeft = -1;     /* hang-over count, < 0 when inactive   */
  static X_INT16 iMetricThld = 56;   /* voice metric decision threshold      */


  X_INT16 iMetric;
  X_INT16 iFrameIsVoice;
  X_INT16 iHangActive;
  X_INT16 iDecision;
  X_INT16 iSnrIdx;
  double dScaledSnr;


  /*
   * Voice metric of this frame, compared against the threshold that
   * was selected at the end of the previous frame
   */

  iMetric = get_vm(pfMFBOutArray,pfSnr);
  iFrameIsVoice = (iMetric > iMetricThld)? TRUE : FALSE;


  /*
   * Burst counting: a long enough run of frames above the threshold
   * (re)loads the hang-over counter; the run length is held at the
   * limit.
   */

  iBurstRun = (iFrameIsVoice == TRUE)? (iBurstRun + 1) : 0;

  if (iBurstRun >= iBurstLimit)
    {
      iHangLeft = iHangLimit;
      iBurstRun = iBurstLimit;
    }


  /*
   * Decision and hang-over flag, then consume one hang-over frame
   */

  iHangActive = (iHangLeft >= 0)? TRUE : FALSE;

  iDecision =
    ((iFrameIsVoice == TRUE) || (iHangActive == TRUE))? TRUE : FALSE;

  *piHangOverFlag =
    ((iFrameIsVoice == FALSE) && (iHangActive == TRUE))? TRUE : FALSE;

  if (iHangActive == TRUE)
    {
      iHangLeft--;
    }


  /*
   * Quantize the SNR in 1.5 dB steps, limit the index to 0..19 and
   * pick the parameters that the next frame will use
   */

  dScaledSnr = *pfSnr/1.5;

  if (!(dScaledSnr < 19))
    {
      dScaledSnr = 19;
    }

  if (0 > dScaledSnr)
    {
      iSnrIdx = 0;
    }
  else
    {
      iSnrIdx = (X_INT16)dScaledSnr;
    }

  iMetricThld = piThldBySnr[iSnrIdx];
  iBurstLimit = piBurstBySnr[iSnrIdx];
  iHangLimit = piHangBySnr[iSnrIdx];

  return(iDecision);

}