/* www.pudn.com > lpc.zip > voice_i.c */


/***************************************************************************   
*
*	VOICIN Version 52
*
***************************************************************************
*
*        Voicing Detection (VOICIN) makes voicing decisions for each half
*   frame of input speech.  Tentative voicing decisions are made two frames
*   in the future (2F) for each half frame.  These decisions are carried
*   through one frame in the future (1F) to the present (P) frame where
*   they are examined and smoothed, resulting in the final voicing
*   decisions for each half frame. 
*        The voicing parameter (signal measurement) column vector (VALUE)
*   is based on a rectangular window of speech samples determined by the
*   window placement algorithm.  The voicing parameter vector contains the
*   AMDF windowed maximum-to-minimum ratio, the zero crossing rate, energy
*   measures, reflection coefficients, and prediction gains.  The voicing
*   window is placed to avoid contamination of the voicing parameter vector
*   with speech onsets. 
*        The input signal is then classified as unvoiced (including
*   silence) or voiced.  This decision is made by a linear discriminant
*   function consisting of a dot product of the voicing decision
*   coefficient (VDC) row vector with the measurement column vector
*   (VALUE).  The VDC vector is 2-dimensional; each row vector is optimized
*   for a particular signal-to-noise ratio (SNR).  So, before the dot
*   product is performed, the SNR is estimated to select the appropriate
*   VDC vector. 
*        The smoothing algorithm is a modified median smoother.  The
*   voicing discriminant function is used by the smoother to determine how
*   strongly voiced or unvoiced a signal is.  The smoothing is further
*   modified if a speech onset and a voicing decision transition occur
*   within one half frame.  In this case, the voicing decision transition
*   is extended to the speech onset.  For transmission purposes, there are
*   constraints on the duration and transition of voicing decisions.  The
*   smoother takes these constraints into account. 
*        Finally, the energy estimates are updated along with the dither
*   threshold used to calculate the zero crossing rate (ZC).
*
*  Inputs:
*   VWIN      - Voicing window limits
*   INBUF     - Input speech buffer
*   LPBUF     - Low-pass filtered speech buffer
*   BUFLIM    - INBUF and LPBUF limits
*   HALF      - Present analysis half frame number
*   MINAMD    - Minimum value of the AMDF
*   MAXAMD    - Maximum value of the AMDF
*   MINTAU    - Pointer to the lag of the minimum AMDF value
*   IVRC(2)   - Inverse filter's RC's
*   OBOUND    - Onset boundary descriptions
*   AF        - The analysis frame number
*  Output:
*   VOIBUF(2,0:AF) - Buffer of voicing decisions
*  Internal:
*   QS        - Ratio of preemphasized to full-band energies
*   RC1       - First reflection coefficient
*   AR_B      - Product of the causal forward and reverse pitch prediction
*               gains
*   AR_F      - Product of the noncausal forward and rev. pitch prediction
*               gains
*   ZC        - Zero crossing rate
*   DITHER    - Zero crossing threshold level
*   MAXMIN    - AMDF's 1 octave windowed maximum-to-minimum ratio
*   MINPTR    - Location  of minimum AMDF value
*   NVDC      - Number of elements in each VDC vector
*   NVDCL     - Number of VDC vectors
*   VDCL      - SNR values corresponding to the set of VDC's
*   VDC       - 2-D voicing decision coefficient vector
*   VALUE(9)  - Voicing Parameters
*   VOICE(2,3)- History of LDA results
*   LBE       - Ratio of low-band instantaneous to average energies
*   FBE       - Ratio of full-band instantaneous to average energies
*   LBVE      - Low band voiced energy
*   LBUE      - Low band unvoiced energy
*   FBVE      - Full band voiced energy
*   FBUE      - Full band unvoiced energy
*   OFBUE     - Previous full-band unvoiced energy
*   OLBUE     - Previous low-band unvoiced energy
*   REF       - Reference energy for initialization and DITHER threshold
*   SNR       - Estimate of signal-to-noise ratio
*   SNR2      - Estimate of low-band signal-to-noise ratio
*   SNRL      - SNR level number
*   OT        - Onset transition present
*   VSTATE    - Decimal interpretation of binary voicing classifications
*   FIRST     - First call flag
*/

#include "ourstuff.h"
#include <stdio.h>
#include "vcomm_i.ch"
#include "contrl.ch"
#include "lpcdefs.h"
#include <math.h>

/********sw*******/
extern float min1, min2, min3;
extern float max1, max2, max3;
/*****************/

/*
 * voicin_i - integer voicing decision for one half frame.
 *
 * Per call the routine:
 *   1. on the first call only, seeds the energy estimates, snr, the
 *      vdcl_i thresholds and the voice[][] history;
 *   2. when half == 1, shifts the voice[][] history one frame toward the
 *      past and recomputes maxmin from maxamd and minamd;
 *   3. calls vparms_i for the half-frame voicing parameters;
 *   4. updates the running snr estimate and derives snr2 from it;
 *   5. hard-limits voice[half-1][2] into voibuf[half-1][3]
 *      (1 when the value is > 0, otherwise 0);
 *   6. when half != 1, applies the smoothing rules, which may rewrite
 *      voibuf[0..1][1] and voibuf[0..1][2];
 *   7. updates the unvoiced (lbue, fbue) or voiced (lbve, fbve) energy
 *      estimates depending on voibuf[half-1][3], then recomputes dither.
 *
 * Parameters:
 *   vwin, inbuf, lpbuf, mintau - forwarded unchanged to vparms_i.
 *   half    - half-frame number; used as index half-1, with 1 selecting
 *             the history shift and any other value enabling smoothing.
 *   minamd, maxamd - used only to form maxmin when half == 1.
 *   ivrc    - referenced only as ivrc[2] in the discriminant section
 *             (see the second review note below).
 *   obound  - elements [1], [2] and [3] are read to form the onset
 *             transition flag ot.
 *   voibuf  - voicing decisions; read and written as listed above.
 *
 * No return type is declared and no value is returned.  vstate, dither,
 * snr, maxmin, voice, the energy estimates and first are static, so
 * they persist between calls.
 *
 * NOTE(review): this text appears to have lost characters where a
 * less-than sign was followed later by a greater-than sign; confirm
 * against a pristine copy of the file:
 *   - the for statement that quantizes SNR to SNRL is cut off after
 *     the second snrl and runs straight into the tail of a debug
 *     printf; the comment above it says the loop covers
 *     SNRL = 1 .. NVDCL-1.
 *   - the comment opened just before the max1 check is not closed
 *     until after the debug printf that reports the discriminant, so
 *     every statement adding to voice[half-1][2] is lexically inside
 *     that comment, and the if statement ending in 10 looks truncated.
 *     Only value[0] and value[1] are assigned in the surviving text.
 */
voicin_i( vwin, inbuf, lpbuf, half, minamd, maxamd, mintau, ivrc,
	 obound, voibuf)
int_type vwin[2][AF], half, mintau;
int_type minamd, maxamd, ivrc[2];
int_type *inbuf;
int_type *lpbuf;
int_type *obound, voibuf[2][AF+1];
{
/* zc, lbe, fbe, qs, rc1, ar_b and ar_f are handed to vparms_i by address. */
int_type zc, lbe, fbe;
int_type i, snrl;
static int_type vstate=0;
static int_type dither=20;
static int_type snr;
int_type snr2;
static int_type maxmin;
int_type qs, rc1, ar_b;
int_type ar_f;
/* voice[half-1][frame]: discriminant history, shifted when half == 1. */
static int_type voice[2][3];
int_type value[9];
short ot=0;

/***** added for testing of vparms_i ****/
/* None of the locals in this test section is referenced in the body below. */
int vlen, stop, start, sw, sw2;
int_type dither_i;
int_type inbuf_i[1000];
int_type lpbuf_i[1000];
int_type vwin_i[2][AF];

int_type zc_i, lbe_i, fbe_i,  qs_i, rc1_i, ar_b_i, ar_f_i;
/****************************************/

/*   Declare and initialize filters:	*/

static int_type lbve, lbue, fbve, fbue, ofbue, olbue;
/* sfbue has no initializer; as a static object it starts at zero. */
static int_type sfbue, slbue=0;
int_type ref= 3000;
static short first=1;


/* One-time setup on the first call; first is cleared so it never repeats. */
if (first) {
	lbve = ref;
	fbve = ref;
	fbue = ref>>4;
	ofbue = ref>>4;
	lbue = ref>>5;
	olbue = ref>>5;
	snr = 128; /* "quantized" 64*(fbve/fbue) */
	first = 0;
	vdcl_i[0] = 600;
	vdcl_i[1] = 450;
	vdcl_i[2] = 300;
	vdcl_i[3] = 200;
	vdcl_i[4] = 6*0;	/* the product evaluates to 0 */
	
	for(i=0;i<3;i++)	{
		voice[1][i] = 0;
		voice[0][i] = 0;
	}
	
}

/*   The VOICE array contains the result of the linear discriminant function 
 *   (analog values).  The VOIBUF array contains the hard-limited binary 
 *   voicing decisions.  The VOICE and VOIBUF arrays, according to FORTRAN 
 *   memory allocation, are addressed as:
 *
 *	   (half-frame number, future-frame number)
 *
 *	   |   Past    |  Present  |  Future1  |  Future2  |
 *	   | 1,0 | 2,0 | 1,1 | 2,1 | 1,2 | 2,2 | 1,3 | 2,3 |  --->  time
 *
 *   Update linear discriminant function history each frame:		*/

if (half == 1) {
	/* Shift both half-frame rows one column toward the past. */
	voice[0][0]=voice[0][1];
	voice[1][0]=voice[1][1];
	voice[0][1]=voice[0][2];
	voice[1][1]=voice[1][2];
	/* maxamd/minamd scaled by 1024; when minamd <= 1 the division is
	 * skipped and maxamd is stored without the shift. */
	maxmin = (minamd>1)?(maxamd<<10)/minamd:maxamd;
}

/*   Calculate voicing parameters twice per frame:	*/

/* vparms_i is defined elsewhere; it presumably fills in the variables
 * whose addresses are passed here (dither is also passed by address). */
vparms_i( vwin, inbuf, lpbuf, half, &dither, mintau, &zc, &lbe, &fbe, &qs,
	 &rc1, &ar_b, &ar_f );
/*
printf("(i) zc=%d, lbe=%d, fbe=%d, qs=%f, rc1=%f, ar_b=%f, ar_f=%f\n",
       zc, lbe, fbe, qs/16384., rc1/16384.,
       ar_b/16384., ar_f/16384.);
*/

/*   Estimate signal-to-noise ratio to select the appropriate VDC vector.
 *   The SNR is estimated as the running average of the ratio of the
 *   running average full-band voiced energy to the running average
 *   full-band unvoiced energy. SNR filter has gain of 63.	*/
/*
printf("(i) > snr = %d fbve = %d fbue = %d lbue = %d\n",
       snr,fbve,fbue,lbue);
*/
/* New snr = 63/64 of the old value plus (63*fbve/fbue) >> 9; the
 * mmax(..., 1) calls keep both divisors at least 1 (mmax is presumably
 * a max macro from a project header). */
snr = (((long_type)snr*(long_type)63) >> 6) + (((fbve*63)/mmax(fbue,1)) >> 9);
snr2 = (snr*(fbue >> 3)/mmax(lbue>>3,1));
/*
printf("(i) snr = %f, snr2 = %f\n",snr*8., snr2*8.);
*/

/*   Quantize SNR to SNRL according to VDCL thresholds.*/

/*DO SNRL = 1, NVDCL-1 */
for (snrl=1;snrl %f, for snrl = %d?\n",
	       snr2*8.,vdcl_i[snrl-1]*1.,snrl);
	       */
	if (snr2 > (vdcl_i[snrl-1]>>3)) break;
}
/*
printf("(i) snrl = %d\n",snrl);
*/
/*   	(Note:  SNRL = NVDCL Here)	*/

/*   Linear discriminant voicing parameters:	*/

value[0] = maxmin;
value[1] = (lbve>1)?(lbe<<10)/lbve:(lbe<<10);
/*
if(value[1]>max1)
  max1=value[1];
if(value[1]> 10);
if(0)
  printf("(%d) %d\n",0,voice[half-1][2]);
voice[half-1][2] += (((long_type)vdc_i[1][snrl-1]*(long_type)value[1]) >> 10);
if(0)
  printf("(%d) %d\n",1,voice[half-1][2]);
voice[half-1][2] += (vdc_i[2][snrl-1]*zc);
if(0)
  printf("(%d) %d\n",2,voice[half-1][2]);
voice[half-1][2] += (((long_type)vdc_i[3][snrl-1]*(long_type)rc1) >> 14);
if(0)
  printf("(%d) %d\n",3,voice[half-1][2]);
voice[half-1][2] += (((long_type)vdc_i[4][snrl-1]*(long_type)qs) >> 14);
if(0)
  printf("(%d) %d\n",4,voice[half-1][2]);
voice[half-1][2] += (((long_type)vdc_i[5][snrl-1]*(long_type)ivrc[2]) >> 14);
if(0)
  printf("(%d) %d\n",5,voice[half-1][2]);
voice[half-1][2] += (((long_type)vdc_i[6][snrl-1]*(long_type)ar_b) >> 14);
if(0)
  printf("(%d) %d\n",6,voice[half-1][2]);
voice[half-1][2] += (((long_type)vdc_i[7][snrl-1]*(long_type)ar_f) >> 14);
if(0)
  printf("(%d) %d\n",7,voice[half-1][2]);
/*
printf("(i) voiced if > 0 : %d\n",voice[half-1][2]);
*/
/*   Classify as voiced if discriminant > 0, otherwise unvoiced
 *   Voicing decision for current half-frame:  1 = Voiced; 0 = Unvoiced	*/

if (voice[half-1][2] > 0.0) 
	voibuf[half-1][3]=1;
else
	voibuf[half-1][3]=0;

/*   Skip voicing decision smoothing in first half-frame:	*/

if (half != 1) {
/*   Voicing decision smoothing rules (override of linear combination):
 *
 *	Unvoiced half-frames:  At least two in a row.
 *	--------------------
 *
 *	Voiced half-frames:    At least two in a row in one frame.
 *	-------------------    Otherwise at least three in a row.
 *			       (Due to the way transition frames are encoded)
 *
 *	In many cases, the discriminant function determines how to smooth.
 *	In the following chart, the decisions marked with a * may be
 *      overridden.
 *
 *   Voicing override of transitions at onsets:
 *	If a V/UV or UV/V voicing decision transition occurs within one-half
 *	frame of an onset bounding a voicing window, then the transition is
 *	moved to occur at the onset.
 *
 *	P	1F
 *	-----	-----
 *	0   0   0   0
 *	0   0   0*  1	(If there is an onset there)
 *	0   0   1*  0*	(Based on 2F and discriminant distance)
 *	0   0   1   1
 *	0   1*  0   0	(Always)
 *	0   1*  0*  1	(Based on discriminant distance)
 *	0*  1   1   0*	(Based on past, 2F, and discriminant distance)
 *	0   1*  1   1	(If there is an onset there)
 *	1   0*  0   0	(If there is an onset there)
 *	1   0   0   1
 *	1   0*  1*  0	(Based on discriminant distance)
 *	1   0*  1   1	(Always)
 *	1   1   0   0
 *	1   1   0*  1*	(Based on 2F and discriminant distance)
 *	1   1   1*  0	(If there is an onset there)
 *	1   1   1   1
 *  
 *   Determine if there is an onset transition between P and 1F.
 *   OT (Onset Transition) is true if there is an onset between 
 *   P and 1F but not after 1F.
 */

/* OT = (AND(OBOUND(1), 2) .NE. 0 .OR. OBOUND(2) .EQ. 1) .AND.
 *       AND(OBOUND(3), 1) .EQ. 0 */
ot = ((obound[1] & 2) != 0 || obound[2] == 1) && (obound[3] & 1) == 0;

/*   Multi-way dispatch on voicing decision history:	*/

vstate = voibuf[0][1]*8 + voibuf[1][1]*4 + voibuf[0][2]*2 + voibuf[1][2];
/*	GOTO (99,1,2,99,4,5,6,7,8,99,10,11,99,13,14,99) VSTATE+1	*/

/*if(count==9) printf("vstate = %d\n",vstate);*/

switch(vstate+1)	{
	case 1:
		break;
	case 2:
		if (ot && voibuf[0][3] == 1) voibuf[0][2] = 1;
		break;
	case 3:
		if (voibuf[0][3] == 0 || voice[0][1] < -voice[1][1]) 
			voibuf[0][2] = 0;
		else
			voibuf[1][2] = 1;
		break;
	case 4:
		break;
	case 5:
		voibuf[1][1] = 0;
		break;
	case 6:
		if (voice[1][0] < -voice[0][1]) 
			voibuf[1][1] = 0;
		else
			voibuf[0][2] = 1;
		break;
	case 7:
	/*   VOIBUF(2,0) must be 0	*/
		if (voibuf[0][0] == 1 || voibuf[0][3] == 1 || voice[1][1] > voice[0][0]) 
			voibuf[1][2] = 1;
		else
			voibuf[0][1] = 1;
		break;
	case 8:
		if (ot) voibuf[1][1] = 0;
		break;
	case 9:
		if (ot) voibuf[1][1] = 1;
		break;
	case 10:
		break;
	case 11:
		if (voice[0][1] <  -voice[1][0]) 
			voibuf[0][2] = 0;
		else
			voibuf[1][1] = 1;
		break;
	case 12:
		voibuf[1][1] = 1;
		break;
	case 13:
		break;
	case 14:
		if ((voibuf[0][3] == 0) && (voice[1][1] < -voice[0][1]) )
			voibuf[1][2] = 0;
		else
			voibuf[0][2] = 1;
		break;
	case 15:
		if (ot && voibuf[0][3] == 0) voibuf[0][2] = 0;
		break;
	default:
		break;
}
} /* (99)*/

/*   Now update parameters:
*   ----------------------
*
*   During unvoiced half-frames, update the low band and full band unvoiced
*   energy estimates (LBUE and FBUE) and also the zero crossing
*   threshold (DITHER).  (The input to the unvoiced energy filters is
*   restricted to be less than 10dB above the previous inputs of the
*   filters.)
*   During voiced half-frames, update the low-pass (LBVE) and all-pass 
*   (FBVE) voiced energy estimates.					*/

if (voibuf[half-1][3] == 0) {
  /*
        printf("(i) voibuf==0, sfbue = %d fbe = %d ofbue = %d\n",
	       sfbue, fbe, ofbue);
	       */

/*
        sfbue = (63*sfbue + 8*mmin(fbe,3*ofbue) )/64;
*/
	sfbue = (((long_type)sfbue*(long_type)63) >> 6) + (mmin(fbe,3*ofbue) >> 3);
	fbue = sfbue>>3;
	ofbue = fbe;
/*	
	slbue = (63*slbue + 8*mmin(lbe,3*olbue) )/64;
*/	
	slbue = (((long_type)slbue*(long_type)63) >> 6) + (mmin(lbe,3*olbue) >> 3);
	lbue = slbue>>3;
		
	olbue = lbe;
}
else{
/*
        lbve = ( 63*lbve + lbe )/64;
	fbve = ( 63*fbve + fbe )/64;
*/       
	lbve = (((long_type)63*(long_type)lbve) >> 6) + (lbe >> 6);
	fbve = (((long_type)63*(long_type)fbve) >> 6) + (fbe >> 6);
}

/*
printf("(i) >> sfbue = %d, fbue = %d, ofbue = %d, slbue = %d\n       lbue = %d, olbue = %d, lbve = %d, fbve = %d\n",
       sfbue, fbue, ofbue, slbue, lbue, olbue, lbve, fbve);
*/
/*   Set dither threshold to yield proper zero crossing rates in the
*   presence of low frequency noise and low level signal input.
*   NOTE: The divisor is a function of REF, the expected energies.	*/

dither = mmin(mmax( (int)(64*sqrt((float)(lbue*lbve)) / ref),1),20)<<10;

/*   Voicing decisions are returned in VOIBUF.	*/

}


#ifdef _TMS320C30
/*
 * nint - pass anum through round() and hand the result back as an int.
 *
 * NOTE(review): anum is declared int, so the value given to round() has
 * no fractional part to round away; confirm whether a floating-point
 * parameter was intended for this TMS320C30-only helper.
 */
int nint(anum)
int anum;
{
	int nearest;

	nearest = round(anum);
	return nearest;
}
#endif