/* www.pudn.com > lpc.zip > voice_i.c */
/*************************************************************************** * * VOICIN Version 52 * *************************************************************************** * * Voicing Detection (VOICIN) makes voicing decisions for each half * frame of input speech. Tentative voicing decisions are made two frames * in the future (2F) for each half frame. These decisions are carried * through one frame in the future (1F) to the present (P) frame where * they are examined and smoothed, resulting in the final voicing * decisions for each half frame. * The voicing parameter (signal measurement) column vector (VALUE) * is based on a rectangular window of speech samples determined by the * window placement algorithm. The voicing parameter vector contains the * AMDF windowed maximum-to-minimum ratio, the zero crossing rate, energy * measures, reflection coefficients, and prediction gains. The voicing * window is placed to avoid contamination of the voicing parameter vector * with speech onsets. * The input signal is then classified as unvoiced (including * silence) or voiced. This decision is made by a linear discriminant * function consisting of a dot product of the voicing decision * coefficient (VDC) row vector with the measurement column vector * (VALUE). The VDC vector is 2-dimensional, each row vector is optimized * for a particular signal-to-noise ratio (SNR). So, before the dot * product is performed, the SNR is estimated to select the appropriate * VDC vector. * The smoothing algorithm is a modified median smoother. The * voicing discriminant function is used by the smoother to determine how * strongly voiced or unvoiced a signal is. The smoothing is further * modified if a speech onset and a voicing decision transition occur * within one half frame. In this case, the voicing decision transition * is extended to the speech onset. For transmission purposes, there are * constraints on the duration and transition of voicing decisions. 
The * smoother takes these constraints into account. * Finally, the energy estimates are updated along with the dither * threshold used to calculate the zero crossing rate (ZC). * * Inputs: * VWIN - Voicing window limits * INBUF - Input speech buffer * LPBUF - Low-pass filtered speech buffer * BUFLIM - INBUF and LPBUF limits * HALF - Present analysis half frame number * MINAMD - Minimum value of the AMDF * MAXAMD - Maximum value of the AMDF * MINTAU - Pointer to the lag of the minimum AMDF value * IVRC(2) - Inverse filter's RC's * OBOUND - Onset boundary descriptions * AF - The analysis frame number * Output: * VOIBUF(2,0:AF) - Buffer of voicing decisions * Internal: * QS - Ratio of preemphasized to full-band energies * RC1 - First reflection coefficient * AR_B - Product of the causal forward and reverse pitch prediction * gains * AR_F - Product of the noncausal forward and rev. pitch prediction * gains * ZC - Zero crossing rate * DITHER - Zero crossing threshold level * MAXMIN - AMDF's 1 octave windowed maximum-to-minimum ratio * MINPTR - Location of minimum AMDF value * NVDC - Number of elements in each VDC vector * NVDCL - Number of VDC vectors * VDCL - SNR values corresponding to the set of VDC's * VDC - 2-D voicing decision coefficient vector * VALUE(9) - Voicing Parameters * VOICE(2,3)- History of LDA results * LBE - Ratio of low-band instantaneous to average energies * FBE - Ratio of full-band instantaneous to average energies * LBVE - Low band voiced energy * LBUE - Low band unvoiced energy * FBVE - Full band voiced energy * FBUE - Full band unvoiced energy * OFBUE - Previous full-band unvoiced energy * OLBUE - Previous low-band unvoiced energy * REF - Reference energy for initialization and DITHER threshold * SNR - Estimate of signal-to-noise ratio * SNR2 - Estimate of low-band signal-to-noise ratio * SNRL - SNR level number * OT - Onset transition present * VSTATE - Decimal interpretation of binary voicing classifications * FIRST - First call flag */ 
#include "ourstuff.h"
/* NOTE(review): the next directive is two directives fused together and the
 * first one has no header name; the text looks damaged by extraction.
 * Left exactly as found -- TODO restore from a clean copy of the file. */
#include#include "vcomm_i.ch"
#include "contrl.ch"
#include "lpcdefs.h"
#include /* NOTE(review): header name is missing here as well -- TODO restore */
/********sw*******/
extern float min1, min2, min3;
extern float max1, max2, max3;
/*****************/

/*
 * voicin_i - integer (fixed-point) voicing decision for one half frame.
 *
 * K&R-style definition with no declared return type and no return
 * statement; all results are written into voibuf.
 *
 * Parameters (meanings follow the VOICIN description in the file header):
 *   vwin    voicing window limits; only forwarded to vparms_i here
 *   inbuf   input speech buffer; only forwarded to vparms_i here
 *   lpbuf   low-pass filtered speech buffer; only forwarded to vparms_i here
 *   half    present analysis half frame number. It is used as half-1 to
 *           index voice and voibuf, half == 1 shifts the discriminant
 *           history, and smoothing runs only when half != 1
 *           (presumably half is always 1 or 2 -- confirm against caller)
 *   minamd  minimum AMDF value, used to form maxmin when half == 1
 *   maxamd  maximum AMDF value, used to form maxmin when half == 1
 *   mintau  forwarded to vparms_i
 *   ivrc    inverse filter reflection coefficients
 *   obound  onset boundary descriptions; elements 1, 2 and 3 are read
 *   voibuf  voicing decisions. voibuf[half-1][3] is always written; the
 *           smoother reads entries with second index 0 and 3 and may
 *           rewrite entries with second index 1 and 2
 *
 * State kept in function statics between calls: vstate, dither, snr,
 * maxmin, voice, lbve, lbue, fbve, fbue, ofbue, olbue, sfbue, slbue, first.
 *
 * NOTE(review): the SNRL quantization loop and the start of the
 *   discriminant computation are damaged in this copy (details at that
 *   point in the body). The function cannot be compiled as it stands.
 * NOTE(review): ivrc is declared with 2 elements but the discriminant
 *   reads ivrc[2]. obound is likewise read at indices 1..3, so the caller
 *   may pass pointers offset for 1-based indexing -- TODO confirm,
 *   otherwise this is an out-of-bounds read.
 */
voicin_i( vwin, inbuf, lpbuf, half, minamd, maxamd, mintau, ivrc, obound,
          voibuf)
int_type vwin[2][AF], half, mintau;
int_type minamd, maxamd, ivrc[2];
int_type *inbuf;
int_type *lpbuf;
int_type *obound, voibuf[2][AF+1];
{
    /* Outputs of vparms_i for the current half frame */
    int_type zc, lbe, fbe;
    int_type i, snrl;
    /* Packed voicing history used to select the smoothing case */
    static int_type vstate=0;
    /* Passed by address to vparms_i and recomputed at the end of each call */
    static int_type dither=20;
    static int_type snr;
    int_type snr2;
    /* Refreshed only when half == 1, reused for the second half frame */
    static int_type maxmin;
    int_type qs, rc1, ar_b;
    int_type ar_f;
    /* Discriminant history, indexed [half-1][0..2]; index 2 is the newest */
    static int_type voice[2][3];
    int_type value[9];
    short ot=0;

    /***** added for testing of vparms_i ****/
    /* NOTE(review): none of the locals in this group are referenced in the
     * visible body; the two 1000-element arrays are automatic storage --
     * TODO confirm they can be removed. */
    int vlen, stop, start, sw, sw2;
    int_type dither_i;
    int_type inbuf_i[1000];
    int_type lpbuf_i[1000];
    int_type vwin_i[2][AF];
    int_type zc_i, lbe_i, fbe_i, qs_i, rc1_i, ar_b_i, ar_f_i;
    /****************************************/

    /* Declare and initialize filters: */
    static int_type lbve, lbue, fbve, fbue, ofbue, olbue;
    /* sfbue has no initializer; as a static it starts at zero */
    static int_type sfbue, slbue=0;
    int_type ref= 3000;
    static short first=1;

    /* One-time setup on the first call */
    if (first) {
        /* Energy estimates are seeded from ref: voiced = ref,
         * full-band unvoiced = ref/16, low-band unvoiced = ref/32 */
        lbve = ref;
        fbve = ref;
        fbue = ref>>4;
        ofbue = ref>>4;
        lbue = ref>>5;
        olbue = ref>>5;
        snr = 128; /* "quantized" 64*(fbve/fbue) */
        first = 0;
        /* vdcl_i is not declared in this function; presumably it comes from
         * one of the included project headers -- TODO confirm */
        vdcl_i[0] = 600;
        vdcl_i[1] = 450;
        vdcl_i[2] = 300;
        vdcl_i[3] = 200;
        /* 6*0 evaluates to 0; presumably a leftover of a FORTRAN DATA
         * repeat count -- TODO confirm */
        vdcl_i[4] = 6*0;
        for(i=0;i<3;i++) {
            voice[1][i] = 0;
            voice[0][i] = 0;
        }
    }

    /* The VOICE array contains the result of the linear discriminant function
     * (analog values). The VOIBUF array contains the hard-limited binary
     * voicing decisions. The VOICE and VOIBUF arrays, according to FORTRAN
     * memory allocation, are addressed as:
     *
     *    (half-frame number, future-frame number)
     *
     *    |   Past    |  Present  |  Future1  |  Future2  |
     *    | 1,0 | 2,0 | 1,1 | 2,1 | 1,2 | 2,2 | 1,3 | 2,3 |  --->  time
     *
     * Update linear discriminant function history each frame:
     */
    if (half == 1) {
        voice[0][0]=voice[0][1];
        voice[1][0]=voice[1][1];
        voice[0][1]=voice[0][2];
        voice[1][1]=voice[1][2];
        /* Ratio scaled by 1024; when minamd <= 1 the division is skipped
         * and maxamd is stored unscaled */
        maxmin = (minamd>1)?(maxamd<<10)/minamd:maxamd;
    }

    /* Calculate voicing parameters twice per frame: */
    /* vparms_i is defined elsewhere; dither goes in by address and the
     * remaining address arguments receive its results */
    vparms_i( vwin, inbuf, lpbuf, half, &dither, mintau, &zc, &lbe, &fbe,
              &qs, &rc1, &ar_b, &ar_f );
    /* printf("(i) zc=%d, lbe=%d, fbe=%d, qs=%f, rc1=%f, ar_b=%f, ar_f=%f\n",
       zc, lbe, fbe, qs/16384., rc1/16384., ar_b/16384., ar_f/16384.); */

    /* Estimate signal-to-noise ratio to select the appropriate VDC vector.
     * The SNR is estimated as the running average of the ratio of the
     * running average full-band voiced energy to the running average
     * full-band unvoiced energy. SNR filter has gain of 63.
     */
    /* printf("(i) > snr = %d fbve = %d fbue = %d lbue = %d\n",
       snr,fbve,fbue,lbue); */
    /* mmax and mmin are defined elsewhere (presumably max and min); the
     * mmax(..., 1) calls keep both divisors at least 1 under that reading */
    snr = (((long_type)snr*(long_type)63) >> 6) +
          (((fbve*63)/mmax(fbue,1)) >> 9);
    snr2 = (snr*(fbue >> 3)/mmax(lbue>>3,1));
    /* printf("(i) snr = %f, snr2 = %f\n",snr*8., snr2*8.); */

    /* Quantize SNR to SNRL according to VDCL thresholds.*/
    /*DO SNRL = 1, NVDCL-1 */
    /* NOTE(review): the long line below is kept verbatim because the text is
     * damaged and its comment and string boundaries cannot be trusted.
     * Visible problems:
     *   - the loop header stops after the name snrl and runs straight into
     *     the remains of a debug printf;
     *   - only value[0] and value[1] are assigned, the remaining value
     *     entries are missing;
     *   - an unterminated comment opener follows value[1], and the first
     *     term of voice[half-1][2] is missing.
     * What remains visible compares snr2 with vdcl_i[snrl-1]>>3 and adds
     * the vdc_i rows 1 to 7 (column snrl-1) times value[1], zc, rc1, qs,
     * ivrc[2], ar_b and ar_f into voice[half-1][2].
     * TODO restore this stretch from a clean copy before building. */
    for (snrl=1;snrl %f, for snrl = %d?\n", snr2*8.,vdcl_i[snrl-1]*1.,snrl); */ if (snr2 > (vdcl_i[snrl-1]>>3)) break; } /* printf("(i) snrl = %d\n",snrl); */ /* (Note: SNRL = NVDCL Here) */ /* Linear discriminant voicing parameters: */ value[0] = maxmin; value[1] = (lbve>1)?(lbe<<10)/lbve:(lbe<<10); /* if(value[1]>max1) max1=value[1]; if(value[1] > 10); if(0) printf("(%d) %d\n",0,voice[half-1][2]); voice[half-1][2] += (((long_type)vdc_i[1][snrl-1]*(long_type)value[1]) >> 10); if(0) printf("(%d) %d\n",1,voice[half-1][2]); voice[half-1][2] += (vdc_i[2][snrl-1]*zc); if(0) printf("(%d) %d\n",2,voice[half-1][2]); voice[half-1][2] += (((long_type)vdc_i[3][snrl-1]*(long_type)rc1) >> 14); if(0) printf("(%d) %d\n",3,voice[half-1][2]); voice[half-1][2] += (((long_type)vdc_i[4][snrl-1]*(long_type)qs) >> 14); if(0) printf("(%d) %d\n",4,voice[half-1][2]); voice[half-1][2] += (((long_type)vdc_i[5][snrl-1]*(long_type)ivrc[2]) >> 14); if(0) printf("(%d) %d\n",5,voice[half-1][2]); voice[half-1][2] += (((long_type)vdc_i[6][snrl-1]*(long_type)ar_b) >> 14); if(0) printf("(%d) %d\n",6,voice[half-1][2]); voice[half-1][2] += (((long_type)vdc_i[7][snrl-1]*(long_type)ar_f) >> 14); if(0) printf("(%d) %d\n",7,voice[half-1][2]); /* printf("(i) voiced if > 0 : %d\n",voice[half-1][2]); */

    /* Classify as voiced if discriminant > 0, otherwise unvoiced
     * Voicing decision for current half-frame: 1 = Voiced; 0 = Unvoiced */
    if (voice[half-1][2] > 0.0)
        voibuf[half-1][3]=1;
    else
        voibuf[half-1][3]=0;

    /* Skip voicing decision smoothing in first half-frame: */
    if (half != 1) {
        /* Voicing decision smoothing rules (override of linear combination):
         *
         * Unvoiced half-frames: At least two in a row.
         * --------------------
         *
         * Voiced half-frames: At least two in a row in one frame.
         * ------------------- Otherwise at least three in a row.
         * (Due to the way transition frames are encoded)
         *
         * In many cases, the discriminant function determines how to smooth.
         * In the following chart, the decisions marked with a * may be
         * overridden.
         *
         * Voicing override of transitions at onsets:
         * If a V/UV or UV/V voicing decision transition occurs within one-half
         * frame of an onset bounding a voicing window, then the transition is
         * moved to occur at the onset.
         *
         * P 1F
         * ----- -----
         * 0 0 0 0
         * 0 0 0* 1 (If there is an onset there)
         * 0 0 1* 0* (Based on 2F and discriminant distance)
         * 0 0 1 1
         * 0 1* 0 0 (Always)
         * 0 1* 0* 1 (Based on discriminant distance)
         * 0* 1 1 0* (Based on past, 2F, and discriminant distance)
         * 0 1* 1 1 (If there is an onset there)
         * 1 0* 0 0 (If there is an onset there)
         * 1 0 0 1
         * 1 0* 1* 0 (Based on discriminant distance)
         * 1 0* 1 1 (Always)
         * 1 1 0 0
         * 1 1 0* 1* (Based on 2F and discriminant distance)
         * 1 1 1* 0 (If there is an onset there)
         * 1 1 1 1
         *
         * Determine if there is an onset transition between P and 1F.
         * OT (Onset Transition) is true if there is an onset between
         * P and 1F but not after 1F. */
        /* OT = (AND(OBOUND(1), 2) .NE. 0 .OR. OBOUND(2) .EQ. 1) .AND.
         * AND(OBOUND(3), 1) .EQ. 0 */
        ot = ((obound[1] & 2) != 0 || obound[2] == 1) &&
             (obound[3] & 1) == 0;

        /* Multi-way dispatch on voicing decision history: */
        vstate = voibuf[0][1]*8 + voibuf[1][1]*4 + voibuf[0][2]*2 +
                 voibuf[1][2];
        /* GOTO (99,1,2,99,4,5,6,7,8,99,10,11,99,13,14,99) VSTATE+1 */
        /*if(count==9) printf("vstate = %d\n",vstate);*/
        switch(vstate+1) {
        case 1:
            break;
        case 2:
            if (ot && voibuf[0][3] == 1)
                voibuf[0][2] = 1;
            break;
        case 3:
            if (voibuf[0][3] == 0 || voice[0][1] < -voice[1][1])
                voibuf[0][2] = 0;
            else
                voibuf[1][2] = 1;
            break;
        case 4:
            break;
        case 5:
            voibuf[1][1] = 0;
            break;
        case 6:
            if (voice[1][0] < -voice[0][1])
                voibuf[1][1] = 0;
            else
                voibuf[0][2] = 1;
            break;
        case 7:
            /* VOIBUF(2,0) must be 0 */
            if (voibuf[0][0] == 1 || voibuf[0][3] == 1 ||
                voice[1][1] > voice[0][0])
                voibuf[1][2] = 1;
            else
                voibuf[0][1] = 1;
            break;
        case 8:
            if (ot)
                voibuf[1][1] = 0;
            break;
        case 9:
            if (ot)
                voibuf[1][1] = 1;
            break;
        case 10:
            break;
        case 11:
            if (voice[0][1] < -voice[1][0])
                voibuf[0][2] = 0;
            else
                voibuf[1][1] = 1;
            break;
        case 12:
            voibuf[1][1] = 1;
            break;
        case 13:
            break;
        case 14:
            if ((voibuf[0][3] == 0) && (voice[1][1] < -voice[0][1]) )
                voibuf[1][2] = 0;
            else
                voibuf[0][2] = 1;
            break;
        case 15:
            if (ot && voibuf[0][3] == 0)
                voibuf[0][2] = 0;
            break;
        default:
            break;
        }
    }
    /* (99)*/

    /* Now update parameters:
     * ----------------------
     *
     * During unvoiced half-frames, update the low band and full band unvoiced
     * energy estimates (LBUE and FBUE) and also the zero crossing
     * threshold (DITHER). (The input to the unvoiced energy filters is
     * restricted to be less than 10dB above the previous inputs of the
     * filters.)
     * During voiced half-frames, update the low-pass (LBVE) and all-pass
     * (FBVE) voiced energy estimates.
     */
    if (voibuf[half-1][3] == 0) {
        /* printf("(i) voibuf==0, sfbue = %d fbe = %d ofbue = %d\n",
           sfbue, fbe, ofbue); */
        /* sfbue = (63*sfbue + 8*mmin(fbe,3*ofbue) )/64; */
        sfbue = (((long_type)sfbue*(long_type)63) >> 6) +
                (mmin(fbe,3*ofbue) >> 3);
        fbue = sfbue>>3;
        ofbue = fbe;
        /* slbue = (63*slbue + 8*mmin(lbe,3*olbue) )/64; */
        slbue = (((long_type)slbue*(long_type)63) >> 6) +
                (mmin(lbe,3*olbue) >> 3);
        lbue = slbue>>3;
        olbue = lbe;
    }
    else{
        /* lbve = ( 63*lbve + lbe )/64; fbve = ( 63*fbve + fbe )/64; */
        lbve = (((long_type)63*(long_type)lbve) >> 6) + (lbe >> 6);
        fbve = (((long_type)63*(long_type)fbve) >> 6) + (fbe >> 6);
    }
    /* printf("(i) >> sfbue = %d, fbue = %d, ofbue = %d, slbue = %d\n
       lbue = %d, olbue = %d, lbve = %d, fbve = %d\n",
       sfbue, fbue, ofbue, slbue, lbue, olbue, lbve, fbve); */

    /* Set dither threshold to yield proper zero crossing rates in the
     * presence of low frequency noise and low level signal input.
     * NOTE: The divisor is a function of REF, the expected energies. */
    /* The square-root term is passed through mmax(..., 1) and mmin(..., 20)
     * (presumably a clamp to 1..20) and then shifted left by 10.
     * NOTE(review): lbue*lbve is multiplied in int_type before the
     *   conversion to float -- confirm the product cannot overflow.
     * NOTE(review): the static initializer of dither is 20 with no shift,
     *   while every value stored here is shifted left by 10 -- confirm which
     *   scale vparms_i expects on the first call.
     * NOTE(review): if mmin and mmax are macros, sqrt may be evaluated more
     *   than once -- check their definitions. */
    dither = mmin(mmax( (int)(64*sqrt((float)(lbue*lbve)) / ref),1),20)<<10;

    /* Voicing decisions are returned in VOIBUF. */
}

/* nint is compiled only when _TMS320C30 is defined. K&R-style helper that
 * takes an int and returns round(anum); round is not declared in the
 * visible text, presumably it comes from one of the lost includes. */
#ifdef _TMS320C30
int nint(anum)
int anum;
{
    return(round(anum));
}
#endif