www.pudn.com > wodegaozhiliangvoicebox.rar > fxrapt.m
function [fx,tt]=fxrapt(s,fs,mode);
%FXRAPT RAPT pitch tracker [FX,VUV]=(S,FS)
%
% Input: s(ns) Speech signal
% fs Sample frequency (Hz)
% mode 'g' will plot a graph [default if no output arguments]
%
% Outputs: fx(nframe) Larynx frequency for each fram,e (or NaN for silent/unvoiced)
% tt(nframe,3) Start and end samples of each frame
%
% Plots a graph if no outputs are specified showing lag candidates and selected path
%
% Bugs/Suggestions:
% (1) Include backward DP pass and output the true cost for each candidate.
% (2) Add an extra state to distinguish between voiceless and silent
% (3) N-best DP to allow longer term penalties (e.g. for frequent pitch doubling/halving)
% The algorithm is taken from [1] with the following differences:
%
% (a) the factor AFACT which in the Talkin algorithm corresponds roughly
% to the absolute level of harmonic noise in the correlation window. This value
% is here calculated as the maximum of three figures:
% (i) an absolute floor set by PP.rapt_absnoise
% (ii) a multiple of the peak signal set by PP.rapt_signoise
% (iii) a multiple of the noise floor set by PP.rapt_relnoise
% (b) The LPC used in calculating the Itakura distance uses a Hamming window rather than
% a Hanning window.
%
% A C implementation of this algorithm by Derek Lin and David Talkin is included as "get_f0.c"
% in the esps.zip package available from http://www.speech.kth.se/esps/esps.zip under the BSD
% license.
%
% Refs:
% [1] D. Talkin, "A Robust Algorithm for Pitch Tracking (RAPT)"
% in "Speech Coding & Synthesis", W B Kleijn, K K Paliwal eds,
% Elsevier ISBN 0444821694, 1995
% Copyright (C) Mike Brookes 2006
% Version: $Id: fxrapt.m,v 1.2 2006/07/28 07:41:25 dmb Exp $
%
% VOICEBOX is a MATLAB toolbox for speech processing.
% Home page: http://www.ee.ic.ac.uk/hp/staff/dmb/voicebox/voicebox.html
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% This program is free software; you can redistribute it and/or modify
% it under the terms of the GNU General Public License as published by
% the Free Software Foundation; either version 2 of the License, or
% (at your option) any later version.
%
% This program is distributed in the hope that it will be useful,
% but WITHOUT ANY WARRANTY; without even the implied warranty of
% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
% GNU General Public License for more details.
%
% You can obtain a copy of the GNU General Public License from
% ftp://prep.ai.mit.edu/pub/gnu/COPYING-2.0 or by writing to
% Free Software Foundation, Inc.,675 Mass Ave, Cambridge, MA 02139, USA.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
s=s(:); % force s to be a column
if nargin<3
mode=' ';
end
doback=0; % don't do backwards DP for now
% read in parameters
PP=voicebox;
f0min=PP.rapt_f0min; % Min F0 (Hz) [50]
f0max=PP.rapt_f0max; % Max F0 (Hz) [500]
tframe=PP.rapt_tframe; % frame size (s) [0.01]
tlpw=PP.rapt_tlpw; % low pass filter window size (s) [0.005]
tcorw=PP.rapt_tcorw; % correlation window size (s) [0.0075]
candtr=PP.rapt_candtr; % minimum peak in NCCF [0.3]
lagwt=PP.rapt_lagwt; % linear lag taper factor [0.3]
freqwt=PP.rapt_freqwt; % cost factor for F0 change [0.02]
vtranc=PP.rapt_vtranc; % fixed voice-state transition cost [0.005]
vtrac=PP.rapt_vtrac; % delta amplitude modulated transition cost [0.5]
vtrsc=PP.rapt_vtrsc; % delta spectrum modulated transition cost [0.5]
vobias=PP.rapt_vobias; % bias to encourage voiced hypotheses [0.0]
doublec=PP.rapt_doublec; % cost of exact doubling or halving [0.35]
absnoise=PP.rapt_absnoise; % absolute rms noise level [0]
relnoise=PP.rapt_relnoise; % rms noise level relative to noise floor [2.0]
signoise=PP.rapt_signoise; % ratio of peak signal rms to noise floor [0.001]
ncands=PP.rapt_ncands; % max hypotheses at each frame [20]
trms=PP.rapt_trms; % window length for rms measurement [0.03]
dtrms=PP.rapt_dtrms; % window spacing for rms measurement [0.02]
preemph=PP.rapt_preemph; % s-plane position of preemphasis zero [-7000]
nfullag=PP.rapt_nfullag; % number of full lags to try (must be odd) [7]
% derived parameters (mostly dependent on sample rate fs)
krms=round(trms*fs); % window length for rms measurement
kdrms=round(dtrms*fs); % window spacing for rms measurement
rmswin=hanning(krms).^2;
kdsmp=round(0.25*fs/f0max);
hlpw=round(tlpw*fs/2); % force window to be an odd length
blp=sinc((-hlpw:hlpw)/kdsmp).*hamming(2*hlpw+1).';
fsd=fs/kdsmp;
kframed=round(fsd*tframe); % downsampled frame length
kframe=kframed*kdsmp; % frame increment at full rate
rmsix=(1:krms)+floor((kdrms-kframe)/2); % rms index according to Talkin; better=(1:krms)+floor((kdrms-krms+1)/2)
minlag=ceil(fsd/f0max);
maxlag=round(fsd/f0min); % use round() only because that is what Talkin does
kcorwd=round(fsd*tcorw); % downsampled correlation window
kcorw=kcorwd*kdsmp; % full rate correlation window
spoff=max(hlpw-floor(kdsmp/2),1+kdrms-rmsix(1)-kframe); % offset for first speech frame at full rate
sfoff=spoff-hlpw+floor(kdsmp/2); % offset for downsampling filter
sfi=1:kcorwd; % initial decimated correlation window index array
sfhi=1:kcorw; % initial correlation window index array
sfj=1:kcorwd+maxlag;
sfmi=repmat((minlag:maxlag)',1,kcorwd)+repmat(sfi,maxlag-minlag+1,1);
lagoff=(minlag-1)*kdsmp; % lag offset when converting to high sample rate
beta=lagwt*f0min/fs; % bias towards low lags
log2=log(2);
lpcord=2+round(fs/1000); % lpc order for itakura distance
hnfullag=floor(nfullag/2);
jumprat=exp((doublec+log2)/2); % lag ratio at which octave jump cost is lowest
ssq=s.^2;
csssq=cumsum(ssq);
sqrt(min(csssq(kcorw+1:end)-csssq(1:end-kcorw))/kcorw);
afact=max([absnoise^2,max(ssq)*signoise^2,min(csssq(kcorw+1:end)-csssq(1:end-kcorw))*(relnoise/kcorw)^2])^2*kcorw^2;
% downsample signal to approx 2 kHz to speed up autocorrelation calculation
% kdsmp is the downsample factor
sf=filter(blp/sum(blp),1,s(sfoff+1:end));
sp=filter([1 exp(preemph/fs)],1,s); % preemphasised speech for LPC calculation
sf(1:length(blp)-1)=[]; % remove startup transient
sf=sf(1:kdsmp:end); % downsample to =~2kHz
nsf=length(sf); % length of downsampled speech
ns=length(s); % length of full rate speech
% Calculate the frame limit to ensure we don't run off the end of the speech or decimated speech:
% (a) For decimated autocorrelation when calculating sff(): (nframe-1)*kframed+kcorwd+maxlag <= nsf
% (b) For full rate autocorrelation when calculating sfh(): max(fho)+kcorw+maxlag*kdsamp+hnfllag <= ns
% (c) For rms ratio window when calculating rr : max(fho)+rmsix(end) <= ns
% where max(fho) = (nframe-1)*kframe + spoff
nframe=floor(1+min((nsf-kcorwd-maxlag)/kframed,(ns-spoff-max(kcorw-maxlag*kdsmp-hnfullag,rmsix(end)))/kframe));
% now search for autocorrelation peaks in the downsampled signal
cost=zeros(nframe,ncands); % cumulative cost
prev=zeros(nframe,ncands); % traceback pointer
mcands=zeros(nframe,1); % number of actual candidates excluding voiceless
lagval=repmat(NaN,nframe,ncands-1); % lag of each voiced candidate
tv=zeros(nframe,3); % diagnostics: 1=voiceless cost, 2=min voiced cost, 3:cumulative voiceless-min voiced
if doback
costms=cell(nframe,1);
end
% Main processing loop for each 10 ms frame
for iframe=1:nframe % loop for each frame (~10 ms)
% Find peaks in the normalized autocorrelation of subsampled (2Khz) speech
% only keep peaks that are > 30% of highest peak
sff=sf((iframe-1)*kframed+sfj);
sffdc=mean(sff(sfi)); % mean of initial correlation window length
sff=sff-sffdc; % subtract off the mean
nccfd=normxcor(sff(1:kcorwd),sff(minlag+1:end));
[ipkd,vpkd]=findpeaks(nccfd,'q');
% Debugging: execute the line below to plot the autocorrelation peaks.
% findpeaks(nccfd,'q'); xlabel(sprintf('Lag = (x+%d)*%g ms',minlag-1,1000*kdsmp/fs)); ylabel('Normalized Cross Correlation'); title (sprintf('Frame %d/%d',iframe,nframe));
vipkd=[vpkd ipkd];
vipkd(vpkdncands-1
vipkd=sortrows(vipkd);
vipkd(1:size(vipkd,1)-ncands+1,:)=[]; % eliminate lowest to leave only ncands-1
end
lagcan=round(vipkd(:,2)*kdsmp+lagoff); % convert the lag candidate values to the full sample rate
nlcan=length(lagcan);
else
nlcan=0;
end
% If there are any candidate lag values (nlcan>0) then refine their accuracy at the full sample rate
if nlcan
laglist=reshape(repmat(lagcan(:)',nfullag,1)+repmat((-hnfullag:hnfullag)',1,nlcan),nfullag*nlcan,1);
fho=(iframe-1)*kframe+spoff;
sfh=s(fho+(1:kcorw+max(lagcan)+hnfullag));
sfhdc=mean(sfh(sfhi));
sfh=sfh-sfhdc;
e0=sum(sfh(sfhi).^2); % energy of initial correlation window (only needed to store in tv(:,6)
lagl2=repmat(lagcan(:)',nfullag+kcorw-1,1)+repmat((1-hnfullag:hnfullag+kcorw)',1,nlcan);
nccf=normxcor(sfh(1:kcorw),sfh(lagl2),afact);
[maxcc,maxcci]=max(nccf,[],1);
vipk=[maxcc(:) lagcan(:)+maxcci(:)-hnfullag-1];
vipk=vipk(:,[1 2 2]);
maxccj=maxcci(:)'+nfullag*(0:nlcan-1); % vector index into nccf array
msk=mod(maxcci,nfullag-1)~=1 & 2*nccf(maxccj)-nccf(mod(maxccj-2,nfullag*nlcan)+1)-nccf(mod(maxccj,nfullag*nlcan)+1)>0; % don't do quadratic interpolation for the end ones
if any(msk)
maxccj=maxccj(msk);
vipk(msk,3)=vipk(msk,3)+(nccf(maxccj+1)-nccf(maxccj-1))'./(2*(2*nccf(maxccj)-nccf(maxccj-1)-nccf(maxccj+1)))';
end
vipk(maxccncands-1
vipk=sortrows(vipk);
vipk(1:size(vipk,1)-ncands+1,:)=[]; % eliminate lowest to leave only ncands-1
end
% vipk(:,1) has NCCF value, vipk(:,2) has integer peak position, vipk(:,3) has refined peak position
mc=size(vipk,1);
else
mc=0;
end
% We now have mc lag candidates at the full sample rate
mc1=mc+1; % total number of candidates including "unvoiced" possibility
mcands(iframe)=mc; % save number of lag candidates (needed for pitch consistency cost calculation)
if mc
lagval(iframe,1:mc)=vipk(:,3)';
cost(iframe,1)=vobias+max(vipk(:,1)); % voiceless cost
cost(iframe,2:mc1)=1-vipk(:,1)'.*(1-beta*vipk(:,3)'); % local voiced costs
tv(iframe,2)=min(cost(iframe,2:mc1));
else
cost(iframe,1)=vobias; % if no lag candidates (mc=0), then the voiceless case is the only possibility
end
tv(iframe,1)=cost(iframe,1);
if iframe>1 % if it is not the first frame, then calculate pitch consistency and v/uv transition costs
mcp=mcands(iframe-1);
costm=zeros(mcp+1,mc1); % cost matrix: rows and cols correspond to candidates in previous and current frames (incl voiceless)
% if both frames have at least one lag candidate, then calculate a pitch consistency cost
if mc*mcp
lrat=abs(log(repmat(lagval(iframe,1:mc),mcp,1)./repmat(lagval(iframe-1,1:mcp)',1,mc)));
costm(2:end,2:end)=freqwt*min(lrat,doublec+abs(lrat-log2)); % allow pitch doubling/halving
end
% if either frame has a lag candidate, then calcualte the cost of voiced/voiceless transition and vice versa
if mc+mcp
rr=sqrt((rmswin'*s(fho+rmsix).^2)/(rmswin'*s(fho+rmsix-kdrms).^2)); % amplitude "gradient"
ss=0.2/(distitar(lpcauto(sp(fho+rmsix),lpcord),lpcauto(sp(fho+rmsix-kdrms),lpcord),'e')-0.8); % Spectral stationarity: note: Talkin uses Hanning instead of Hamming windows for LPC
costm(1,2:end)= vtranc+vtrsc*ss+vtrac/rr; % voiceless -> voiced cost
costm(2:end,1)= vtranc+vtrsc*ss+vtrac*rr;
tv(iframe,4:5)=[costm(1,mc1) costm(mcp+1,1)];
end
costm=costm+repmat(cost(iframe-1,1:mcp+1)',1,mc1); % add in cumulative costs
[costi,previ]=min(costm,[],1);
cost(iframe,1:mc1)=cost(iframe,1:mc1)+costi;
prev(iframe,1:mc1)=previ;
else % first ever frame
costm=zeros(1,mc1); % create a cost matrix in case doing a backward recursion
end
if mc
tv(iframe,3)=cost(iframe,1)-min(cost(iframe,2:mc1));
tv(iframe,6)=5*log10(e0*e0/afact);
end
if doback
costms{iframe}=costm; % need to add repmatted cost into this
end
end
% now do traceback
best=zeros(nframe,1);
[cbest,best(nframe)]=min(cost(nframe,1:mcands(nframe)+1));
for i=nframe:-1:2
best(i-1)=prev(i,best(i));
end
vix=find(best>1);
fx=repmat(NaN,nframe,1); % unvoiced frames will be NaN
fx(vix)=fs*lagval(vix+nframe*(best(vix)-2)).^(-1); % leave as NaN if unvoiced
tt=zeros(nframe,3);
tt(:,1)=(1:nframe)'*kframe+spoff; % find frame times
tt(:,2)=tt(:,1)+kframe-1;
jratm=(jumprat+1/jumprat)/2;
tt(2:end,3)=abs(fx(2:end)./fx(1:end-1)-jratm)>jumprat-jratm; % new spurt if frequency ratio is outside (1/jumprat,jumprat)
tt(1,3)=1; % first frame always starts a spurt
tt(1+find(isnan(fx(1:end-1))),3)=1; % NaN always forces a new spurt
% plot results if there are no output arguments of if the 'g' mode option is specified
if ~nargout | any(mode=='g')
tf=spoff+(0:nframe-1)'*kframe; % one sample before start of each frame
blag=repmat(NaN,nframe,1); % unvoiced frames will be NaN
blag(vix)=lagval(vix+nframe*(best(vix)-2)); % leave as NaN if unvoiced
ts=(1:ns)/fs; % time scale for speech samples
tsa=[1:tf(1) tf(end)+kframe+1:ns]; % indexes for unprocessed speech [-1 term is an error methinks]
sup=repmat(NaN,ns,1); % unprocessed speech - plot in black
sup(tsa)=s(tsa);
sv=reshape(s(tf(1)+1:tf(end)+kframe),kframe,nframe); % processed speech
su=sv;
su(:,best>1)=NaN; % delete all voiced samples
sv(:,best==1)=NaN; % delete all unvoiced samples
tsuv=(tf(1)+1:tf(end)+kframe)/fs;
su=su(:);
sv=sv(:);
subplot(211)
plot(ts,sup,'-k',tsuv,su,'r-',tsuv,sv,'b-');
title('Speech');
subplot(212)
plot((tf+(kframe+1)/2)/fs,lagval*1000/fs,'xr',(tf+(kframe+1)/2)/fs,blag*1000/fs,'-b')
xlabel('Time (s)');
ylabel('Period (ms)');
title('Lag Candidates');
end
tt(isnan(fx),:)=[]; % remove NaN spurts
fx(isnan(fx),:)=[];
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
function v=normxcor(x,y,d)
% Calculate the normalized cross correlation of column vectors x and y
% we can calculate this in two ways but fft is much faster even for nx small
% We must have nx<=ny and the output length is ny-nx+1
% note that this routine does not do mean subtraction even though this is normally a good idea
% if y is a matrix, we correlate with each column
% d is a constant added onto the normalization factor
% v(j)=x'*yj/sqrt(d + x'*x * yj'*yj) where yj=y(j:j+nx-1) for j=1:ny-nx+1
if nargin<3
d=0;
end
nx=length(x);
[ny,my]=size(y);
nv=1+ny-nx;
if nx>ny
error('second argument is shorter than the first');
end
nf=pow2(nextpow2(ny));
w=irfft(repmat(conj(rfft(x,nf,1)),1,my).*rfft(y,nf,1));
s=zeros(ny+1,my);
s(2:end,:)=cumsum(y.^2,1);
v=w(1:nv,:)./sqrt(d+(x'*x).*(s(nx+1:end,:)-s(1:end-nx,:)));