% www.pudn.com > ConstrainedEM.zip > test_constrainedEM.m
% Test mixture_of_gaussians3
% In this file we run the following demo :
% We randomly create a synthetic data set using a mixture of Gaussian sources.
% We use a fixed amount of equivalence queries to produce positive and negative
% constraints. Then we compare clustering results of
% 1. Regular EM
% 2. Positively constrained EM (only positive constraints are used)
% 3. Fully constrained EM (both positive and negative constraints)
% 4. Fully labeled estimation of the mixture (assumes all the labels are known).
% The results are compared in terms of Z-score (combined purity and
% accuracy).
% By changing the parameters of the data set creation you can test the
% algorithms' performance in various settings. (Of course, changing the
% dimension parameter will disable the visual display.)
% ---- Synthetic data set configuration ----

% Number of clusters.
k = 6;

% Dimension of the synthetic data points.
dim = 2;

% Number of data points.
data_size = 200;

% Number of equivalence queries used to produce the constraints.
noc = 60;

% Covariance mode used when creating the data:
%   0 - a different covariance matrix for each model
%   1 - a single covariance matrix
cscmf = 0;

% Expected z-score of the best mixture model of the data (the model that
% generated the data).
sperability = 0.9;

% Should the data be made Gaussian separable?
%   0 - no  (Bayes error is bigger than 0)
%   1 - yes (Bayes error is 0)
splitable_data_flag = 1;
% Build the synthetic data set. The data is regenerated as long as one of
% the models turns out to be too small.
create_flag = 1;
while create_flag
    [results, data, stam, labels, sample_z] = ...
        create_mog_data2(dim, k, data_size, sperability, cscmf);

    % Keep the labels next to the points, as the last column of 'data'.
    data = [data labels];

    % Accept this data set unless the size check below rejects it.
    create_flag = 0;
    if splitable_data_flag ~= 1
        % No separability post-processing requested; the loop condition is
        % now false, so this leaves the loop.
        continue;
    end

    % Make the data Gaussian splitable: fit the labeled model and replace
    % the labels with the group assignment it returns.
    [Lresults, LlogLikelihood, Lgroups, Lz] = ...
        calculate_labeled_model(data(:,1:end-1), k, cscmf, data(:,end));
    data(:,end) = Lgroups';

    % No model with less than 5 points is allowed.
    % NOTE(review): presumably the second column of pilug's output holds
    % the number of points per group - confirm against pilug.
    tmp = pilug(Lgroups);
    create_flag = double(any(tmp(:,2) < 5));
end
% Add constraints by sampling equivalence queries. Positive constraints
% are closed into chunklets and represented in the variable 'chunks'.
% Negative constraints are represented in the variable 'anti_chunks'.
%
% 'chunks' is a data_size*1 integer list containing the chunklet information.
%   If the i'th place contains -1, the point doesn't belong to any chunklet.
%   If it contains the tag 'j', the point is in a chunklet with all the
%   other points with the tag 'j'.
% 'anti_chunks' is a (number of negative constraints)*2 table. Each pair is
%   known to be negatively constrained.
create_constraints;

% Set the starting parameters for the EM: all the EM versions will run
% with the same initial conditions.

% Number of chunklets = number of distinct tags other than the -1 marker.
% Counting this way (rather than length(unique(chunks))-1) stays correct
% when no point carries the -1 tag, or when 'chunks' is empty.
ch_num = numel(unique(chunks(chunks ~= -1)));

% nc_inds - indexes of the data points that are not in any chunklet.
nc_inds = find(chunks == -1);

param = cell(k, 3);
param = best_params(data(:,1:end-1), param, 0, 0, chunks, ch_num, nc_inds);

close all;

% The visual display is only available for 2-dimensional data; in that
% case column 3 of 'data' holds the labels.
if dim == 2
    visual_k_means(data(:,1:2), data(:,3), 'The true labeles');
end
% 1. Run regular EM: both constraint arguments are passed as empty.
% The progress message is printed with disp; an unterminated sprintf would
% echo it as "ans = ..." and overwrite the workspace variable 'ans'.
disp('Running regular EM')
[results, ll, groups, z, stam, ef] = mixture_of_gaussians3(data(:,1:end-1), k, ...
    [], [], param, data(:,end), cscmf);

% Record the last entry of the returned z vector as the score of this run.
z_rec(1) = z(end);

% The visual display is only available for 2-dimensional data.
if dim == 2
    visual_k_means(data(:,1:2), groups, 'Regular EM results');
end
% 2. Run positively constrained EM: only 'chunks' is passed, the negative
% constraints argument is empty.
% The progress message is printed with disp; an unterminated sprintf would
% echo it as "ans = ..." and overwrite the workspace variable 'ans'.
disp('Running positively constrained EM')

% Notice that this call to mixture_of_gaussians3 overrides the default value
% of the parameter 'late_oracleP' to 0 (the last parameter in the call). This
% leads to the usage of the 'Early oracle' sampling assumption. Under this
% assumption the EM has a closed form solution and hence a fast implementation.
[results, ll, groups, z, stam, ef] = mixture_of_gaussians3(data(:,1:end-1), k, ...
    chunks, [], param, data(:,end), cscmf, 0);

% Record the last entry of the returned z vector as the score of this run.
z_rec(2) = z(end);

% The visual display is only available for 2-dimensional data.
if dim == 2
    visual_k_means(data(:,1:2), groups, 'Positively constarined EM results');
end
% 3. Run fully constrained EM: both the positive constraints ('chunks') and
% the negative constraints ('anti_chunks') are passed.
% The progress message is printed with disp; an unterminated sprintf would
% echo it as "ans = ..." and overwrite the workspace variable 'ans'.
disp('Running constrained EM')
[results, ll, groups, z, stam, ef] = mixture_of_gaussians3(data(:,1:end-1), k, ...
    chunks, anti_chunks, param, data(:,end), cscmf);

% Record the last entry of the returned z vector as the score of this run.
z_rec(3) = z(end);

% The visual display is only available for 2-dimensional data.
if dim == 2
    visual_k_means(data(:,1:2), groups, 'Fully constrained EM results');
end
% 4. Calculate the 'best' model (the one calculated with full label
% information, taken from the last column of 'data').
% The progress message is printed with disp; an unterminated sprintf would
% echo it as "ans = ..." and overwrite the workspace variable 'ans'.
disp('Computing fully labled model')
[Lresults, LlogLikelihood, Lgroups, Lz] = ...
    calculate_labeled_model(data(:,1:end-1), k, cscmf, data(:,end));

% Record the last entry of the returned Lz vector as the score of this model.
z_rec(4) = Lz(end);

% The visual display is only available for 2-dimensional data.
if dim == 2
    visual_k_means(data(:,1:2), Lgroups, 'Fully labeled model results');
end
% Print the recorded Z scores, one line per method, in the order the
% methods were run above.
fprintf('%s\n', 'Z score results:');
fprintf('%s\n', sprintf('Regular EM : %g', z_rec(1)));
fprintf('%s\n', sprintf('Positively constrained EM : %g', z_rec(2)));
fprintf('%s\n', sprintf('Fully constrained EM : %g', z_rec(3)));
fprintf('%s\n', sprintf('Fully labeled EM : %g', z_rec(4)));