% Source: www.pudn.com > ConstrainedEM.zip > test_constrainedEM.m



% Test mixture_of_gaussians3

% In this file we run the following demo :

%   We randomly create a synthetic data set using a mixture of gaussian sources.
%   We use a fixed number of equivalence queries to produce positive and negative
%   constraints. Then we compare the clustering results of 

%   1. Regular EM
%   2. Positively constrained EM (only positive constraints are used) 
%   3. Fully constrained EM (both positive and negative constraints)
%   4. Fully labeled estimation of the mixture (assumes all the labels are known). 

%   The results are compared in terms of Z-score (combined purity and
%   accuracy).

% By playing with the parameters of the data set creation you can test the
% algorithms' performance in various settings. (Of course, changing the
% dimension parameter will disable the visual display.)


%  Synthetic data creation parameters :

% --- Size of the problem ---
k         = 6;      % how many clusters to generate
dim       = 2;      % dimension of each synthetic data point
data_size = 200;    % how many data points to generate
noc       = 60;     % how many equivalence queries are used

% --- Shape of the generating mixture ---
% cscmf : 0 - the data is created using a different covariance matrix for
%             each model
%         1 - a single covariance matrix is used
cscmf = 0;

% sperability : determines the expected z-score of the best mixture model
%               of the data (the model that generated the data).
sperability = 0.9;

% splitable_data_flag : should the data be made gaussian separable ?
%         0 - no  ( bayes error is bigger than 0 )
%         1 - yes ( bayes error is 0 )
splitable_data_flag = 1;


% Create the data set

create_flag = 1;
while create_flag
    % Draw a fresh synthetic sample and append its labels as the last
    % column of 'data'.
    [results, data, stam, labels, sample_z] = create_mog_data2( ...
        dim, k, data_size, sperability, cscmf);
    data = [data, labels];
    create_flag = 0;

    if splitable_data_flag ~= 1
        break;      % separability was not requested - keep this sample
    end

    % Make the data gaussian splitable: the label column is overwritten
    % with the group assignment returned by the labeled model.
    [Lresults, LlogLikelihood, Lgroups, Lz] = calculate_labeled_model( ...
        data(:,1:end-1), k, cscmf, data(:,end));
    data(:,end) = Lgroups';

    % Redraw the sample when any entry in the second column of pilug's
    % output is below 5 (no model with fewer than 5 points is allowed).
    % NOTE(review): presumably pilug returns a [label, count] table - confirm.
    tmp = pilug(Lgroups);
    create_flag = double(any(tmp(:,2) < 5));
end

% Add constraints by sampling equivalence queries. Positive constraints
% are closed into chunklets and represented in the variable chunks.
% Negative constraints are represented in the variable anti_chunks.
% 'chunks' is a data_size*1 integer list containing the chunklet information. 
% If the i'th place contains -1 -  the point doesn't belong to any chunklet.
% If it contains the tag 'j' the point is in a chunklet with all the other 
% points with the tag 'j'.
% 'anti_chunks' is a (number of negative constraints)*2 table. Each pair is 
% known to be negatively constrained

% Run the constraint-sampling step. The code below relies on it having
% defined 'chunks' and 'anti_chunks' in the workspace.
% NOTE(review): presumably create_constraints is a script - confirm.
create_constraints;

% Set the starting parameters for the EM : all the EM versions will run
% with the same initial conditions.

% ch_num - number of distinct chunklet tags. The tag -1 marks points that
% belong to no chunklet, so it is excluded explicitly rather than by
% subtracting 1 from the count (which is wrong when no -1 tag is present).
ch_num=numel(unique(chunks(chunks~=-1)));
nc_inds=find(chunks==-1);   % nc_inds - non chunkletted data indexes
param=cell(k,3);            % empty k-by-3 cell passed to best_params, which returns the initial parameters
param=best_params(data(:,1:end-1),param,0,0,chunks,ch_num,nc_inds);

% Start from a clean set of figures; the 2-D plot is only drawn for dim==2.
close all;
if dim==2
    visual_k_means(data(:,1:2),data(:,end),'The true labeles' );
end

% 1. run regular EM

   % Regular EM: both constraint arguments are passed as empty ([] , []),
   % so no chunklets and no negative constraints are used.
   % The progress message is printed with disp instead of an unterminated
   % sprintf, which would echo 'ans = ...' and overwrite 'ans'.
   disp('Running regular EM')   % basic
   [results,ll ,groups ,z ,stam, ef]=mixture_of_gaussians3(data(:,1:end-1) ,k,...
       [],[],param, data(:,end), cscmf);
   z_rec(1)=z(end);             % keep the last entry of z as this run's score
   if dim==2
       visual_k_means(data(:,1:2),groups,'Regular EM results');
   end
   
% 2. run positively constrained EM 

   % The progress message is printed with disp instead of an unterminated
   % sprintf, which would echo 'ans = ...' and overwrite 'ans'.
   disp('Running positively constrained EM')
   % Notice that this call to mixture_of_gaussians3 overrides the default value 
   % of the parameter 'late_oracleP' to 0 (the last parameter in the call). This
   % leads to the usage of the 'Early oracle' sampling assumption. Under this
   % assumption the EM has a closed form solution and hence a fast implementation. 
   
   % Only the positive constraints (chunks) are passed; the negative
   % constraints argument is left empty.
   [results,ll ,groups ,z ,stam , ef]=mixture_of_gaussians3(data(:,1:end-1) ,k,...
       chunks,[],param, data(:,end), cscmf,0);
   z_rec(2)=z(end);             % keep the last entry of z as this run's score
   if dim==2
       visual_k_means(data(:,1:2),groups,'Positively constarined EM results');
   end
   
% 3. run fully constrained EM 

   % Fully constrained EM: both the positive constraints (chunks) and the
   % negative constraints (anti_chunks) are passed. The last argument is
   % omitted here, so mixture_of_gaussians3 uses its default for it.
   % The progress message is printed with disp instead of an unterminated
   % sprintf, which would echo 'ans = ...' and overwrite 'ans'.
   disp('Running constrained EM')	% both positive and negative constraints
   [results,ll ,groups ,z ,stam, ef ]=mixture_of_gaussians3(data(:,1:end-1) ,k,...
       chunks,anti_chunks,param, data(:,end), cscmf);
   z_rec(3)=z(end);             % keep the last entry of z as this run's score
   if dim==2
       visual_k_means(data(:,1:2),groups,'Fully constrained EM results');
   end

% 4. calculate the 'best' model (the one calculated with full label information)
   
   % Reference model: estimated by calculate_labeled_model from the data
   % together with its label column (the last column of 'data').
   % The progress message is printed with disp instead of an unterminated
   % sprintf, which would echo 'ans = ...' and overwrite 'ans'.
   disp('Computing fully labled model')
   [ Lresults,LlogLikelihood ,Lgroups ,Lz ]=  ...
     calculate_labeled_model(data(:,1:end-1), k,cscmf,data(:,end));
   z_rec(4)=Lz(end);            % keep the last entry of Lz as this model's score
   if dim==2
       visual_k_means(data(:,1:2),Lgroups,'Fully labeled model results');
   end
   
   

% Print the score recorded for each of the four methods, one per line.
% fprintf with an explicit newline replaces the disp(sprintf(...)) idiom;
% the printed text is the same.
disp('Z score results:')
fprintf('Regular EM : %g\n',z_rec(1));
fprintf('Positively constrained EM : %g\n',z_rec(2));
fprintf('Fully constrained EM : %g\n',z_rec(3));
fprintf('Fully labeled EM : %g\n',z_rec(4));