% Source: www.pudn.com > ConstrainedEM.zip > create_mog_data2.m


% Create synthetic data for EM: creates a train data set and a test data set
% of the same size.

% function parameters :

% dim 	- the data dimension
% k		- the number of models
% n		- data size; the function returns test_data of size n and
%                 train_data of size n
% ro		- the required purity-accuracy score of the data
% scmf	- single covariance matrix flag
% exp_flag - if 1, transformation matrix entries are drawn ~exp(1); if
% not, they are drawn ~N(0,1). When 1, the eigenvalues of the inner covariance matrix
% drop more rapidly (the data lives in a flatter manifold). Defaults to 0
% when fewer than 6 arguments are given.
% seed, seedN - optional; when all 8 arguments are given they seed the rand
% and randn generators respectively, otherwise the system clock is used.

% Randomizes the weights, the covariance-matrix directions and scales, and the centers around 0,
% then blows up or shrinks the scale of the centers to achieve the required ro.
% Large ro values are always achievable.
% Small ro values are not always achieved when scmf=0, due to large direction
% differences between the covariance matrices, even when the centers have collapsed into one center.
% Very small ro values are not achievable at all. A ro of 1/k cannot be achieved due to the
% purity definition, and values close to it are not achievable if the prior is not
% uniform.

% After the parameters are chosen, they are kept only if every model calculated by the 'fully labeled' algorithm
% keeps at least a fraction saf*ro of its original points. Otherwise, the parameters and data are drawn again.
% If no good parameter set and data are found after max_tries tries (max_tries is 10 in the code below), -1 is returned in sample_z to indicate failure.

% the arguments returned :

% param - the parameters of the real (metaphysical) MoG that created the data
% train_data - train data of size n
% test_data - test data of size n (from the same distribution)
% train_labels - the true labels of train_data
% test_labels - the true labels of test_data
% sample_z - the purity-accuracy score of the train data when the fully labeled model is calculated.
% tries_num - the number of times parameters were chosen before a suitable parameter set was found. Used for debugging purposes.
% groups - the empirical labeling of the train data as calculated by the fully labeled algorithm.


function [ param, train_data ,test_data ,train_labels , test_labels, sample_z ,tries_num, groups ]=...
   create_mog_data2(dim,k,n,ro, scmf,exp_flag,seed,seedN)
% CREATE_MOG_DATA2  Draw mixture-of-Gaussians parameters plus train and test samples.
%
% Inputs (as described in the file header):
%   dim      - data dimension
%   k        - number of models
%   n        - size of each of the returned train and test sets
%   ro       - required purity-accuracy score
%   scmf     - single covariance matrix flag
%   exp_flag - optional, set to 0 below when fewer than 6 arguments are given
%   seed     - optional, seeds rand  (used only when all 8 arguments are given)
%   seedN    - optional, seeds randn (used only when all 8 arguments are given)
%
% Outputs:
%   param        - k-row cell array filled at the end of each try:
%                  column 1 = center, column 2 = covariance, column 3 = weight
%   train_data, train_labels - copied from the variables sample and labels
%   test_data, test_labels   - drawn by sample_ezer with the final parameters
%   sample_z     - set to -1 at the end when tries_num equals max_tries
%   tries_num    - counter of parameter draws
%   groups       - empirical labeling used in the acceptance check below
%
% NOTE(review): the while line in the middle of this function is corrupted in
% this copy of the file (see the note placed directly above it). The code that
% assigns centers, w, param, sample, labels, groups, sample_z, z_is_low,
% blow_flag and Round is therefore not visible, and the file will not parse
% until that span is restored from the original source.
 
% important parameters :

min_model_size=0.05;			% the minimum size of a model
sample_size=n;				% sample size for deciding the level of difficulity.
max_round=10;               % maximum blow/shrink rounds of the gaussian centers.

% Seeding: with all 8 arguments the caller controls both generators through
% the legacy state-style seeding syntax; otherwise both are seeded from the
% sixth element of clock, which is the seconds field.
if (nargin==8)
  rand('state',seed);
  randn('state',seedN);

else %set rand engines on system clock!!!!
  
  cl= clock;
  rand('state',cl(6));

  cl1= clock;
  randn('state',cl1(6));

end


% exp_flag is optional and defaults to 0.
if (nargin<6)
  exp_flag=0;
end

% With k models at least one model has weight <= 1/k, so the minimum size
% cannot be met when 1/k is below min_model_size.
% The message is displayed because the string expression has no semicolon.
% NOTE(review): tuk_tuk is not defined anywhere in the visible source;
% presumably it is an intentionally undefined name used to abort with an
% error. Confirm before replacing it with an explicit error call.
if 1/k < min_model_size 	% a possible error
   'min_model_size requirement could not be answered . change min_model_size'
   tuk_tuk
end
% delta chosen to assure model size bigger than min_model_size
% (its use is in the part of the function that is not visible here)
delta=(k-1)*min_model_size/(1-k*min_model_size);	

% while good_mixture_flag is 0, we continue to search for a good mixture.
good_mixture_flag=0;
tries_num=0;    % for debugging: count the number of tries.
max_tries=10;

% NOTE(review): the next line is corrupted. The loop condition stops in the
% middle of an expression (tries_num runs straight into 0.1), so everything
% between a less-than sign and a later greater-than sign appears to have been
% lost when the file was extracted. The missing span presumably holds the rest
% of the while condition, the drawing of the parameters and the sample, the
% start of the inner blow/shrink loop and the branch that enlarges the
% centers. Restore it from the original source; do not guess.
while (~good_mixture_flag & tries_num0.1	% sample_z need to be decreased. 
            % Shrink step, skipped if z_is_low equals 1; in that case the
            % blow/shrink loop is stopped instead (else branch below).
            if z_is_low~=1
                centers=centers/2;
                if scmf==0
                    % calculate mean covmat
                    % The reshapes lay the k matrices of param(:,2) out as
                    % the rows of a k-by-dim^2 matrix so mean can average them.
                    % NOTE(review): tracing the reshapes, each row holds one
                    % matrix in row-major order, so the final column-major
                    % reshape yields the TRANSPOSE of the element-wise mean.
                    % That is harmless only if the matrices are symmetric;
                    % param{i,2} is turned into a product with its own
                    % transpose only later, at the end of the try. Confirm.
                    a=cell2mat(param(:,2));
                    a=reshape(a,dim,k*dim);
                    a=reshape(a',k,dim*dim);
                    a=mean(a);
                    a=reshape(a,dim,dim);
                    % pull every matrix halfway toward the averaged matrix
                    for j=1:k
                        param{j,2}=(param{j,2}+a)/2;
                    end
                end
                z_is_low=-1;
            else
                blow_flag=0;
            end
        else
            blow_flag=0;
        end
        
        % watch out from infinite loop
        Round=Round+1;
        if Round>max_round
            blow_flag=0;
        end
        
    end
    
    
    % final sample creation  
    %sample_z
    % sample_ezer is defined outside this file section; it is called with the
    % centers, the second column of param, the weights w and the size n.
    [test_data , test_labels ]=sample_ezer(centers,param(:,2),w,n);   % create the test data
    % prepare the param structure
    % column 3 = weight, column 2 = matrix transposed times itself,
    % column 1 = center (row i of centers)
    for i=1:k
        param{i,3}=w(i);
        param{i,2}=param{i,2}'*param{i,2};
        param{i,1}=centers(i,:);
    end
    
    % mixing the data and the labels
    % NOTE(review): order is never used in the visible code, so no mixing
    % actually happens here; the call still advances the random stream.
    order=randperm(n*2);
        
    train_data=sample;
    train_labels=labels;
    
    % check for each real model that the fraction of points assigned to that model by the empirical 
    % model is above saf*ro_seperability
    
    saf=0.5;
    bad_sample_flag=0;
    for i=1:k
        % indices of the points whose true label is i
        real_model_points=find(labels==i);
        % how many of those points the empirical labeling also puts in model i
        number_of_good_points=length(find(groups(real_model_points)==i));
        
        % reject this draw as soon as one model keeps too few of its points
        if  ( number_of_good_points/length(real_model_points) < saf*ro )
            bad_sample_flag=1;
            break;
        end
    end
    if ~bad_sample_flag
        good_mixture_flag=1;    % stop the while loop
    end
end

% Failure marker: sample_z is overwritten with -1 when the try counter
% reached max_tries.
% NOTE(review): this test does not look at good_mixture_flag, so a success on
% the very last allowed try may also be reported as a failure. This cannot be
% confirmed here because the loop condition and the tries_num increment are
% in the corrupted span.
if tries_num==max_tries
    sample_z=-1;
end