www.pudn.com > ConstrainedEM.zip > create_mog_data2.m
% create synthetic data for EM - create test and train data sets of the
% same size.
% function parameters :
% dim - the data dimension
% k - the number of models
% n - data size: the function returns test_data of size n and
% train_data of size n
% ro - the required purity accuracy score of the data
% scmf - single covariance matrix flag
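% exp_flag - if 1, transformation matrix entries are drawn ~exp(1). if
% not, they are drawn ~ N(0,1). when 1, the eigenvalues of the inner covariance matrix
% drop more rapidly ( the data lives in a flatter manifold ). defaults to 0
% when fewer than 6 arguments are passed.
% seed, seedN - optional. when all 8 arguments are passed, seed sets the state
% of rand and seedN sets the state of randn; otherwise both generators are
% seeded from the system clock.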
% Randomizes the weights, the covmat directions and scales, and the centers around 0,
% then blows up or shrinks the scale of the centers to achieve the required ro.
% Large ro values are always achievable.
% Small ro values are not always achieved when scmf=0, due to large direction
% differences of the covmats, even when the centers have collapsed into one center.
% Very small ro values aren't achievable at all. A ro of 1/k cannot be achieved due to the
% purity definition, and values close to it aren't achievable if the prior isn't
% uniform.
% after the parameters are chosen, they are kept only if every model calculated by the 'fully labeled' algorithm
% keeps at least a fraction saf*ro of its original points. otherwise, the parameters and data are drawn again.
% if after max_tries tries (currently 10) no good parameter set and data are achieved, -1 is returned in sample_z to indicate failure.
% the arguments returned :
% param - the parameters of the real (metaphysical) mog that created the data
% train_data - an n-sized train data set
% test_data - an n-sized test data set ( from the same distribution )
% train_labels - train_data's true labels
% test_labels - test_data's true labels
% sample_z - the purity-accuracy score of the train data when the fully labeled model is calculated.
% tries_num - the number of times parameters have been chosen before a suitable parameter set was found. used for debugging purposes
% groups - the empirical labeling of the train data as calculated by the fully labeled algorithm.
function [ param, train_data ,test_data ,train_labels , test_labels, sample_z ,tries_num, groups ]=...
create_mog_data2(dim,k,n,ro, scmf,exp_flag,seed,seedN)
% Draws the parameters of a random mixture of Gaussians and returns a train
% set and a test set sampled from it, retrying until the per-model check at
% the bottom of the main loop passes or the retry budget is exhausted.
%
% Optional arguments as handled in this body:
%   exp_flag    - set to 0 when fewer than 6 arguments are passed.
%   seed, seedN - when exactly 8 arguments are passed they are used as the
%                 'state' of rand and randn respectively; otherwise both
%                 generators are seeded from the seconds field of clock.
%
% NOTE(review): the visible body is incomplete. The line that opens the main
% retry loop is fused with a later 'if' (it reads "tries_num0.1"), so the code
% that belongs between them is missing - presumably text between '<' and '>'
% was stripped when the file was extracted. As a consequence centers, param,
% w, sample, labels, groups, sample_z, Round, blow_flag and z_is_low are used
% below without any visible assignment, and tries_num is never visibly
% incremented. Restore the lost section from the original archive before use.
% important parameters :
min_model_size=0.05; % the minimum size of a model
sample_size=n; % sample size for deciding the level of difficulty.
max_round=10; % maximum blow/shrink rounds of the gaussian centers.
% Seed both random generators: explicitly when seed and seedN were supplied,
% otherwise from the clock (element 6 of the clock vector is the seconds).
if (nargin==8)
rand('state',seed);
randn('state',seedN);
else %set rand engines on system clock!!!!
cl= clock;
rand('state',cl(6));
cl1= clock;
randn('state',cl1(6));
end
% Default for exp_flag when the caller omitted it.
if (nargin<6)
exp_flag=0;
end
% Sanity check: fires when k*min_model_size > 1, i.e. k models cannot each
% hold at least min_model_size of the data. The unterminated string
% expression below is displayed as a message.
if 1/k < min_model_size % a possible error
'min_model_size requirement could not be answered . change min_model_size'
% NOTE(review): tuk_tuk is not defined in the visible code; presumably an
% intentionally undefined name used to abort with an error - confirm.
tuk_tuk
end
% delta chosen to assure model size bigger than min_model_size
% delta is not referenced again in the visible code (its use is presumably in
% the missing section). NOTE(review): the denominator is zero, or nearly so,
% when k*min_model_size equals 1, which the check above does not reject.
delta=(k-1)*min_model_size/(1-k*min_model_size);
% while good_mixture_flag is 0, we continue to search for a good mixture.
good_mixture_flag=0;
tries_num=0; % for debugging :count the number of tries.
max_tries=10;
% NOTE(review): the next line is garbled. It starts the retry loop
% (presumably "tries_num < max_tries") and ends as the tail of an 'if' that
% compares something against 0.1 (per its trailing comment, the case where
% sample_z is too high). Everything in between is missing from this view.
while (~good_mixture_flag & tries_num0.1 % sample_z need to be decreased.
% Shrink branch: taken unless z_is_low is 1. Presumably z_is_low==1 marks an
% earlier round that went in the opposite direction, so that the loop stops
% instead of oscillating - confirm against the missing code.
if z_is_low~=1
% Halve the centers.
centers=centers/2;
if scmf==0
% calculate mean covmat
% The k matrices in param(:,2) are stacked, reshaped so that each row holds
% one matrix's entries, averaged across the k rows, and reshaped back to
% dim-by-dim.
% NOTE(review): given column-major reshape order this appears to produce the
% transpose of the element-wise mean (identical only for symmetric
% matrices), and mean(a) collapses to a scalar when k==1 - confirm.
a=cell2mat(param(:,2));
a=reshape(a,dim,k*dim);
a=reshape(a',k,dim*dim);
a=mean(a);
a=reshape(a,dim,dim);
% Move every model's matrix halfway towards the common matrix a.
for j=1:k
param{j,2}=(param{j,2}+a)/2;
end
end
% Record that this round shrank.
z_is_low=-1;
else
% z_is_low is 1: stop adjusting.
blow_flag=0;
end
else
% Neither adjustment condition holds: stop adjusting.
blow_flag=0;
end
% watch out from infinite loop
% Cap the number of adjustment rounds at max_round.
Round=Round+1;
if Round>max_round
blow_flag=0;
end
% This 'end' presumably closes the inner blow/shrink loop whose header is in
% the missing section.
end
% final sample creation
%sample_z
% sample_ezer is defined outside this file. It is called before param{i,2} is
% converted below, so it receives the unconverted matrices.
[test_data , test_labels ]=sample_ezer(centers,param(:,2),w,n); % create the test data
% prepare the param structure
% Row i of param becomes { center i (row of centers), A'*A of the stored
% matrix A, weight w(i) }. Presumably A is the transformation matrix and
% A'*A the covariance matrix - confirm.
for i=1:k
param{i,3}=w(i);
param{i,2}=param{i,2}'*param{i,2};
param{i,1}=centers(i,:);
end
% mixing the data and the labels
% NOTE(review): 'order' is not used anywhere in the visible code, so no
% mixing actually takes place here; the call still consumes random numbers.
order=randperm(n*2);
% The train set is the sample (and labels) produced in the missing section.
train_data=sample;
train_labels=labels;
% check for each real model that the % of points assigned to that model by the empirical
% model is above saf*ro_separability
saf=0.5;
bad_sample_flag=0;
for i=1:k
% Points whose true label is i, and how many of them the empirical labeling
% (groups) also assigns to i.
real_model_points=find(labels==i);
number_of_good_points=length(find(groups(real_model_points)==i));
% Reject the draw as soon as one model keeps less than a fraction saf*ro of
% its points.
if ( number_of_good_points/length(real_model_points) < saf*ro )
bad_sample_flag=1;
break;
end
end
if ~bad_sample_flag
good_mixture_flag=1; % stop the while loop
end
% End of the main retry loop.
end
% Report failure through sample_z when the retry budget was used up.
% NOTE(review): this test does not look at good_mixture_flag, so it would
% also report failure if a good mixture was found exactly on the last
% allowed try - confirm against the missing increment of tries_num.
if tries_num==max_tries
sample_z=-1;
end