www.pudn.com > firev0.01.rar > em.hpp
/* This file is part of the FIRE -- Flexible Image Retrieval System FIRE is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. FIRE is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with FIRE; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ /** * @file em.hpp * @author Thomas Deselaers * @date Mon Jun 23 18:57:30 2003 * * @brief EM algorithm for clustering * * This is an implementation of the EM algorithm for clustering. That * means: you put some data in and get some clusters out. * */ #ifndef __em_hpp #define __em_hpp #include#include #include #include #include "basefeature.hpp" #include "distances.hpp" #include "diag.hpp" #include "baseclusterer.hpp" #include "gaussiandensity.hpp" using namespace std; using namespace diag; template class EM :public BaseClusterer { public: EM() { poolMode_=noPooling; disturbMode_=varianceDisturb; splitMode_=allSplit; stopWithNClusters_=-1; } /** set poolMode to noPooling, to have variances unpooled. */ static const int noPooling=0; /** set poolMode to ClusterPooling, to have variances for all clusters equal, but different in every dimension. */ static const int clusterPooling=1; /** set poolMode to dimensionPooling, to have variances equal for all dimensions, but different for every cluster. */ static const int dimensionPooling=2; /** set poolMode to bothPooling, to have variances pooled over cluster and dimensions */ static const int bothPooling=3; /** set disturbMode to constDisturb to have mean_new1=mean_old+epsilon, mean_new2=mean_old-epsilon after split.*/ static const int constDisturb=0; /** set disturbMode to meanDisturb to have mean_new1=mean_old+epsilon*mean_old, mean_new2=mean_old-epsilon*mean_old after split.*/ static const int meanDisturb=1; /** set disturbMode to meanDisturb to have mean_new1=mean_old+epsilon*sigma, mean_new2=mean_old-epsilon*sigma after split.*/ static const int varianceDisturb=2; /** set disturbMode to meanDisturb2 to have meandisturb, but switch between + and - after half of the entries */ static const int meanDisturb2=3; /** set splitMode to allSplit to split all densities in each split iteration */ static const int allSplit=0; /** set splitMode to largestSplit to split only the largest density in each split iteration */ static const int largestSplit=1; /** set splitMode to varianceSplit to split only density with hightes variance in each split iteration */ static const int varianceSplit=2; /** * EM algorithm. Main part of this class. * * @param inputdata the data to be clustered @param * clusterinformation clusternumbers of the inputdata to be * returned. * * @return nothing useful */ const unsigned int run(const vector < BaseFeature * > &inputdata, vector &clusterinformation); /** set and get the splitMode */ int &splitMode() {return splitMode_;} /** * Set and get the maximum number of splits per call * @return a reference to the maximum number of splits per call */ unsigned int & maxSplits() {return maxSplits_;} /** don't split any further if this number of clusters is reached */ int & stopWithNClusters() {return stopWithNClusters_;} /** * to set disturb mode. * * this controls how means are disturbed. See constDisturb, * meanDisturb, varianceDisturb for further details. */ int & disturbMode() {return disturbMode_;} /** to set pooling mode. * * this controls how variances are pooled. See noPooling, * clusterPooling, dimensionPooling and bothPooling for further * details. */ int & poolMode() {return poolMode_;} /** to set pooling mode. * * this controls how variances are pooled. See noPooling, * clusterPooling, dimensionPooling and bothPooling for further * details. Here strings (written like the name of the consts are * required). The number of the poolingMode ist returned. */ const int & poolMode(const ::std::string poolModeString); const int &disturbMode(const ::std::string disturbModeString); const int & splitMode(const ::std::string splitModeString); int &dontSplitBelow() { return dontSplitBelow_;} const int & dontSplitBelow() const {return dontSplitBelow_;} /** * Set and get the number of iterations between two splits. * @return a reference to the number of iterations between two splits. */ unsigned int & iterationsBetweenSplits() {return iterationsBetweenSplits_;} /** * Set and get the minimum number of observations per cluster to be * not deleted. If there are less observations, that cluster will be * deleted before reestimation, because it obvously is not possible * to estimate e.g. variance with only one observation. * * @return a reference to the min. number of observations per * cluster. */ unsigned int & minObservationsPerCluster() {return minObservationsPerCluster_; } /** * Set and get the epsilon which is used to disturb the means before * reestimation. * * * @return a reference to epsilon. */ double & epsilon() {return epsilon_;} void saveClusters(string filename) const; /** * Set and get the distance function to be used for the clustering. * * @return a reference to the distance function object. */ BaseDist*& dist() {return dist_;} const GaussianDensity &cluster(int i) const {return clusters[i];} private: void printClusters(vector &clusters); void printCluster(const GaussianDensity &cls); GaussianDensity getDensity(const vector * > &inputdata, vector clusterinformation, int cluster); void unzeroSigma(GaussianDensity &gd); int classify(BaseFeature * aktEl, vector clusters); void poolVariances(vector &clusters); int splitMode_; int poolMode_; int dontSplitBelow_; int disturbMode_; unsigned int maxSplits_; unsigned int iterationsBetweenSplits_; unsigned int minObservationsPerCluster_; int stopWithNClusters_; double epsilon_; BaseDist *dist_; vector< GaussianDensity > clusters; }; template const unsigned int EM ::run(const vector < BaseFeature * > &inputdata, vector &clusterinformation) { DBG(DBG_VERBOSE) << " Starting " << endl; //base variables unsigned int dim=inputdata[0]->size(); clusters=vector (); GaussianDensity tmpCluster; //init data clusterinformation=vector (inputdata.size(),0); //init clustering // tmpCluster.mean=getMean(inputdata,clusterinformation,0); // tmpCluster.sigma=getSigma(inputdata,tmpCluster.mean,clusterinformation,0); tmpCluster=getDensity(inputdata,clusterinformation,0); // unzeroSigma(tmpCluster); DBG(DBG_VERBOSE) << " Init distribution estimated" << endl; //----------------------------------------------- //iteration clusters.push_back(tmpCluster); poolVariances(clusters); DBGI(DBG_VERBOSE,printClusters(clusters)); for(unsigned int split=0;split =0;--i) { tmpCluster=GaussianDensity(dim); if(int(clusters[i].elements)>dontSplitBelow_) { DBG(DBG_MESSAGE) << "Splitting cluster with " << clusters[i].elements << " elements."<< endl; for(unsigned int j=0;j maxSize) { maxSize=clusters[i].elements; maxCl=i; } } if(maxCl==-1) { DBG(DBG_MESSAGE) << "STRANGE: No largest cluster found" << endl; } else { tmpCluster=GaussianDensity(dim); for(unsigned int j=0;j maxVar) { maxVar=tmpVar; maxCl=i; } } if(maxCl==-1) { DBG(DBG_MESSAGE) << "STRANGE: No highest variance cluster found" << endl; } else { tmpCluster=GaussianDensity(dim); for(unsigned int j=0;j ::iterator aktClust=clusters.begin();aktClust elements << " entries." << endl; } //check whether there are any clusters which are to small DBG(DBG_VERBOSE) << "Deleting to small clusters" << endl; vector ::iterator deleteIt; vector ::iterator aktClust=clusters.begin(); int aktClustNo=0; // printClusters(clusters); while(aktClust elements << " entries." << endl; if (aktClust->elements< minObservationsPerCluster_) { DBG(DBG_MESSAGE) << "deleting cluster with "< elements<< " elements."<< endl; deleteIt=aktClust; // temporary outputting clusters being deleted: clusters.erase(deleteIt); for(unsigned int i=0;i aktClustNo) { --clusterinformation[i]; } } aktClustNo=0; aktClust=clusters.begin(); //HELP: muss das wirkich sein? } else { ++aktClust; ++aktClustNo; } } nOfClusters=clusters.size(); DBG(DBG_VERBOSE) << "Now having " << nOfClusters << "clusters." << endl; DBG(DBG_VERBOSE) << "Reestimating" << endl; for(unsigned int i=0;i const int & EM ::splitMode(const ::std::string splitModeString) { if(splitModeString=="allSplit") { splitMode_=allSplit; } else if(splitModeString =="largestSplit") { splitMode_=largestSplit; } else if(splitModeString =="varianceSplit") { splitMode_=varianceSplit; } else { ERR << "cannot understand splitModeString: '" << splitModeString << "'." << endl; } return splitMode_; } template const int & EM ::poolMode(const ::std::string poolModeString) { if(poolModeString=="noPooling") { poolMode_=noPooling; } else if( poolModeString=="clusterPooling") { poolMode_=clusterPooling; } else if( poolModeString=="dimensionPooling") { poolMode_=dimensionPooling; } else if( poolModeString=="bothPooling") { poolMode_=bothPooling; } else { ERR << "Cannot understand poolModeString: " << poolModeString << endl; } return poolMode_; } template const int &EM ::disturbMode(const ::std::string disturbModeString) { if(disturbModeString=="constDisturb") { disturbMode_=constDisturb; } else if(disturbModeString=="meanDisturb") { disturbMode_=meanDisturb; } else if(disturbModeString=="meanDisturb2") { disturbMode_=meanDisturb2; } else if(disturbModeString=="varianceDisturb") { disturbMode_=varianceDisturb; } else { ERR << "Cannot understand disturbModeString: " << disturbModeString << endl; } return disturbMode_; } template void EM ::saveClusters(string filename) const { ofstream ofs(filename.c_str()); ofs << clusters.size() << " " << clusters[0].mean.size() << endl; for(unsigned int i=0;i void EM ::printClusters(vector &clusters) { for(unsigned int i=0;i void EM ::printCluster(const GaussianDensity &cls) { cout << "cluster:" << endl << " elements: " << cls.elements << endl << " mean:"; for(unsigned int j=0;j GaussianDensity EM ::getDensity(const vector * > &inputdata, vector clusterinformation, int cluster) { unsigned int nOfelements=0; unsigned int dim=inputdata[0]->size(); GaussianDensity result; result.mean=vector (dim,0); result.sigma=vector (dim,0); BaseFeature *aktEl; for(unsigned int i=0;i void EM ::unzeroSigma(GaussianDensity &gd) { double minSigma=numeric_limits ::max(); int dim=gd.sigma.size(); for(int i=0;i int EM ::classify(BaseFeature * aktEl, vector clusters) { double minDist=numeric_limits ::max(); int bestCluster=-1; double aktDist; for(unsigned int j=0;j (dist_)) { aktDist=dynamic_cast (dist_)->dist(aktEl,clusters[j].mean,clusters[j].sigma); } else { aktDist=dist_->dist(aktEl,clusters[j].mean); } if(isnan(aktDist)) { ERR << "isnan(aktDist)" << endl << *aktEl << endl << clusters[j] << endl; } DBG(DBG_VERBOSE) << "aktdist " << aktDist << endl; if (aktDist