www.pudn.com > firev0.01.rar > em.hpp


/*
This file is part of the FIRE -- Flexible Image Retrieval System

FIRE is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

FIRE is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with FIRE; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/
/**
 * @file   em.hpp
 * @author Thomas Deselaers
 * @date   Mon Jun 23 18:57:30 2003
 * 
 * @brief  EM algorithm for clustering
 * 
 * This is an implementation of the EM algorithm for clustering. That
 * means: you put some data in and get some clusters out.
 * 
 */
#ifndef __em_hpp
#define __em_hpp

#include 
#include 
#include 
#include 
#include "basefeature.hpp"
#include "distances.hpp"
#include "diag.hpp"
#include "baseclusterer.hpp"
#include "gaussiandensity.hpp"

using namespace std;
using namespace diag;

template  
class EM :public BaseClusterer {
public:

  EM() {
    poolMode_=noPooling;
    disturbMode_=varianceDisturb;
    splitMode_=allSplit;
    stopWithNClusters_=-1;
  }




 
  /** set poolMode to noPooling, to have variances unpooled. */
  static const int noPooling=0;
  /** set poolMode to ClusterPooling, to have variances for all
      clusters equal, but different in every dimension. */
  static const int clusterPooling=1;
  /** set poolMode to dimensionPooling, to have variances equal for
      all dimensions, but different for every cluster. */
  static const int dimensionPooling=2;
  /** set poolMode to bothPooling, to have variances pooled over
      cluster and dimensions */
  static const int bothPooling=3;
  
  /** set disturbMode to constDisturb to have
      mean_new1=mean_old+epsilon, mean_new2=mean_old-epsilon after
      split.*/ 
  static const int constDisturb=0; 
  /** set disturbMode to meanDisturb to have
      mean_new1=mean_old+epsilon*mean_old, mean_new2=mean_old-epsilon*mean_old after
      split.*/ 
  static const int meanDisturb=1;
  /** set disturbMode to meanDisturb to have
      mean_new1=mean_old+epsilon*sigma, mean_new2=mean_old-epsilon*sigma after
      split.*/ 
  static const int varianceDisturb=2;
  /** set disturbMode to meanDisturb2 to have meandisturb, but switch
      between + and - after half of the entries */
  static const int meanDisturb2=3;
  /** set splitMode to allSplit to split all densities in each split
      iteration */
  static const int allSplit=0;
  /** set splitMode to largestSplit to split only the largest density
      in each split iteration */
  static const int largestSplit=1;
  /** set splitMode to varianceSplit to split only density with
      hightes variance in each split iteration */
  static const int varianceSplit=2;



  

  

  /** 
   * EM algorithm. Main part of this class.
   * 
   * @param inputdata the data to be clustered @param
   * clusterinformation clusternumbers of the inputdata to be
   * returned.
   * 
   * @return nothing useful
   */

  const unsigned int run(const vector < BaseFeature* > &inputdata, vector  &clusterinformation);

  /** set and get the splitMode */
  int &splitMode() {return splitMode_;}


  /** 
   * Set and get the maximum number of splits per call
   * @return a reference to the maximum number of splits per call
   */
  unsigned int & maxSplits() {return maxSplits_;}

  /** don't split any further if this number of clusters is reached */
  int & stopWithNClusters() {return stopWithNClusters_;}

  /**
   * to set disturb mode.
   * 
   * this controls how means are disturbed. See constDisturb,
   * meanDisturb, varianceDisturb for further details.
   */
  int & disturbMode() {return disturbMode_;}

  /** to set pooling mode.
   *
   * this controls how variances are pooled. See noPooling,
   * clusterPooling, dimensionPooling and bothPooling for further
   * details.
   */
  int & poolMode() {return poolMode_;}
  
  /** to set pooling mode.
   *
   * this controls how variances are pooled. See noPooling,
   * clusterPooling, dimensionPooling and bothPooling for further
   * details. Here strings (written like the name of the consts are
   * required). The number of the poolingMode ist returned.
   */
  const int & poolMode(const ::std::string poolModeString);
  const int &disturbMode(const ::std::string disturbModeString);
  const int & splitMode(const ::std::string splitModeString);

  


  int &dontSplitBelow() { return dontSplitBelow_;}
  const int & dontSplitBelow() const {return dontSplitBelow_;}

  
  /** 
   * Set and get the number of iterations between two splits.
   * @return  a reference to the number of iterations between two splits.
   */
  unsigned int & iterationsBetweenSplits() {return iterationsBetweenSplits_;}

  /** 
   * Set and get the minimum number of observations per cluster to be
   * not deleted. If there are less observations, that cluster will be
   * deleted before reestimation, because it obvously is not possible
   * to estimate e.g. variance with only one observation.
   * 
   * @return a reference to the min. number of observations per
   * cluster.
   */
  unsigned int & minObservationsPerCluster() {return minObservationsPerCluster_; }
  
  /** 
   * Set and get the epsilon which is used to disturb the means before
   * reestimation.
   * 
   * 
   * @return a reference to epsilon.
   */
  double & epsilon() {return epsilon_;}
  void saveClusters(string filename) const;
  
  /** 
   * Set and get the distance function to be used for the clustering.
   * 
   * @return a reference to the distance function object.
   */
  BaseDist*& dist() {return dist_;}
  
  const GaussianDensity &cluster(int i) const {return clusters[i];}

private:
  
  void printClusters(vector &clusters);
  void printCluster(const GaussianDensity &cls);
  GaussianDensity getDensity(const vector* > &inputdata, vector clusterinformation, int cluster);
  void unzeroSigma(GaussianDensity &gd);
  int classify(BaseFeature* aktEl, vector clusters);
  void poolVariances(vector &clusters);
  

  int splitMode_;
  int poolMode_;
  int dontSplitBelow_;
  int disturbMode_;
  unsigned int maxSplits_;
  unsigned int iterationsBetweenSplits_;
  unsigned int minObservationsPerCluster_;
  int stopWithNClusters_;
  double epsilon_;
  BaseDist *dist_;
  vector< GaussianDensity > clusters;
};

template  
const unsigned int EM::run(const vector < BaseFeature* > &inputdata, vector  &clusterinformation) {
  DBG(DBG_VERBOSE) << " Starting " << endl;
    
  //base variables
  unsigned int dim=inputdata[0]->size();

  clusters=vector();
  GaussianDensity tmpCluster;
    
  //init data
  clusterinformation=vector(inputdata.size(),0);
    
  //init clustering
  //     tmpCluster.mean=getMean(inputdata,clusterinformation,0);
  //     tmpCluster.sigma=getSigma(inputdata,tmpCluster.mean,clusterinformation,0);
  tmpCluster=getDensity(inputdata,clusterinformation,0);
  //    unzeroSigma(tmpCluster);
    
    
  DBG(DBG_VERBOSE) << " Init distribution estimated" << endl;
  //-----------------------------------------------
  //iteration
  clusters.push_back(tmpCluster);
  poolVariances(clusters);
  DBGI(DBG_VERBOSE,printClusters(clusters));
    
  for(unsigned int split=0;split=0;--i) {
          tmpCluster=GaussianDensity(dim);
          if(int(clusters[i].elements)>dontSplitBelow_) {
            DBG(DBG_MESSAGE) << "Splitting cluster with " << clusters[i].elements << " elements."<< endl;
            for(unsigned int j=0;jmaxSize) {
            maxSize=clusters[i].elements;
            maxCl=i;
          }
        }
        if(maxCl==-1) {
          DBG(DBG_MESSAGE) << "STRANGE: No largest cluster found" << endl;
        } else {
          tmpCluster=GaussianDensity(dim);
          for(unsigned int j=0;jmaxVar) {
            maxVar=tmpVar;
            maxCl=i;
          }
        }
        if(maxCl==-1) {
          DBG(DBG_MESSAGE) << "STRANGE: No highest variance cluster found" << endl;
        } else {
          tmpCluster=GaussianDensity(dim);
          for(unsigned int j=0;j::iterator aktClust=clusters.begin();aktClustelements << " entries." << endl;
        }
        
        
        //check whether there are any clusters which are to small
        DBG(DBG_VERBOSE) << "Deleting to small clusters" << endl;
        
        
        vector::iterator deleteIt;
        vector::iterator aktClust=clusters.begin();
        
        int aktClustNo=0;
        
        //        printClusters(clusters);
        
        
        while(aktClustelements << " entries." << endl;
          
          if (aktClust->elements< minObservationsPerCluster_) {
            DBG(DBG_MESSAGE) << "deleting cluster with "<elements<< " elements."<< endl;
            deleteIt=aktClust;
            
            // temporary outputting clusters being deleted:
            
            clusters.erase(deleteIt);
            
            for(unsigned int i=0;iaktClustNo) {
                --clusterinformation[i];
              }
            }
            aktClustNo=0;
            aktClust=clusters.begin(); //HELP: muss das wirkich sein?
          } else {
            ++aktClust;
            ++aktClustNo;
          }
        }
        nOfClusters=clusters.size();
        DBG(DBG_VERBOSE) << "Now having " << nOfClusters << "clusters." << endl;
        
        DBG(DBG_VERBOSE) << "Reestimating" << endl;
        
        
        for(unsigned int i=0;i
const int & EM::splitMode(const ::std::string splitModeString) {
  if(splitModeString=="allSplit") {
    splitMode_=allSplit;
  } else if(splitModeString =="largestSplit") {
    splitMode_=largestSplit;
  } else if(splitModeString =="varianceSplit") {
    splitMode_=varianceSplit;
  } else {
    ERR << "cannot understand splitModeString: '" << splitModeString << "'." << endl;
  }
  return splitMode_;
}


template  
const int & EM::poolMode(const ::std::string poolModeString) {
  if(poolModeString=="noPooling") {
    poolMode_=noPooling;
  } else if( poolModeString=="clusterPooling") {
    poolMode_=clusterPooling;
  } else if( poolModeString=="dimensionPooling") {
    poolMode_=dimensionPooling;
  } else if( poolModeString=="bothPooling") {
    poolMode_=bothPooling;
  } else {
    ERR << "Cannot understand poolModeString: " << poolModeString << endl;
  }
  return poolMode_;
}
template  
const int &EM::disturbMode(const ::std::string disturbModeString) {
  if(disturbModeString=="constDisturb") {
    disturbMode_=constDisturb;
  } else if(disturbModeString=="meanDisturb") {
    disturbMode_=meanDisturb;
  } else if(disturbModeString=="meanDisturb2") {
    disturbMode_=meanDisturb2;
  } else if(disturbModeString=="varianceDisturb") {
    disturbMode_=varianceDisturb;
  } else {
    ERR << "Cannot understand disturbModeString: " << disturbModeString << endl;
  }
  return disturbMode_;
}

template  
void EM::saveClusters(string filename) const {
  ofstream ofs(filename.c_str());
  ofs << clusters.size() << " " << clusters[0].mean.size() << endl;
  for(unsigned int i=0;i 
void EM::printClusters(vector &clusters) {
  for(unsigned int i=0;i 
void EM::printCluster(const GaussianDensity &cls) {
  cout << "cluster:" << endl
       << "   elements: " << cls.elements << endl
       << "   mean:";
  for(unsigned int j=0;j   
GaussianDensity EM::getDensity(const vector* > &inputdata, vector clusterinformation, int cluster) {
  unsigned int nOfelements=0;
  unsigned int dim=inputdata[0]->size();
    
  GaussianDensity result;
  result.mean=vector(dim,0);
  result.sigma=vector(dim,0);
  BaseFeature *aktEl;
  for(unsigned int i=0;i 
void EM::unzeroSigma(GaussianDensity &gd) {
  double minSigma=numeric_limits::max();
  int dim=gd.sigma.size();
  for(int i=0;i 
int EM::classify(BaseFeature* aktEl, vector clusters) {
  double minDist=numeric_limits::max();
  int bestCluster=-1;
  double aktDist;
    
  for(unsigned int j=0;j(dist_)) {
      aktDist=dynamic_cast(dist_)->dist(aktEl,clusters[j].mean,clusters[j].sigma);
    } else {
      aktDist=dist_->dist(aktEl,clusters[j].mean);
    }
    
    if(isnan(aktDist)) {
      ERR << "isnan(aktDist)" << endl
          << *aktEl << endl
          << clusters[j] << endl;
    }
    
    DBG(DBG_VERBOSE) << "aktdist " << aktDist << endl;
    if (aktDist