// www.pudn.com > firev0.01.rar > clustertest.cpp


// Warning!!!!
// this program is programmed in an absolutely horrible manner.
// Take care, reading this could seriously ruin your day
//
// and it is getting worse and worse
//

/**
 * @file   clustertest.cpp
 * @author Thomas Deselaers
 * @date   Mon Jun 23 18:52:16 2003
 * 
 * @brief  program to cluster data.
 *
 * Initially this program was meant only to test some clustering
 * algorithms, now it is my only clustering program, because its
 * functionality grew and grew and grew. Everything can be changed by
 * command line parameters. Hopefully the documentation is up to date.
 * 
 */
#include 
#include 
#include 
#include 
#include 

#include "filelist.hpp"

#include "basefeature.hpp"
#include "genericfeature.hpp"
#include "mdhistogram.hpp"
#include "colorpixel.hpp"

#include "baseclusterer.hpp"
#include "em.hpp"
#include "kmeans.hpp"
#include "image.hpp"
#include "gzstream.hpp"
#include "getpot.hpp"
#include "loadafeature.hpp"
#include "makeadistance.hpp"
using namespace std;
using namespace img;

/// Rearrange the information in `sorted` so that, within each cluster, the
/// cluster members most similar to the cluster center come first.
///
/// NOTE(review): the text of this region is damaged. It looks as if every
/// '<' that was directly followed by an identifier swallowed everything up to
/// the next '>' (template arguments, loop conditions, whole statements). As a
/// result the end of rearrangeClusters(), the beginning of a following
/// analysis routine and the beginning of a feature reader (presumably the
/// readJF() that main() calls) are missing and the three definitions run
/// into each other. Restore this region from a pristine copy of the file
/// before building or modifying it; the comments below only describe what
/// the surviving statements show.
//
// Original author's remark: this function is awkward because of the way the
// data is stored.
void rearrangeClusters(const vector< BaseFeature* > & database, 
                       vector< pair > &sorted,  
                       BaseClusterer* cluster,
                       FileList filelist) {
  
  // Determine the number of clusters from the cluster indices stored in
  // sorted[i].first (presumably largest index + 1; the loop condition is
  // lost on the next line).
  int anzCluster=0;
  for(unsigned int i=0;ianzCluster) anzCluster=sorted[i].first+1;}

  vector< pair > dists;

  // Process one cluster at a time.
  int number; 
  double dist;
  int actStart, actObservation=0, actDist;
  // NOTE(review): damaged line - the loop header, the declaration of
  // `density` and the first dynamic_cast test are fused together.
  // What survives: the density of the current cluster is fetched through
  // cluster(actCluster) on either an EM or a kMeans clusterer; any other
  // clusterer type is reported through ERR.
  for(int actCluster=0;actCluster* >(cluster)) {
      density=dynamic_cast< EM* >(cluster)->cluster(actCluster);
    } else if(dynamic_cast< kMeans* >(cluster)) {
      density=dynamic_cast< kMeans* >(cluster)->cluster(actCluster);
    } else {
      ERR << "Not possible!" << endl;
  }
    // Start with an empty distance list for this cluster.
    dists=vector >(0);
    
    // Collect the distances from the cluster members to the cluster center
    // (density.mean). actStart remembers where this cluster begins in sorted.
    actStart=actObservation;
    // NOTE(review): damaged line - the bound check on actObservation, the
    // computation of `idx` and the start of the distance call are lost.
    while(sorted[actObservation].first==actCluster && actObservationdist()->dist(database[idx],density.mean);
      dists.push_back(pair(dist,idx));
      //      cout << actObservation << endl;
      ++actObservation;
    }
    
    // The pairs were pushed as (dist, idx), so sorting orders them by
    // distance first.
    sort(dists.begin(),dists.end());
    
    actDist=0;

    // NOTE(review): damaged line - the rest of rearrangeClusters() and the
    // name and leading parameters of the next function are lost. From here
    // on the code belongs to an analysis routine that receives `sorted` and
    // reads class labels through fileList.cls(i).
    for(int i=actStart;i > &sorted) {
  DBG(DBG_MESSAGE) << "starting" << endl;
  int anzCluster=0, anzClasses=0;

  // Count clusters and classes (loop conditions lost), then force both
  // counts to be at least 1.
  for(unsigned int i=0;ianzCluster) anzCluster=sorted[i].first+1;}
  for(unsigned int i=0;ianzClasses) anzClasses=fileList.cls(i)+1;}
  anzCluster=MAX(1,anzCluster);
  anzClasses=MAX(1,anzClasses);

  // One set per class and one set per cluster.
  vector< set > classSet(anzClasses, set() );
  vector< set > clusterSet(anzCluster, set() );

  // NOTE(review): damaged line - the loop that presumably fills the sets is
  // lost; what survives is the declaration of an anzCluster x anzClasses
  // contingency table initialised with 0.
  for(unsigned int i=0;i > contingencyTable(anzCluster, vector(anzClasses,0));

  set tmp_set;
  double ent_sum1=0.0, ent_sum2=0.0;
  double tmp;
  
  DBG(DBG_MESSAGE) << "init done" << endl;
  // NOTE(review): the next three lines are damaged; only fragments of the
  // loops over i and j remain.
  for(int i=0;i::max();;
    for(int j=0;j::max();
    for(int i=0;i E1(fileList.size(),0.0);
  vector E2(fileList.size(),0.0);
  
  int aktClass, aktCluster=0;
  // NOTE(review): damaged line - the remainder of the analysis routine and
  // the beginning of the feature reader are lost. The surviving tail reads
  // the number of classes and the feature dimension from a stream.
  for(unsigned int n=0;n> nOfClasses >> dim;
    //    cout << "Will find " << nOfClasses << "classes " << endl;
    DBG(DBG_MESSAGE) << "Reading features with size " << dim << "." << endl;
    // Read records until end of file or until a class label of -1 is seen.
    while(!ifs.eof() && cls != -1) {

      ifs >> cls ;
      if(!ifs.eof() && cls != -1) {
        // One GenericFeature constructed with dim per record; its
        // components are read one value at a time (loop bound lost on the
        // next-but-one line).
        img=new GenericFeature(dim);
        for(int x=0;x> tmp;
          img->operator[](x)=double(tmp);
        }
        result.push_back(img);
      }
    }
  }
  DBG(DBG_MESSAGE) << result.size() << " images read." << endl;
  return result;
}


/// Fill `database` with one combined feature per file list entry.
///
/// Any features already stored in `database` are deleted and the vector is
/// cleared before loading. For every entry a new GenericFeature is created
/// empty and the components of each feature loaded for that entry are
/// appended to it, so the stored feature is the concatenation of the
/// per-suffix features.
///
/// @param fileList  list of the files to load
/// @param database  receives the newly allocated features (owning raw pointers)
/// @param suffices  suffices to use; when empty a replacement list is built
///                  (presumably from the file list - the code is lost)
/// @return number of features in `database` after loading
///
/// NOTE(review): several lines of this function are damaged (text between a
/// '<' followed by an identifier and the next '>' is missing); they are
/// marked below. Restore them from a pristine copy before building.
int loadFileList(const FileList fileList, vector< BaseFeature* > & database, vector suffices=vector()) {
  DBG(DBG_VERBOSE) << "starting" << endl;
  string fn;

  vector sufficesToUse(0);
  if(suffices.size()==0) {
    // NOTE(review): damaged line - the body of this branch, the else branch
    // and the head of the clean-up loop are fused. What survives is the loop
    // that deletes every feature currently held in database.
    for(unsigned int i=0;i* >::iterator i=database.begin(); i< database.end();++i) {
    vector* >::iterator j;
    j=i;
    delete *j;
  }
  database.clear();
  
  
  DBG(DBG_MESSAGE) << "suffices we are actually using: ";
  // NOTE(review): damaged line - the loop printing the suffices and the head
  // of the per-entry loop are lost. `feat` is the combined feature for the
  // current entry and starts out empty.
  for(unsigned int i=0;i* feat=new GenericFeature(0);
    BaseFeature* actFeat;
    
    // NOTE(review): damaged line - the code that loads actFeat for the j-th
    // suffix is lost.
    for(unsigned int j=0;jsize() << " "; 
      // Append every component of the loaded feature to the combined one.
      for(unsigned int k=0;ksize();++k) {
        feat->push_back(actFeat->operator[](k));
        DBG(DBG_TALKATIVE) << actFeat->operator[](k) << " ";
      }
    }
    DBG(DBG_VERBOSE) << feat->size() << endl;
    database.push_back(feat);
  }
  
  DBG(DBG_VERBOSE) << "Finished reading histograms from list" << endl;
  return database.size();
}


/// Print the command line help text through DBG(DBG_RESULT).
///
/// NOTE(review): the line that starts with the "-nOfClusters (10)" entry is
/// damaged: the text between that entry and "don't split clusters smaller
/// than this" is missing, which also leaves the string literals on that line
/// unbalanced. Restore it from a pristine copy; the help strings are runtime
/// output and must not be guessed.
void USAGE() {
  DBG(DBG_RESULT) 
    << "clustertest  [options]" << endl
    << "   mandatory parameters:" << endl
    << "      ((-dummy)|(-rgb list)|(-jf jf))    to select the data to be clustered" << endl
    << "      -c (em|kmeans)            to select the algorithm to be used" << endl
    << "      -d (euclidean|chisquare|jsd|kld|his) to select the distance function" << endl
    << "   Output options: " << endl
    << "      -noRearrange - donot rearrange clustermembers to have the one closest to centroid first" << endl
    << "      -noLCEGCE  - do not calculate LCE and GCE" << endl
    << "      -noPurComp - do not calculate ClusterPurity and ClassCompleteness" << endl
    << "   Options to loading files" << endl
    << "      -suffix followed by a ':' divided list of suffices to use" << endl
    << "   Options to kMeans " << endl
    << "      -nOfClusters (10)" < don't split clusters smaller than this" << endl
    << "      -splitMode (*allSplit*, largestSplit, varianceSplit)" << endl
    << endl;
}

int main(int argc, char **argv) {
  GetPot cl(argc,argv,":");

  vector clusterinformation;  
  vector< pair > toSort;
  vector suffices;
  BaseClusterer* cluster=NULL;
  vector unidentified=cl.unidentified_options(20,
                                                      "-c","-d", "-suffix","-h","-rgb","-dummy","-jf","-stopWithNClusters","-splitMode",
                                                      "-nOfClusters", "-iterations",
                                                      "-maxSplits", "-dontSplitBelow", "-iter", "-minObs", "-epsilon", "-disturbMode", "-poolMode",
                                                      "-noRearrange","-noAnalyse","-testAnalyser");
  
  if(!unidentified.empty()) {
    if(unidentified.size() == 1 && unidentified[0].find("-suffix")==0) {
    } else {
      cout << "Unknown parameters: ";
      for(unsigned int i=0;i;
      
      dynamic_cast* >(cluster)->maxSplits()=cl.follow(4,"-maxSplits");
      dynamic_cast* >(cluster)->iterationsBetweenSplits()=cl.follow(10,"-iter");
      dynamic_cast* >(cluster)->minObservationsPerCluster()=cl.follow(4,"-minObs");
      dynamic_cast* >(cluster)->epsilon()=cl.follow(0.1,"-epsilon");
      dynamic_cast* >(cluster)->disturbMode(cl.follow("varianceDisturb","-disturbMode"));
      dynamic_cast* >(cluster)->poolMode(cl.follow("noPooling","-poolMode"));
      dynamic_cast* >(cluster)->dontSplitBelow()=cl.follow(10,"-dontSplitBelow");
      dynamic_cast* >(cluster)->splitMode(cl.follow("allSplit","-splitMode"));
      dynamic_cast* >(cluster)->stopWithNClusters()=cl.follow(-1,"-stopWithNClusters");
      
    } else if (clusterstring=="kmeans") {
      cluster=new kMeans;
      dynamic_cast* >(cluster)->nOfClusters()=cl.follow(10,"-nOfClusters");
      dynamic_cast* >(cluster)->iterations()=cl.follow(10,"-iterations");
    } else {
      ERR << "Unknown clustermethod '"<< cluster <<"'."<< endl;
      USAGE();
      exit(20);
    }
  }

  if(cl.search("-d")) {
    string distancestring=cl.follow("","-d");
    cluster->distString()=distancestring;
    cluster->dist()=makeADistance(distancestring);
  }
  
  if(cl.search("-dummy")) {
    vector* > database;
    GenericFeature* genFeat;
    BaseFeature* feat;
    //create some dummy-data
    vector fileList;
    ostringstream outstream;
    for(int i=0;i<10;++i) {
      for(int j=0;j<10;++j) {
        genFeat=new GenericFeature(10);
        
        for(int k=0;k<10;++k) {
          (*genFeat)[k]=0;
        }
        
        (*genFeat)[j]=i*1;
        feat=genFeat;
        
        outstream << (*genFeat);
        fileList.push_back(outstream.str());
        outstream.str("");
        for(int k=0;k<10;++k) {
          cout << (*genFeat)[k];
        }
        cout << endl;
        database.push_back(feat);
      }
    }
    
    int nOfData=database.size();
    
    DBG(DBG_VERBOSE) << "Got " << nOfData << " features." << endl;
    
    cluster->run(database,clusterinformation);
    
    toSort=vector< pair >(clusterinformation.size());
    for(unsigned int i=0;i* > database;
    
    FileList fileList;
    fileList.load(filelistname);
    
    int nOfData=loadFileList(fileList,database,suffices);
    
    DBG(DBG_VERBOSE) << "Got " << nOfData << " features." << endl;
    
    cluster->run(database,clusterinformation);
    toSort=vector< pair > (clusterinformation.size());
    for(unsigned int i=0;i* > database;
    
    database=readJF(joergfilename);
    
    DBG(DBG_MESSAGE) << "Starting clustering" <run(database,clusterinformation);
    cluster->saveClusters(cl.follow("clusters.jf","-saveMeansTo"));
    
    toSort=vector< pair >(clusterinformation.size());
    
    for(unsigned int i=0;i