www.pudn.com > firev0.01.rar > clustertest.cpp
// Warning!!!! // this program is programmed in an absolutely horrible manner. // Take care, reading this could seriously ruin your day // // and it is getting worse and worse // /** * @file clustertest.cpp * @author Thomas Deselaers * @date Mon Jun 23 18:52:16 2003 * * @brief program to cluster data. * * Initially this program was meant only to test some clustering * algorithms, now it is my only clustering program, because it's * functionality grew and grew and grew. Everything can be changed by * command line parameters. Hopefully the documentation is up to date. * */ #include#include #include #include #include #include "filelist.hpp" #include "basefeature.hpp" #include "genericfeature.hpp" #include "mdhistogram.hpp" #include "colorpixel.hpp" #include "baseclusterer.hpp" #include "em.hpp" #include "kmeans.hpp" #include "image.hpp" #include "gzstream.hpp" #include "getpot.hpp" #include "loadafeature.hpp" #include "makeadistance.hpp" using namespace std; using namespace img; ///rearrange the information in sorted to have the clustermembers most ///similar to the clustercenter first. //but this function is absolutely horrible, because the data is stored in a completely silly way. void rearrangeClusters(const vector< BaseFeature * > & database, vector< pair > &sorted, BaseClusterer * cluster, FileList filelist) { int anzCluster=0; for(unsigned int i=0;i anzCluster) anzCluster=sorted[i].first+1;} vector< pair > dists; //clusterwise int number; double dist; int actStart, actObservation=0, actDist; for(int actCluster=0;actCluster * >(cluster)) { density=dynamic_cast< EM * >(cluster)->cluster(actCluster); } else if(dynamic_cast< kMeans * >(cluster)) { density=dynamic_cast< kMeans * >(cluster)->cluster(actCluster); } else { ERR << "Not possible!" << endl; } dists=vector >(0); //get dists from clustermembers to clustercenter actStart=actObservation; while(sorted[actObservation].first==actCluster && actObservation dist()->dist(database[idx],density.mean); dists.push_back(pair (dist,idx)); // cout << actObservation << endl; ++actObservation; } sort(dists.begin(),dists.end()); actDist=0; for(int i=actStart;i > &sorted) { DBG(DBG_MESSAGE) << "starting" << endl; int anzCluster=0, anzClasses=0; for(unsigned int i=0;i anzCluster) anzCluster=sorted[i].first+1;} for(unsigned int i=0;i anzClasses) anzClasses=fileList.cls(i)+1;} anzCluster=MAX(1,anzCluster); anzClasses=MAX(1,anzClasses); vector< set > classSet(anzClasses, set () ); vector< set > clusterSet(anzCluster, set () ); for(unsigned int i=0;i > contingencyTable(anzCluster, vector (anzClasses,0)); set tmp_set; double ent_sum1=0.0, ent_sum2=0.0; double tmp; DBG(DBG_MESSAGE) << "init done" << endl; for(int i=0;i ::max();; for(int j=0;j ::max(); for(int i=0;i E1(fileList.size(),0.0); vector E2(fileList.size(),0.0); int aktClass, aktCluster=0; for(unsigned int n=0;n *> readJF(const string filename) { vector * > result; GenericFeature * img; double tmp; int cls, dim, nOfClasses; igzstream ifs(filename.c_str()); if(!ifs) { ERR << "Cannot open '"<< filename <<"' for reading." << endl; } else { ifs >> nOfClasses >> dim; // cout << "Will find " << nOfClasses << "classes " << endl; DBG(DBG_MESSAGE) << "Reading features with size " << dim << "." << endl; while(!ifs.eof() && cls != -1) { ifs >> cls ; if(!ifs.eof() && cls != -1) { img=new GenericFeature (dim); for(int x=0;x > tmp; img->operator[](x)=double(tmp); } result.push_back(img); } } } DBG(DBG_MESSAGE) << result.size() << " images read." << endl; return result; } int loadFileList(const FileList fileList, vector< BaseFeature * > & database, vector suffices=vector ()) { DBG(DBG_VERBOSE) << "starting" << endl; string fn; vector sufficesToUse(0); if(suffices.size()==0) { for(unsigned int i=0;i * >::iterator i=database.begin(); i< database.end();++i) { vector * >::iterator j; j=i; delete *j; } database.clear(); DBG(DBG_MESSAGE) << "suffices we are actually using: "; for(unsigned int i=0;i * feat=new GenericFeature (0); BaseFeature * actFeat; for(unsigned int j=0;j size() << " "; for(unsigned int k=0;k size();++k) { feat->push_back(actFeat->operator[](k)); DBG(DBG_TALKATIVE) << actFeat->operator[](k) << " "; } } DBG(DBG_VERBOSE) << feat->size() << endl; database.push_back(feat); } DBG(DBG_VERBOSE) << "Finished reading histograms from list" << endl; return database.size(); } void USAGE() { DBG(DBG_RESULT) << "clustertest [options]" << endl << " mandatory parameters:" << endl << " ((-dummy)|(-rgb list)|(-jf jf)) to select the data to be clustered" << endl << " -c (em|kmeans) to select the algorithm to be used" << endl << " -d (euclidean|chisquare|jsd|kld|his) to select the distance function" << endl << " Output options: " << endl << " -noRearrange - donot rearrange clustermembers to have the one closest to centroid first" << endl << " -noLCEGCE - do not calculate LCE and GCE" << endl << " -noPurComp - do not calculate ClusterPurity and ClassCompleteness" << endl << " Options to loading files" << endl << " -suffix followed by a ':' divided list of suffices to use" << endl << " Options to kMeans " << endl << " -nOfClusters (10)" < don't split clusters smaller than this" << endl << " -splitMode (*allSplit*, largestSplit, varianceSplit)" << endl << endl; } int main(int argc, char **argv) { GetPot cl(argc,argv,":"); vector clusterinformation; vector< pair > toSort; vector suffices; BaseClusterer * cluster=NULL; vector unidentified=cl.unidentified_options(20, "-c","-d", "-suffix","-h","-rgb","-dummy","-jf","-stopWithNClusters","-splitMode", "-nOfClusters", "-iterations", "-maxSplits", "-dontSplitBelow", "-iter", "-minObs", "-epsilon", "-disturbMode", "-poolMode", "-noRearrange","-noAnalyse","-testAnalyser"); if(!unidentified.empty()) { if(unidentified.size() == 1 && unidentified[0].find("-suffix")==0) { } else { cout << "Unknown parameters: "; for(unsigned int i=0;i ; dynamic_cast* >(cluster)->maxSplits()=cl.follow(4,"-maxSplits"); dynamic_cast* >(cluster)->iterationsBetweenSplits()=cl.follow(10,"-iter"); dynamic_cast* >(cluster)->minObservationsPerCluster()=cl.follow(4,"-minObs"); dynamic_cast* >(cluster)->epsilon()=cl.follow(0.1,"-epsilon"); dynamic_cast* >(cluster)->disturbMode(cl.follow("varianceDisturb","-disturbMode")); dynamic_cast* >(cluster)->poolMode(cl.follow("noPooling","-poolMode")); dynamic_cast* >(cluster)->dontSplitBelow()=cl.follow(10,"-dontSplitBelow"); dynamic_cast* >(cluster)->splitMode(cl.follow("allSplit","-splitMode")); dynamic_cast* >(cluster)->stopWithNClusters()=cl.follow(-1,"-stopWithNClusters"); } else if (clusterstring=="kmeans") { cluster=new kMeans ; dynamic_cast * >(cluster)->nOfClusters()=cl.follow(10,"-nOfClusters"); dynamic_cast * >(cluster)->iterations()=cl.follow(10,"-iterations"); } else { ERR << "Unknown clustermethod '"<< cluster <<"'."<< endl; USAGE(); exit(20); } } if(cl.search("-d")) { string distancestring=cl.follow("","-d"); cluster->distString()=distancestring; cluster->dist()=makeADistance(distancestring); } if(cl.search("-dummy")) { vector * > database; GenericFeature * genFeat; BaseFeature * feat; //create some dummy-data vector fileList; ostringstream outstream; for(int i=0;i<10;++i) { for(int j=0;j<10;++j) { genFeat=new GenericFeature (10); for(int k=0;k<10;++k) { (*genFeat)[k]=0; } (*genFeat)[j]=i*1; feat=genFeat; outstream << (*genFeat); fileList.push_back(outstream.str()); outstream.str(""); for(int k=0;k<10;++k) { cout << (*genFeat)[k]; } cout << endl; database.push_back(feat); } } int nOfData=database.size(); DBG(DBG_VERBOSE) << "Got " << nOfData << " features." << endl; cluster->run(database,clusterinformation); toSort=vector< pair >(clusterinformation.size()); for(unsigned int i=0;i * > database; FileList fileList; fileList.load(filelistname); int nOfData=loadFileList(fileList,database,suffices); DBG(DBG_VERBOSE) << "Got " << nOfData << " features." << endl; cluster->run(database,clusterinformation); toSort=vector< pair > (clusterinformation.size()); for(unsigned int i=0;i * > database; database=readJF(joergfilename); DBG(DBG_MESSAGE) << "Starting clustering" < run(database,clusterinformation); cluster->saveClusters(cl.follow("clusters.jf","-saveMeansTo")); toSort=vector< pair >(clusterinformation.size()); for(unsigned int i=0;i