www.pudn.com > libpmk.rar > hierarchical-clusterer.h, change:2007-05-27,size:7222b

// Copyright 2007, Massachusetts Institute of Technology. // The use of this code is permitted for research only. There is // absolutely no warranty for this software. // // Author: John Lee (jjl@mit.edu) // // TODO(jjl): Allow users to pass in arbitrary Clusterer objects, not just // hard-coded KMeansClusterer. // #ifndef CLUSTERING_HIERARCHICAL_CLUSTERER_H #define CLUSTERING_HIERARCHICAL_CLUSTERER_H #include <vector> #include <iostream> #include "clustering/clusterer.h" #include "point_set/point-ref.h" #include "point_set/point-set-list.h" #include "point_set/mutable-point-set-list.h" #include "util/sparse-vector.h" #include "util/distance-computer.h" using namespace std; namespace libpmk { /// Hierarchical K-Means clusterer. /** * This clusterer does not use the Clusterer interface because its * output involves multiple levels. However, similar to Clusterer, * HierarchicalClusterer has the ability to get its data either by * actually doing hierarchical K-means, or by reading * already-processed data. The difference is that the returned cluster * centers are a PointSetList, where each element in the list gives * the centers for a particular level. */ class HierarchicalClusterer { public: HierarchicalClusterer(); /// Get the cluster centers. /** * Must be called after Cluster() or ReadFromStream(). Returns an * ordered list of point sets. The ith element in this list * corresponds to the centers at the ith level, where i=0 is the * root (which is one giant cluster). * * Note that this function does NOT tell you the hierarchical * relationships. You will not know which clusters are subclusters * of which ones. GetChildRange() and GetParentIndex() provide this * functionality. */ const PointSetList& GetClusterCenters() const; /// Get the membership data. /** * Must be called after Cluster() or ReadFromStream(). The size of * this returned vector, of course, is the total number of points * that were clustered. Each element in this vector is a LargeIndex * describing a path down a tree. Membership of a single point is a * LargeIndex, as opposed to an int which is used by flat * (non-hierarchical) clustering methods. Naturally, the first * element of all of the LargeIndices here will be 0, since the top * level is one giant cluster. Following that, the cluster center * can be retrieved (see GetClusterCenters()) by looking at each * element in the LargeIndex. * * For example, let C be the PointSetList returned by * GetClusterCenters(). Suppose we have a LargeIndex [0 3 * 9]. C[0][0] is the root cluster. Then C[1][3] is the center that * this point belonged to at the first level, and C[2][9] is the * center that this point belonged to at the bottom level. * * Note that this function does NOT tell you the hierarchical * relationships. You will not know which clusters are subclusters * of which ones. GetChildRange() and GetParentIndex() provide this * functionality. */ const vector<LargeIndex>& GetMemberships() const; /// Performs the actual clustering and stores the result internally. void Cluster(int num_levels, int branch_factor, const vector<PointRef>& points, const DistanceComputer& distance_computer); /// Get the start and end indices of children at the next level. /** * Requires Preprocess() or ReadFromStream() to be called first. * The <level> and <index> parameters specify a cluster center. * The two returned integers are the start and end indices into * the cluster centers (returned by GetClusterCenters()) of that * cluster center's children at the (<level>+1)st level. The * returned pair is (-1, -1) if it has no children. */ pair<int, int> GetChildRange(int level, int index) const; /// Get the index of the parent of a node. /** * Requires Preprocess() or ReadFromStream() to be called first. * The <level> and <index> parameters specify a cluster center. The * returned integer is the index into the cluster centers (returned * by GetClusterCenters()) of that cluster center's parent at the * (<level>-1)st level. Returns -1 for the root node (the only one * without a parent). */ int GetParentIndex(int level, int index) const; /// Write the clustered data to a stream. /** * Must be called after Cluster() or ReadFromStream(). File format: * <ul> * <li> (int32) N, the number of levels * <li> (int32) P, the total number of clustered points * <li> (int32) D, the feature dim * <li> For each level: <ul> * <li> (int32) n, the number of centers at this level * <li> For each center at this level: <ul> * <li> (D * float) the center * <li> (int32) the parent index (see GetParentIndex()) * <li> (int32) child range start index (see GetChildRange()) * <li> (int32) child range end index (see GetChildRange()) * </ul></ul> * <li> For each point: <ul> * <li> (int32) the size of the point's LargeIndex (see * GetMemberships()) * <li> (index_size * int32) the point's LargeIndex * </ul> * </ul> * * This function will abort if the stream is bad. */ void WriteToStream(ostream& output_stream) const; /// Write the clustered data to a file. void WriteToFile(const char* output_filename) const; /// Read clustered data from a stream. /** * Can be called in lieu of Cluster(). See WriteToStream() for the * format. This function will abort if the stream is bad. * \sa WriteToStream() */ void ReadFromStream(istream& input_stream); /// Read the clustered data from a file. void ReadFromFile(const char* input_filename); /// Default 100. static const int MAX_ITERATIONS; private: // RecursiveCluster() takes as input which level it's going to get // clusters for, as well as a list of ints. This list of ints gives // *indices into all_points_* to cluster (note that all_points_ itself // is a list of indices into the actual data). void RecursiveCluster(int level, const vector<int>& indices, int num_levels, int branch_factor, const vector<PointRef>& points, const DistanceComputer& distance_computer, int parent_index); MutablePointSetList centers_; // Has the same structure as centers_, except the features are // 3-dim. They tell you the index of the parent, and the start and // end indices of its children in the next level. -1 if N/A, like // if it has no children or if it has no parent. MutablePointSetList tree_indices_; // A vector of size num_levels. For each level, this tells you // how many centers are in that group. We need this beecause centers_ // is sparsely done, meaning if there are clusters with 0 points, // they simply don't show up in centers_. vector<LargeIndex> memberships_; bool done_; }; } // namespace libpmk #endif // CLUSTERING_HIERARCHICAL_CLUSTERER_H