// www.pudn.com > TextClassify.rar > CATALOGLIST.H
// CatalogList.h: interface for the CCatalogList class. // ////////////////////////////////////////////////////////////////////// #if !defined(AFX_CATALOGLIST_H__4CF04BF3_9DA2_11D3_8433_00C04F722981__INCLUDED_) #define AFX_CATALOGLIST_H__4CF04BF3_9DA2_11D3_8433_00C04F722981__INCLUDED_ #if _MSC_VER > 1000 #pragma once #endif // _MSC_VER > 1000 #include "wordlist.h" #include#include "svm.h" class CDocNode; void AFXAPI SerializeElements(CArchive& ar,CDocNode* pElements,int nCount); struct DocInfo{ CDocNode *m_pDocNode; CString m_sDocName; CString m_sClsName; double m_dSimRatio; }; //用来记录文档向量中每一维特征的权重 struct sWeightNode { int s_idxWord; //特征的ID short s_tfi; //特征在文档中出现的频次 double s_dWeight; //特征的权重 }; class CDocNode:public CObject { public: CDocNode(); ~CDocNode(); const CDocNode& operator=(const CDocNode& x); static void AllocTempBuffer(int nLen); static void DeallocTempBuffer(); void AllocBuffer(int nLen); void DeallocBuffer(); void AllocResultsBuffer(short nLen); void DeallocResultsBuffer(); //使用分词的方法形成文档的特征属性 int ScanChinese(char *, CWordList&, int, short idxCata=-1); int ScanEnglish(char *, CWordList&, int, short idxCata=-1, bool bStem=false); static int ScanChineseString(char*, CWordList&, int, long, short idxCata=-1); static int ScanEnglishString(char*, CWordList&, int, long, short idxCata=-1, bool bStem=false); //根据词典wordList计算文档每一维的权重,形成文档的向量,将其保存到数组m_sWeightSet //这个方法要求词典wordList中每一个wordnode的m_dWeight的值都赋为此特征的反比文档频率 int ScanChineseWithDict(char *,CWordList&); int ScanEnglishWithDict(char *,CWordList&, bool bStem=false); static int ScanChineseStringWithDict(char *pPath,CWordList& wordList); static int ScanEnglishStringWithDict(char *pPath, CWordList &wordList, bool bStem); int GenDocVector(); int GenDocVector(DOC &doc); double ComputeSimilarityRatio(); //与存放在m_pTemp中的向量进行相似度的计算 void Serialize(CArchive& ar); CDocNode(const CDocNode& x); int GetWordNum(); BOOL IsZero(); public: double ComputeProbability(CWordList& wordList,int n); long m_idxDoc; //文档标识 CString 
m_strDocName; //文档名称 int m_nAllocLen; //文档向量的长度,即数组m_sWeightSet的长度 sWeightNode *m_sWeightSet; //词权值列表 short m_nClassNum; //代表训练文档中的类别总数,即数组m_pResults的大小 double *m_pResults; //文档与每个类别的相似度 short m_nCataID; //代表当前文档的所属类别,由于只在分类时使用,所以序列化的时候不操作此属性 static sWeightNode *m_pTemp; //生成文档向量时需要使用的一块临时内存 static int m_nAllocTempLen; //临时内存的大小 private: static char m_pSentence[MAX_PATH*10]; private: bool IsNumber(char * p); static int ParseFile(char *, int, int &); }; class CCatalogNode; class CCatalogList; void AFXAPI SerializeElements(CArchive& ar,CCatalogNode* pElements,int nCount); class CCatalogNode { public: CCatalogNode(); ~CCatalogNode(); CCatalogNode(const CCatalogNode& x); const CCatalogNode& operator = (const CCatalogNode& x); const CCatalogNode& operator += (const CCatalogNode& x); public: void InitCatalogNode(int nMode=0); void SetStartDocID(long lDocID); CDocNode& GetNext(POSITION& rPos); POSITION GetFirstPosition(); POSITION AddDoc(CDocNode& docnode); void Serialize(CArchive& ar); UINT GetDocNum(); //扫描路径pPath下的所有文档,将其添加到当前类节点中 long ScanDirectory(CString); public: CDocNode& GetAt( POSITION position ); short m_idxCata; CString m_strCatalogName; CString m_strDirName; long m_lTotalWordNum; private: CList m_lstDocList; long m_lCurDocID; }; class CCatalogList { public: CCatalogList(); virtual ~CCatalogList(); const CCatalogList& operator = (const CCatalogList& x); const CCatalogList& operator += (const CCatalogList& x); void InitCatalogList(int nMode=0); void DumpToFile (CString strFileName, int nSaveMode=0); BOOL GetFromFile(CString strFileName); void DumpDocList(CString strFileName); long BuildLib(CString strDirName); public: static int GetSaveMode(); void DumpCataList(CString strFileName); bool BuildCatalogID(CCatalogList &); CCatalogNode* GetCataByName(CString strCataName); short GetCataIDByName(CString strCataName); short GetCataIDArrayFromString(char * line, CArray &aryCataID); CDocNode* GetDocByName(CString strDocName); CCatalogNode* GetCata(short idxCata); 
CCatalogNode GetAt(POSITION pos) const; CCatalogNode& GetAt(POSITION pos); CCatalogNode& GetNext(POSITION& rPos); POSITION GetFirstPosition(); POSITION AddCata(CCatalogNode& catanode); bool GetDocName(short idxCata,long idxDoc,CString& strDocName); bool GetCataName(short idxCata,CString& strCataName); int GetCataNum(); long GetDocNum(); private: void Serialize(CArchive& ar); long ScanDirectory(CString strDirName); private: static int m_nSaveMode; //0 保存文档向量, 1 不保存文档向量 CList m_lstCatalogList; }; #endif // !defined(AFX_CATALOGLIST_H__4CF04BF3_9DA2_11D3_8433_00C04F722981__INCLUDED_)