// Origin: www.pudn.com > TextClassify.rar > CATALOGLIST.H


// CatalogList.h: interface for the CCatalogList class. 
// 
////////////////////////////////////////////////////////////////////// 
 
#if !defined(AFX_CATALOGLIST_H__4CF04BF3_9DA2_11D3_8433_00C04F722981__INCLUDED_) 
#define AFX_CATALOGLIST_H__4CF04BF3_9DA2_11D3_8433_00C04F722981__INCLUDED_ 
 
#if _MSC_VER > 1000 
#pragma once 
#endif // _MSC_VER > 1000 
 
#include "wordlist.h"
// NOTE(review): the header name on this line was lost in extraction;
// <afxtempl.h> is restored because the CList/CArray templates are used below — confirm.
#include <afxtempl.h>
#include "svm.h"
 
// Forward declaration so the prototype below can name CDocNode before the
// class itself is defined.
class CDocNode;
// Free helper taking an archive, a pointer to CDocNode elements and an
// element count.
// NOTE(review): presumably the overload the MFC collection templates call
// when a collection of CDocNode is serialized — confirm in the .cpp.
void AFXAPI SerializeElements(CArchive& ar,CDocNode* pElements,int nCount);
 
// Plain record grouping a document node pointer, a document name, a class
// name and a similarity ratio.
// NOTE(review): who fills these fields and who owns m_pDocNode is not visible
// in this header — confirm against the callers.
struct DocInfo{
	CDocNode *m_pDocNode;  // pointer to the associated document node
	CString m_sDocName;    // document name
	CString m_sClsName;    // class name (presumably the category label — confirm)
	double m_dSimRatio;    // similarity ratio (presumably document vs. class — confirm)
};
 
// Records the weight of one dimension (feature) of a document vector.
struct	sWeightNode
{
	int    s_idxWord;    // ID of the feature
	short  s_tfi;        // number of times the feature occurs in the document
	double s_dWeight;    // weight of the feature
};
 
// A single document: its identifier, name, weighted feature vector
// (m_sWeightSet) and per-class similarity results (m_pResults).
// Derives from CObject and declares its own Serialize(CArchive&).
class CDocNode:public CObject
{
public:
	CDocNode();
	~CDocNode();
	const CDocNode& operator=(const CDocNode& x);
	// Management of the shared static scratch buffer.
	// NOTE(review): presumably these allocate/free m_pTemp and maintain
	// m_nAllocTempLen — confirm in the .cpp.
	static void AllocTempBuffer(int nLen);
	static void DeallocTempBuffer();
	// NOTE(review): presumably allocate/free m_sWeightSet (length m_nAllocLen) — confirm.
	void AllocBuffer(int nLen);
	void DeallocBuffer();
	// NOTE(review): presumably allocate/free m_pResults (size m_nClassNum) — confirm.
	void AllocResultsBuffer(short nLen);
	void DeallocResultsBuffer();
	// Build the document's feature attributes by word segmentation.
	int ScanChinese(char *, CWordList&, int, short idxCata=-1);
	int ScanEnglish(char *, CWordList&, int, short idxCata=-1, bool bStem=false);
	static int ScanChineseString(char*, CWordList&, int, long, short idxCata=-1);
	static int ScanEnglishString(char*, CWordList&, int, long, short idxCata=-1, bool bStem=false);
	// Compute the weight of every dimension of the document from the
	// dictionary wordList, forming the document vector, and store it in the
	// array m_sWeightSet.
	// This requires that m_dWeight of every word node in wordList has already
	// been set to that feature's inverse document frequency.
	int ScanChineseWithDict(char *,CWordList&);
	int ScanEnglishWithDict(char *,CWordList&, bool bStem=false);
	static int ScanChineseStringWithDict(char *pPath,CWordList& wordList);
	static int ScanEnglishStringWithDict(char *pPath, CWordList &wordList, bool bStem);
	int GenDocVector();
	// Overload taking a DOC; NOTE(review): DOC presumably comes from "svm.h" — confirm.
	int GenDocVector(DOC &doc);
	double ComputeSimilarityRatio();  // computes similarity against the vector stored in m_pTemp
	void Serialize(CArchive& ar);
	CDocNode(const CDocNode& x);
	int GetWordNum();
	BOOL IsZero();
public:
	double ComputeProbability(CWordList& wordList,int n);
	long m_idxDoc;                // document identifier
	CString	m_strDocName;         // document name
	int m_nAllocLen;              // length of the document vector, i.e. length of the array m_sWeightSet
	sWeightNode	*m_sWeightSet;    // list of word weights
	short  m_nClassNum;           // total number of classes in the training documents, i.e. size of the array m_pResults
	double *m_pResults;           // similarity between the document and each class
	short  m_nCataID;             // class the current document belongs to; only used during classification, so serialization does not touch it
	static sWeightNode *m_pTemp;  // temporary memory needed while generating a document vector
	static int m_nAllocTempLen;   // size of the temporary memory
private:
	// Fixed-size static character buffer shared by all instances.
	// NOTE(review): presumably holds the sentence currently being scanned — confirm.
	static char m_pSentence[MAX_PATH*10];
private:
	bool IsNumber(char * p);
	static int ParseFile(char *, int, int &);
};
 
// Forward declarations so the prototype below (and later declarations) can
// name these classes before they are defined.
class CCatalogNode;
class CCatalogList;
// Free helper taking an archive, a pointer to CCatalogNode elements and an
// element count.
// NOTE(review): presumably the overload the MFC collection templates call
// when a collection of CCatalogNode is serialized — confirm in the .cpp.
void AFXAPI SerializeElements(CArchive& ar,CCatalogNode* pElements,int nCount);
 
// One category (class) node: a category index, its name and directory name,
// a word total, and the list of documents that belong to it.
class CCatalogNode
{
public:
	CCatalogNode();
	~CCatalogNode();
	CCatalogNode(const CCatalogNode& x);
	const CCatalogNode& operator = (const CCatalogNode& x);
	const CCatalogNode& operator += (const CCatalogNode& x);
public:
	void InitCatalogNode(int nMode=0);
	void SetStartDocID(long lDocID);
	// POSITION-based access to the documents held by this category.
	CDocNode& GetNext(POSITION& rPos);
	POSITION GetFirstPosition();
	POSITION AddDoc(CDocNode& docnode);
	void Serialize(CArchive& ar);
	UINT GetDocNum();
	// Scan all documents under the given directory path and add them to this
	// category node.
	long ScanDirectory(CString);
public:
	CDocNode& GetAt( POSITION position );
	short m_idxCata;             // category index
	CString	m_strCatalogName;    // category name
	CString m_strDirName;        // directory name associated with the category
	long	m_lTotalWordNum;     // NOTE(review): presumably total word count over the category's documents — confirm
private:
	// The template arguments were missing in this copy of the header (a bare
	// "CList" does not compile). Restored from the accessors above, which
	// store and return CDocNode by reference.
	// NOTE(review): confirm against the original project sources.
	CList<CDocNode,CDocNode&>	m_lstDocList;
	long    m_lCurDocID;         // NOTE(review): presumably the next/last document ID assigned — confirm
};
 
class CCatalogList   
{ 
public: 
	CCatalogList(); 
	virtual ~CCatalogList(); 
	const CCatalogList& operator = (const CCatalogList& x); 
	const CCatalogList& operator += (const CCatalogList& x); 
	void InitCatalogList(int nMode=0); 
	void DumpToFile (CString strFileName, int nSaveMode=0); 
	BOOL GetFromFile(CString strFileName); 
	void DumpDocList(CString strFileName); 
	long BuildLib(CString	strDirName); 
public: 
	static int GetSaveMode(); 
	void DumpCataList(CString strFileName); 
	bool BuildCatalogID(CCatalogList &); 
	CCatalogNode* GetCataByName(CString strCataName); 
	short GetCataIDByName(CString strCataName); 
	short GetCataIDArrayFromString(char * line, CArray &aryCataID); 
	CDocNode* GetDocByName(CString strDocName); 
	CCatalogNode* GetCata(short idxCata); 
	CCatalogNode GetAt(POSITION pos) const; 
	CCatalogNode& GetAt(POSITION pos); 
	CCatalogNode& GetNext(POSITION& rPos); 
	POSITION GetFirstPosition(); 
	POSITION AddCata(CCatalogNode& catanode); 
	bool GetDocName(short idxCata,long idxDoc,CString& strDocName); 
	bool GetCataName(short idxCata,CString& strCataName); 
	int  GetCataNum(); 
	long GetDocNum(); 
private: 
	void Serialize(CArchive& ar); 
	long ScanDirectory(CString strDirName); 
private: 
	static int m_nSaveMode;  //0 保存文档向量, 1 不保存文档向量 
	CList	m_lstCatalogList; 
}; 
 
#endif // !defined(AFX_CATALOGLIST_H__4CF04BF3_9DA2_11D3_8433_00C04F722981__INCLUDED_)