// www.pudn.com > TextClassify.rar > CATALOGLIST.CPP


// CatalogList.cpp: implementation of the CCatalogList class. 
// 
////////////////////////////////////////////////////////////////////// 
 
#include "stdafx.h" 
#include "CatalogList.h" 
#include "wordlist.h" 
#include "direct.h" 
#include "wordsegment.h" 
#include "classifierparam.h" 
#include "stemmer.h" 
#include "classifier.h" 
#include  
#include   
#include ".\\Utility\\Utility.h" 
 
#ifdef _DEBUG 
#define new DEBUG_NEW 
#undef THIS_FILE 
static char THIS_FILE[] = __FILE__; 
#endif 
 
// Storage for the static save-mode flag of CCatalogList (assigned in the
// constructor and in DumpToFile, read through GetSaveMode).
int CCatalogList::m_nSaveMode; 
// Global character buffers; pTempStr is used by CDocNode::ScanChinese to
// assemble the full path of the document being read.
char pTempStr[MAX_PATH]; 
char pWordTag[10]; 
sWeightNode *CDocNode::m_pTemp=NULL;                 // temporary memory block used while generating a document vector (newly added)
int CDocNode::m_nAllocTempLen=0;                     // number of elements currently allocated in m_pTemp
char CDocNode::m_pSentence[MAX_PATH*10];        // holds a sentence after spaces, carriage returns and similar characters have been removed
////////////////////////////////////////////////////////////////////// 
// Construction/Destruction 
////////////////////////////////////////////////////////////////////// 
// Document node class.
//
// Copy constructor. The buffer members are first put into the "nothing
// allocated" state so that the assignment operator can safely inspect and
// (re)allocate them, then the actual copy is delegated to operator=.
CDocNode::CDocNode(const CDocNode& x):m_nAllocLen(0),m_sWeightSet(NULL),m_pResults(NULL),m_nClassNum(0)
{
	// Copy the category id explicitly so that it is never left indeterminate
	// in a copy-constructed node, whatever set of members operator= handles.
	m_nCataID=x.m_nCataID;
	*this=x;
}
// Copy assignment.
//
// Deep-copies the document name, document index, category id, the weight
// vector (m_sWeightSet / m_nAllocLen) and the classification results
// (m_pResults / m_nClassNum) from x. Self-assignment is a no-op.
//
// Returns *this.
const CDocNode& CDocNode::operator=(const CDocNode& x)
{
	if(this==&x) return *this;
	m_strDocName=x.m_strDocName;
	m_idxDoc=x.m_idxDoc;
	m_nCataID=x.m_nCataID;

	if(x.m_sWeightSet!=NULL)
	{
		AllocBuffer(x.m_nAllocLen);
		memcpy(m_sWeightSet,x.m_sWeightSet,m_nAllocLen*sizeof(sWeightNode));
	}
	else
	{
		// Release any vector this node already owns; merely nulling the
		// pointer here would leak the previous allocation.
		DeallocBuffer();
	}

	if(x.m_pResults!=NULL)
	{
		AllocResultsBuffer(x.m_nClassNum);
		memcpy(m_pResults,x.m_pResults,m_nClassNum*sizeof(double));
	}
	else
	{
		// Same reasoning as above for the results buffer.
		DeallocResultsBuffer();
	}

	return *this;
}
 
// Default constructor: no weight vector, no classification results, and
// both the document index and the category id set to -1.
CDocNode::CDocNode()
	: m_sWeightSet(NULL)
	, m_pResults(NULL)
	, m_nAllocLen(0)
	, m_nClassNum(0)
	, m_idxDoc(-1)
	, m_nCataID(-1)
{
}
 
// Destructor: frees the classification-results buffer and the weight
// vector owned by this node.
CDocNode::~CDocNode()
{
	DeallocResultsBuffer();
	DeallocBuffer();
}
 
//nMode<=0  删除所有文档信息 
//nMode>0   只删除文档向量所占用的空间 
void CCatalogNode::InitCatalogNode(int nMode) 
{ 
	m_lTotalWordNum = 0; 
	POSITION pos_doc=m_lstDocList.GetHeadPosition(); 
	while(pos_doc!=NULL) 
	{ 
		CDocNode& docnode=m_lstDocList.GetNext(pos_doc); 
		docnode.DeallocBuffer(); 
		docnode.DeallocResultsBuffer(); 
	} 
	if(nMode>0) m_lstDocList.RemoveAll(); 
} 
 
// Copy constructor: delegates the member-wise copy to operator=.
CCatalogNode::CCatalogNode(const CCatalogNode& x)
{
	// Carry the running document-id counter over explicitly so that it is
	// never left indeterminate in a copy-constructed node.
	m_lCurDocID=x.m_lCurDocID;
	*this=x;
}
 
// Catalog node assignment.
//
// Replaces this node's word count, catalog name, directory name, catalog
// index and document list with copies of those in x. m_lCurDocID is not
// touched. Self-assignment is a no-op. Returns *this.
const CCatalogNode& CCatalogNode::operator = (const CCatalogNode& x)
{
	if(this!=&x)
	{
		m_idxCata=x.m_idxCata;
		m_strDirName=x.m_strDirName;
		m_strCatalogName=x.m_strCatalogName;
		m_lTotalWordNum = x.m_lTotalWordNum ;

		// Rebuild the document list from scratch, preserving x's order.
		m_lstDocList.RemoveAll();
		for(POSITION cursor = x.m_lstDocList.GetHeadPosition(); cursor!=NULL; )
		{
			CDocNode& src=x.m_lstDocList.GetNext(cursor);
			m_lstDocList.AddTail(src);
		}
	}
	return *this;
}
 
// Merges x into this node.
//
// Adds x's total word count to this node's, overwrites the catalog name,
// directory name and catalog index with x's, and appends copies of all of
// x's documents to the end of this node's document list. Adding a node to
// itself is a no-op. Returns *this.
const CCatalogNode& CCatalogNode::operator += (const CCatalogNode& x)
{
	if(this!=&x)
	{
		m_idxCata=x.m_idxCata;
		m_strDirName=x.m_strDirName;
		m_strCatalogName=x.m_strCatalogName;
		m_lTotalWordNum += x.m_lTotalWordNum ;

		for(POSITION cursor = x.m_lstDocList.GetHeadPosition(); cursor!=NULL; )
		{
			CDocNode& src=x.m_lstDocList.GetNext(cursor);
			m_lstDocList.AddTail(src);
		}
	}
	return *this;
}
 
// Default constructor: catalog index -1, document-id counter and word count
// zero, then runs InitCatalogNode with its default mode.
CCatalogNode::CCatalogNode()
	: m_idxCata(-1)
	, m_lCurDocID(0)
	, m_lTotalWordNum(0)
{
	InitCatalogNode();
}
 
// Destructor: runs InitCatalogNode with its default mode, which frees the
// per-document buffers held by the documents in the list.
CCatalogNode::~CCatalogNode()
{
	this->InitCatalogNode();
}
 
 
void CCatalogNode::SetStartDocID(long lDocID) 
{ 
	m_lCurDocID=lDocID; 
} 
 
// Adds one document node for every regular file found directly in strPath.
//
// The process working directory is changed to strPath (via _chdir) and is
// not restored. Each file receives the next consecutive document id,
// starting from the value set through SetStartDocID.
//
// Returns the next unused document id, or -1 (after showing a message box)
// if the directory cannot be entered.
long CCatalogNode::ScanDirectory(CString strPath)
{
	if(_chdir(strPath))  // the directory does not exist or cannot be entered
	{
		CString	csTmp = "目录";
		csTmp+=strPath;
		csTmp+="不存在!";
		AfxMessageBox(csTmp);
		return -1;
	}

	WIN32_FIND_DATA findData;
	HANDLE hFinder = ::FindFirstFile("*.*",&findData);
	if(hFinder==INVALID_HANDLE_VALUE)
		return m_lCurDocID;  // nothing to enumerate

	// FindFirstFile already returns the first entry, so it has to be
	// processed before FindNextFile is called; otherwise the first file is
	// lost whenever the listing does not start with ".".
	do
	{
		if( !strcmp(findData.cFileName,".") || !strcmp(findData.cFileName,"..") )
			continue;
		if(findData.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY)
			continue;

		// A default-constructed CDocNode already has empty buffers.
		CDocNode docnode;
		docnode.m_strDocName=findData.cFileName;
		docnode.m_idxDoc=m_lCurDocID++;
		AddDoc(docnode);
	}
	while(::FindNextFile(hFinder,&findData));

	// Release the search handle; it was previously leaked.
	::FindClose(hFinder);
	return m_lCurDocID;
}
 
// Constructor: resets the class-wide (static) save mode to 0.
CCatalogList::CCatalogList()
{
	CCatalogList::m_nSaveMode = 0;
}
 
// Destructor: performs no explicit work of its own.
CCatalogList::~CCatalogList() 
{ 
} 
 
 
// Writes one text line "<catalog index> <catalog name>" per catalog to
// strFileName, creating or truncating the file. Shows a message box and
// returns without writing if the file cannot be created.
void CCatalogList::DumpCataList(CString strFileName)
{
	FILE *stream;
	if( (stream  = fopen( strFileName, "w+" )) == NULL )
	{
		AfxMessageBox("无法创建文件"+strFileName+"!");
		return;
	}

	POSITION pos=GetFirstPosition();
	while(pos!=NULL)
	{
		CCatalogNode& catanode=GetNext(pos);
		// A CString must be converted explicitly before it goes through a
		// variadic argument list; passing the object itself is undefined.
		fprintf(stream,"%d %s\n",(int)catanode.m_idxCata,(LPCTSTR)catanode.m_strCatalogName);
	}
	fclose(stream);
}
 
// Writes the weight vectors of all documents to a text file, one document
// per line: the 1-based catalog index followed by " <word index+1>:<weight>"
// entries. Documents without an allocated vector (m_nAllocLen<=0) are
// skipped. Shows a message box and returns if the file cannot be created.
void CCatalogList::DumpDocList(CString strFileName) 
{ 
	FILE *stream; 
	if( (stream  = fopen( strFileName, "w+" )) == NULL ) 
	{ 
		AfxMessageBox("无法创建文件"+strFileName+"!"); 
		return; 
	} 
 
	POSITION pos=GetFirstPosition(); 
	POSITION pos_doc; 
	// Outer loop: catalogs; inner loop: documents of the current catalog.
	while(pos!=NULL) 
	{ 
		CCatalogNode& catanode=GetNext(pos); 
		pos_doc=catanode.GetFirstPosition(); 
		while(pos_doc!=NULL) 
		{ 
			CDocNode& docnode=catanode.GetNext(pos_doc); 
			if(docnode.m_nAllocLen>0) 
			{ 
				fprintf(stream,"%d",catanode.m_idxCata+1); 
				// NOTE(review): the next line is corrupted - the text between
				// "i" and " dZero)" (loop bound, increment, opening brace and
				// the start of a weight comparison against dZero, presumably)
				// has been lost. Restore it from the original source; as it
				// stands the braces below do not balance.
				for(int i=0;i dZero) 
					fprintf(stream," %d:%f",docnode.m_sWeightSet[i].s_idxWord+1,docnode.m_sWeightSet[i].s_dWeight); 
				} 
				fprintf(stream,"\n"); 
			} 
		} 
	} 
	fclose(stream); 
} 
 
// Serializes the whole catalog list into the binary file strFileName.
//   nSaveMode <= 0 : document vectors are saved as well
//   nSaveMode >  0 : document vectors are not saved
// The mode is stored in the static m_nSaveMode before serialization starts.
// Shows a message box and returns if the file cannot be created.
void CCatalogList::DumpToFile(CString strFileName, int nSaveMode)
{
	CFile outFile;
	const BOOL bOpened = outFile.Open(strFileName,CFile::modeWrite | CFile::modeCreate);
	if(bOpened)
	{
		CCatalogList::m_nSaveMode=nSaveMode;

		CArchive writer(&outFile,CArchive::store);
		Serialize(writer);
		writer.Close();

		outFile.Close();
	}
	else
	{
		AfxMessageBox("无法创建文件"+strFileName+"!");
	}
}
 
// Loads the catalog list from the binary file strFileName through
// Serialize. Returns TRUE on success; shows a message box and returns FALSE
// if the file cannot be opened.
BOOL CCatalogList::GetFromFile(CString strFileName)
{
	CFile inFile;
	if(inFile.Open(strFileName,CFile::modeRead))
	{
		CArchive reader(&inFile,CArchive::load);
		Serialize(reader);
		reader.Close();

		inFile.Close();
		return TRUE;
	}

	AfxMessageBox("无法打开文件"+strFileName+"!");
	return FALSE;
}
// Processes the documents of a document library: re-initializes the catalog
// list and then scans strDirName. Returns whatever ScanDirectory returns.
long CCatalogList::BuildLib(CString strDirName)
{
	InitCatalogList();
	const long lScanResult = ScanDirectory(strDirName);
	return lScanResult;
}
 
// Stores or loads the catalog list.
//
// The save mode is written/read first: CDocNode::Serialize consults
// CCatalogList::GetSaveMode() while loading to decide whether a weight
// vector follows each document, so the mode has to be known before the
// list itself is processed.
void CCatalogList::Serialize(CArchive &ar)
{
	// The original statement had been mangled into "ar<>m_nSaveMode", which
	// is not valid C++; this restores the store/load pair.
	if(ar.IsStoring()) ar<<m_nSaveMode;
	else ar>>m_nSaveMode;
	m_lstCatalogList.Serialize(ar);
}
 
// MFC collection helper: serializes nCount consecutive CCatalogNode
// elements starting at pElements by calling Serialize on each of them.
void AFXAPI SerializeElements(CArchive& ar,CCatalogNode* pElements,int nCount)
{
	ASSERT(nCount==0||
		AfxIsValidAddress(pElements,nCount*sizeof(CCatalogNode)));
	// Honour nCount: previously only the first element was serialized, and
	// it was serialized even when nCount was 0.
	for(int i=0;i<nCount;i++)
		pElements[i].Serialize(ar);
}
 
// MFC collection helper: serializes nCount consecutive CDocNode elements
// starting at pElements by calling Serialize on each of them.
void AFXAPI SerializeElements(CArchive& ar,CDocNode* pElements,int nCount)
{
	ASSERT(nCount==0||
		AfxIsValidAddress(pElements,nCount*sizeof(CDocNode)));
	// Honour nCount: previously only the first element was serialized, and
	// it was serialized even when nCount was 0.
	for(int i=0;i<nCount;i++)
		pElements[i].Serialize(ar);
}
 
// Stores or loads one document node through the archive.
//
// The visible (loading) statements read the document index and name and,
// when CCatalogList::GetSaveMode()<=0, the vector length followed by the raw
// sWeightNode array. Classification results are not part of the stream and
// are reset.
void CDocNode::Serialize(CArchive &ar) 
{ 
	int nLen; 
	if(ar.IsStoring()) 
	{ 
		// NOTE(review): the next line is corrupted - "ar<>m_idxDoc" is not
		// valid C++. The text between '<' and '>' (presumably the whole
		// storing branch, its closing brace and the "else {" that opens the
		// loading branch) has been lost; restore it from the original source.
		// The statements that follow are the loading side.
		ar<>m_idxDoc; 
		ar>>m_strDocName; 
		if(CCatalogList::GetSaveMode()<=0) 
		{ 
			// The vector was saved: read its length, size the buffer and
			// read the raw weight nodes into it.
			ar>>nLen; 
			AllocBuffer(nLen); 
			ar.Read((void*)m_sWeightSet,m_nAllocLen*sizeof(sWeightNode)); 
		} 
		else 
		{ 
			// No vector in the stream: mark the node as having none.
			m_nAllocLen=0; 
			m_sWeightSet=NULL; 
		} 
		m_nClassNum=0; 
		m_pResults=NULL; 
	} 
} 
 
// Stores or loads one catalog node through the archive: the catalog index,
// directory name, total word count and catalog name, followed by the
// document list.
void CCatalogNode::Serialize(CArchive &ar) 
{ 
	if(ar.IsStoring()) 
	{ 
		// NOTE(review): the next line is corrupted - "ar<>m_idxCata" is not
		// valid C++. The text between '<' and '>' (presumably the storing
		// statements and the "else {" that opens the loading branch) has
		// been lost; restore it from the original source. The statements
		// that follow are the loading side.
		ar<>m_idxCata; 
		ar>>m_strDirName; 
		ar>>m_lTotalWordNum; 
		ar>>m_strCatalogName; 
	} 
	// The document list is serialized in both directions.
	m_lstDocList.Serialize(ar); 
} 
 
// Allocates the shared temporary weight buffer (static m_pTemp) with room
// for nLen sWeightNode elements.
//
// Does nothing when nLen<=0 or when a buffer of exactly that length already
// exists. Otherwise any previous buffer is freed; the new buffer's contents
// are not initialized here.
void CDocNode::AllocTempBuffer(int nLen)
{
	if(nLen<=0) return;
	// Only skip the allocation when a buffer really exists: a matching
	// length with a NULL pointer must not be mistaken for "already done".
	if((m_pTemp!=NULL)&&(m_nAllocTempLen==nLen)) return;

	delete []m_pTemp;
	m_pTemp=NULL;
	m_nAllocTempLen=0;   // keep length and pointer consistent if new throws

	m_pTemp=new sWeightNode[nLen];
	m_nAllocTempLen=nLen;
}
 
 
// Frees the shared temporary weight buffer and resets its recorded length.
void CDocNode::DeallocTempBuffer()
{
	// delete[] on a null pointer is a no-op, so no explicit check is needed.
	delete []m_pTemp;
	m_pTemp=NULL;
	m_nAllocTempLen=0;
}
 
 
// Allocates this document's weight vector (m_sWeightSet) with room for nLen
// sWeightNode elements.
//
// Does nothing when nLen<=0 or when a vector of exactly that length already
// exists. Otherwise any previous vector is freed; the new vector's contents
// are not initialized here.
void CDocNode::AllocBuffer(int nLen)
{
	if(nLen<=0) return;
	// Only skip the allocation when a buffer really exists: callers copy
	// into m_sWeightSet right after this call, so a matching length with a
	// NULL pointer must still allocate.
	if((m_sWeightSet!=NULL)&&(m_nAllocLen==nLen)) return;

	delete []m_sWeightSet;
	m_sWeightSet=NULL;
	m_nAllocLen=0;       // keep length and pointer consistent if new throws

	m_sWeightSet=new sWeightNode[nLen];
	m_nAllocLen=nLen;
}
 
 
// Frees this document's weight vector and resets its recorded length.
void CDocNode::DeallocBuffer()
{
	// delete[] on a null pointer is a no-op, so no explicit check is needed.
	delete []m_sWeightSet;
	m_sWeightSet=NULL;
	m_nAllocLen=0;
}
 
// Allocates the classification-results buffer (m_pResults) with room for
// nLen doubles.
//
// Does nothing when nLen<=0 or when a buffer of exactly that length already
// exists. Otherwise any previous buffer is freed; the new buffer's contents
// are not initialized here.
void CDocNode::AllocResultsBuffer(short nLen)
{
	if(nLen<=0) return;
	// Only skip the allocation when a buffer really exists: callers copy
	// into m_pResults right after this call, so a matching length with a
	// NULL pointer must still allocate.
	if((m_pResults!=NULL)&&(m_nClassNum==nLen)) return;

	delete []m_pResults;
	m_pResults=NULL;
	m_nClassNum=0;       // keep length and pointer consistent if new throws

	m_pResults=new double[nLen];
	m_nClassNum=nLen;
}
 
 
// Frees the classification-results buffer and resets the class count.
void CDocNode::DeallocResultsBuffer()
{
	// delete[] on a null pointer is a no-op, so no explicit check is needed.
	delete []m_pResults;
	m_pResults=NULL;
	m_nClassNum=0;
}
 
int CDocNode::ScanChinese(char * pPath,CWordList& wordList,int nCataNum, short idxCata) 
{ 
	CFile fin; 
	char *buffer; 
	strcpy(pTempStr,pPath); 
	strcat(pTempStr,"\\"); 
	strcat(pTempStr,m_strDocName.GetBuffer(0)); 
	if(!fin.Open(pTempStr,CFile::modeRead)) 
		return -1; 
 
	unsigned int flen=fin.GetLength(); 
	buffer=new char[flen+1]; 
	flen=fin.ReadHuge(buffer,flen); 
	buffer[flen]='\0'; 
	fin.Close(); 
	int num=ScanChineseString(buffer,wordList,nCataNum,m_idxDoc,idxCata); 
	delete[] buffer; 
	return num; 
} 
 
int CDocNode::ScanChineseString(char * pPath,CWordList& wordList,int nCataNum, long idxDoc, short idxCata) 
{ 
	char *buffer=pPath; 
	int i,j,sum; 
	char *w; 
	//realcnt为文章中去掉停用词后剩下的总共词数 
	//nStart为一个句子在buffer中的开始位置 
	int nStart=0,nNewStart=0; 
	bool flag=true; 
	int nSentenceLen=0; 
	int realcnt=0; 
	while(buffer[nStart]!='\0') 
	{ 
		flag=true; 
		nSentenceLen=ParseFile(buffer,nStart,nNewStart); 
		nStart=nNewStart; 
		if(nSentenceLen==0) continue; 
		if(m_pSentence[0]>0) //如果是一个英文单词 
		{ 
			//如果英文单词的长度大于等于2,且不是数字 
			if((nSentenceLen>=2)&&((m_pSentence[0]<'0')||(m_pSentence[0]>'9'))) 
			{ 
				wordList.Add(m_pSentence,idxCata,idxDoc,nCataNum); 
				realcnt++; 
			} 
		} 
		else //如果是汉字串 
		{ 
			if(nSentenceLen%2!=0) continue; 
			if(nSentenceLen==2) //如果是单个汉字 
			{ 
				wordList.Add(m_pSentence,idxCata,idxDoc,nCataNum); 
				realcnt++; 
			} 
			else 
			{ 
				g_wordSeg.Segment(m_pSentence); 
				for(i=0;i32&&c<=47)||(c>=58&&c<=64)||(c>=91&&c<=96)||(c>=123&&c<=127)) 
		{ 
			buffer[nFilePos]='\0'; 
			wordLen=buffer+nFilePos-p; 
			if(wordLen>2) 
			{ 
				if(bStem) theStemmer.stem(p,0,wordLen-1); 
				wordList.Add(p,idxCata,idxDoc,nCataNum); 
				realcnt++; 
			} 
			p=buffer+nFilePos+1; 
		} 
		nFilePos++; 
	} 
	return realcnt; 
} 
 
 
BOOL CDocNode::IsZero() 
{ 
	ASSERT(m_sWeightSet!=NULL); 
	for(int i=0;icFileName,".") || !strcmp(lpFindFileData->cFileName,"..") ) 
			continue; 
 
		if((lpFindFileData->dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY)) 
		{ 
			CCatalogNode catalognode; 
			catalognode.m_strCatalogName=lpFindFileData->cFileName; 
			catalognode.m_strDirName=strPath+"\\"+catalognode.m_strCatalogName; 
			catalognode.m_idxCata=idxCurCata++; 
			POSITION posTemp=AddCata(catalognode); 
			CCatalogNode& cataTemp=GetAt(posTemp); 
			cataTemp.SetStartDocID(docNum); 
			docNum=cataTemp.ScanDirectory(catalognode.m_strDirName); 
		} 
	} 
	delete	lpFindFileData; 
	return docNum; 
} 
 
// Returns a copy of the catalog node stored at list position pos.
CCatalogNode CCatalogList::GetAt(POSITION pos) const
{
	CCatalogNode nodeCopy = m_lstCatalogList.GetAt(pos);
	return nodeCopy;
}
 
// Returns a reference to the catalog node stored at list position pos.
CCatalogNode& CCatalogList::GetAt(POSITION pos)
{
	CCatalogNode& node = m_lstCatalogList.GetAt(pos);
	return node;
}
// Looks up a catalog node by its identifier.
// Returns a pointer to the first node whose m_idxCata equals idxCata, or
// NULL if there is none.
CCatalogNode* CCatalogList::GetCata(short idxCata)
{
	CCatalogNode* pFound=NULL;
	for(POSITION cursor=GetFirstPosition(); cursor!=NULL&&pFound==NULL; )
	{
		CCatalogNode& candidate=GetNext(cursor);
		if(candidate.m_idxCata==idxCata)
			pFound=&candidate;
	}
	return pFound;
}
 
// Looks up a document by name across all catalogs.
// Returns a pointer to the first document (catalogs in list order, then
// documents in list order) whose m_strDocName equals strDocName, or NULL if
// there is none.
CDocNode* CCatalogList::GetDocByName(CString strDocName)
{
	for(POSITION cataCursor=GetFirstPosition(); cataCursor!=NULL; )
	{
		CCatalogNode& cata=GetNext(cataCursor);
		for(POSITION docCursor=cata.GetFirstPosition(); docCursor!=NULL; )
		{
			CDocNode& doc=cata.GetNext(docCursor);
			if(doc.m_strDocName==strDocName)
				return &doc;
		}
	}
	return NULL;
}
// Looks up a catalog by name.
// Returns a pointer to the first node whose m_strCatalogName equals
// strCataName, or NULL if there is none.
CCatalogNode* CCatalogList::GetCataByName(CString strCataName)
{
	for(POSITION cursor=GetFirstPosition(); cursor!=NULL; )
	{
		CCatalogNode& candidate=GetNext(cursor);
		if(candidate.m_strCatalogName==strCataName)
			return &candidate;
	}
	return NULL;
}
 
// Looks up the identifier of a catalog by name.
// Returns the m_idxCata of the first node whose m_strCatalogName equals
// strCataName, or -1 if there is none.
short CCatalogList::GetCataIDByName(CString strCataName)
{
	short idFound=-1;
	POSITION cursor=GetFirstPosition();
	while(cursor!=NULL)
	{
		CCatalogNode& candidate=GetNext(cursor);
		if(candidate.m_strCatalogName==strCataName)
		{
			idFound=candidate.m_idxCata;
			break;
		}
	}
	return idFound;
}
 
int CDocNode::GetWordNum() 
{ 
	int s=0; 
	for(int i=0;idZero) nSum++;	 
	AllocBuffer(nSum); 
	nSum=0; 
	for(i=0;idZero) 
		{ 
			m_sWeightSet[nSum].s_idxWord=m_pTemp[i].s_idxWord; 
			m_sWeightSet[nSum].s_tfi=m_pTemp[i].s_tfi; 
			m_sWeightSet[nSum].s_dWeight=m_pTemp[i].s_dWeight; 
			nSum++; 
		} 
	} 
	return nSum; 
} 
 
 
// Fills doc with a sparse word vector built from the shared temporary
// buffer m_pTemp.
//
// doc.words is allocated with malloc (room for the counted entries plus 12
// extra SVM_WORD slots); each stored entry gets wnum = index+1 and weight =
// m_pTemp[index].s_dWeight, and the list is terminated by an entry with
// wnum = 0. doc.docnum is set to -1.
//
// Returns the number of entries stored, or -1 if the temporary buffer has
// not been allocated.
// NOTE(review): no free of doc.words is visible here - presumably the caller
// releases it; the malloc result is also not checked.
int CDocNode::GenDocVector(DOC &doc) 
{ 
	if(m_pTemp==NULL||m_nAllocTempLen<=0) return -1; 
	 
	int i,nSum=0; 
	// NOTE(review): the next line is corrupted - the text between "i" and
	// "0)" (loop bound, increment, opening brace and the start of a
	// "greater than 0" test, presumably) has been lost. Restore it from the
	// original source. This first pass counts the entries to be stored.
	for(i=0;i0) nSum++; 
	} 
 
	doc.words=(SVM_WORD *)malloc(sizeof(SVM_WORD)*(nSum+12)); 
	nSum=0; 
	// NOTE(review): corrupted in the same way as the loop above. This second
	// pass copies the selected entries into doc.words.
	for(i=0;i0) 
		{ 
			// DOC feature IDs start at 1
			(doc.words[nSum]).wnum=i+1; 
			(doc.words[nSum]).weight=CDocNode::m_pTemp[i].s_dWeight; 
			nSum++; 
		} 
	} 
	// A word number of 0 terminates the list.
	(doc.words[nSum]).wnum=0; 
	doc.docnum=-1; 
	return nSum; 
} 
 
int CDocNode::ScanChineseStringWithDict(char *pPath,CWordList& wordList) 
{ 
	char *buffer=pPath; 
	int i,j,k; 
	short l,n; 
	char gram[12]; 
	//realcnt为文章中去掉停用词后剩下的总共词数 
	//nStart为一个句子在buffer中的开始位置 
	int realcnt=0,nStart=0,nNewStart=0; 
	//句子的长度 
	int nSentenceLen=0; 
	memset(m_pTemp,0,sizeof(sWeightNode)*m_nAllocTempLen); 
	CWordNode wordNode; 
	while(buffer[nStart]!='\0') 
	{ 
		nSentenceLen=ParseFile(buffer,nStart,nNewStart); 
		nStart=nNewStart; 
		if(nSentenceLen==0) continue; 
		if(m_pSentence[0]>0) //如果是一个英文单词 
		{ 
			//如果英文单词的长度大于等于2,且不是数字 
			if((nSentenceLen>=2)&&((m_pSentence[0]<'0')||(m_pSentence[0]>'9'))) 
			{ 
				if(wordList.Lookup(m_pSentence,wordNode)) 
				{ 
					m_pTemp[wordNode.m_nWordID].s_idxWord=wordNode.m_nWordID; 
					m_pTemp[wordNode.m_nWordID].s_tfi+=1; 
					m_pTemp[wordNode.m_nWordID].s_dWeight=wordNode.m_dWeight; 
					realcnt++; 
				} 
			} 
		} 
		else //如果是汉字串 
		{ 
			if(nSentenceLen%2!=0) continue; 
			//倒着扫描句子,这样分词的准确率高一些 
			i=nSentenceLen; 
			while(i>0) 
			{ 
				//最长扫描5个汉字的单词 
				if(i>10) k=10; 
				else k=i; 
				for(j=k;j>0;j=j-2) 
				{ 
					//将Gram项拷贝到gram中 
					n=0; 
					for(l=j;l>0;l--) gram[n++]=m_pSentence[i-l]; 
					gram[n]='\0'; 
					if(wordList.Lookup(gram,wordNode)) 
					{ 
						m_pTemp[wordNode.m_nWordID].s_idxWord=wordNode.m_nWordID; 
						m_pTemp[wordNode.m_nWordID].s_tfi+=1; 
						m_pTemp[wordNode.m_nWordID].s_dWeight=wordNode.m_dWeight; 
						realcnt++; 
						i=i-j+2; 
						break; 
					} 
				} 
				i=i-2; 
			} 
		} 
	} 
	//对文档向量中的每一维进行加权 
	if(realcnt>0) 
	{ 
		double sum=0; 
		for(i=0;i32&&c<=47)||(c>=58&&c<=64)||(c>=91&&c<=96)||(c>=123&&c<=127)) 
		{ 
			buffer[nFilePos]='\0'; 
			wordLen=buffer+nFilePos-p; 
			if(wordLen>2) 
			{ 
				if(bStem) theStemmer.stem(p,0,wordLen-1); 
				if(wordList.Lookup(p,wordNode)) 
				{ 
					m_pTemp[wordNode.m_nWordID].s_idxWord=wordNode.m_nWordID; 
					m_pTemp[wordNode.m_nWordID].s_tfi+=1; 
					m_pTemp[wordNode.m_nWordID].s_dWeight=wordNode.m_dWeight; 
					realcnt++; 
				} 
			} 
			p=buffer+nFilePos+1; 
		} 
		nFilePos++; 
	} 
 
	//对文档向量中的每一维进行加权 
	int i; 
	if(realcnt>0) 
	{ 
		double sum=0; 
		for(i=0;i0) 
		{ 
			bChinese=false; 
			break; 
		} 
		else 
		{ 
			bChinese=true; 
			break; 
		} 
	} 
 
	while(pBuffer[nCurrent]!='\0') 
	{ 
		bChar[0]=pBuffer[nCurrent]; 
		if(bChar[0]>127) 
		{	 
			if(!bChinese) break; 
			nCurrent++; 
			bChar[1]=pBuffer[nCurrent]; 
			//0xA1A1为全角的空格 
			if((bChar[0]!=0xA1)||(bChar[1]!=0xA1)) 
			{ 
				//如果为"的"字,或为标点符号或其它全角字母 
				if(((bChar[0]==0xB5)&&(bChar[1]==0xC4))|| 
					((bChar[0]==0xA1)&&(bChar[1]>0xA1)&&(bChar[1]<=0xFE))|| 
					((bChar[0]==0xA2)&&(bChar[1]>=0xA1)&&(bChar[1]<=0xFC))|| 
					((bChar[0]==0xA3)&&(bChar[1]>=0xA1)&&(bChar[1]<=0xFE))|| 
					((bChar[0]==0xA4)&&(bChar[1]>=0xA1)&&(bChar[1]<=0xF3))|| 
					((bChar[0]==0xA5)&&(bChar[1]>=0xA1)&&(bChar[1]<=0xF6))|| 
					((bChar[0]==0xA6)&&(bChar[1]>=0xA1)&&(bChar[1]<=0xF5))|| 
					((bChar[0]==0xA7)&&(bChar[1]>=0xA1)&&(bChar[1]<=0xF1))|| 
					((bChar[0]==0xA8)&&(bChar[1]>=0x40)&&(bChar[1]<=0xE9))|| 
					((bChar[0]==0xA9)&&(bChar[1]>=0x40)&&(bChar[1]<=0xEF))|| 
					((bChar[0]==0xAA)&&(bChar[1]==0xA5))) 
				{ 
					nCurrent++; 
					break; 
				} 
				//如果后半部分小于0x80,代表是一个错误的汉字 
				else if(bChar[1]>=0x80) 
				{ 
					m_pSentence[nSum]=pBuffer[nCurrent-1]; 
					nSum++; 
					m_pSentence[nSum]=pBuffer[nCurrent]; 
					nSum++; 
				} 
			} 
		} 
		else 
		{ 
			if(bChar[0]==' '||bChar[0]=='\r'||bChar[0]=='\n') 
			{ 
				if(!bChinese) 
				{ 
					nCurrent++; 
					break; 
				} 
			} 
			else 
			{ 
				//if(bChar[0]=='!'||bChar[0]=='?'||bChar[0]==':'|| 
				//	bChar[0]==';'||bChar[0]=='.') 
				if((bChar[0]>32&&bChar[0]<=47)||(bChar[0]>=58&&bChar[0]<=64)|| 
					(bChar[0]>=91&&bChar[0]<=96)||(bChar[0]>=123&&bChar[0]<=127)) 
				{ 
					nCurrent++; 
					break; 
				} 
				else if(bChinese) break; 
				else 
				{ 
					m_pSentence[nSum]=pBuffer[nCurrent]; 
					nSum++; 
				} 
			} 
		} 
		nCurrent++; 
	} 
	m_pSentence[nSum]='\0'; 
	nEnd=nCurrent; 
	return nSum; 
} 
 
// Re-numbers the catalogs of this list using the ids of the same-named
// catalogs in catalogList.
//
// Returns true when every catalog was found. Returns false at the first
// catalog whose name is unknown to catalogList; catalogs visited before
// that point keep their newly assigned ids.
bool CCatalogList::BuildCatalogID(CCatalogList & catalogList)
{
	for(POSITION cursor=GetFirstPosition(); cursor!=NULL; )
	{
		CCatalogNode& node=GetNext(cursor);
		const short mappedID=catalogList.GetCataIDByName(node.m_strCatalogName);
		if(mappedID<0)
			return false;
		node.m_idxCata=mappedID;
	}
	return true;
}
 
// Returns true when the NUL-terminated string p consists solely of the
// characters '0'..'9'. An empty string also yields true.
bool CDocNode::IsNumber(char *p)
{
	const char *cursor=p;
	// The terminating NUL is below '0', so it ends the scan as well.
	while(*cursor>='0'&&*cursor<='9')
		cursor++;
	return *cursor=='\0';
}
 
// Converts the SMART-format category string `line` into an array of
// category ids.
//
// `line` is read as a sequence of "<name> <number>" entries separated by
// ';'; parsing stops at the first '\r' or at the end of the string. The id
// of every recognized category name is appended to aryCataID (which is
// emptied first).
//
// Returns the number of entries in `line` whose category name could not be
// recognized.
//
// NOTE(review): the template arguments of CArray in the signature appear to
// have been lost from this copy of the file; the line is kept as found.
short CCatalogList::GetCataIDArrayFromString(char * line, CArray &aryCataID)
{
	int pos=0,id,d;
	short result=0;
	char type[MAX_PATH];

	aryCataID.RemoveAll();
	// The field width keeps a long token from overrunning type[] (MAX_PATH
	// is 260 on Windows, so at most 259 characters plus the NUL).
	while(line[pos]!='\0'&&sscanf(line+pos,"%259s %d",type,&d) != EOF)
	{
		id=GetCataIDByName(type);
		if(id>=0) aryCataID.Add(id);
		else result++;

		// Always advance to the end of the current entry. Previously an
		// unknown category left pos unchanged, so the same entry was parsed
		// again forever.
		while(line[pos]!=';'&&line[pos]!='\r'&&line[pos]!='\0')
			pos++;
		if(line[pos]!=';') break;   // '\r' or end of string: nothing follows
		// Step over the separator only; sscanf skips any blank after it, and
		// stepping by one can never move past the terminating NUL.
		pos++;
	}
	return result;
}
 
// Returns a reference to the document stored at list position `position`.
CDocNode& CCatalogNode::GetAt(POSITION position)
{
	CDocNode& doc = m_lstDocList.GetAt(position);
	return doc;
}
 
// Returns the class-wide (static) save mode.
int CCatalogList::GetSaveMode()
{
	const int nMode = CCatalogList::m_nSaveMode;
	return nMode;
}
 
 
double CDocNode::ComputeProbability(CWordList &wordlist,int n) 
{ 
	/* 
	double sum=0.0; 
	for(int i=0;i