// Source: www.pudn.com > ChineseProcessing.rar > SentSeg.cpp


// SentSeg.cpp: implementation of the CSentSeg class. 
// 
////////////////////////////////////////////////////////////////////// 
 
#include "stdafx.h" 
#include "CPT.h" 
#include "SentSeg.h" 
 
#ifdef _DEBUG 
#undef THIS_FILE 
static char THIS_FILE[]=__FILE__; 
#define new DEBUG_NEW 
#endif 
 
////////////////////////////////////////////////////////////////////// 
// Construction/Destruction 
////////////////////////////////////////////////////////////////////// 
 
// Builds a sentence segmenter for one document and one source directory.
//   pDoc    - document object; DoSentSeg()/OutputSentence() call its counter
//             methods through the stored pointer.
//   dirpath - directory that DoSentSeg() hands to FindAllFiles().
// The output file pointer starts out NULL and the sentence buffer index at 0.
CSentSeg::CSentSeg(CCPTDoc * pDoc, CString dirpath)
	: m_pDoc(pDoc),
	  m_sDir(dirpath),
	  m_fOut(NULL),
	  m_nIdx(0)
{
}
 
// Releases the output file object if one is still allocated.
CSentSeg::~CSentSeg()
{
	// delete on a null pointer is a no-op, so no explicit check is required.
	delete m_fOut;
}
 
void CSentSeg::DoSentSeg() 
{ 
	m_pDoc->ClearAllCounters(); 
	m_saFiles.RemoveAll(); 
	FindAllFiles(m_sDir, "*.htm;*.html;*.txt;"); 
 
	m_fOut = new CFile; 
	if (!(m_fOut->Open("sentence.txt", CFile::modeCreate | CFile::modeWrite))) 
	{ 
		delete m_fOut; 
		m_fOut = NULL; 
		return; 
	} 
		 
	for (int i=0; iIncNumFiles()) 
	{ 
		try 
		{ 
			ChChar cc; 
			UINT n; 
 
			CFile f; 
			f.Open(m_saFiles[i], CFile::modeRead); 
			while ((n=f.Read(cc, 1))==1) 
			{ 
				if (cc[0] >= 128) 
				{ 
					n=f.Read(cc+1, 1); 
					if (n==1 && IsChineseChar(cc)) 
					{ 
						if (m_nIdx == 1023) 
						{ 
							delete m_fOut; 
							m_fOut = NULL; 
							return; 
						} 
						m_ccBuf[m_nIdx][0]=cc[0]; 
						m_ccBuf[m_nIdx++][1]=cc[1]; 
						m_pDoc->IncNumChars(); 
						continue; 
					} 
				} 
				OutputSentence(); 
			} 
			OutputSentence(); 
			f.Close(); 
		} 
		catch (CFileException *e) {} 
	} 
	delete m_fOut; 
	m_fOut = NULL; 
} 
 
// Flushes the characters buffered in m_ccBuf to the output file as one line.
//
// Does nothing when the buffer is empty. Otherwise it bumps the document's
// sentence counter, writes two bytes per buffered character followed by "\n",
// and resets the buffer index to 0. m_fOut must be non-null and m_nIdx below
// 1024; both conditions are only ASSERTed.
void CSentSeg::OutputSentence()
{
	ASSERT(m_fOut);
	ASSERT(m_nIdx < 1024);

	if (m_nIdx != 0)
	{
		m_pDoc->IncNumSent();

		// Zero the first byte of the slot just past the last buffered
		// character; this byte is not part of what gets written below.
		*reinterpret_cast<unsigned char *>(m_ccBuf + m_nIdx) = 0;

		m_fOut->Write(m_ccBuf, m_nIdx * 2);
		m_fOut->Write("\n", 1);

		m_nIdx = 0;
	}
}
 
void CSentSeg::FindAllFiles(CString sDir, CString sPattern) 
{ 
	CFileFind finder; 
	BOOL bWorking; 
	CString sPatterns=sPattern; 
	CStringArray sPat; 
	int i; 
 
	while ((i=sPatterns.FindOneOf(",;"))!=-1) 
	{ 
		sPat.Add(sPatterns.Left(i)); 
		sPatterns = sPatterns.Right(sPatterns.GetLength() - i - 1); 
	} 
	if (sPatterns != "") 
		sPat.Add(sPatterns); 
 
	for (i=0; i