www.pudn.com > ChineseProcessing.rar > FreqCounter.cpp


// FreqCounter.cpp: implementation of the CFreqCounter class. 
// 
////////////////////////////////////////////////////////////////////// 
 
#include "stdafx.h" 
#include "CPT.h" 
#include "FreqCounter.h" 
#include  
 
#ifdef _DEBUG 
#undef THIS_FILE 
static char THIS_FILE[]=__FILE__; 
#define new DEBUG_NEW 
#endif 
 
////////////////////////////////////////////////////////////////////// 
// Construction/Destruction 
////////////////////////////////////////////////////////////////////// 
 
ChChar CFreqCounter::chars[MAXNUMCHINESECHAR]; 
 
UINT CFreqCounter::retvalue=CFreqCounter::InitChineseChars(); 
 
CFreqCounter::CFreqCounter(UINT nDepth) 
{ 
	m_nSizeArray = 0; 
	m_aucArray = NULL; 
	m_tucRoot=NULL; 
	InitCounterTree(&m_tucRoot, nDepth, 0, MAXNUMCHINESECHAR-1); 
} 
 
CFreqCounter::~CFreqCounter() 
{ 
	if (m_tucRoot) 
		ReleaseCounterTree(m_tucRoot); 
	for (UINT i=0; inRight || nDepth==0) 
		return; 
	p=new TUCounter; 
	*tucTree = p; 
	(p->ch)[0]=chars[i][0]; 
	(p->ch)[1]=chars[i][1]; 
	p->counter=0; 
	p->left=p->right=NULL; 
	InitCounterTree(&(p->left), nDepth-1, nLeft, i-1); 
	InitCounterTree(&(p->right), nDepth-1, i+1, nRight); 
} 
 
void CFreqCounter::ReleaseCounterTree(TUCounter *root) 
{ 
	if (root==NULL) 
		return; 
	ReleaseCounterTree(root->left); 
	ReleaseCounterTree(root->right); 
	delete root; 
} 
 
void CFreqCounter::AddGram(ChChar first) 
{ 
	TUCounter * p=m_tucRoot; 
	TUCounter **q=&m_tucRoot; 
 
	while (p) 
	{ 
		register int i = CompareChineseChar(first, p->ch); 
		if (i==0) 
		{ 
			(p->counter)++; 
			return; 
		} 
		else if (i<0) 
		{ 
			q=&(p->left); 
			p=p->left; 
		} 
		else 
		{ 
			q=&(p->right); 
			p=p->right; 
		} 
	} 
	p=new TUCounter; 
	p->ch[0]=first[0]; 
	p->ch[1]=first[1]; 
	p->counter=1; 
	p->left=p->right=NULL; 
	*q=p; 
} 
 
void CFreqCounter::AddGram(ChChar first, ChChar second) 
{ 
	int l=0, r=m_nSizeArray-1, j, k; 
 
	do  
	{ 
		k=(l+r)/2; 
		j=CompareChineseChar(first, m_aucArray[k].ch); 
		if (j<0) 
			r=k-1; 
		else if (j>0) 
			l=k+1; 
		else 
			break; 
	} 
	while (1); 
	// must found 
	 
	if (CFreqCounter * p=m_aucArray[k].next) 
		p->AddGram(second); 
} 
 
void CFreqCounter::AddGram(ChChar first, ChChar second, ChChar third) 
{ 
	int l=0, r=m_nSizeArray-1, j, k; 
 
	do  
	{ 
		k=(l+r)/2; 
		j=CompareChineseChar(first, m_aucArray[k].ch); 
		if (j<0) 
			r=k-1; 
		else if (j>0) 
			l=k+1; 
		else 
			break; 
	} 
	while (1); 
	// must found 
	 
	if (CFreqCounter * p=m_aucArray[k].next) 
		p->AddGram(second, third); 
} 
 
void CFreqCounter::PostFirstScan() 
{ 
	m_nSizeArray=CountUniqueGrams(m_tucRoot); 
	if (m_nSizeArray) 
	{ 
		m_aucArray = new AUCounter[m_nSizeArray]; 
		UINT i=OutputTreeToArray(0, m_tucRoot); 
	} 
	ReleaseCounterTree(m_tucRoot); 
	m_tucRoot=NULL; 
} 
 
void CFreqCounter::InitSecondScan() 
{ 
	for (UINT i=0; iPostFirstScan(); 
} 
 
void CFreqCounter::InitThirdScan() 
{ 
	for (UINT i=0; iInitSecondScan(); 
} 
 
void CFreqCounter::PostThirdScan() 
{ 
	for (UINT i=0; iPostSecondScan(); 
} 
 
UINT CFreqCounter::CountUniqueGrams(TUCounter * tucTree) 
{ 
	if (tucTree == NULL) 
		return 0; 
	UINT l=CountUniqueGrams(tucTree->left), \ 
		r=CountUniqueGrams(tucTree->right); 
 
	if (tucTree->counter) 
		return l+r+1; 
	else return l+r; 
} 
 
UINT CFreqCounter::OutputTreeToArray(UINT left, TUCounter * tuc) 
{ 
	if (tuc == NULL) 
		return 0; 
	UINT i=OutputTreeToArray(left, tuc->left); 
	if (tuc->counter) 
	{ 
		m_aucArray[left+i].ch[0]=tuc->ch[0]; 
		m_aucArray[left+i].ch[1]=tuc->ch[1]; 
		m_aucArray[left+i].counter=tuc->counter; 
		m_aucArray[left+i].next= NULL;  
		i++; 
	} 
	i += OutputTreeToArray(left+i, tuc->right); 
	return i; 
} 
 
void CFreqCounter::OutputUnigramFrequency(CFile &fout, const void * buf, size_t len) 
{ 
	AUCounter *p=new AUCounter[m_nSizeArray]; 
 
	memmove(p, m_aucArray, m_nSizeArray*sizeof(AUCounter)); 
	qsort(p, m_nSizeArray, sizeof(AUCounter), AUCounter::CompareCounterUnit); 
	for (UINT i=0; iOutputUnigramFrequency(fout, buf1, len+2); 
	} 
	delete buf1; 
} 
 
void CFreqCounter::OutputTrigramFrequency(CFile &fout, const void * buf, size_t len) 
{ 
	char* buf1=new char[len+sizeof(ChChar)]; 
	if (buf) 
		memmove(buf1, buf, len); 
 
	for (UINT i=0; iOutputBigramFrequency(fout, buf1, len+2); 
	} 
	delete buf1; 
} 
 
double AUMutual::moffset=0; 
 
void CFreqCounter::OutputAllMutual(CFile &fout) 
{ 
	UINT n=0, i, j, k; 
	int l, r, m, ret; 
	ChChar cc; 
 
	for (i=0; im_nSizeArray; 
	AUMutual *p=new AUMutual[n]; 
	for (i=0, k=0; im_nSizeArray; j++, k++) 
		{ 
			p[k].ch[0][0]=m_aucArray[i].ch[0]; 
			p[k].ch[0][1]=m_aucArray[i].ch[1]; 
			p[k].ch[1][0]=m_aucArray[i].next->m_aucArray[j].ch[0]; 
			p[k].ch[1][1]=m_aucArray[i].next->m_aucArray[j].ch[1]; 
			cc[0]=p[k].ch[1][0]; 
			cc[1]=p[k].ch[1][1]; 
			p[k].c[0]=m_aucArray[i].counter; 
			p[k].counter=m_aucArray[i].next->m_aucArray[j].counter; 
			l=0; r=m_nSizeArray-1; 
			do  
			{ 
				m=(l+r)/2; 
				ret = CompareChineseChar(cc, m_aucArray[m].ch); 
				if (ret < 0) 
					r = m-1; 
				else if (ret > 0) 
					l = m+1; 
				else  
					break; 
			} 
			while (1); 
			// must found 
			p[k].c[1]=m_aucArray[m].counter; 
			p[k].CalculateMutual(); 
		} 
		qsort(p, n, sizeof(AUMutual), AUMutual::CompareMutual); 
		for (i=0; i