// Source: www.pudn.com > IM.rar > IM.cpp


#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
 
/* One word-list entry: a wide-character string plus an unsigned count. */
typedef struct word_item{ 
	wchar_t *word; /* pointer to the entry's wide string */
	unsigned freq; /* count that freq_cmp orders on (larger values first) */
}WordItem; 
/* Per-character counters; LoadUnicText increments slot (code - 19968) for codes 19968..40869. */
int wfreqs[20902]={0}; 
/* Floating-point companion table; main hands it to freqcmt together with wfreqs, and IM divides by its entries. */
double wfreqfs[20902]={0}; 
/* Word-item table; creatdic allocates it with calloc. */
WordItem *items=NULL; 
/* NOTE(review): only referenced from commented-out code in the visible source. */
int wordcount=2000000; 
/* Set of punctuation, digit, symbol and whitespace characters; stringcmp stops its scan at any of them. */
wchar_t *puncs=L"⒈⒉⒊⒋⒌⒍⒎⒏⒐⒑ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩ*-./⑴⑵⑶⑷⑸⑹⑺⑻⑼⑽㈠㈡㈢㈣㈤㈥㈦㈧㈨㈩$‰∑§αβγ¥℃∏※±×÷□◆▲●★【】『』①②③④⑤⑥⑦⑧⑨⑩⒈⒉⒊⒋⒌⒍⒎⒏⒐⒑12345678901234567890〈〉○●△▲『』±%×~!@#$%^&*()_+|=][';,.?:\"<>{}!·#¥%……—*‘’()、——+|{}[]:“”;《》,。?\n\r\t "; 
/* Whitespace-only character set; NOTE(review): no use is visible in this source. */
wchar_t *puncs1=L"\n\t\r "; 
int freq_cmp(const void *p,const void *q)//词频排序函数 
{ 
	WordItem *wp=(WordItem*)p,  
			*wq=(WordItem*)q; 
	return (wq->freq)-(wp->freq); 
} 
/*
 * qsort comparator for arrays of wide-string pointers.
 *
 * p, q: pointers to two array elements, each element being a wchar_t*.
 * Returns the wcscmp result for the two strings the elements point at.
 */
int wcs_cmp(const void *p,const void *q)
{
	const wchar_t *const *lhs=(const wchar_t *const *)p;
	const wchar_t *const *rhs=(const wchar_t *const *)q;

	return wcscmp(*lhs,*rhs);
}
 
/*
 * Loads a file of wide characters into a newly allocated, zero-terminated
 * buffer and counts Han characters on the way.
 *
 * text:     output; on success it is set to the first character after an
 *           optional leading byte-order mark (65279). Because of that skip
 *           the pointer may be one element past the start of the
 *           allocation, so it must not be handed to free() as-is.
 * filename: path of the file, opened in binary mode (the original comment
 *           says it must be saved as Unicode from Word).
 *
 * Returns the number of characters available through text (the byte-order
 * mark is not counted), or 0 when the file cannot be opened or sized or
 * the buffer cannot be allocated. text is left untouched on failure.
 *
 * Side effect: every character with a code in 19968..40869 increments
 * wfreqs[code - 19968].
 */
int LoadUnicText(wchar_t * &text, const char*filename)
{
	FILE *in=fopen(filename,"rb");
	if(in==NULL){
		printf("Can't open file!");
		return 0;
	}

	/* File size in bytes, converted to the number of wchar_t units below. */
	long bytes=-1;
	if(fseek(in,0L,SEEK_END)==0)
		bytes=ftell(in);
	rewind(in);
	if(bytes<0){
		printf("Can't open file!");
		fclose(in);
		return 0;
	}
	int capacity=(int)((unsigned long)bytes/sizeof(wchar_t));

	/* One extra element so the terminating zero always fits. */
	wchar_t *buf=(wchar_t*)calloc((size_t)capacity+1,sizeof(wchar_t));
	if(buf==NULL){
		printf("内存分配失败!\n");
		fclose(in);
		return 0;
	}

	/* Copy the characters; the capacity bound keeps the writes inside buf
	   even if the stream delivers more than the size suggested. */
	int count=0;
	wint_t ch;
	while(count<capacity && (ch=fgetwc(in))!=WEOF){
		buf[count++]=(wchar_t)ch;
		if(ch>=19968 && ch<=40869)	/* character frequency statistics */
			wfreqs[ch-19968]++;
	}
	buf[count]=L'\0';
	fclose(in);

	/* Skip the byte-order mark (65279) that Unicode text files start with. */
	if(count>0 && buf[0]==65279){
		text=buf+1;
		count--;
	}
	else
		text=buf;

	return count;
}
 
/*
 * GetUnicStringArray: starts building an array of wide-string pointers for TEXT.
 * The visible part rejects a NULL or empty TEXT, allocates char_num+1 pointer
 * slots with calloc and stores the literal L"END!" in the last slot.
 *
 * NOTE(review): the source text of this block is damaged. The `for` line below
 * is truncated, and everything after it (string, eg_num, the final returns)
 * appears to be the tail of a different search routine whose beginning is
 * missing. The block cannot compile as it stands; restore it from the
 * original file before relying on it.
 */
wchar_t **GetUnicStringArray(wchar_t *TEXT,int char_num)// build the index 
{ 
	wchar_t **pp; 
	int i=0; 
	if(TEXT==NULL || *TEXT==L'\0') {// reject a NULL or empty source string 
		printf("GetUnicStringArray_源文本为空!\n"); 
		return NULL; 
	} 
 
	pp=(wchar_t**)calloc(char_num+1,sizeof(wchar_t*));// allocate the row (pointer) array 
	if(!pp || _msize(pp)/sizeof(wchar_t *)< (unsigned)char_num){// check the allocation 
		printf("row内存分配失败!\n"); 
		return NULL; 
	} 
	pp[char_num]=L"END!";// add an end marker in the extra slot 
 
 
	// NOTE(review): truncated line - the loop header and the start of the fused routine are missing.
	for(i=0;i0) && (wcsncmp(pp[i+1],string,wcslen(string))!=0)) 
			break;	/* matching examples are adjacent, so once one has been found 
		the loop can stop as soon as the next entry no longer matches		*/ 
	} 
	if (eg_num==0) { 
		printf("文本没有找到相应的字符(串)!"); 
		return 0; 
	} 
	fcloseall(); 
	return eg_num; 
} 
/*
 * Copies source to output while dropping part-of-speech tags and opening
 * brackets.
 *
 * At a '/' the function looks for a blank at a fixed distance and, when it
 * finds one, skips a fixed number of characters. The four patterns are tried
 * in the original order on every iteration, so more than one may fire:
 *   blank at +5 -> skip 7   ("/n]ns" class)
 *   blank at +6 -> skip 8   ("/nt]ns" class)
 *   blank at +2 -> skip 4   ("/n  " class)
 *   blank at +3 -> skip 5   ("/ns  " class)
 * A '[' is dropped last, then the current character is written with "%lc".
 *
 * source: zero-terminated wide string; NULL is accepted and does nothing.
 * output: stream the filtered characters are written to.
 * Returns NULL in every case.
 *
 * Look-aheads and skips are clamped to the end of the string, so a tag that
 * is cut short at the end of the text no longer reads or advances past the
 * terminator, and the terminator itself is never written.
 */
wchar_t *cutoff(const wchar_t *source,FILE *output)
{
	/* {offset of the expected blank, number of characters to skip} */
	static const int rules[4][2]={{5,7},{6,8},{2,4},{3,5}};

	if(!source) return NULL;

	const wchar_t *p=source;
	const wchar_t *end=source+wcslen(source);

	while(p<end){
		for(int r=0;r<4;r++){
			int blank=rules[r][0];
			int skip=rules[r][1];
			/* end-p>blank guarantees p[blank] lies before the terminator */
			if(*p==L'/' && end-p>blank && p[blank]==L' ')
				p=(end-p>=skip) ? p+skip : end;
		}
		if(p<end && *p==L'[') p++;	/* the '[' must be handled last */
		if(p>=end) break;		/* a skip consumed the rest of the text */
		fprintf(output,"%lc",(wint_t)*p);
		p++;
	}

	return NULL;
}
/*
 * Dicbulid: scans source for the pattern "<blank or '['> <one character> '/'"
 * and writes each such single character on its own line to output.
 * Returns the number of characters written, or 0 when source is NULL.
 *
 * NOTE(review): the source text after `return i;` is damaged. The lines
 * following the commented-out section (`p++;`, a closing brace,
 * `return NULL;`, another closing brace) look like the remains of a separate
 * function whose beginning was lost; `p` is not declared in this function.
 * The block cannot compile as it stands.
 */
int Dicbulid(const wchar_t *source,FILE *output)// attempt at extracting a word list directly from the text; abandoned by the author 
{ 
	int i=0,j=0; 
	wchar_t *q=(wchar_t *)source; 
//items=(WordItem*)calloc(wordcount,sizeof(WordItem)); 
//if(items==NULL)return 0; 
	if(!q) return 0; 
	while(*q){// match the " X/" pattern to build a single-character word list 
//		wchar_t *q=p; 
		if((*q==L' ' || *q==L'[')  && *(q+1) && *(q+2)==L'/'){ 
			fprintf(output,"%lc\n",*(q+1)); 
			q+=3; 
			i++;// count the entries 
		} 
		else q++; 
	} 
	return i; 
//		if (*q==L' ')  
/*		while(*(++q)!=L'/'); 
			wchar_t tmp[200]; 
			wcsncpy(tmp,p,q-p); 
			items[i].word=_wcsdup(tmp); 
			printf("%ls\n",items[i].word); 
//			fwrite(items[i].word,sizeof(wchar_t),wcslen(items[i].word)+1,output); 
		 
		p=q+2; 
		if(*p==L' ' || *p==L'\n') p++; 
		i++; 
	} 
*/ 
//	for(j=0;j=L'一') // truncated line; the remaining note says L'一' is the lowest Han character code and serves as the threshold 
			p++; 
	}	 
	return NULL; 
} 
/*
 * creatdic: intended to build a frequency-sorted word list from a text that
 * holds one entry per line.
 * The visible part counts the L'\n' characters in source, prints that count,
 * allocates `items` with one WordItem per line, and then walks the text line
 * by line, overwriting each L'\n' with L'\0' (it writes through source even
 * though the parameter is declared const).
 *
 * NOTE(review): the inner scan `while(*(++q)!=L'\n');` has no check for the
 * terminating zero, so a last line without a newline is scanned past the end.
 * NOTE(review): the source text is damaged from the `for(j=0;...` line on.
 * That line is truncated, and what follows (pp, string, eg_num) appears to be
 * the tail of a different search routine. The block cannot compile as it
 * stands.
 */
int creatdic(const wchar_t *source,FILE *output)// build a frequency-sorted word list from a file of bare entries 
{	 
	wchar_t *p=(wchar_t *)source; 
	wchar_t *q=(wchar_t *)source; 
	int Wordcount=0,i=0; 
	if(!p) return NULL; 
	while(*q){// count the entries (one per newline) 
		if(*q==L'\n') Wordcount++; 
		q++; 
	} 
	printf("词条数[%d]",Wordcount); 
	items=(WordItem*)calloc(Wordcount,sizeof(WordItem)); 
	if(items==NULL) return 0; 
	while(*p){ 
		int len=0,j=0,tag=0; 
		wchar_t *words=p; 
		q=p; 
		while(*(++q)!=L'\n'); 
		len=q-p;p=q+1; 
		words[len]=L'\0';		 
		// NOTE(review): truncated line - the loop header and the start of the fused routine are missing.
		for(j=0;j0) && (wcsncmp(pp[i+1],string,wcslen(string))!=0)) 
			break;	/* matching examples are adjacent, so once one has been found 
		the loop can stop as soon as the next entry no longer matches		*/ 
	} 
	if (eg_num==0) { 
		printf("文本没有找到相应的字符(串)!"); 
		return 0; 
	} 
	fcloseall(); 
	return eg_num; 
} 
/*
 * Returns the length of the common prefix of two wide strings, i.e. the
 * index of the first position where they differ.
 *
 * string1, string2: zero-terminated wide strings.
 * Returns -1 when either argument is NULL. For identical strings the result
 * is their length.
 *
 * The scan stops at the terminator: the previous loop tested the pointers
 * instead of the characters and therefore ran past the end of two identical
 * strings.
 */
int stringcmp2(const wchar_t *string1,const wchar_t *string2)
{
	if(!string1 || !string2) return -1;

	const wchar_t *p=string1;
	const wchar_t *q=string2;
	while(*p!=L'\0' && *p==*q){
		p++;
		q++;
	}
	return (int)(p-string1);
}
/*
 * Returns the index at which the common prefix of two wide strings ends,
 * where the prefix also ends at the first character contained in puncs.
 *
 * string1, string2: zero-terminated wide strings.
 * Returns -1 when either argument is NULL.
 *
 * The scan always terminates: wcschr treats the terminating zero of puncs as
 * part of the set, so reaching the end of string1 counts as a match.
 */
int stringcmp (const wchar_t *string1,const wchar_t *string2)
{
	if(string1==NULL || string2==NULL) return -1;

	int idx=0;
	for(;;idx++){
		wchar_t current=string1[idx];
		if(current!=string2[idx]) break;	/* first difference */
		if(wcschr(puncs,current)!=NULL) break;	/* punctuation or end of string */
	}
	return idx;
}
/*
 * IM: mutual-information routine (per the original comment).
 * From the visible fragments: for a run of array entries it keeps a repeat
 * count (tmplen) and a shared length (sl2), forms
 * (tmplen+1)*sl2/array_num, divides that by wfreqfs[ch-19968] for characters
 * in the Han range, and writes tmplen+1 and the base-2 logarithm of the
 * result to output.
 *
 * NOTE(review): the source text is damaged. The outer `for(int i=0;...` line
 * and the inner `for(int j=0;...` line are both truncated, so the loop
 * bounds, the declarations of tmplen/sl2/samelen/ch and the way entries are
 * compared are not visible. The block cannot compile as it stands.
 */
void IM(wchar_t**array,int array_num,FILE *output)	 
{// mutual information function 
	wchar_t **p=(wchar_t**)array; 
	// NOTE(review): truncated line - loop header and the start of the loop body are missing.
	for(int i=0;i=2){ 
				i++; 
				tmplen++; 
				sl2=samelen; 
			} 
			else break; 
		} 
		if(tmplen>0){// if the occurrence count is greater than 1 
			double tmpfreq=double(tmplen+1)*sl2/array_num;// holds the temporary string's value that is divided by each character's frequency below 
			// NOTE(review): truncated line - the inner loop header is missing; the surviving condition restricts ch to the Han range.
			for(int j=0;j19967 && ch<40870)// restrict to the Han character range 
					tmpfreq=tmpfreq/(wfreqfs[ch-19968]); 
				 
				//assert(tmpfreq); 
			} 
			fprintf(output,"\t%d\t%lf\r",tmplen+1,log(tmpfreq)/log(2)); 
		} 
	} 
} 
/*
 * chars: per the original comment, prints every character together with its
 * code and frequency.
 *
 * NOTE(review): the source text is damaged. The `for` line is truncated, and
 * the statement after it (hzfreq[i]=(float)hzarray[i]/CorpusSize) uses names
 * that are not parameters of this function; it is presumably the tail of
 * freqcmt, which main calls but whose definition is not visible. The block
 * cannot compile as it stands.
 */
void chars(wchar_t **array,int array_num,FILE *output)	 
{// print every character with its code and frequency 
	wchar_t **p=(wchar_t**)array; 
	// NOTE(review): truncated line - the loop body of chars and the start of the fused routine are missing.
	for(int i=0;i0) 
			hzfreq[i]=(float)hzarray[i]/CorpusSize; 
	} 
} 
 
/*
 * Program entry point: loads the corpus file, derives the character
 * frequencies, builds the string index and writes the IM results to
 * "im__re.txt".
 *
 * Returns 0 on success and 1 when the corpus cannot be loaded, the index
 * cannot be built or the result file cannot be opened. Each of these
 * failures previously fell through into the processing steps.
 */
int main()
{
	char filename[]="199801unic.txt";	/* corpus file, read from the working directory */
	wchar_t **array;
	wchar_t *text;				/* assigned by LoadUnicText through its reference parameter */
	int char_num=0;
	FILE *output;

	setlocale(LC_ALL,"chs");

	char_num=LoadUnicText(text,filename);
	if(char_num<=0)				/* LoadUnicText has already reported the reason */
		return 1;
	printf("从文本中成功加载字符%d个。\n",char_num);

	printf("正在处理中!\n");
	freqcmt(wfreqfs,wfreqs,20902,char_num);

	array=GetUnicStringArray(text,char_num);
	if(array==NULL)				/* GetUnicStringArray has already reported the reason */
		return 1;
	printf("排序结束!\n");

	output=fopen("im__re.txt","wt");
	if(output==NULL){
		printf("Can't open file!");
		return 1;
	}
	IM(array,char_num,output);

	printf("处理完毕!");
	fclose(output);
	return 0;
}