// Source listing: www.pudn.com > IM.rar > IM.cpp
/*
 * IM.cpp - mutual-information style scoring of repeated character strings in
 * a UTF-16 ("Unicode") text corpus.
 *
 * Pipeline (see main): load the text and count CJK characters, turn the counts
 * into relative frequencies, build a sorted array of pointers to every text
 * position, then write one score line per run of neighbouring entries that
 * share a prefix.
 *
 * NOTE(review): this listing was recovered from a web scrape that collapsed
 * the file onto a few lines and deleted every span between a '<' and the
 * following '>'.  The names of the eight original #include headers and parts
 * of several functions were lost.  Code that had to be reconstructed is marked
 * "NOTE(review)"; fragments that could not be reconstructed are preserved
 * verbatim inside "#if 0" sections so nothing that survived is thrown away.
 */
#include <limits.h>
#include <locale.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <wchar.h>

/* CJK range counted by the program: code points 19968..40869. */
enum {
    CJK_FIRST = 19968,
    CJK_LAST  = 40869,
    CJK_COUNT = 20902
};

typedef struct word_item {
    wchar_t *word;
    unsigned freq;
} WordItem;

int wfreqs[CJK_COUNT] = {0};      /* raw occurrence count per CJK character */
double wfreqfs[CJK_COUNT] = {0};  /* relative frequency per CJK character   */
WordItem *items = NULL;
int wordcount = 2000000;

/* Characters that stop prefix matching in stringcmp(). */
const wchar_t *puncs=L"⒈⒉⒊⒋⒌⒍⒎⒏⒐⒑ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩ*-./⑴⑵⑶⑷⑸⑹⑺⑻⑼⑽㈠㈡㈢㈣㈤㈥㈦㈧㈨㈩$‰∑§αβγ¥℃∏※±×÷□◆▲●★【】『』①②③④⑤⑥⑦⑧⑨⑩⒈⒉⒊⒋⒌⒍⒎⒏⒐⒑12345678901234567890〈〉○●△▲『』±%×~!@#$%^&*()_+|=][';,.?:\"<>{}!·#¥%……—*‘’()、——+|{}[]:“”;《》,。?\n\r\t ";
const wchar_t *puncs1=L"\n\t\r ";

/*
 * qsort comparator: orders WordItem entries by descending frequency.
 * Uses explicit comparisons; subtracting the unsigned counts and converting
 * to int gives the wrong sign when the difference does not fit in an int.
 */
int freq_cmp(const void *p,const void *q)
{
    const WordItem *wp = (const WordItem *)p;
    const WordItem *wq = (const WordItem *)q;
    return (wq->freq > wp->freq) - (wq->freq < wp->freq);
}

/* qsort comparator: orders wchar_t* elements by wcscmp of the strings they point to. */
int wcs_cmp(const void *p,const void *q)
{
    return wcscmp(*(const wchar_t *const *)p, *(const wchar_t *const *)q);
}

/*
 * Reads a whole file of wide characters into a freshly allocated,
 * NUL-terminated buffer and counts every character in the CJK range into
 * wfreqs[].  The original comment says the file must be saved as "unicode"
 * (UTF-16) by Word.
 *
 * text     - out: receives the buffer (release with free()); left untouched
 *            on failure.
 * filename - path of the file, opened in binary mode.
 * returns  - number of characters stored, not counting a leading byte-order
 *            mark (65279 / U+FEFF), or 0 on failure.
 */
int LoadUnicText(wchar_t * &text, const char*filename)
{
    FILE *in = fopen(filename, "rb");
    if (in == NULL) {
        printf("Can't open file!");
        return 0;
    }

    /* The file size in bytes bounds the number of wide characters. */
    long byte_len = -1;
    if (fseek(in, 0L, SEEK_END) == 0)
        byte_len = ftell(in);
    if (byte_len < 0) {
        printf("Can't determine file size!");
        fclose(in);
        return 0;
    }
    rewind(in);

    size_t capacity = (size_t)byte_len / sizeof(wchar_t);
    if (capacity >= (size_t)INT_MAX) {   /* the count is returned as int */
        printf("内存分配失败!\n");
        fclose(in);
        return 0;
    }

    /* +1 keeps room for the terminator written after the last character. */
    wchar_t *buf = (wchar_t *)calloc(capacity + 1, sizeof(wchar_t));
    if (buf == NULL) {
        printf("内存分配失败!\n");
        fclose(in);
        return 0;
    }

    size_t count = 0;
    int at_start = 1;
    wint_t ch;   /* wint_t, so that WEOF is distinguishable from data */
    while (count < capacity && (ch = fgetwc(in)) != WEOF) {
        if (at_start) {
            at_start = 0;
            if (ch == 65279)   /* skip the byte-order mark at the start */
                continue;
        }
        buf[count++] = (wchar_t)ch;
        if (ch >= CJK_FIRST && ch <= CJK_LAST)   /* character frequency statistics */
            wfreqs[ch - CJK_FIRST]++;
    }
    buf[count] = L'\0';   /* make sure the text is NUL-terminated */

    fclose(in);
    text = buf;
    return (int)count;
}

/*
 * Builds the index: an array with one pointer per text position, sorted with
 * wcs_cmp, plus one trailing sentinel entry pointing at the string "END!".
 *
 * NOTE(review): everything after "for(i=0;i" was lost in extraction.  The
 * loop body, the qsort call and the return were reconstructed from the
 * allocation size, the wcs_cmp comparator and the "sorting finished" message
 * main prints right after this call - confirm against the original file.
 *
 * returns - array of char_num + 1 pointers (release with free()), or NULL.
 */
wchar_t **GetUnicStringArray(wchar_t *TEXT,int char_num)
{
    static wchar_t end_mark[] = L"END!";

    if (TEXT == NULL || *TEXT == L'\0' || char_num <= 0) {
        printf("GetUnicStringArray_源文本为空!\n");
        return NULL;
    }

    wchar_t **pp = (wchar_t **)calloc((size_t)char_num + 1, sizeof(wchar_t *));
    if (pp == NULL) {
        printf("row内存分配失败!\n");
        return NULL;
    }

    pp[char_num] = end_mark;   /* trailing end marker */
    for (int i = 0; i < char_num; i++)
        pp[i] = TEXT + i;
    qsort(pp, (size_t)char_num, sizeof(wchar_t *), wcs_cmp);
    return pp;
}

#if 0
/*
 * [extraction-damaged fragment, preserved verbatim]
 * Tail of a search routine whose name, signature and loop head were lost.
 * The same tail appears twice in the scraped listing (once after
 * GetUnicStringArray and once after creatdic).
 */
        /* ...text lost... */ 0) && (wcsncmp(pp[i+1],string,wcslen(string))!=0))
            break;
        /* Matches are adjacent in the sorted array, so once an example has
           been found the loop can stop at the first entry that no longer
           matches. */
    }
    if (eg_num==0) {
        printf("文本没有找到相应的字符(串)!");
        return 0;
    }
    fcloseall();
    return eg_num;
}
#endif

/* True when p[offset] is a space and the string does not end before it. */
static int tag_ends_at(const wchar_t *p, int offset)
{
    for (int k = 0; k < offset; k++) {
        if (p[k] == L'\0')
            return 0;
    }
    return p[offset] == L' ';
}

/* Advances p by up to n characters, never stepping over the terminator. */
static const wchar_t *skip_chars(const wchar_t *p, int n)
{
    while (n-- > 0 && *p != L'\0')
        p++;
    return p;
}

/*
 * Copies source to output with bracket and part-of-speech tags removed.
 * A '/' followed by a space at offset 5, 6, 2 or 3 is treated as a tag and
 * skipped together with two further characters; a '[' is dropped.  The checks
 * run in that order on the (possibly already advanced) position, as in the
 * original.  All look-ahead is bounded by the terminator.
 *
 * returns - always NULL.
 */
wchar_t *cutoff(const wchar_t *source,FILE *output)
{
    const wchar_t *p = source;
    if (p == NULL || output == NULL)
        return NULL;

    while (*p != L'\0') {
        if (*p == L'/' && tag_ends_at(p, 5)) p = skip_chars(p, 7);   /* "/n]ns" style  */
        if (*p == L'/' && tag_ends_at(p, 6)) p = skip_chars(p, 8);   /* "/nt]ns" style */
        if (*p == L'/' && tag_ends_at(p, 2)) p = skip_chars(p, 4);   /* "/n " style    */
        if (*p == L'/' && tag_ends_at(p, 3)) p = skip_chars(p, 5);   /* "/ns " style   */
        if (*p == L'[')   /* dropping '[' has to be the last step */
            p++;
        if (*p == L'\0')  /* a skip reached the end of the text */
            break;
        fprintf(output, "%lc", (wint_t)*p);
        p++;
    }
    return NULL;
}

/*
 * Writes every single-character word of a tagged text to output, one per
 * line: a character that follows ' ' or '[' and is directly followed by '/'.
 * The original author's comment marks this as a failed attempt at extracting
 * a word list straight from the text.
 *
 * returns - number of characters written, 0 when source or output is NULL.
 */
int Dicbulid(const wchar_t *source,FILE *output)
{
    const wchar_t *q = source;
    int found = 0;

    if (q == NULL || output == NULL)
        return 0;

    while (*q != L'\0') {
        /* " X/" pattern; the && chain stops at the terminator */
        if ((*q == L' ' || *q == L'[') && q[1] != L'\0' && q[2] == L'/') {
            fprintf(output, "%lc\n", (wint_t)q[1]);
            q += 3;
            found++;
        } else {
            q++;
        }
    }
    return found;
}

#if 0
/*
 * [extraction-damaged fragment, preserved verbatim]
 * Tail of a function whose name, signature and beginning were lost.
 */
        /* ...text lost... */ =L'一')   /* U+4E00, the smallest Chinese character code, used as threshold */
            p++;
    }
    return NULL;
}

/*
 * [extraction-damaged fragment, preserved verbatim]
 * Builds a frequency-sorted word list from a file of plain entries (original
 * comment).  Everything after "for(j=0;j" was lost.
 */
int creatdic(const wchar_t *source,FILE *output)
{
    wchar_t *p=(wchar_t *)source;
    wchar_t *q=(wchar_t *)source;
    int Wordcount=0,i=0;
    if(!p) return NULL;
    while(*q){   /* count the entries */
        if(*q==L'\n') Wordcount++;
        q++;
    }
    printf("词条数[%d]",Wordcount);
    items=(WordItem*)calloc(Wordcount,sizeof(WordItem));
    if(items==NULL) return 0;
    while(*p){
        int len=0,j=0,tag=0;
        wchar_t *words=p;
        q=p;
        while(*(++q)!=L'\n');
        len=q-p;p=q+1;
        words[len]=L'\0';
        for(j=0;j /* ...text lost... */
#endif

/*
 * Length of the common prefix of string1 and string2.
 *
 * returns - index of the first differing character (the length of the shorter
 *           string when it is a prefix of the other, the full length when the
 *           strings are equal), or -1 when either argument is NULL.
 */
int stringcmp2(const wchar_t *string1,const wchar_t *string2)
{
    if (string1 == NULL || string2 == NULL)
        return -1;

    const wchar_t *p = string1;
    const wchar_t *q = string2;
    /* Test the characters, not the pointers: stop at the terminator. */
    while (*p != L'\0' && *p == *q) {
        p++;
        q++;
    }
    return (int)(p - string1);
}

/*
 * Like stringcmp2, but the common prefix also ends at the first character of
 * string1 that is listed in puncs.
 *
 * returns - length of that prefix, or -1 when either argument is NULL.
 */
int stringcmp (const wchar_t *string1,const wchar_t *string2)
{
    if (string1 == NULL || string2 == NULL)
        return -1;

    const wchar_t *p = string1;
    const wchar_t *q = string2;
    while (*p != L'\0' && *p == *q && wcschr(puncs, *p) == NULL) {
        p++;
        q++;
    }
    return (int)(p - string1);
}

/*
 * Mutual-information function (original comment).  Walks the sorted index;
 * for every run of neighbouring entries sharing a prefix of at least two
 * characters it writes the prefix, the run size and log2 of a score.  The
 * score starts as (run size * prefix length / array_num) and is divided by
 * the relative frequency of each CJK character of the prefix.
 *
 * array     - sorted index from GetUnicStringArray (array_num entries plus
 *             the sentinel).
 * array_num - number of entries.
 * output    - stream receiving "<prefix>\t<run size>\t<score>\r" records.
 *
 * NOTE(review): two spans of this function were lost in extraction: the
 * head of the outer loop up to the ">=2" test, and the head of the inner
 * loop up to the "> 19967" test.  They are reconstructed here (use of
 * stringcmp rather than stringcmp2, the loop bounds, and printing the prefix
 * characters before the tab are assumptions) - confirm against the original.
 */
void IM(wchar_t**array,int array_num,FILE *output)
{
    if (array == NULL || output == NULL || array_num <= 0)
        return;

    for (int i = 0; i < array_num - 1; i++) {
        int tmplen = 0;   /* neighbours that extend the current run        */
        int sl2 = 0;      /* shared length of the last pair that qualified */

        while (i < array_num - 1) {
            int samelen = stringcmp(array[i], array[i + 1]);
            if (samelen >= 2) {
                i++;
                tmplen++;
                sl2 = samelen;
            } else {
                break;
            }
        }

        if (tmplen > 0) {   /* the string occurs more than once */
            double tmpfreq = (double)(tmplen + 1) * sl2 / array_num;
            for (int j = 0; j < sl2; j++) {
                wchar_t ch = array[i][j];
                fprintf(output, "%lc", (wint_t)ch);
                /* only CJK characters have a frequency entry */
                if (ch > 19967 && ch < 40870 && wfreqfs[ch - CJK_FIRST] > 0)
                    tmpfreq = tmpfreq / (wfreqfs[ch - CJK_FIRST]);
            }
            fprintf(output, "\t%d\t%lf\r", tmplen + 1, log(tmpfreq) / log(2.0));
        }
    }
}

#if 0
/*
 * [extraction-damaged fragment, preserved verbatim]
 * Prints every character with its code and frequency (original comment).
 * The body after "for(int i=0;i" was lost.
 */
void chars(wchar_t **array,int array_num,FILE *output)
{
    wchar_t **p=(wchar_t**)array;
    for(int i=0;i /* ...text lost... */
#endif

/*
 * Converts raw counts into relative frequencies: for every entry with a
 * positive count, hzfreq[i] = hzarray[i] / CorpusSize.
 *
 * NOTE(review): the signature and loop head were lost in extraction; only
 * "... 0) hzfreq[i]=(float)hzarray[i]/CorpusSize;" survived.  Parameter types
 * and order are taken from the call in main,
 * freqcmt(wfreqfs, wfreqs, 20902, char_num); the name of the third parameter
 * is a guess.
 */
void freqcmt(double *hzfreq, int *hzarray, int hznum, int CorpusSize)
{
    if (hzfreq == NULL || hzarray == NULL || CorpusSize <= 0)
        return;   /* nothing to normalise by */

    for (int i = 0; i < hznum; i++) {
        if (hzarray[i] > 0)
            hzfreq[i] = (float)hzarray[i] / CorpusSize;
    }
}

/*
 * Loads "199801unic.txt", computes character frequencies, builds the sorted
 * index and writes the scores to "im__re.txt".
 * (The original also carried a disabled dump of per-character statistics to
 * "hanzi.txt" / "char.txt" that relied on chars().)
 *
 * returns - 0 on success, 1 when loading, indexing or opening the output fails.
 */
int main()
{
    char filename[] = "199801unic.txt";
    wchar_t *text = NULL;

    setlocale(LC_ALL, "chs");

    int char_num = LoadUnicText(text, filename);
    if (char_num <= 0 || text == NULL) {
        free(text);
        return 1;   /* LoadUnicText already reported the reason */
    }
    printf("从文本中成功加载字符%d个。\n", char_num);
    printf("正在处理中!\n");

    freqcmt(wfreqfs, wfreqs, CJK_COUNT, char_num);

    wchar_t **array = GetUnicStringArray(text, char_num);
    if (array == NULL) {
        free(text);
        return 1;
    }
    printf("排序结束!\n");

    FILE *output = fopen("im__re.txt", "wt");
    if (output == NULL) {
        printf("Can't open file!");
        free(array);
        free(text);
        return 1;
    }

    IM(array, char_num, output);
    printf("处理完毕!");

    fclose(output);
    free(array);
    free(text);
    return 0;
}