// www.pudn.com > classifier.rar > fileSpliter.cs
using System;
using System.Data.OleDb;
using System.IO;
using System.Windows.Forms;
using System.Collections;
using System.Data;
using System.Drawing;
using System.ComponentModel;
namespace classifier.split
{
/// <summary>
/// Summary description for spliter.
/// Class that performs the word-segmentation operation.
/// </summary>
public class fileSpliter
{
// Database connection handed to the constructor.
private OleDbConnection sqlConnect;

// Character classes.
private char[] endChar;   // terminator characters
private char[] missChar;  // ignored characters
private char[] word;      // English letters
private char[] number;    // digits

// Working lists used while segmenting: candidate words, their positions and
// their frequencies for the left (forward) and right (backward) scans, plus
// the final list of tokens.
private ArrayList leftSplit;
private ArrayList rightSplit;
private ArrayList splitResult;
private ArrayList leftPos;
private ArrayList rightPos;
private ArrayList leftFreq;
private ArrayList rightFreq;

// In-memory copy of the database tables ("word" and "static").
private DataSet ds;
private int denominator;

// Files to process and the type name written to new result rows.
private string[] filePath;
private string typeName;

// Text box that receives progress messages.
private TextBox msgBox;
/// <summary>
/// Parameterless constructor; it performs no initialisation.
/// </summary>
public fileSpliter()
{
    // TODO: add constructor logic here.
}
/// <summary>
/// Creates a splitter for the given files and loads the data it needs from the database:
/// the terminator characters (table endChar), the dictionary (table words, cached as "word")
/// and the previously learned statistics (table learnResult, cached as "static").
/// </summary>
/// <param name="fPath">Paths of the files to process.</param>
/// <param name="sqlCon">
/// OLE DB connection used for all queries. It is opened if it is closed and is always
/// closed again before the constructor returns, even when loading fails.
/// </param>
/// <param name="deno">Value stored in the denominator field.</param>
/// <param name="type">Type name written to the typeName column of newly created result rows.</param>
/// <param name="b">Text box that receives progress messages.</param>
public fileSpliter(string[] fPath,OleDbConnection sqlCon,int deno,string type,TextBox b)
{
    msgBox=b;
    // NOTE(review): "aaa" looks like a leftover debug placeholder; kept so the visible output is unchanged.
    msgBox.Text="aaa";
    typeName=type;
    sqlConnect=sqlCon;
    filePath=fPath;
    denominator=deno;

    // Fixed character classes. A space is not treated as a terminator.
    missChar=new char[]{'\r','\n',' '};
    number="0123456789".ToCharArray();
    word="abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ".ToCharArray();

    // Working lists for the forward (left) and backward (right) scans.
    leftSplit=new ArrayList();
    leftPos=new ArrayList();
    leftFreq=new ArrayList();
    rightSplit=new ArrayList();
    rightPos=new ArrayList();
    rightFreq=new ArrayList();
    splitResult=new ArrayList();

    ds=new DataSet();

    if(sqlConnect.State==ConnectionState.Closed)
    {
        sqlConnect.Open();
    }
    try
    {
        // Read the terminator characters from the database into the terminator array.
        System.Text.StringBuilder terminators=new System.Text.StringBuilder();
        using(OleDbDataAdapter endCharAdapter=new OleDbDataAdapter("select * from endChar",sqlConnect))
        {
            DataTable dt=new DataTable();
            endCharAdapter.Fill(dt);
            foreach(DataRow dr in dt.Rows)
            {
                terminators.Append(dr["ending"].ToString());
            }
        }
        endChar=terminators.ToString().ToCharArray();

        // Fill the data set: "word" holds the dictionary (the original comment attributes it
        // to Peking University) and "static" holds the word-frequency statistics.
        using(OleDbDataAdapter sqlWordAd=new OleDbDataAdapter("select * from words",sqlConnect))
        {
            sqlWordAd.Fill(ds,"word");
        }
        using(OleDbDataAdapter sqlStaticAd=new OleDbDataAdapter("select * from learnResult",sqlConnect))
        {
            sqlStaticAd.Fill(ds,"static");
        }
    }
    finally
    {
        // Close the connection on every path so a failed load does not leave it open.
        sqlConnect.Close();
    }
}
/// <summary>
/// Reads text until a terminator is met; English words and numbers also count as terminators.
/// The characters are classified with a small state machine: state 0 = start, 1 = digits,
/// 2 = digits after a decimal point, 5 = English letters, 7 = Chinese characters.
/// Numbers and English words are added to splitResult directly; runs of Chinese characters
/// are passed to doSplit. After each file saveResult is called, and saveDB is called at the end.
/// </summary>
public void readToEnd()
{
// The recognition process is implemented as a finite-state automaton.
int status=0;
// (original note) lastChar is the previous character, used to recognise numbers and words.
// No such variable exists in the visible code.
char currentChar;
string result="";
// (original note) lastType records whether the previous character was a digit, a letter or a terminator.
// No such variable exists in the visible code.
int i=0;
//
StreamReader sr;
// Process every file in the file list.
// NOTE(review): the next line is truncated in this copy of the source (text between '<' and '>'
// was lost). It presumably held the rest of the for header, the creation of the StreamReader
// and a while loop that reads until the end of the file - confirm against the original file.
for(int ss=0;ss= 0)
{
// Current character.
currentChar=(char)sr.Read();
if(status==0)
{
if(isNumber(currentChar))
{
// Digit.
result=currentChar.ToString();
status=1;
}
else if(isWord(currentChar))
{
result=currentChar.ToString();
status=5;
}
else if(isChinese(currentChar) )
{
// Chinese character.
result=currentChar.ToString();
status=7;
}
else if(isEnd(currentChar))
{
result="";
status=0;
}
else if(isSpace(currentChar))
{
// Space: the state does not change, keep reading.
result="";
status=0;
}
else
{
result="";
status=0;
}
}
else if(status==1)
{
if(isNumber(currentChar))
{
result+=currentChar;
// State unchanged.
}
else if(currentChar=='.')
{
result+=currentChar;
status=2;
}
else if(isWord(currentChar))
{
splitResult.Add(result);
result=currentChar.ToString();
status=5;
}
else if(isChinese(currentChar))
{
splitResult.Add(result);
result=currentChar.ToString();
status=7;
}
// else if(isSpace(currentChar))
// {
// splitResult.Add(result);
// result="";
// status=0;
// }
// else if(isEnd(currentChar))
// {
// splitResult.Add(result);
// result="";
// status=0;
// }
else
{
splitResult.Add(result);
result="";
status=0;
}
}
else if(status==2)
{
if(isNumber(currentChar))
{
result+=currentChar;
// State unchanged.
}
else if(isWord(currentChar))
{
splitResult.Add(result);
result=currentChar.ToString();
status=5;
}
else if(isChinese(currentChar))
{
splitResult.Add(result);
result=currentChar.ToString();
status=7;
}
// else if(isSpace(currentChar))
// {
// splitResult.Add(result);
// result="";
// status=0;
// }
// else if(isEnd(currentChar))
// {
// splitResult.Add(result);
// result="";
// status=0;
// }
else
{
// Any other case.
splitResult.Add(result);
result="";
status=0;
}
}
else if(status==5)
{
if(isWord(currentChar))
{
result+=currentChar;
// status unchanged.
}
else if(isNumber(currentChar))
{
splitResult.Add(result);
result=currentChar.ToString();
status=1;
}
else if(isChinese(currentChar))
{
splitResult.Add(result);
result=currentChar.ToString();
status=7;
}
// else if(isSpace(currentChar))
// {
// splitResult.Add(result);
// result="";
// status=0;
// }
// else if(isEnd(currentChar))
// {
// splitResult.Add(result);
// result="";
// status=0;
// }
else
{
// State 6: an English word has been recognised; reset the state to 0.
splitResult.Add(result);
result=currentChar.ToString();
status=0;
}
}
else if(status==7)
{
if(isChinese(currentChar))
{
// Chinese character.
result+=currentChar;
}
else if(isNumber(currentChar))
{
doSplit(result,sqlConnect);
result=currentChar.ToString();
status=1;
}
else if(isWord(currentChar))
{
doSplit(result,sqlConnect);
result=currentChar.ToString();
status=5;
}
else if(isSpace(currentChar))
{
// Ignore spaces.
}
// else if(isEnd(currentChar))
// {
// // Possibly Chinese punctuation.
// doSplit(result,sqlConnect);
// result="";
// status=0;
// }
else
{
doSplit(result,sqlConnect);
result="";
status=0;
}
}
else
{
MessageBox.Show("出现意外状态!");
}
}
// Handle whatever is pending at the end of the input; only English, Chinese and numbers are handled.
if(status==1)
{
// Number.
splitResult.Add(result);
}
else if(status==2)
{
// Number with a decimal point.
splitResult.Add(result);
}
else if(status==5)
{
// English word.
splitResult.Add(result);
}
else if(status==7)
{
// Chinese characters.
doSplit(result,sqlConnect);
}
else
{
}
// Store the segmentation result in the result table.
saveResult();
msgBox.Text+="文件:"+filePath[ss]+" 处理完毕\n";
}
// Write the results back to the database.
saveDB();
}
#region 判断类型函数
/// <summary>
/// Tells whether the character is the ASCII space character.
/// </summary>
/// <param name="currentChar">Character to test.</param>
/// <returns>true for ' ', false for anything else.</returns>
private bool isSpace(char currentChar)
{
    return currentChar==' ';
}
/// <summary>
/// Tells whether the character lies in the range U+4E00 to U+9FA5 (inclusive).
/// </summary>
/// <param name="currentChar">Character to test.</param>
/// <returns>true when the code unit is inside the range, otherwise false.</returns>
private bool isChinese(char currentChar)
{
    bool belowRange=currentChar<0x4e00;
    bool aboveRange=currentChar>0x9fa5;
    return !(belowRange || aboveRange);
}
/// <summary>
/// Tells whether the character is an ASCII digit ('0' to '9').
/// </summary>
/// <param name="currentChar">Character to test.</param>
/// <returns>true for an ASCII digit, otherwise false.</returns>
private bool isNumber(char currentChar)
{
    if(currentChar<'0')
    {
        return false;
    }
    return currentChar<='9';
}
/// <summary>
/// Tells whether the character is an ASCII letter (a-z or A-Z).
/// </summary>
/// <param name="currentChar">Character to test.</param>
/// <returns>true for an ASCII letter, otherwise false.</returns>
private bool isWord(char currentChar)
{
    bool isLower=currentChar>='a' && currentChar<='z';
    bool isUpper=currentChar>='A' && currentChar<='Z';
    return isLower || isUpper;
}
// Checks whether a character is a terminator.
// NOTE(review): this copy of the source is truncated wherever a '<' ... '>' pair occurred.
// The line after the opening brace fuses the loop header of isEnd with a later while condition
// (presumably "while(currentPos>0)" inside doSplit); the rest of isEnd, the header of doSplit
// and its forward scan are missing. Everything down to the "forward search" loop near the end
// appears to belong to doSplit, and the tail appears to be the body of foreCheckSub - confirm
// against the original file before changing any of this code.
private bool isEnd(char currentChar)
{
for(int i=0;i0)
{
// currentPos is the length of the remaining string.
// Backward scan: take up to 4 trailing characters, let backCheckSub shorten them to a
// candidate, then move currentPos back by the candidate's length and record the position.
if(currentPos<=4)
{
subStr=sourceLine.Substring(0,currentPos);
subStr=backCheckSub(subStr);
currentPos-=subStr.Length;
rightPos.Add(currentPos);
}
else
{
subStr=sourceLine.Substring(currentPos-4,4);
subStr=backCheckSub(subStr);
currentPos-=subStr.Length;
rightPos.Add(currentPos);
}
rightSplit.Add(subStr);
}
// Compute the combined probability density.
int i,j,p;
// Hold the maximum probability of the forward and backward scans.
float pRight=1,pLeft=1;
// NOTE(review): this local hides the denominator field and is 0; in the visible code it is
// used as the divisor below without being reassigned - verify against the original file.
int denominator=0;
int lastPosI=0,lastPosJ=rightPos.Count-1;
i=1;
j=rightPos.Count-2;
// NOTE(review): the loop condition below is truncated in this copy.
while(i=0)
{
int ll=Convert.ToInt32(leftPos[i]);
int rr=Convert.ToInt32(rightPos[j]);
if(ll>rr)
{
// Multiply the backward probability by this candidate's frequency divided by denominator.
pRight*=((float)(Convert.ToInt32(rightFreq[j])))/denominator;
j--;
}
// NOTE(review): the next line is truncated; several branches and a loop header were lost.
else if(llj;p--)
{
splitResult.Add(rightSplit[p]);
}
// Remember the node positions.
lastPosI=i;
lastPosJ=j;
i++;
j--;
}
else
{
// The forward search has the larger probability.
// NOTE(review): the next line is truncated.
for(p=lastPosI;pj;p--)
{
splitResult.Add(rightSplit[p]);
}
}
else
{
// The forward search has the larger probability.
// NOTE(review): the next line is truncated; it fuses the end of doSplit with the start of
// foreCheckSub (presumably up to "if(drs.Length>0)" in its single-character branch).
for(p=lastPosI+1;p0)
{
leftFreq.Add(drs[0]["wfreq"].ToString());
return subString;
}
else
{
// Not in the word table: record frequency "0" and return the string as it is.
leftFreq.Add("0");
return subString;
}
}
else
{
// Look the candidate up in the cached word table; when it is missing, drop the last
// character and try again recursively.
DataRow[] drs=ds.Tables["word"].Select("word=\'"+subString+"\'");
if(drs.Length>0)
{
leftFreq.Add(drs[0]["wfreq"].ToString());
return subString;
}
else
{
return foreCheckSub(subString.Substring(0,subString.Length-1));
}
}
}
/// <summary>
/// Backward lookup: finds the longest suffix of <paramref name="subString"/> that exists in the
/// cached "word" table, dropping one leading character per attempt.
/// The frequency ("wfreq") of the match is appended to rightFreq; when even the last remaining
/// string (length 0 or 1) is not in the table, "0" is appended and that string is returned.
/// </summary>
/// <param name="subString">Candidate text taken from the end of the remaining input.</param>
/// <returns>The matched suffix, or the last remaining string when nothing matched.</returns>
private string backCheckSub(string subString)
{
    DataTable dictionary;
    DataRow[] hits;
    string candidate=subString;

    // Shrink from the left while more than one character is left.
    while(candidate.Length>1)
    {
        dictionary=ds.Tables["word"];
        hits=dictionary.Select("word=\'"+candidate+"\'");
        if(hits.Length>0)
        {
            rightFreq.Add(hits[0]["wfreq"].ToString());
            return candidate;
        }
        candidate=candidate.Substring(1);
    }

    // Zero or one character left: it is returned whether or not it is a known word.
    dictionary=ds.Tables["word"];
    hits=dictionary.Select("word=\'"+candidate+"\'");
    rightFreq.Add(hits.Length>0 ? hits[0]["wfreq"].ToString() : "0");
    return candidate;
}
#endregion
#region 统计保存结果
// Merges the segmentation result into the in-memory "static" table: when a matching row is
// found its "times" counter is incremented, otherwise a new row with times = 1 is added for
// the token and the current typeName.
// NOTE(review): the for line below is truncated in this copy of the source (text between '<'
// and '>' was lost). It presumably held the loop over splitResult, the construction of the
// filter in cmd and the Select call that fills drs - confirm against the original file.
private void saveResult()
{
DataTable dt=ds.Tables["static"];
DataRow dr;
DataRow[] drs;
string cmd;
int j;
for(int i=0;i0)
{
// Existing row: increment its occurrence counter.
j=Convert.ToInt32(drs[0]["times"]);
j++;
drs[0]["times"]=j;
}
else
{
// No row yet: create one with a count of 1.
dr=dt.NewRow();
dr["word"]=splitResult[i].ToString();
dr["typeName"]=typeName;
dr["times"]=1;
dt.Rows.Add(dr);
}
}
}
/// <summary>
/// Writes the changes accumulated in the in-memory "static" table back to the learnResult
/// table. The insert/update/delete commands are generated by an OleDbCommandBuilder from
/// the select statement.
/// Failures during the update are reported in a message box and are not rethrown.
/// The connection is opened when necessary and is always closed afterwards.
/// </summary>
private void saveDB()
{
    if(sqlConnect.State==ConnectionState.Closed)
    {
        sqlConnect.Open();
    }
    try
    {
        using(OleDbDataAdapter sqlAd=new OleDbDataAdapter("select * from learnResult",sqlConnect))
        using(OleDbCommandBuilder sqlBuilder=new OleDbCommandBuilder(sqlAd))
        {
            // The builder registers itself with the adapter; it only has to stay alive
            // while Update runs.
            sqlAd.Update(ds,"static");
            ds.Tables["static"].AcceptChanges();
        }
    }
    catch(Exception ex)
    {
        // Keep the best-effort behaviour (no rethrow) but show what actually went wrong
        // instead of discarding the exception.
        System.Windows.Forms.MessageBox.Show("出错了!"+"\n"+ex.Message);
    }
    finally
    {
        // Close the connection on every path, including a failure inside the handler.
        sqlConnect.Close();
    }
}
#endregion
}
}