// www.pudn.com > classifier.rar > spliter.cs
using System;
using System.Data.OleDb;
using System.IO;
using System.Windows.Forms;
using System.Collections;
using System.Data;
namespace classifier.split
{
/// <summary>
/// Summary description for spliter.
/// Class that performs the word-segmentation operations.
/// </summary>
public class spliter
{
// Database connection supplied to the two-argument constructor; passed on to doSplit by readToEnd.
private OleDbConnection sqlConnect;
// Path of the text file that readToEnd opens.
private string fPath;
private char[] endChar; // terminator characters (filled from the "ending" column of the endChar table)
private char[] missChar;// characters to ignore (carriage return, line feed, space)
private char[] word;// English letters a-z and A-Z
private char[] number;// decimal digits 0-9
// splitResult collects the tokens produced by readToEnd. The left*/right* lists are
// presumably the segments, positions and frequencies of the forward and backward
// scans -- NOTE(review): confirm against doSplit, which is not fully visible here.
private ArrayList leftSplit,rightSplit,splitResult,leftPos,rightPos,leftFreq,rightFreq;
// Parameterless constructor. It performs no initialization, so every field of the
// instance keeps its default value (null for the connection, path, arrays and lists).
public spliter()
{
//
// TODO: add constructor logic here
//
}
/// <summary>
/// Creates a splitter for one text file and loads the terminator characters from the database.
/// </summary>
/// <param name="filePath">Path of the text file that <c>readToEnd</c> will open.</param>
/// <param name="sqlCon">
/// Connection used here to query the <c>endChar</c> table; it is also stored and later
/// passed to <c>doSplit</c>.
/// </param>
public spliter(string filePath,OleDbConnection sqlCon)
{
    // Note: the space character is not treated as a terminator.
    sqlConnect = sqlCon;
    fPath = filePath;

    // Read every terminator from the database and flatten the "ending" column into a
    // single character array. The adapter and the table are IDisposable, so they are
    // wrapped in using blocks instead of being left for the finalizer.
    System.Text.StringBuilder terminators = new System.Text.StringBuilder();
    using (OleDbDataAdapter adapter = new OleDbDataAdapter("select * from endChar", sqlCon))
    using (DataTable table = new DataTable())
    {
        adapter.Fill(table);
        foreach (DataRow row in table.Rows)
        {
            terminators.Append(row["ending"].ToString());
        }
    }
    endChar = terminators.ToString().ToCharArray();

    // Fixed character classes.
    missChar = new char[] { '\r', '\n', ' ' };
    number = "0123456789".ToCharArray();
    word = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ".ToCharArray();

    // Working lists used by the segmentation routines.
    leftSplit = new ArrayList();
    leftPos = new ArrayList();
    leftFreq = new ArrayList();
    rightSplit = new ArrayList();
    rightPos = new ArrayList();
    rightFreq = new ArrayList();
    splitResult = new ArrayList();
}
/// <summary>
/// Reads the file at <c>fPath</c> one character at a time and tokenizes it with a small
/// state machine. Runs of digits (optionally containing one decimal point) and runs of
/// English letters are appended to <c>splitResult</c> as whole tokens; runs of Chinese
/// characters are handed to <c>doSplit</c>. Any other character ends the current token
/// and is itself discarded, except that a space inside a Chinese run is skipped without
/// ending the run.
/// </summary>
/// <remarks>
/// Differences from the previous implementation:
/// the reader is disposed when the method finishes; the token still pending at end of
/// file is emitted instead of being dropped; and a Chinese run that is followed by a
/// digit now goes through <c>doSplit</c> like every other Chinese run (it used to be
/// added to <c>splitResult</c> unsegmented).
/// </remarks>
public void readToEnd()
{
    // State numbers are kept from the original automaton.
    const int stateIdle = 0;      // no token in progress
    const int stateInteger = 1;   // inside the integer part of a number
    const int stateFraction = 2;  // inside a number, after the decimal point
    const int stateWord = 5;      // inside a run of English letters
    const int stateChinese = 7;   // inside a run of Chinese characters

    int status = stateIdle;
    System.Text.StringBuilder token = new System.Text.StringBuilder();

    using (StreamReader sr = new StreamReader(fPath, System.Text.Encoding.Default))
    {
        int next;
        // Read() returns -1 at end of stream, so no separate Peek() is needed.
        while ((next = sr.Read()) != -1)
        {
            char currentChar = (char)next;

            // Step 1: does this character extend the token currently being built?
            bool extendsToken = false;
            switch (status)
            {
                case stateIdle:
                    break;
                case stateInteger:
                    if (isNumber(currentChar))
                    {
                        extendsToken = true;
                    }
                    else if (currentChar == '.')
                    {
                        // First decimal point: keep it and move to the fraction state.
                        extendsToken = true;
                        status = stateFraction;
                    }
                    break;
                case stateFraction:
                    extendsToken = isNumber(currentChar);
                    break;
                case stateWord:
                    extendsToken = isWord(currentChar);
                    break;
                case stateChinese:
                    if (isChinese(currentChar))
                    {
                        extendsToken = true;
                    }
                    else if (isSpace(currentChar))
                    {
                        // Spaces inside a Chinese run are ignored; the run continues.
                        continue;
                    }
                    break;
                default:
                    MessageBox.Show("出现意外状态!");
                    continue;
            }

            if (extendsToken)
            {
                token.Append(currentChar);
                continue;
            }

            // Step 2: the token (if any) is complete -- emit it.
            if (status == stateChinese)
            {
                doSplit(token.ToString(), sqlConnect);
            }
            else if (status != stateIdle)
            {
                splitResult.Add(token.ToString());
            }
            token.Length = 0;

            // Step 3: the current character either starts a new token or is dropped.
            if (isNumber(currentChar))
            {
                token.Append(currentChar);
                status = stateInteger;
            }
            else if (isWord(currentChar))
            {
                token.Append(currentChar);
                status = stateWord;
            }
            else if (isChinese(currentChar))
            {
                token.Append(currentChar);
                status = stateChinese;
            }
            else
            {
                // Terminators, spaces and anything unrecognized just reset the automaton.
                status = stateIdle;
            }
        }

        // End of file: emit whatever token was still being built.
        if (status == stateChinese)
        {
            doSplit(token.ToString(), sqlConnect);
        }
        else if (status != stateIdle)
        {
            splitResult.Add(token.ToString());
        }
    }
}
#region 判断类型函数
/// <summary>
/// Tells whether the character is the ASCII space character.
/// </summary>
/// <param name="currentChar">Character to test.</param>
/// <returns><c>true</c> only for <c>' '</c>; <c>false</c> for every other character.</returns>
private bool isSpace(char currentChar)
{
    return currentChar == ' ';
}
/// <summary>
/// Tells whether the character's code point lies in the range 0x4E00..0x9FA5 (inclusive),
/// which this class treats as "Chinese".
/// </summary>
/// <param name="currentChar">Character to test.</param>
/// <returns><c>true</c> when the character is inside the range; otherwise <c>false</c>.</returns>
private bool isChinese(char currentChar)
{
    const int firstCode = 0x4e00;
    const int lastCode = 0x9fa5;
    int code = currentChar;
    return !(code < firstCode || code > lastCode);
}
/// <summary>
/// Tells whether the character is one of the ASCII digits '0' through '9'.
/// </summary>
/// <param name="currentChar">Character to test.</param>
/// <returns><c>true</c> for an ASCII digit; otherwise <c>false</c>.</returns>
private bool isNumber(char currentChar)
{
    bool belowZero = currentChar < '0';
    bool aboveNine = currentChar > '9';
    return !(belowZero || aboveNine);
}
/// <summary>
/// Tells whether the character is an ASCII letter, lower case a-z or upper case A-Z.
/// </summary>
/// <param name="currentChar">Character to test.</param>
/// <returns><c>true</c> for an ASCII letter; otherwise <c>false</c>.</returns>
private bool isWord(char currentChar)
{
    bool isLower = 'a' <= currentChar && currentChar <= 'z';
    bool isUpper = 'A' <= currentChar && currentChar <= 'Z';
    return isLower || isUpper;
}
//检查字符是否是终结符
private bool isEnd(char currentChar)
{
for(int i=0;i0)
{
//currentPos即剩余串的长度
if(currentPos<=4)
{
subStr=sourceLine.Substring(0,currentPos);
subStr=backCheckSub(subStr,ref sqlCom);
currentPos-=subStr.Length;
rightPos.Add(currentPos);
}
else
{
subStr=sourceLine.Substring(currentPos-4,4);
subStr=backCheckSub(subStr,ref sqlCom);
currentPos-=subStr.Length;
rightPos.Add(currentPos);
}
rightSplit.Add(subStr);
}
//计算混合概率密度
int i,j,p;
//用于存储前向、后向扫描最大概率
float pRight=1,pLeft=1;
int numerator=0,denominator=0;
int lastPosI=0,lastPosJ=rightPos.Count-1;
i=1;
j=rightPos.Count-2;
OleDbDataReader sqlRead;
cmd= "select sum(wfreq) as aa from words";
sqlCom.CommandText=cmd;
sqlRead=sqlCom.ExecuteReader();
if(sqlRead.Read())
{
//读出分母,即所有词的出现次数
denominator=Convert.ToInt32(sqlRead["aa"]);
sqlRead.Close();
while(i=0)
{
int ll=Convert.ToInt32(leftPos[i]);
int rr=Convert.ToInt32(rightPos[j]);
if(ll>rr)
{
pRight*=((float)(Convert.ToInt32(rightFreq[j])))/denominator;
j--;
}
else if(llj;p--)
{
splitResult.Add(rightSplit[p]);
}
//记录结点位置
lastPosI=i;
lastPosJ=j;
i++;
j--;
}
else
{
//正向搜索的概率大
for(p=lastPosI+1;p