// Source: www.pudn.com > classifier.rar > fileSpliter.cs


using System; 
using System.Data.OleDb; 
using System.IO; 
using System.Windows.Forms; 
using System.Collections; 
using System.Data; 
using System.Drawing; 
using System.ComponentModel; 
 
 
namespace classifier.split 
{ 
	/// <summary>
	/// Summary description of the splitter:
	/// the class that performs the word-segmentation operations.
	/// </summary>
	public class fileSpliter 
	{ 
		private OleDbConnection sqlConnect;		 
		private char[] endChar;	//终结符 
		private char[] missChar;//忽略符 
		private char[] word;//英文字母 
		private char[] number;//数字 
		private ArrayList leftSplit,rightSplit,splitResult,leftPos,rightPos,leftFreq,rightFreq; 
		private DataSet ds; 
		private int denominator; 
		private string [] filePath; 
		private string typeName; 
		private System.Windows.Forms.TextBox msgBox; 
		 
		/// <summary>
		/// Parameterless constructor; it performs no initialisation.
		/// </summary>
		public fileSpliter() 
		{ 
			// 
			// TODO: add constructor logic here 
			// 
			// NOTE(review): an instance created this way has no file list, connection,
			// data set or message box assigned, all of which readToEnd() uses.
		} 
 
 
		/// <summary>
		/// Creates a splitter for the given files and preloads the tables it works on.
		/// </summary>
		/// <param name="fPath">Paths of the files that readToEnd() will process.</param>
		/// <param name="sqlCon">
		/// Connection used to read the endChar, words and learnResult tables. It is opened
		/// when it is closed on entry and is always closed again before the constructor
		/// returns, also when loading fails.
		/// </param>
		/// <param name="deno">Value stored in the denominator field.</param>
		/// <param name="type">Type name written to the typeName column of newly counted words.</param>
		/// <param name="b">Text box that receives the progress messages.</param>
		/// <exception cref="ArgumentNullException"><paramref name="sqlCon"/> or <paramref name="b"/> is null.</exception>
		public fileSpliter(string[] fPath,OleDbConnection sqlCon,int deno,string type,TextBox b)
		{
			// Both objects are dereferenced below; fail with a descriptive exception
			// instead of a NullReferenceException.
			if(sqlCon==null)
			{
				throw new ArgumentNullException("sqlCon");
			}
			if(b==null)
			{
				throw new ArgumentNullException("b");
			}

			msgBox=b;
			// NOTE(review): looks like a leftover debug marker. It is kept because it is
			// visible to the user (readToEnd appends to this text) - confirm it can go.
			msgBox.Text="aaa";

			typeName=type;
			sqlConnect=sqlCon;
			filePath=fPath;	// file list handed over by the caller
			denominator=deno;

			// Fixed character classes.
			missChar=new char[]{'\r','\n',' '};
			number="0123456789".ToCharArray();
			word="abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ".ToCharArray();

			// Working lists.
			leftSplit=new ArrayList();
			leftPos=new ArrayList();
			leftFreq=new ArrayList();
			rightSplit=new ArrayList();
			rightPos=new ArrayList();
			rightFreq=new ArrayList();
			splitResult=new ArrayList();

			ds=new DataSet();

			try
			{
				if(sqlConnect.State==ConnectionState.Closed)
				{
					sqlConnect.Open();
				}

				// Terminator characters: the "ending" column of every endChar row is
				// concatenated and the resulting string becomes the endChar array.
				// (Original note: a space does not count as a terminator.)
				DataTable endTable=new DataTable();
				using(OleDbDataAdapter endAdapter=new OleDbDataAdapter("select * from endChar",sqlConnect))
				{
					endAdapter.Fill(endTable);
				}
				System.Text.StringBuilder endings=new System.Text.StringBuilder();
				foreach(DataRow row in endTable.Rows)
				{
					endings.Append(row["ending"].ToString());
				}
				endChar=endings.ToString().ToCharArray();

				// Data set tables: "word" holds the lexicon (the original comment calls it
				// the Peking University word list), "static" holds the word-frequency
				// statistics loaded from learnResult.
				using(OleDbDataAdapter wordAdapter=new OleDbDataAdapter("select * from words",sqlConnect))
				{
					wordAdapter.Fill(ds,"word");
				}
				using(OleDbDataAdapter resultAdapter=new OleDbDataAdapter("select * from learnResult",sqlConnect))
				{
					resultAdapter.Fill(ds,"static");
				}
			}
			finally
			{
				// The original closed the connection unconditionally at the end; doing it
				// in finally also covers the case where one of the loads throws.
				sqlConnect.Close();
			}
		}
 
 
		// States of the character scanner used by readToEnd. The numeric values are the
		// ones the original code used.
		private const int ScanIdle=0;		// not inside a token
		private const int ScanInteger=1;	// digits, no '.' seen yet
		private const int ScanFraction=2;	// digits after a '.'
		private const int ScanLatin=5;		// ASCII letters
		private const int ScanChinese=7;	// Chinese characters

		/// <summary>
		/// Tokenises every file in filePath and stores the resulting word counts.
		/// </summary>
		/// <remarks>
		/// Each file is read one character at a time by a small state machine. A run of
		/// digits (optionally followed by one '.' and further digits) and a run of ASCII
		/// letters are each added to splitResult as one token; a run of Chinese characters
		/// is handed to doSplit. Any other character ends the current run, except that a
		/// space inside a Chinese run is skipped. A number directly followed by '.' keeps
		/// the dot even when no digit follows (for example "3."), as in the original code.
		/// After each file saveResult() is called and a progress line is appended to
		/// msgBox; saveDB() is called once after the last file.
		///
		/// NOTE(review): the original loop header was lost during extraction
		/// ("for(int ss=0;ss= 0)"). The per-file loop, the opening of the reader and the
		/// read loop below are a reconstruction - please confirm against the original,
		/// in particular the file encoding.
		/// </remarks>
		public void readToEnd()
		{
			// Process every file in the list.
			for(int ss=0;ss<filePath.Length;ss++)
			{
				// Scanner state is per file, so an unfinished token of one file can never
				// leak into the next one.
				int status=ScanIdle;
				string result="";

				// NOTE(review): Encoding.Default is an assumption (the encoding argument,
				// if there was one, is part of the lost text) - TODO confirm.
				// The using block closes the file, which the visible original never did.
				using(StreamReader sr=new StreamReader(filePath[ss],System.Text.Encoding.Default))
				{
					int next;
					while((next=sr.Read())>=0)
					{
						char currentChar=(char)next;

						if(status==ScanInteger && currentChar=='.')
						{
							// First decimal point of a number.
							result+=currentChar;
							status=ScanFraction;
						}
						else if(extendsToken(status,currentChar))
						{
							// Same kind of character: the token grows, state unchanged.
							result+=currentChar;
						}
						else if(status==ScanChinese && isSpace(currentChar))
						{
							// Spaces inside a Chinese run are ignored.
						}
						else
						{
							// The current token (if any) is complete; the character
							// either starts a new token or is a separator.
							flushToken(status,result);
							status=startStateFor(currentChar);
							result=(status==ScanIdle)?"":currentChar.ToString();
						}
					}
				}

				// End of file: emit the token that was still being read.
				flushToken(status,result);

				// Merge this file's tokens into the result table.
				saveResult();
				// NOTE(review): no visible code empties splitResult, so without this every
				// later file would count the tokens of all earlier files again. If the
				// lost text already cleared the list, this call is a harmless no-op.
				splitResult.Clear();
				msgBox.Text+="文件:"+filePath[ss]+" 处理完毕\n";
			}

			// Write the accumulated counts back to the database.
			saveDB();
		}

		// True when ch continues the token that the scanner is currently reading in state.
		private bool extendsToken(int state,char ch)
		{
			switch(state)
			{
				case ScanInteger:
				case ScanFraction:
					return isNumber(ch);
				case ScanLatin:
					return isWord(ch);
				case ScanChinese:
					return isChinese(ch);
				default:
					return false;
			}
		}

		// State the scanner enters when ch is the first character after a token boundary.
		private int startStateFor(char ch)
		{
			if(isNumber(ch))
			{
				return ScanInteger;
			}
			if(isWord(ch))
			{
				return ScanLatin;
			}
			if(isChinese(ch))
			{
				return ScanChinese;
			}
			// Terminators, spaces and everything else only separate tokens.
			return ScanIdle;
		}

		// Emits a finished token: Chinese text goes to doSplit, numbers and English words
		// go straight into splitResult; nothing happens when no token was being read.
		private void flushToken(int state,string token)
		{
			if(state==ScanChinese)
			{
				doSplit(token,sqlConnect);
			}
			else if(state==ScanInteger || state==ScanFraction || state==ScanLatin)
			{
				splitResult.Add(token);
			}
		}
 
 
		#region 判断类型函数 
		/// <summary>
		/// Tells whether the character is the ASCII space (U+0020); tabs and line
		/// breaks do not count.
		/// </summary>
		private bool isSpace(char currentChar)
		{
			const char blank=' ';
			return currentChar==blank;
		}
 
		/// <summary>
		/// Tells whether the character lies in the range U+4E00..U+9FA5 (both inclusive).
		/// </summary>
		private bool isChinese(char currentChar)
		{
			if(currentChar<'\u4e00')
			{
				return false;
			}
			return currentChar<='\u9fa5';
		}
 
		/// <summary>
		/// Tells whether the character is one of the ASCII digits '0'..'9'.
		/// </summary>
		private bool isNumber(char currentChar)
		{
			if(currentChar<'0')
			{
				return false;
			}
			return currentChar<='9';
		}
				 
		/// <summary>
		/// Tells whether the character is an ASCII letter ('a'..'z' or 'A'..'Z').
		/// </summary>
		private bool isWord(char currentChar)
		{
			bool lowerCase='a'<=currentChar && currentChar<='z';
			bool upperCase='A'<=currentChar && currentChar<='Z';
			return lowerCase || upperCase;
		}
		 
 
		// Checks whether the character is a terminator.
		// NOTE(review): this method and the code that follows it are damaged. On several
		// lines the text between a '<' and a later '>' appears to have been stripped
		// during extraction, which also removed whole lines in between. The code below
		// is therefore a sequence of fragments and does not compile as it stands; it
		// must be restored from the original source, not guessed.
		private bool isEnd(char currentChar) 
		{ 
			// NOTE(review): damaged line. It starts as the loop header of isEnd and ends
			// as the condition of a "while(currentPos>0)"-style loop that belongs to a
			// later method (presumably doSplit, which readToEnd calls - confirm). The
			// rest of isEnd and the beginning of that method are missing.
			for(int i=0;i0) 
				{					 
					// currentPos is the length of the remaining string.
					// Backward scan: take at most the last four characters of what is
					// left, let backCheckSub pick the piece to use, move currentPos in
					// front of that piece and remember the new position.
					if(currentPos<=4) 
					{ 
						subStr=sourceLine.Substring(0,currentPos); 
						subStr=backCheckSub(subStr); 
						 
						currentPos-=subStr.Length; 
						rightPos.Add(currentPos); 
					} 
					else 
					{ 
						subStr=sourceLine.Substring(currentPos-4,4); 
						subStr=backCheckSub(subStr); 
						 
						currentPos-=subStr.Length; 
						rightPos.Add(currentPos); 
					} 
 
					rightSplit.Add(subStr); 
				} 
				 
 
				// Compute the combined probability density.
				int i,j,p; 
 
				// Hold the maximum probabilities of the forward and the backward scan.
				float pRight=1,pLeft=1; 
				// NOTE(review): this local hides the denominator field set by the
				// constructor and is 0 here; it is used as divisor below. Whether the
				// lost text assigned it cannot be told from what is left - confirm.
				int denominator=0; 
				int lastPosI=0,lastPosJ=rightPos.Count-1; 
 
				i=1; 
				j=rightPos.Count-2; 
					 
					// NOTE(review): damaged loop condition (part between '<' and '>' lost).
					while(i=0) 
					{ 
						int ll=Convert.ToInt32(leftPos[i]); 
						int rr=Convert.ToInt32(rightPos[j]); 
						if(ll>rr) 
						{ 
							// Multiply in the frequency of the backward piece and step to
							// the next backward position.
							pRight*=((float)(Convert.ToInt32(rightFreq[j])))/denominator; 
							j--; 
						} 
						// NOTE(review): damaged line; the "ll<rr" branch and the start of
						// the branch that copies backward pieces are missing.
						else if(llj;p--) 
									{ 
										splitResult.Add(rightSplit[p]); 
									} 
 
									// Record the node positions.
									lastPosI=i; 
									lastPosJ=j; 
									i++; 
									j--; 
									 
								} 
								else 
								{ 
									// The forward search has the higher probability.
									// NOTE(review): damaged line; the forward copy loop and
									// the code up to the next backward copy loop are missing.
									for(p=lastPosI;pj;p--) 
							{ 
								splitResult.Add(rightSplit[p]); 
							}		 
						} 
						else 
						{ 
							// The forward search has the higher probability.
							// NOTE(review): damaged line; the end of this method and the
							// beginning of foreCheckSub are missing. What follows is the
							// tail of foreCheckSub: it looks subString up in the "word"
							// table, appends the frequency found (or "0" for a one-character
							// string that is not listed) to leftFreq, and otherwise retries
							// with the last character removed.
							for(p=lastPosI+1;p0) 
				{ 
					leftFreq.Add(drs[0]["wfreq"].ToString());					 
					return subString; 
				} 
				else 
				{					 
					leftFreq.Add("0"); 
					return subString; 
				} 
			} 
			else 
			{ 
				DataRow[] drs=ds.Tables["word"].Select("word=\'"+subString+"\'"); 
				 
				if(drs.Length>0) 
				{ 
					leftFreq.Add(drs[0]["wfreq"].ToString());					 
					return subString; 
				} 
				else 
				{										 
					// Not in the word table: drop the last character and try again.
					return foreCheckSub(subString.Substring(0,subString.Length-1)); 
				} 
 
			} 
		} 
 
		/// <summary>
		/// Backward lookup: finds the longest suffix of <paramref name="subString"/> that
		/// is listed in the "word" table.
		/// </summary>
		/// <param name="subString">Candidate text; it is shortened from the front, one
		/// character at a time, until a listed word or a single character is left.</param>
		/// <returns>
		/// The listed suffix, or the remaining string of at most one character when
		/// nothing is listed.
		/// </returns>
		/// <remarks>
		/// Exactly one entry is appended to rightFreq per call: the wfreq value of the
		/// first matching row, or "0" when no row matches.
		/// </remarks>
		private string backCheckSub(string subString)
		{
			string candidate=subString;
			while(true)
			{
				// Quotes are doubled so that a candidate containing ' cannot break the
				// filter expression.
				string filter="word='"+candidate.Replace("'","''")+"'";
				DataRow[] matches=ds.Tables["word"].Select(filter);

				if(matches.Length>0)
				{
					rightFreq.Add(matches[0]["wfreq"].ToString());
					return candidate;
				}

				if(candidate.Length<=1)
				{
					// Nothing shorter to try: an unknown character gets frequency 0.
					rightFreq.Add("0");
					return candidate;
				}

				// Drop the first character and try the shorter suffix.
				candidate=candidate.Substring(1);
			}
		}
 
		#endregion 
 
 
		#region 统计保存结果 
 
		/// <summary>
		/// Merges the tokens in splitResult into the "static" table of the data set:
		/// the "times" value of an existing row is increased by one, otherwise a new row
		/// with times = 1 is added for the token and the current typeName.
		/// </summary>
		/// <remarks>
		/// NOTE(review): the original loop header and row lookup were lost during
		/// extraction ("for(int i=0;i0)"). The loop over splitResult follows from the
		/// visible body; the lookup key (word plus typeName) is an assumption derived
		/// from the columns written for new rows - please confirm against the original.
		/// </remarks>
		private void saveResult()
		{
			DataTable dt=ds.Tables["static"];

			// Quotes are doubled so that the values are valid filter literals.
			string typeLiteral=(typeName==null?"":typeName).Replace("'","''");

			for(int i=0;i<splitResult.Count;i++)
			{
				string token=splitResult[i].ToString();
				string cmd="word='"+token.Replace("'","''")+"' and typeName='"+typeLiteral+"'";
				DataRow[] drs=dt.Select(cmd);

				if(drs.Length>0)
				{
					// Known token: count one more occurrence.
					int j=Convert.ToInt32(drs[0]["times"]);
					j++;
					drs[0]["times"]=j;
				}
				else
				{
					// First occurrence of this token for this type.
					DataRow dr=dt.NewRow();
					dr["word"]=token;
					dr["typeName"]=typeName;
					dr["times"]=1;
					dt.Rows.Add(dr);
				}
			}
		}
 
 
		/// <summary>
		/// Writes the changes of the "static" table back to the learnResult table.
		/// </summary>
		/// <remarks>
		/// The connection is opened when it is closed on entry and is always closed
		/// afterwards. A failing update is not rethrown; it is reported in a message box
		/// together with the text of the exception.
		/// </remarks>
		private void saveDB()
		{
			if(sqlConnect.State==ConnectionState.Closed)
			{
				sqlConnect.Open();
			}

			try
			{
				// The command builder derives the insert/update/delete commands from the
				// adapter's select command; it only has to exist while Update runs.
				using(OleDbDataAdapter sqlAd=new OleDbDataAdapter("select * from learnResult",sqlConnect))
				using(OleDbCommandBuilder sqlBuilder=new OleDbCommandBuilder(sqlAd))
				{
					sqlAd.Update(ds,"static");
					ds.Tables["static"].AcceptChanges();
				}
			}
			catch(Exception ex)
			{
				// Still best effort as in the original, but the cause is shown instead of
				// being discarded.
				System.Windows.Forms.MessageBox.Show("出错了!"+Environment.NewLine+ex.Message);
			}
			finally
			{
				sqlConnect.Close();
			}
		}
		 
 
		#endregion 
 
	} 
}