// Source: www.pudn.com > classifier.rar > stringSpliter.cs


using System; 
using System.Data.OleDb; 
using System.IO; 
using System.Windows.Forms; 
using System.Collections; 
using System.Data; 
 
namespace classifier.split 
{ 
	/// <summary>
	/// Splits source text into tokens: numbers, English words and Chinese segments.
	/// </summary>
	public class stringSpliter 
	{ 
		// Database connection; obtained from db.dbOpertation.getConnection() in the
		// string constructor and passed to doSplit by readToEnd.
		private OleDbConnection sqlConnect; 
		// Characters of the text to split (set from the constructor argument).
		private char[] srcChar; 
		private char[] endChar;	// terminator characters, loaded from the endChar table 
		private char[] missChar;// ignored characters: '\r', '\n' and ' ' 
		private char[] word;// English letters a-z and A-Z 
		private char[] number;// digits 0-9 
		// Working lists created empty by the string constructor. splitResult is
		// exposed through splitRes; the left*/right* lists presumably hold the
		// forward and backward scan data used by doSplit -- TODO confirm.
		private ArrayList leftSplit,rightSplit,splitResult,leftPos,rightPos,leftFreq,rightFreq; 
		 
		// Result lists exposed through leftRes and rightRes.
		private ArrayList leftResult,rightResult; 
		 
		/// <summary>
		/// Gets the leftResult list. readToEnd appends number and English-word
		/// tokens to it; presumably doSplit appends the forward-scan segments
		/// of Chinese text (not verifiable from this part of the file).
		/// </summary>
		public ArrayList leftRes
		{
			get { return this.leftResult; }
		}
 
		/// <summary>
		/// Gets the rightResult list. readToEnd appends number and English-word
		/// tokens to it; presumably doSplit appends the backward-scan segments
		/// of Chinese text (not verifiable from this part of the file).
		/// </summary>
		public ArrayList rightRes
		{
			get { return this.rightResult; }
		}
 
		/// <summary>
		/// Gets the splitResult list, to which readToEnd appends every number
		/// and English-word token it recognizes.
		/// </summary>
		public ArrayList splitRes
		{
			get { return this.splitResult; }
		}
		 
 
		/// <summary>
		/// Creates an instance without initializing any field. The source
		/// characters and result lists stay null, so readToEnd (which reads
		/// srcChar.Length) cannot be used on an instance built this way.
		/// </summary>
		public stringSpliter() 
		{ 
			// 
			// TODO: add constructor logic here 
			// 
		} 
 
		/// <summary>
		/// Creates a splitter for <paramref name="srcString"/>: opens the database
		/// connection if it is closed, loads the terminator characters from the
		/// endChar table and creates the empty working and result lists.
		/// </summary>
		/// <param name="srcString">Text to split; must not be null.</param>
		/// <exception cref="ArgumentNullException">
		/// <paramref name="srcString"/> is null.
		/// </exception>
		public stringSpliter(string srcString)
		{
			// Reject bad input before touching the database.
			if(srcString==null)
			{
				throw new ArgumentNullException("srcString");
			}

			sqlConnect=db.dbOpertation.getConnection();
			if(sqlConnect.State==ConnectionState.Closed)
			{
				sqlConnect.Open();
			}

			srcChar=srcString.ToCharArray();

			// Read the terminators from the database and collect them into the
			// terminator array. The adapter and the table are only needed while
			// reading, so both are disposed as soon as the rows are consumed.
			System.Text.StringBuilder terminators=new System.Text.StringBuilder();
			using(OleDbDataAdapter adapter=new OleDbDataAdapter("select * from endChar",sqlConnect))
			using(DataTable table=new DataTable())
			{
				adapter.Fill(table);
				foreach(DataRow row in table.Rows)
				{
					terminators.Append(row["ending"].ToString());
				}
			}
			endChar=terminators.ToString().ToCharArray();

			// Ignored characters. The space is listed here, not as a terminator.
			missChar=new char[]{'\r','\n',' '};

			number=new char[]{'0','1','2','3','4','5','6','7','8','9'};

			word=new char[]{'a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z','A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z'};

			leftSplit=new ArrayList();
			leftPos=new ArrayList();
			leftFreq=new ArrayList();
			rightSplit=new ArrayList();
			rightPos=new ArrayList();
			rightFreq=new ArrayList();
			splitResult=new ArrayList();
			leftResult=new ArrayList();
			rightResult=new ArrayList();
		}
 
		/// <summary>
		/// Scans the source characters once and cuts them into tokens: runs of
		/// digits (optionally containing one decimal point), runs of English
		/// letters, and runs of Chinese characters. Any other character only
		/// separates tokens.
		/// </summary>
		/// <remarks>
		/// Number and English-word tokens are appended unchanged to splitResult,
		/// leftResult and rightResult. Chinese runs are handed to doSplit. A space
		/// inside a Chinese run is skipped and does not end the run.
		/// </remarks>
		public void readToEnd()
		{
			// States of the recognizer.
			const int stStart=0;	// not inside a token
			const int stInteger=1;	// inside the integer part of a number
			const int stDecimal=2;	// inside a number, after its decimal point
			const int stWord=5;	// inside an English word
			const int stChinese=7;	// inside a run of Chinese characters

			int status=stStart;
			string result="";

			for(int pos=0;pos<srcChar.Length;pos++)
			{
				char currentChar=srcChar[pos];

				// State that a token beginning with this character starts in;
				// stStart when the character cannot begin a token.
				int charState;
				if(isNumber(currentChar))
				{
					charState=stInteger;
				}
				else if(isWord(currentChar))
				{
					charState=stWord;
				}
				else if(isChinese(currentChar))
				{
					charState=stChinese;
				}
				else
				{
					charState=stStart;
				}

				if(status==stStart)
				{
					// Either begin a new token or keep skipping separators.
					result=(charState==stStart) ? "" : currentChar.ToString();
					status=charState;
					continue;
				}

				if(status==stInteger && currentChar=='.')
				{
					// First decimal point of a number.
					result+=currentChar;
					status=stDecimal;
					continue;
				}

				// Digits extend both the integer and the fractional part of a number;
				// otherwise a character extends the token only if it is of the same kind.
				bool extendsToken=(charState==status) || (status==stDecimal && charState==stInteger);
				if(extendsToken)
				{
					result+=currentChar;
					continue;
				}

				if(status==stChinese && isSpace(currentChar))
				{
					// Spaces inside Chinese text are ignored.
					continue;
				}

				// The current token ends here. A number is recorded in all three
				// lists regardless of what ends it, so a number followed by a
				// separator is treated the same as one followed by a letter,
				// a Chinese character or the end of the input.
				if(status==stChinese)
				{
					doSplit(result,sqlConnect);
				}
				else
				{
					splitResult.Add(result);
					leftResult.Add(result);
					rightResult.Add(result);
				}

				// The ending character may itself begin the next token.
				result=(charState==stStart) ? "" : currentChar.ToString();
				status=charState;
			}

			// Flush the token that was still open when the input ended.
			if(status==stChinese)
			{
				doSplit(result,sqlConnect);
			}
			else if(status!=stStart)
			{
				splitResult.Add(result);
				leftResult.Add(result);
				rightResult.Add(result);
			}
		}
 
		#region 判断类型函数 
		/// <summary>
		/// Tells whether the character is the ASCII space (' ').
		/// </summary>
		private bool isSpace(char currentChar)
		{
			return currentChar==' ';
		}
 
		/// <summary>
		/// Tells whether the character lies in the range U+4E00..U+9FA5 (inclusive).
		/// </summary>
		private bool isChinese(char currentChar)
		{
			const int firstCode=0x4e00;
			const int lastCode=0x9fa5;
			return firstCode<=currentChar && currentChar<=lastCode;
		}
 
		/// <summary>
		/// Tells whether the character is one of the ASCII digits '0'..'9'.
		/// </summary>
		private bool isNumber(char currentChar)
		{
			return '0'<=currentChar && currentChar<='9';
		}
				 
		/// <summary>
		/// Tells whether the character is an ASCII letter, 'a'..'z' or 'A'..'Z'.
		/// </summary>
		private bool isWord(char currentChar)
		{
			bool isLower=(currentChar>='a' && currentChar<='z');
			bool isUpper=(currentChar>='A' && currentChar<='Z');
			return isLower || isUpper;
		}
		 
 
		//检查字符是否是终结符 
		private bool isEnd(char currentChar) 
		{ 
			for(int i=0;i0) 
				{					 
					//currentPos即剩余串的长度 
					if(currentPos<=4) 
					{ 
						subStr=sourceLine.Substring(0,currentPos); 
						subStr=backCheckSub(subStr,ref sqlCom); 
						 
						currentPos-=subStr.Length; 
						rightPos.Add(currentPos); 
					} 
					else 
					{ 
						subStr=sourceLine.Substring(currentPos-4,4); 
						subStr=backCheckSub(subStr,ref sqlCom); 
						 
						currentPos-=subStr.Length; 
						rightPos.Add(currentPos); 
					} 
 
					rightSplit.Add(subStr);					 
				} 
 
				//逆向搜索结果插入逆向结果 
				for(i=rightSplit.Count-1;i>=0;i--) 
				{ 
					rightResult.Add(rightSplit[i]); 
 
				} 
				 
 
				//计算混合概率密度 
				 
 
				//用于存储前向、后向扫描最大概率 
				float pRight=1,pLeft=1; 
				int denominator=0; 
				int lastPosI=0,lastPosJ=rightPos.Count-1; 
 
				i=1; 
				j=rightPos.Count-2; 
				 
				OleDbDataReader sqlRead; 
				cmd= "select sum(wfreq) as aa from words"; 
				sqlCom.CommandText=cmd; 
 
				sqlRead=sqlCom.ExecuteReader(); 
				if(sqlRead.Read()) 
				{ 
					//读出分母,即所有词的出现次数 
					denominator=Convert.ToInt32(sqlRead["aa"]); 
					sqlRead.Close(); 
					 
					while(i=0) 
					{ 
						int ll=Convert.ToInt32(leftPos[i]); 
						int rr=Convert.ToInt32(rightPos[j]); 
						if(ll>rr) 
						{ 
							pRight*=((float)(Convert.ToInt32(rightFreq[j])))/denominator; 
							j--; 
						} 
						else if(llj;p--) 
									{ 
										splitResult.Add(rightSplit[p]); 
									} 
 
									//记录结点位置 
									lastPosI=i; 
									lastPosJ=j; 
									i++; 
									j--; 
									 
								} 
								else 
								{ 
									//正向搜索的概率大 
									for(p=lastPosI;pj;p--) 
							{ 
								splitResult.Add(rightSplit[p]); 
							}		 
						} 
						else 
						{ 
							//正向搜索的概率大 
							for(p=lastPosI+1;p