// Source: www.pudn.com > Crawler_bemjh.rar > FilterURL.cs


using System; 
using Crawler; 
using ShootSeg; 
using System.Collections; 
 
namespace CrawlerLib 
{ 
	/// <summary>
	/// Decides whether a crawled link should be dropped. The decision uses the
	/// URL list returned by <c>LogicalLayer.GetFilterURL()</c> and a
	/// topic-relevance score computed from the link's anchor text against the
	/// keyword weights returned by <c>LogicalLayer.GetTopicLib()</c>.
	/// </summary>
	public class FilterURL
	{
		// Score weights. Only HREFTEXT_SCORE_PORPROTION is referenced by the
		// current scoring code; the other three are not used anywhere in this class.
		const float TITLE_SCORE_PORPROTION = 0.4F;
		const float HREFTEXT_SCORE_PORPROTION = 0.4F;
		const float KEYWORDS_SCORE_PORPROTION = 0.2F;
		const float CONTENT_SCORE_PORPROTION = 0.5F;

		// Score below which a link is reported as filtered.
		const double MIN_RELEVANT_SCORE = 0.1;

		// Keyword -> weight table (topic library).
		private Hashtable topicLibs;
		// URL list obtained from LogicalLayer.GetFilterURL().
		private ArrayList filteredURL;
		// Word segmenter, shared by all instances.
		private static Segment seg;
		private LogicalLayer llay;

		/// <summary>
		/// Creates the filter: prepares the shared word segmenter (first
		/// instance only) and loads the keyword weights and the URL list.
		/// </summary>
		public FilterURL()
		{
			// The segmenter is static, so build it once instead of once per
			// instance, and publish it only after its dictionaries are loaded so
			// that GetScore never sees a half-initialised Segment.
			if (seg == null)
			{
				Segment segment = new Segment();
				segment.InitWordDics();
				seg = segment;
			}

			llay = new LogicalLayer();
			// Weights associated with each topic keyword.
			topicLibs = llay.GetTopicLib();
			// URLs used by the list check in isFilterURL.
			filteredURL = llay.GetFilterURL();
		}

		/// <summary>
		/// Scores the link text of <paramref name="pe"/> and reports whether the
		/// link should be filtered out.
		/// </summary>
		/// <param name="pe">
		/// Page element to examine. Its <c>Score</c> is overwritten when the URL
		/// passes the list and scheme checks.
		/// </param>
		/// <returns>
		/// <c>true</c> when the computed score is below 0.1; <c>false</c> otherwise,
		/// including when the URL is found in the URL list or is not HTTP/HTTPS.
		/// </returns>
		public bool isFilterURL(ref PageElement pe)
		{
			// NOTE(review): a URL that IS present in filteredURL is reported as
			// "not filtered" (false). This looks inverted relative to the list's
			// name, but callers are not visible here - confirm before changing.
			if (filteredURL.IndexOf(pe.URL) != -1)
			{
				return false;
			}

			MyUri newUri = new MyUri(pe.URL);
			// NOTE(review): non-HTTP(S) URLs are also reported as "not filtered";
			// Score is left untouched for them. Confirm this is what callers expect.
			if (newUri.Scheme != Uri.UriSchemeHttp && newUri.Scheme != Uri.UriSchemeHttps)
			{
				return false;
			}

			// Only the link text contributes to the score; title, keywords and
			// page content are not part of the calculation.
			try
			{
				pe.Score = this.GetScore(pe.HrefText) * HREFTEXT_SCORE_PORPROTION;
			}
			catch (System.NullReferenceException)
			{
				// Best effort: a failure inside scoring/segmentation counts as
				// "no relevance" rather than aborting the crawl.
				pe.Score = 0;
			}

			// A low score means the link is treated as unrelated to the topic.
			return pe.Score < MIN_RELEVANT_SCORE;
		}

		/// <summary>
		/// Computes the average topic weight of the words found in
		/// <paramref name="strText"/>.
		/// </summary>
		/// <param name="strText">Text to score; <c>null</c> yields 0.</param>
		/// <returns>
		/// The mean weight of the segmented words that occur in the topic
		/// library, or 0 when none of them does.
		/// </returns>
		public float GetScore(string strText)
		{
			// Nothing to segment: no word can match the topic library.
			if (strText == null)
			{
				return 0;
			}

			seg.Separator = "/";
			string segmented = seg.SegmentText(strText, true);

			ArrayList anchorWords = new ArrayList();
			if (segmented.IndexOf('/') != -1)
			{
				// Take every token, including the one after the last separator
				// and a trailing one-character word (both were previously lost).
				foreach (string token in segmented.Split('/'))
				{
					anchorWords.Add(token);
				}
			}
			else
			{
				// The segmenter produced no separator: treat the text as one word.
				anchorWords.Add(strText);
			}

			float total = 0;
			int matched = 0;
			foreach (string word in anchorWords)
			{
				// Look up with the same trimmed key that is used for the
				// membership test; empty tokens cannot be keywords.
				string key = word.Trim();
				if (key.Length == 0)
				{
					continue;
				}

				// Hashtable's indexer returns null for a missing key.
				object weight = topicLibs[key];
				if (weight != null)
				{
					total += (float)Convert.ToDouble(weight.ToString());
					matched++;
				}
			}

			if (matched != 0)
			{
				total = total / matched;
			}

			return total;
		}

	}
}