// www.pudn.com > Crawler_bemjh.rar > FilterURL.cs
using System;
using Crawler;
using ShootSeg;
using System.Collections;
namespace CrawlerLib
{
/// <summary>
/// Decides whether a link should be filtered out, based on a list of
/// filter URLs and a keyword-weight score computed from the link's anchor text.
/// </summary>
public class FilterURL
{
    // Weighting factors for the individual parts of a page. Only the
    // href-text factor is applied at the moment; the title, keywords and
    // content contributions are currently disabled in isFilterURL.
    const float TITLE_SCORE_PORPROTION = 0.4F;
    const float HREFTEXT_SCORE_PORPROTION = 0.4F;
    const float KEYWORDS_SCORE_PORPROTION = 0.2F;
    const float CONTENT_SCORE_PORPROTION = 0.5F;

    // Keyword -> weight table, loaded from the logical layer.
    private Hashtable topicLibs;

    // URLs loaded from LogicalLayer.GetFilterURL().
    private ArrayList filteredURL;

    // Word segmenter. Static, i.e. shared by every FilterURL instance.
    // NOTE(review): it is re-created and its dictionaries re-initialised by
    // every constructor call; confirm whether one-time initialisation would do.
    private static Segment seg;

    private LogicalLayer llay;

    /// <summary>
    /// Initialises the segmenter dictionaries and loads the keyword weights
    /// and the filter-URL list from the logical layer.
    /// </summary>
    public FilterURL()
    {
        // Initialise the segmentation dictionaries.
        seg = new Segment();
        seg.InitWordDics();

        llay = new LogicalLayer();

        // Keyword weights.
        topicLibs = llay.GetTopicLib();

        // URLs to filter.
        filteredURL = llay.GetFilterURL();
    }

    /// <summary>
    /// Determines whether a URL is to be filtered. As a side effect,
    /// <c>pe.Score</c> is set to the weighted anchor-text score when the URL
    /// is scored.
    /// </summary>
    /// <param name="pe">Page element whose URL and HrefText are examined; its Score is updated.</param>
    /// <returns>
    /// <c>true</c> when the computed score is below 0.1 (treated as unrelated
    /// to the topic); otherwise <c>false</c>.
    /// </returns>
    public bool isFilterURL(ref PageElement pe)
    {
        // NOTE(review): a URL that IS present in the filter list is reported
        // as "not filtered" (false) and is never scored. This mirrors the
        // original behaviour but looks inverted - confirm against callers.
        if (filteredURL.IndexOf(pe.URL) != -1)
        {
            return false;
        }

        // Only HTTP and HTTPS links are scored.
        // NOTE(review): other schemes return false ("not filtered") - confirm intent.
        MyUri newUri = new MyUri(pe.URL);
        if (newUri.Scheme != Uri.UriSchemeHttp && newUri.Scheme != Uri.UriSchemeHttps)
        {
            return false;
        }

        // Score the link (anchor) text. Title, keywords and page content are
        // not part of the score at the moment.
        try
        {
            pe.Score = this.GetScore(pe.HrefText) * HREFTEXT_SCORE_PORPROTION;
        }
        catch (System.NullReferenceException)
        {
            // Best-effort fallback kept from the original: any null encountered
            // while scoring yields a zero score.
            pe.Score = 0;
        }

        // A score this low means the link is considered unrelated to the topic.
        return pe.Score < 0.1;
    }

    /// <summary>
    /// Computes the average keyword weight of the words found in a text.
    /// </summary>
    /// <param name="strText">Text to segment and score; null or empty yields 0.</param>
    /// <returns>
    /// The mean weight of all segmented words that occur in the keyword table,
    /// or 0 when no word matches.
    /// </returns>
    public float GetScore(string strText)
    {
        // Nothing to segment: no keyword can match.
        if (strText == null || strText.Length == 0)
        {
            return 0;
        }

        seg.Separator = "/";
        string segmented = seg.SegmentText(strText, true);

        ArrayList words = new ArrayList();
        int sepIndex = (segmented == null) ? -1 : segmented.IndexOf('/');
        if (sepIndex == -1)
        {
            // The segmenter produced no separator: treat the whole text as one word.
            words.Add(strText);
        }
        else
        {
            // Peel off one word per separator.
            // NOTE(review): the Length > 2 condition is kept from the original;
            // it stops early on a very short remainder, and any text after the
            // last separator is never added - confirm this matches the
            // segmenter's output format.
            while (sepIndex != -1 && segmented.Length > 2)
            {
                words.Add(segmented.Substring(0, sepIndex));
                segmented = segmented.Substring(sepIndex + 1);
                sepIndex = segmented.IndexOf('/');
            }
        }

        float total = 0;
        int matchCount = 0;
        foreach (string word in words)
        {
            // Look up with the same trimmed key that is used for the
            // membership check; using the untrimmed word here returned null
            // for padded words and made the whole score collapse to zero.
            string key = word.Trim();
            if (topicLibs.ContainsKey(key))
            {
                object weight = topicLibs[key];
                if (weight != null)
                {
                    total += (float)Convert.ToDouble(weight.ToString());
                    matchCount++;
                }
            }
        }

        if (matchCount != 0)
        {
            total = total / matchCount;
        }
        return total;
    }
}
}