// Source: www.pudn.com > Crawler_bemjh.rar > Utility.cs
using System;
using ShootSeg;
using System.IO;
using System.Net;
using System.Web;
using System.Data;
using System.Text;
using System.Collections;
using System.Data.SqlClient;
using System.Runtime.InteropServices;
namespace CrawlerLib
{
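// Utility classes for the crawler:
//   1. GetVoc: reads the feature-word vocabulary.
//   2. GetPage: downloads the web page at a given URL.
//   3. Relevance judgement for educational resources, adding them to the
//      matching resource list, addTo(...).
//   4. A class for reading and writing INI text files.
//   5. Returns the weight of link (anchor) text.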
#region 读取特征库
/// <summary>
/// Reads the feature-word vocabulary from a SQL Server database and exposes
/// it as a <see cref="DataView"/>.
/// </summary>
public class GetVoc
{
    #region Private fields
    /// <summary>Server address (IP) of the vocabulary database.</summary>
    private string dbIp = "";
    /// <summary>Name of the vocabulary database.</summary>
    private string dbName = null;
    /// <summary>Account name used to log in to the vocabulary database.</summary>
    private string userName = null;
    /// <summary>Password used to log in to the vocabulary database.</summary>
    private string userPwd = null;
    /// <summary>View over the rows loaded by the last successful <see cref="getVoc"/> call.</summary>
    private DataView dvVoc = new DataView();
    /// <summary>Error message recorded by the last <see cref="getVoc"/> call, or null.</summary>
    private string noteMessage = null;
    #endregion

    #region Public properties
    /// <summary>Gets or sets the server address (IP) of the vocabulary database.</summary>
    public string IP
    {
        get { return dbIp; }
        set { dbIp = value; }
    }

    /// <summary>Gets or sets the name of the vocabulary database.</summary>
    public string DBName
    {
        get { return dbName; }
        set { dbName = value; }
    }

    /// <summary>Gets or sets the database account name.</summary>
    public string UserName
    {
        get { return userName; }
        set { userName = value; }
    }

    /// <summary>Gets or sets the database account password.</summary>
    public string UserPwd
    {
        get { return userPwd; }
        set { userPwd = value; }
    }

    /// <summary>
    /// Gets the view produced by the last successful <see cref="getVoc"/> call
    /// (an empty view until then).
    /// </summary>
    public System.Data.DataView DVVoc
    {
        get { return dvVoc; }
    }

    /// <summary>
    /// Gets the error message of the last <see cref="getVoc"/> call, or null
    /// when that call succeeded or no call has been made yet.
    /// </summary>
    public string NoteMessage
    {
        get { return noteMessage; }
    }
    #endregion

    #region Constructor
    /// <summary>Creates a reader bound to the given database connection settings.</summary>
    /// <param name="IP">Server address of the vocabulary database.</param>
    /// <param name="DbName">Database name.</param>
    /// <param name="UserName">Database account name.</param>
    /// <param name="UserPwd">Database account password.</param>
    public GetVoc(string IP, string DbName, string UserName, string UserPwd)
    {
        dbIp = IP;
        dbName = DbName;
        userName = UserName;
        userPwd = UserPwd;
    }
    #endregion

    #region Public method getVoc
    /// <summary>
    /// Runs <paramref name="SelectString"/> against the configured database and
    /// returns the result as a view over a table named "Vocabulary".
    /// </summary>
    /// <param name="SelectString">
    /// SQL text executed verbatim; it must come from a trusted source because
    /// it is not parameterized.
    /// </param>
    /// <returns>
    /// The loaded view (also available through <see cref="DVVoc"/>), or null
    /// when a <see cref="SqlException"/> occurred; in that case
    /// <see cref="NoteMessage"/> holds the error text.
    /// </returns>
    public DataView getVoc(string SelectString)
    {
        // Clear any error left over from a previous call so that NoteMessage
        // always describes the most recent attempt.
        noteMessage = null;
        try
        {
            // The using blocks guarantee the connection, command and adapter
            // are released even when Open or Fill throws.
            using (SqlConnection cn = new SqlConnection("Server=" + dbIp + "; database=" + dbName + "; uid=" + userName + "; pwd=" + userPwd))
            using (SqlCommand selectCommand = new SqlCommand(SelectString, cn))
            using (SqlDataAdapter adapter = new SqlDataAdapter(selectCommand))
            {
                DataSet vocabularySet = new DataSet();
                cn.Open();
                adapter.Fill(vocabularySet, "Vocabulary");
                dvVoc = new DataView(vocabularySet.Tables["Vocabulary"]);
                return dvVoc;
            }
        }
        catch (SqlException e1)
        {
            noteMessage = "在读取特征词库时发生如下错误:" + e1.Message.ToString();
            return null;
        }
    }
    #endregion
}
#endregion
#region 计算资源权重
/// <summary>
/// Holds the vocabularies and weights used to rate a link (anchor URL plus
/// anchor text) and scores anchor text against a word-weight table.
/// </summary>
public class Classifier
{
    #region Private fields
    /// <summary>Anchor URL (write-only; not read by any active code in this class).</summary>
    private string anchorUrl = null;
    /// <summary>Anchor text (write-only; not read by any active code in this class).</summary>
    private string anchorText = null;
    /// <summary>Authoritative basic-education web sites.</summary>
    private DataView dataviewHosts = null;
    /// <summary>Feature words for school stage (grade).</summary>
    private DataView dataviewGrade = null;
    /// <summary>Feature words for subject.</summary>
    private DataView dataviewSubject = null;
    /// <summary>Feature words for resource type.</summary>
    private DataView dataviewResourceType = null;
    /// <summary>Feature words for combined grade and subject.</summary>
    private DataView dataviewGradeSubject = null;
    /// <summary>Main word-segmentation vocabulary.</summary>
    private DataView dataviewMainVoc = null;
    /// <summary>Special substrings looked for in URLs, with their category IDs.</summary>
    private DataView dataviewUrlString = new DataView();
    /// <summary>Grade ID of the resource.</summary>
    private string gradeID = "0000";
    /// <summary>Subject ID of the resource.</summary>
    private string subjectID = "0000";
    /// <summary>Resource-type ID of the resource.</summary>
    private string resourcetypeID = "0000";
    /// <summary>Combined grade/subject ID of the resource.</summary>
    private string gradesubjectID = "0000";
    /// <summary>Resource weight.</summary>
    private int weight = 0;
    /// <summary>Weight granted when the URL contains a special substring.</summary>
    private int urlWeight = 0;
    /// <summary>Weight granted when the host is an authoritative site.</summary>
    private int hostWeight = 0;
    /// <summary>Word segmenter shared by all instances.</summary>
    private static Segment seg;

    /// <summary>
    /// URL substring / category ID pairs loaded into <see cref="DataViewUrlString"/>
    /// by the parameterized constructor, in insertion order.
    /// </summary>
    private static readonly string[,] UrlStringCategories =
    {
        { "yuwen",      "0001" }, // Chinese
        { "shuxue",     "0002" }, // mathematics
        { "yingyu",     "0003" }, // English
        { "wuli",       "0004" }, // physics
        { "huaxue",     "0005" }, // chemistry
        { "zhengzhi",   "0006" }, // politics
        { "lishi",      "0007" }, // history
        { "dili",       "0008" }, // geography
        { "shengwu",    "0009" }, // biology
        { "yinyue",     "0010" }, // music
        { "tiyu",       "0011" }, // physical education
        { "meishu",     "0012" }, // fine arts
        // NOTE(review): information technology reuses "0012" (fine arts);
        // possibly meant to be its own ID - confirm against the category table.
        { "xinxijishu", "0012" }, // information technology
        { "chinese",    "0001" }, // Chinese
        { "math",       "0002" }, // mathematics
        { "english",    "0003" }, // English
        { "physics",    "0004" }, // physics
        { "chemist",    "0005" }, // chemistry
        { "kejian",     "R001" }, // courseware
        { "cai",        "R001" }, // courseware
        // NOTE(review): the original comment labelled this entry "senior
        // high" yet it carries the courseware ID "R001" - confirm.
        { "gaokao",     "R001" }
    };
    #endregion

    #region Public properties
    /// <summary>Sets the anchor URL.</summary>
    public string AnchorUrl
    {
        set { anchorUrl = value; }
    }

    /// <summary>Sets the anchor text.</summary>
    public string AnchorText
    {
        set { anchorText = value; }
    }

    /// <summary>Sets the authoritative basic-education web sites.</summary>
    public DataView DataViewHosts
    {
        set { dataviewHosts = value; }
    }

    /// <summary>Sets the grade feature words.</summary>
    public DataView DataViewGrade
    {
        set { dataviewGrade = value; }
    }

    /// <summary>Sets the subject feature words.</summary>
    public DataView DataViewSubject
    {
        set { dataviewSubject = value; }
    }

    /// <summary>Sets the resource-type feature words.</summary>
    public DataView DataViewResourceType
    {
        set { dataviewResourceType = value; }
    }

    /// <summary>Sets the combined grade/subject feature words.</summary>
    public DataView DataViewGradeSubject
    {
        set { dataviewGradeSubject = value; }
    }

    /// <summary>Sets the main word-segmentation vocabulary.</summary>
    public DataView DataViewMainVoc
    {
        set { dataviewMainVoc = value; }
    }

    /// <summary>Gets or sets the list of special URL substrings.</summary>
    public DataView DataViewUrlString
    {
        get { return dataviewUrlString; }
        set { dataviewUrlString = value; }
    }

    /// <summary>Gets the grade ID of the resource.</summary>
    public string GradeID
    {
        get { return gradeID; }
    }

    /// <summary>Gets the subject ID of the resource.</summary>
    public string SubjectID
    {
        get { return subjectID; }
    }

    /// <summary>Gets the resource-type ID of the resource.</summary>
    public string ResourceTypeID
    {
        get { return resourcetypeID; }
    }

    /// <summary>Gets the combined grade/subject ID of the resource.</summary>
    public string GradeSubjectID
    {
        get { return gradesubjectID; }
    }

    /// <summary>Gets the resource weight.</summary>
    public int Weight
    {
        get { return weight; }
    }

    /// <summary>Sets the weight granted for special URL substrings.</summary>
    public int UrlWeight
    {
        set { urlWeight = value; }
    }

    /// <summary>Sets the weight granted for authoritative hosts.</summary>
    public int HostWeight
    {
        set { hostWeight = value; }
    }
    #endregion

    #region Parameterless constructor
    /// <summary>
    /// Creates a classifier and (re)loads the shared segmenter's word
    /// dictionaries.
    /// </summary>
    public Classifier()
    {
        seg = new Segment();
        seg.InitWordDics();
    }
    #endregion

    #region Constructor
    /// <summary>
    /// Creates a classifier with the given weights and vocabularies and fills
    /// <see cref="DataViewUrlString"/> with the built-in URL substring table.
    /// </summary>
    /// <param name="UrlWeight">Weight granted for special URL substrings.</param>
    /// <param name="HostWeight">Weight granted for authoritative hosts.</param>
    /// <param name="DataViewHosts">Authoritative basic-education web sites.</param>
    /// <param name="DataViewGrade">Grade feature words.</param>
    /// <param name="DataViewSubject">Subject feature words.</param>
    /// <param name="DataViewResourceType">Resource-type feature words.</param>
    /// <param name="DataViewGradeSubject">Combined grade/subject feature words.</param>
    /// <param name="DataViewMainVoc">Main word-segmentation vocabulary.</param>
    public Classifier(int UrlWeight, int HostWeight, DataView DataViewHosts, DataView DataViewGrade, DataView DataViewSubject, DataView DataViewResourceType, DataView DataViewGradeSubject, DataView DataViewMainVoc)
    {
        urlWeight = UrlWeight;
        hostWeight = HostWeight;
        dataviewHosts = DataViewHosts;
        dataviewGrade = DataViewGrade;
        dataviewSubject = DataViewSubject;
        dataviewResourceType = DataViewResourceType;
        dataviewGradeSubject = DataViewGradeSubject;
        dataviewMainVoc = DataViewMainVoc;

        // Build the two-column table of special URL substrings.
        DataTable urlStringTable = new DataTable("UrlString");
        urlStringTable.Columns.Add(new DataColumn("UrlString", typeof(string)));
        urlStringTable.Columns.Add(new DataColumn("CategoryID", typeof(string)));
        for (int i = 0; i < UrlStringCategories.GetLength(0); i++)
        {
            DataRow row = urlStringTable.NewRow();
            row["UrlString"] = UrlStringCategories[i, 0];
            row["CategoryID"] = UrlStringCategories[i, 1];
            urlStringTable.Rows.Add(row);
        }
        dataviewUrlString.Table = urlStringTable;
    }
    #endregion

    #region Classification from anchor URL and anchor text
    /// <summary>
    /// Intended to derive the resource attributes and weight from an anchor
    /// URL and its text. Currently a no-op: the implementation was disabled
    /// in the source, so calling it changes no state.
    /// </summary>
    /// <param name="AnchorUrl">Anchor URL (unused).</param>
    /// <param name="AnchorText">Anchor text (unused).</param>
    public void classify(string AnchorUrl, string AnchorText)
    {
        // Outline of the disabled implementation, kept for reference:
        //   1. Store the arguments, segment the anchor text with
        //      WordSegment.GetLongWords(dataviewMainVoc, anchorText, "off")
        //      and reset weight to 0.
        //   2. For each keyword found in dataviewGrade, dataviewSubject,
        //      dataviewResourceType and dataviewGradeSubject (via Find), add
        //      the row's "WordWeight" to weight and, when the row's
        //      "CategoryID" is non-empty, store it in gradeID, subjectID,
        //      resourcetypeID or gradesubjectID respectively.
        //   3. If the URL's host is found in dataviewHosts, add hostWeight.
        //   4. If the URL contains one of the special substrings, add
        //      urlWeight (this loop was garbled in the source).
        //   5. A final step reconciling grade, subject and grade/subject IDs
        //      was itself commented out inside the disabled code.
    }
    #endregion

    #region Anchor-text score
    /// <summary>
    /// Segments <paramref name="aText"/> into words and returns the average
    /// weight of those words that appear in <paramref name="wordsList"/>.
    /// </summary>
    /// <param name="aText">Anchor text to score.</param>
    /// <param name="wordsList">
    /// Table mapping a word to its weight; values are converted with
    /// <see cref="Convert.ToDouble(string)"/>. The table is not modified.
    /// </param>
    /// <returns>
    /// The mean weight of the matched words, or 0 when nothing matches or
    /// either argument is null.
    /// </returns>
    public float getScore(string aText, ref Hashtable wordsList)
    {
        if (aText == null || wordsList == null)
        {
            return 0;
        }

        // The parameterized constructor does not create the segmenter, so
        // create it on first use instead of failing with a null reference.
        EnsureSegmenter();

        ArrayList anchorWords = new ArrayList();
        seg.Separator = "/";
        string remaining = seg.SegmentText(aText, true);

        int cut = remaining.IndexOf('/');
        if (cut != -1)
        {
            // NOTE(review): text after the last separator, and any tail of two
            // characters or fewer, is never added - kept exactly as in the
            // original; confirm this matches the segmenter's output format.
            while (cut != -1 && remaining.Length > 2)
            {
                anchorWords.Add(remaining.Substring(0, cut));
                remaining = remaining.Substring(cut + 1);
                cut = remaining.IndexOf('/');
            }
        }
        else
        {
            anchorWords.Add(aText);
        }

        float score = 0;
        int matchCount = 0;
        foreach (string word in anchorWords)
        {
            // Look up with the same trimmed key that is tested; the original
            // tested the trimmed word but indexed with the untrimmed one.
            string key = word.Trim();
            if (!wordsList.ContainsKey(key))
            {
                continue;
            }
            object rawWeight = wordsList[key];
            if (rawWeight == null)
            {
                continue;
            }
            score += (float)Convert.ToDouble(rawWeight.ToString());
            matchCount++;
        }

        if (matchCount != 0)
        {
            score = score / matchCount;
        }
        return score;
    }

    /// <summary>Creates the shared segmenter and loads its dictionaries if it does not exist yet.</summary>
    private static void EnsureSegmenter()
    {
        if (seg == null)
        {
            Segment created = new Segment();
            created.InitWordDics();
            seg = created;
        }
    }
    #endregion
}
#endregion
#region 读写 INI 文本文件
/// <summary>
/// Reads and writes key values in an INI text file through the Win32
/// private-profile API.
///   1. IniFile()        constructor
///   2. GetKeyValue()    reads a key value
///   3. WriteKeyValue()  writes a key value
/// </summary>
public class IniFile
{
    /// <summary>
    /// Size, in characters, of the buffer used to read a value. The original
    /// comment indicated it was sized to hold about 20 URLs.
    /// </summary>
    private const int MaxValueLength = 51000;

    /// <summary>Path of the INI file, formed as ".\" followed by the file name.</summary>
    private string m_IniFileName = null;

    // Win32 API functions used to read and write the INI file.
    [DllImport("Kernel32")]
    private static extern int GetPrivateProfileString(
        string lpAppName, string lpKeyName, string lpDefault,
        StringBuilder lpReturnedString, int nSize, string lpFileName);

    [DllImport("Kernel32")]
    private static extern bool WritePrivateProfileString(
        string lpAppName, string lpKeyName, string lpString,
        string lpFileName);

    /// <summary>
    /// Creates an accessor for the named INI file.
    /// </summary>
    /// <param name="IniFileName">
    /// INI file name. It is prefixed with ".\", i.e. the path is relative
    /// rather than absolute.
    /// NOTE(review): the original comment says the file lives in the
    /// application's folder; that only holds while the current directory is
    /// the application folder - confirm.
    /// </param>
    public IniFile(string IniFileName)
    {
        m_IniFileName = ".\\" + IniFileName;
    }

    /// <summary>
    /// Returns the value stored under the given section and key.
    /// </summary>
    /// <param name="sectionName">Section name.</param>
    /// <param name="keyName">Key name.</param>
    /// <returns>The key value; an empty string is passed as the default.</returns>
    public string GetKeyValue(string sectionName, string keyName)
    {
        StringBuilder keyValue = new StringBuilder(MaxValueLength, MaxValueLength);
        // Pass the real buffer size: the original passed 500, which silently
        // truncated every value to 499 characters despite the large buffer.
        GetPrivateProfileString(sectionName, keyName, "", keyValue, keyValue.Capacity, m_IniFileName);
        return keyValue.ToString();
    }

    /// <summary>
    /// Writes a value under the given section and key.
    /// </summary>
    /// <param name="sectionName">Section name.</param>
    /// <param name="keyName">Key name.</param>
    /// <param name="keyValue">Value to store.</param>
    public void WriteKeyValue(string sectionName, string keyName, string keyValue)
    {
        // The API's success flag is not reported to the caller.
        WritePrivateProfileString(sectionName, keyName, keyValue, m_IniFileName);
    }
}
#endregion
#region 从网络下载指定的 url
/// <summary>
/// Downloads the content of a URL, optionally through a proxy server, and
/// offers a simple filter for URLs the crawler should skip.
/// </summary>
public class GetPage
{
    #region Private fields
    /// <summary>URL of the page to download.</summary>
    private string url = null;
    /// <summary>Proxy switch: 0 = no proxy, 1 = use the proxy server.</summary>
    private int proxyState = 0;
    /// <summary>Proxy server address.</summary>
    private string proxyAddress = null;
    /// <summary>Proxy server port.</summary>
    private string proxyPort = null;
    /// <summary>Proxy server user name.</summary>
    private string proxyAccount = null;
    /// <summary>Proxy server password.</summary>
    private string proxyPassword = null;
    /// <summary>Proxy server domain.</summary>
    private string proxyDomain = null;
    /// <summary>Output file path (stored only; not used by this class).</summary>
    private string outFilePath = null;
    /// <summary>Text downloaded by the last <see cref="GetHtml"/> call.</summary>
    private string outString = null;
    /// <summary>Message describing why the last <see cref="GetHtml"/> call failed.</summary>
    private string noteMessage = null;

    /// <summary>
    /// Substrings that make <see cref="checkURL"/> reject a URL.
    /// NOTE(review): ".wam" and the dot-less "ppt" look like typos (perhaps
    /// ".wav"/".wma" and ".ppt"); kept unchanged to preserve filtering.
    /// </summary>
    private static readonly string[] RejectedUrlFragments =
    {
        "mailto", ".wam", ".gif", ".jpg", ".rm", ".mp3", ".pdf", ".doc", "javascript", "ppt"
    };
    #endregion

    #region Public properties
    /// <summary>Gets or sets the URL to download.</summary>
    public string Url
    {
        get { return url; }
        set { url = value; }
    }

    /// <summary>Gets or sets the proxy switch (1 = use the proxy server).</summary>
    public int ProxyState
    {
        get { return proxyState; }
        set { proxyState = value; }
    }

    /// <summary>Gets or sets the proxy server address.</summary>
    public string ProxyAddress
    {
        get { return proxyAddress; }
        set { proxyAddress = value; }
    }

    /// <summary>Gets or sets the proxy server port.</summary>
    public string ProxyPort
    {
        get { return proxyPort; }
        set { proxyPort = value; }
    }

    /// <summary>Gets or sets the proxy server account.</summary>
    public string ProxyAccount
    {
        get { return proxyAccount; }
        set { proxyAccount = value; }
    }

    /// <summary>Gets or sets the proxy server password.</summary>
    public string ProxyPassword
    {
        get { return proxyPassword; }
        set { proxyPassword = value; }
    }

    /// <summary>Gets or sets the proxy server domain.</summary>
    public string ProxyDomain
    {
        get { return proxyDomain; }
        set { proxyDomain = value; }
    }

    /// <summary>Gets or sets the output file path.</summary>
    public string OutFilePath
    {
        get { return outFilePath; }
        set { outFilePath = value; }
    }

    /// <summary>
    /// Gets the text downloaded by the last <see cref="GetHtml"/> call, or
    /// null when that call failed or none has been made.
    /// </summary>
    public string OutString
    {
        get { return outString; }
    }

    /// <summary>
    /// Gets the message explaining why the last <see cref="GetHtml"/> call
    /// returned null, or null when it succeeded.
    /// </summary>
    public string NoteMessage
    {
        get { return noteMessage; }
    }
    #endregion

    #region Constructor
    /// <summary>Creates a downloader with no URL and the proxy disabled.</summary>
    public GetPage()
    {
    }
    #endregion

    #region Public method GetHtml
    /// <summary>
    /// Downloads <see cref="Url"/> and returns its content decoded with the
    /// system default encoding.
    /// </summary>
    /// <returns>
    /// The downloaded text, or null when <see cref="Url"/> is null or empty or
    /// the download failed; <see cref="NoteMessage"/> then holds the reason.
    /// </returns>
    /// <remarks>
    /// Creating the request and configuring the proxy happen outside the
    /// try block, so exceptions from those steps (for example a malformed URL
    /// or proxy address) propagate to the caller.
    /// </remarks>
    public string GetHtml()
    {
        string tempCode = null;
        // Start each call with a clean slate so stale results are not reported.
        noteMessage = null;
        outString = null;

        // Short-circuit "||": the original "|" evaluated url.Length even when
        // url was null and crashed with a NullReferenceException.
        if (this.url == null || this.url.Length == 0)
        {
            noteMessage = "Url 不能为空...";
            return tempCode;
        }

        WebRequest request = WebRequest.Create(this.url);
        if (this.proxyState == 1)
        {
            // Default to port 80 when none was configured.
            if (this.proxyPort == null)
            {
                this.ProxyPort = "80";
            }
            // Build a fresh proxy: request.Proxy is an IWebProxy that is not
            // guaranteed to be a WebProxy instance (or non-null), so casting
            // it as the original did can fail.
            WebProxy myProxy = new WebProxy();
            myProxy.Address = new Uri(this.ProxyAddress + ":" + this.ProxyPort);
            myProxy.Credentials = new NetworkCredential(this.proxyAccount, this.proxyPassword, this.ProxyDomain);
            request.Proxy = myProxy;
        }

        try
        {
            // using releases the response, stream and reader even when the
            // read fails part-way.
            using (WebResponse response = request.GetResponse())
            using (Stream resStream = response.GetResponseStream())
            using (StreamReader sr = new StreamReader(resStream, System.Text.Encoding.Default))
            {
                tempCode = sr.ReadToEnd();
            }
            outString = tempCode;
        }
        catch (Exception e1)
        {
            this.noteMessage = e1.Message.ToString();
        }
        return tempCode;
    }
    #endregion

    #region URL filter for the topical crawler
    /// <summary>
    /// Tells whether a URL is one the crawler wants, i.e. its lower-cased
    /// form contains none of the rejected fragments (mail and script links,
    /// image, media and document extensions).
    /// </summary>
    /// <param name="aUrl">URL to check.</param>
    /// <returns>
    /// True when the URL contains none of the rejected fragments; false
    /// otherwise, including when <paramref name="aUrl"/> is null.
    /// </returns>
    public static bool checkURL(string aUrl)
    {
        if (aUrl == null)
        {
            return false;
        }

        // Lower-case once instead of once per fragment.
        string lowered = aUrl.ToLower();
        foreach (string fragment in RejectedUrlFragments)
        {
            if (lowered.IndexOf(fragment) != -1)
            {
                return false;
            }
        }
        return true;
    }
    #endregion
}
#endregion
#region 读写日志文件
/// <summary>
/// Appends timestamped entries to "logFile.txt" in the current directory,
/// archiving the file under a timestamped name once it grows too large.
/// </summary>
public class Log
{
    /// <summary>File size, in bytes, above which the log is archived before writing.</summary>
    private const int LogMaxContent = 4048000;

    /// <summary>Name of the log file inside the current directory.</summary>
    private const string LogFileName = "logFile.txt";

    /// <summary>
    /// Appends <paramref name="msg"/> to the log file as a dated entry.
    /// Logging is best-effort: failures are reported to the trace listeners
    /// and never thrown to the caller.
    /// </summary>
    /// <param name="msg">Text of the log entry.</param>
    public void writeLogText(string msg)
    {
        try
        {
            string directory = System.IO.Directory.GetCurrentDirectory();
            // Path.Combine inserts the separator the original concatenation
            // omitted, which put the file in the parent directory under a
            // mangled name.
            string pathName = Path.Combine(directory, LogFileName);
            FileInfo logInfo = new FileInfo(pathName);

            if (logInfo.Exists && logInfo.Length > LogMaxContent)
            {
                // "HH" (24-hour) keeps morning and afternoon archives from
                // colliding on the same name, as "hh" allowed.
                string archivePath = Path.Combine(directory, DateTime.Now.ToString("yyyyMMddHHmm") + LogFileName);
                try
                {
                    logInfo.CopyTo(archivePath);
                    logInfo.Delete();
                }
                catch (IOException rotateError)
                {
                    // A failed rotation must not cost us the entry itself.
                    System.Diagnostics.Trace.WriteLine("Log rotation failed: " + rotateError.Message);
                }
            }

            StringBuilder entry = new StringBuilder();
            entry.Append("\r\n Log Entry : ");
            entry.Append(DateTime.Now.ToString());
            entry.Append("\r\n");
            entry.Append(msg + "\r\n");
            entry.Append("-------------------------------------------------------------------------------------------\n");

            // Open in append mode; the using block flushes and closes the file.
            using (StreamWriter writer = new StreamWriter(pathName, true))
            {
                writer.Write(entry.ToString());
            }
        }
        catch (Exception e)
        {
            // Deliberately not rethrown: a logging failure must not stop the
            // crawler, but it should at least be visible while diagnosing.
            System.Diagnostics.Trace.WriteLine("Log.writeLogText failed: " + e.Message);
        }
    }
}
#endregion
}