// www.pudn.com > ContentAnalyzer.rar > Program.cs, change:2009-03-02,size:3836b


/***************************************************************** 
*                Net.LikeShow.ContentAnalyze 
*                code by King      
*                http://www.likeshow.net 
*                qq:5088300 MSN:yy_8354@hotmail.com 
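*   Content-extraction class: provides basic analysis and extraction of a web page's
*   main content, returning the title, publish time, body text and content type.
*   Content types are: news, bbs, blogs.
*   The component's internal algorithm is mainly rule-model-based extraction; nearly all
*   rules are implemented with regular expressions. For the specific expressions see the
*   posts 《正文抽取正则》 ("Regular expressions for content extraction") and
*   《聊聊网页正文抽取》 ("Notes on web page content extraction") on my blog.
*   http://www.likeshow.net/article.asp?id=60 《聊聊网页正文抽取》
*   http://www.likeshow.net/article.asp?id=55 《正文抽取正则》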
*****************************************************************/ 
 
 
using System; 
using System.Collections.Generic; 
using System.Text; 
using System.IO; 
using System.Net; 
using Net.LikeShow.ContentAnalyze; 
using Net.LikeShow.ContentAnalyze.DataClass; 
 
namespace TestAnalyzer 
{ 
    /// <summary>
    /// Console test driver: downloads a single page, feeds it to <c>CommonAnalyze</c>
    /// and prints the fields of the resulting <c>Document</c> together with the
    /// time the analysis took.
    /// </summary>
    class Program
    {
        // User-Agent header sent with the page request.
        private const string UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0)";

        /// <summary>
        /// Extracts the charset name from a Content-Type header value,
        /// e.g. <c>text/html; charset="utf-8"</c> yields <c>utf-8</c>.
        /// </summary>
        /// <param name="tmp1">The Content-Type header value; may be null or empty.</param>
        /// <returns>
        /// The charset name without surrounding quotes, whitespace or trailing
        /// parameters, or an empty string when no <c>charset=</c> parameter is present.
        /// </returns>
        private static string GetCharType(string tmp1)
        {
            const string marker = "charset=";

            if (String.IsNullOrEmpty(tmp1))
            {
                return "";
            }

            // Header parameter names are case-insensitive ("Charset=UTF-8" is legal).
            int markerPos = tmp1.IndexOf(marker, StringComparison.OrdinalIgnoreCase);
            if (markerPos < 0)
            {
                return "";
            }

            string charset = tmp1.Substring(markerPos + marker.Length);

            // Drop any parameters that follow the charset ("; boundary=...").
            int paramEnd = charset.IndexOf(';');
            if (paramEnd >= 0)
            {
                charset = charset.Substring(0, paramEnd);
            }

            // The value may be quoted and padded with whitespace.
            return charset.Trim().Trim('"', '\'').Trim();
        }

        /// <summary>
        /// Picks the text encoding to use for a response with the given Content-Type.
        /// </summary>
        /// <param name="contentType">The Content-Type header value; may be null or empty.</param>
        /// <returns>
        /// The encoding named by the charset parameter, or <see cref="Encoding.Default"/>
        /// when the header names no charset or names one the platform does not know.
        /// </returns>
        private static Encoding ResolveEncoding(string contentType)
        {
            string charset = GetCharType(contentType);
            if (charset.Length == 0)
            {
                return Encoding.Default;
            }

            try
            {
                return Encoding.GetEncoding(charset);
            }
            catch (ArgumentException)
            {
                // Unknown or unsupported charset name: a wrong header should not
                // abort the download, so fall back to the default encoding.
                return Encoding.Default;
            }
        }

        /// <summary>
        /// Downloads the page at <paramref name="url"/> and returns its content as text.
        /// </summary>
        /// <param name="url">Absolute HTTP(S) URL of the page to fetch.</param>
        /// <returns>The response body decoded with the encoding chosen from its Content-Type.</returns>
        /// <exception cref="WebException">The request failed.</exception>
        private static string GetPage(string url)
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.UserAgent = UserAgent;

            // Dispose response, stream and reader so the connection is released
            // even when reading throws.
            using (WebResponse response = request.GetResponse())
            using (Stream stream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(stream, ResolveEncoding(response.ContentType)))
            {
                return reader.ReadToEnd();
            }
        }

        /// <summary>
        /// Entry point: fetches a fixed URL, runs the analyzer on it and prints the result.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            string url = @"http://www.taobao.com";
            UriKind uriKind = UriKind.Absolute;

            if (!String.IsNullOrEmpty(url) && Uri.IsWellFormedUriString(url, uriKind))
            {
                string pageHtml = GetPage(url);

                // The stopwatch covers only the analysis, not the download above.
                System.Diagnostics.Stopwatch watch = System.Diagnostics.Stopwatch.StartNew();
                Html myhtml = new Html();
                myhtml.Web = pageHtml;
                myhtml.Url = url;
                CommonAnalyze analyzer = new CommonAnalyze();
                analyzer.LoadHtml(myhtml);
                Document doc = analyzer.GetResult();
                watch.Stop();

                Console.WriteLine(url);
                Console.WriteLine("\r\n");
                Console.WriteLine(doc.Title);           // title
                Console.WriteLine("\r\n");
                Console.WriteLine(doc.UpTime);          // publish time
                Console.WriteLine("\r\n");
                Console.WriteLine(doc.Doc);             // body text
                Console.WriteLine("\r\n");
                Console.WriteLine(doc.SiteType);        // content type
                Console.WriteLine("\r\n");
                Console.WriteLine(watch.Elapsed);
                Console.WriteLine("\r\n");
            }

            // Block until input is available before exiting (presumably so the
            // console window stays open when launched outside a shell).
            Console.Read();
        }
    }
}