// www.pudn.com > Crawler_bemjh.rar > Parse.cs
using System;
namespace CrawlerLib
{
/// <summary>
/// Base class for parsing tag-based files, such as HTML, HTTP headers
/// or XML.
/// </summary>
/// <remarks>
/// This spider is copyright 2003 by Jeff Heaton. However, it is
/// released under a Limited GNU Public License (LGPL). You may
/// use it freely in your own programs. For the latest version visit
/// http://www.jeffheaton.com.
/// </remarks>
public class Parse:AttributeList
{
    /// <summary>
    /// The text that is being parsed. The <see cref="Source"/> setter stores
    /// the assigned text followed by one padding space.
    /// </summary>
    private string m_source;

    /// <summary>
    /// The current position inside of the text that is being parsed.
    /// </summary>
    private int m_idx;

    /// <summary>
    /// The most recently parsed attribute delimiter.
    /// </summary>
    private char m_parseDelim;

    /// <summary>
    /// The most recently parsed attribute name.
    /// </summary>
    private string m_parseName;

    /// <summary>
    /// The most recently parsed attribute value.
    /// </summary>
    private string m_parseValue;

    /// <summary>
    /// The most recently parsed tag.
    /// </summary>
    protected string m_tag;

    /// <summary>
    /// Length of the document text as assigned to <see cref="Source"/>,
    /// not counting the padding space.
    /// </summary>
    private int m_docLength = 0;

    /// <summary>
    /// The base (root) address of the web page.
    /// </summary>
    private string m_baseURL = "";

    /// <summary>
    /// Gets or sets the base (root) address of the web page.
    /// </summary>
    public string BaseURL
    {
        get
        {
            return m_baseURL;
        }
        set
        {
            m_baseURL = value;
        }
    }

    /// <summary>
    /// Determine if the specified character is whitespace or not.
    /// Only tab, line feed, carriage return and space count as whitespace.
    /// </summary>
    /// <param name="ch">A character to check.</param>
    /// <returns>true if the character is whitespace.</returns>
    protected static bool IsWhiteSpace(char ch)
    {
        return "\t\n\r ".IndexOf(ch) != -1;
    }

    /// <summary>
    /// Advance the index until it is past any whitespace, or until the
    /// end of the text is reached.
    /// </summary>
    protected void EatWhiteSpace()
    {
        while ( !Eof() && IsWhiteSpace(GetCurrentChar()) )
        {
            m_idx++;
        }
    }

    /// <summary>
    /// Determine if the end of the source text has been reached.
    /// </summary>
    /// <returns>True if the end of the source text has been reached.</returns>
    protected bool Eof()
    {
        return m_idx >= m_docLength;
    }

    /// <summary>
    /// Parse an attribute name. Leading whitespace is skipped, then
    /// characters are upper-cased and appended to the current attribute
    /// name until whitespace, '=' or '>' is found or the text ends.
    /// Whitespace following the name is skipped as well.
    /// </summary>
    protected void ParseAttributeName()
    {
        EatWhiteSpace();

        while ( !Eof() )
        {
            char ch = GetCurrentChar();

            // The name ends at whitespace, '=' or the end of the tag.
            if ( IsWhiteSpace(ch) || ch == '=' || ch == '>' )
                break;

            // Upper-case with the invariant culture so that names such as
            // "id" or "width" are not turned into "İD"/"WİDTH" when the
            // current culture is Turkish or Azeri.
            m_parseName += char.ToUpper(ch, System.Globalization.CultureInfo.InvariantCulture);
            m_idx++;
        }

        EatWhiteSpace();
    }

    /// <summary>
    /// Parse the attribute value. Nothing is done when a delimiter has
    /// already been recorded or when the current character is not '='.
    /// A value that starts with ' or " records that delimiter and runs
    /// until the delimiter, whitespace, or the end of the text; an
    /// unquoted value runs until whitespace, '>' or the end of the text.
    /// </summary>
    protected void ParseAttributeValue()
    {
        if ( m_parseDelim != 0 )
            return;

        if ( GetCurrentChar() != '=' )
            return;

        // Step over '=' and any whitespace that follows it.
        m_idx++;
        EatWhiteSpace();

        char first = GetCurrentChar();
        if ( first == '\'' || first == '\"' )
        {
            // Remember the delimiter and step over it.
            m_parseDelim = first;
            m_idx++;

            // A quoted value ends at the delimiter or at whitespace.
            while ( !Eof() )
            {
                char ch = GetCurrentChar();
                if ( ch == m_parseDelim || IsWhiteSpace(ch) )
                    break;
                m_parseValue += ch;
                m_idx++;
            }

            // Step over the closing delimiter only when it is really there.
            // When the value ended at whitespace, EatWhiteSpace() below
            // consumes it; when the text ended, the index must not be
            // pushed past the end.
            if ( !Eof() && GetCurrentChar() == m_parseDelim )
                m_idx++;
        }
        else
        {
            // Unquoted value: read up to whitespace or the end of the tag.
            while ( !Eof() )
            {
                char ch = GetCurrentChar();
                if ( IsWhiteSpace(ch) || ch == '>' )
                    break;
                m_parseValue += ch;
                m_idx++;
            }
        }

        EatWhiteSpace();
    }

    /// <summary>
    /// Add a parsed attribute to the collection, built from the current
    /// attribute name, value and delimiter.
    /// </summary>
    protected void AddAttribute()
    {
        Attribute a = new Attribute(m_parseName, m_parseValue, m_parseDelim);
        Add(a);
    }

    /// <summary>
    /// Get the current character that is being parsed.
    /// </summary>
    /// <returns>The character at the current position.</returns>
    protected char GetCurrentChar()
    {
        return GetCurrentChar(0);
    }

    /// <summary>
    /// Get a character a few positions ahead of the current character.
    /// </summary>
    /// <param name="peek">How many characters to peek ahead for.</param>
    /// <returns>
    /// The character that was retrieved, or (char)0 when the requested
    /// position lies outside of the stored text.
    /// </returns>
    protected char GetCurrentChar(int peek)
    {
        // NOTE(review): the body of this method was truncated in the file
        // as received (only "if( (m_idx+peek)" survived). It is reconstructed
        // here as a bounds-checked read; the bound used is the stored text,
        // which includes the padding space added by the Source setter.
        // Confirm against the original project if the limit should be
        // m_docLength instead.
        int pos = m_idx + peek;
        if ( m_source == null || pos < 0 || pos >= m_source.Length )
            return (char)0;
        return m_source[pos];
    }

    /// <summary>
    /// Get the character at the current position, then move the position
    /// forward by one. No bounds check is performed here.
    /// </summary>
    /// <returns>The character that was at the current position.</returns>
    protected char AdvanceCurrentChar()
    {
        return m_source[m_idx++];
    }

    /// <summary>
    /// Move forward by one character.
    /// </summary>
    protected void Advance()
    {
        m_idx++;
    }

    /// <summary>
    /// Move forward by the specified number of characters.
    /// </summary>
    /// <param name="i">The number of characters to move by.</param>
    protected void Advance(int i)
    {
        m_idx += i;
    }

    /// <summary>
    /// The last attribute name that was encountered.
    /// </summary>
    protected string ParseName
    {
        get
        {
            return m_parseName;
        }
        set
        {
            m_parseName = value;
        }
    }

    /// <summary>
    /// The last attribute value that was encountered.
    /// </summary>
    protected string ParseValue
    {
        get
        {
            return m_parseValue;
        }
        set
        {
            m_parseValue = value;
        }
    }

    /// <summary>
    /// The last attribute delimiter that was encountered.
    /// </summary>
    protected char ParseDelim
    {
        get
        {
            return m_parseDelim;
        }
        set
        {
            m_parseDelim = value;
        }
    }

    /// <summary>
    /// The text that is to be parsed. The setter stores the text with one
    /// padding space appended (to avoid a read error when the last
    /// character is "&lt;") and records the unpadded length; the getter
    /// returns the stored, padded text. Assigning null is treated like
    /// assigning an empty string. The current position is not changed.
    /// </summary>
    public string Source
    {
        get
        {
            return m_source;
        }
        set
        {
            // Handle null explicitly instead of swallowing the exception
            // from value.Length, which used to leave the length of the
            // previous document in place.
            string text = value;
            if ( text == null )
                text = "";

            m_source = text + " ";
            m_docLength = text.Length;
        }
    }
}
}