// Source: www.pudn.com > Crawler_bemjh.rar > Parse.cs


using System; 
 
namespace CrawlerLib 
{ 
	/// <summary>
	/// Base class for parsing tag-based files, such as HTML, HTTP headers
	/// or XML.
	/// </summary>
	/// <remarks>
	/// This spider is copyright 2003 by Jeff Heaton. However, it is
	/// released under a Limited GNU Public License (LGPL). You may
	/// use it freely in your own programs. For the latest version visit
	/// http://www.jeffheaton.com.
	/// </remarks>
	public class Parse:AttributeList
	{
		/// <summary>
		/// The source text that is being parsed. When assigned through
		/// <see cref="Source"/> it carries one extra trailing space.
		/// </summary>
		private string m_source;

		/// <summary>
		/// The current position inside of the text that is being parsed.
		/// </summary>
		private int m_idx;

		/// <summary>
		/// The most recently parsed attribute delimiter, or 0 when none
		/// has been recorded.
		/// </summary>
		private char m_parseDelim;

		/// <summary>
		/// The most recently parsed attribute name.
		/// </summary>
		private string m_parseName;

		/// <summary>
		/// The most recently parsed attribute value.
		/// </summary>
		private string m_parseValue;

		/// <summary>
		/// The most recently parsed tag. It is not assigned inside this
		/// class; presumably derived parsers maintain it.
		/// </summary>
		protected string m_tag;

		/// <summary>
		/// Total length of the document, excluding the padding space that
		/// the <see cref="Source"/> setter appends.
		/// </summary>
		private int m_docLength=0;

		/// <summary>
		/// Base (root) address of the page.
		/// </summary>
		private string m_baseURL="";

		/// <summary>
		/// Gets or sets the base (root) address of the page.
		/// </summary>
		public string BaseURL
		{
			get
			{
				return m_baseURL;
			}
			set
			{
				m_baseURL=value;
			}
		}

		/// <summary>
		/// Determine if the specified character is whitespace or not.
		/// Only tab, line feed, carriage return and space count.
		/// </summary>
		/// <param name="ch">A character to check.</param>
		/// <returns>true if the character is whitespace.</returns>
		protected static bool IsWhiteSpace(char ch)
		{
			return( "\t\n\r ".IndexOf(ch) != -1 );
		}

		/// <summary>
		/// Advance the index until past any whitespace, stopping at the
		/// end of the document.
		/// </summary>
		protected void EatWhiteSpace()
		{
			while ( !Eof() && IsWhiteSpace(GetCurrentChar()) )
			{
				m_idx++;
			}
		}

		/// <summary>
		/// Determine if the end of the source text has been reached.
		/// </summary>
		/// <returns>
		/// True if the current index is at or beyond the document length.
		/// </returns>
		protected bool Eof()
		{
			return( m_idx>=m_docLength );
		}

		/// <summary>
		/// Parse an attribute name. Leading whitespace is skipped, then
		/// characters are upper-cased and appended to the current parse
		/// name until whitespace, '=' or '&gt;' is reached. Trailing
		/// whitespace is skipped afterwards.
		/// </summary>
		protected void ParseAttributeName()
		{
			// Skip any \r, \n, \t and spaces in front of the name.
			EatWhiteSpace();

			while ( !Eof() )
			{
				char ch = GetCurrentChar();

				// The name ends at whitespace, '=' or the closing '>'.
				if ( IsWhiteSpace(ch) || ch=='=' || ch=='>' )
				{
					break;
				}

				m_parseName += char.ToUpper(ch);
				m_idx++;
			}

			EatWhiteSpace();
		}

		/// <summary>
		/// Parse the attribute value. Nothing happens when a delimiter is
		/// already recorded or when the current character is not '='.
		/// </summary>
		protected void ParseAttributeValue()
		{
			if ( m_parseDelim != 0 )
			{
				return;
			}

			if ( GetCurrentChar() != '=' )
			{
				return;
			}

			// Step over '=' and any whitespace that follows it.
			m_idx++;
			EatWhiteSpace();

			char first = GetCurrentChar();
			if ( first=='\'' || first=='\"' )
			{
				// Remember the opening quote and move onto the value.
				m_parseDelim = first;
				m_idx++;

				// NOTE(review): a quoted value also ends at the first
				// whitespace character, so "a b" yields "a". The original
				// comment describes this as intended; confirm before
				// relaxing it.
				while ( !Eof() &&
					GetCurrentChar() != m_parseDelim &&
					!IsWhiteSpace(GetCurrentChar()) )
				{
					m_parseValue += GetCurrentChar();
					m_idx++;
				}

				// Step over the closing delimiter (or the whitespace that
				// ended the value). When the text ran out first there is
				// nothing to skip, so the index is left at the end rather
				// than pushed beyond the padding character.
				if ( !Eof() )
				{
					m_idx++;
				}
			}
			else
			{
				// Unquoted value: read up to whitespace or '>'.
				while ( !Eof() &&
					!IsWhiteSpace(GetCurrentChar()) &&
					(GetCurrentChar()!='>') )
				{
					m_parseValue += GetCurrentChar();
					m_idx++;
				}
			}

			EatWhiteSpace();
		}

		/// <summary>
		/// Add a parsed attribute to the collection, built from the
		/// current parse name, value and delimiter.
		/// </summary>
		protected void AddAttribute()
		{
			Attribute a = new Attribute(m_parseName,m_parseValue,m_parseDelim);
			Add(a);
		}

		/// <summary>
		/// Get the current character that is being parsed.
		/// </summary>
		/// <returns>
		/// The character at the current index, or (char)0 at the end of
		/// the document.
		/// </returns>
		protected char GetCurrentChar()
		{
			return GetCurrentChar(0);
		}

		/// <summary>
		/// Get a few characters ahead of the current character.
		/// </summary>
		/// <remarks>
		/// NOTE(review): the original body of this method was lost in the
		/// available copy of the file (only "if( (m_idx+peek)" survived).
		/// This bounds-checked implementation is a reconstruction that is
		/// consistent with <see cref="Eof"/>; confirm against the
		/// upstream source.
		/// </remarks>
		/// <param name="peek">How many characters to peek ahead for.</param>
		/// <returns>
		/// The character that was retrieved, or (char)0 when the requested
		/// position lies outside the document.
		/// </returns>
		protected char GetCurrentChar(int peek)
		{
			int position = m_idx+peek;
			if ( position>=0 && position<m_docLength )
			{
				return m_source[position];
			}
			return (char)0;
		}

		/// <summary>
		/// Get the character at the current index and then move the index
		/// forward by one.
		/// </summary>
		/// <remarks>
		/// No bounds check is made here. Reading at the very end of the
		/// document returns the padding space appended by the
		/// <see cref="Source"/> setter.
		/// </remarks>
		/// <returns>The character that was at the current index.</returns>
		protected char AdvanceCurrentChar()
		{
			return m_source[m_idx++];
		}

		/// <summary>
		/// Move forward one character.
		/// </summary>
		protected void Advance()
		{
			m_idx++;
		}

		/// <summary>
		/// Move forward the specified number of characters.
		/// </summary>
		/// <param name="i">How many characters to move forward.</param>
		protected void Advance(int i)
		{
			m_idx+=i;
		}

		/// <summary>
		/// The last attribute name that was encountered.
		/// </summary>
		protected string ParseName
		{
			get
			{
				return m_parseName;
			}

			set
			{
				m_parseName = value;
			}
		}

		/// <summary>
		/// The last attribute value that was encountered.
		/// </summary>
		protected string ParseValue
		{
			get
			{
				return m_parseValue;
			}

			set
			{
				m_parseValue = value;
			}
		}

		/// <summary>
		/// The last attribute delimiter that was encountered.
		/// </summary>
		protected char ParseDelim
		{
			get
			{
				return m_parseDelim;
			}

			set
			{
				m_parseDelim = value;
			}
		}

		/// <summary>
		/// The text that is to be parsed.
		/// </summary>
		/// <remarks>
		/// The setter stores the text with one trailing space appended,
		/// which prevents a character-read error when the last character
		/// is "&lt;". The getter returns the stored text including that
		/// space. A null value is treated as empty text so that the
		/// document length always matches the stored text. The current
		/// parse position is not changed by the setter.
		/// </remarks>
		public string Source
		{
			get
			{
				return m_source;
			}

			set
			{
				string text = (value == null) ? "" : value;

				// Pad with one space so a read just past the last real
				// character (e.g. after a trailing "<") stays in range.
				m_source = text + " ";
				m_docLength = text.Length;
			}
		}
	}
}