// Source: www.pudn.com > spider(java).rar > coincidencies.java, change:2004-09-02, size:8910b


import java.util.regex.*; 
import java.net.*; 
import java.sql.*; 
 
/**
 * Indexes the words of one crawled page into the spider database.
 *
 * <p>Constructing an instance does all the work: the page is looked up (or
 * registered) in table <code>vistes</code>, every word of the page text is
 * looked up (or registered) in table <code>paraules</code>, and the counter
 * <code>n_coincidencies</code> of the matching (word, page) row in table
 * <code>coincidencies</code> is created or incremented. Words found inside
 * <code>&lt;b&gt;</code> or <code>&lt;h1&gt;</code> tags of the raw HTML then
 * get one extra increment each so that they weigh more.
 *
 * <p>All values are bound through {@link PreparedStatement} parameters, and
 * every statement and result set opened here is closed before the call
 * returns. The statement handed to the constructor is therefore no longer
 * required to be scrollable or updatable.
 */
public class coincidencies
{
	/**
	 * Statement passed to the most recent constructor call. It is only used to
	 * reach a connection when the constructor received no connection. The field
	 * stays static and package-visible so existing references keep compiling.
	 */
	static Statement stmt = null;

	/**
	 * Legacy shared result set. This class no longer assigns it; it is kept
	 * (and closed by {@link #freeResultSets()}) for code that may still use it.
	 */
	static ResultSet rsPerVeure = null;

	/** Last word read from the page text by the constructor (lower-cased). */
	String paraula="";

	/** Connection received by the constructor; may be null. */
	Connection conn;

	/** Words of this length or longer are not indexed (the original author's note gives 50 as the column width). */
	private static final int MAX_WORD_LENGTH=50;

	/** Returned by {@link #findId} when no row matches; assumes real ids are never negative - TODO confirm against the schema. */
	private static final int NOT_FOUND=-1;

	private static final Pattern WORD=Pattern.compile("\\w+");
	private static final Pattern TAG=Pattern.compile("<.*?>");
	/** Left-overs of the HTML entity for a non-breaking space, with or without '&' and ';'. */
	private static final Pattern NBSP=Pattern.compile("&?nbsp;?");

	private static final String SELECT_PAGE="SELECT id FROM vistes WHERE url=?";
	private static final String INSERT_PAGE="INSERT INTO vistes (url) VALUES (?)";
	private static final String SELECT_WORD="SELECT id FROM paraules WHERE paraula=?";
	private static final String INSERT_WORD="INSERT INTO paraules (paraula) VALUES (?)";
	private static final String BUMP_COINCIDENCE=
		"UPDATE coincidencies SET n_coincidencies=n_coincidencies+1 WHERE paraula=? AND pagina=?";
	private static final String INSERT_COINCIDENCE=
		"INSERT INTO coincidencies (paraula,pagina,n_coincidencies) VALUES (?,?,1)";

	/**
	 * Indexes one page. Errors are reported on standard output and never
	 * propagate to the caller.
	 *
	 * @param url        address of the page; used as the key in <code>vistes</code>
	 * @param ascii      text of the page whose words are indexed
	 * @param textSencer text searched for <code>&lt;b&gt;</code> and <code>&lt;h1&gt;</code> regions
	 *                   (presumably the raw HTML of the same page)
	 * @param conn       connection to the spider database; when null the
	 *                   connection of <code>stmt</code> is used instead
	 * @param stmt       statement on the same database; stored for compatibility
	 */
	public coincidencies(URL url,String ascii,String textSencer,Connection conn,Statement stmt)
	{
		this.conn=conn;
		coincidencies.stmt=stmt;

		if(url==null || ascii==null)
		{
			System.out.println("ERROR: nothing to index (url or text is null)");
			return;
		}

		try
		{
			Connection db=database();

			// Normally the crawler has registered the page already; register it otherwise.
			int id_pagina=findOrCreateId(db,SELECT_PAGE,INSERT_PAGE,url.toString());

			// Entity left-overs would otherwise be indexed as the word "nbsp".
			Matcher words=WORD.matcher(NBSP.matcher(ascii).replaceAll(" "));
			while(words.find())
			{
				paraula=lower(words.group());
				if(paraula.length()>=MAX_WORD_LENGTH)
					continue;

				int id_paraula=findOrCreateId(db,SELECT_WORD,INSERT_WORD,paraula);
				if(!incrementCoincidence(db,id_paraula,id_pagina))
					insertCoincidence(db,id_paraula,id_pagina);
			}

			// The text is lower-cased before matching, so lower-case patterns also
			// cover <B> and <H1>. The reluctant ".+?" keeps each tagged region
			// separate instead of running from the first opening tag to the last
			// closing tag on the line.
			comprovaCoincidencies(textSencer,id_pagina,"<b>.+?</b>");
			comprovaCoincidencies(textSencer,id_pagina,"<h1>.+?</h1>");
		}
		catch(SQLException ex)
		{
			reportSqlError(ex);
		}
		catch(Exception e)
		{
			e.printStackTrace();
		}
		finally
		{
			freeResultSets();
		}
	}

	/** Closes the shared legacy result set, if any. Failures to close are ignored. */
	public void freeResultSets()
	{
		if(rsPerVeure==null)
			return;

		try
		{
			rsPerVeure.close();
		}
		catch(SQLException ignored)
		{
			// Best effort: nothing useful can be done if closing fails.
		}
		rsPerVeure=null;
	}

	/**
	 * Counts the rows that remain after the current cursor position.
	 * The cursor is left after the last row.
	 *
	 * @param rs result set to walk; null counts as empty
	 * @return number of rows visited; if an error interrupts the walk, the rows
	 *         counted so far
	 */
	public int countItems(ResultSet rs)
	{
		int count=0;
		if(rs==null)
			return count;

		try
		{
			while(rs.next())
				count++;
		}
		catch(Exception e)
		{
			// Broad on purpose: this method has never thrown to its callers.
			System.out.println("ERROR: while counting items");
			e.printStackTrace();
		}
		return count;
	}

	/**
	 * Counts the words (runs of letters, digits and underscores) in a text.
	 *
	 * @param ascii text to scan; null counts as empty
	 * @return number of words found
	 */
	public int wordCount(String ascii)
	{
		if(ascii==null)
			return 0;

		int total=0;
		Matcher words=WORD.matcher(ascii);
		while(words.find())
			total++;
		return total;
	}

	/**
	 * Gives extra weight to the words inside tagged regions of a page: for every
	 * region matching <code>pattern</code>, each word in it has its
	 * <code>n_coincidencies</code> for this page incremented once more.
	 *
	 * <p>Words that are not in <code>paraules</code>, or that have no
	 * <code>coincidencies</code> row for this page, are skipped. A database
	 * error is reported on standard output and ends the method.
	 *
	 * @param text      text to scan; it is lower-cased before matching
	 * @param id_pagina id of the page in <code>vistes</code>
	 * @param pattern   regular expression selecting one tagged region, tags
	 *                  included; write it in lower case because the text is
	 *                  lower-cased first
	 */
	public void comprovaCoincidencies(String text,int id_pagina,String pattern)
	{
		if(text==null || pattern==null)
			return;

		try
		{
			Connection db=database();
			Matcher regions=Pattern.compile(pattern).matcher(lower(text));

			while(regions.find())
			{
				// Drop the tags and entity left-overs. Punctuation needs no special
				// handling because the word pattern never matches it.
				String inner=TAG.matcher(regions.group()).replaceAll("");
				inner=NBSP.matcher(inner).replaceAll(" ");

				Matcher words=WORD.matcher(inner);
				while(words.find())
				{
					int id_paraula=findId(db,SELECT_WORD,words.group());
					if(id_paraula==NOT_FOUND)
						continue;
					incrementCoincidence(db,id_paraula,id_pagina);
				}
			}
		}
		catch(SQLException ex)
		{
			reportSqlError(ex);
		}
		catch(Exception e)
		{
			System.out.println("ERROR: Pattern!");
			e.printStackTrace();
		}
	}

	/** Returns the connection to query: the one given to the constructor, else the one behind the stored statement. */
	private Connection database() throws SQLException
	{
		if(conn!=null)
			return conn;
		if(stmt!=null)
			return stmt.getConnection();
		throw new SQLException("no database connection available");
	}

	/** Lower-cases with a fixed locale so the result does not depend on the machine's default locale. */
	private static String lower(String s)
	{
		return s.toLowerCase(java.util.Locale.ENGLISH);
	}

	/** Runs a one-parameter query and returns the first column of its first row, or NOT_FOUND. */
	private static int findId(Connection db,String selectSql,String key) throws SQLException
	{
		PreparedStatement query=db.prepareStatement(selectSql);
		try
		{
			query.setString(1,key);
			ResultSet rows=query.executeQuery();
			try
			{
				return rows.next() ? rows.getInt(1) : NOT_FOUND;
			}
			finally
			{
				rows.close();
			}
		}
		finally
		{
			query.close();
		}
	}

	/** Returns the id of the row identified by key, inserting the row first when it does not exist. */
	private static int findOrCreateId(Connection db,String selectSql,String insertSql,String key) throws SQLException
	{
		int id=findId(db,selectSql,key);
		if(id!=NOT_FOUND)
			return id;

		PreparedStatement insert=db.prepareStatement(insertSql);
		try
		{
			insert.setString(1,key);
			insert.executeUpdate();
		}
		finally
		{
			insert.close();
		}

		id=findId(db,selectSql,key);
		if(id==NOT_FOUND)
			throw new SQLException("row not found after insert: "+key);
		return id;
	}

	/** Adds one to the counter of the (word, page) row; returns false when no such row exists. */
	private static boolean incrementCoincidence(Connection db,int id_paraula,int id_pagina) throws SQLException
	{
		PreparedStatement update=db.prepareStatement(BUMP_COINCIDENCE);
		try
		{
			update.setInt(1,id_paraula);
			update.setInt(2,id_pagina);
			return update.executeUpdate()>0;
		}
		finally
		{
			update.close();
		}
	}

	/** Creates the (word, page) row with a counter of one. */
	private static void insertCoincidence(Connection db,int id_paraula,int id_pagina) throws SQLException
	{
		PreparedStatement insert=db.prepareStatement(INSERT_COINCIDENCE);
		try
		{
			insert.setInt(1,id_paraula);
			insert.setInt(2,id_pagina);
			insert.executeUpdate();
		}
		finally
		{
			insert.close();
		}
	}

	/** Prints the details of a database error on standard output. */
	private static void reportSqlError(SQLException ex)
	{
		System.out.println("SQLException: " + ex.getMessage());
		System.out.println("SQLState: " + ex.getSQLState());
		System.out.println("VendorError: " + ex.getErrorCode());
	}
}