/* Source: www.pudn.com > openwebspiderv0.1a.zip > misc.h */



/* OpenWebSpider
 *
 *     Coded by Shen139
 *     shen139 [at] eviltime (dot) com
 *
 *
 * This file is part of OpenWebSpider
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 */

#ifndef __MISC
#define __MISC
#include "strfnct.h"
#include "htmlfnct.h"
#include "ndzlist.h"
#include "options.h"
#include "sqlfnct.h" 
#include "snprintf.c"


/* ForgeHTTPPacket
 * hst -> packet <-
 * Writes a minimal HTTP/1.1 GET request for hst.Page on hst.Host into packet.
 * hst.Page = "/prova.htm" ==> packet = "GET /prova.htm HTTP/1.1\r\nHost: <hst.Host>\r\n\r\n"
 * At most MAXPACKETSIZE bytes of packet are written.
 * Returns 1 on success, 0 if packet is NULL or the request did not fit
 * (a truncated request lacks the terminating blank line and must not be sent).
 */
int ForgeHTTPPacket(struct sHost hst,char * packet)
{
int written;

	if(packet==NULL)
		return 0;

	written=snprintf(packet,MAXPACKETSIZE,"GET %s HTTP/1.1\r\nHost: %s\r\n\r\n",hst.Page,hst.Host);

	/* Truncation is reported either as a would-be length >= the buffer size
	 * (C99 convention) or as a negative value (older implementations);
	 * both are treated as failure. */
	if(written<0 || written>=MAXPACKETSIZE)
		return 0;

return 1;
}

/* ParseHTTPRequest
 * recvdpkt -> htmlOut <- maxout ->
 * Accepts only replies whose first 15 characters match "HTTP/1.1 200 OK" or
 * "HTTP/1.1 302 Found" (case-insensitive) and copies everything after the
 * first blank line (CRLF CRLF) into htmlOut, i.e. the packet without the
 * HTTP header.
 * At most MIN(maxout,MAXPACKETSIZE) bytes of htmlOut are written and the
 * result is always NUL-terminated (the body is cut short if it does not fit).
 * Returns 1 if a body was copied, 0 if an argument is NULL, the status line
 * is not accepted, no header/body separator is found, or there is no room
 * in htmlOut.
 */
int ParseHTTPRequest(char* recvdpkt,char* htmlOut,int maxout)
{
char* body;
int limit;

	if(recvdpkt==NULL || htmlOut==NULL)
		return 0;

#ifdef WIN32
	if(strnicmp(recvdpkt,"HTTP/1.1 200 OK",15)!=0 && strnicmp(recvdpkt,"HTTP/1.1 302 Found",15)!=0)
#else
	if(strncasecmp(recvdpkt,"HTTP/1.1 200 OK",15)!=0 && strncasecmp(recvdpkt,"HTTP/1.1 302 Found",15)!=0)
#endif
		return 0;

	/* First occurrence of the header/body separator, found in a single pass
	 * instead of calling strlen() on every loop iteration. */
	body=strstr(recvdpkt,"\r\n\r\n");
	if(body==NULL)
		return 0;

	/* A non-positive size would turn into a huge size_t count in strncpy. */
	limit=MIN(maxout,MAXPACKETSIZE);
	if(limit<=0)
		return 0;

	/* strncpy keeps the zero padding of the unused tail, but it does not
	 * terminate the string when the body fills the buffer, so the last
	 * byte is forced to NUL. */
	strncpy(htmlOut,body+4,limit);
	htmlOut[limit-1]=0;

return 1;
}

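/* UnHtml
 * html -> text <- maxout ->
 * Presumably copies html to text while leaving out the markup tags.
 * NOTE(review): the "<...>" spans of this example were lost in extraction;
 * the tags below are a plausible reconstruction, not the original text.
 * html = "<html><body>TesT123</body></html>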
" * text => TesT123 */ int UnHtml(char* html, char* text,int maxout) { int i,m,x=0,pOpen=0; memset(text,0,maxout); m=MIN((signed)strlen(html),maxout); for(i=0;i') pOpen=0; if(pOpen==0 && html[i]!='>') text[x++]=html[i]; } return x; } int Split(char* text,char* Tokens, NODE* first,int MaxWordSize) { int i; char tmp[2]; int pWord=0; char* word=malloc(MaxWordSize); char* UpWord=malloc(MaxWordSize); tmp[1]=0; for(i=0;i<(signed)strlen(text);i++) { tmp[0]=text[i]; if(bTokenIn(tmp,Tokens,1)==1) { if(pWord1) { atoupper(word,UpWord,strlen(word)); if(ndzLookForWord(first,UpWord)==NULL) //Add unique word lstAddWord(first,UpWord); } } pWord=0; } else if(pWord0) { snprintf(title,MAXDESCRIPTIONSIZE,"%s - %s",tmpTitle,host.Description); usetitle=1; } UnHtml(html,cTmp,MAXPACKETSIZE); OnlyOneSpace(cTmp,pureText,MAXPACKETSIZE); free(cTmp); ndzFirst=ndzInit(); Split(pureText," !?'^\"()[]{}+-=,;.:_<>\n\r\t",ndzFirst,MAXWORDSIZE); lstGetLastNode(ndzFirst,&numofword); if(sqlConnect(hostname, username ,password , dbname , &mysql)==0) { fprintf(stderr, "Failed to connect to database: Error: %s\n",mysql_error(&mysql)); lstFreeAll(ndzFirst); return 0; } if(GetUrlIDfromHostPage(&mysql, host.Host, host.Page, urlID)==0) { //Host is not in db! adding it snprintf(sqlQuery,MAXQUERYSIZE,"INSERT INTO urllist (hostname,page,description) VALUES(\"%s\",\"%s\",\"%s\");",host.Host,host.Page,(usetitle==1) ? 
title: host.Description); sqlSendSqlQuery(&mysql, sqlQuery, &result); if(mysql_affected_rows(&mysql)==1) GetUrlIDfromHostPage(&mysql, host.Host, host.Page, urlID); else return 0; } else { //delete all entries and reinsert new words snprintf(sqlQuery,MAXQUERYSIZE,"DELETE FROM urllist WHERE urlID = \"%s\" ;",urlID); sqlSendSqlQuery(&mysql, sqlQuery, &result); snprintf(sqlQuery,MAXQUERYSIZE,"DELETE FROM mid WHERE urlID = \"%s\" ;",urlID); sqlSendSqlQuery(&mysql, sqlQuery, &result); snprintf(sqlQuery,MAXQUERYSIZE,"INSERT INTO urllist (hostname,page,description) VALUES(\"%s\",\"%s\",\"%s\");",host.Host,host.Page,(usetitle==1) ? title: host.Description); sqlSendSqlQuery(&mysql, sqlQuery, &result); if(mysql_affected_rows(&mysql)==1) GetUrlIDfromHostPage(&mysql, host.Host, host.Page, urlID); else return 0; } for(i=0;ifield != NULL) { //is the word var in db? snprintf(sqlQuery,MAXQUERYSIZE,"SELECT * FROM wordlist where word=\"%s\";",(char*)tmpNode->field); if(sqlSendSqlQuery(&mysql, sqlQuery, &result)==1); { if((row=mysql_fetch_row(&result))==NULL) { //New word! Add it snprintf(sqlQuery,MAXQUERYSIZE,"INSERT INTO wordlist (word) VALUES(\"%s\");",(char*)tmpNode->field); sqlSendSqlQuery(&mysql, sqlQuery, &result); if(mysql_affected_rows(&mysql)!=1) break; } //Links wordID with urlID //Get wordID snprintf(sqlQuery,MAXQUERYSIZE,"SELECT wordID FROM wordlist WHERE word=\"%s\";",(char*)tmpNode->field); sqlSendSqlQuery(&mysql, sqlQuery, &result); if((row=mysql_fetch_row(&result))==NULL) break; else { //wordID OK urlID OK! 
adding them in mid snprintf(sqlQuery,MAXQUERYSIZE,"INSERT INTO mid (wordID, urlID) VALUES (\"%s\",\"%i\");",row[0],atoi(urlID)); sqlSendSqlQuery(&mysql, sqlQuery, &result); if(mysql_affected_rows(&mysql)!=1) break; } } } } free(pureText); lstFreeAll(ndzFirst); return 1; } int IndicizedSearch(char* hostname,char* username,char* password,char* dbname,char* Query, int havingcount) { char sqlQuery[MAXQUERYSIZE]; char sngKey[MAXKEYSIZE]; int iNumofkeys=0; char sNumofkeys[3]; int i,c; int NumOfResults=0; MYSQL mysql; MYSQL_RES result,result2; MYSQL_ROW row,row2; if(sqlConnect(hostname, username, password, dbname , &mysql)==0) { #ifdef CGI printf("

Failed to connect to database: %s\r\n",mysql_error(&mysql)); #else fprintf(stderr, "Failed to connect to database: Error: %s\n",mysql_error(&mysql)); #endif return 0; } snprintf(sqlQuery,MAXQUERYSIZE,"SELECT wordID FROM wordlist WHERE word IN("); c=0; if(strchr(Query,',')>Query) { for(i=0;i<(signed)strlen(Query);i++) { if(Query[i]==',') { sngKey[c]=0; c=0; if(strlen(sqlQuery)+1+strlen(sngKey)+1>=MAXQUERYSIZE) { #ifdef CGI printf("

Errors in Query\r\n"); #else fprintf(stderr, "Errors in Query"); #endif return 0; } strcat(sqlQuery,"'"); strcat(sqlQuery,sngKey); strcat(sqlQuery,"',"); iNumofkeys++; } else sngKey[c++]=Query[i]; } sngKey[c]=0; if(strlen(sqlQuery)+1+strlen(sngKey)+3>=MAXQUERYSIZE) { #ifdef CGI printf("

Errors in Query\r\n"); #else fprintf(stderr, "Errors in Query"); #endif return 0; } strcat(sqlQuery,"'"); strcat(sqlQuery,sngKey); strcat(sqlQuery,"');"); iNumofkeys++; } else { if(strlen(sqlQuery)+1+strlen(Query)+3>=MAXQUERYSIZE) { #ifdef CGI printf("

Errors in Query\r\n"); #else fprintf(stderr, "Errors in Query"); #endif return 0; } strcat(sqlQuery,"'"); strcat(sqlQuery,Query); strcat(sqlQuery,"');"); iNumofkeys=1; } sqlSendSqlQuery(&mysql, sqlQuery, &result); if((row=mysql_fetch_row(&result))==NULL) { #ifdef CGI printf("

Nothing Found\r\n

Back\r\n"); #else printf("Nothing Found\n"); #endif return 0; } else { snprintf(sqlQuery,MAXQUERYSIZE,"SELECT urlID FROM mid WHERE wordID IN ("); do { if(strlen(sqlQuery)+1+strlen(row[0])+1>=MAXQUERYSIZE) { #ifdef CGI printf("

Errors in Query\r\n"); #else fprintf(stderr, "Errors in Query"); #endif return 0; } strcat(sqlQuery,"'"); strcat(sqlQuery,row[0]); strcat(sqlQuery,"',"); } while ((row=mysql_fetch_row(&result))!=NULL); sqlQuery[strlen(sqlQuery)-1]=0; if(strlen(sqlQuery)+strlen(") GROUP BY urlID HAVING COUNT(*) = ")>=MAXQUERYSIZE) { #ifdef CGI printf("

Errors in Query\r\n"); #else fprintf(stderr, "Errors in Query"); #endif return 0; } strcat(sqlQuery,") GROUP BY urlID HAVING COUNT(*) = "); sprintf(sNumofkeys,"%i",(havingcount==0xFF) ? iNumofkeys : havingcount); if(strlen(sqlQuery)+strlen(sNumofkeys)+1>=MAXQUERYSIZE) { #ifdef CGI printf("

Errors in Query\r\n"); #else fprintf(stderr, "Errors in Query"); #endif return 0; } strcat(sqlQuery,sNumofkeys); strcat(sqlQuery," ;"); sqlSendSqlQuery(&mysql, sqlQuery, &result); while ((row = mysql_fetch_row(&result))) { snprintf(sqlQuery,MAXQUERYSIZE,"SELECT * FROM urllist WHERE urlID=\"%s\";",row[0]); sqlSendSqlQuery(&mysql, sqlQuery, &result2); if((row2=mysql_fetch_row(&result2))!=NULL) { #ifdef CGI printf("

Url: %s\r\n",row2[1],row2[2],row2[3]); #else printf("- %s\n - Url: http://%s%s\n\n",row2[3],row2[1],row2[2]); #endif NumOfResults++; } } } if(NumOfResults<100 && havingcount > 1) { return (IndicizedSearch(hostname,username,password,dbname,Query,(havingcount==0xFF) ? iNumofkeys-1 : havingcount-1)); } #ifdef CGI printf("\r\n"); #endif return 1; } void unencode(char *src, char *last, char *dest) { for(; src != last; src++, dest++) if(*src == '+') *dest = ' '; else if(*src == '%') { int code; if(sscanf(src+1, "%2x", &code) != 1) code = '?'; *dest = code; src +=2; } else *dest = *src; *dest = '\n'; *++dest = '\0'; } #endif /*EOF*/