/* Retrieved from: www.pudn.com > openwebspiderv0.1a.zip > htmlfnct.h */
/* OpenWebSpider * * Coded by Shen139 * shen139 [at] eviltime (dot) com * * * This file is part of OpenWebSpider * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * */ #ifndef __HTMLFNCT #define __HTMLFNCT #include#include #include "options.h" #include "hstlist.h" #include "strfnct.h" #include "misc.h" /* PageType * Host <- * Host->Page = "/test.htm" Host->type = 1 (type htm/html) */ int PageType(struct sHost* Host) { int i; if(Host->Page[strlen(Host->Page)-1]=='/') { Host->type = 1; // Html file return 1; } for(i=0;HtmlExtensions[i][0]!=0;i++) if(strncmp(Host->Page+strlen(Host->Page)-strlen(HtmlExtensions[i]),HtmlExtensions[i],strlen(HtmlExtensions[i]))==0) { Host->type = 1; // Html file return 1; } for(i=0;PlainTextExtensions[i][0]!=0;i++) { if(strncmp(Host->Page+strlen(Host->Page)-strlen(PlainTextExtensions[i]),PlainTextExtensions[i],strlen(PlainTextExtensions[i]))==0) { Host->type = 2; // PlainText file return 1; } } for(i=0;BinaryExtensions[i][0]!=0;i++) if(strncmp(Host->Page+strlen(Host->Page)-strlen(BinaryExtensions[i]),BinaryExtensions[i],strlen(BinaryExtensions[i]))==0) { Host->type = 3; // Binary file return 1; } if(strncmp(Host->Page,"/mailto",7)==0) { Host->type = 4; // Email return 1; } Host->type = 1; //no comment :) return 1; } /* LooksForKey * html -> Key -> 0|1 <- */ int LooksForKey(char* html, char* Key) { char 
Text[MAXPACKETSIZE]; int i,m; UnHtml(html,Text,sizeof(Text)); m=MIN(strlen(Text),MAXPACKETSIZE); for(i=0;i * ==> sHost.Url = Url && sHost.Host = "www.test.com" && sHost.Page = "page.htm" */ int ParseUrl(char* url,struct sHost* sh,char* currentHost) { char tUrl[MAXURLSIZE]; int offset=0, c; if(strlen(url)>MAXURLSIZE) return -1; memset(sh,0,sizeof(struct sHost)); memset(tUrl,0,MAXURLSIZE); if(strncmp(url,"http://",7)==0) offset=7; if(strncmp(url,"//",2)==0) offset=2; strncpy(tUrl,url+offset,strlen(url)-offset); tUrl[strlen(url)-offset]=0; for(c=0;c<(signed)strlen(tUrl);c++) if(tUrl[c]!='/') sh->Host[c] = tUrl[c]; else break; sh->Host[c]=0; if(offset==0 && strncmp(tUrl,"www",3)!=0) { if(currentHost==NULL) return -1; if(tUrl[0]=='/' && strlen(sh->Host)==0) { strcpy(sh->Page,tUrl); strncpy(sh->Host,currentHost,MAXHOSTSIZE); PageType(sh); return 1; } sprintf(sh->Page,"/%s",sh->Host); strncpy(sh->Host,currentHost,MAXHOSTSIZE); PageType(sh); return 1; } strncpy(sh->Page,tUrl+c,strlen(tUrl)-c); sh->Page[strlen(tUrl)-c]=0; if(strcmp(sh->Page,"")==0) strcpy(sh->Page,"/"); PageType(sh); return 1; } /* BetweenTag * html text -> tag -> maxout out <- * html: " ciao ciao
==> "ciao ciao" */ int BetweenTag(char* html, char* tag,char* out,int maxout) { char tmptag1[MAXTAGSIZE+1], tmptag2[MAXTAGSIZE+3]; int c,i=0,found=0; sprintf(tmptag1,"<%s",tag); sprintf(tmptag2,"%s>",tag); for(c=0;c<(signed)strlen(html);c++) { if(found==0) { //looks for initial tag#ifdef WIN32 if(strnicmp(html+c,tmptag1,strlen(tmptag1))==0 && (html[c+strlen(tmptag1)]==' ' || html[c+strlen(tmptag1)]=='>')) #else if(strncasecmp(html+c,tmptag1,strlen(tmptag1))==0 && (html[c+strlen(tmptag1)]==' ' || html[c+strlen(tmptag1)]=='>')) #endif { found=c+1; c+=strlen(tmptag1); } } else { if(i * url = "www.test.com" ==> last_node->next = &node: Host= "www.test.com" Page = "/" */ int AddUrl(struct sHost hst) { if(starthostonly==1) if(strncmp(strtHst->Host,hst.Host,MAXHOSTSIZE)!=0) return -1; //Out of range of Host if(lstGetNodeByHost(first,hst)==NULL) //Host is not in list { lstAddHost(first,hst); return 1; } else return -1; } /* ReturnFirstUrl * Host <- * Return the first host it found in the list */ int ReturnFirstUrl(struct sHost* Host) { NODE* nFirstUrl; if((nFirstUrl=lstGetFirstFreeNode(first))!=NULL) { memcpy(Host,((struct sHost*)nFirstUrl->field),sizeof(struct sHost)); ((struct sHost*)nFirstUrl->field)->viewed = 1; return 1; } else return -1; } /* LookForUrls * html -> AddUrl() <- */ int LookForUrls(char *html,struct sHost hst) { char a2a[MAXURLSIZE+100]; //........... 
char a2app[MAXURLSIZE+100]; // char tmpurl[MAXURLSIZE]; char url[MAXURLSIZE]; char fnd[MAXDESCRIPTIONSIZE]; char strComment[MAXDESCRIPTIONSIZE]; int strlenhtml=strlen(html); int c=0,i,stage,x,tmpc; struct sHost tmphst; while(c sprintf(a2app,"<%s>",a2a); //looks for the href's comment XXX UnHtml(a2app,fnd,sizeof(fnd)); UnToken(fnd,"\r\n",strComment,strlen(fnd)); OnlyOneSpace(strComment,fnd,sizeof(fnd)); if(ParseUrl(tmpurl,&tmphst,hst.Host)==-1) sprintf(url,"(%s) %s%s",fnd,hst.Host,tmpurl); if(bTokenIn(tmpurl,",;!|$£&",strlen(tmpurl))==0) { tmphst.viewed = 0; memcpy(tmphst.Description,fnd,MAXDESCRIPTIONSIZE); if(tmphst.type < 3 || (listmails==1 && tmphst.type ==4)) AddUrl(tmphst); } } return 1; } #endif /*EOF*/