www.pudn.com > openwebspiderv0.1a.zip > htmlfnct.h



/* OpenWebSpider
 *
 *     Coded by Shen139
 *     shen139 [at] eviltime (dot) com
 *
 *
 * This file is part of OpenWebSpider
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 */

#ifndef __HTMLFNCT
#define __HTMLFNCT

#include <stdio.h>
#include <string.h>

#include "options.h"
#include "hstlist.h"
#include "strfnct.h"
#include "misc.h"

/* PageHasSuffix
 * page -> pageLen -> suffix -> 0|1 <-
 * Returns 1 when the last strlen(suffix) characters of page equal suffix
 * (case-sensitive), 0 otherwise. pageLen must be strlen(page).
 * A suffix longer than the page never matches, so no pointer before the
 * start of page is ever formed.
 */
static int PageHasSuffix(const char* page, size_t pageLen, const char* suffix)
{
size_t suffixLen = strlen(suffix);

	if(suffixLen > pageLen)
		return 0;

	return memcmp(page + (pageLen - suffixLen), suffix, suffixLen) == 0;
}

/* PageType
 * Host <-
 * Classifies Host->Page and stores the result in Host->type:
 *   1 = HTML page   (Page ends with '/', or ends with an entry of
 *                    HtmlExtensions, or matches none of the rules below)
 *   2 = plain text  (Page ends with an entry of PlainTextExtensions)
 *   3 = binary file (Page ends with an entry of BinaryExtensions)
 *   4 = e-mail link (Page starts with "/mailto")
 * The rules are tried in the order: trailing '/', HTML, plain text, binary,
 * mailto; the first match wins. Each extension table is terminated by an
 * empty string.
 * Example: Host->Page = "/test.htm" ==> Host->type = 1
 *          (provided ".htm" is listed in HtmlExtensions)
 * Always returns 1.
 */
int PageType(struct sHost* Host)
{
size_t pageLen = strlen(Host->Page);
int i;

	// A directory-style path is served as HTML. The length test keeps an
	// empty Page from being indexed at -1.
	if(pageLen > 0 && Host->Page[pageLen-1] == '/')
	{
		Host->type = 1;			// Html file
		return 1;
	}

	for(i=0; HtmlExtensions[i][0]!=0; i++)
	{
		if(PageHasSuffix(Host->Page, pageLen, HtmlExtensions[i]))
		{
			Host->type = 1;		// Html file
			return 1;
		}
	}

	for(i=0; PlainTextExtensions[i][0]!=0; i++)
	{
		if(PageHasSuffix(Host->Page, pageLen, PlainTextExtensions[i]))
		{
			Host->type = 2;		// PlainText file
			return 1;
		}
	}

	for(i=0; BinaryExtensions[i][0]!=0; i++)
	{
		if(PageHasSuffix(Host->Page, pageLen, BinaryExtensions[i]))
		{
			Host->type = 3;		// Binary file
			return 1;
		}
	}

	if(strncmp(Host->Page,"/mailto",7)==0)
	{
		Host->type = 4;			// Email
		return 1;
	}

	Host->type = 1;				// unknown extension: treat as Html

	return 1;
}

/* LooksForKey
 * html -> Key -> 0|1 <-
 */
int LooksForKey(char* html, char* Key)
{
char Text[MAXPACKETSIZE];
int i,m;

	UnHtml(html,Text,sizeof(Text));

	m=MIN(strlen(Text),MAXPACKETSIZE);
	for(i=0;i
 *	==> sHost.Url = Url &&  sHost.Host = "www.test.com" &&  sHost.Page = "page.htm"
 */
/* ParseUrl
 * url -> currentHost -> sh <-
 * Splits url into sh->Host and sh->Page, then classifies the page with
 * PageType().
 *
 *  - "http://host/page" or "//host/page": the prefix is dropped, the text up
 *    to the first '/' becomes sh->Host and the remainder becomes sh->Page
 *    ("/" when there is no remainder).
 *  - No such prefix but the url starts with "www": handled the same way.
 *  - Anything else is a link relative to currentHost: sh->Host is a copy of
 *    currentHost and sh->Page is the whole url, prefixed with '/' when it
 *    does not already start with one (so "dir/page.htm" gives
 *    "/dir/page.htm").
 *
 * Returns 1 on success. Returns -1 when url or sh is NULL, when the url is
 * longer than MAXURLSIZE or does not fit the work buffer, when the host or
 * page part does not fit its field in *sh, or when the url is relative and
 * currentHost is NULL. *sh is left untouched by the NULL and length
 * rejections; after any other -1 its contents are unspecified.
 *
 * sh->Host and sh->Page are sized with sizeof, which relies on them being
 * array members of struct sHost (they are written directly after *sh has
 * been zeroed, so they cannot be plain pointers).
 */
int ParseUrl(char* url,struct sHost* sh,char* currentHost)
{
char tUrl[MAXURLSIZE];
size_t urlLen, restLen, hostLen, pageLen;
size_t offset=0;

	if(url==NULL || sh==NULL)
		return -1;

	if(strncmp(url,"http://",7)==0)
		offset=7;
	else if(strncmp(url,"//",2)==0)
		offset=2;

	urlLen=strlen(url);
	restLen=urlLen-offset;

	// The second test also leaves room for the terminator: a prefix-less url
	// of exactly MAXURLSIZE characters used to write one byte past tUrl.
	if(urlLen>(size_t)MAXURLSIZE || restLen>=sizeof(tUrl))
		return -1;

	memset(sh,0,sizeof(struct sHost));
	memcpy(tUrl,url+offset,restLen+1);		// +1 copies the terminator

	// Host candidate: everything before the first '/'.
	hostLen=strcspn(tUrl,"/");
	if(hostLen>=sizeof(sh->Host))
		return -1;
	memcpy(sh->Host,tUrl,hostLen);			// *sh was zeroed: already terminated

	if(offset==0 && strncmp(tUrl,"www",3)!=0)
	{
		// Relative link: resolve against the host currently being crawled.
		size_t lead, curLen;

		if(currentHost==NULL)
			return -1;

		lead=(tUrl[0]=='/') ? 0 : 1;		// add a '/' only when missing
		if(restLen+lead>=sizeof(sh->Page))
			return -1;
		if(lead)
			sh->Page[0]='/';
		memcpy(sh->Page+lead,tUrl,restLen+1);

		// Truncate an over-long currentHost but always terminate it. The
		// field is cleared first so no bytes of the host candidate remain.
		curLen=strlen(currentHost);
		if(curLen>=sizeof(sh->Host))
			curLen=sizeof(sh->Host)-1;
		memset(sh->Host,0,sizeof(sh->Host));
		memcpy(sh->Host,currentHost,curLen);

		PageType(sh);
		return 1;
	}

	// Absolute url: the page is whatever follows the host part.
	pageLen=restLen-hostLen;
	if(pageLen>=sizeof(sh->Page))
		return -1;

	if(pageLen==0)
		strcpy(sh->Page,"/");
	else
		memcpy(sh->Page,tUrl+hostLen,pageLen+1);

	PageType(sh);

	return 1;
}

/* BetweenTag
 * html text -> tag -> maxout out <-
 * html: "

ciao ciao

==> "ciao ciao" */ int BetweenTag(char* html, char* tag,char* out,int maxout) { char tmptag1[MAXTAGSIZE+1], tmptag2[MAXTAGSIZE+3]; int c,i=0,found=0; sprintf(tmptag1,"<%s",tag); sprintf(tmptag2,"",tag); for(c=0;c<(signed)strlen(html);c++) { if(found==0) { //looks for initial tag #ifdef WIN32 if(strnicmp(html+c,tmptag1,strlen(tmptag1))==0 && (html[c+strlen(tmptag1)]==' ' || html[c+strlen(tmptag1)]=='>')) #else if(strncasecmp(html+c,tmptag1,strlen(tmptag1))==0 && (html[c+strlen(tmptag1)]==' ' || html[c+strlen(tmptag1)]=='>')) #endif { found=c+1; c+=strlen(tmptag1); } } else { if(i * url = "www.test.com" ==> last_node->next = &node: Host= "www.test.com" Page = "/" */ int AddUrl(struct sHost hst) { if(starthostonly==1) if(strncmp(strtHst->Host,hst.Host,MAXHOSTSIZE)!=0) return -1; //Out of range of Host if(lstGetNodeByHost(first,hst)==NULL) //Host is not in list { lstAddHost(first,hst); return 1; } else return -1; } /* ReturnFirstUrl * Host <- * Return the first host it found in the list */ int ReturnFirstUrl(struct sHost* Host) { NODE* nFirstUrl; if((nFirstUrl=lstGetFirstFreeNode(first))!=NULL) { memcpy(Host,((struct sHost*)nFirstUrl->field),sizeof(struct sHost)); ((struct sHost*)nFirstUrl->field)->viewed = 1; return 1; } else return -1; } /* LookForUrls * html -> AddUrl() <- */ int LookForUrls(char *html,struct sHost hst) { char a2a[MAXURLSIZE+100]; //........... 
char a2app[MAXURLSIZE+100]; // char tmpurl[MAXURLSIZE]; char url[MAXURLSIZE]; char fnd[MAXDESCRIPTIONSIZE]; char strComment[MAXDESCRIPTIONSIZE]; int strlenhtml=strlen(html); int c=0,i,stage,x,tmpc; struct sHost tmphst; while(c sprintf(a2app,"<%s>",a2a); //looks for the href's comment XXX UnHtml(a2app,fnd,sizeof(fnd)); UnToken(fnd,"\r\n",strComment,strlen(fnd)); OnlyOneSpace(strComment,fnd,sizeof(fnd)); if(ParseUrl(tmpurl,&tmphst,hst.Host)==-1) sprintf(url,"(%s) %s%s",fnd,hst.Host,tmpurl); if(bTokenIn(tmpurl,",;!|$£&",strlen(tmpurl))==0) { tmphst.viewed = 0; memcpy(tmphst.Description,fnd,MAXDESCRIPTIONSIZE); if(tmphst.type < 3 || (listmails==1 && tmphst.type ==4)) AddUrl(tmphst); } } return 1; } #endif /*EOF*/