/* www.pudn.com > openwebspiderv0.1a.zip > openwebspider-0.1.c */



/* OpenWebSpiderV0.1a (Work in progress...)
 *
 *  Coded by Shen139
 *    shen139 [at] eviltime (dot) com
 *
 *  Compile with
 *  - Linux:   gcc openwebspider-0.1.c -o openwebspider -g -L /usr/local/mysql/lib/ -lmysqlclient -lnsl -lm
 *   - libmysqlclient10-dev (or mysql-dev) needed
 *  - Windows: Microsoft Visual C++ 6.0
 *
 *
 * FAQ about Robots and Search engine here: http://www.robotstxt.org/wc/faq.html
 *
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "options.h"
#include "getopt.h"
#include "socket.h"
#include "misc.h"
#include "htmlfnct.h"
#include "hstlist.h"
#include "ndzlist.h"
#include "sqlfnct.h"

#define AUTHOR			Shen139
#define VERSION			"0.1a"

/*
 * usage: reports a command-line error, prints the option summary and
 * terminates the program.
 *
 *  Error - text shown under the "Error:" heading
 *
 * Never returns to the caller: the process is ended with exit(0).
 */
int usage(char* Error)
{
	/* The caller-supplied message is the only formatted part of the output */
	printf("Error:\n %s\n",Error);

	/* The help text is constant, so it is written in one go */
	fputs("Usage:\nOpenWebSpider Mode Scan Mode Parameters\n"
	      " OpenWebSpider Mode:\n"
	      "  -i\t[Indicize] Hostname User Password Database start_url\n"
	      "  -I\t[Indicized search] Hostname User Password Database Query\n"
	      "  -r\t[Real Time search] [Key] start_url\n"
	      " Scan Mode:\n"
	      "  -s\t[Single Host scan mode]\n"
	      "  -R\t[Recursive Host scan mode]\n"
	      " -m\t[Show mails too]\n"
	      " -c\t[Create DataBase] hostname username password name_of_database\n",
	      stdout);

	exit(0);
}

/*
 * spiderCopyArg: copies the command-line argument `src` into `dst`, a buffer
 * of `dstsize` bytes, always leaving `dst` NUL-terminated.
 *
 * If `src` plus its terminator does not fit, usage(errmsg) is called, which
 * prints the message and terminates the program.
 */
static void spiderCopyArg(char *dst, size_t dstsize, const char *src, char *errmsg)
{
	size_t len = strlen(src);

	if(len >= dstsize)
	{
		usage(errmsg);
		return;
	}

	memcpy(dst, src, len + 1);
}

/*
 * main: OpenWebSpider entry point.
 *
 * Parses the command line, then runs one of the modes:
 *   -c  create the database with sqlCreateDB() and stop
 *   -I  (scan_mode 2) query the database through IndicizedSearch() and stop
 *   -r  (scan_mode 0) crawl from start_url looking for Key in each page
 *   -i  (scan_mode 1) crawl from start_url passing each page to Indicize()
 * -s / -R select single-host or recursive crawling, -m lists mail addresses.
 *
 * Returns:
 *    1  after a -c request (whether or not the database was created)
 *    the value of IndicizedSearch() for -I
 *   -1  if Winsock cannot be started (WIN32) or memory is exhausted
 *    0  when the crawl ends because ReturnFirstUrl() has no URL left
 * Command-line errors go through usage(), which terminates the program.
 */
int main(int argc, char*argv[])
{
#ifdef linux
	int sock;
#else
	SOCKET sock;
#endif
	char packet[MAXPACKETSIZE];
	char html[MAXPACKETSIZE];
	char starturl[MAXURLSIZE];
	char LastHostOK[MAXHOSTSIZE];
	char Key[MAXKEYSIZE];
	char Hostname[MAXHOSTSIZE];
	char username[20];
	char password[30];
	char dbname[30];
	char Query[MAXARGQUERYSIZE];
	/* The start URL keeps its historical MAXHOSTSIZE limit but must also fit in starturl */
	size_t urlLimit = (sizeof(Hostname) < sizeof(starturl)) ? sizeof(Hostname) : sizeof(starturl);
	size_t descLen;
	int c, recvdbytes;
	struct sHost currentHst;
	extern int optind;

	printf("OpenWebSpider(v%s)\n  Coded by Shen139\n   shen139(at)eviltime(dot)com\n\n",VERSION);

	if(argc<3)
		usage("Too few arguments");

	memset(starturl,0,sizeof(starturl));
	memset(Key,0,sizeof(Key));
	/* LastHostOK is handed to LoadSocket() before anything is stored in it */
	memset(LastHostOK,0,sizeof(LastHostOK));
	memset(Hostname,0,sizeof(Hostname));
	memset(username,0,sizeof(username));
	memset(password,0,sizeof(password));
	memset(dbname,0,sizeof(dbname));
	memset(Query,0,sizeof(Query));

	while ((c = getopt(argc, argv, "IirsRmc")) != -1)
	switch (c)
	{
		case 'I':	//Indicized search: Hostname User Password Database Query
		case 'i':	//Indicize:         Hostname User Password Database start_url
			if(scan_mode!=0xFF)	//At startup scan_mode==0xFF => uninitialized
				usage("Scan Mode redefinition");

			scan_mode=(c=='I')?2:1;

			//Five positional arguments must follow: argv[optind] .. argv[optind+4]
			if(optind+4>=argc)
				usage((c=='I')?"Not enough arguments for indicized search":"Not enough arguments to indicize");

			spiderCopyArg(Hostname,sizeof(Hostname),argv[optind],"Hostname too long");
			spiderCopyArg(username,sizeof(username),argv[optind+1],"Username too long");
			spiderCopyArg(password,sizeof(password),argv[optind+2],"Password too long");
			spiderCopyArg(dbname,sizeof(dbname),argv[optind+3],"DataBase name too long");

			if(c=='I')
				spiderCopyArg(Query,sizeof(Query),argv[optind+4],"Query too long");
			else
				spiderCopyArg(starturl,urlLimit,argv[optind+4],"Url too long");

		break;
		case 'r':	//Real time search: Key start_url
			if(scan_mode!=0xFF)
				usage("Scan Mode redefinition");

			//Two positional arguments must follow: argv[optind] and argv[optind+1]
			if(optind+1>=argc)
				usage("Not enough arguments for real time search");
			scan_mode=0;

			spiderCopyArg(Key,sizeof(Key),argv[optind],"Key too long");
			spiderCopyArg(starturl,urlLimit,argv[optind+1],"Url too long");

		break;
		case 's':
			starthostonly=1;
		break;
		case 'R':
			starthostonly=0;
		break;
		case 'm':
			listmails=1;
		break;
		case 'c':	//Create database
			if(argc>6)
				usage("Too many arguments to create a database");
			else if(argc<6 || optind+3>=argc)	//argv[optind] .. argv[optind+3] must exist
				usage("Too few arguments to create a database");
			else
			{
				if(sqlCreateDB(argv[optind],argv[optind+1],argv[optind+2],argv[optind+3])==0)
					fprintf(stderr,"Error creating tables\n");
				else
					printf("Database created\n");
				return 1;
			}

		break;
		default:	//Any other value returned by getopt() is ignored, as before
		break;
	}

	if(scan_mode==0xFF)			//at this point scan_mode must be 0, 1 or 2
		usage("Scan mode undefined");

	if(scan_mode==2)
	{
		printf("Scan Mode:      \tIndicized\n");
		printf("Key:            \t%s\n",Query);
		printf("Surfing the DB...\n");

		return(IndicizedSearch(Hostname,username,password,dbname,Query,0xFF));
	}

	//Prepend the scheme when the user left it out (an over-long result is truncated by snprintf)
	if(strncmp(starturl,"http://",7)!=0)
	{
		char bareUrl[MAXURLSIZE];

		memcpy(bareUrl,starturl,strlen(starturl)+1);
		snprintf(starturl, MAXURLSIZE, "http://%s",bareUrl);
	}

	if(ParseUrl(starturl,&currentHst,NULL)==-1)
		usage("Wrong start URL");

	currentHst.viewed = 1;

	//The start URL doubles as description of the first host; truncate it if needed and
	//always terminate it (assumes Description holds at least MAXDESCRIPTIONSIZE bytes,
	//as the previous copy of up to MAXDESCRIPTIONSIZE bytes already did)
	descLen=strlen(starturl);
	if(descLen>=(size_t)MAXDESCRIPTIONSIZE)
		descLen=(size_t)MAXDESCRIPTIONSIZE-1;
	memcpy(currentHst.Description,starturl,descLen);
	currentHst.Description[descLen]='\0';

	first = lstInit(currentHst);

	if(starthostonly==1)
	{
		//Keep a private copy of the start host for single host mode
		strtHst=malloc(sizeof(struct sHost));
		if(strtHst==NULL)
		{
			fprintf(stderr,"Out of memory\n");
			return -1;
		}
		memcpy(strtHst,&currentHst,sizeof(struct sHost));
	}

	printf("\nStart Host:     \t%s\nStart Page:     \t%s\n",currentHst.Host, currentHst.Page);
	if(scan_mode==0)
		printf("Key:\t\t\t%s\n",Key);

	printf("Scan Mode:      \t%s\n",(scan_mode==0)?"Real Time Search":"Indicize");
	printf("OpenWebSpider Mode:\t%s\n\n",(starthostonly==1)?"Single Host":"Recursive");
	printf("Surfing the net...\n");

#ifdef WIN32
	if(!StartUpWinsock())
	{
		fprintf(stderr,"WSAStartup() error\n");
		return -1;
	}
#endif

	//One iteration per URL; the loop ends when ReturnFirstUrl() has nothing left
	for(;;)
	{
		if(currentHst.type == 4 && listmails==1)
		{
			currentHst.viewed = 1;
			printf("Mail Found: %s\n",currentHst.Page);
			//NOTE(review): the list is released here, yet ReturnFirstUrl() is called right
			//after and lstFreeAll(first) runs again at exit - confirm that lstFreeAll()
			//leaves the list in a usable state
			lstFreeAll(first);
			goto nexturl;
		}

		printf("Current -> http://%s%s",currentHst.Host,currentHst.Page);
		if(!LoadSocket(&sock,&currentHst,LastHostOK))
		{
			fprintf(stderr,"\nSocket() error\n");
			//NOTE(review): sock is closed even though LoadSocket() failed, as the code
			//always did - confirm LoadSocket() leaves a closable value in sock
			goto closesock;
		}

#ifdef linux
		if (connect(sock, (struct sockaddr*) &saddr, sizeof(saddr)) == -1)
#else
		if (connect(sock, (LPSOCKADDR) &saddr, sizeof(saddr)) == SOCKET_ERROR)
#endif
			goto closesock;

		strncpy(LastHostOK,currentHst.Host,MAXHOSTSIZE);
		LastHostOK[MAXHOSTSIZE-1]='\0';		//strncpy does not terminate a full buffer
		ForgeHTTPPacket(currentHst,packet);

		if(send(sock,packet,strlen(packet),0)<0)
		{
			printf("\t\t[ERROR]\n");
			goto closesock;
		}

		recvdbytes=RecvPackets(sock,packet,sizeof(packet));

		if(recvdbytes<=1)
		{
			printf("\t\t[ERROR]\n");
			goto closesock;			//do not leak the connected socket
		}

		if(currentHst.type !=4)
		{
			if(ParseHTTPRequest(packet,html,MAXPACKETSIZE)==1)
				printf("\t\t[%i bytes (%i KB) OK]\n", recvdbytes,recvdbytes/1024);
			else
			{
				printf("\t\t[ERROR]\n");
				goto closesock;		//do not leak the connected socket
			}
			if(currentHst.type == 1)						//Looks for urls only in html page
				LookForUrls(html,currentHst);

			if(currentHst.type <= 2)						//Looks for the key only in plain text files
			{
				if(scan_mode==0)
				{
					if(LooksForKey(html,Key)==1)
						printf("Key found\n\n");
				}
				else if(scan_mode==1)
					Indicize(Hostname,username,password,dbname,html,currentHst);
			}
		}
		else
			printf("\n");

closesock:
		//Single place where the socket of this iteration is released
#ifdef linux
		close(sock);
#else
		closesocket(sock);
#endif

nexturl:
		if((ReturnFirstUrl(&currentHst))==-1)
		{
			fprintf(stderr,"\nBuffer empty\n");
			break;
		}
	}

	lstFreeAll(first);

	return 0;
}


/*EOF*/