// www.pudn.com > downloadingNet.rar > document.cpp
/*
Document.cpp : implementation of the CSnaggerDoc class
Implements project file persistence for options, tree data and statistics.
Also performs the actual retrieval of files from the host using the
CInet class.
Author: Steven E. Sipe
*/
#include "stdafx.h"
#include "SiteSnag.h"
#include "Document.h"
#include "View.h"
#include "progress.h"
// Debug builds only: redefine `new` as DEBUG_NEW and keep this source file's
// name in THIS_FILE.
#ifdef _DEBUG
#define new DEBUG_NEW
#undef THIS_FILE
static char THIS_FILE[] = __FILE__;
#endif
// File version for serialization. Serialize() writes this as its first value
// when storing, and passes the version in effect to m_Options.SetVersion().
const long g_lFileVersion = 0x0101;
/////////////////////////////////////////////////////////////////////////////
// CSnaggerDoc
// Dynamic-creation support and the message map for the document class.
// The message map currently contains no handlers.
IMPLEMENT_DYNCREATE(CSnaggerDoc, CDocument)
BEGIN_MESSAGE_MAP(CSnaggerDoc, CDocument)
//{{AFX_MSG_MAP(CSnaggerDoc)
//}}AFX_MSG_MAP
END_MESSAGE_MAP()
/////////////////////////////////////////////////////////////////////////////
// CSnaggerDoc construction/destruction
// Constructor -- installs the default project options, sizes the "already
// downloaded" lookup tables and puts the document in the "no project loaded"
// state.
CSnaggerDoc::CSnaggerDoc()
{
    // Set some default project options
    m_Options.nMaxDepth = 2;
    m_Options.nMaxPages = 0;
    m_Options.bFixupLinks = TRUE;
    m_Options.bContents = TRUE;
    m_Options.bMultimedia = TRUE;
    m_Options.bOffsiteLinks = FALSE;

    // Set the initial hash table sizes
    m_arrPagesDone.InitHashTable(1200);
    m_arrMediaDone.InitHashTable(2400);

    // Initialize some flags
    m_bProjectLoaded = FALSE;
    m_pProgress = NULL;
    m_bAutoMode = FALSE;
    m_nLevel = 0;

    // Zero the statistics counters so that UpdateStatus() never displays
    // indeterminate values before a project has been created or loaded
    // (OnNewDocument(), Reset() and Serialize() are the only other places
    // in view that assign them)
    m_nGottenPageCount = 0;
    m_nGottenFileCount = 0;
    m_nQueuedPageCount = 0;
    m_nTotalBytes = 0;
}
// Destructor -- releases the page and media lookup maps through
// ClearCacheMaps(). Any exception raised by that cleanup is swallowed on
// purpose so that nothing propagates out of the destructor.
CSnaggerDoc::~CSnaggerDoc()
{
try
{
// Remove the page and media maps
ClearCacheMaps();
}
catch(...)
{
// Best-effort cleanup: ignore failures
}
}
// Creates a new document.
// The very first call only sets up the empty "(No project)" placeholder
// document: it sets the title from IDS_NO_PROJECT and calls the base class.
// Every later call starts a real project: the statistics are cleared, the
// project is flagged as loaded and the statistics window's action title is
// blanked.
// Returns FALSE if the base class CDocument::OnNewDocument() fails, TRUE
// otherwise.
BOOL CSnaggerDoc::OnNewDocument()
{
    // Explicit BOOL: the original declaration had no type and relied on the
    // obsolete "implicit int" rule, which is not valid C++
    static BOOL bFirstTime = TRUE;

    // Is this the empty project file?
    if(bFirstTime)
    {
        bFirstTime = FALSE;

        // Yes, set the title to "(No project)"
        CString strDefName;
        strDefName.LoadString(IDS_NO_PROJECT);
        SetTitle(strDefName);

        // Call the base class and get out...
        if(!CDocument::OnNewDocument())
            return FALSE;
        return(TRUE);
    }

    // Call the base class
    if(!CDocument::OnNewDocument())
        return FALSE;

    // Clear the statistics and indicate that we now have a
    // project loaded
    m_bProjectLoaded = TRUE;
    m_strStartPage.Empty();
    m_nGottenPageCount = 0;
    m_nGottenFileCount = 0;
    m_nQueuedPageCount = 0;
    m_nTotalBytes = 0;

    // Make sure that the info in the statistics window is reset. The view
    // and its progress window are checked before use, matching the guard
    // UpdateStatus() already applies to m_pProgress.
    POSITION pos = GetFirstViewPosition();
    CSnaggerView* pView = pos ? (CSnaggerView *) GetNextView(pos) : NULL;
    if(pView)
    {
        m_pProgress = pView->GetProgress();
        if(m_pProgress)
            m_pProgress->SetActionTitle("");
    }

    return TRUE;
}
// Opens a previously saved project document.
//   lpszPathName - full path of the project file to load
// Saves the current project if it was modified, clears the tree, lets the
// base class load the file (which runs Serialize()), then records the project
// directory and title. Returns FALSE if the base class fails to load the file.
BOOL CSnaggerDoc::OnOpenDocument(LPCTSTR lpszPathName)
{
    POSITION posView = GetFirstViewPosition();
    CSnaggerView* pTreeView = (CSnaggerView *) GetNextView(posView);

    // Flush the project that is currently open (only written if modified)
    SaveModified();

    // Start from an empty tree
    pTreeView->ClearTree();

    // Let the base class read the document's contents from disk
    if(!CDocument::OnOpenDocument(lpszPathName))
    {
        return FALSE;
    }

    // Remember where the project lives: the project directory is the
    // project file's drive + path + base name followed by a backslash
    SetPathName(lpszPathName);
    const CString strProjectDir = CInet::SplitFileName(lpszPathName,
                                    CInet::DRIVE|CInet::PATH|CInet::FNAME);
    m_strDirectory = strProjectDir+"\\";

    // The title is the file name including its extension
    SetTitle(CInet::SplitFileName(lpszPathName,CInet::FNAME|CInet::EXT));

    // A project is now loaded
    m_bProjectLoaded = TRUE;

    // Refresh the statistics window.
    // NOTE(review): the guard tests the previous m_pProgress value before it
    // is refreshed from the view, so nothing is updated while m_pProgress is
    // still NULL -- confirm that this is intended.
    if(m_pProgress)
    {
        m_pProgress = pTreeView->GetProgress();
        m_pProgress->SetActionTitle("");
        UpdateStatus();
    }

    // Force the document to be saved again when it is closed
    SetModifiedFlag(TRUE);
    return TRUE;
}
// Saves the document if it has been changed (DoFileSave() calls the
// document's Serialize() method). No prompt is shown.
// Returns the result of DoFileSave() when a save was needed, TRUE otherwise.
BOOL CSnaggerDoc::SaveModified()
{
    return IsModified() ? CDocument::DoFileSave() : TRUE;
}
// Prevents the user from exiting the application while a snagging operation
// is in progress.
//   pFrame - frame window that is about to close (not used here)
// Returns FALSE while the view reports that snagging is active, TRUE otherwise.
BOOL CSnaggerDoc::CanCloseFrame(CFrameWnd* pFrame)
{
    POSITION posView = GetFirstViewPosition();
    CSnaggerView* pSnagView = (CSnaggerView *) GetNextView(posView);

    if(pSnagView->GetSnagging())
        return(FALSE);

    return(TRUE);
}
// Resets the document to an empty project.
//   lpszProjName - title for the emptied document; when NULL the default
//                  "(No project)" string (IDS_NO_PROJECT) is used
// Clears the path, the project directory and the statistics, marks the
// document as unmodified and not loaded, and refreshes the statistics window.
void CSnaggerDoc::Reset(LPCTSTR lpszProjName)
{
    CString strNewProjName;

    if(lpszProjName)
    {
        strNewProjName = lpszProjName;
    }
    else
    {
        // Use the default name -- (No Project).
        // The original loaded this string unconditionally, which always
        // overwrote the name supplied by the caller.
        strNewProjName.LoadString(IDS_NO_PROJECT);
    }

    m_strPathName.Empty();
    m_strDirectory.Empty();
    m_bProjectLoaded = FALSE;
    SetModifiedFlag(FALSE);
    SetTitle(strNewProjName);

    // Reset the statistics window's information
    m_strStartPage.Empty();
    m_nGottenPageCount = 0;
    m_nGottenFileCount = 0;
    m_nQueuedPageCount = 0;
    m_nTotalBytes = 0;

    // Update the statistics window; the view and its progress window are
    // checked before use, matching the guard inside UpdateStatus()
    POSITION pos = GetFirstViewPosition();
    CSnaggerView* pView = pos ? (CSnaggerView *) GetNextView(pos) : NULL;
    if(pView)
    {
        m_pProgress = pView->GetProgress();
        if(m_pProgress)
            m_pProgress->SetActionTitle("");
    }
    UpdateStatus();
}
// Handles closing the document -- i.e. the user pressed the X button or chose
// the close menu item. The document is saved first if it was modified (the
// result of SaveModified() is not checked), then the base class performs the
// actual close.
void CSnaggerDoc::OnCloseDocument()
{
// Make sure that we save the current document
SaveModified();
// Must stay last: the base class tears the document down
CDocument::OnCloseDocument();
}
// Retrieves the project options.
//   Options - receives the depth/page limits and the four feature flags of
//             the current project; its other members are left untouched
void CSnaggerDoc::GetOptions(CSnagOptions& Options)
{
    // Feature switches
    Options.bOffsiteLinks = m_Options.bOffsiteLinks;
    Options.bMultimedia   = m_Options.bMultimedia;
    Options.bContents     = m_Options.bContents;
    Options.bFixupLinks   = m_Options.bFixupLinks;

    // Limits
    Options.nMaxPages     = m_Options.nMaxPages;
    Options.nMaxDepth     = m_Options.nMaxDepth;
}
// Sets the project options (per the original note, used when the program is
// driven from the command line).
//   Options - source of the depth/page limits and the four feature flags;
//             other members of the stored options are left untouched
void CSnaggerDoc::SetOptions(CSnagOptions& Options)
{
    // Feature switches
    m_Options.bOffsiteLinks = Options.bOffsiteLinks;
    m_Options.bMultimedia   = Options.bMultimedia;
    m_Options.bContents     = Options.bContents;
    m_Options.bFixupLinks   = Options.bFixupLinks;

    // Limits
    m_Options.nMaxPages     = Options.nMaxPages;
    m_Options.nMaxDepth     = Options.nMaxDepth;
}
// Serializes the project file.
//   ar - archive to write to (storing) or read from (loading)
// Stream layout, in order: file version, downloaded page count, downloaded
// file count, total bytes, the project options, then the tree contents.
void CSnaggerDoc::Serialize(CArchive& ar)
{
    POSITION posView = GetFirstViewPosition();
    CSnaggerView* pTreeView = (CSnaggerView *) GetNextView(posView);
    long lVersion;

    if(!ar.IsStoring())
    {
        // Loading: the version comes first, followed by the statistics
        ar >> lVersion;
        ar >> m_nGottenPageCount;
        ar >> m_nGottenFileCount;
        ar >> m_nTotalBytes;

        // The queued page count is not stored; restart it at zero
        // (it is shown in the statistics window)
        m_nQueuedPageCount = 0;
    }
    else
    {
        // Storing: always stamp the file with the current version
        lVersion = g_lFileVersion;
        ar << g_lFileVersion;

        // Statistics window information
        ar << m_nGottenPageCount;
        ar << m_nGottenFileCount;
        ar << m_nTotalBytes;
    }

    // Load or store the project options, telling them which file version
    // is in effect
    m_Options.SetVersion(lVersion);
    m_Options.Serialize(ar);

    // Load or store the contents of the tree control
    pTreeView->SerializeTree(ar);

    m_nLevel = 0;
}
// Gets the specified page, either from the host through m_Inet or -- when the
// page was already downloaded -- from the copy saved on disk, then parses it
// for links and media files.
//   strPage     - URL of the page to get
//   strFileName - local file name; handed to m_Inet.SaveFile() for a freshly
//                 downloaded page, overwritten with the cached entry's file
//                 name when the page was downloaded before
//   linkEntry   - link stack entry; reset here and filled with the links
//                 (plus their offsite flags) that should be queued and, for a
//                 freshly downloaded page only, the media files to fetch
// Returns TRUE if the page data was obtained and parsed, FALSE otherwise.
BOOL CSnaggerDoc::GetPage(CString& strPage, CString& strFileName, LINKS& linkEntry)
{
    BYTE *pbyBuffer = m_byBuffer;
    int nLen = 0;                   // initialised: only assigned on success paths
    BOOL bPageInCache = FALSE;
    BOOL bRet = FALSE;
    CInet::RESULTS ret;
    MAP_FILES* pMapEntry = NULL;    // only set when ShouldGetPage() finds the page

    // Initialize the link stack entry
    linkEntry.arrLinks.SetSize(0,100);
    linkEntry.arrMedia.SetSize(0,100);
    linkEntry.arrOffsite.SetSize(0,100);
    linkEntry.nIndex = 0;

    // Do we need to download this page?
    if(ShouldGetPage(strPage,pMapEntry))
    {
        // Ask the server for the page
        ret = m_Inet.GetPage(strPage,&pbyBuffer,nLen,TRUE);
        if(ret == CInet::SUCCESS)
        {
            // Add the page to the total byte count
            bRet = TRUE;
            m_nTotalBytes += nLen;
        }
    }
    else
    {
        // Already downloaded: read the local copy instead, because the links
        // must point at the file that was saved earlier
        CFile fileIn;
        CFileException ex;

        // Build the full path of the local file
        strFileName = pMapEntry->strFileName;
        CString strTempFileName = m_strDirectory+strFileName;

        // Open the file
        if(fileIn.Open(strTempFileName,CFile::modeRead,&ex))
        {
            // Read the data (at most one buffer's worth)
            nLen = fileIn.Read(pbyBuffer,MAX_INET_BUFFER);
            fileIn.Close();
            bRet = TRUE;
        }

        // Indicate that the page was not downloaded again
        bPageInCache = TRUE;
    }

    // Did we get the page data?
    if(bRet)
    {
        // Get the link and media information
        CSnaggerHtmlParser Parser;
        Parser.SetPageURL(strPage);
        if(m_pProgress)
            m_pProgress->SetActionTitle("Parsing Page: "+strPage);

        // Never parse or save more than the buffer can hold
        if(nLen > MAX_INET_BUFFER)
            nLen = MAX_INET_BUFFER;

        // Initialize and run the parser. The pointer is reset because its
        // address was passed to m_Inet.GetPage() above.
        pbyBuffer = m_byBuffer;
        Parser.SetFixupMode(FALSE);
        Parser.ResetArrays();
        Parser.SetGetMedia(m_Options.bMultimedia);
        Parser.ParseText((char *)pbyBuffer,nLen);
        m_strPageTitle = Parser.GetTitle();

        // Save a freshly downloaded page
        if(!bPageInCache)
        {
            if(m_pProgress)
                m_pProgress->SetActionTitle("Saving Page: "+strPage);
            pbyBuffer = m_byBuffer;
            m_Inet.SaveFile(strFileName,m_strDirectory,pbyBuffer,nLen);
        }

        // Number of linked pages found by the parser
        int nLinks;
        BOOL bOffsite;
        CString strNewPage;
        nLinks = Parser.GetLinks().GetSize();

        // Check each link to see whether it belongs in the download queue
        for(int i = 0; i < nLinks; i++)
        {
            // Get the URL of the linked page
            strNewPage = Parser.GetLinks().GetAt(i);

            // Get the offsite link flag for this page
            bOffsite = Parser.GetOffsiteFlags().GetAt(i);

            // See if we should add it to the download queue
            if(ShouldQueuePage(strNewPage,bOffsite))
            {
                linkEntry.arrLinks.Add(strNewPage);
                linkEntry.arrOffsite.Add(bOffsite);
            }
        }

        // Don't need the images if we've already parsed this page before
        if(!bPageInCache)
        {
            // New page, so get all of the media information
            int nMedia = Parser.GetMedia().GetSize();
            CString strMedia;

            // Own loop variable: the original reused the `i` declared in the
            // previous for-statement, which only compiles under the
            // non-conforming Visual C++ 6 scoping rules
            for(int j = 0; j < nMedia; j++)
            {
                strMedia = Parser.GetMedia().GetAt(j);
                if(ShouldGetMedia(strMedia,pMapEntry))
                    linkEntry.arrMedia.Add(strMedia);
            }
        }
    }

    return(bRet);
}
// Gets the specified multimedia file from the host and saves it to disk.
//   strMedia    - URL of the media file
//   strFileName - file name handed to m_Inet.SaveFile()
// Returns TRUE if the download succeeded, FALSE otherwise.
BOOL CSnaggerDoc::GetMedia(CString& strMedia, CString& strFileName)
{
    BYTE *pData = m_byBuffer;
    int nBytes;

    // Fetch the file through m_Inet; bail out if that fails
    CInet::RESULTS result = m_Inet.GetFile(strMedia,&pData,nBytes);
    if(result != CInet::SUCCESS)
        return(FALSE);

    // Count the downloaded bytes
    m_nTotalBytes += nBytes;

    // Write the file into the project directory
    m_pProgress->SetActionTitle("Saving File: "+strMedia);
    m_Inet.SaveFile(strFileName,m_strDirectory,pData,nBytes);

    return(TRUE);
}
// Determines whether the specified page still has to be downloaded.
//   strPage   - URL of the page
//   pMapEntry - set to the page's entry in m_arrPagesDone when the page is
//               found there; not assigned by this function otherwise
// Returns TRUE if the page is NOT in the list of downloaded pages (so it
// should be fetched), FALSE if it has already been downloaded.
BOOL CSnaggerDoc::ShouldGetPage(CString& strPage, MAP_FILES*& pMapEntry)
{
    // Build the lookup key: page names are not case sensitive, and anything
    // from the first '#' onwards is ignored
    CString strKey = strPage;
    strKey.MakeLower();
    strKey = strKey.SpanExcluding("#");

    // A name without an extension gets a trailing slash unless it already
    // ends with one
    const BOOL bNoExt = CInet::SplitFileName(strKey,CInet::EXT).IsEmpty();
    if(bNoExt && strKey.Right(1) != "/")
    {
        strKey += "/";
    }

    // Did we find it??
    const BOOL bAlreadyDone = m_arrPagesDone.Lookup(strKey,(CObject *&) pMapEntry);
    return(!bAlreadyDone);
}
// Determines whether the specified multimedia file still has to be downloaded.
//   strMedia  - URL of the media file
//   pMapEntry - set to the file's entry in m_arrMediaDone when the file is
//               found there; not assigned by this function otherwise
// Returns TRUE if the file is NOT in the list of downloaded media files,
// FALSE if it has already been downloaded.
BOOL CSnaggerDoc::ShouldGetMedia(CString& strMedia, MAP_FILES*& pMapEntry)
{
    // Build the lookup key: names are not case sensitive, and anything from
    // the first '#' onwards is ignored
    CString strKey = strMedia;
    strKey.MakeLower();
    strKey = strKey.SpanExcluding("#");

    // Did we find it??
    const BOOL bAlreadyDone = m_arrMediaDone.Lookup(strKey,(CObject *&) pMapEntry);
    return(!bAlreadyDone);
}
// Decides whether the specified page should be added to the download queue.
//   strNewPage - URL of the candidate page
//   bOffsite   - TRUE if the page is on another site
// Returns FALSE when the page has not been downloaded yet but is already in
// the link list of one of the levels below m_nLevel, or when it is an offsite
// page and offsite links are disabled. Returns TRUE straight away for an
// already downloaded page when a depth limit is set and m_nLevel is at or
// beyond that entry's nMaxLevel. Returns TRUE in every other case.
BOOL CSnaggerDoc::ShouldQueuePage(CString& strNewPage, BOOL bOffsite)
{
    MAP_FILES* pDoneEntry;

    if(!ShouldGetPage(strNewPage,pDoneEntry))
    {
        // Downloaded before: compare the current level with the level
        // recorded in the page's entry
        if(m_Options.nMaxDepth && m_nLevel >= pDoneEntry->nMaxLevel)
            return(TRUE);
    }
    else
    {
        // Not downloaded yet: see whether one of the previous levels has
        // already queued it
        int nLevel = 0;
        while(nLevel < m_nLevel)
        {
            int nLink = 0;
            while(nLink < m_aLinks[nLevel].arrLinks.GetSize())
            {
                if(strNewPage == m_aLinks[nLevel].arrLinks.GetAt(nLink))
                    return(FALSE);
                nLink++;
            }
            nLevel++;
        }
    }

    // Offsite pages are only queued when the options allow them
    return((bOffsite && !m_Options.bOffsiteLinks) ? FALSE : TRUE);
}
// Re-initializes the link stack entry for the given level.
//   nLevel - index into m_aLinks (not range-checked here)
void CSnaggerDoc::ResetLink(int nLevel)
{
    // Empty the three arrays, keeping a growth increment of 100
    m_aLinks[nLevel].arrOffsite.SetSize(0,100);
    m_aLinks[nLevel].arrMedia.SetSize(0,100);
    m_aLinks[nLevel].arrLinks.SetSize(0,100);

    // Restart at the first link
    m_aLinks[nLevel].nIndex = 0;
}
// Refreshes the statistics window with the current counters.
// Does nothing when the statistics window pointer has not been set.
void CSnaggerDoc::UpdateStatus()
{
    // No statistics window yet -- nothing to update
    if(!m_pProgress)
        return;

    // Push the current values into its fields
    m_pProgress->SetQueuedFiles(m_nQueuedPageCount);
    m_pProgress->SetDownloadedPages(m_nGottenPageCount);
    m_pProgress->SetDownloadedFiles(m_nGottenFileCount);
    m_pProgress->SetKBDownloaded(m_nTotalBytes);

    // m_nLevel is zero-based; the window is given a one-based level
    m_pProgress->SetLevel(m_nLevel+1);
}
// Downloads the web pages and the multimedia elements they contain
UINT CSnaggerDoc::DownloadThread(LPVOID lpvData)
{
HTREEITEM htreePage;
CSnaggerDoc *pThis = (CSnaggerDoc *) lpvData;
int nMaxDepth = pThis->m_Options.nMaxDepth-1;
int nCount;
CString strPage = pThis->m_strStartPage;
CString strFileName;
CString strLogData;
CString strText;
POSITION pos = pThis->GetFirstViewPosition();
CSnaggerView* pView = (CSnaggerView *) pThis->GetNextView(pos);
BOOL bIsOffsite = FALSE;
// 建立WinInet会话
try
{
pThis->m_Inet.OpenSession(pThis->m_Options.bUseProxy,pThis->m_Options.strProxyName);
}
catch(...)
{
}
// 创建日志文件
pThis->m_fileLog.Open(pThis->m_strDirectory+"sitesnag.log",
CFile::modeCreate|CFile::modeWrite);
// 创建内容列表文件
if(pThis->m_Options.bContents)
{
pThis->m_fileContents.Open(pThis->m_strDirectory+"SnagCon1.htm",
CFile::modeCreate|CFile::modeWrite);
// 将下载内容增加到下载文件列表中
pThis->SetPageCacheEntry("snagcon1.htm","SnagCon1.htm",0);
// 内容列表加入到树形控件中
CString strTitle = "Contents Page 1 (SnagCon1.htm)";
pView->AddTreeContent(strTitle);
// 写入第一个内容列表也的开始
strText = "\r\n\r\nSiteSnagger Contents \r\n";
strText += "\r\n";
strText += "SiteSnagger Table of Contents
\r\n