// Source: www.pudn.com > htmlpars.zip > html.cpp, change:1998-12-30, size:8606b


/* 
    Implement an HTML parser using IE4's IHTMLDocument2 interface. 
*/ 
 
 
#include <windows.h> 
#include <comdef.h> 
#include <io.h> 
#include "html.h" 
 
#include <iostream> 
using namespace std; 
 
 
/*
	Factory for HTMLParser objects.  The parser is always allocated on the
	heap; the constructor sets its reference count to 1 and Release()
	deletes the object once that count drops to zero.
*/
HTMLParser *HTMLParser::Create()
{
	HTMLParser *pInstance = new HTMLParser;
	return pInstance;
}
 
// constructor/destructor 
 
/*
	Construct the parser around a new MSHTML document object:
	  1. create the document (IHTMLDocument2),
	  2. register this object as the document's client site,
	  3. tell the document the DLCONTROL ambient property changed so that it
	     asks for it again (answered by HTMLParser::Invoke),
	  4. subscribe this object as an IPropertyNotifySink so OnChanged()
	     receives the ready-state notifications.

	Constructors cannot return an error, so the outcome is recorded in
	m_hrConnected: it holds the result of Advise() when every step up to the
	subscription succeeded, and a failure code otherwise.
*/
HTMLParser::HTMLParser()
{
	LPCONNECTIONPOINTCONTAINER pCPC = NULL;
	LPOLEOBJECT pOleObject = NULL;
	LPOLECONTROL pOleControl = NULL;

	// initialize all the class member variables
	m_dwRef = 1;	// must start at 1 for the current instance

	// Start from a real failure code.  S_FALSE is a *success* HRESULT, so
	// any SUCCEEDED(m_hrConnected) test would treat a parser whose
	// construction failed as connected and then use NULL interface pointers.
	m_hrConnected = E_FAIL;
	m_dwCookie = 0;
	m_pMSHTML = NULL;
	m_pCP = NULL;
	m_pAnchorLinks = NULL;
	m_pImageLinks = NULL;

	// Create an instance of a dynamic HTML document
	if (FAILED(CoCreateInstance(CLSID_HTMLDocument, NULL, CLSCTX_INPROC_SERVER,
								IID_IHTMLDocument2, (LPVOID*)&m_pMSHTML)))
	{
		m_pMSHTML = NULL;
		return;
	}

	// Become the document's client site.  The result of SetClientSite is
	// not treated as fatal.
	if (FAILED(m_pMSHTML->QueryInterface(IID_IOleObject, (LPVOID*)&pOleObject)))
	{
		return;
	}
	pOleObject->SetClientSite((IOleClientSite*)this);
	pOleObject->Release();

	// Announce the DLCONTROL ambient property; the result is not treated
	// as fatal.
	if (FAILED(m_pMSHTML->QueryInterface(IID_IOleControl, (LPVOID*)&pOleControl)))
	{
		return;
	}
	pOleControl->OnAmbientPropertyChange(DISPID_AMBIENT_DLCONTROL);
	pOleControl->Release();

	// Hook up sink to catch ready state property change
	if (FAILED(m_pMSHTML->QueryInterface(IID_IConnectionPointContainer, (LPVOID*)&pCPC)))
	{
		return;
	}

	if (SUCCEEDED(pCPC->FindConnectionPoint(IID_IPropertyNotifySink, &m_pCP)))
	{
		m_hrConnected = m_pCP->Advise((LPUNKNOWN)(IPropertyNotifySink*)this, &m_dwCookie);
	}

	// The container was only needed to locate the connection point.
	pCPC->Release();
}
 
/*
	Release every COM interface the parser still holds: the cached link and
	image collections, the property-notify connection (unsubscribing first
	when the subscription succeeded) and finally the document itself.
*/
HTMLParser::~HTMLParser()
{
	if ( m_pAnchorLinks )
		m_pAnchorLinks->Release();

	if ( m_pImageLinks )
		m_pImageLinks->Release();

	// m_pCP stays NULL when construction failed before the connection point
	// was found, and m_hrConnected alone cannot be trusted to say so (a
	// success code such as S_FALSE passes SUCCEEDED).  Check the pointer
	// before touching it.
	if ( m_pCP )
	{
		if ( SUCCEEDED(m_hrConnected) )
			m_pCP->Unadvise(m_dwCookie);

		m_pCP->Release();
	}

	if ( m_pMSHTML )
		m_pMSHTML->Release();
}
 
 
 
/*
	IUnknown::QueryInterface.

	Supported interfaces: IUnknown, IPropertyNotifySink, IOleClientSite and
	IDispatch.  On success *ppv receives the interface pointer and the
	reference count is incremented.

	Returns NOERROR on success, E_POINTER when ppv is NULL, and
	E_NOINTERFACE (with *ppv set to NULL) for any other IID.
*/
STDMETHODIMP HTMLParser::QueryInterface(REFIID riid, LPVOID* ppv)
{
	if (!ppv)
	{
		return E_POINTER;
	}

	*ppv = NULL;

	if (IID_IUnknown == riid || IID_IPropertyNotifySink == riid)
	{
		// IUnknown is always handed out through the same base so that the
		// object has a single identity pointer.
		*ppv = (LPUNKNOWN)(IPropertyNotifySink*)this;
	}
	else if (IID_IOleClientSite == riid)
	{
		*ppv = (IOleClientSite*)this;
	}
	else if (IID_IDispatch == riid)
	{
		*ppv = (IDispatch*)this;
	}
	else
	{
		// COM requires E_NOINTERFACE (not E_NOTIMPL) for an unsupported IID.
		return E_NOINTERFACE;
	}

	AddRef();
	return NOERROR;
}
 
/*
	IUnknown::AddRef - bump the reference count and report the new value.
*/
STDMETHODIMP_(ULONG) HTMLParser::AddRef()
{
	m_dwRef += 1;
	return m_dwRef;
}
 
/*
	IUnknown::Release - drop one reference and report the remaining count.
	The object deletes itself when the count reaches zero.
*/
STDMETHODIMP_(ULONG) HTMLParser::Release()
{
	if (--m_dwRef != 0)
	{
		return m_dwRef;
	}

	// Last reference gone: no member may be touched after this point.
	delete this;
	return 0;
}
 
/*
	IPropertyNotifySink::OnChanged.

	Only DISPID_READYSTATE is of interest.  When it changes, the document's
	current ready state is read back through IDispatch::Invoke; once it is
	READYSTATE_COMPLETE a WM_USER_LOAD_COMPLETE thread message is posted to
	the calling thread, which is what the message loop in LoadHTMLFile()
	waits for.

	Always returns NOERROR.
*/
STDMETHODIMP HTMLParser::OnChanged(DISPID dispID)
{
	if (DISPID_READYSTATE != dispID || !m_pMSHTML)
	{
		return NOERROR;
	}

	VARIANT vResult = {0};
	EXCEPINFO excepInfo = {0};
	UINT uArgErr = 0;
	DISPPARAMS dp = {NULL, NULL, 0, 0};

	HRESULT hr = m_pMSHTML->Invoke(DISPID_READYSTATE, IID_NULL, LOCALE_SYSTEM_DEFAULT,
								   DISPATCH_PROPERTYGET, &dp, &vResult, &excepInfo, &uArgErr);
	if (SUCCEEDED(hr))
	{
		// Every state before READYSTATE_COMPLETE (uninitialized, loading,
		// loaded, interactive) is ignored.
		if (READYSTATE_COMPLETE == (READYSTATE)V_I4(&vResult))
		{
			// IE4 is finished parsing the file
			PostThreadMessage(GetCurrentThreadId(),
							  WM_USER_LOAD_COMPLETE,
							  (WPARAM)0,
							  (LPARAM)0);
		}

		VariantClear(&vResult);
	}

	return NOERROR;
}
 
/*
	IDispatch::Invoke - serves ambient properties to the hosted document.

	The only member handled is DISPID_AMBIENT_DLCONTROL, for which the
	download-control flags are written to *pVarResult as a VT_I4.

	Returns E_POINTER when pVarResult is NULL, DISP_E_MEMBERNOTFOUND for any
	other DISPID, and NOERROR otherwise.
*/
STDMETHODIMP HTMLParser::Invoke(DISPID dispIdMember,
            REFIID riid,
            LCID lcid,
            WORD wFlags,
            DISPPARAMS __RPC_FAR *pDispParams,
            VARIANT __RPC_FAR *pVarResult,
            EXCEPINFO __RPC_FAR *pExcepInfo,
            UINT __RPC_FAR *puArgErr)
{
	if (NULL == pVarResult)
	{
		return E_POINTER;
	}

	if (DISPID_AMBIENT_DLCONTROL != dispIdMember)
	{
		return DISP_E_MEMBERNOTFOUND;
	}

	// This tells IE4 that we want to download the page,
	// but we don't want to run scripts, Java applets, or
	// ActiveX controls
	const LONG lDownloadControl = DLCTL_DOWNLOADONLY |
								  DLCTL_NO_SCRIPTS |
								  DLCTL_NO_JAVA |
								  DLCTL_NO_DLACTIVEXCTLS |
								  DLCTL_NO_RUNACTIVEXCTLS;

	V_VT(pVarResult) = VT_I4;
	V_I4(pVarResult) = lDownloadControl;

	return NOERROR;
}
 
 
 
 
/*
	Load and parse a local HTML file.

	pcszFile  path of the file to load (ANSI).

	The file is handed to the document through IPersistFile::Load, then the
	calling thread's message queue is pumped until OnChanged() posts
	WM_USER_LOAD_COMPLETE.  Afterwards the document's link and image
	collections are cached in m_pAnchorLinks / m_pImageLinks for the
	Get*Count / Get*URL accessors.

	Any collections cached by a previous call are released first, also when
	this call fails.

	Returns TRUE when the file was loaded and both collections were
	obtained, FALSE otherwise (not connected, missing file, load failure,
	message-loop error or WM_QUIT received while waiting).
*/
BOOL HTMLParser::LoadHTMLFile(LPCSTR pcszFile)
{
	if ( !IsConnected() || !m_pMSHTML )
		return FALSE;

	// kill any previous links
	if ( m_pAnchorLinks )
	{
		m_pAnchorLinks->Release();
		m_pAnchorLinks = NULL;
	}

	if ( m_pImageLinks )
	{
		m_pImageLinks->Release();
		m_pImageLinks = NULL;
	}

	// avoid IE error msg box if the file does not exist
	if ( !pcszFile || access(pcszFile, 0x00) != 0x00 )
	{
		return FALSE;
	}

	_bstr_t bstrFile(pcszFile);

	// use IPersistFile to load the HTML
	LPPERSISTFILE pPF = NULL;
	HRESULT hr = m_pMSHTML->QueryInterface(IID_IPersistFile, (LPVOID*) &pPF);
	if ( FAILED(hr) )
	{
		return FALSE;
	}

	hr = pPF->Load((LPCWSTR)bstrFile, 0);
	pPF->Release();
	if ( FAILED(hr) )
	{
		return FALSE;
	}

	// Pump messages until OnChanged signals completion.  GetMessage returns
	// -1 on error and 0 on WM_QUIT, so it must be compared against 0
	// explicitly: treating -1 as "true" would spin forever on an invalid MSG.
	MSG  msg;
	BOOL bGot;
	BOOL bLoaded = FALSE;

	while ( (bGot = GetMessage(&msg, NULL, 0, 0)) > 0 )
	{
		// notification from OnChanged
		if ( WM_USER_LOAD_COMPLETE == msg.message && NULL == msg.hwnd )
		{
			bLoaded = TRUE;
			break;
		}

		DispatchMessage(&msg);
	}

	if ( 0 == bGot )
	{
		// WM_QUIT was meant for the thread's outer message loop; put it
		// back so the application still shuts down.
		PostQuitMessage((int)msg.wParam);
	}

	if ( !bLoaded )
	{
		return FALSE;
	}

	// Cache both collections; if either is unavailable, keep neither.
	if ( FAILED(m_pMSHTML->get_links(&m_pAnchorLinks)) ||
		 FAILED(m_pMSHTML->get_images(&m_pImageLinks)) )
	{
		if ( m_pAnchorLinks )
		{
			m_pAnchorLinks->Release();
			m_pAnchorLinks = NULL;
		}

		if ( m_pImageLinks )
		{
			m_pImageLinks->Release();
			m_pImageLinks = NULL;
		}

		return FALSE;
	}

	return TRUE;
}
 
/*
	Number of entries in the cached link collection of the current HTML
	file; 0 when no collection is cached.
*/
long HTMLParser::GetLinkCount()
{
	if ( !m_pAnchorLinks )
		return 0;

	long nLinks = 0;
	m_pAnchorLinks->get_length(&nLinks);
	return nLinks;
}
 
 
/*
	Number of entries in the cached image collection of the current HTML
	file; 0 when no collection is cached.
*/
long HTMLParser::GetImageCount()
{
	if ( !m_pImageLinks )
		return 0;

	long nImages = 0;
	m_pImageLinks->get_length(&nImages);
	return nImages;
}
 
 
/*
	Fetch the URL of the link at position lIndex into rstrURL.
	Returns FALSE when the parser is not connected, no link collection is
	cached, or the lookup itself fails.
*/
BOOL HTMLParser::GetLinkURL(long lIndex, string &rstrURL)
{
	if ( !IsConnected() || !m_pAnchorLinks )
		return FALSE;

	return GetURLFromCollection(m_pAnchorLinks, IID_IHTMLAnchorElement, lIndex, rstrURL);
}
 
/*
	Fetch the URL of the image at position lIndex into rstrURL.
	Returns FALSE when the parser is not connected, no image collection is
	cached, or the lookup itself fails.
*/
BOOL HTMLParser::GetImageURL(long lIndex, string &rstrURL)
{
	if ( !IsConnected() || !m_pImageLinks )
		return FALSE;

	return GetURLFromCollection(m_pImageLinks, IID_IHTMLImgElement, lIndex, rstrURL);
}
 
/*
	Get the URL associated with an element in a collection.  The element must
	be an image or an anchor.

	pCollection  collection to read from
	rIID         IID_IHTMLImgElement or IID_IHTMLAnchorElement; selects the
	             interface used to read the element's href
	lIndex       zero-based position of the element
	rstrURL      receives the URL on success; left untouched on failure

	Returns TRUE when a URL was obtained, FALSE otherwise (NULL collection,
	negative index, no such element, element of the wrong kind, unsupported
	rIID, or the element has no href).
*/
BOOL HTMLParser::GetURLFromCollection(IHTMLElementCollection *pCollection, REFIID rIID, long lIndex, string &rstrURL)
{
	if ( !pCollection || lIndex < 0 )
		return FALSE;

	// The index is a signed 32-bit value stored in lVal, so the variant must
	// be tagged VT_I4; tagging it VT_UINT does not match the field written.
	VARIANT varIndex;
	VariantInit( &varIndex );
	V_VT(&varIndex) = VT_I4;
	V_I4(&varIndex) = lIndex;

	// Second argument of item() is left empty.
	VARIANT varEmpty;
	VariantInit( &varEmpty );

	IDispatch* pDisp = NULL;
	HRESULT hr = pCollection->raw_item( varIndex, varEmpty, &pDisp );
	if ( FAILED(hr) || !pDisp )
		return FALSE;

	BSTR bstr = NULL;

	if ( rIID == IID_IHTMLImgElement )
	{
		IHTMLImgElement* pImgElem = NULL;
		if ( SUCCEEDED(pDisp->QueryInterface(rIID, (void **)&pImgElem)) )
		{
			// Only trust the out value when the call reports success.
			if ( FAILED(pImgElem->get_href(&bstr)) )
				bstr = NULL;
			pImgElem->Release();
		}
	}
	else if ( rIID == IID_IHTMLAnchorElement )
	{
		IHTMLAnchorElement* pAnchorElem = NULL;
		if ( SUCCEEDED(pDisp->QueryInterface(rIID, (void **)&pAnchorElem)) )
		{
			if ( FAILED(pAnchorElem->get_href(&bstr)) )
				bstr = NULL;
			pAnchorElem->Release();
		}
	}

	pDisp->Release();

	if ( !bstr )
		return FALSE;

	// _bstr_t wrapper will delete since fCopy is FALSE
	_bstr_t bstrHREF(bstr, FALSE);

	// Never assign a NULL char pointer to std::string.
	LPCSTR pszHREF = (LPCSTR)bstrHREF;
	if ( !pszHREF )
		return FALSE;

	rstrURL = pszHREF;
	return TRUE;
}