zoukankan      html  css  js  c++  java
  • 使用mshtml解析html

    测试用例

    测试用例
    <html>
    <head>
    <title>
        Just a Test
    </title>
    </head>
    <body>
    gaofeng hello!!
    <div>
    <table bgcolor="red">

    <tr>
    <td bgcolor="yellow" border="2">Name</td>
    <td id="qualify1" border="1" class="blueBorder" bgcolor=blue></td>
    </tr>
    <tr>
    <td><p id="qualify2" class="blueBorder" bgcolor="blue" border="1">Surname</p></td>
    <td></td>
    </tr>
    <tr>
    <td>address</td>
    <td></td>
    </tr>
    </table>
    </div>
    </body>
    </html>

    头文件

    #include <iostream>
    #include 
    <comdef.h>
    #include 
    <mshtml.h>
    #include 
    <string>
    #include 
    <fstream>
    #include 
    <vector>
    #include 
    <map>
    #import 
    <mshtml.tlb> no_auto_exclude

    测试代码

    代码
    // TestMSHTML.cpp : 定义控制台应用程序的入口点。
    //

    #include 
    "stdafx.h"
    #include 
    "TestMSHTML.h"
    #ifdef _DEBUG
    #define new DEBUG_NEW
    #endif


    // The one and only application object

    CWinApp theApp;
    // Output stream shared by the printing helpers; _tmain opens it on
    // "out.txt" and PrintTabs/VisitNode write the DOM dump to it.
    FILE 
    * fout;
    using namespace std;
    //OLECHAR szHTML[] = OLESTR("<HTML><BODY>Hello World!</BODY></HTML>");

    // Key type of the map taken by FindAllElementHavingBg (a plain int).
    typedef  
    int BorderAttribute;
    void FindAllElementHavingBg(IHTMLDocument2 * pNewDoc,map<BorderAttribute,IHTMLElement *>& borderValue2ElementMap)
    {
        IHTMLElement 
    * pBody;
        pNewDoc
    ->get_body(&pBody);
        pBody
    ->Release();
    }

    void PrintTabs(int n)
    {
        
    for (int i = 0;i<n;i++)
        {
            
    //cout << '\t';
            fwprintf(fout,_T("\t"));
        }
    }

    void VisitNode(IHTMLElement* pElement,int level)
    {
        BSTR strName,strId,strTag;
        PrintTabs(level);
        pElement
    ->get_className(&strName);
        pElement
    ->get_id(&strId);
        pElement
    ->get_tagName(&strTag);
        
    if (strTag!=NULL)
        {
            fwprintf(fout,_T(
    "TagName:%s "),strTag);
        }
        
    if (strName!=NULL)
        {
            fwprintf(fout,_T(
    "className:%s "),strName);
        }
        
    if (strId != NULL)
        {
            fwprintf(fout,_T(
    "Id:%s "),strId);
        }
        SysFreeString(strName);
        SysFreeString(strId);
        SysFreeString(strTag);
        BSTR strAttrName1 
    = _T("border");
        BSTR strAttrName2 
    = _T("bgcolor");
        VARIANT val;

        pElement
    ->getAttribute(strAttrName1,2,&val);
        
    if (val.vt != VT_NULL)
        {
            
    if (val.bstrVal != NULL)
            {
                fwprintf(fout,_T(
    "border:%s "),val.bstrVal);
            }
        }


        pElement
    ->getAttribute(strAttrName2,2,&val);
        
    if (val.vt != VT_NULL)
        {
            
    if (val.bstrVal != NULL)
            {
                fwprintf(fout,_T(
    "bgcolor:%s "),val.bstrVal);
            }
        }

        
        fwprintf(fout,_T(
    "\n"));
    }
    //将DOM树打印出来
    void Run(IHTMLElement * pElement,int level)
    {
        IHTMLElementCollection 
    * children;

        VisitNode(pElement,level);


        IDispatch
    * pDisp;
        pElement
    ->get_children(&pDisp);
        pDisp
    ->QueryInterface(IID_IHTMLElementCollection,(void**)&children);
        pDisp
    ->Release();

        
    long len;
        children
    ->get_length(&len);
        VARIANT dummy;
        dummy.vt 
    = VT_I4;
        
    for (int i = 0;i < len;i++)
        {
            IHTMLElement
    * child;
            dummy.intVal 
    = i;
            children
    ->item(dummy,dummy,(IDispatch**)&pDisp);
            pDisp
    ->QueryInterface(IID_IHTMLElement,(void**)&child);
            pDisp
    ->Release();
            Run(child,level 
    + 1);
            child
    ->Release();
        }
        children
    ->Release();
    }
    void TestParse(IHTMLDocument2 * pNewDoc)
    {
        BSTR strText;
        IHTMLElement 
    *pBody;
        pNewDoc
    ->get_body(&pBody);
        pBody
    ->get_innerText(&strText);
        wprintf(_T(
    "%s\n"),strText);
        SysFreeString(strText);
        

        pNewDoc
    ->get_title(&strText);
        wprintf(_T(
    "%s\n"),strText);
        SysFreeString(strText);
        
        cout 
    << "Run begin...."<<endl;
        Run(pBody,
    0);
        cout 
    << "Run end...."<<endl;

        pBody
    ->Release();

        
    //FindAllElementHavingBg(pNewDoc);

    }
    void TestMSHTML(wchar_t * wcontent)
    {
        IHTMLDocument2 
    *pDoc = NULL;
        CoInitialize(NULL);
        CoCreateInstance(CLSID_HTMLDocument, 
                         NULL, 
                         CLSCTX_INPROC_SERVER, 
                         IID_IHTMLDocument2, 
                        (LPVOID 
    *&pDoc);

        
    if (pDoc)
        {
            IPersistStreamInit 
    *pPersist = NULL;
            pDoc
    ->QueryInterface(IID_IPersistStreamInit, 
                                 (LPVOID 
    *&pPersist);
            
    if (pPersist)
            {
                IMarkupServices 
    *pMS = NULL;
                pPersist
    ->InitNew();
                pPersist
    ->Release();
                pDoc
    ->QueryInterface(IID_IMarkupServices, 
                                    (LPVOID 
    *&pMS);

                
    if (pMS)
                {
                    IMarkupContainer 
    *pMC = NULL;
                    IMarkupPointer 
    *pMkStart = NULL;
                    IMarkupPointer 
    *pMkFinish = NULL;
                    pMS
    ->CreateMarkupPointer(&pMkStart);
                    pMS
    ->CreateMarkupPointer(&pMkFinish);
                    pMS
    ->ParseString(wcontent,
                        
    0
                        
    &pMC, 
                        pMkStart, 
                        pMkFinish);

                    
    if (pMC)
                    {
                        IHTMLDocument2 
    *pNewDoc = NULL;

                        pMC
    ->QueryInterface(IID_IHTMLDocument, 
                            (LPVOID 
    *&pNewDoc);

                        
    if (pNewDoc)
                        {
                            
    // do anything with pNewDoc, in this case 
                            
    // get the body innerText.
                            TestParse(pNewDoc);
        
                            pNewDoc
    ->Release();
                        }

                        pMC
    ->Release();
                    }

                    
    if (pMkStart)
                        pMkStart
    ->Release();

                    
    if (pMkFinish)
                        pMkFinish
    ->Release();

                    pMS
    ->Release();
                }
            }

            pDoc
    ->Release();
        }

        CoUninitialize();

    }

    inline wchar_t
    * AnsiToUnicode( const char* szStr )
    {
        
    int nLen = MultiByteToWideChar( CP_ACP, MB_PRECOMPOSED, szStr, -1, NULL, 0 );
        
    if (nLen == 0)
        {
            
    return NULL;
        }
        wchar_t
    * pResult = new wchar_t[nLen+1];
        MultiByteToWideChar( CP_ACP, MB_PRECOMPOSED, szStr, 
    -1, pResult, nLen );
        pResult[nLen] 
    = L'\0';
        
    return pResult;
    }

    //调用者负责delete wcontent
    wchar_t * ReadFromHtmlFile(string str,string & content)
    {
        ifstream fin(str.c_str());
        
    string line;
        
    while(getline(fin,line))
        {
        
    //    cout << line << endl;
            content = content + line;
        }
        
    //cout << content << endl;
        
    //cout << content.size() << endl;
        
    //printf("original html code\n%s\n",content.c_str());
        wchar_t * wcontent = AnsiToUnicode(content.c_str()); 
        
    //wprintf(L"after transferred\n%s\n",wcontent);
        
    //delete[] wcontent;
        fin.close();
        fin.clear();
        
    return wcontent;
    }

    int _tmain(int argc, TCHAR* argv[], TCHAR* envp[])
    {
        
    int nRetCode = 0;

        
    // 初始化 MFC 并在失败时显示错误
        if (!AfxWinInit(::GetModuleHandle(NULL), NULL, ::GetCommandLine(), 0))
        {
            
    // TODO: 更改错误代码以符合您的需要
            _tprintf(_T("错误: MFC 初始化失败\n"));
            nRetCode 
    = 1;
        }
        
    else
        {
            fout 
    = fopen("out.txt","w");
            
    string str = "test.html";
            
    string content;
            wchar_t 
    * wcontent = ReadFromHtmlFile(str,content);
            
    int len = wcslen(wcontent);
            
    //cout << len << endl;
            
            TestMSHTML(wcontent);
            delete[] wcontent;
            fclose(fout);
        }
        
        
    return nRetCode;
    }
    输出结果
    TagName:BODY
     TagName:DIV
      TagName:TABLE bgcolor:#ff0000
       TagName:TBODY
        TagName:TR
         TagName:TD border:2 bgcolor:#ffff00
         TagName:TD className:blueBorder Id:qualify1 border:1 bgcolor:#0000ff
        TagName:TR
         TagName:TD
          TagName:P className:blueBorder Id:qualify2 border:1 bgcolor:blue
         TagName:TD
        TagName:TR
         TagName:TD
         TagName:TD
  • 相关阅读:
    CenterNet-TensorRT错误记录
    NAS研究要点分析
    conda如何安装从源下载的离线安装包
    Xavier上pytorch半精度inference问题
    Xavier 使用便携程序
    Xavier疑问
    Python输入(Leetcode
    兴趣爱好
    生活目标
    TX2装机教程
  • 原文地址:https://www.cnblogs.com/speedmancs/p/1797442.html
Copyright © 2011-2022 走看看