zoukankan      html  css  js  c++  java
  • VBA分别使用MSXML的DOM属性和XPATH进行网页爬虫

    本文要重点介绍的是VBA中的XmlHttp对象(MSXML2.XMLHTTP或MSXML.XMLHTTP),它可以向http服务器发送请求并使用微软XML文档对象模型Microsoft XML Document Object Model (DOM)处理回应。练习抓取的网页例子是https://www.qppstudio.net/public-holidays-by-date/month1.htm

    第一种方法——DOM经典属性:

    参考http://club.excelhome.net/thread-1233167-1-1.htmlhttps://www.jianshu.com/p/1920550cb4a6

    Sub Main()
    ActiveSheet.Cells.Clear
    Url = "https://www.qppstudio.net/public-holidays-by-date/month1.htm"
    Set oHttp = CreateObject("MSXML2.XMLHTTP") '创建一个xmlhttp对象
    Set odom = CreateObject("htmlfile") '创建一个Dom对象
    With oHttp
    'open,创建一个新的http请求,并指定此请求的方法、URL以及验证信息(用户名/密码) 'send,发送请求到http服务器并接收回应 .Open "GET", Url, False '使用Open方法,用get请求,False代表非异步加载
        .Open "GET", Url, False '使用Open方法,用get请求,False代表非异步加载
        .send '将open方法的信息发送给网页服务器
         odom.body.innerHTML = .responseText '将响应网页的HTML赋值给Dom对象,并只需要body标签里面的内容
    End With
    dom (odom)
    End Sub
    Sub dom(odom As Object) i = 2 For Each Item In odom.all If Item.className = "list-item" Then For Each itemch In Item.Children If itemch.className = "list-item-heading" Then Range("a" & i) = itemch.innerText ElseIf itemch.className = "list-subitem" Then Range("b" & i) = itemch.Children(1).innerText Range("c" & i) = itemch.Children(3).innerText i = i + 1 End If Next Exit For End If Next End Sub

    第二种方法——转换为XML并使用XPATH(比较麻烦):

    参考http://club.excelhome.net/thread-1233167-1-1.html

    Sub Main()
    Url = "https://www.qppstudio.net/public-holidays-by-date/month1.htm"
    Set oHttp = CreateObject("MSXML2.XMLHTTP") '创建一个xmlhttp对象
    Set odom = CreateObject("htmlfile") '创建一个Dom对象
    With oHttp
    'open,创建一个新的http请求,并指定此请求的方法、URL以及验证信息(用户名/密码) 'send,发送请求到http服务器并接收回应 .Open "GET", Url, False '使用Open方法,用get请求,False代表非异步加载
        .Open "GET", Url, False '使用Open方法,用get请求,False代表非异步加载
        .send '将open方法的信息发送给网页服务器
         odom.body.innerHTML = .responseText '将响应网页的HTML赋值给Dom对象,并只需要body标签里面的内容
    End With
    
    '需要先将html文本进行格式化才能写入xmldoc,才能使用自带的xpath,比如节点一定要有开始和结束,节点属性一定要用双引号括起来
    '例如
    'sXML = "<NewDataSet class=""123""><MyTable>"
    'sXML = sXML & "     <Active>true</Active>"
    'sXML = sXML & "     <SQLServer>APCD03</SQLServer>"
    'sXML = sXML & "     <SQLDatabase>OIS</SQLDatabase>"
    'sXML = sXML & " </MyTable>"
    'sXML = sXML & " <MyTable>"
    'sXML = sXML & "     <Active>false</Active>"
    'sXML = sXML & "     <SQLServer>APCD04</SQLServer>"
    'sXML = sXML & "     <SQLDatabase>OIS</SQLDatabase>"
    'sXML = sXML & " </MyTable></NewDataSet>"
    'Debug.Print sXML
    Dim sXML As String, xDoc, a, nodelist, node
    For Each Item In odom.all
      If Item.className = "list-item" Then
        sXML = Item.outerHTML
        Exit For
      End If
    Next
    sXML = rr(sXML, "<IMG.*?>", "")
    sXML = rr(sXML, "class=.*?>", ">")
    Set xDoc = CreateObject("MSXML.DOMDocument")
    a = xDoc.LoadXML(sXML)
    'a为true时代表写入成功,为false代表写入失败
    'Debug.Print a
    '一旦a为false就可以先写入txt再看哪些还不符合xml规范
    'file = ThisWorkbook.Path & "	est.txt"
    'Open file For Output As #1
    'Print #1, sXML
    'Close #1
    Set nodelist = xDoc.SelectNodes("//P")
    Set node = xDoc.SelectSingleNode("//P")
    'Debug.Print nodelist.Length
    For Each Item In nodelist
    Debug.Print Item.Text
    Next
    End Sub
    
    Function rr(str As String, pattern As String, repstr As String)
    Set reg = CreateObject("vbscript.regexp")
    With reg
    .Global = True
    .pattern = pattern
    End With
    rr = reg.Replace(str, repstr)
    End Function
  • 相关阅读:
    day02_接口测试流程
    day01_接口测试常识丶HTTP协议
    day03_元素操作丶浏览器操作方法丶鼠标操作
    day05_数组
    day04_运算符
    day03_数据类型丶字符编码丶基本数据类型转换
    day02_注释丶关键字丶标识符丶常量丶变量
    day04_数据序列之字符串
    day03_流程控制语句
    day02_输入数据丶数据类型转换丶运算符
  • 原文地址:https://www.cnblogs.com/JTCLASSROOM/p/11132518.html
Copyright © 2011-2022 走看看