zoukankan      html  css  js  c++  java
  • 获取标题

    Sub GetCatalogPages()
          For n = 1 To 20
                CatalogURL = "http://blog.sina.com.cn/s/_" & n & ".html"
               Call GetCatalogByUrl(CatalogURL)
          Next n
    End Sub
    Sub GetCatalogByUrl(ByVal CatalogURL As String)
        'Dim CatalogURL As String
        Dim WebText As String
        Dim OneSpan As Object
        Dim OneA As Object
        Dim Wb As Workbook
        Dim Sht As Worksheet
        Dim Rng As Range
        Dim i As Long, j As Long
        
        Dim StartTime As Variant    '开始时间
        Dim UsedTime As Variant    '使用时间
        StartTime = VBA.Timer    '记录开始时间
        
        AppSettings
    
        
        Set Wb = Application.ThisWorkbook
        Set Sht = Wb.Worksheets("Catalog")
        With Sht
            '.UsedRange.Offset(1).ClearContents
            'i = 1
          endrow = .Cells(.Cells.Rows.Count, 1).End(xlUp).Row
          i = endrow
            '发送请求
            With CreateObject("MSXML2.XMLHTTP")
                .Open "GET", CatalogURL, False
                .Send
                WebText = .responsetext
            End With
            '创建网页文件 创建 Html Dom
            'Microsoft HTML Object Library
            With CreateObject("htmlfile")
                .write WebText
                For Each OneA In .getElementsByTagName("a")
                    href = OneA.href
                    If href Like "*http://blog.sina.com.cn/s/blog_*" Then
                        i = i + 1
                        Sht.Cells(i, 2).Value = href
                       '     Sht.Hyperlinks.Add Sht.Cells(i, 2), href ', href
                    End If
                Next OneA
                 i = endrow
                For Each OneMeta In .getElementsByTagName("meta")
                    If OneMeta.Name = "description" Then
                        cnt = OneMeta.Content
                        'Debug.Print cnt
                        titles = Split(Split(cnt, "xxxx,")(1), ",")
                        For n = LBound(titles) To UBound(titles) Step 1
                            i = i + 1
                            Sht.Cells(i, 1).Value = titles(n)
                        Next n
                    End If
                Next OneMeta
            End With
        End With
        AppSettings False
        UsedTime = VBA.Timer - StartTime
        Debug.Print "采集     " & CatalogURL; " :  " & Format(UsedTime, "#0.0000秒")
        'MsgBox "本次运行耗时:" & Format(UsedTime, "#0.0000秒")
    End Sub
    Sub GetQuestionsByExamUrl()
        
        Dim Wb As Workbook
        Dim Sht As Worksheet
        Dim oSht As Worksheet
    Set Wb = Application.ThisWorkbook
        
        Set Sht = Wb.Worksheets("Catalog")
        Set oSht = Wb.Worksheets("Question")
        
        With Sht
            endrow = .Cells(.Cells.Rows.Count, 1).End(xlUp).Row
            Set Rng = .Range("A2:B" & endrow)
            Arr = Rng.Value
        End With
        With oSht
            r = 1
            For i = LBound(Arr) To UBound(Arr)
                ExamTitle = Arr(i, 1)
                ExamUrl = Arr(i, 2)
                ExamText = GetExamTextByUrl(ExamUrl)
                Ques = RegGetArray(ExamText, "([((]d[))][^
    ]*)[
    ]")
                For n = LBound(Ques) To UBound(Ques) Step 1
                    r = r + 1
                    .Cells(r, 1).Value = ExamTitle
                    .Cells(r, 2).Value = ExamUrl
                    .Cells(r, 3).Value = Ques(n)
                Next n
                
            Next i
        End With
        
        
        
        Set Wb = Nothing
        Set Sht = Nothing
        Set oSht = Nothing
        
    End Sub
    
    Function GetExamTextByUrl(ByVal ExamUrl As String) As String
           '发送请求
            With CreateObject("MSXML2.XMLHTTP")
                .Open "GET", ExamUrl, False
                .Send
                WebText = .responsetext
                'Debug.Print WebText
            End With
            With CreateObject("htmlfile")
                .write WebText
               Set examdiv = .getElementById("sina_keyword_ad_area2")
               ' Debug.Print examdiv.innerText
              GetExamTextByUrl = examdiv.innerText
            End With
    End Function
    Private Sub AppSettings(Optional IsStart As Boolean = True)
        If IsStart Then
            Application.ScreenUpdating = False
            Application.DisplayAlerts = False
            Application.Calculation = xlCalculationManual
            Application.StatusBar = ">>>>>>>>Macro Is Running>>>>>>>>"
        Else
            Application.ScreenUpdating = True
            Application.DisplayAlerts = True
            Application.Calculation = xlCalculationAutomatic
            Application.StatusBar = False
        End If
    End Sub
    
    Public Function RegGetArray(ByVal OrgText As String, ByVal Pattern As String) As String()
        Dim Reg As Object, Mh As Object, OneMh As Object
        Dim Arr() As String, Index As Long
        Dim Elm As String
        Set Reg = CreateObject("Vbscript.Regexp")
        With Reg
            .MultiLine = True
            .Global = True
            .Ignorecase = False
            .Pattern = Pattern
            Set Mh = .Execute(OrgText)
            
            Index = 0
            ReDim Arr(1 To 1)
            For Each OneMh In Mh
                Index = Index + 1
                ReDim Preserve Arr(1 To Index)
                'If OneMh.submatches(0) <> "" Then Elm = OneMh.submatches(0)
                Arr(Index) = OneMh.submatches(0)
                'Debug.Print OneMh.submatches(0)
            Next OneMh
        End With
        RegGetArray = Arr
        Set Reg = Nothing
        Set Mh = Nothing
    End Function
    
    
    Public Function RegGet(ByVal OrgText As String, ByVal Pattern As String) As String
    '传递参数 :原字符串, 匹配模式
        Dim Regex As Object
        Dim Mh As Object
        Set Regex = CreateObject("VBScript.RegExp")
        With Regex
            .Global = True
            .Pattern = Pattern
        End With
        If Regex.test(OrgText) Then
            Set Mh = Regex.Execute(OrgText)
            RegGet = Mh.Item(0).submatches(0)
        Else
            RegGet = ""
        End If
        Set Regex = Nothing
    End Function
    

      

  • 相关阅读:
    45_ansible概述、ansible基础 、ad-hoc、批量配置管理
    44_自定义镜像及仓库、持久化存储 、 Docker网络架构
    43_Docker概述、部署Docker、Docker镜像、Docker基本命令
    42_KVM简介、 Virsh管理 、 自定义虚拟机、虚拟设备管理
    41_iptables防火墙 filter表控制 扩展匹配 nat表典型应用
    40_系统审计 服务安全 Linux安全之打补丁
    39_加密与解密 AIDE入侵检测系统 扫描与抓包
    38_Linux基本防护 用户切换与提权 SSH访问控制 SELinux安全 、SSH访问控制 SELinux安全
    hdu5530
    bzoj3456
  • 原文地址:https://www.cnblogs.com/nextseven/p/8437061.html
Copyright © 2011-2022 走看看