zoukankan      html  css  js  c++  java
  • 20180301越努力越轻松

    '目前存在的BUG
    '图片补丁存在多个URL
    '题目中间存在小数的问题在正则表达式里加上d+D
    Private Declare Function URLDownloadToFile Lib "urlmon" Alias "URLDownloadToFileA" (ByVal pCaller As Long, ByVal szURL As String, ByVal szFileName As String, ByVal dwReserved As Long, ByVal lpfnCB As Long) As Long
    Private Declare Function DeleteUrlCacheEntry Lib "wininet" Alias "DeleteUrlCacheEntryA" (ByVal lpszUrlName As String) As Long
    Sub DownloadImageName(ByVal ImageURL As String, ByVal ImagePath As String)
        Dim lngRetVal As Long
        lngRetVal = URLDownloadToFile(0, ImageURL, ImagePath, 0, 0)
        If lngRetVal = 0 Then
            DeleteUrlCacheEntry ImageURL  '清除缓存
            'MsgBox "成功"
        Else
            'MsgBox "失败"
        End If
    End Sub
    Sub LoopGetSubject()
        Dim StartTime As Variant
        Dim UsedTime As Variant
        StartTime = VBA.Timer
        
        Dim msg As Variant
        msg = MsgBox("Choose 'Yes' to Continue,Choose 'No' to Exit !", vbYesNo, "AuthorQQ 84857038")
        If msg = vbNo Then Exit Sub
        
        Dim Sht As Worksheet
        Set Sht = ThisWorkbook.ActiveSheet
        With Sht
            EndRow = .Cells(.Cells.Rows.Count, 1).End(xlUp).Row
            For i = 2 To EndRow
                SetFontRed .Cells(i, 1).Resize(1, 3)
                FindText = Mid(.Cells(i, 3).Text, 4, Len(.Cells(i, 3).Text) - 8)
                ExamUrl = .Cells(i, 2).Text
                Source = .Cells(i, 1).Text
                Call GetExamTextByUrl(ExamUrl, FindText, Source)
            Next i
        End With
        Set Sht = Nothing
        UsedTime = VBA.Timer - StartTime
        Debug.Print "UsedTime :" & Format(UsedTime, "#0.0000 Seconds")
        MsgBox "UsedTime :" & Format(UsedTime, "#0.0000 Seconds")
    End Sub
    
    Sub ConditionGetSubject()
        Dim StartTime As Variant
        Dim UsedTime As Variant
        StartTime = VBA.Timer
        
        Dim msg As Variant
        
        
        Text = Application.InputBox("请输入筛选关键词,支持LIKE方法的通配符与|分支: ", "AuthorQQ 84857038", , , , , , 2)
        
        If Text = False Then
            msg = MsgBox("本次执行等同于提取所有题目,是否继续?,Choose 'No' to Exit !", vbYesNo, "AuthorQQ 84857038")
            If msg = vbNo Then Exit Sub
        End If
        
        'Condition = "*" & Text & "*"
        
        Dim Sht As Worksheet
        Set Sht = ThisWorkbook.ActiveSheet
        With Sht
            EndRow = .Cells(.Cells.Rows.Count, 1).End(xlUp).Row
            For i = 2 To EndRow
                If InStr(Text, "|") = 0 Then
                    Condition = "*" & Text & "*"
                    If .Cells(i, 3).Text Like Condition Then
                        SetFontRed .Cells(i, 1).Resize(1, 3)
                        FindText = Mid(.Cells(i, 3).Text, 4, Len(.Cells(i, 3).Text) - 8)
                        ExamUrl = .Cells(i, 2).Text
                        Source = .Cells(i, 1).Text
                        Call GetExamTextByUrl(ExamUrl, FindText, Source)
                    End If
                Else
                    conditions = Split(Text, "|")
                    For n = LBound(conditions) To UBound(conditions) Step 1
                        Condition = "*" & conditions(n) & "*"
                        If .Cells(i, 3).Text Like Condition Then
                            SetFontRed .Cells(i, 1).Resize(1, 3)
                            FindText = Mid(.Cells(i, 3).Text, 4, Len(.Cells(i, 3).Text) - 8)
                            ExamUrl = .Cells(i, 2).Text
                            Source = .Cells(i, 1).Text
                            Call GetExamTextByUrl(ExamUrl, FindText, Source)
                        End If
                    Next n
                End If
            Next i
            
            
        End With
        Set Sht = Nothing
        UsedTime = VBA.Timer - StartTime
        Debug.Print "UsedTime :" & Format(UsedTime, "#0.0000 Seconds")
        MsgBox "UsedTime :" & Format(UsedTime, "#0.0000 Seconds")
    End Sub
    
    
    
    Sub GetSubject()
        Dim Rng As Range
        Dim OneCell As Range
        Set Rng = Application.Selection
        For Each OneCell In Rng.Cells
            If OneCell.Column = 3 Then
                If Len(OneCell.Text) > 0 Then
                    SetFontRed OneCell
                    FindText = Mid(OneCell.Text, 4, Len(OneCell.Text) - 8)
                    ExamUrl = OneCell.Offset(0, -1).Text
                    Source = OneCell.Offset(0, -2).Text
                    Call GetExamTextByUrl(ExamUrl, FindText, Source)
                End If
            End If
        Next OneCell
    End Sub
    Sub GetExamTextByUrl(ByVal ExamUrl As String, ByVal FindText As String, ByVal Source As String)
        Dim Subject As String
        Dim HasImageText As String
        Dim Question As String
        Dim ImageURL As String
        Dim Answer As String
        Dim HasGetContent As Boolean
        Dim docName As String
        Dim docPath As String
        Dim Independent As Boolean
        Dim IsQuestion As Boolean
        Dim IsAnswer As Boolean
        Dim oneP As Object
        Dim nextTag As Object
        
        'send request
        With CreateObject("MSXML2.XMLHTTP")
            .Open "GET", ExamUrl, False
            .Send
            WebText = .responsetext
            'Debug.Print WebText
            ' Stop
        End With
        With CreateObject("htmlfile")
            .write WebText
            Set examdiv = .getElementById("sina_keyword_ad_area2")
            '获取试卷文本内容
            ExamText = examdiv.innerText
            
            '判断试卷是否含有独立答案
            Independent = ExamText Like "*参考答案*"
            'Debug.Print "  Independent "; Independent
            '设定搜集题目Word文档名称和路径
            docName = Application.ActiveSheet.Name & "_题目搜集.doc"
            docPath = ThisWorkbook.Path & "" & docName
            '判断某个段落是否为题目/答案的开始
            IsQuestion = False
            IsAnswer = False
            '判断是否已经提取到内容
            HasGetContent = False
            '循环所有段落
            For Each oneP In .getElementsByTagName("p")
                If HasGetContent = False Then
                    '判断某段内容是否为题号行
                    'If oneP.innerText Like "##.*" Or oneP.innerText Like "##.*" Then
                    If RegTest(oneP.innerText, "(d{1,2})[..]D.*") Then
                        '保留题干源码 并去除掉标题title的内容,提取出汉字作为分隔符
                        pInnerHtml = oneP.innerHTML
                        pInnerHtml = Replace(pInnerHtml, Source, "")
                        HeadSp = RegGet(pInnerHtml, "([u4e00-u9fa5]{5,})")
                        
                        Subject = ""
                        Question = ""
                        ImageURL = ""
                        Answer = ""
                        '开始记录题干内容
                        Subject = oneP.innerText
                        'Debug.Print OneP.innerText
                    Else
                        If InStr(oneP.innerText, FindText) = 0 Then
                            '过滤不相干的问题,仅保留符合条件的问题
                            If Not RegTest(oneP.innerText, "([((]d[))]).*") Then
                                '继续记录问题内容
                                Subject = Subject & vbCrLf & oneP.innerText
                            End If
                        End If
                    End If
                    
                    '提取题目的序号和问题的序号
                    If InStr(oneP.innerText, FindText) > 0 Then
                        '保留问题的源码,删除掉标题Title的内容 并提取出汉字 作为分隔符
                        questionHtml = oneP.innerHTML
                        questionHtml = Replace(questionHtml, Source, "")
                        TailSp = RegGet(questionHtml, "([u4e00-u9fa5]{5,})")
                         
                        'Debug.Print ">>>>>汉字分隔符>>>>"; HeadSp
                        'Debug.Print ">>>>>查找>>>>" & FindText; InStr(WebText, TailSp) > 0
                        HasImageText = Split(WebText, TailSp)(0)
                        pos = InStrRev(HasImageText, HeadSp)
                        HasImageText = Mid(HasImageText, pos)
                        
                        SubjectIndex = RegGet(Subject, "(d{1,2})[..]D.*")
                        Question = oneP.innerText
                        questionIndex = RegGet(Question, "[((](d)[))].*")
                        'Debug.Print "题序:"; SubjectIndex; "   问序: "; questionIndex
                        HasGetContent = True
                    End If
                    
                Else
                    '提取内容后 开始找答案
                    '试卷不含独立答案,答案就附在每道题后面
                    If Independent = False Then
                        
                        If IsAnswer = False Then
                            If RegTest(oneP.innerText, "[((](" & questionIndex & ")[))].*") Then
                                Answer = oneP.innerText
                                IsAnswer = True
                                'Exit For
                            End If
                        Else
                            Debug.Print oneP.innerText
                            If RegTest(oneP.innerText, "[((](d)[))].*") Or RegTest(oneP.innerText, "(d{1,2})[..]D.*") Then
                                Exit For
                            Else
                                Answer = Answer & oneP.innerText
                            End If
                        End If
                    Else
                        '试卷还有独立参考答案
                        '判断某段内容的题号是否符合条件
                        If RegTest(oneP.innerText, "(" & SubjectIndex & ")[..].D*") Then
                            IsQuestion = True
                            'Debug.Print isQuestion
                        End If
                        If IsQuestion = True Then
                            '判断某段内容的问题序号是否符合条件
                            If IsAnswer = False Then
                                If RegTest(oneP.innerText, "([((]" & questionIndex & "[))]).*") Then
                                    '记录问题答案
                                    Answer = oneP.innerText
                                    IsAnswer = True
                                    'Exit For
                                End If
                            Else
                                Debug.Print oneP.innerText
                                If RegTest(oneP.innerText, "[((](d)[))].*") Or RegTest(oneP.innerText, "(d{1,2})[..]D.*") Then
                                    Exit For
                                Else
                                    Answer = Answer & oneP.innerText
                                End If
                            End If
                        End If
                    End If
                End If
            Next oneP
            '图片地址处理
            ' ImageURL = Mid(ImageURL, 2)
            '测试
            
            'Debug.Print ImageURL
            Debug.Print Question
            Debug.Print Answer
        End With
        '<span style="font-family:">43.</span>
        '【补丁,有待改进】2017年下半年部分图片提取不到的问题修正
        ImageURL = ""
        If Len(ImageURL) = 0 Then
        
            imgs = RegGetArray(HasImageText, "real_src =""(http.*?)""")
            
            For n = LBound(imgs) To UBound(imgs) Step 1
                'Debug.Print imgs(n)
                ImageURL = ImageURL & "|" & imgs(n)
            Next n
            
            'Stop
            ImageURL = Mid(ImageURL, 2)
            Debug.Print "所有图片地址:"; ImageURL
            'Stop
            'hasimagetext = Split(hasimagetext, "real_src")(UBound(Split(hasimagetext, "real_src")))
            'ImageURL = Split(hasimagetext, """")(1)
        End If
        
        '输出题目内容到Word文档
        Dim wdApp As Object
        Dim Doc As Object
        
        On Error Resume Next
        Set wdApp = GetObject(, "Word.Application")
        On Error GoTo 0
        If Not wdApp Is Nothing Then
            wdApp.Visible = True
            On Error Resume Next
            Set Doc = wdApp.documents(docName)
            On Error GoTo 0
            If Doc Is Nothing Then
                Set Doc = wdApp.documents.Add()
                Doc.SaveAs docPath
            End If
        Else
            Set wdApp = CreateObject("Word.Application")
            wdApp.Visible = True
            
            If Dir(docPath) <> "" Then
                Set Doc = wdApp.documents.Open(docPath)
            Else
                Set Doc = wdApp.documents.Add()
                Doc.SaveAs docPath
            End If
        End If
        
        Doc.Activate
        wdApp.Selection.EndKey 6
        wdApp.Selection.TypeParagraph
        wdApp.Selection.InsertBreak 7
        '输出题干内容
        'Debug.Print Subject
        Subject = RegReplace(Subject, "(" & SubjectIndex & "[..])") & "."
        'Debug.Print Subject
        'Stop
        wdApp.Selection.TypeText Text:=Subject
        wdApp.Selection.TypeParagraph
        
        '下载图片并插入WORD文档
        If ImageURL <> "" Then
            If InStr(ImageURL, "|") = 0 Then
                ImagePath = ThisWorkbook.Path & Application.PathSeparator & "tmp.jpg"
                DownloadImageName ImageURL, ImagePath
                wdApp.Selection.InlineShapes.AddPicture Filename:=ImagePath, LinkToFile:=False, SaveWithDocument:=True
                wdApp.Selection.TypeParagraph
                Kill ImagePath
                'Stop'
            Else
                ImageURLs = Split(ImageURL, "|")
                For n = LBound(ImageURLs) To UBound(ImageURLs) Step 1
                    ImagePath = ThisWorkbook.Path & Application.PathSeparator & n & "tmp.jpg"
                    DownloadImageName ImageURLs(n), ImagePath
                    Debug.Print ImagePath
                    wdApp.Selection.InlineShapes.AddPicture Filename:=ImagePath, LinkToFile:=False, SaveWithDocument:=True
                    wdApp.Selection.TypeParagraph
                    Kill ImagePath
                Next n
            End If
        End If
        '输出问题内容
        wdApp.Selection.TypeText Text:=RegReplace(Question, "([((]d[))])")
        wdApp.Selection.TypeParagraph
        '输出答案内容
        
        sp = RegGet(Answer, "([((]" & questionIndex & "[))]).*")
        'Debug.Print Sp
        If Len(sp) > 0 Then
            Answer = Split(Answer, sp)(1)
            sp = RegGet(Answer, "([((]" & questionIndex + 1 & "[))]).*")
            If Len(sp) > 0 Then
                Answer = Split(Answer, sp)(0)
            End If
        End If
        'Debug.Print Answer
        Answer = RegReplace(Answer, "(【来源】.*)")
        Answer = RegReplace(Answer, "(【解析】.*)")
        'Debug.Print Answer
        'Stop
        wdApp.Selection.TypeText Text:="【答案】" & Answer
        
        
        Source = Replace(Source, "【", "")
        Source = Replace(Source, "】", "")
        Source = Replace(Source, "解析", "")
        
        wdApp.Selection.TypeParagraph
        wdApp.Selection.TypeText Text:="[ 来源:" & Source & " 第" & SubjectIndex & "题 第(" & questionIndex & ")问 ]"
        wdApp.Selection.TypeParagraph
        
        Set wdApp = Nothing
        Set Doc = Nothing
        Set oneP = Nothing
    End Sub
    Private Function RegTest(ByVal OrgText As String, ByVal Pattern As String) As Boolean
    '传递参数 :原字符串, 匹配模式
        Dim Regex As Object
        Set Regex = CreateObject("VBScript.RegExp")
        With Regex
            .Global = True
            .Pattern = Pattern
        End With
        RegTest = Regex.test(OrgText)
        Set Regex = Nothing
    End Function
    Public Function RegGet(ByVal OrgText As String, ByVal Pattern As String) As String
    '传递参数 :原字符串, 匹配模式
        Dim Regex As Object
        Dim Mh As Object
        Set Regex = CreateObject("VBScript.RegExp")
        With Regex
            .Global = True
            .Pattern = Pattern
        End With
        If Regex.test(OrgText) Then
            Set Mh = Regex.Execute(OrgText)
            RegGet = Mh.Item(0).submatches(0)
        Else
            RegGet = ""
        End If
        Set Regex = Nothing
    End Function
    Sub SetFontRed(ByVal Rng As Range)
        With Rng.Font
            .Color = -16776961
            .TintAndShade = 0
        End With
    End Sub
    Public Function RegReplace(ByVal OrgText As String, ByVal Pattern As String, Optional RepStr As String = "") As String
    '传递参数 :原字符串, 匹配模式 ,替换字符
        Dim Regex As Object
        Dim newText As String
        Set Regex = CreateObject("VBScript.RegExp")
        With Regex
            .Global = True
            .Pattern = Pattern
        End With
        newText = Regex.Replace(OrgText, RepStr)
        RegReplace = newText
        Set Regex = Nothing
    End Function
    Public Function RegGetArray(ByVal OrgText As String, ByVal Pattern As String) As String()
        Dim Reg As Object, Mh As Object, OneMh As Object
        Dim Arr() As String, Index As Long
        Dim Elm As String
        Set Reg = CreateObject("Vbscript.Regexp")
        With Reg
            .MultiLine = True
            .Global = True
            .Ignorecase = False
            .Pattern = Pattern
            Set Mh = .Execute(OrgText)
            
            Index = 0
            ReDim Arr(1 To 1)
            For Each OneMh In Mh
                Index = Index + 1
                ReDim Preserve Arr(1 To Index)
                'If OneMh.submatches(0) <> "" Then Elm = OneMh.submatches(0)
                Arr(Index) = OneMh.submatches(0)
                'Debug.Print OneMh.submatches(0)
            Next OneMh
        End With
        RegGetArray = Arr
        Set Reg = Nothing
        Set Mh = Nothing
    End Function
    
    Function RealInnerHtml(ByVal OrgInnerHtml) As String
          Dim x As String
          x = OrgInnerHtml
          x = Replace(x, "SPAN", "span")
          x = Replace(x, "FONT-SIZE", "font-size")
          x = Replace(x, "FONT-FAMILY", "font-family")
          x = Replace(x, "FONT", "font")
          x = Replace(x, "WBR", "wbr")
          x = Replace(x, "COLOR", "color")
          RealInnerHtml = x
    End Function
    Public Function RegGetLast(ByVal OrgText As String, ByVal Pattern As String) As String
    '传递参数 :原字符串, 匹配模式
        Dim Regex As Object
        Dim Mh As Object
        Set Regex = CreateObject("VBScript.RegExp")
        With Regex
            .Global = True
            .Pattern = Pattern
        End With
        If Regex.test(OrgText) Then
            Set Mh = Regex.Execute(OrgText)
            'RegGetLast = Mh.Item(0).submatches(0)
            For Each OneMh In Mh
                RegGetLast = OneMh.submatches(0)
            Next OneMh
        Else
            RegGetLast = ""
        End If
        Set Regex = Nothing
    End Function
    

      

  • 相关阅读:
    BZOJ 3028 食物 ——生成函数
    BZOJ 1933 [Shoi2007]Bookcase 书柜的尺寸 ——动态规划
    论咸鱼的自我修养之网络流
    SPOJ LCS2 Longest Common Substring II ——后缀自动机
    SPOJ NSUBSTR Substrings ——后缀自动机
    BZOJ 1879 [Sdoi2009]Bill的挑战 ——状压DP
    BZOJ 1875 [SDOI2009]HH去散步 ——动态规划 矩阵乘法
    BZOJ 1226 [SDOI2009]学校食堂Dining ——状压DP
    BZOJ 4566 [Haoi2016]找相同字符 ——广义后缀自动机
    BZOJ 3473 字符串 ——广义后缀自动机
  • 原文地址:https://www.cnblogs.com/nextseven/p/8487515.html
Copyright © 2011-2022 走看看