zoukankan      html  css  js  c++  java
  • 2018-02-16 GetSameTypeQuestion

    '目前存在的BUG
    '图片补丁存在多个URL
    Private Declare Function URLDownloadToFile Lib "urlmon" Alias "URLDownloadToFileA" (ByVal pCaller As Long, ByVal szURL As String, ByVal szFileName As String, ByVal dwReserved As Long, ByVal lpfnCB As Long) As Long
    Private Declare Function DeleteUrlCacheEntry Lib "wininet" Alias "DeleteUrlCacheEntryA" (ByVal lpszUrlName As String) As Long
    Sub DownloadImageName(ByVal ImageURL As String, ByVal ImagePath As String)
        Dim lngRetVal As Long
        lngRetVal = URLDownloadToFile(0, ImageURL, ImagePath, 0, 0)
        If lngRetVal = 0 Then
            DeleteUrlCacheEntry ImageURL  '清除缓存
            'MsgBox "成功"
        Else
            'MsgBox "失败"
        End If
    End Sub
    Sub LoopGetSubject()
        Dim StartTime As Variant
        Dim UsedTime As Variant
        StartTime = VBA.Timer
        Dim Sht As Worksheet
        Set Sht = ThisWorkbook.ActiveSheet
        With Sht
            EndRow = .Cells(.Cells.Rows.Count, 1).End(xlUp).Row
            For i = 2 To EndRow
                SetFontRed .Cells(i, 1).Resize(1, 3)
                FindText = Mid(.Cells(i, 3).Text, 4, Len(.Cells(i, 3).Text) - 8)
                ExamUrl = .Cells(i, 2).Text
                Call GetExamTextByUrl(ExamUrl, FindText)
            Next i
        End With
        Set Sht = Nothing
        UsedTime = VBA.Timer - StartTime
        Debug.Print "UsedTime :" & Format(UsedTime, "#0.0000 Seconds")
        MsgBox "UsedTime :" & Format(UsedTime, "#0.0000 Seconds")
    End Sub
    Sub GetSubject()
        SetFontRed Application.ActiveCell
        FindText = Mid(Application.ActiveCell.Text, 4, Len(Application.ActiveCell.Text) - 8)
        ExamUrl = Application.ActiveCell.Offset(0, -1).Text
        Call GetExamTextByUrl(ExamUrl, FindText)
    End Sub
    Sub GetExamTextByUrl(ByVal ExamUrl As String, ByVal FindText As String)
        Dim Subject As String
        Dim Question As String
        Dim ImageURL As String
        Dim Answer As String
        Dim HasGetContent As Boolean
        Dim docName As String
        Dim docPath As String
        Dim Independent As Boolean
        Dim IsQuestion As Boolean
        Dim IsAnswer As Boolean
        Dim oneP As Object
        Dim nextTag As Object
        
        'send request
        With CreateObject("MSXML2.XMLHTTP")
            .Open "GET", ExamUrl, False
            .Send
            WebText = .responsetext
            'Debug.Print WebText
        End With
        With CreateObject("htmlfile")
            .write WebText
            Set examdiv = .getElementById("sina_keyword_ad_area2")
            '获取试卷文本内容
            ExamText = examdiv.innerText
            '判断试卷是否含有独立答案
            Independent = ExamText Like "*参考答案*"
            'Debug.Print "  Independent "; Independent
            '设定搜集题目Word文档名称和路径
            docName = Application.ActiveSheet.Name & "_题目搜集.doc"
            docPath = ThisWorkbook.Path & "" & docName
            '判断某个段落是否为题目/答案的开始
            IsQuestion = False
            IsAnswer = False
            '判断是否已经提取到内容
            HasGetContent = False
            '循环所有段落
            For Each oneP In .getElementsByTagName("p")
                If HasGetContent = False Then
                    '判断某段内容是否为题号行
                    If oneP.innerText Like "##.*" Or oneP.innerText Like "##.*" Then
                        Subject = ""
                        Question = ""
                        ImageURL = ""
                        Answer = ""
                        '开始记录题干内容
                        Subject = oneP.innerText
                        'Debug.Print OneP.innerText
                    Else
                        If InStr(oneP.innerText, FindText) = 0 Then
                            '过滤不相干的问题,仅保留符合条件的问题
                            If Not RegTest(oneP.innerText, "([((]d[))]).*") Then
                                '继续记录问题内容
                                Subject = Subject & oneP.innerText
                            End If
                        End If
                    End If
                    '提取题目图片的地址
                    Set nextTag = oneP.NextSibling
                    If Not nextTag Is Nothing Then
                        If UCase(nextTag.tagName) = "A" Then
                            If nextTag.HasChildNodes Then
                                If nextTag.href Like "http://photo.blog.sina.com.cn/showpic.html*" Then
                                    ImageURL = ImageURL & "|" & nextTag.FirstChild.getAttribute("real_src")
                                    'Debug.Print ImageURL
                                End If
                            End If
                        End If
                    End If
                    
                    '提取题目的序号和问题的序号
                    If InStr(oneP.innerText, FindText) > 0 Then
                        SubjectIndex = RegGet(Subject, "(d{1,2})[..].*")
                        Question = oneP.innerText
                        questionIndex = RegGet(Question, "[((](d)[))].*")
                        'Debug.Print "题序:"; SubjectIndex; "   问序: "; questionIndex
                        HasGetContent = True
                    End If
                    
                Else
                    '提取内容后 开始找答案
                    '试卷不含独立答案,答案就附在每道题后面
                    If Independent = False Then
                        
                        If IsAnswer = False Then
                            If RegTest(oneP.innerText, "[((](" & questionIndex & ")[))].*") Then
                                Answer = oneP.innerText
                                IsAnswer = True
                                'Exit For
                            End If
                        Else
                            Debug.Print oneP.innerText
                            If RegTest(oneP.innerText, "[((](d)[))].*") Or RegTest(oneP.innerText, "(d{1,2})[..].*") Then
                                Exit For
                            Else
                                Answer = Answer & oneP.innerText
                            End If
                        End If
                        
                        
                        
                        
                    Else
                        '试卷还有独立参考答案
                        '判断某段内容的题号是否符合条件
                        If RegTest(oneP.innerText, "(" & SubjectIndex & ")[..].*") Then
                            IsQuestion = True
                            'Debug.Print isQuestion
                        End If
                        If IsQuestion = True Then
                            '判断某段内容的问题序号是否符合条件
                            If IsAnswer = False Then
                                If RegTest(oneP.innerText, "([((]" & questionIndex & "[))]).*") Then
                                    '记录问题答案
                                    Answer = oneP.innerText
                                    IsAnswer = True
                                    'Exit For
                                End If
                            Else
                                Debug.Print oneP.innerText
                                If RegTest(oneP.innerText, "[((](d)[))].*") Or RegTest(oneP.innerText, "(d{1,2})[..].*") Then
                                    Exit For
                                Else
                                    Answer = Answer & oneP.innerText
                                End If
                            End If
                        End If
                    End If
                End If
            Next oneP
            '图片地址处理
            ImageURL = Mid(ImageURL, 2)
            '测试
            Debug.Print Subject
            Debug.Print ImageURL
            Debug.Print Question
            Debug.Print Answer
        End With
        
        '【补丁,有待改进】2017年下半年部分图片提取不到的问题修正
        If Len(ImageURL) = 0 Then
            hasimagetext = Split(WebText, FindText)(0)
            hasimagetext = Split(hasimagetext, "real_src")(UBound(Split(hasimagetext, "real_src")))
            ImageURL = Split(hasimagetext, """")(1)
        End If
        
        '输出题目内容到Word文档
        Dim wdApp As Object
        Dim Doc As Object
        
        On Error Resume Next
        Set wdApp = GetObject(, "Word.Application")
        On Error GoTo 0
        If Not wdApp Is Nothing Then
            wdApp.Visible = True
            On Error Resume Next
            Set Doc = wdApp.documents(docName)
            On Error GoTo 0
            If Doc Is Nothing Then
                Set Doc = wdApp.documents.Add()
                Doc.SaveAs docPath
            End If
        Else
            Set wdApp = CreateObject("Word.Application")
            wdApp.Visible = True
            Set Doc = wdApp.documents.Add()
            Doc.SaveAs docPath
        End If
        
        Doc.Activate
        wdApp.Selection.EndKey 6
        wdApp.Selection.TypeParagraph
        wdApp.Selection.InsertBreak 7
        '输出题干内容
        wdApp.Selection.TypeText Text:=Subject
        wdApp.Selection.TypeParagraph
        
        '下载图片并插入WORD文档
        If ImageURL <> "" Then
            If InStr(ImageURL, "|") = 0 Then
                ImagePath = ThisWorkbook.Path & Application.PathSeparator & "tmp.jpg"
                DownloadImageName ImageURL, ImagePath
                wdApp.Selection.InlineShapes.AddPicture Filename:=ImagePath, LinkToFile:=False, SaveWithDocument:=True
                wdApp.Selection.TypeParagraph
                Kill ImagePath
                'Stop
            Else
                ImageURLs = Split(ImageURL, "|")
                For n = LBound(ImageURLs) To UBound(ImageURLs) Step 1
                    ImagePath = ThisWorkbook.Path & Application.PathSeparator & "tmp.jpg"
                    DownloadImageName ImageURL, ImagePath
                    wdApp.Selection.InlineShapes.AddPicture Filename:=ImagePath, LinkToFile:=False, SaveWithDocument:=True
                    wdApp.Selection.TypeParagraph
                    Kill ImagePath
                Next n
            End If
        End If
        '输出问题内容
        wdApp.Selection.TypeText Text:=Question
        wdApp.Selection.TypeParagraph
        '输出答案内容
        wdApp.Selection.TypeText Text:="【答案】" & Answer
        wdApp.Selection.TypeParagraph
        Set wdApp = Nothing
        Set Doc = Nothing
        Set oneP = Nothing
    End Sub
    Private Function RegTest(ByVal OrgText As String, ByVal Pattern As String) As Boolean
    '传递参数 :原字符串, 匹配模式
        Dim Regex As Object
        Set Regex = CreateObject("VBScript.RegExp")
        With Regex
            .Global = True
            .Pattern = Pattern
        End With
        RegTest = Regex.test(OrgText)
        Set Regex = Nothing
    End Function
    Public Function RegGet(ByVal OrgText As String, ByVal Pattern As String) As String
    '传递参数 :原字符串, 匹配模式
        Dim Regex As Object
        Dim Mh As Object
        Set Regex = CreateObject("VBScript.RegExp")
        With Regex
            .Global = True
            .Pattern = Pattern
        End With
        If Regex.test(OrgText) Then
            Set Mh = Regex.Execute(OrgText)
            RegGet = Mh.Item(0).submatches(0)
        Else
            RegGet = ""
        End If
        Set Regex = Nothing
    End Function
    Sub SetFontRed(ByVal Rng As Range)
        With Rng.Font
            .Color = -16776961
            .TintAndShade = 0
        End With
    End Sub
    

      

  • 相关阅读:
    博客园的第一篇博客
    I-如何办好比赛
    塞特斯玛斯塔
    字典序最大的子序列
    百练POJ 1657:Distance on Chessboard
    百练POJ2750:鸡兔同笼
    HDU3790最短路径问题
    HDU 2544最短路Dijkstra算法
    快速幂【倍增+二分】
    树的高度
  • 原文地址:https://www.cnblogs.com/nextseven/p/8476576.html
Copyright © 2011-2022 走看看