zoukankan      html  css  js  c++  java
  • 2017-11-11 Sa Oct Spider

    2017-11-11 Sa Oct Spider

    4:33 PM

    Again.

    Firstly test liburl:

    # -*- coding: utf-8 -*-
    
    import json
    import datetime
    import HTMLParser  
    import urlparse  
    import urllib  
    import urllib2  
    import cookielib  
    import string  
    import re
    import sys
    import threading
    import os
    import tempfile
    from bs4 import BeautifulSoup
    from prettytable import PrettyTable
    
    reload(sys)
    sys.setdefaultencoding("utf-8")
    
    def openWithBrowser(filename):
        os.system('python -m webbrowser "{}"'.format(filename))
    
    name = 'xxx'
    no = 'xxx'
    
    hosturl = 'http://android.gdgzez.com.cn/szxy/yanjiuxingxuexi/student_login.aspx'
    posturl = 'http://android.gdgzez.com.cn/szxy/yanjiuxingxuexi/student_login.aspx'
    cj = cookielib.LWPCookieJar()
    cookie_support = urllib2.HTTPCookieProcessor(cj)
    opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)  
    urllib2.install_opener(opener)
    h = urllib2.urlopen(hosturl)
    
    headers = {
        'User-Agent' : 'Mozilla/5.0 (iPad; U; CPU OS 3_2_1 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Mobile/7B405',
        'Referer' : 'http://android.gdgzez.com.cn/szxy/yanjiuxingxuexi/student_login.aspx'
    }
    
    idx = name + ' ' + no
    
    postData = {
        '__VIEWSTATE' : '/wEPDwULLTE3NzI1OTE3OTFkZJk4xBOpTGvHILGFeCbFQfQQv9dbWzdoB6AOexN4BTx0',
        '__EVENTVALIDATION' : '/wEWBQLMmfO1BgL7uPQdAt765bwOAsaZ0ZUMApn3i+sBQi9nlVqoFrBfAjkxtVAWnUBZPnKm6VON7F01iBJzBXw=',
        'name' : name,
        'pwd' : '12345',
        'btnchange' : '登录',
        'xuehao' : no
    }
    
    postData = urllib.urlencode(postData)
    request = urllib2.Request(posturl, postData, headers)
    response = urllib2.urlopen(request, timeout=5)
    
    with tempfile.NamedTemporaryFile(suffix='.html', delete=False) as f:
        f.write(response.read())
        openWithBrowser(f.name)
    

    Good. Nothing changed. Them apply the table.

    5:09 PM

    # -*- coding: utf-8 -*-
    
    import json
    import datetime
    import HTMLParser  
    import urlparse  
    import urllib  
    import urllib2  
    import cookielib  
    import string  
    import re
    import sys
    import threading
    import os
    import tempfile
    from bs4 import BeautifulSoup
    from prettytable import PrettyTable
    
    reload(sys)
    sys.setdefaultencoding("utf-8")
    
    def openWithBrowser(filename):
        os.system('python -m webbrowser "{}"'.format(filename))
    
    version = datetime.datetime.now().strftime("%y-%m-%d %a %b %H-%M-%S result")
    os.mkdir(version)
    
    hosturl = 'http://android.gdgzez.com.cn/szxy/yanjiuxingxuexi/student_login.aspx'
    posturl = 'http://android.gdgzez.com.cn/szxy/yanjiuxingxuexi/student_login.aspx'
    cj = cookielib.LWPCookieJar()
    cookie_support = urllib2.HTTPCookieProcessor(cj)
    opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)  
    urllib2.install_opener(opener)
    h = urllib2.urlopen(hosturl)
    
    headers = {
        'User-Agent' : 'Mozilla/5.0 (iPad; U; CPU OS 3_2_1 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Mobile/7B405',
        'Referer' : 'http://android.gdgzez.com.cn/szxy/yanjiuxingxuexi/student_login.aspx'
    }
    
    def get(name, no):
        global hosturl, posturl, cj, cookie_support, opener, h, headers
    
        postData = {
            '__VIEWSTATE' : '/wEPDwULLTE3NzI1OTE3OTFkZJk4xBOpTGvHILGFeCbFQfQQv9dbWzdoB6AOexN4BTx0',
            '__EVENTVALIDATION' : '/wEWBQLMmfO1BgL7uPQdAt765bwOAsaZ0ZUMApn3i+sBQi9nlVqoFrBfAjkxtVAWnUBZPnKm6VON7F01iBJzBXw=',
            'name' : name,
            'pwd' : '12345',
            'btnchange' : '登录',
            'xuehao' : no
        }
    
        postData = urllib.urlencode(postData)
        request = urllib2.Request(posturl, postData, headers)
        response = urllib2.urlopen(request, timeout=5)
    
        with open('{}/{}.html'.format(version, no), 'w') as f:
            f.write(response.read().replace('<head>', '<head><meta charset="utf-8">'))
    
    with open('result_utf8.csv', "rb") as f:
        print version
        for line in f:
            (name, no, x1, x2) = line.split(',')
            try:
                get(name, no)
            except:
                pass
    

    It took some time to output to Chinese filename. Gave up eventually. It even raised exception when I printed name (Chinese) to the console (decode stuff).

    Then I'd write a reporter.

    6:41 PM

    # -*- coding: utf-8 -*-
    
    import json
    import datetime
    import HTMLParser  
    import urlparse  
    import urllib  
    import urllib2  
    import cookielib  
    import string  
    import re
    import sys
    import threading
    import os
    import tempfile
    from bs4 import BeautifulSoup
    from prettytable import PrettyTable
    import Tkinter
    
    reload(sys)
    sys.setdefaultencoding("utf-8")
    
    csv = [line.split(',') for line in open('result_utf8.csv')]
    
    def getname(no):
        for i in csv:
            if i[1] == no:
                return i[0]
        return ''
    
    def getcourse(filename):
        s = open(filename).read()
        i = s.find('退选')
    
        if i != -1:
            trbegin = s.find('<tr>', i)
    
        # s[trbegin...] e.g.
        # <tr>
        #                                         <td width="10%">
        #                                     <a id="GridView1_ctl02_LinkButton1" href="ja
        # vascript:__doPostBack(&#39;GridView1$ctl02$LinkButton1&#39;,&#39;&#39;)">348</a>
        # 
        #                                 </td><td>12</td><td>生物培优班</td><td>xxx</td><td>&n
        # bsp;</td><td width="10%">
        #                                     <a id="GridView1_ctl02_LinkButton2" href="ja
        # vascript:__doPostBack(&#39;GridView1$ctl02$LinkButton2&#39;,&#39;&#39;)">退选</a>
        #                                </td>
    
            trend = s.find('</tr>', trbegin)
    
            read = 0
            res = ''
    
            i = trbegin
            while i < trend:
                if s[i] == '<':
                    while s[i] != '>':
                        i += 1
                    i += 1
                    continue
    
                end = False
                while s[i] != '<':
                    if s[i] == '&':
                        end = True
                        break
                    res += s[i]
                    i += 1
    
                if end:
                    break
    
                res += ' '
            
            res2 = ''
            i = 0
            while i < len(res) and not (res[i] in "0123456789"):
                i += 1
    
            while i < len(res):
                if res[i] == '
    ':
                    i += 1
                else:
                    res2 += res[i]
                    if res[i] == ' ':
                        while i < len(res) and res[i] == ' ':
                            i += 1
                    else:
                        i += 1
    
            return res2
        return ''
    
    def report():
        with tempfile.NamedTemporaryFile(suffix='.html', delete=False) as f:  
            wd = workdir.get()
    
            os.chdir(wd)
            f.write('<head><meta charset="utf-8"></head>')
            f.write('<h1>Spider report</h1>')
            f.write('<p><b>Version {}</b></p>'.format(wd))
            f.write('<table>')
    
            for i in os.listdir('.'):
                (no, x1) = i.split('.')
                name = getname(no)
                s = getcourse(i)
                f.write('<tr><th>{}</th><th>{}</th><td>{}</td></tr>'.format(no, name, s))
    
            os.system('python -m webbrowser {}'.format(f.name))
            os.chdir('..')
    
    gui = Tkinter.Tk()
    workdir = Tkinter.StringVar()
    Tkinter.Button(gui, text="Report", command=report).pack(side=Tkinter.LEFT)
    Tkinter.Entry(gui, textvariable=workdir, width=40).pack(side=Tkinter.LEFT)
    gui.mainloop()
    
  • 相关阅读:
    [LeetCode] 875. Koko Eating Bananas 科科吃香蕉
    [LeetCode] 874. Walking Robot Simulation 走路机器人仿真
    [LeetCode] 995. Minimum Number of K Consecutive Bit Flips 连续K位翻转的最小次数
    [LeetCode] 873. Length of Longest Fibonacci Subsequence 最长的斐波那契序列长度
    [LeetCode] 872. Leaf-Similar Trees 叶结点相似的树
    [LeetCode] 870. Advantage Shuffle 优势洗牌
    [LeetCode] 869. Reordered Power of 2 重新排序为2的倍数
    [LeetCode] 868. Binary Gap 二进制间隙
    [LeetCode] 867. Transpose Matrix 转置矩阵
    [LeetCode] 866. Prime Palindrome 质数回文数
  • 原文地址:https://www.cnblogs.com/yanhuihang/p/7819698.html
Copyright © 2011-2022 走看看