zoukankan      html  css  js  c++  java
  • 2017-11-11 Sa Oct Spider

    2017-11-11 Sa Oct Spider

    4:33 PM

    Again.

    First, test urllib:

    # -*- coding: utf-8 -*-
    
    import json
    import datetime
    import HTMLParser  
    import urlparse  
    import urllib  
    import urllib2  
    import cookielib  
    import string  
    import re
    import sys
    import threading
    import os
    import tempfile
    from bs4 import BeautifulSoup
    from prettytable import PrettyTable
    
    reload(sys)
    sys.setdefaultencoding("utf-8")
    
    def openWithBrowser(filename):
        """Open *filename* in the default web browser.

        Runs ``python -m webbrowser <filename>`` as a child process and waits
        for it to exit.  The path is passed as a separate argument instead of
        being interpolated into a shell command string, so file names that
        contain quotes, spaces or shell metacharacters are handled correctly.

        Args:
            filename: path (or URL) handed to the ``webbrowser`` module.
        """
        # Local import: the surrounding script does not import subprocess.
        import subprocess

        subprocess.call(['python', '-m', 'webbrowser', filename])
    
    # Placeholder credentials for a single test login: ``name`` is sent as the
    # 'name' form field and ``no`` as the 'xuehao' form field below.
    name = 'xxx'
    no = 'xxx'
    
    # The same login page URL is used for the initial GET and for the POST.
    hosturl = 'http://android.gdgzez.com.cn/szxy/yanjiuxingxuexi/student_login.aspx'
    posturl = 'http://android.gdgzez.com.cn/szxy/yanjiuxingxuexi/student_login.aspx'
    # Build an opener that stores cookies in ``cj`` and install it globally so
    # every later urllib2.urlopen() call goes through it.
    cj = cookielib.LWPCookieJar()
    cookie_support = urllib2.HTTPCookieProcessor(cj)
    opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)  
    urllib2.install_opener(opener)
    # Fetch the login page once through the cookie-aware opener.  ``h`` is not
    # read anywhere in this snippet; presumably the request is made only so
    # that any session cookie ends up in ``cj`` -- TODO confirm.
    h = urllib2.urlopen(hosturl)
    
    # Request headers sent with the login POST.
    headers = {
        'User-Agent' : 'Mozilla/5.0 (iPad; U; CPU OS 3_2_1 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Mobile/7B405',
        'Referer' : 'http://android.gdgzez.com.cn/szxy/yanjiuxingxuexi/student_login.aspx'
    }
    
    # NOTE(review): ``idx`` is not used anywhere in this snippet.
    idx = name + ' ' + no
    
    # Login form fields.  The __VIEWSTATE / __EVENTVALIDATION values are
    # hard-coded; presumably copied from a previously loaded login page --
    # TODO confirm they stay valid across sessions.
    postData = {
        '__VIEWSTATE' : '/wEPDwULLTE3NzI1OTE3OTFkZJk4xBOpTGvHILGFeCbFQfQQv9dbWzdoB6AOexN4BTx0',
        '__EVENTVALIDATION' : '/wEWBQLMmfO1BgL7uPQdAt765bwOAsaZ0ZUMApn3i+sBQi9nlVqoFrBfAjkxtVAWnUBZPnKm6VON7F01iBJzBXw=',
        'name' : name,
        'pwd' : '12345',
        'btnchange' : '登录',
        'xuehao' : no
    }
    
    # URL-encode the form and POST it; give up after 5 seconds.
    postData = urllib.urlencode(postData)
    request = urllib2.Request(posturl, postData, headers)
    response = urllib2.urlopen(request, timeout=5)
    
    # Save the response body to a temporary .html file (kept on disk because
    # of delete=False).  The browser is launched only after the ``with`` block
    # has closed -- and therefore flushed -- the file, so it can never load a
    # partially written page.
    with tempfile.NamedTemporaryFile(suffix='.html', delete=False) as f:
        f.write(response.read())
    openWithBrowser(f.name)
    

    Good. Nothing changed. Then apply it to the table.

    5:09 PM

    # -*- coding: utf-8 -*-
    
    import json
    import datetime
    import HTMLParser  
    import urlparse  
    import urllib  
    import urllib2  
    import cookielib  
    import string  
    import re
    import sys
    import threading
    import os
    import tempfile
    from bs4 import BeautifulSoup
    from prettytable import PrettyTable
    
    reload(sys)
    sys.setdefaultencoding("utf-8")
    
    def openWithBrowser(filename):
        """Open *filename* in the default web browser.

        Runs ``python -m webbrowser <filename>`` as a child process and waits
        for it to exit.  The path is passed as a separate argument instead of
        being interpolated into a shell command string, so file names that
        contain quotes, spaces or shell metacharacters are handled correctly.

        Args:
            filename: path (or URL) handed to the ``webbrowser`` module.
        """
        # Local import: the surrounding script does not import subprocess.
        import subprocess

        subprocess.call(['python', '-m', 'webbrowser', filename])
    
    # Name of this run's output directory, derived from the current local
    # time; get() writes one HTML file per student into it.
    version = datetime.datetime.now().strftime("%y-%m-%d %a %b %H-%M-%S result")
    os.mkdir(version)
    
    # The same login page URL is used for the initial GET and for the POST.
    hosturl = 'http://android.gdgzez.com.cn/szxy/yanjiuxingxuexi/student_login.aspx'
    posturl = 'http://android.gdgzez.com.cn/szxy/yanjiuxingxuexi/student_login.aspx'
    # Build an opener that stores cookies in ``cj`` and install it globally so
    # every later urllib2.urlopen() call goes through it.
    cj = cookielib.LWPCookieJar()
    cookie_support = urllib2.HTTPCookieProcessor(cj)
    opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)  
    urllib2.install_opener(opener)
    # Fetch the login page once through the cookie-aware opener.  ``h`` is not
    # read anywhere in this snippet; presumably the request is made only so
    # that any session cookie ends up in ``cj`` -- TODO confirm.
    h = urllib2.urlopen(hosturl)
    
    # Request headers sent with every login POST issued by get().
    headers = {
        'User-Agent' : 'Mozilla/5.0 (iPad; U; CPU OS 3_2_1 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Mobile/7B405',
        'Referer' : 'http://android.gdgzez.com.cn/szxy/yanjiuxingxuexi/student_login.aspx'
    }
    
    def get(name, no):
        """Post the login form for one student and save the returned page.

        The form is sent to the module-level ``posturl`` with the module-level
        ``headers``; the response body is written to ``<version>/<no>.html``.

        Args:
            name: value sent as the ``name`` form field.
            no: value sent as the ``xuehao`` form field; also used as the base
                name of the output file.

        Raises:
            Nothing is caught here: errors from ``urllib2.urlopen`` (including
            the 5 second timeout) or from opening/writing the output file
            propagate to the caller.
        """
        # Local import: the surrounding script does not import contextlib.
        from contextlib import closing

        # The module-level names are only read, never rebound, so no
        # ``global`` declaration is needed.
        #
        # NOTE(review): the __VIEWSTATE / __EVENTVALIDATION values are
        # hard-coded; presumably copied from a previously loaded login page --
        # confirm they stay valid across sessions.
        postData = {
            '__VIEWSTATE' : '/wEPDwULLTE3NzI1OTE3OTFkZJk4xBOpTGvHILGFeCbFQfQQv9dbWzdoB6AOexN4BTx0',
            '__EVENTVALIDATION' : '/wEWBQLMmfO1BgL7uPQdAt765bwOAsaZ0ZUMApn3i+sBQi9nlVqoFrBfAjkxtVAWnUBZPnKm6VON7F01iBJzBXw=',
            'name' : name,
            'pwd' : '12345',
            'btnchange' : '登录',
            'xuehao' : no
        }

        postData = urllib.urlencode(postData)
        request = urllib2.Request(posturl, postData, headers)

        # closing() releases the HTTP response even if read() raises; the
        # urllib2 response object is not a context manager by itself.
        with closing(urllib2.urlopen(request, timeout=5)) as response:
            page = response.read()

        # Inject a charset declaration so browsers render the saved copy as
        # UTF-8.
        with open('{}/{}.html'.format(version, no), 'w') as f:
            f.write(page.replace('<head>', '<head><meta charset="utf-8">'))
    
    # Fetch the page of every student listed in result_utf8.csv.  Each line is
    # comma-separated; field 0 is passed to get() as the name and field 1 as
    # the number, any further fields are ignored.
    with open('result_utf8.csv', "rb") as f:
        print(version)
        for line in f:
            fields = line.rstrip('\r\n').split(',')
            if len(fields) < 2:
                # Blank or malformed line: nothing to fetch.
                continue
            name, no = fields[0], fields[1]
            try:
                get(name, no)
            except Exception as e:
                # Best effort: one failed student must not abort the whole
                # run, but the failure is reported instead of being silently
                # dropped.  Catching Exception (not a bare except) lets
                # Ctrl-C still stop the crawl.
                sys.stderr.write('failed to fetch {}: {!r}\n'.format(no, e))
    

    I spent some time trying to write the output to files with Chinese names, but eventually gave up. It even raised an exception when I printed the (Chinese) name to the console (a decoding problem).

    Next, I'll write a reporter.

    6:41 PM

    # -*- coding: utf-8 -*-
    
    import json
    import datetime
    import HTMLParser  
    import urlparse  
    import urllib  
    import urllib2  
    import cookielib  
    import string  
    import re
    import sys
    import threading
    import os
    import tempfile
    from bs4 import BeautifulSoup
    from prettytable import PrettyTable
    import Tkinter
    
    reload(sys)
    sys.setdefaultencoding("utf-8")
    
    # Load the roster once at start-up: one list of comma-separated fields per
    # line (getname() reads field 0 as the name and field 1 as the number).
    # The ``with`` block closes the file instead of leaking the handle.
    # NOTE: the name ``csv`` shadows the stdlib module of the same name.
    with open('result_utf8.csv') as roster_file:
        csv = [line.split(',') for line in roster_file]
    
    def getname(no):
        """Return the name stored for student number *no*.

        Scans the module-level ``csv`` rows in order and yields field 0 of the
        first row whose field 1 equals *no*; an empty string is returned when
        no row matches.
        """
        matching_names = (row[0] for row in csv if row[1] == no)
        return next(matching_names, '')
    
    def getcourse(filename):
        """Extract the selected-course row from a saved result page.

        The file is searched for the first occurrence of '退选'; the first
        ``<tr>`` ... ``</tr>`` span that starts after it is then reduced to
        plain text.  A row typically looks like::

            <tr>
                <td width="10%">
                    <a id="GridView1_ctl02_LinkButton1" href="javascript:...">348</a>
                </td><td>12</td><td>生物培优班</td><td>xxx</td><td>&nbsp;</td>
                <td width="10%"><a id="GridView1_ctl02_LinkButton2" ...>退选</a></td>

        Processing steps:

        * tags are dropped and every text run between tags is followed by one
          space;
        * extraction stops at the first ``&`` found in a text run (the
          ``&nbsp;`` cell in the example), keeping the text before it;
        * everything before the first ASCII digit is discarded;
        * newline characters are removed and each run of consecutive spaces
          is collapsed to a single space.

        Args:
            filename: path of the HTML file to read.

        Returns:
            The cleaned row text (for the example above
            ``'348  12 生物培优班 xxx '``), or ``''`` when the marker, the row
            or a digit cannot be found.
        """
        # ``with`` closes the file; it was previously left to the garbage
        # collector.
        with open(filename) as page:
            s = page.read()

        marker = s.find('退选')
        if marker == -1:
            return ''

        trbegin = s.find('<tr>', marker)
        if trbegin == -1:
            return ''
        trend = s.find('</tr>', trbegin)
        if trend == -1:
            return ''

        pieces = []
        pos = trbegin
        while pos < trend:
            if s[pos] == '<':
                # Skip the whole tag.  A '>' always exists at or before the
                # end of the '</tr>' located above, so find() cannot fail.
                pos = s.find('>', pos) + 1
                continue

            # Text run up to the next tag; s[trend] is '<', which bounds it.
            next_tag = s.find('<', pos)
            text = s[pos:next_tag]
            amp = text.find('&')
            if amp != -1:
                # An entity ends the interesting part of the row.
                pieces.append(text[:amp])
                break
            pieces.append(text + ' ')
            pos = next_tag

        res = ''.join(pieces)

        first_digit = re.search('[0-9]', res)
        if first_digit is None:
            return ''

        # Collapse space runs first, then drop newlines: spaces separated
        # only by newlines are deliberately NOT merged, matching the
        # character-by-character behaviour this replaces.
        tail = res[first_digit.start():]
        return re.sub(' +', ' ', tail).replace('\n', '')
    
    def report():
        """Build an HTML summary of one crawl directory and open it.

        The directory name is read from the ``workdir`` Tk variable.  For each
        regular file in it, the file name without its extension is taken as
        the student number, ``getname`` supplies the name and ``getcourse``
        the course text; one table row is written per file.  The report goes
        to a temporary ``.html`` file that is kept on disk (delete=False) and
        is opened with ``python -m webbrowser`` once it has been written
        completely.

        Raises:
            OSError: if the directory cannot be listed.  This is raised before
                the temporary file is created, so no stray file is left.
        """
        # Local import: the surrounding script does not import subprocess.
        import subprocess

        wd = workdir.get()

        # Paths are joined with ``wd`` instead of chdir()-ing into it, so the
        # process working directory is never changed -- not even when an
        # error occurs or when ``wd`` is an absolute or nested path.
        # Sorting makes the row order deterministic.
        entries = sorted(os.listdir(wd))

        with tempfile.NamedTemporaryFile(suffix='.html', delete=False) as f:
            f.write('<head><meta charset="utf-8"></head>')
            f.write('<h1>Spider report</h1>')
            f.write('<p><b>Version {}</b></p>'.format(wd))
            f.write('<table>')

            for i in entries:
                path = os.path.join(wd, i)
                if not os.path.isfile(path):
                    continue
                # splitext copes with names that contain no dot or several.
                no = os.path.splitext(i)[0]
                name = getname(no)
                s = getcourse(path)
                f.write('<tr><th>{}</th><th>{}</th><td>{}</td></tr>'.format(no, name, s))

            f.write('</table>')

        # Launch the browser only after the file has been closed (flushed),
        # and pass the path as a separate argument so it needs no quoting.
        subprocess.call(['python', '-m', 'webbrowser', f.name])
    
    # Minimal Tk front end: a "Report" button followed by a text entry, laid
    # out left to right.  The entry is bound to ``workdir``, which report()
    # reads when the button is pressed.
    gui = Tkinter.Tk()
    workdir = Tkinter.StringVar()

    report_button = Tkinter.Button(gui, text="Report", command=report)
    report_button.pack(side=Tkinter.LEFT)

    workdir_entry = Tkinter.Entry(gui, textvariable=workdir, width=40)
    workdir_entry.pack(side=Tkinter.LEFT)

    # Blocks until the window is closed.
    gui.mainloop()
    
  • 相关阅读:
    MySQL基准测试--innodb_buffer_pool_instances
    MySQL参数优化:back_log
    MySQL open_files_limit相关设置
    Django权限系统auth模块详解
    2.9 go mod 之本地仓库搭建
    my40_MySQL锁概述之意向锁
    my39_InnoDB锁机制之Gap Lock、Next-Key Lock、Record Lock解析
    2.8 GO 参数传递
    my38_MySQL事务知识点零记
    my37_MGR流控对数据库性能的影响以及MGR与主从的性能对比
  • 原文地址:https://www.cnblogs.com/yanhuihang/p/7819698.html
Copyright © 2011-2022 走看看