zoukankan      html  css  js  c++  java
  • 2017-11-11 Sa Oct Spider

    2017-11-11 Sa Oct Spider

    4:33 PM

    Again.

    First, test urllib2:

    # -*- coding: utf-8 -*-
    
    import json
    import datetime
    import HTMLParser  
    import urlparse  
    import urllib  
    import urllib2  
    import cookielib  
    import string  
    import re
    import sys
    import threading
    import os
    import tempfile
    from bs4 import BeautifulSoup
    from prettytable import PrettyTable
    
    # Python 2 only: make UTF-8 the interpreter-wide default codec for implicit
    # str <-> unicode conversions.
    # NOTE(review): reload(sys) is presumably needed because setdefaultencoding
    # is not reachable on the already-imported sys module -- confirm.
    reload(sys)
    sys.setdefaultencoding("utf-8")
    
    def openWithBrowser(filename):
        """Open *filename* with ``python -m webbrowser`` and wait for it to return.

        filename -- path (or URL) handed to the webbrowser module.

        The command is passed as an argument list rather than a formatted shell
        string, so a filename containing quotes, spaces or shell metacharacters
        is delivered to the child process unchanged.
        """
        # Local import: the script's import header does not pull in subprocess.
        import subprocess

        subprocess.call(['python', '-m', 'webbrowser', filename])
    
    # Credentials of the student to log in as (placeholders in this listing).
    name = 'xxx'
    no = 'xxx'
    
    # The login page is both fetched (GET) and posted to (POST) at the same URL.
    hosturl = 'http://android.gdgzez.com.cn/szxy/yanjiuxingxuexi/student_login.aspx'
    posturl = 'http://android.gdgzez.com.cn/szxy/yanjiuxingxuexi/student_login.aspx'
    # Install a process-wide opener so every urllib2.urlopen call goes through
    # the cookie jar `cj`.
    cj = cookielib.LWPCookieJar()
    cookie_support = urllib2.HTTPCookieProcessor(cj)
    opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)  
    urllib2.install_opener(opener)
    # Initial GET of the login page; `h` is not read afterwards in this script.
    # NOTE(review): presumably done so the server's cookies land in `cj` -- confirm.
    h = urllib2.urlopen(hosturl)
    
    # Headers sent with the login POST: an iPad User-Agent string and the login
    # page itself as Referer.
    headers = {
        'User-Agent' : 'Mozilla/5.0 (iPad; U; CPU OS 3_2_1 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Mobile/7B405',
        'Referer' : 'http://android.gdgzez.com.cn/szxy/yanjiuxingxuexi/student_login.aspx'
    }
    
    # Built here but not used anywhere else in this script.
    idx = name + ' ' + no
    
    # Login form fields. __VIEWSTATE and __EVENTVALIDATION are hard-coded.
    # NOTE(review): presumably copied from the live login form; they may need
    # refreshing if the page changes -- confirm.
    postData = {
        '__VIEWSTATE' : '/wEPDwULLTE3NzI1OTE3OTFkZJk4xBOpTGvHILGFeCbFQfQQv9dbWzdoB6AOexN4BTx0',
        '__EVENTVALIDATION' : '/wEWBQLMmfO1BgL7uPQdAt765bwOAsaZ0ZUMApn3i+sBQi9nlVqoFrBfAjkxtVAWnUBZPnKm6VON7F01iBJzBXw=',
        'name' : name,
        'pwd' : '12345',
        'btnchange' : '登录',
        'xuehao' : no
    }
    
    # URL-encode the form and POST it with a 5-second timeout.
    postData = urllib.urlencode(postData)
    request = urllib2.Request(posturl, postData, headers)
    response = urllib2.urlopen(request, timeout=5)
    
    # Save the login response to a temporary .html file that survives closing
    # (delete=False), then show it in the browser.
    with tempfile.NamedTemporaryFile(suffix='.html', delete=False) as f:
        f.write(response.read())
    # Launch the browser only after the `with` block has closed the file, so the
    # buffered data is on disk and the file is no longer held open by this
    # process when the browser reads it.
    openWithBrowser(f.name)
    

    Good. Nothing changed. Then apply it to the table.

    5:09 PM

    # -*- coding: utf-8 -*-
    
    import json
    import datetime
    import HTMLParser  
    import urlparse  
    import urllib  
    import urllib2  
    import cookielib  
    import string  
    import re
    import sys
    import threading
    import os
    import tempfile
    from bs4 import BeautifulSoup
    from prettytable import PrettyTable
    
    # Python 2 only: make UTF-8 the interpreter-wide default codec for implicit
    # str <-> unicode conversions.
    # NOTE(review): reload(sys) is presumably needed because setdefaultencoding
    # is not reachable on the already-imported sys module -- confirm.
    reload(sys)
    sys.setdefaultencoding("utf-8")
    
    def openWithBrowser(filename):
        """Open *filename* with ``python -m webbrowser`` and wait for it to return.

        filename -- path (or URL) handed to the webbrowser module.

        The command is passed as an argument list rather than a formatted shell
        string, so a filename containing quotes, spaces or shell metacharacters
        is delivered to the child process unchanged.
        """
        # Local import: the script's import header does not pull in subprocess.
        import subprocess

        subprocess.call(['python', '-m', 'webbrowser', filename])
    
    # Name of this run's output directory: the current local time formatted as
    # "yy-mm-dd Day Mon HH-MM-SS result". The directory is created immediately.
    version = datetime.datetime.now().strftime("%y-%m-%d %a %b %H-%M-%S result")
    os.mkdir(version)
    
    # The login page is both fetched (GET) and posted to (POST) at the same URL.
    hosturl = 'http://android.gdgzez.com.cn/szxy/yanjiuxingxuexi/student_login.aspx'
    posturl = 'http://android.gdgzez.com.cn/szxy/yanjiuxingxuexi/student_login.aspx'
    # Install a process-wide opener so every urllib2.urlopen call goes through
    # the cookie jar `cj`.
    cj = cookielib.LWPCookieJar()
    cookie_support = urllib2.HTTPCookieProcessor(cj)
    opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)  
    urllib2.install_opener(opener)
    # Initial GET of the login page; `h` is not read afterwards in this script.
    # NOTE(review): presumably done so the server's cookies land in `cj` -- confirm.
    h = urllib2.urlopen(hosturl)
    
    # Headers sent with every login POST: an iPad User-Agent string and the
    # login page itself as Referer.
    headers = {
        'User-Agent' : 'Mozilla/5.0 (iPad; U; CPU OS 3_2_1 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Mobile/7B405',
        'Referer' : 'http://android.gdgzez.com.cn/szxy/yanjiuxingxuexi/student_login.aspx'
    }
    
    def get(name, no):
        """Log in as one student and save the returned page in the run directory.

        POSTs the login form to the module-level ``posturl`` using the
        module-level ``headers`` (5-second timeout) and writes the response body
        to ``<version>/<no>.html``. A ``<meta charset="utf-8">`` tag is inserted
        right after every ``<head>`` so the saved file states its encoding.

        name -- value sent as the form's ``name`` field.
        no   -- value sent as the form's ``xuehao`` field; also used as the
                base name of the output file.

        Exceptions raised by urllib2 (network failure, timeout, HTTP error) or
        by writing the file propagate to the caller.
        """
        # __VIEWSTATE / __EVENTVALIDATION are hard-coded form tokens.
        # NOTE(review): presumably captured from the live login form -- confirm
        # they are still accepted by the server.
        postData = urllib.urlencode({
            '__VIEWSTATE' : '/wEPDwULLTE3NzI1OTE3OTFkZJk4xBOpTGvHILGFeCbFQfQQv9dbWzdoB6AOexN4BTx0',
            '__EVENTVALIDATION' : '/wEWBQLMmfO1BgL7uPQdAt765bwOAsaZ0ZUMApn3i+sBQi9nlVqoFrBfAjkxtVAWnUBZPnKm6VON7F01iBJzBXw=',
            'name' : name,
            'pwd' : '12345',
            'btnchange' : '登录',
            'xuehao' : no
        })

        request = urllib2.Request(posturl, postData, headers)
        response = urllib2.urlopen(request, timeout=5)
        try:
            page = response.read()
        finally:
            # Release the connection even if reading fails; this function is
            # called once per roster line, so leaked handles would pile up.
            response.close()

        page = page.replace('<head>', '<head><meta charset="utf-8">')

        # Binary mode: the body is written exactly as received, with no
        # platform newline translation.
        with open(os.path.join(version, '{}.html'.format(no)), 'wb') as f:
            f.write(page)
    
    with open('result_utf8.csv', "rb") as f:
        print version
        for line in f:
            (name, no, x1, x2) = line.split(',')
            try:
                get(name, no)
            except:
                pass
    

    I spent some time trying to write the output to files with Chinese filenames, but eventually gave up. It even raised an exception when I printed the name (Chinese) to the console (a decoding issue).

    Next, I'll write a reporter.

    6:41 PM

    # -*- coding: utf-8 -*-
    
    import json
    import datetime
    import HTMLParser  
    import urlparse  
    import urllib  
    import urllib2  
    import cookielib  
    import string  
    import re
    import sys
    import threading
    import os
    import tempfile
    from bs4 import BeautifulSoup
    from prettytable import PrettyTable
    import Tkinter
    
    # Python 2 only: make UTF-8 the interpreter-wide default codec for implicit
    # str <-> unicode conversions.
    # NOTE(review): reload(sys) is presumably needed because setdefaultencoding
    # is not reachable on the already-imported sys module -- confirm.
    reload(sys)
    sys.setdefaultencoding("utf-8")
    
    # Load the roster once at start-up. Each element of `csv` is the list of
    # comma-separated fields of one line; the last field keeps its trailing
    # newline. The `with` block closes the file as soon as it has been read.
    # (The name `csv` is kept because getname() reads it.)
    with open('result_utf8.csv') as roster_file:
        csv = [line.split(',') for line in roster_file]
    
    def getname(no):
        """Return the name stored for student number *no* in the roster.

        Scans the module-level ``csv`` rows in order and returns field 0 of the
        first row whose field 1 equals *no*; returns '' when no row matches.
        """
        matching_names = (row[0] for row in csv if row[1] == no)
        return next(matching_names, '')
    
    def getcourse(filename):
        """Extract the text of the first course row from a saved student page.

        The page is searched for the first occurrence of '退选'; the first
        ``<tr>`` after that point is taken as the course row (presumably the
        first '退选' is the table's column header -- confirm against a real
        page). A row looks roughly like::

            <tr>
              <td width="10%"><a id="GridView1_ctl02_LinkButton1" ...>348</a></td>
              <td>12</td><td>生物培优班</td><td>xxx</td><td>&nbsp;</td>
              <td width="10%"><a id="GridView1_ctl02_LinkButton2" ...>退选</a></td>

        Tags are dropped and the text between them is collected, one space
        after each text run, up to the first '&' (the ``&nbsp;`` cell in the
        example). From that text everything before the first digit is
        discarded, newlines are removed and each run of spaces becomes a single
        space.

        filename -- path of the saved HTML page.

        Returns the cleaned row text, or '' when the marker, the row, or any
        digit in the row cannot be found.
        """
        with open(filename) as page:
            s = page.read()

        marker = s.find('退选')
        if marker == -1:
            return ''

        trbegin = s.find('<tr>', marker)
        if trbegin == -1:
            return ''
        trend = s.find('</tr>', trbegin)
        if trend == -1:
            return ''
        row = s[trbegin:trend]

        pieces = []
        pos = 0
        while pos < len(row):
            if row[pos] == '<':
                # Skip the whole tag; stop if it is never closed inside the row.
                close = row.find('>', pos)
                if close == -1:
                    break
                pos = close + 1
                continue

            # Text run: extends to the next tag or to the end of the row.
            tag = row.find('<', pos)
            if tag == -1:
                tag = len(row)

            # An entity marks the end of the fields we want: keep the text in
            # front of it and stop.
            amp = row.find('&', pos, tag)
            if amp != -1:
                pieces.append(row[pos:amp])
                break

            pieces.append(row[pos:tag])
            pieces.append(' ')
            pos = tag
        res = ''.join(pieces)

        # Drop the leading whitespace/indentation in front of the first number.
        digit = re.search('[0-9]', res)
        if digit is None:
            return ''
        tail = res[digit.start():]

        # Collapse space runs first, then delete newlines: spaces separated only
        # by newlines therefore stay as separate spaces.
        return re.sub(' +', ' ', tail).replace('\n', '')
    
    def report():
        """Build an HTML summary of one result directory and open it in the browser.

        Reads the directory name from the ``workdir`` entry field. For every
        file in that directory one table row is written with the student number
        (the file name without its extension), the name looked up with
        getname() and the course text extracted with getcourse(). The report is
        written to a temporary ``.html`` file that is kept on disk and then
        opened with ``python -m webbrowser``.

        The current working directory is left untouched: files are addressed by
        joining them onto the chosen directory instead of changing into it.
        An invalid directory makes os.listdir() raise before any report file
        is created.
        """
        # Local import: the script's import header does not pull in subprocess.
        import subprocess

        wd = workdir.get()
        # Listed up front so a bad directory fails before the temp file exists;
        # sorted so rows appear in a stable order.
        entries = sorted(os.listdir(wd))

        with tempfile.NamedTemporaryFile(suffix='.html', delete=False) as f:
            f.write('<head><meta charset="utf-8"></head>')
            f.write('<h1>Spider report</h1>')
            f.write('<p><b>Version {}</b></p>'.format(wd))
            f.write('<table>')

            for entry in entries:
                # splitext copes with names that have no dot or several dots.
                no = os.path.splitext(entry)[0]
                name = getname(no)
                s = getcourse(os.path.join(wd, entry))
                f.write('<tr><th>{}</th><th>{}</th><td>{}</td></tr>'.format(no, name, s))

            f.write('</table>')

        # The file is closed (and therefore flushed) before the browser is
        # started, so the browser always sees the complete report. The path is
        # passed as an argv element, so it needs no shell quoting.
        subprocess.call(['python', '-m', 'webbrowser', f.name])
    
    # Minimal window: a "Report" button on the left and, next to it, a text
    # field bound to `workdir` for the result directory to report on.
    gui = Tkinter.Tk()
    workdir = Tkinter.StringVar()

    report_button = Tkinter.Button(gui, text="Report", command=report)
    report_button.pack(side=Tkinter.LEFT)

    workdir_entry = Tkinter.Entry(gui, textvariable=workdir, width=40)
    workdir_entry.pack(side=Tkinter.LEFT)

    # Blocks until the window is closed.
    gui.mainloop()
    
  • 相关阅读:
    无聊,只发两张图……
    LA
    “万能数据库查询分析器”5.04 发布,撰写的相关技术文章达63篇
    HDU 1010Tempter of the Bone(奇偶剪枝回溯dfs)
    uva 10051 Tower of Cubes(DAG最长路)
    uva 103 Stacking Boxes(DAG)
    异步处理(列出所有文件)
    Android开发8:UI组件TextView,EditText,Button
    植物-蔬菜:红菜苔
    植物-蔬菜:菜苔
  • 原文地址:https://www.cnblogs.com/yanhuihang/p/7819698.html
Copyright © 2011-2022 走看看