zoukankan      html  css  js  c++  java
  • 2017-11-11 Sa Oct Spider

    2017-11-11 Sa Oct Spider

    4:33 PM

    Again.

    First, test urllib2:

    # -*- coding: utf-8 -*-
    
    import json
    import datetime
    import HTMLParser  
    import urlparse  
    import urllib  
    import urllib2  
    import cookielib  
    import string  
    import re
    import sys
    import threading
    import os
    import tempfile
    from bs4 import BeautifulSoup
    from prettytable import PrettyTable
    
    # Python 2 only: site.py removes sys.setdefaultencoding at interpreter
    # start-up, so the module is reloaded to get the function back.  The default
    # codec used for implicit str <-> unicode conversions is then set to UTF-8.
    reload(sys)
    sys.setdefaultencoding("utf-8")
    
    def openWithBrowser(filename):
        """Open ``filename`` (a path or URL) in the system's default web browser.

        filename: the location handed to the browser.

        Returns None.
        """
        # Imported here so the function is self-contained.
        import webbrowser

        # Call the webbrowser module in-process instead of formatting a shell
        # command: no quoting problems when the path contains '"' or other shell
        # metacharacters, and no dependency on a `python` executable on PATH.
        webbrowser.open(filename)
    
    # Placeholder credentials for the single test login; they are sent below as
    # the form fields 'name' and 'xuehao'.
    name = 'xxx'
    no = 'xxx'
    
    # The same URL is requested first without data and then with the form data.
    hosturl = 'http://android.gdgzez.com.cn/szxy/yanjiuxingxuexi/student_login.aspx'
    posturl = 'http://android.gdgzez.com.cn/szxy/yanjiuxingxuexi/student_login.aspx'
    # Install a cookie-aware opener process-wide, so that every later
    # urllib2.urlopen() call shares the cookie jar `cj`.
    cj = cookielib.LWPCookieJar()
    cookie_support = urllib2.HTTPCookieProcessor(cj)
    opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)  
    urllib2.install_opener(opener)
    # Initial request for the login page.  `h` is not read anywhere in this
    # script -- presumably the request only serves to collect session cookies.
    h = urllib2.urlopen(hosturl)
    
    # Request headers sent with the login POST.
    headers = {
        'User-Agent' : 'Mozilla/5.0 (iPad; U; CPU OS 3_2_1 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Mobile/7B405',
        'Referer' : 'http://android.gdgzez.com.cn/szxy/yanjiuxingxuexi/student_login.aspx'
    }
    
    # NOTE(review): `idx` is not used by any later statement in this script.
    idx = name + ' ' + no
    
    # Login form fields.  __VIEWSTATE / __EVENTVALIDATION are hard-coded here --
    # presumably copied from the login page's hidden inputs; TODO confirm they
    # stay valid.  The same literal password is sent regardless of the student.
    postData = {
        '__VIEWSTATE' : '/wEPDwULLTE3NzI1OTE3OTFkZJk4xBOpTGvHILGFeCbFQfQQv9dbWzdoB6AOexN4BTx0',
        '__EVENTVALIDATION' : '/wEWBQLMmfO1BgL7uPQdAt765bwOAsaZ0ZUMApn3i+sBQi9nlVqoFrBfAjkxtVAWnUBZPnKm6VON7F01iBJzBXw=',
        'name' : name,
        'pwd' : '12345',
        'btnchange' : '登录',
        'xuehao' : no
    }
    
    # `postData` is rebound from the dict to its URL-encoded string form.
    postData = urllib.urlencode(postData)
    # Passing a data argument makes urllib2 issue a POST instead of a GET.
    request = urllib2.Request(posturl, postData, headers)
    # Give up if the server does not answer within 5 seconds.
    response = urllib2.urlopen(request, timeout=5)
    
    # Save the login response to a temporary .html file; delete=False keeps the
    # file on disk after it is closed so the browser can read it.
    with tempfile.NamedTemporaryFile(suffix='.html', delete=False) as f:
        f.write(response.read())
    # Launch the browser only after the `with` block has closed the file, so the
    # buffered HTML is guaranteed to be on disk when the browser opens it.
    openWithBrowser(f.name)
    

    Good. Nothing changed. Then apply it to the whole table.

    5:09 PM

    # -*- coding: utf-8 -*-
    
    import json
    import datetime
    import HTMLParser  
    import urlparse  
    import urllib  
    import urllib2  
    import cookielib  
    import string  
    import re
    import sys
    import threading
    import os
    import tempfile
    from bs4 import BeautifulSoup
    from prettytable import PrettyTable
    
    # Python 2 only: site.py removes sys.setdefaultencoding at interpreter
    # start-up, so the module is reloaded to get the function back.  The default
    # codec used for implicit str <-> unicode conversions is then set to UTF-8.
    reload(sys)
    sys.setdefaultencoding("utf-8")
    
    def openWithBrowser(filename):
        """Open ``filename`` (a path or URL) in the system's default web browser.

        filename: the location handed to the browser.

        Returns None.
        """
        # Imported here so the function is self-contained.
        import webbrowser

        # Call the webbrowser module in-process instead of formatting a shell
        # command: no quoting problems when the path contains '"' or other shell
        # metacharacters, and no dependency on a `python` executable on PATH.
        webbrowser.open(filename)
    
    # Name of this run's output directory, built from the current local time.
    # os.mkdir creates it in the current working directory and raises OSError if
    # a directory of that name already exists.
    version = datetime.datetime.now().strftime("%y-%m-%d %a %b %H-%M-%S result")
    os.mkdir(version)
    
    # The same URL is requested first without data and later with the form data.
    hosturl = 'http://android.gdgzez.com.cn/szxy/yanjiuxingxuexi/student_login.aspx'
    posturl = 'http://android.gdgzez.com.cn/szxy/yanjiuxingxuexi/student_login.aspx'
    # Install a cookie-aware opener process-wide, so that every later
    # urllib2.urlopen() call shares the cookie jar `cj`.
    cj = cookielib.LWPCookieJar()
    cookie_support = urllib2.HTTPCookieProcessor(cj)
    opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)  
    urllib2.install_opener(opener)
    # Initial request for the login page.  `h` is not read anywhere in this
    # script -- presumably the request only serves to collect session cookies.
    h = urllib2.urlopen(hosturl)
    
    # Request headers sent with every login POST issued by get().
    headers = {
        'User-Agent' : 'Mozilla/5.0 (iPad; U; CPU OS 3_2_1 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Mobile/7B405',
        'Referer' : 'http://android.gdgzez.com.cn/szxy/yanjiuxingxuexi/student_login.aspx'
    }
    
    def get(name, no):
        """Submit the login form for one student and save the returned page.

        The form is POSTed to the module-level ``posturl`` with the module-level
        ``headers``; the response body is written to ``<version>/<no>.html``.

        name: value sent as the form field 'name'.
        no:   value sent as the form field 'xuehao'; also the base name of the
              output file.

        Returns None.  Errors from the HTTP request or from writing the file
        propagate to the caller.
        """
        # Imported here so the function does not rely on a file-level import.
        from contextlib import closing

        # Login form fields.  __VIEWSTATE / __EVENTVALIDATION are hard-coded --
        # presumably copied from the login page's hidden inputs; TODO confirm
        # they stay valid.  The same literal password is sent for every student.
        postData = {
            '__VIEWSTATE' : '/wEPDwULLTE3NzI1OTE3OTFkZJk4xBOpTGvHILGFeCbFQfQQv9dbWzdoB6AOexN4BTx0',
            '__EVENTVALIDATION' : '/wEWBQLMmfO1BgL7uPQdAt765bwOAsaZ0ZUMApn3i+sBQi9nlVqoFrBfAjkxtVAWnUBZPnKm6VON7F01iBJzBXw=',
            'name' : name,
            'pwd' : '12345',
            'btnchange' : '登录',
            'xuehao' : no
        }

        # Passing a data argument makes urllib2 issue a POST instead of a GET.
        request = urllib2.Request(posturl, urllib.urlencode(postData), headers)

        # urllib2 responses are not context managers in Python 2; closing()
        # releases the connection even when read() raises.
        with closing(urllib2.urlopen(request, timeout=5)) as response:
            page = response.read()

        with open('{}/{}.html'.format(version, no), 'w') as f:
            # A charset declaration is injected right after <head> --
            # presumably so a browser decodes the saved copy as UTF-8.
            f.write(page.replace('<head>', '<head><meta charset="utf-8">'))
    
    with open('result_utf8.csv', "rb") as f:
        print version
        for line in f:
            (name, no, x1, x2) = line.split(',')
            try:
                get(name, no)
            except:
                pass
    

    I spent some time trying to write the output to files with Chinese names, but eventually gave up. Even printing a (Chinese) name to the console raised an exception (a decoding problem).

    Next, I'll write a reporter.

    6:41 PM

    # -*- coding: utf-8 -*-
    
    import json
    import datetime
    import HTMLParser  
    import urlparse  
    import urllib  
    import urllib2  
    import cookielib  
    import string  
    import re
    import sys
    import threading
    import os
    import tempfile
    from bs4 import BeautifulSoup
    from prettytable import PrettyTable
    import Tkinter
    
    # Python 2 only: site.py removes sys.setdefaultencoding at interpreter
    # start-up, so the module is reloaded to get the function back.  The default
    # codec used for implicit str <-> unicode conversions is then set to UTF-8.
    reload(sys)
    sys.setdefaultencoding("utf-8")
    
    # Student table loaded once at start-up: one list of comma-separated fields
    # per line of result_utf8.csv (the last field keeps its line terminator).
    # The `with` block closes the file as soon as the table is built.
    with open('result_utf8.csv') as f:
        csv = [line.split(',') for line in f]
    
    def getname(no):
        """Return the name stored for student number ``no`` in the ``csv`` table.

        no: student number, compared as a string with the second field of each
            row.

        Returns the first field of the first matching row, or '' when no row
        matches.
        """
        for row in csv:
            # Rows with fewer than two fields (e.g. produced by a blank line)
            # have no number column and can never match.
            if len(row) > 1 and row[1] == no:
                return row[0]
        return ''
    
    def getcourse(filename):
        """Extract the text of the course row from one saved student page.

        The page is searched for the first occurrence of '退选'; the first
        ``<tr>`` ... ``</tr>`` span that starts after it is taken as the row.
        Tags are stripped from the row and its text is collected, one space
        being appended after every text run, until the first '&' that occurs in
        text.  Everything before the first digit is then dropped, newlines are
        removed and every run of spaces is reduced to a single space.

        filename: path of the saved HTML page.

        Returns the cleaned row text, or '' when the marker or a complete row
        after it is not found.
        """
        with open(filename) as f:
            s = f.read()

        marker = s.find('退选')
        if marker == -1:
            return ''

        trbegin = s.find('<tr>', marker)
        if trbegin == -1:
            return ''

        # s[trbegin...] e.g.
        # <tr>
        #                                         <td width="10%">
        #                                     <a id="GridView1_ctl02_LinkButton1" href="ja
        # vascript:__doPostBack(&#39;GridView1$ctl02$LinkButton1&#39;,&#39;&#39;)">348</a>
        #
        #                                 </td><td>12</td><td>生物培优班</td><td>xxx</td><td>&n
        # bsp;</td><td width="10%">
        #                                     <a id="GridView1_ctl02_LinkButton2" href="ja
        # vascript:__doPostBack(&#39;GridView1$ctl02$LinkButton2&#39;,&#39;&#39;)">退选</a>
        #                                </td>

        trend = s.find('</tr>', trbegin)
        if trend == -1:
            return ''

        # Pass 1: strip the tags and gather the row's text.
        res = ''
        i = trbegin
        while i < trend:
            if s[i] == '<':
                # Skip the whole tag.  A '>' always exists from here on: at the
                # latest the one that closes the '</tr>' found above.
                i = s.index('>', i) + 1
                continue

            # Text run: it ends at the next tag, which is at the latest the
            # '</tr>' located at `trend`.
            nxt = s.index('<', i)
            amp = s.find('&', i, nxt)
            if amp != -1:
                # An '&' in text (the '&nbsp;' cell in the sample above) ends
                # the extraction; text before it is kept, no space is added.
                res += s[i:amp]
                break
            res += s[i:nxt] + ' '
            i = nxt

        # Pass 2: drop everything in front of the first digit ...
        i = 0
        while i < len(res) and res[i] not in "0123456789":
            i += 1

        # ... then remove newlines and collapse each run of spaces to one space.
        res2 = ''
        while i < len(res):
            if res[i] == '\n':
                i += 1
            else:
                res2 += res[i]
                if res[i] == ' ':
                    while i < len(res) and res[i] == ' ':
                        i += 1
                else:
                    i += 1

        return res2
    
    def report():
        """Build an HTML summary of one result directory and open it in a browser.

        The directory name is read from the module-level Tk variable
        ``workdir``.  For every entry of that directory, the part of the file
        name before the extension is used as the student number, the name is
        looked up with getname() and the course text with getcourse(); one
        table row is written per entry.  The report goes to a temporary .html
        file that is kept on disk (delete=False).

        Returns None.  Errors from listing the directory or reading a page
        propagate to the caller.
        """
        # Imported here so the function does not rely on a file-level import.
        import webbrowser

        wd = workdir.get()

        with tempfile.NamedTemporaryFile(suffix='.html', delete=False) as f:
            f.write('<head><meta charset="utf-8"></head>')
            f.write('<h1>Spider report</h1>')
            f.write('<p><b>Version {}</b></p>'.format(wd))
            f.write('<table>')

            # Pages are addressed through os.path.join rather than by changing
            # the working directory, so the process cwd is never left pointing
            # somewhere else when an error occurs or when `wd` is a nested or
            # absolute path.  Sorting makes the row order deterministic.
            for entry in sorted(os.listdir(wd)):
                # splitext copes with names that contain no dot or several.
                no = os.path.splitext(entry)[0]
                name = getname(no)
                s = getcourse(os.path.join(wd, entry))
                f.write('<tr><th>{}</th><th>{}</th><td>{}</td></tr>'.format(no, name, s))

            f.write('</table>')

        # Open the report only after the file has been closed, so that all rows
        # are on disk; webbrowser.open avoids passing the path through a shell.
        webbrowser.open(f.name)
    
    # Minimal Tk front end: a "Report" button that runs report() and, to its
    # right, a text entry bound to `workdir`, the variable report() reads.
    gui = Tkinter.Tk()
    workdir = Tkinter.StringVar()

    report_button = Tkinter.Button(gui, text="Report", command=report)
    report_button.pack(side=Tkinter.LEFT)

    workdir_entry = Tkinter.Entry(gui, textvariable=workdir, width=40)
    workdir_entry.pack(side=Tkinter.LEFT)

    # Blocks until the window is closed.
    gui.mainloop()
    
  • 相关阅读:
    Protected和Default的区别
    将数组中负数放在正数前面
    java.io包和杯子测楼
    hadoop基础
    极限编程和JUnit
    接口和抽象类
    C# 中窗口AutoScaleMode属性
    计算机的自启动管理
    labview中的移位寄存器、循环隧道,自动索引隧道的区别
    发现C#winform编程中不常用的控件(一)<FlowLayoutPanel控件><拆分器控件Splitcontainer >
  • 原文地址:https://www.cnblogs.com/yanhuihang/p/7819698.html
Copyright © 2011-2022 走看看