zoukankan html css js c++ java

2017-11-11 Sa Oct Spider

4:33 PM

Again.

First, test urllib:

# -*- coding: utf-8 -*-

import json
import datetime
import HTMLParser  
import urlparse  
import urllib  
import urllib2  
import cookielib  
import string  
import re
import sys
import threading
import os
import tempfile
from bs4 import BeautifulSoup
from prettytable import PrettyTable

# Python 2-only hack: sys.setdefaultencoding is removed from the sys namespace
# at interpreter startup, so the module is reloaded to get it back. Switching
# the default codec to UTF-8 lets implicit str/unicode conversions of the
# Chinese text below succeed instead of raising UnicodeDecodeError.
reload(sys)
sys.setdefaultencoding("utf-8")

def openWithBrowser(filename):
    """Open *filename* in the system's default web browser.

    The ``webbrowser`` module is called in-process instead of shelling out
    to ``python -m webbrowser``: the file name is then never parsed by a
    shell (so names containing quotes or ``$`` are safe) and no ``python``
    executable has to be on PATH.
    """
    import webbrowser  # local import keeps this block self-contained

    webbrowser.open(filename)

# Placeholder credentials for the single login attempt made by this test.
name = 'xxx'
no = 'xxx'

hosturl = 'http://android.gdgzez.com.cn/szxy/yanjiuxingxuexi/student_login.aspx'
posturl = 'http://android.gdgzez.com.cn/szxy/yanjiuxingxuexi/student_login.aspx'

# Install a global opener that stores cookies in cj, so every later
# urllib2.urlopen() call shares the same cookie jar.
cj = cookielib.LWPCookieJar()
cookie_support = urllib2.HTTPCookieProcessor(cj)
opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
urllib2.install_opener(opener)
# Fetch the login page once so any cookies it sets are in cj before the POST.
# Use the same timeout as the POST below so a dead server cannot hang the run.
h = urllib2.urlopen(hosturl, timeout=5)

headers = {
    'User-Agent' : 'Mozilla/5.0 (iPad; U; CPU OS 3_2_1 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Mobile/7B405',
    'Referer' : 'http://android.gdgzez.com.cn/szxy/yanjiuxingxuexi/student_login.aspx'
}

idx = name + ' ' + no

# Login form fields. NOTE(review): __VIEWSTATE / __EVENTVALIDATION look like
# values copied from the page's hidden form inputs - confirm they are still
# accepted by the server if logins start failing.
postData = {
    '__VIEWSTATE' : '/wEPDwULLTE3NzI1OTE3OTFkZJk4xBOpTGvHILGFeCbFQfQQv9dbWzdoB6AOexN4BTx0',
    '__EVENTVALIDATION' : '/wEWBQLMmfO1BgL7uPQdAt765bwOAsaZ0ZUMApn3i+sBQi9nlVqoFrBfAjkxtVAWnUBZPnKm6VON7F01iBJzBXw=',
    'name' : name,
    'pwd' : '12345',
    'btnchange' : '登录',
    'xuehao' : no
}

postData = urllib.urlencode(postData)
request = urllib2.Request(posturl, postData, headers)
response = urllib2.urlopen(request, timeout=5)

# delete=False keeps the file on disk after the with-block so the browser
# can still read it.
with tempfile.NamedTemporaryFile(suffix='.html', delete=False) as f:
    f.write(response.read())

# Launch the browser only after the with-block has flushed and closed the
# file; opening it earlier could show an empty or truncated page.
openWithBrowser(f.name)

Good. Nothing changed. Then apply it to the table.

5:09 PM

# -*- coding: utf-8 -*-

import json
import datetime
import HTMLParser  
import urlparse  
import urllib  
import urllib2  
import cookielib  
import string  
import re
import sys
import threading
import os
import tempfile
from bs4 import BeautifulSoup
from prettytable import PrettyTable

# Python 2-only hack: sys.setdefaultencoding is removed from the sys namespace
# at interpreter startup, so the module is reloaded to get it back. Switching
# the default codec to UTF-8 lets implicit str/unicode conversions of the
# Chinese text below succeed instead of raising UnicodeDecodeError.
reload(sys)
sys.setdefaultencoding("utf-8")

def openWithBrowser(filename):
    """Open *filename* in the system's default web browser.

    The ``webbrowser`` module is called in-process instead of shelling out
    to ``python -m webbrowser``: the file name is then never parsed by a
    shell (so names containing quotes or ``$`` are safe) and no ``python``
    executable has to be on PATH.
    """
    import webbrowser  # local import keeps this block self-contained

    webbrowser.open(filename)

# Each run writes its pages into a fresh directory named after the start time.
version = datetime.datetime.now().strftime("%y-%m-%d %a %b %H-%M-%S result")
os.mkdir(version)

hosturl = 'http://android.gdgzez.com.cn/szxy/yanjiuxingxuexi/student_login.aspx'
posturl = 'http://android.gdgzez.com.cn/szxy/yanjiuxingxuexi/student_login.aspx'

# Install a global opener that stores cookies in cj, so every later
# urllib2.urlopen() call (including those made by get()) shares the jar.
cj = cookielib.LWPCookieJar()
cookie_support = urllib2.HTTPCookieProcessor(cj)
opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
urllib2.install_opener(opener)
# Fetch the login page once so any cookies it sets are in cj before the
# POSTs. Use the same 5 s timeout as get() so an unresponsive server cannot
# hang the whole batch before it starts.
h = urllib2.urlopen(hosturl, timeout=5)

headers = {
    'User-Agent' : 'Mozilla/5.0 (iPad; U; CPU OS 3_2_1 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Mobile/7B405',
    'Referer' : 'http://android.gdgzez.com.cn/szxy/yanjiuxingxuexi/student_login.aspx'
}

def get(name, no):
    """Log in as one student and save the returned page to disk.

    POSTs the login form to the module-level ``posturl`` with ``headers``
    and writes the response body to ``<version>/<no>.html``, inserting a
    UTF-8 ``<meta charset>`` tag right after ``<head>``.

    name -- value sent as the ``name`` form field.
    no   -- value sent as the ``xuehao`` form field; also the output file's
            base name.

    Raises whatever urllib2 raises for a failed or timed-out (5 s) request,
    and IOError if the output file cannot be written.
    """
    # NOTE(review): __VIEWSTATE / __EVENTVALIDATION look like values copied
    # from the page's hidden form inputs - confirm they are still accepted.
    postData = urllib.urlencode({
        '__VIEWSTATE' : '/wEPDwULLTE3NzI1OTE3OTFkZJk4xBOpTGvHILGFeCbFQfQQv9dbWzdoB6AOexN4BTx0',
        '__EVENTVALIDATION' : '/wEWBQLMmfO1BgL7uPQdAt765bwOAsaZ0ZUMApn3i+sBQi9nlVqoFrBfAjkxtVAWnUBZPnKm6VON7F01iBJzBXw=',
        'name' : name,
        'pwd' : '12345',
        'btnchange' : '登录',
        'xuehao' : no
    })

    request = urllib2.Request(posturl, postData, headers)
    response = urllib2.urlopen(request, timeout=5)
    # Read the whole body before creating the output file, so a failed read
    # does not leave an empty file behind, and always release the connection.
    try:
        page = response.read()
    finally:
        response.close()

    with open('{}/{}.html'.format(version, no), 'w') as f:
        f.write(page.replace('<head>', '<head><meta charset="utf-8">'))

with open('result_utf8.csv', "rb") as f:
    print version
    for line in f:
        (name, no, x1, x2) = line.split(',')
        try:
            get(name, no)
        except:
            pass

I spent some time trying to write the output to files with Chinese filenames, but eventually gave up. It even raised an exception when I printed the (Chinese) name to the console (a decoding issue).

Next, I'll write a reporter.

6:41 PM

# -*- coding: utf-8 -*-

import json
import datetime
import HTMLParser  
import urlparse  
import urllib  
import urllib2  
import cookielib  
import string  
import re
import sys
import threading
import os
import tempfile
from bs4 import BeautifulSoup
from prettytable import PrettyTable
import Tkinter

# Python 2-only hack: sys.setdefaultencoding is removed from the sys namespace
# at interpreter startup, so the module is reloaded to get it back. Switching
# the default codec to UTF-8 lets implicit str/unicode conversions of the
# Chinese text below succeed instead of raising UnicodeDecodeError.
reload(sys)
sys.setdefaultencoding("utf-8")

# Rows of result_utf8.csv, each split on commas; getname() reads column 0 as
# the name and column 1 as the student number. The with-block closes the file
# as soon as the rows are loaded.
with open('result_utf8.csv') as f:
    csv = [line.split(',') for line in f]

def getname(no):
    """Return column 0 of the first ``csv`` row whose column 1 equals *no*.

    Returns an empty string when no row matches.
    """
    return next((row[0] for row in csv if row[1] == no), '')

def getcourse(filename):
    """Extract the text of the selected-course table row from a saved page.

    Finds the first occurrence of '退选' in the file, then the first
    ``<tr>...</tr>`` that starts after it, and returns that row's cell text
    (everything before the first ``&`` entity), starting at the first digit.
    Newlines are removed and runs of spaces are collapsed.

    A matching row looks like this in the saved HTML:

        <tr>
            <td width="10%">
                <a id="GridView1_ctl02_LinkButton1" href="...">348</a>
            </td><td>12</td><td>生物培优班</td><td>xxx</td><td>&nbsp;</td>
            <td width="10%"><a ... >退选</a></td>

    filename -- path of the saved HTML page.

    Returns '' when the marker or a following complete row is not found.
    Raises IOError if the file cannot be read.
    """
    with open(filename) as f:
        s = f.read()

    marker = s.find('退选')
    if marker == -1:
        return ''

    trbegin = s.find('<tr>', marker)
    trend = s.find('</tr>', trbegin)
    if trbegin == -1 or trend == -1:
        return ''

    return _squeeze_row_text(_row_text(s, trbegin, trend))


def _row_text(s, begin, end):
    """Return the text found between tags in s[begin:end], up to the first '&'.

    Each text run is followed by one space. *end* must be the index of a
    ``</tr>`` in *s*, which guarantees the '<' and '>' lookups below succeed.
    """
    pieces = []
    i = begin
    while i < end:
        if s[i] == '<':
            # Skip the whole tag, attributes included.
            i = s.index('>', i) + 1
            continue

        stop = s.index('<', i)
        amp = s.find('&', i, stop)
        if amp != -1:
            # First entity in a text run (the &nbsp; cell in the example row):
            # keep what precedes it and stop scanning the row.
            pieces.append(s[i:amp])
            break

        pieces.append(s[i:stop])
        pieces.append(' ')
        i = stop
    return ''.join(pieces)


def _squeeze_row_text(text):
    """Drop everything before the first ASCII digit, then tidy whitespace.

    Runs of spaces become one space and newlines are deleted. Space runs that
    were separated only by newlines are not merged with each other.
    """
    first_digit = re.search('[0-9]', text)
    if first_digit is None:
        return ''
    tail = text[first_digit.start():]
    return re.sub(' +', ' ', tail).replace('\n', '')

def report():
    """Build an HTML summary of one spider run and open it in the browser.

    The directory to summarise is read from the ``workdir`` entry field. For
    every file in it, the base name (without extension) is looked up with
    getname() and the file is parsed with getcourse(); one table row is
    written per file, in sorted file-name order. The report goes to a
    temporary ``.html`` file that is left on disk.

    Raises OSError if the directory cannot be listed and IOError if a file
    in it cannot be read.
    """
    import webbrowser  # local import keeps this block self-contained

    wd = workdir.get()

    # Collect all rows before creating the temp file, so a bad directory or
    # unreadable page does not leave a half-written report behind. Paths are
    # joined with wd instead of chdir-ing into it, so the process's working
    # directory is never changed.
    rows = []
    for entry in sorted(os.listdir(wd)):
        # splitext also copes with file names containing more than one dot.
        no = os.path.splitext(entry)[0]
        name = getname(no)
        s = getcourse(os.path.join(wd, entry))
        rows.append('<tr><th>{}</th><th>{}</th><td>{}</td></tr>'.format(no, name, s))

    # delete=False keeps the file on disk for the browser to read.
    with tempfile.NamedTemporaryFile(suffix='.html', delete=False) as f:
        f.write('<head><meta charset="utf-8"></head>')
        f.write('<h1>Spider report</h1>')
        f.write('<p><b>Version {}</b></p>'.format(wd))
        f.write('<table>')
        f.write(''.join(rows))
        f.write('</table>')

    # Open the report only once it has been flushed and closed.
    webbrowser.open(f.name)

# Minimal window: a "Report" button followed by a text field holding the
# result directory that report() reads through `workdir`.
gui = Tkinter.Tk()
workdir = Tkinter.StringVar()

report_button = Tkinter.Button(gui, text="Report", command=report)
report_button.pack(side=Tkinter.LEFT)

workdir_entry = Tkinter.Entry(gui, textvariable=workdir, width=40)
workdir_entry.pack(side=Tkinter.LEFT)

gui.mainloop()

查看全文

相关阅读:
无聊，只发两张图……
LA
“万能数据库查询分析器”5.04 发布，撰写的相关技术文章达63篇
 HDU 1010Tempter of the Bone(奇偶剪枝回溯dfs)
uva 10051 Tower of Cubes(DAG最长路）
uva 103 Stacking Boxes（DAG）
异步处理（列出所有文件）
Android开发8：UI组件TextView,EditText,Button
植物-蔬菜：红菜苔
 植物-蔬菜：菜苔

原文地址：https://www.cnblogs.com/yanhuihang/p/7819698.html