# -*- coding: utf-8 -*-
"""Log in to the site at 210.42.121.241 and save the student score page.

Python 2 module (urllib2 / cookielib / cStringIO).  Typical use:

    mySpider = spider()
    im = mySpider.getCaptcha()      # show the image and read the code
    mySpider.studentID = '...'
    mySpider.password = '...'
    mySpider.captcha = '...'
    mySpider.loginMainPage()
    mySpider.getAndSaveScore()
"""
import cStringIO
import cookielib
import datetime
import sys
import urllib
import urllib2

from lxml import etree
from PIL import Image

# Python 2 hack: make implicit str <-> unicode conversions use UTF-8
# instead of ASCII for the whole process.
reload(sys)
sys.setdefaultencoding('utf8')

_USER_AGENT = ('Mozilla/5.0 (Windows NT 5.1; rv:25.0) '
               'Gecko/20100101 Firefox/25.0')


def setOpener():
    """Return a urllib2 opener that keeps cookies and sends a Firefox
    User-Agent header."""
    cookie = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
    # Replace the header list instead of appending to it: urllib2 skips a
    # header whose capitalized name is already set, so an appended
    # 'User-Agent' would lose to the default 'Python-urllib' entry.
    opener.addheaders = [('User-Agent', _USER_AGENT)]
    return opener


def md5(str):
    """Return the hex MD5 digest of *str*, or '' if it is not a string.

    Byte strings are hashed as-is.  Unicode strings are UTF-8 encoded
    first, so a unicode password does not silently hash to ''.
    """
    import hashlib

    # The parameter name is kept for callers; it shadows the builtin, so
    # work on a local alias and test against ``bytes`` (== str in Python 2).
    data = str
    if isinstance(data, unicode):
        data = data.encode('utf-8')
    if not isinstance(data, bytes):
        return ''
    return hashlib.md5(data).hexdigest()


class spider:
    """One session against the site: fetch captcha, log in, save scores.

    Set ``studentID``, ``password`` and ``captcha`` before calling
    ``loginMainPage``.
    """

    # Position of the CSRF token inside the onclick attribute of
    # <div id="school">.
    # NOTE(review): fixed offsets; they depend on the exact page markup.
    _TOKEN_SLICE = slice(65, 101)

    def __init__(self):
        self.opener = setOpener()  # keeps the session cookies
        self.imgUrl = 'http://210.42.121.241/servlet/GenImg'
        self.loginUrl = 'http://210.42.121.241/servlet/Login'
        self.queryScoreUrl = 'http://210.42.121.241/servlet/Svlt_QueryStuScore'
        self.studentID = ''
        self.password = ''
        self.captcha = ''  # captcha text read from the image
        self.mainPageContent = ''

    def _read(self, request):
        """Open *request* with the session opener and return the raw body,
        always closing the response."""
        response = self.opener.open(request)
        try:
            return response.read()
        finally:
            response.close()

    def getCaptcha(self):
        """Download the captcha and return it as a PIL image.

        The caller shows or saves the image (e.g. ``im.show()``) and stores
        the text the user reads in ``self.captcha``.
        """
        raw = self._read(urllib2.Request(self.imgUrl))
        return Image.open(cStringIO.StringIO(raw))

    def loginMainPage(self):
        """POST the credentials and captcha, then store the decoded main
        page in ``self.mainPageContent``."""
        # Form data to POST; the password is sent as its MD5 hex digest.
        postdata = urllib.urlencode({
            'id': self.studentID,
            'pwd': md5(self.password),
            'xdvfb': self.captcha,
        })
        req = urllib2.Request(url=self.loginUrl, data=postdata)
        self.mainPageContent = self._read(req).decode('gb2312')

    def getAndSaveScore(self):
        """Query the score page and write it to ``inputScore.html``.

        Returns 1 on success, or 0 (after printing an error) when the token
        cannot be extracted from the main page.
        """
        token = ''
        # Skip parsing when no main page was fetched; otherwise a missing
        # login would crash here instead of reaching the error path below.
        if self.mainPageContent:
            page = etree.HTML(self.mainPageContent)
            onclick = page.xpath('//div[@id="school"]/@onclick')
            if onclick:
                token = onclick[0][self._TOKEN_SLICE]
        if not token:
            # Single-argument print(...) behaves exactly like the Python 2
            # print statement.
            print("Error:未能正确打开主页面")
            return 0

        GMT_FORMAT = '%a, %d %b %Y %H:%M:%S GMT'
        GMT_time = datetime.datetime.utcnow().strftime(GMT_FORMAT)
        getParams = urllib.urlencode({
            'csrftoken': token,
            'learnType': '',
            'scoreFlag': '0',
            't': GMT_time,
            'term': '',
            'year': '0',
        })
        fullUrl = self.queryScoreUrl + '?' + getParams
        # The page is gb2312-encoded, so it has to be decoded first.
        result = self._read(urllib2.Request(fullUrl)).decode('gb2312')
        # Encode explicitly so the bytes written do not depend on the
        # process-wide default encoding; ``with`` closes the file on error.
        with open('inputScore.html', 'wb') as out:
            out.write(result.encode('utf-8'))
        return 1