zoukankan      html  css  js  c++  java
  • <双十一特辑> 模拟登录学校教务处爬取全校女生资料和头像

    
    
      1 #-*- coding=utf-8 -*- 
      2 import requests
      3 import re
      4 import json
      5 import time
      6 from PIL import Image
      7 import cStringIO
      8 import cookielib  
      9 import urllib
     10 import os
     11 import xlrd
     12 
     13 from requests.packages.urllib3.exceptions import InsecureRequestWarning,InsecurePlatformWarning
     14 requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
     15 requests.packages.urllib3.disable_warnings(InsecurePlatformWarning)
     16 
     17 data=xlrd.open_workbook('1.xlsx')
     18 table=data.sheet_by_name(u'Sheet1')
     19 
     20 message_url='https://matrix.dean.swust.edu.cn/acadmicManager/index.cfm?event=studentProfile:DEFAULT_EVENT'
     21 login_url='https://matrix.dean.swust.edu.cn/cas/login'
     22 topic_url=''
     23 flag=0
     24 temp=''
     25 pic_count=1
     26 
     27 student = {}
     28 student = {
     29     '学号':'',
     30     '姓名':'',
     31     '性别':'',
     32     '生日':'',
     33     'pic':'',
     34     '民族':'',
     35     '行政班':'',
     36     '专业':'',
     37     }
     38 
     39 headers={
     40 'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
     41 }
     42 
     43 session=requests.Session()
     44 session.headers=headers    
     45 session.cookies = cookielib.LWPCookieJar(filename='cookies') 
     46 # try:  
     47 #     session.cookies.load(ignore_discard=True)  
     48 # except:  
     49 #     print u"未登陆过,需先登录"  
     50 
     51 
     52 def get_lt(url="https://matrix.dean.swust.edu.cn/cas/login"):  
     53     '''''_lt 是一个动态变化的参数'''  
     54     global session
     55     index_url =  url
     56     index_page = session.get(index_url,verify=False)  
     57     html = index_page.content  
     58     pattern = r'name="lt" type="hidden" value="(.*?)"'  
     59     lt = re.findall(pattern, html)
     60     return lt[0]
     61 
     62 def login(username,password):
     63     global session
     64     global topic_url
     65     global flag
     66     data={
     67     'lt':get_lt(),
     68     'username':username,
     69     'password':password,
     70     'service':'https://matrix.dean.swust.edu.cn/acadmicManager/index.cfm?event=studentPortal:DEFAULT_EVENT',
     71     }
     72     loginurl=login_url
     73     try:
     74         login_page=session.post(loginurl,data=data)
     75         login_code=login_page.content
     76         pattern=r'<a class="btn btn-primary" href="(.*?)"'
     77         real_url=re.findall(pattern, login_code)
     78         topic_url=real_url[0]
     79         flag=1
     80     except:
     81         pass
     82     session.cookies.save()
     83 
     84 def error_clean(error_temp):
     85     global student
     86     global temp
     87     if(error_temp==temp):
     88         session.cookies.clear()
     89         student = {
     90         '学号':'',
     91         '姓名':'',
     92         '性别':'',
     93         '生日':'',
     94         'pic':'',
     95         '民族':'',
     96         '行政班':'',
     97         '专业':'',
     98         }
     99         flag=0
    100         topic_url=''
    101     else:
    102         pass
    103 
    104 
    105 
    106 
    107 def isLogin():  
    108     global session
    109     url = "https://matrix.dean.swust.edu.cn/acadmicManager/index.cfm?event=studentProfile:DEFAULT_EVENT"  
    110     login_code = session.get(url, allow_redirects=False).status_code  
    111     if int(x=login_code) == 200:  
    112         return True  
    113     else:  
    114         return False 
    115 
    116 def get_message():
    117     global session
    118     global topic_url
    119     global message_url
    120     global student
    121 
    122     html=session.get(topic_url)
    123     html=session.get(message_url).text
    124 
    125     pattern_ming=r'<td>(.*?)</td>'
    126     pattern_id=r'<span class="number">(.*?)</span>'
    127     pattern_pic=r'<td style="padding:0;" width="135" height="180" valign="middle" align="center" rowspan="6"><img width="135" height="180" align="middle" src="(.*?)" /></td>'
    128     message_name=re.findall(pattern_ming, html)
    129     message_pic=re.findall(pattern_pic, html)
    130     try:
    131         student['学号']=re.findall(r'<span class="number">(d*?)</span>', message_name[2])[0]
    132         student['姓名']=message_name[4]
    133         student['性别']=message_name[6]
    134         student['专业']=message_name[37]
    135         student['行政班']=message_name[27]
    136         student['pic']='https://matrix.dean.swust.edu.cn/acadmicManager/student/profile/'+student['学号']+'.jpg'
    137 
    138     except:
    139         pass
    140 
    141     #student['生日']=re.findall(r'<span class="number">(.*?)</span>', message_name[8])[0]
    142     #student['民族']=message_name[10]
    143 
    144 
    145 def download():
    146     global student
    147     global session
    148     global temp
    149     global pic_count
    150     basepath=os.path.abspath('.')
    151     savepath=os.path.join(basepath,student['专业'])
    152     if not os.path.exists(savepath):
    153         os.mkdir(savepath)
    154     try:
    155         picpath=os.path.join(savepath,student['姓名']+student['学号']+'.jpg')
    156         r=session.get(student['pic'])
    157         with open(picpath, "wb") as pic:
    158             pic.write(r.content)
    159         print u'>>>>>>>>>成功抓取>>>>>>>>>>>>>>>>>>>>'+student['姓名']
    160         temp=student['姓名']
    161         session.cookies.clear()
    162     except Exception, e:
    163         pass
    164     
    165 
    166 if __name__ == '__main__':
    167     count=table.nrows
    168     i=5000
    169     while(count>0):
    170         if(table.col_values(3)[i]==u'' and table.col_values(2)[i]!=u'王珀会'):
    171             try:
    172                 login(str(int(table.col_values(1)[i])), str(table.col_values(13)[i])[11:17])
    173             except:
    174                 pass
    175         if(flag==1):
    176             flag=0
    177             get_message()
    178             download()
    179         count=count-1
    180         i=i+1
    181         session.cookies.clear()

    总结:
    python处理excel>>  http://www.cnblogs.com/lhj588/archive/2012/01/06/2314181.html
    session释放>>    
    http://stackoverflow.com/questions/23816139/clear-cookies-from-requests-pytho
    注明:
      1.xlsx为提供学生资料的excel
      异常处理之间的妥协关系需要事先计划好
  • 相关阅读:
    ASP.NET MVC 3 新特性
    C#用WebClient下载File时操作超时的问题
    用C# 实现 Zen Cart 的用户密码加密算法
    ASP.NET MVC 局部缓存实现 用户控件缓存 Partial Output Caching
    关于MarshalByRefObject的解释
    文件的上传和下载
    浅谈三维GIS的应用之三维管线
    Python:处理不是经由EXPORT出來的Windows日志
    python :简单邮件发送
    日志文件分割:将包含不同关键字的行写入到不同的文件
  • 原文地址:https://www.cnblogs.com/vincebye/p/6049465.html
Copyright © 2011-2022 走看看