zoukankan      html  css  js  c++  java
  • 【python爬虫】coursera抓取

      1 # -*- coding: utf-8 -*-
      2 #!/usr/bin/env python
      3 
      4 """
      5 用于抓取coursera网站的下载链接
      6 """
      7 
      8 import sys
      9 import string
     10 import re,random
     11 import urllib,urllib2
     12 import cookielib
     13 import getpass
     14 
     15 
     class Coursera(object):
         """Log in to Coursera and collect lecture download links.

         Simulates the sign-in request, fetches the lecture page, extracts the
         MP4 and PDF links with regular expressions and saves them to files.

         Attributes:
             login_url: URL of the real login endpoint.
             url: URL of the page that is scraped for download links.
             user_name: the user's login e-mail.
             password: the user's login password.
         """

         def __init__(self, url, user_name, password):
             """Store the target URL and the credentials.

             Raises:
                 UserOrPwdNone: if the user name or the password is empty.
             """
             # Validate first; the raise alone aborts construction, so no
             # sys.exit() is needed (it was unreachable after the raise).
             if not user_name or not password:
                 raise UserOrPwdNone("the username or password can't empty string")
             self.login_url = "https://accounts.coursera.org/api/v1/login"
             self.url = url
             self.user_name = user_name
             self.password = password

         def simulation_login(self):
             """Send the simulated login request.

             Prints a diagnostic and exits the process with status 2 when the
             request fails with a urllib2.URLError.
             """
             cookie_jar = cookielib.CookieJar()
             opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie_jar))
             # Installed globally so the later urllib2.urlopen() call in
             # get_links() goes through the same cookie-aware opener.
             urllib2.install_opener(opener)
             form_data, request_header = self.structure_headers()
             req = urllib2.Request(self.login_url, data=form_data, headers=request_header)
             try:
                 result = urllib2.urlopen(req)
             except urllib2.URLError as e:
                 if hasattr(e, "code"):
                     print("The server couldn't fulfill the request.Please check your url and read the Reason")
                     print("Error code: %s" % e.code)
                 elif hasattr(e, "reason"):
                     print("We failed to reach a server. Please check your url and read the Reason")
                     print("Reason: %s" % e.reason)
                 sys.exit(2)
             try:
                 if result.getcode() == 200:
                     print("登录成功...")
             finally:
                 # The response body is not needed; release the connection.
                 result.close()

         def structure_headers(self):
             """Build the login form body and the request headers.

             Returns:
                 A (form_data, request_header) tuple: the URL-encoded form
                 string and a dict of HTTP headers.
             """
             # Simulated form data; urlencode() turns the dict into the
             # encoded string that Request expects as its data argument.
             form_data = urllib.urlencode({
                 "email": self.user_name,
                 "password": self.password,
                 "webrequest": "true"
             })
             user_agent = ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) "
                           "AppleWebKit/537.36 (KHTML, like Gecko) "
                           "Chrome/38.0.2125.111 Safari/537.36")
             # random_string() already returns a str, so no extra join is needed.
             XCSRF2Cookie = 'csrf2_token_%s' % self.random_string(8)
             XCSRF2Token = self.random_string(24)
             XCSRFToken = self.random_string(24)
             cookie = "csrftoken=%s; %s=%s" % (XCSRFToken, XCSRF2Cookie, XCSRF2Token)

             request_header = {
                 "Referer": "https://accounts.coursera.org/signin",  # source page of the request
                 "User-Agent": user_agent,  # present the client as a browser
                 "X-Requested-With": "XMLHttpRequest",
                 "X-CSRF2-Cookie": XCSRF2Cookie,
                 "X-CSRF2-Token": XCSRF2Token,
                 "X-CSRFToken": XCSRFToken,
                 "Cookie": cookie
             }

             return form_data, request_header

         def random_string(self, length):
             """Return a random string of `length` ASCII letters and digits."""
             # string.letters is locale-dependent on Python 2; ascii_letters
             # keeps the generated header tokens plain ASCII.
             alphabet = string.ascii_letters + string.digits
             return ''.join(random.choice(alphabet) for _ in range(length))

         def get_links(self):
             """Fetch the page at self.url and extract MP4 and PDF links.

             Prints a diagnostic and exits the process with status 2 when the
             request fails with a urllib2.URLError.

             Returns:
                 A (down_links, down_pdfs) tuple of lists of matched hrefs.
             """
             try:
                 result = urllib2.urlopen(self.url)
             except urllib2.URLError as e:
                 if hasattr(e, "code"):
                     print("The server couldn't fulfill the request.")
                     print("Error code: %s" % e.code)
                 elif hasattr(e, "reason"):
                     print("We failed to reach a server. Please check your url and read the Reason")
                     print("Reason: %s" % e.reason)
                 sys.exit(2)
             try:
                 content = result.read().decode("utf-8")
             finally:
                 result.close()
             print("读取网页成功...")
             down_links = re.findall(r'<a.*?href="(.*?mp4.*?)"', content)
             down_pdfs = re.findall(r'<a.*?href="(.*?pdf)"', content)
             print("正则匹配结束...")
             return down_links, down_pdfs

         def _save_links(self, path, links):
             """Write one link per line to `path`, encoded as UTF-8."""
             import io  # local import: the module header does not import io
             # An explicit encoding avoids the implicit ASCII encode that made
             # the original exit silently on non-ASCII links.
             with io.open(path, "w", encoding="utf-8") as out:
                 for link in links:
                     out.write(link + u"\n")

         def start_spider(self):
             """Run the spider and write the MP4 and PDF links to two files."""
             self.simulation_login()
             down_links, down_pdfs = self.get_links()
             print("下载链接的长度 %d" % len(down_links))
             for link in down_links:
                 print(link)
             self._save_links("coursera.html", down_links)
             print("下载pdf的长度 %d" % len(down_pdfs))
             self._save_links("coursera.pdf", down_pdfs)
             print("抓取Coursera课程下载链接和pdf链接成功")
    140         
    141         
    class UserOrPwdNone(Exception):
        """Raised when the username or password is an empty string.

        Derives from Exception rather than BaseException so that ordinary
        ``except Exception`` handlers can catch it.
        """
    146 
    def main():
        """Ask for Coursera credentials and run the link spider.

        The course slug is read from the first command-line argument and
        defaults to "python" when no argument is given.
        """
        url = "https://class.coursera.org/{course}/lecture"
        course = sys.argv[1] if len(sys.argv) > 1 else "python"
        # Credentials are requested at run time instead of being hard-coded
        # in the source; getpass keeps the password off the terminal.
        user_name = raw_input("Input your Email > ")
        password = getpass.getpass("Input your Password > ")
        spider = Coursera(url.format(course=course), user_name, password)
        spider.start_spider()


    # Run the spider only when the file is executed as a script.
    if __name__ == '__main__':
        main()

    通过谷歌浏览器开发者工具的 Network 面板分析 HTTP 请求头中的内容,然后自行构造请求头,模拟登录。

    对比发现:请求头中的 X-CSRF2-Token 和 X-CSRFToken 是完全随机的,X-CSRF2-Cookie 的后 8 位也是随机生成的,它们都由字母和数字组成。

    于是就有了这样的请求头代码:

    def structure_headers(self) :
            #模拟表单数据,这个参数不是字典
            form_data = urllib.urlencode({
                "email": self.user_name,
                "password": self.password,
                "webrequest": "true"
            })  
            user_agent = ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/38.0.2125.111 Safari/537.36")
            XCSRF2Cookie = 'csrf2_token_%s' % ''.join(self.random_string(8))
            XCSRF2Token = ''.join(self.random_string(24))
            XCSRFToken = ''.join(self.random_string(24))
            cookie = "csrftoken=%s; %s=%s" % (XCSRFToken, XCSRF2Cookie, XCSRF2Token)
    
            request_header = {
                "Referer": "https://accounts.coursera.org/signin",  #对付防盗链设置, 为跳转来源的url
                "User-Agent": user_agent, #伪装成浏览器访问
                "X-Requested-With": "XMLHttpRequest",
                "X-CSRF2-Cookie": XCSRF2Cookie,
                "X-CSRF2-Token": XCSRF2Token,
                "X-CSRFToken": XCSRFToken,
                "Cookie": cookie
            }
            return form_data, request_header
    
        def random_string(self, length):
            """Return a random string of `length` ASCII letters and digits."""
            # string.letters is locale-dependent on Python 2; ascii_letters
            # keeps the generated header tokens plain ASCII.
            alphabet = string.ascii_letters + string.digits
            return ''.join(random.choice(alphabet) for _ in range(length))
    View Code

    最后的运行结果:

    因为输入的请求链接(课程页面 URL)不正确,所以抓取到的下载链接数量都是 0。

  • 相关阅读:
    平台
    重构之践
    Linux.NET
    系统分析员级下午试题II(论文)解答方法
    通用泛型存储接口的设计
    .NET平台4.0 发布网站流程及出错总结
    在IIS上发布基于Windows Azure Service Bus的WCF服务
    epoll + 多线程实现并发网络连接处理
    Linux进程地址空间之初探:一
    排序、搜索
  • 原文地址:https://www.cnblogs.com/fjl-vxee/p/6694923.html
Copyright © 2011-2022 走看看