zoukankan      html  css  js  c++  java
  • 爬虫day 04(通过登录去爬虫 解决django的csrf_token)

    #通过登录去爬虫
    #首先要有用户名和密码
    import http.cookiejar
    import urllib.parse
    import urllib.request

    from lxml import etree
    # Default request headers; makeMyOpener() copies these onto the opener so
    # they accompany each request made through it. The User-Agent is a
    # browser-style string (Trident/7.0, rv:11.0).
    head = {
        'Connection': 'Keep-Alive',
        'Accept': 'text/html, application/xhtml+xml, */*',
        'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko'
    }
    # 给opener加上cookie
    def makeMyOpener(head):
        """Build a urllib opener that stores cookies and sends the given headers.

        ``head`` is a mapping of header name to header value. Its items are
        installed, in iteration order, as the opener's ``addheaders`` list.
        Returns the configured opener.
        """
        cookie_jar = http.cookiejar.CookieJar()
        cookie_handler = urllib.request.HTTPCookieProcessor(cookie_jar)
        opener = urllib.request.build_opener(cookie_handler)
        # addheaders expects a list of (name, value) tuples.
        opener.addheaders = [(name, value) for name, value in head.items()]
        return opener
    # Fetch our own login page. The opener carries a cookie jar, so any cookie
    # the server sets here is sent back on the login POST below.
    oper = makeMyOpener(head)
    with oper.open('http://127.0.0.1:8000/index/loginHtml/', timeout=1000) as uop:
        data = uop.read()
    html = data.decode()

    # Use lxml to extract the csrfmiddlewaretoken value from the login form.
    selector = etree.HTML(html)
    links = selector.xpath('//form/input[@name="csrfmiddlewaretoken"]/@value')
    if not links:
        # Without a token the POST below cannot be built; fail with a clear
        # message instead of a NameError on csrfmiddlewaretoken.
        raise RuntimeError('csrfmiddlewaretoken not found in the login page')
    for link in links:
        # If several matching inputs exist, the last one wins.
        csrfmiddlewaretoken = link
        print(link)

    url = 'http://127.0.0.1:8000/index/login/'
    datas = {'csrfmiddlewaretoken': csrfmiddlewaretoken, 'email': 'aa', 'pwd': 'aa'}
    # The request body must be bytes: url-encode the form, then encode as UTF-8.
    data_encoded = urllib.parse.urlencode(datas).encode(encoding='utf-8')
    # Passing a data argument makes the opener send a POST request.
    with oper.open(url, data_encoded) as response:
        content = response.read()
    html = content.decode()
    print(html)
  • 相关阅读:
    java小知识点5
    java小知识点4
    java小知识点3
    编程之法:面试和算法心得(寻找最小的k个数)
    389. Find the Difference
    104. Maximum Depth of Binary Tree
    485. Max Consecutive Ones
    693. Binary Number with Alternating Bits
    463. Island Perimeter
    566. Reshape the Matrix
  • 原文地址:https://www.cnblogs.com/qieyu/p/7818511.html
Copyright © 2011-2022 走看看