  • Scrapy Crawler Series, Part 6: Simulating Login

    Key points: how to access pages behind a login by sending requests that carry cookies, and how to log in by sending a POST request.

    Target sites: bilibili, github

    Full code: https://files.cnblogs.com/files/bookwed/login.zip

    Main code:

    bili.py

    # -*- coding: utf-8 -*-
    import scrapy
    import re
    
    
    class BiliSpider(scrapy.Spider):
        """直接携带cookie访问登录后的bilibili页面"""
        name = 'bili'
        allowed_domains = ['bilibili.com']
        # Personal homepage, only reachable after login
        start_urls = ['https://account.bilibili.com/home/userInfo']
    
        def start_requests(self):
            cookies = "_uuid=738F48A9-E13A-9445-3577-3068FADC9F6A05981infoc; buvid3=5DE9F436-F051-44E1-9B97-AB53E60C3ED448999infoc;"
            cookies = {i.split("=")[0]: i.split("=")[1] for i in cookies.split("; ")}
            # 把cookies字符串放到headers里面传参,这种方式不行,要单独传cookies参数
            # headers={"Cookie": cookies}
            print(cookies)
            yield scrapy.Request(
                self.start_urls[0],
                callback=self.parse,
                cookies=cookies,
                # headers=headers
            )
    
    
        def parse(self, response):
            # Verify the login worked: look for the username in the response body
            print("*"*30)
            print(re.findall("bookwed", response.body.decode()))
            print("*"*30)
    
            # yield scrapy.FormRequest(
            #     "http://",
            #     headers=self,
            #     formdata=dict(),
            #     callback=self.after_login
            # )
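
    Once that first request has seeded Scrapy's cookie jar, the default cookies middleware re-sends the session cookies on every later request from the same spider, so they do not need to be passed again. A minimal sketch of how parse() could continue after the verification prints; the follow-up URL and the parse_history callback are illustrative, not part of the original code:

        def parse(self, response):
            # ...verification prints from above, then follow another logged-in-only page;
            # no cookies argument needed, CookiesMiddleware re-sends the session cookies
            yield scrapy.Request(
                "https://account.bilibili.com/home/history",
                callback=self.parse_history,
            )

        def parse_history(self, response):
            print(response.url, response.status)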

    github.py

    # -*- coding: utf-8 -*-
    import scrapy
    import re
    
    
    class GithubSpider(scrapy.Spider):
        """利用scrapy发送post请求,模拟登录github"""
        """注意点:针对form表单有action地址的情况,可以直接请求action,参考github2.py"""
        name = 'github'
        allowed_domains = ['github.com']
        start_urls = ['https://github.com/login']
    
        def parse(self, response):
            authenticity_token = response.xpath("//input[@name='authenticity_token']/@value").extract_first()
            commit = response.xpath("//input[@name='commit']/@value").extract_first()
            utf8 = response.xpath("//input[@name='utf8']/@value").extract_first()
            webauthn_support = response.xpath("//input[@name='webauthn-support']/@value").extract_first()
            # login = response.xpath("//input[@name='login']/@value").extract_first()
            # password = response.xpath("//input[@name='password']/@value").extract_first()
            post_data = dict(
                login="aa@163.com",
                password="aaaaaa",
                commit=commit,
                utf8=utf8,
                authenticity_token=authenticity_token,
                webauthn_support=webauthn_support
            )
            yield scrapy.FormRequest(
                "https://github.com/session",   #发送post请求登录接口
                formdata=post_data,
                callback=self.after_login
            )
    
            # Another way to send a POST request: set the request method to POST explicitly (see the sketch after this listing)
            # yield scrapy.Request(
            #     "https://github.com/session",
            #     method='POST',
            #     body=
            # )
    
        def after_login(self,response):
            # If unsure what came back, save the response to a local file first and inspect it
            # with open('aa.html', 'w', encoding='utf-8') as f:
            #     f.write(response.body.decode())
            print("*"*30)
            print(re.findall('wed', response.body.decode()))
            print("*"*30)

    github2.py

    # -*- coding: utf-8 -*-
    import scrapy
    import re
    
    
    class Github2Spider(scrapy.Spider):
        """对于form表单有action地址的情况,可以直接请求action,只用传用户名密码即可"""
        name = 'github2'
        allowed_domains = ['github.com']
        start_urls = ['https://github.com/login']
    
        # Note: if the page contains several forms, pick the right one with an extra argument such as formname, formid, formnumber or formxpath (see the sketch after this listing)
        def parse(self, response):
            yield scrapy.FormRequest.from_response(
                response,   # scrapy locates the form in the response automatically
                formdata={"login": "aa@163.com", "password": "aaaaaa"},     # keys are the input names on the page, values are what to submit
                callback=self.after_login
            )
    
        def after_login(self, response):
            print("*" * 30)
            print(re.findall('wed', response.body.decode()))
            print("*" * 30)
  • Original article: https://www.cnblogs.com/bookwed/p/10648522.html