  • Two ways to log in to a website before crawling

    At the moment, for pages that require a login before they can be crawled, there are two workable methods.

    Method 1: pass the username and password in a FormRequest

    # -*- coding: utf-8 -*-
    import json
    import scrapy
    from scrapy import FormRequest
    from scrapy.mail import MailSender
    
    from bioon import settings
    from bioon.items import BioonItem
    
    class BioonspiderSpider(scrapy.Spider):
        name = "bioonspider"
        allowed_domains = ["bioon.com"]
        start_urls=['http://login.bioon.com/login']
        
        def parse(self,response):
            # get the session cookie from the Set-Cookie field of response.headers
            r_headers = response.headers['Set-Cookie']
            cookies_v = r_headers.split(';')[0].split('=')
            
            cookies = {cookies_v[0]:cookies_v[1]}
            
            # request headers that mimic a normal browser
            headers = {
            'Host':    'login.bioon.com',
            'Referer':'http://login.bioon.com/login',
            'User-Agent':'Mozilla/5.0 (Windows NT 6.1; rv:40.0) Gecko/20100101 Firefox/40.0',
            'X-Requested-With':'XMLHttpRequest' 
            }
            
            # get the CSRF verification token
            csrf_token = response.xpath(
                '//input[@id="csrf_token"]/@value').extract()[0]
            
            # get the destination URL that the form posts to
            login_url = response.xpath(
                '//form[@id="login_form"]/@action').extract()[0]
            end_login = response.urljoin(login_url)
            
            # build the POST form data
            formdata={
            # use your own registered account name
            'account':'********',
            'client_id':'usercenter',
            'csrf_token':csrf_token,
            'grant_type':'grant_type',
            'redirect_uri':'http://login.bioon.com/userinfo',
            # use your own registered username
            'username':'********',
            # use the password for your own account
            'password':'xxxxxxx',
            }
            
            # simulate the login request
            return FormRequest(
            end_login,
            formdata=formdata,
            headers=headers,
            cookies=cookies,
            callback=self.after_login
            )
    
        def after_login(self,response):
            
            self.log('Now handling bioon login page.')
            
            aim_url = 'http://news.bioon.com/Cfda/'
            
            obj = json.loads(response.body)
            
            print "Login state: ", obj['message']
            if "success" in obj['message']:
                self.logger.info("=========Login success.==========")
            
            return scrapy.Request(aim_url,callback = self.parse_list)
        
        def parse_list(self,response):
            
            lis_news = response.xpath(
                '//ul[@id="cms_list"]/li/div/h4/a/@href').extract()
            
            for li in lis_news:
                end_url = response.urljoin(li)
                yield scrapy.Request(end_url,callback=self.parse_content)
        
        def parse_content(self,response):
            
            head = response.xpath(
                '//div[@class="list_left"]/div[@class="title5"]')[0]
            
            item=BioonItem()
            
            item['title'] = head.xpath('h1/text()').extract()[0]
                
            # the title block reads "来源:<source> <date>"; split it into source and date
            item['source'] = head.xpath('p/text()').re(ur'来源:(.*?)\s(.*?)$')[0]
            
            item['date_time'] = head.xpath('p/text()').re(ur'来源:(.*?)\s(.*?)$')[1]
            
            item['body'] = response.xpath(
                '//div[@class="list_left"]/div[@class="text3"]').extract()[0]
            
            return item
    
            
        def closed(self,reason):
            # called by Scrapy when the spider finishes; mail the crawl stats
            # import pdb;pdb.set_trace()  # leftover debug breakpoint, kept disabled
            self.logger.info("Spider closed: %s"%str(reason))
            mailer = MailSender.from_settings(self.settings)
            mailer.send(
                to=["******@qq.com"], 
                subject="Spider closed", 
                body=str(self.crawler.stats.get_stats()), 
                cc=["**********@xxxxxxxx.com"]
                )
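
    As a side note, Scrapy also provides FormRequest.from_response, which can replace most of the manual work in parse(): it reads the form's action URL and pre-fills hidden inputs such as csrf_token from the login page, and the default cookie middleware carries the session cookie along automatically. A minimal sketch of the same login step under those assumptions (the field names are the ones shown in the form above):

    def parse(self, response):
        # let Scrapy pick up the action URL and hidden inputs (csrf_token etc.)
        return FormRequest.from_response(
            response,
            formxpath='//form[@id="login_form"]',
            formdata={
                'account': '********',   # your registered username
                'username': '********',
                'password': 'xxxxxxx',   # your password
            },
            callback=self.after_login,
        )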

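    One more note on Method 1: the closed() callback sends mail through MailSender.from_settings, which reads Scrapy's standard mail settings. A sketch of the entries to fill in settings.py (all values below are placeholders):

    # settings.py -- placeholder SMTP configuration for MailSender
    MAIL_HOST = 'smtp.example.com'
    MAIL_PORT = 25
    MAIL_FROM = 'spider@example.com'
    MAIL_USER = 'spider@example.com'
    MAIL_PASS = 'xxxxxxx'
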
    Method 2: attach login cookies to the request

    #-*- coding:utf-8 -*-
    import scrapy
    
    class StackOverflowSpider(scrapy.Spider):
        name = 'stackoverflow'
        # start_urls comes from the docs example and is ignored here,
        # because start_requests() below issues the request itself
        start_urls = ['http://stackoverflow.com/questions?sort=votes']
        
        def start_requests(self):
            url = "http://db.bioon.com/list.php?channelid=1016&classid=951"
            # cookie values copied from a browser session that is already logged in
            cookies = {
                'dz_username':'wst_today',
                'dz_uid':'1322052',
                'buc_key':'ofR1I78RBaCHkGp8MdBBRjMx7ustawtY',
                'buc_token':'a91b8fef55c66846d3975a9fd8883455'
            }
            return [
                scrapy.Request(url,cookies=cookies),
            ]
        
        def parse(self, response):
            # if the listing table header can be extracted, the cookie login worked
            ele = response.xpath(
                '//table[@class="table table-striped"]/thead/tr/th[1]/text()'
                ).extract()
            if ele:
                print "success"
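
    The cookie names and values above come from a browser session that is already logged in (for example, copied out of the developer tools). If it is easier to copy the whole Cookie header as one string, it can be converted into the dict that scrapy.Request(cookies=...) expects; a minimal sketch with a placeholder header string:

    # Sketch: turn a raw "Cookie:" header copied from the browser into a dict.
    raw_cookie = 'dz_username=wst_today; dz_uid=1322052; buc_key=xxxx; buc_token=xxxx'
    cookies = dict(
        part.strip().split('=', 1)   # split each "name=value" pair once
        for part in raw_cookie.split(';')
        if '=' in part
    )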
  • Original post: https://www.cnblogs.com/themost/p/7116691.html