zoukankan      html  css  js  c++  java
  • 抽屉爬取

    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy.http.cookies import CookieJar
    from scrapy.http import Request
    class ChoutiSpider(scrapy.Spider):
        """Log in to dig.chouti.com and up-vote every item on the hot list.

        Request chain: ``parse`` (collect cookies from the landing page and
        POST the login form) -> ``check_login`` (fetch the hot list) ->
        ``index`` (one vote POST per listed item) -> ``check_result``.
        """

        name = 'chouti'
        allowed_domains = ['chouti.com']
        start_urls = ['https://dig.chouti.com/']
        # NOTE: ROBOTSTXT_OBEY = True has to be commented out in the project
        # settings for this spider.
        # Cookies are needed by the later requests as well, so they are kept
        # on the spider. This class-level dict is only the default value:
        # parse() rebinds ``self.cookie_dict`` instead of mutating this shared
        # object, so separate spider instances do not leak cookies into each
        # other.
        cookie_dict = {}

        def parse(self, response):
            """Store the cookies set by the landing page and submit the login.

            Voting requires a logged-in session, so the cookies found in the
            response headers are copied into ``self.cookie_dict`` and sent
            along with the login POST and every later request.

            Yields:
                The login request, handled by ``check_login``.
            """
            self.logger.debug(
                "pagination links: %s",
                response.xpath("//div[@id='dig_lcpage']/a/@href")
            )

            # Let a CookieJar parse the Set-Cookie headers of the response.
            cookie_jar = CookieJar()
            cookie_jar.extract_cookies(response, response.request)

            # Iterate the jar through its public iterator (one Cookie object
            # per stored cookie) instead of walking the private ``_cookies``
            # domain -> path -> name mapping by hand.
            cookies = dict(self.cookie_dict)
            for cookie in cookie_jar:
                cookies[cookie.name] = cookie.value
            self.cookie_dict = cookies
            self.logger.debug("cookies: %s", self.cookie_dict)

            # NOTE(review): the credentials below are hard-coded placeholders;
            # replace them with real values from a non-committed source.
            yield scrapy.Request(
                url="https://dig.chouti.com/login",
                method="POST",
                body="phone=8618588888888&password=88888888&oneMonth=1",
                cookies=self.cookie_dict,
                headers={"content-type": "application/x-www-form-urlencoded; charset=UTF-8"},
                callback=self.check_login
            )

        def check_login(self, response):
            """Log the login response body and request the hot-list page.

            Yields:
                The hot-list request, handled by ``index``.
            """
            self.logger.info("login response: %s", response.text)
            yield scrapy.Request(
                "https://dig.chouti.com/all/hot/recent/1",
                cookies=self.cookie_dict,
                callback=self.index
            )

        def index(self, response):
            """Send one vote request for every item found on the list page.

            Yields:
                A POST vote request per item, handled by ``check_result``.
            """
            div_list = response.xpath("//div[@id='content-list']/div[@class='item']")
            for div in div_list:
                link_id = div.xpath('*/div[@class="part2"]/@share-linkid').extract_first()
                if not link_id:
                    # extract_first() returns None when the attribute is
                    # missing; voting for "linksId=None" would be meaningless.
                    self.logger.warning("item without share-linkid, skipping")
                    continue
                self.logger.debug("voting for link %s", link_id)
                # Vote request. Uses https like every other request so the
                # session cookies are not sent in plaintext and the POST is
                # not exposed to an http -> https redirect.
                yield scrapy.Request(
                    url='https://dig.chouti.com/link/vote?linksId=%s' % (link_id,),
                    method='POST',
                    cookies=self.cookie_dict,
                    callback=self.check_result
                )

        def check_result(self, response):
            """Log the body of the vote response."""
            self.logger.info("vote response: %s", response.text)
  • 相关阅读:
    PB中的Grid视图
    MVC加jquery的无刷新列表分页摘要
    Quartz.Net 1.30的一些设置说明
    将Excel的数据库字典导到PDM中
    HubbleDotNet使用备忘
    EntLib5.0 日志应用程序块(logging) 使用与配置
    生成随机密码
    网站整合QQ登录
    PB代码参考段
    SQL查询之 Pivot 详解
  • 原文地址:https://www.cnblogs.com/chvv/p/10357288.html
Copyright © 2011-2022 走看看