zoukankan      html  css  js  c++  java
  • 抽屉爬取

    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy.http.cookies import CookieJar
    from scrapy.http import Request
    class ChoutiSpider(scrapy.Spider):
        """Log in to dig.chouti.com and up-vote every item on a listing page.

        Request flow:

        1. ``parse`` copies the cookies set by the landing page into
           ``cookie_dict`` and posts the login form with them.
        2. ``check_login`` prints the login response and requests the
           listing page.
        3. ``index`` sends one vote request per listed item.
        4. ``check_result`` prints each vote response.

        NOTE: per the original author's note, ``ROBOTSTXT_OBEY = True`` has to
        be commented out in the project settings for this spider to work.
        """

        name = 'chouti'
        allowed_domains = ['chouti.com']
        start_urls = ['https://dig.chouti.com/']
        # Cookies are needed by every later request, so they are kept on the
        # spider. The class-level dict is only a default kept for backward
        # compatibility; __init__ gives each instance its own dict so state is
        # not shared between spider instances.
        cookie_dict = {}

        def __init__(self, *args, **kwargs):
            """Initialise the spider with a private, empty cookie store."""
            super(ChoutiSpider, self).__init__(*args, **kwargs)
            self.cookie_dict = {}

        def parse(self, response):
            """Collect the landing-page cookies and submit the login form.

            Voting requires a logged-in session, so the cookies from the first
            response are stored and sent with every later request.
            """
            print(response.xpath("//div[@id='dig_lcpage']/a/@href"))
            print('=====')
            # Pull the cookies out of the response headers into a cookie jar.
            cookie_jar = CookieJar()
            cookie_jar.extract_cookies(response, response.request)
            # Flatten the jar into a plain name -> value dict. Iterating the
            # jar yields the Cookie objects directly, so there is no need to
            # walk the private domain/path/name mapping.
            for cookie in cookie_jar:
                self.cookie_dict[cookie.name] = cookie.value
            print("+++++")
            print(self.cookie_dict)

            # NOTE(review): the credentials below are hard-coded placeholders
            # from the original snippet; real ones should not live in source.
            yield scrapy.Request(
                url="https://dig.chouti.com/login",
                method="POST",
                body="phone=8618588888888&password=88888888&oneMonth=1",
                cookies=self.cookie_dict,
                headers={"content-type": "application/x-www-form-urlencoded; charset=UTF-8"},
                callback=self.check_login,
            )

        def check_login(self, response):
            """Print the login response, then request the listing page."""
            print(response.text)
            print("==============")
            yield scrapy.Request(
                "https://dig.chouti.com/all/hot/recent/1",
                cookies=self.cookie_dict,
                callback=self.index,
            )

        def index(self, response):
            """Yield one vote request for each listed item that has a link id."""
            div_list = response.xpath("//div[@id='content-list']/div[@class='item']")
            for div in div_list:
                link_id = div.xpath('*/div[@class="part2"]/@share-linkid').extract_first()
                print(link_id)
                if not link_id:
                    # extract_first() returns None when the attribute is
                    # missing; do not send a vote for "linksId=None".
                    continue
                # Vote request. HTTPS is used to match the other requests made
                # by this spider.
                yield Request(
                    url='https://dig.chouti.com/link/vote?linksId=%s' % (link_id,),
                    method='POST',
                    cookies=self.cookie_dict,
                    callback=self.check_result,
                )

        def check_result(self, response):
            """Print the body of a vote response."""
            print(response.text)
  • 相关阅读:
    windows10关闭更新,windowsUpdate禁用无效 windows无限重启 一分钟无限重启 win10无法连接到SENS服务
    Visual Studio项目/解决方案重命名
    关于Geometry中的一些简单形状
    无法打开http://localhost:6080/arcgis/manager/
    centOS无法联网
    FTP服务器搭建
    iFrame中dateGrid中数据不显示
    关于python的基础知识
    python中int str bool list dict数据操作方法汇总
    关于int str bool的讨论
  • 原文地址:https://www.cnblogs.com/chvv/p/10357288.html
Copyright © 2011-2022 走看看