zoukankan      html  css  js  c++  java
  • 爬豆瓣阅读遇到的问题

    1. 页面会先后发送 GET 和 POST 两个请求,数据只在 POST 请求的 response 中;因此需要把两个 response 区分开,只处理 POST 请求返回的 response。

    class DoubanSpider(scrapy.Spider):
        """Spider that pulls the Douban Read category listing from its JSON endpoint.

        The listing data is requested with a single POST to ``/j/kind/`` whose
        JSON body carries the paging parameters and a GraphQL query. The
        response is handed to ``parse``.
        """

        name = 'douban'
        allowed_domains = ['read.douban.com']
        page_num = 0
        # NOTE(review): "?kind/100" looks like a typo for "?kind=100" (compare
        # the Referer header in start_requests) -- confirm before relying on it.
        # start_requests below does not read url1 / start_urls.
        url1 = 'https://read.douban.com/category/?kind/100&page='
        start_urls = (
            url1+str(page_num),
                     )
    
    
        def start_requests(self):
            """Yield the POST request that returns the category listing as JSON.

            Yields:
                scrapy.Request: a POST to ``https://read.douban.com/j/kind/`` with
                a JSON-encoded body, routed to ``self.parse``.
            """
            url = 'https://read.douban.com/j/kind/'  
            headers = {
                "Content-Type": "application/json",
                "Referer": "https://read.douban.com/category/?kind=100&page=0&sort=hot",
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36",
            }
            # The GraphQL query spans many lines, so it has to be a triple-quoted
            # literal; a plain double-quoted string cannot contain raw newlines.
            payload = {"sort": "hot", "page": 1, "kind": 100,
                       "query": """
        query getFilterWorksList($works_ids: [ID!]) {
          worksList(worksIds: $works_ids) {
            
        
        title
        cover
        url
        isBundle
      
        
        url
        title
      
        
        author {
          name
          url
        }
        origAuthor {
          name
          url
        }
        translator {
          name
          url
        }
      
        
        abstract
        editorHighlight
      
        
        isOrigin
        kinds {
          
        name @skip(if: true)
        shortName @include(if: true)
        id
      
        }
        ... on WorksBase @include(if: true) {
          wordCount
          wordCountUnit
        }
        ... on WorksBase @include(if: true) {
          
        isEssay
        
        ... on EssayWorks {
          favorCount
        }
      
        
        isNew
        
        averageRating
        ratingCount
        url
      
      
      
        }
        ... on WorksBase @include(if: false) {
          isColumn
          isEssay
          onSaleTime
          ... on ColumnWorks {
            updateTime
          }
        }
        ... on WorksBase @include(if: true) {
          isColumn
          ... on ColumnWorks {
            isFinished
          }
        }
        ... on EssayWorks {
          essayActivityData {
            
        title
        uri
        tag {
          name
          color
          background
          icon2x
          icon3x
          iconSize {
            height
          }
          iconPosition {
            x y
          }
        }
      
          }
        }
        highlightTags {
          name
        }
      
        ... on WorksBase @include(if: false) {
          
        fixedPrice
        salesPrice
        isRebate
      
        }
        ... on EbookWorks {
          
        fixedPrice
        salesPrice
        isRebate
      
        }
        ... on WorksBase @include(if: true) {
          ... on EbookWorks {
            id
            isPurchased
            isInWishlist
          }
        }
      
            id
            isOrigin
          }
        }
      """,
                       "variables": {}}
    
            # scrapy.Request defaults to GET; the endpoint is meant to be called
            # with POST, so the method has to be given explicitly.
            yield scrapy.Request(
                url,
                method="POST",
                headers=headers,
                body=json.dumps(payload),
                callback=self.parse,
            )

    2.分开之后处理response信息。

        def parse(self, response):
            """Convert the JSON listing in ``response`` into one item per book.

            Args:
                response: the response to the listing request; its text is parsed
                    as JSON and must contain a top-level ``"list"`` array.

            Yields:
                DoubanspiderItem: one populated item for every entry of ``"list"``.
            """
            works = json.loads(response.text)["list"]
            for work in works:
                # Build a fresh item per book and yield it inside the loop, so
                # every entry is emitted rather than only the last one.
                item = DoubanspiderItem()
                item["book"] = work["title"]
                # NOTE(review): assumes origAuthor is a single object with a
                # "name" key -- confirm against a live response.
                item["author"] = work["origAuthor"]["name"]
                # salesPrice / averageRating are the field names requested by the
                # GraphQL query in start_requests; .get() is used because the
                # query only asks for salesPrice on some work types.
                # NOTE(review): confirm these keys against a live response.
                item["price"] = work.get("salesPrice")
                item["number"] = work["wordCount"]
                item["grade"] = work.get("averageRating")
                item["info"] = work["abstract"]
                yield item
  • 相关阅读:
    win平台搭建Lnmp环境
    YII2 model where 条件拼接
    yii2框架-yii2局部关闭(开启)csrf的验证
    Yii2 控制器单独向view(layout)传值
    Yii2 数据库基本操作
    PHP 多线程采集
    php 阿拉伯数字转中文
    javascript里的sleep()方法
    PHP数组内容不重复组合排列算法
    git使用经验(一)
  • 原文地址:https://www.cnblogs.com/xuezhihao/p/11658776.html
Copyright © 2011-2022 走看看