  • Scrapy framework: sending requests manually and sending POST requests

     

    Sending requests manually

    import scrapy
    
    from choutiSpider.items import ChoutispiderItem
    
    
    class ChoutiSpider(scrapy.Spider):
        name = 'chouti'
        # allowed_domains = ['www.xxx.com']
        start_urls = ['https://dig.****.com/r/scoff/hot/1']
    
        # URL template used for paging
        url = 'https://dig.****.com/r/scoff/hot/%s'
        page_num = 1
    
        def parse(self, response):
            div_list = response.xpath('//div[@id="content-list"]/div')
            # print(div_list)
    
            print(self.page_num)
            for div in div_list:
                content = div.xpath('./div[@class="news-content"]/div[1]/a/text()').extract_first().strip()
                author = div.xpath('./div[@class="news-content"]/div[2]/a[4]/b/text()').extract_first()
                # print(content, author)
                item = ChoutispiderItem()
                item['author'] = author
                item['content'] = content
                # Yield the item to the pipeline
                yield item
            # Manually send requests for paginated crawling
            if self.page_num < 120:
                self.page_num += 1
                new_url = self.url % self.page_num
                # Issue the request for the next page
                yield scrapy.Request(url=new_url, callback=self.parse)
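
    The spider above fills item['author'] and item['content'], so the ChoutispiderItem it imports must declare matching fields. A minimal sketch of choutiSpider/items.py, with the field set inferred from the spider (the file itself is not shown in the original post):

    import scrapy
    
    
    class ChoutispiderItem(scrapy.Item):
        # fields inferred from the item['author'] / item['content'] assignments above
        author = scrapy.Field()
        content = scrapy.Field()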

    Sending POST requests

    # In Scrapy, cookies are handled automatically by default; no manual handling is needed!
    
    class PostdemoSpider(scrapy.Spider):
        name = 'postdemo'
        # allowed_domains = ['www.xxx.com']  # commented out so the offsite middleware does not filter the POST target
        start_urls = ['https://fanyi.****.com/sug']
        # Override start_requests, the parent-class method that issues the initial requests
        def start_requests(self):
            for url in self.start_urls:
                data = {
                    'kw': 'cat'
                }
                yield scrapy.FormRequest(url=url, formdata=data, callback=self.parse)
    
        def parse(self, response):
            print(response.text)
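
    FormRequest sends formdata form-encoded (application/x-www-form-urlencoded). If an endpoint expects a JSON body instead, a plain scrapy.Request with an explicit method, body, and Content-Type header works; below is a hypothetical spider for such an endpoint (the URL is a placeholder). Scrapy 1.8+ also ships scrapy.http.JsonRequest, which handles the serialization for you.

    import json
    
    import scrapy
    
    
    class JsonPostSpider(scrapy.Spider):
        name = 'jsonpost'
        start_urls = ['https://www.example.com/api']  # placeholder endpoint
    
        def start_requests(self):
            payload = {'kw': 'cat'}
            yield scrapy.Request(
                url=self.start_urls[0],
                method='POST',
                body=json.dumps(payload),  # Request does not encode dicts for us
                headers={'Content-Type': 'application/json'},
                callback=self.parse,
            )
    
        def parse(self, response):
            print(response.text)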

    In Scrapy, cookies are handled automatically by default; there is no need to manage them manually!

    settings configuration:

    # Disable cookies (enabled by default)
    # COOKIES_ENABLED = False
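
    Leaving COOKIES_ENABLED at its default (cookies on) is what lets session cookies flow between requests automatically. If one request does need explicit cookie values, scrapy.Request also accepts a cookies dict; a minimal sketch with placeholder URL and values:

    # inside a spider callback; URL and cookie values are placeholders
    yield scrapy.Request(
        url='https://www.example.com/profile',
        cookies={'sessionid': 'xxxx'},
        callback=self.parse,
    )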

    Passing data between requests (meta):

    • Passing the item down to a second-level detail page
    import scrapy
    
    from boosPro.items import BoosproItem
    
    
    class BoosSpider(scrapy.Spider):
        name = 'boos'
        # allowed_domains = ['www.xxx.com']
        start_urls = ['https://www.****.com/c101010100/?query=python%E7%88%AC%E8%99%AB&page=1']
    
        url = 'https://www.****.com/c101010100/?query=python%E7%88%AC%E8%99%AB&page=%s'
        page_num = 1
    
        def parse(self, response):
            li_list = response.xpath('//div[@class="job-list"]/ul/li')
            for li in li_list:
                item = BoosproItem()
                title = li.xpath('.//div[@class="job-title"]/text()').extract_first()
                # salary
                salary = li.xpath('.//div[@class="info-primary"]/h3/a/span/text()').extract_first()
                # company
                company = li.xpath('.//div[@class="company-text"]/h3/a/text()').extract_first()
                detail_url = 'https://www.zhipin.com' + li.xpath('.//div[@class="info-primary"]/h3/a/@href').extract_first()
                item['title'] = title
                item['salary'] = salary
                item['company'] = company
    
                # Manually send a request for the detail page URL, passing the item along via meta
                yield scrapy.Request(url=detail_url, callback=self.parse_detail, meta={'item': item})
    
            if self.page_num <= 3:
                self.page_num += 1
                new_url = self.url % self.page_num
                yield scrapy.Request(url=new_url, callback=self.parse)
    
        # Parses the related data on the detail page
        def parse_detail(self, response):
            # Retrieve the item passed through meta
            item = response.meta['item']
            job_desc = response.xpath('//*[@id="main"]/div[3]/div/div[2]/div[2]/div[1]/div//text()').extract()
            company_content = response.xpath('//*[@id="main"]/div[3]/div/div[2]/div[2]/div[2]/div/text()').extract_first()
    
            job_desc = ' '.join(job_desc)
            item['job_desc'] = job_desc
            item['company_content'] = company_content
            # print(job_desc, 1111111)
    
            yield item
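
    meta works on every Scrapy version, but the meta dict is also used by Scrapy's own middlewares, so user data can collide with framework keys. Since Scrapy 1.7 the dedicated cb_kwargs argument is the cleaner channel: the callback receives the data as ordinary parameters. A hypothetical rewrite of the handoff above, using the same xpath:

    # in parse(): pass the item as a callback keyword argument instead of via meta
    yield scrapy.Request(url=detail_url, callback=self.parse_detail, cb_kwargs={'item': item})
    
    # the callback then declares the item as a normal parameter
    def parse_detail(self, response, item):
        job_desc = response.xpath('//*[@id="main"]/div[3]/div/div[2]/div[2]/div[1]/div//text()').extract()
        item['job_desc'] = ' '.join(job_desc)
        yield item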