  • Scrapy framework: sending requests manually, and sending POST requests

    Sending requests manually

    import scrapy
    
    from choutiSpider.items import ChoutispiderItem
    
    
    class ChoutiSpider(scrapy.Spider):
        name = 'chouti'
        # allowed_domains = ['www.xxx.com']
        start_urls = ['https://dig.****.com/r/scoff/hot/1']
    
        # URL template for paginated pages
        url = 'https://dig.****.com/r/scoff/hot/%s'
        page_num = 1
    
        def parse(self, response):
            div_list = response.xpath('//div[@id="content-list"]/div')
            # print(div_list)
    
            print(self.page_num)
            for div in div_list:
                content = div.xpath('./div[@class="news-content"]/div[1]/a/text()').extract_first().strip()
                author = div.xpath('./div[@class="news-content"]/div[2]/a[4]/b/text()').extract_first()
                # print(content, author)
                item = ChoutispiderItem()
                item['author'] = author
                item['content'] = content
                # Submit the item to the pipeline
                yield item
            # Send requests manually to crawl the remaining pages
            if self.page_num < 120:
                self.page_num += 1
                new_url = self.url % self.page_num
                # Send the request for the next page
                yield scrapy.Request(url=new_url, callback=self.parse)
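
    The spider above yields ChoutispiderItem objects, so it assumes a matching item class. A minimal sketch of what choutiSpider/items.py might look like, inferred from the two fields the spider assigns (everything beyond the field names is an assumption):

    import scrapy


    class ChoutispiderItem(scrapy.Item):
        # Fields populated in ChoutiSpider.parse
        author = scrapy.Field()
        content = scrapy.Field()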

    Sending POST requests

    # In Scrapy, cookies are handled automatically by default; no manual handling is needed!
    
    class PostdemoSpider(scrapy.Spider):
        name = 'postdemo'
        # allowed_domains = ['www.xxx.com']  # left enabled, this would filter out the offsite fanyi domain
        start_urls = ['https://fanyi.****.com/sug']
        # Override the parent class's start_requests method to send the request ourselves
        def start_requests(self):
            for url in self.start_urls:
                data = {
                    'kw': 'cat'
                }
                yield scrapy.FormRequest(url=url, formdata=data, callback=self.parse)
    
        def parse(self, response):
            print(response.text)
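
    The sug endpoint replies with JSON, so in practice the callback would usually deserialize the body rather than print the raw text. A minimal sketch of such a parse method (add import json at the top of the module; assuming the response body is valid JSON):

        def parse(self, response):
            # Deserialize the JSON body returned by the endpoint
            result = json.loads(response.text)
            print(result)

    On Scrapy 2.2 and later, response.json() performs the same deserialization directly.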

    In Scrapy, cookies are handled automatically by default; there is no need to handle them manually.

    settings configuration:

    # Disable cookies (enabled by default)
    # COOKIES_ENABLED = False
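
    Besides this global switch, cookie merging can also be turned off for a single request through the dont_merge_cookies meta key; this is an aside, not something the original post covers:

    yield scrapy.Request(url=url, callback=self.parse, meta={'dont_merge_cookies': True})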

    Passing parameters between requests:

    • Passing the item to the second-level detail page
    import scrapy
    
    from boosPro.items import BoosproItem
    
    
    class BoosSpider(scrapy.Spider):
        name = 'boos'
        # allowed_domains = ['www.xxx.com']
        start_urls = ['https://www.****.com/c101010100/?query=python%E7%88%AC%E8%99%AB&page=1']
    
        url = 'https://www.****.com/c101010100/?query=python%E7%88%AC%E8%99%AB&page=%s'
        page_num = 1
    
        def parse(self, response):
            li_list = response.xpath('//div[@class="job-list"]/ul/li')
            for li in li_list:
                item = BoosproItem()
                title = li.xpath('.//div[@class="job-title"]/text()').extract_first()
            # salary
                salary = li.xpath('.//div[@class="info-primary"]/h3/a/span/text()').extract_first()
            # company
                company = li.xpath('.//div[@class="company-text"]/h3/a/text()').extract_first()
                detail_url = 'https://www.zhipin.com' + li.xpath('.//div[@class="info-primary"]/h3/a/@href').extract_first()
                item['title'] = title
                item['salary'] = salary
                item['company'] = company
    
            # Manually send a request for the detail page URL, passing the item along via meta
            yield scrapy.Request(url=detail_url, callback=self.parse_detail, meta={'item': item})
    
            if self.page_num <= 3:
                self.page_num += 1
                newUrl = self.url % self.page_num
                yield scrapy.Request(url=newUrl, callback=self.parse)
    
        # Parses the related data on the detail page
        def parse_detail(self, response):
            # Retrieve the item passed through meta
            item = response.meta['item']
            job_desc = response.xpath('//*[@id="main"]/div[3]/div/div[2]/div[2]/div[1]/div//text()').extract()
            company_content = response.xpath('//*[@id="main"]/div[3]/div/div[2]/div[2]/div[2]/div/text()').extract_first()
    
            job_desc = ' '.join(job_desc)
            item['job_desc'] = job_desc
            item['company_content'] = company_content
            # print(job_desc, 1111111)
    
            yield item
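
    As with the first spider, a matching item class is assumed. A minimal sketch of boosPro/items.py, inferred from the fields assigned in the spider (the sketch itself is an assumption, not the original file):

    import scrapy


    class BoosproItem(scrapy.Item):
        # Fields populated in BoosSpider.parse and parse_detail
        title = scrapy.Field()
        salary = scrapy.Field()
        company = scrapy.Field()
        job_desc = scrapy.Field()
        company_content = scrapy.Field()

    On Scrapy 1.7 and later, cb_kwargs is an alternative to meta for this hand-off: yield scrapy.Request(url=detail_url, callback=self.parse_detail, cb_kwargs={'item': item}) lets the callback receive the item as a named argument, def parse_detail(self, response, item).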