  • 19 03 12 huanqiu.com: three-level page navigation plus pagination, scraping and printing the data (no-database version)

    This version skips the database for now; scraped items are simply printed.

    The spider:

    # -*- coding: utf-8 -*-
    import scrapy
    # from yang_guan.items import YangGuanItem
    from copy import deepcopy


    class YgSpider(scrapy.Spider):
        name = 'yg'
        # allowed_domains = ['huanqiu.com']
        start_urls = ['http://www.huanqiu.com/']

        def parse(self, response):  # top-level page; Scrapy dispatches the start_urls responses to parse() by default
            item = {}

            class_news_urls_li = response.xpath(".//div[@class='navCon']/ul/li/a")
            print(class_news_urls_li)
            for class_news_url in class_news_urls_li:
                item["class_title"] = class_news_url.xpath("./text()").extract_first()
                print(item)
                new_url = class_news_url.xpath("./@href").extract_first()
                print(new_url)
                yield scrapy.Request(
                    new_url,
                    callback=self.second_class,
                    meta={"item": deepcopy(item)},  # requests run concurrently, so pass a deep copy of item
                )

        def second_class(self, response):  # second-level page
            item = response.meta["item"]
            print(response.url)

            second_urls = response.xpath(".//div/h2/em")

            for second_url in second_urls:
                second_news_url = second_url.xpath("./a/@href").extract_first()

                yield scrapy.Request(
                    second_news_url,
                    callback=self.parse_detail_analyze,
                    meta={"item": deepcopy(item)}
                )

        def parse_detail_analyze(self, response):  # third level: scrape the detail list, e.g. http://china.huanqiu.com/leaders/
            item = response.meta["item"]

            li_list = response.xpath("//ul[@class='listPicBox']/li")

            for li in li_list:
                # item = YangGuanItem()
                item["title"] = li.xpath("./h3/a/text()").extract_first()
                item["img_url"] = li.xpath("./a/img/@src").extract_first()
                item["detail"] = li.xpath("./h5/text()").extract_first()
                yield item

            # pagination: the last <a> in the page box is the "next page" link
            next_url = response.xpath(".//div[@class='pageBox']/div/a[last()]/@href").extract_first()
            if next_url:  # on the last page there is no next link, so stop instead of requesting None
                yield scrapy.Request(next_url, callback=self.parse_detail_analyze, meta={"item": response.meta["item"]})
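    Why the deepcopy into meta matters: Scrapy schedules requests asynchronously, and the loop keeps mutating the same item dict, so without a copy every callback would see only the values from the last iteration. A minimal sketch of the difference:

    # demo: appending the same dict vs. a deep copy of it per iteration
    from copy import deepcopy

    item = {}
    shared, copied = [], []
    for title in ["news", "tech"]:
        item["class_title"] = title
        shared.append(item)            # every entry points to the same dict
        copied.append(deepcopy(item))  # each entry is an independent snapshot

    print(shared)  # [{'class_title': 'tech'}, {'class_title': 'tech'}]
    print(copied)  # [{'class_title': 'news'}, {'class_title': 'tech'}]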
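    Running scrapy crawl yg from the project root is enough to start the spider; it can also be launched from a script. A minimal sketch, assuming a standard Scrapy project layout with the spider module at yang_guan/spiders/yg.py (the package and module names are assumptions):

    # run.py -- launch the spider programmatically (sketch; import path assumed)
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    from yang_guan.spiders.yg import YgSpider

    process = CrawlerProcess(get_project_settings())  # loads settings.py
    process.crawl(YgSpider)
    process.start()  # blocks until the crawl finishes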

    About the settings:

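    A minimal settings.py sketch for this print-only spider; every value below is an illustrative assumption rather than the original configuration:

    # settings.py -- minimal sketch; values are assumptions, not the original config
    BOT_NAME = 'yang_guan'

    SPIDER_MODULES = ['yang_guan.spiders']
    NEWSPIDER_MODULE = 'yang_guan.spiders'

    # the site's robots.txt may disallow crawling; disabled here only for the demo
    ROBOTSTXT_OBEY = False

    # identify as a regular browser (assumed UA string)
    USER_AGENT = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0 Safari/537.36')

    # throttle requests to be polite to the site
    DOWNLOAD_DELAY = 1

    # no ITEM_PIPELINES entry: yielded items are only printed, not stored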
  • Original post: https://www.cnblogs.com/fromlantianwei/p/10514627.html