  • Scraping historical air quality data

    Scrape the air quality reports from https://www.aqistudy.cn/historydata and write the scraped data out as a CSV file.

    scrapy startproject air_quality    (create the Scrapy project)

    scrapy genspider api_history_spider aqistudy.cn    (generate the spider skeleton)
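
    Depending on the Scrapy version, the generated skeleton looks roughly like the sketch below (which is also why the class name ends up as the doubled ApiHistorySpiderSpider); the start URL is then edited by hand to point at the history page:

        # air_quality/spiders/api_history_spider.py (approximate genspider output)
        import scrapy


        class ApiHistorySpiderSpider(scrapy.Spider):
            name = 'api_history_spider'
            allowed_domains = ['aqistudy.cn']
            start_urls = ['http://aqistudy.cn/']

            def parse(self, response):
                pass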

    The project's file layout (shown as a screenshot in the original post) is the standard Scrapy template.
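
    Reconstructed from the standard scrapy startproject output (details vary slightly by Scrapy version):

        air_quality/
        ├── scrapy.cfg
        └── air_quality/
            ├── __init__.py
            ├── items.py
            ├── middlewares.py
            ├── pipelines.py
            ├── settings.py
            └── spiders/
                ├── __init__.py
                └── api_history_spider.py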

    settings.py

    ITEM_PIPELINES = {
        'air_quality.pipelines.AirQualityPipeline': 300,
    }
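
    Beyond registering the pipeline, a few extra settings are often useful for a crawl like this. These are assumptions on my part, not part of the original post:

        # Optional additions to settings.py (assumptions, not from the original post)
        DOWNLOAD_DELAY = 1        # throttle to roughly one request per second
        ROBOTSTXT_OBEY = False    # only if the history pages are disallowed; check the site's policy first
        USER_AGENT = 'Mozilla/5.0 (compatible; air-quality-research)'  # hypothetical UA string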

    items.py

    import scrapy


    class AirQualityItem(scrapy.Item):
        # define the fields for your item here like:
        # name = scrapy.Field()
        city_name = scrapy.Field()      # city name
        record_date = scrapy.Field()    # monitoring date
        aqi_val = scrapy.Field()        # AQI
        range_val = scrapy.Field()      # min-max range
        quality_level = scrapy.Field()  # quality level
        pm2_5_val = scrapy.Field()      # PM2.5
        pm10_val = scrapy.Field()       # PM10
        so2_val = scrapy.Field()        # SO2
        co_val = scrapy.Field()         # CO
        no2_val = scrapy.Field()        # NO2
        o3_val = scrapy.Field()         # O3
        rank = scrapy.Field()           # rank
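
    scrapy.Field() is only a placeholder; the item behaves like a dict whose keys are restricted to the declared fields. A quick sanity check, with hypothetical values:

        from air_quality.items import AirQualityItem

        item = AirQualityItem(city_name='北京', aqi_val='57')  # hypothetical values
        print(dict(item))      # {'city_name': '北京', 'aqi_val': '57'}
        # item['unknown'] = 1  # would raise KeyError: undeclared field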

    pipelines.py

    from scrapy.exporters import CsvItemExporter


    class AirQualityPipeline(object):

        def open_spider(self, spider):
            # open the output file and start the CSV exporter when the spider starts
            self.file = open('air_quality.csv', 'wb')
            self.exporter = CsvItemExporter(self.file)
            self.exporter.start_exporting()

        def close_spider(self, spider):
            # flush the exporter and close the file when the spider finishes
            self.exporter.finish_exporting()
            self.file.close()

        def process_item(self, item, spider):
            # write each scraped item as one CSV row
            self.exporter.export_item(item)
            return item
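
    One quirk of CsvItemExporter: the column order follows the fields of the first item it sees. To pin the columns down, fields_to_export can be passed in; a sketch using the field names from items.py above:

        from scrapy.exporters import CsvItemExporter

        # explicit column order for the exported CSV (field names from items.py)
        FIELDS = ['city_name', 'record_date', 'aqi_val', 'range_val',
                  'quality_level', 'pm2_5_val', 'pm10_val', 'so2_val',
                  'co_val', 'no2_val', 'o3_val', 'rank']


        class AirQualityPipeline(object):

            def open_spider(self, spider):
                self.file = open('air_quality.csv', 'wb')
                # pin the CSV column order instead of relying on the first item seen
                self.exporter = CsvItemExporter(self.file, fields_to_export=FIELDS)
                self.exporter.start_exporting()

            # close_spider and process_item unchanged from above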

    api_history_spider.py

    # -*- coding: utf-8 -*-
    import scrapy
    from urllib import parse
    from air_quality.items import AirQualityItem

    base_url = 'https://www.aqistudy.cn/historydata/'


    class ApiHistorySpiderSpider(scrapy.Spider):
        name = 'api_history_spider'
        allowed_domains = ["aqistudy.cn"]
        start_urls = ['https://www.aqistudy.cn/historydata/']

        def parse(self, response):
            """
            Parse the start page: collect the URL of every city.
            """
            city_url_list = response.xpath('//div[@class="all"]//div[@class="bottom"]//a//@href')

            for city_url in city_url_list:
                # visit each city's page of monthly links
                city_month_url = base_url + city_url.extract()
                request = scrapy.Request(city_month_url, callback=self.parse_city_month)
                yield request

        def parse_city_month(self, response):
            """
            Parse one city's page: collect the per-month URLs.
            """
            month_url_list = response.xpath('//table[@class="table table-condensed '
                                            'table-bordered table-striped table-hover '
                                            'table-responsive"]//a//@href')

            for month_url in month_url_list:
                # visit each month's page of daily records
                city_day_url = base_url + month_url.extract()
                request = scrapy.Request(city_day_url, callback=self.parse_city_day)
                yield request

        def parse_city_day(self, response):
            """
            Parse one month's page: yield one item per daily record.
            """
            url = response.url
            # the city name is the URL-encoded value of the first query parameter
            city_url_name = url[url.find('=') + 1:url.find('&')]
            city_name = parse.unquote(city_url_name)

            day_record_list = response.xpath('//table[@class="table table-condensed '
                                             'table-bordered table-striped table-hover '
                                             'table-responsive"]//tr')
            for i, day_record in enumerate(day_record_list):
                if i == 0:
                    # skip the header row
                    continue
                td_list = day_record.xpath('.//td')

                # build a fresh item per row so each yielded item is independent
                item = AirQualityItem()
                item['city_name'] = city_name
                item['record_date'] = td_list[0].xpath('text()').extract_first()  # monitoring date
                item['aqi_val'] = td_list[1].xpath('text()').extract_first()  # AQI
                item['range_val'] = td_list[2].xpath('text()').extract_first()  # range
                item['quality_level'] = td_list[3].xpath('.//div/text()').extract_first()  # quality level
                item['pm2_5_val'] = td_list[4].xpath('text()').extract_first()  # PM2.5
                item['pm10_val'] = td_list[5].xpath('text()').extract_first()  # PM10
                item['so2_val'] = td_list[6].xpath('text()').extract_first()  # SO2
                item['co_val'] = td_list[7].xpath('text()').extract_first()  # CO
                item['no2_val'] = td_list[8].xpath('text()').extract_first()  # NO2
                item['o3_val'] = td_list[9].xpath('text()').extract_first()  # O3
                item['rank'] = td_list[10].xpath('text()').extract_first()  # rank

                yield item
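
    If the XPaths stop matching (the site's markup may change over time), scrapy shell is a quick way to re-derive them:

        # run in a terminal:
        #   scrapy shell 'https://www.aqistudy.cn/historydata/'
        # then, at the shell prompt, test a selector interactively:
        response.xpath('//div[@class="all"]//div[@class="bottom"]//a//@href').extract()[:5]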

    Run the spider

    scrapy crawl api_history_spider
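
    Alternatively, Scrapy's built-in feed export can produce the same CSV without the custom pipeline (leave ITEM_PIPELINES empty so the two mechanisms don't both write air_quality.csv):

        scrapy crawl api_history_spider -o air_quality.csv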

    This produces the CSV file (the original post showed a screenshot of some of the resulting rows here).

  • Original post: https://www.cnblogs.com/chengchengaqin/p/9700554.html