  • Python: Scraping Nationwide Historical Weather Data

    1. Scrape the history site's index page to get each city's link and the list of historical months, then build the month-by-month URLs;

    '''
    Fetch the names and links of all cities nationwide.
    '''
    
    import requests
    from lxml import etree
    import random
    import pymongo
    from time_list import get_time
    
    client = pymongo.MongoClient('localhost',27017)
    tianqi_data = client['tianqi_data']
    time_url_table = tianqi_data['time_url_table']
    
    headers_data = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393',
        'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0',
    ]
    headers = {
        'User-Agent':random.choice(headers_data)
    }
    
    def get_cityname(url):     # scrape the city names and their links, collecting them into a list
        city_name_list = []
        city_response = requests.get(url,headers = headers)
        city_response.encoding = city_response.apparent_encoding
        html = etree.HTML(city_response.text)
        city_names = html.xpath('//*[@id="tool_site"]/div[2]/ul/li/a/text()')
        city_links = html.xpath('//*[@id="tool_site"]/div[2]/ul/li/a/@href')
        for city_name,city_link in zip(city_names,city_links):
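            # entries with only a single character are likely the A-Z index letters on the page, so keep only real city names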
            if len(city_name) > 1:
                city_data = {
                    'city_name':str(city_name),
                    'city_link':str(city_link),
                }
                city_name_list.append(city_data)
        print('Finished fetching city names and links...')
        return city_name_list
    
    url = 'http://lishi.tianqi.com/'
    for link in get_cityname(url):      # build each city's monthly history URLs and save them to the database
        city_url = link['city_link']
        for time_link in get_time():
            time = time_link.split('/')[-1].split('.')[0]
            time_url = city_url.replace('index', str(time))
            data = {
                'time_url':time_url,
                'city':link['city_name'],
            }
            print(data)
            time_url_table.insert_one(data)
    print('Finished importing the month URLs into the database')
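
    As a quick sanity check after the import, the stored month URLs can be inspected directly with pymongo (a minimal sketch, assuming the same local MongoDB instance and collection names used above):

    import pymongo

    client = pymongo.MongoClient('localhost', 27017)
    time_url_table = client['tianqi_data']['time_url_table']

    # how many month URLs were stored, and what one record looks like
    print(time_url_table.count_documents({}))
    print(time_url_table.find_one())
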
    The helper module time_list imported above provides get_time(), which collects the month-page links:
    import requests
    from lxml import etree
    
    '''
    Comparing a city's link with its historical-month links shows that the month URL is simply the
    city URL with "index" replaced by the corresponding month, so it is enough to swap "index" for
    each historical month.
    '''
    
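    # Collect the month-page hrefs from one sample city's index page (the month list is assumed to be the same for every city).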
    def get_time():
        url = 'http://lishi.tianqi.com/acheng/index.html'
        response = requests.get(url)
        time_lists = etree.HTML(response.text).xpath('//*[@id="tool_site"]/div[2]/ul/li/a/@href')
        return time_lists
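
    As a concrete illustration of the pattern described in the comment above (the month value here is only an example):

    city_url = 'http://lishi.tianqi.com/acheng/index.html'
    month_url = city_url.replace('index', '201706')
    # month_url is now 'http://lishi.tianqi.com/acheng/201706.html'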

    2. Read the URLs back from the database and scrape each city's historical weather data;

    import requests
    from lxml import etree
    import random
    import pymongo
    
    client = pymongo.MongoClient('localhost',27017)
    tianqi_data = client['tianqi_data']
    time_url_table = tianqi_data['time_url_table']
    tianqi_data_table = tianqi_data['tianqi_data_table']
    
    headers_data = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393',
        'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0',
    ]
    headers = {
        'User-Agent':random.choice(headers_data)
    }
    
    def get_tianqi_data():
        for link in time_url_table.find():
            url = link['time_url']
            print(url)
            response = requests.get(url,headers=headers)
            html = etree.HTML(response.text)
            dates = html.xpath('//*[@id="tool_site"]/div[2]/ul/li/a/text()')
            max_temps = html.xpath('//*[@id="tool_site"]/div[2]/ul/li[2]/text()')[1:-1]
            low_temps = html.xpath('//*[@id="tool_site"]/div[2]/ul/li[3]/text()')[1:-1]
            weathers = html.xpath('//*[@id="tool_site"]/div[2]/ul/li[4]/text()')[1:-1]
            fengxiangs = html.xpath('//*[@id="tool_site"]/div[2]/ul/li[5]/text()')[1:-1]
            fenglis = html.xpath('//*[@id="tool_site"]/div[2]/ul/li[6]/text()')[1:-1]
            for date,max_temp,low_temp,weather,fengxiang,fengli in zip(dates,max_temps,low_temps,weathers,fengxiangs,fenglis):
                data = {
                    '日期': date,            # date
                    '最高温度': max_temp,    # max temperature
                    '最低温度': low_temp,    # min temperature
                    '天气': weather,         # weather
                    '风向': fengxiang,       # wind direction
                    '风力': fengli,          # wind force
                }
                tianqi_data_table.insert_one(data)
                print(data)
        print('Finished scraping the weather data')

    get_tianqi_data()
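
    Once the scrape has finished, the records can be pulled back out of tianqi_data_table for analysis. Below is a minimal sketch of exporting them to CSV with the standard library, assuming the same local MongoDB instance, the field names written by the scraper above, and an arbitrary output file name:

    import csv
    import pymongo

    client = pymongo.MongoClient('localhost', 27017)
    tianqi_data_table = client['tianqi_data']['tianqi_data_table']

    # field names match the keys written by the scraper above
    fields = ['日期', '最高温度', '最低温度', '天气', '风向', '风力']

    # utf-8-sig keeps the Chinese headers readable when the file is opened in Excel
    with open('tianqi_data.csv', 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.DictWriter(f, fieldnames=fields)
        writer.writeheader()
        for doc in tianqi_data_table.find({}, {'_id': 0}):
            writer.writerow(doc)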
  • Original post: https://www.cnblogs.com/114811yayi/p/6947473.html