  • Graduation Project - 1.06

    Overview:

      Found several problems, fixed and optimized the code, and re-crawled the weather data for 2011-2020.

    Code corrections:

    import requests
    from bs4 import BeautifulSoup
    from Weather import IO as ios
    
    
    class item:
        def __init__(self):
            self.date = list()  # date
            self.max_temp = list()  # daily maximum temperature
            self.min_temp = list()  # daily minimum temperature
            self.weather = list()  # weather description
            self.wind_direction = list()  # wind direction
            self.wind_force = list()  # wind force
    
    
    Data_Box = item()  # container that accumulates all scraped records
    num = 0  # number of records already written to the output file
    
    # Generate the monthly history-page URLs for a city over a range of years
    def get_url(city, min, max):
        '''
        city is the pinyin spelling of the city; min and max bound the years (max is exclusive)
        '''
        for i in range(min,max):
            for time in range(int(str(i)+"01"), int(str(i)+"13")):
                url = "http://lishi.tianqi.com/{}/{}.html".format(city, time)
                yield url+" "+city
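    # Example (with the hypothetical city slug "beijing"): get_url("beijing", 2018, 2019)
    # would yield twelve strings of the form
    #   "http://lishi.tianqi.com/beijing/201801.html beijing"
    #   ...
    #   "http://lishi.tianqi.com/beijing/201812.html beijing"
    # i.e. one monthly page per iteration, with the city slug appended after a space.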
    
    
    # Fetch the weather data for every city listed in CityEn_Deal.txt
    def get_datas(min, max):
        global num
        # each line of CityEn_Deal.txt is assumed to hold space-separated fields,
        # where field [1] is the city name and field [2] its pinyin slug
        for line in open("CityEn_Deal.txt", encoding='utf-8'):
            print(str(line.split(" ")[1]).strip("\n"))
            urls = get_url(str(line.split(" ")[2]).strip("\n"), min, max)
            cookie = {
                "cityPy": "UM_distinctid=171f2280ef23fb-02a4939f3c1bd4-335e4e71-144000-171f2280ef3dab; Hm_lvt_ab6a683aa97a52202eab5b3a9042a8d2=1588905651; CNZZDATA1275796416=871124600-1588903268-%7C1588990372; Hm_lpvt_ab6a683aa97a52202eab5b3a9042a8d2=1588994046"}
            header = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3756.400 QQBrowser/10.5.4039.400"}
            for url in urls:
                try:
                    html = requests.get(url=url.split(" ")[0], headers=header, cookies=cookie)
                    soup = BeautifulSoup(html.text, 'html.parser')
                    ul = soup.find_all("ul", class_='thrui')[0]
                    # print(ul)
                    lis = ul.find_all("li")[:]
                    for li in lis:
                        # the last li tag is not weather data
                        div = li.find_all("div")
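                        # expected per-day layout of the row: div[0] = date, div[1] = max temp,
                        # div[2] = min temp, div[3] = weather, div[4] = "wind direction wind force"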
                        Data_Box.date.append(div[0].text.split("-")[0]+div[0].text.split("-")[1]+div[0].text.split("-")[2])
                        Data_Box.max_temp.append(div[1].text)
                        Data_Box.min_temp.append(div[2].text)
                        Data_Box.weather.append(div[3].text)
                        Data_Box.wind_direction.append(div[4].text.split(" ")[0])
                        Data_Box.wind_force.append(div[4].text.split(" ")[1])
                except Exception:
                    print("Failed to scrape this page!")

                else:
                    print("Scraped this page successfully!")
                try:
                    for i in range(num, len(Data_Box.date)):
                        print(str(line.split(" ")[0]), str(line.split(" ")[1]), url.split(" ")[1], Data_Box.date[i], Data_Box.min_temp[i], Data_Box.max_temp[i], Data_Box.weather[i],
                              Data_Box.wind_direction[i], Data_Box.wind_force[i])
                        # strip the "℃" unit from the temperature strings before writing
                        ios.cw("weather"+str(min)+".txt", str(line.split(" ")[0])+" "+str(line.split(" ")[1])+" "+Data_Box.date[i]+" "+Data_Box.min_temp[i].split("℃")[0]+" "+Data_Box.max_temp[i].split("℃")[0]+" "+Data_Box.weather[i]+" "+
                              Data_Box.wind_direction[i]+" "+Data_Box.wind_force[i]+"\n")
                    num = len(Data_Box.date)
                except Exception:
                    print("Failed to write the records!")
                    ios.cw("Fault.txt", str(line.split(" ")[1] + " write failed"))
        return "Data fetch complete"
    
    
    # Entry point of the scraping script
    if __name__ == "__main__":

        # crawl one year at a time, 2011 through 2020; each year goes to weather<year>.txt
        for year in range(2011, 2021):
            get_datas(year, year + 1)
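
    The Weather.IO module imported at the top (as ios) is not shown in this post; from the calls above, its cw function is only assumed here to append a string to a text file. A minimal sketch under that assumption, placed in Weather/IO.py next to an empty __init__.py, might look like:

    # Weather/IO.py -- hypothetical sketch of the cw helper used above
    def cw(filename, text):
        """Append text to filename, creating the file on first use."""
        with open(filename, "a", encoding="utf-8") as f:
            f.write(text)

    Opening the file in append mode keeps earlier records intact, so the ten yearly runs of get_datas can all write into their own weather<year>.txt without overwriting previous pages.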

    Crawl results:
