情况概述:
发现问题,对代码进行优化改错,并且重新爬取2011-2020年天气数据。
代码更正:
import requests
from bs4 import BeautifulSoup
from Weather import IO as ios


class item:
    """Column-oriented store for scraped daily weather records.

    Index ``i`` across all six lists describes the same record, so the
    lists must always be extended together.
    """

    def __init__(self):
        self.date = []  # date
        self.max_temp = []  # highest temperature
        self.min_temp = []  # lowest temperature
        self.weather = []  # weather description
        self.wind_direction = []  # wind direction
        self.wind_force = []  # wind force


Data_Box = item()  # shared accumulator for every record scraped so far
num = 0  # index of the first record in Data_Box not yet written out

# Seconds to wait on one HTTP request; without a timeout a single stalled
# connection would block the whole crawl forever.
_REQUEST_TIMEOUT = 30

_COOKIES = {
    "cityPy": "UM_distinctid=171f2280ef23fb-02a4939f3c1bd4-335e4e71-144000-171f2280ef3dab; Hm_lvt_ab6a683aa97a52202eab5b3a9042a8d2=1588905651; CNZZDATA1275796416=871124600-1588903268-%7C1588990372; Hm_lpvt_ab6a683aa97a52202eab5b3a9042a8d2=1588994046"}
_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3756.400 QQBrowser/10.5.4039.400"}


def get_url(city, min, max):
    """Yield one ``"<url> <city>"`` string per month of each year.

    Args:
        city: City identifier inserted into the URL path.
        min: First year (inclusive).
        max: Last year (exclusive).

    Yields:
        The monthly history-page URL followed by a space and ``city``,
        for months 01..12 of every year in ``range(min, max)``.
    """
    for year in range(min, max):
        # year + "01" .. year + "12", e.g. 201101 .. 201112
        for period in range(int(str(year) + "01"), int(str(year) + "13")):
            url = "http://lishi.tianqi.com/{}/{}.html".format(city, period)
            yield url + " " + city


def _parse_row(li):
    """Return (date, max_temp, min_temp, weather, wind_dir, wind_force).

    Raises IndexError when the ``<li>`` lacks the expected ``<div>``
    cells or when a cell's text lacks the expected separators.
    """
    div = li.find_all("div")
    date_parts = div[0].text.split("-")
    wind = div[4].text.split(" ")
    return (
        date_parts[0] + date_parts[1] + date_parts[2],
        div[1].text,
        div[2].text,
        div[3].text,
        wind[0],
        wind[1],
    )


def _store_row(row):
    """Append one fully parsed row to every column of Data_Box."""
    date, max_temp, min_temp, weather, wind_direction, wind_force = row
    Data_Box.date.append(date)
    Data_Box.max_temp.append(max_temp)
    Data_Box.min_temp.append(min_temp)
    Data_Box.weather.append(weather)
    Data_Box.wind_direction.append(wind_direction)
    Data_Box.wind_force.append(wind_force)


# Fetch the weather data
def get_datas(min, max):
    """Scrape and write out weather records for every city and month.

    Reads ``CityEn_Deal.txt`` line by line; each line is split on single
    spaces and fields 0, 1 and 2 are used (field 2 is passed to
    ``get_url`` as the city). For every monthly page the rows are added
    to ``Data_Box`` and the not-yet-written records are passed to
    ``ios.cw`` under the file name ``"weather" + str(min) + ".txt"``.

    Args:
        min: First year (inclusive).
        max: Last year (exclusive).

    Returns:
        The fixed status string "数据获取完毕".
    """
    global num
    with open("CityEn_Deal.txt", encoding='utf-8') as city_file:
        for line in city_file:
            if not line.strip():
                # A blank line has no fields and would raise IndexError below.
                continue
            fields = line.split(" ")
            print(str(fields[1]).strip(" "))
            urls = get_url(str(fields[2]).strip(" "), min, max)
            for url in urls:
                page_url = url.split(" ")[0]
                city = url.split(" ")[1]
                try:
                    html = requests.get(url=page_url, headers=_HEADERS,
                                        cookies=_COOKIES,
                                        timeout=_REQUEST_TIMEOUT)
                    soup = BeautifulSoup(html.text, 'html.parser')
                    ul = soup.find_all("ul", class_='thrui')[0]
                    # Every <li> is treated as a record. A row is parsed
                    # completely before being stored, so a malformed row
                    # can never leave the Data_Box columns with different
                    # lengths; rows stored before the failure are kept.
                    for li in ul.find_all("li"):
                        _store_row(_parse_row(li))
                except Exception:
                    # Best effort per page, but Ctrl-C still propagates.
                    print("该页面爬取失败!")
                else:
                    print("该页面爬取成功!")
                try:
                    for i in range(num, len(Data_Box.date)):
                        print(str(fields[0]), str(fields[1]), city,
                              Data_Box.date[i], Data_Box.min_temp[i],
                              Data_Box.max_temp[i], Data_Box.weather[i],
                              Data_Box.wind_direction[i],
                              Data_Box.wind_force[i])
                        # NOTE(review): there is no explicit separator
                        # between the date and the minimum temperature;
                        # kept as in the original - confirm whether the
                        # date text already ends with a space.
                        ios.cw("weather" + str(min) + ".txt",
                               str(fields[0]) + " " + str(fields[1]) + " "
                               + Data_Box.date[i]
                               + Data_Box.min_temp[i].split("℃")[0] + " "
                               + Data_Box.max_temp[i].split("℃")[0] + " "
                               + Data_Box.weather[i] + " "
                               + Data_Box.wind_direction[i].split("风")[0] + " "
                               + Data_Box.wind_force[i].split("级")[0] + " ")
                    # Only advance the cursor once the whole batch was written.
                    num = len(Data_Box.date)
                except Exception:
                    print("写入失败!")
                    ios.cw("Fault.txt", str(fields[1] + "写入失败"))
    return "数据获取完毕"


# Entry point of the crawler
if __name__ == "__main__":
    # One call per year: the output file name is derived from the start year.
    for year in range(2011, 2021):
        get_datas(year, year + 1)
爬取结果: