1 """中国天气网爬虫""" 2 3 import requests 4 from bs4 import BeautifulSoup 5 from pyecharts import Bar 6 7 8 HEADERS = { 9 'User-Agent': 'Mozilla/5.0' 10 } 11 12 ALL_DATA = [] # 所有爬取的数据 13 14 def parse_detail_page(url, is_html5lib): 15 """爬取具体页面具体数据""" 16 17 respose = requests.get(url, headers=HEADERS) 18 text = respose.content.decode('utf-8') 19 # with open('weather.html', 'w', encoding='utf-8') as fp: 20 # fp.write(text) 21 if is_html5lib == False: 22 soup = BeautifulSoup(text, 'lxml') 23 else: 24 soup = BeautifulSoup(text, 'html5lib') 25 # 以下为具体爬取数据方法 26 conMidtab = soup.find_all('div', attrs={'class':'conMidtab'}) 27 tables = conMidtab[0].find_all('table') 28 for table in tables: 29 trs = table.find_all('tr')[2:] 30 for index,tr in enumerate(trs): 31 tds = tr.find_all('td') 32 city_td = tds[0] 33 if index == 0: 34 city_td = tds[1] 35 city = list(city_td.stripped_strings)[0] 36 temp_td = tds[-2] 37 min_temp = list(temp_td.stripped_strings)[0] 38 # 存储爬取数据,把最低温度转成整型,排序需要 39 ALL_DATA.append({'city': city, 'min_temp': int(min_temp)}) 40 41 def data_visualization(): 42 """取出前10最低温度并用柱状图显示""" 43 44 # ALL_DATA.sort(key=lambda data:data['min_temp'], reverse=True) 45 ALL_DATA.sort(key=lambda data: data['min_temp']) 46 #print(ALL_DATA) 47 #print(len(ALL_DATA)) 48 # 获取前10 49 data = ALL_DATA[0:10] 50 print(data) 51 cities = list(map(lambda x:x['city'], data)) 52 temps = list(map(lambda x:x['min_temp'], data)) 53 chart = Bar("中国天气最低气温排行榜") 54 chart.add('', cities, temps) 55 chart.render('weather.html') 56 57 58 def get_detail_urls(url, base_url): 59 """得到华北、东北、华东、华中、华南、西北、西南、港澳台的具体页面链接""" 60 61 urllists = [] # 具体的页面信息列表 62 respose = requests.get(url, headers=HEADERS) 63 text = respose.content.decode('utf-8') 64 soup = BeautifulSoup(text, 'lxml') 65 # 数据爬取 66 uls = soup.find_all('ul', class_='lq_contentboxTab2') 67 alists = uls[0].find_all('a') 68 for list in alists: 69 newurl = base_url + list['href'] 70 urllists.append(newurl) 71 72 return urllists 73 74 def spider(): 75 """""" 76 77 # 初始爬取页面 78 src_url = "http://www.weather.com.cn/textFC/hb.shtml" 79 base_url = "http://www.weather.com.cn" 80 urllists = [] 81 urllists = get_detail_urls(src_url, base_url) 82 #print(urllists) 83 is_html5lib = False # 爬取页面是否用html5lib库 84 for index,urllist in enumerate(urllists): 85 if index != len(urllists)-1: 86 parse_detail_page(urllist, is_html5lib) 87 else: 88 is_html5lib = True 89 # url = "http://www.weather.com.cn/textFC/gat.shtml"这个页面需要用html5lib库解析,不然数据有错 90 parse_detail_page(urllist, is_html5lib) 91 92 # 排序并可视化数据 93 data_visualization() 94 95 if __name__ == '__main__': 96 spider()
Note: pyecharts must be installed; the script uses the 0.x API.
Recommended: pip install pyecharts==0.1.9.5
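If you are on pyecharts 1.x instead, the 0.x Bar("title") constructor no longer exists. A minimal sketch of the equivalent chart under the chained 1.x API (the function name render_chart is just illustrative):

from pyecharts import options as opts
from pyecharts.charts import Bar

def render_chart(cities, temps):
    # build the same lowest-temperature bar chart with the 1.x chained API
    bar = (
        Bar()
        .add_xaxis(cities)
        .add_yaxis('', temps)
        .set_global_opts(title_opts=opts.TitleOpts(title='中国天气最低气温排行榜'))
    )
    bar.render('weather.html')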