import json
import os
import re

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm


class CoronaVirusSpider(object):
    """Scrape COVID-19 statistics from the dxy.cn epidemic dashboard.

    Daily snapshots (per-country and per-Chinese-province) are parsed out of
    embedded <script> tags on the home page; historical series are fetched
    from the per-region ``statisticsData`` URLs. All results are written as
    JSON files under ``data/``.
    """

    def __init__(self):
        self.home_url = 'http://ncov.dxy.cn/ncovh5/view/pneumonia'

    def get_content_from_url(self, url):
        """Fetch *url* and return the response body decoded as text.

        :param url: URL to request.
        :return: response content as a ``str`` (UTF-8 decoded).
        """
        response = requests.get(url)
        return response.content.decode()

    def parse_home_page(self, home_page, tag_id):
        """Extract the JSON payload embedded in a <script> tag of the home page.

        :param home_page: HTML of the epidemic home page.
        :param tag_id: ``id`` attribute of the <script> tag holding the data.
        :return: the decoded Python object (a list of region dicts).
        """
        # Locate the <script id=tag_id> element carrying the data.
        soup = BeautifulSoup(home_page, 'html5lib')
        script = soup.find(id=tag_id)
        text = script.text
        # The script body wraps a JSON array; grab everything between the
        # first '[' and the last ']'.
        # BUGFIX: the original pattern r'[.+]' is a character class matching
        # a single '.' or '+' — it could never capture the array.
        json_str = re.findall(r'\[.+\]', text)[0]
        # Decode the JSON string into Python data.
        data = json.loads(json_str)
        return data

    def load(self, path):
        """Load and return JSON data previously saved at *path*."""
        with open(path, encoding="utf8") as fp:
            data = json.load(fp)
        return data

    def save(self, data, path):
        """Serialize *data* as JSON to *path*, creating parent dirs if needed.

        ``ensure_ascii=False`` keeps Chinese region names human-readable.
        """
        # Robustness: 'data/' may not exist on a fresh checkout.
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)
        with open(path, 'w', encoding="utf8") as fp:
            json.dump(data, fp, ensure_ascii=False)

    def crawl_last_day_corona_virus(self):
        """Collect the most recent day's per-country statistics."""
        # 1. Fetch the home page.
        home_page = self.get_content_from_url(self.home_url)
        # 2. Parse out the latest per-country data.
        last_day_corona_virus = self.parse_home_page(
            home_page, tag_id='getListByCountryTypeService2true')
        # 3. Persist it.
        self.save(last_day_corona_virus, 'data/last_day_corona_virus.json')

    def crawl_corona_virus(self):
        """Collect per-country historical data since Jan 23.

        Requires ``data/last_day_corona_virus.json`` to exist — run
        :meth:`crawl_last_day_corona_virus` first.
        """
        # 1. Load the per-country snapshot to obtain the statistics URLs.
        last_day_corona_virus = self.load('data/last_day_corona_virus.json')
        corona_virus = self.parse_corona_virus(last_day_corona_virus, "采集数据")
        # 2. Persist the combined list.
        self.save(corona_virus, 'data/corona_virus.json')

    def crawl_last_day_corona_virus_of_china(self):
        """Collect the most recent day's per-province (China) statistics."""
        # 1. Fetch the home page.
        home_page = self.get_content_from_url(self.home_url)
        # 2. Parse out the latest per-province data.
        last_day_corona_virus_of_china = self.parse_home_page(
            home_page, tag_id='getAreaStat')
        # 3. Persist it.
        self.save(last_day_corona_virus_of_china,
                  'data/last_day_corona_virus_of_china.json')

    def crawl_corona_virus_of_china(self):
        """Collect per-province (China) historical data since Jan 23.

        Requires ``data/last_day_corona_virus_of_china.json`` to exist — run
        :meth:`crawl_last_day_corona_virus_of_china` first.
        """
        # 1. Load the per-province snapshot to obtain the statistics URLs.
        last_day_corona_virus_of_china = self.load(
            'data/last_day_corona_virus_of_china.json')
        corona_virus = self.parse_corona_virus(
            last_day_corona_virus_of_china, "采集全国各省数据")
        self.save(corona_virus, 'data/corona_virus_of_china.json')

    def parse_corona_virus(self, last_day_corona_virus_of_china, desc):
        """Fetch and flatten the historical series for every region.

        :param last_day_corona_virus_of_china: region dicts, each carrying a
            ``statisticsData`` URL (countries also carry ``countryShortCode``).
        :param desc: progress-bar label for tqdm.
        :return: flat list of daily records, each tagged with its region name.
        """
        # Accumulates every region's daily records since Jan 22.
        corona_virus = []
        for country in tqdm(last_day_corona_virus_of_china, desc):
            # Fetch the region's full history as JSON.
            statistics_data_url = country['statisticsData']
            statistics_data_json_str = self.get_content_from_url(
                statistics_data_url)
            statistics_data = json.loads(statistics_data_json_str)['data']
            # Tag each daily record with its region so the flat list stays
            # attributable after merging.
            for one_day in statistics_data:
                one_day['provinceName'] = country['provinceName']
                if country.get('countryShortCode'):
                    one_day['countryShortCode'] = country['countryShortCode']
            corona_virus.extend(statistics_data)
        return corona_virus

    def run(self):
        """Run the full crawl pipeline in dependency order.

        The snapshot crawls must precede the historical crawls, because the
        latter read the JSON files the former write.
        """
        self.crawl_last_day_corona_virus()
        self.crawl_corona_virus()
        self.crawl_last_day_corona_virus_of_china()
        self.crawl_corona_virus_of_china()


if __name__ == '__main__':
    spider = CoronaVirusSpider()
    # BUGFIX: calling crawl_corona_virus() directly failed with
    # FileNotFoundError on a fresh run, since it loads the snapshot file that
    # crawl_last_day_corona_virus() writes. run() executes steps in order.
    spider.run()