For this software engineering PSP class assignment, scraping real-time epidemic statistics from a website, I was completely lost when I first heard it, since it was a field I had never touched before. So I started going through tutorials and learned that Python is a good fit for web scraping. I downloaded Python and PyCharm, created a project, set up pymysql in the settings, installed the plugins, and read through the code on a blog while following a tutorial. Then, following a video walkthrough, I wrote the code that scrapes the data into a database.
From the tutorials I picked up the general workflow of a crawler (a minimal sketch of these steps follows the list):
1. Analyze the target page; determine the URL path to crawl and the headers parameters.
2. Send the request with requests, imitating a browser, and receive the response data.
3. Parse the data with the json module, converting the JSON string into Python data types you can work with.
4. Save the data to the target location.
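A minimal sketch of those four steps, assuming a placeholder URL (the real target page appears in the full code further down):

import requests
import json

# Step 1: the target URL and a user-agent header that imitates a normal browser
url = 'https://example.com/api/data'  # placeholder; replace with the real page
headers = {'user-agent': 'Mozilla/5.0'}

# Step 2: send the request and decode the raw response body
response = requests.get(url, headers=headers)
text = response.content.decode('utf-8')

# Step 3: parse the JSON string into Python lists and dicts
data = json.loads(text)

# Step 4: save the result, here simply as a local file
with open('data.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False)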
In this assignment, though, the data is saved into a database table. Reference: https://blog.csdn.net/IT_XF/article/details/82184585
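The referenced post never shows the schema of the info3 table that the script below writes to. Since the INSERT supplies nine values per row, a compatible definition might look like the sketch below; the column names are my own guesses, only the nine-column shape is implied by the code:

import pymysql

# assumed schema for info3; column names are illustrative,
# only the nine-column shape is dictated by the INSERT statement
ddl = """
CREATE TABLE IF NOT EXISTS info3 (
    id INT PRIMARY KEY,
    update_time DATETIME,
    province VARCHAR(50),
    city VARCHAR(50),
    confirmed INT,
    suspected INT,
    cured INT,
    dead INT,
    location_id INT
)
"""

db = pymysql.connect(host='localhost', user='root', password='123456',
                     database='yiqing1', charset='utf8')
with db.cursor() as cursor:
    cursor.execute(ddl)
db.commit()
db.close()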
import json
import time

import pymysql
import requests
from bs4 import BeautifulSoup

url = 'https://ncov.dxy.cn/ncovh5/view/pneumonia?from=timeline&isappinstalled=0'  # target page
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}  # pretend to be a browser

response = requests.get(url, headers=headers)  # send the request
content = response.content.decode('utf-8')
soup = BeautifulSoup(content, 'html.parser')

# the page embeds the statistics as JSON inside <script> tags:
# getAreaStat holds the domestic province/city numbers,
# getListByCountryTypeService2 holds the per-country world numbers (unused here)
listA = soup.find_all(name='script', attrs={"id": "getAreaStat"})
listB = soup.find_all(name='script', attrs={"id": "getListByCountryTypeService2"})

account = str(listA)
messages = account[52:-21]  # strip the script-tag wrapper, leaving the bare JSON array
messages_json = json.loads(messages)

valuesList = []  # one row per province
cityList = []    # one row per city
con = len(messages_json)
k = 0
for i in range(len(messages_json)):
    k = k + 1
    now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    value = (k, now,
             messages_json[i].get('provinceShortName'), None,
             messages_json[i].get('confirmedCount'),
             messages_json[i].get('suspectedCount'),
             messages_json[i].get('curedCount'),
             messages_json[i].get('deadCount'),
             messages_json[i].get('locationId'))
    valuesList.append(value)
    cityValue = messages_json[i].get('cities')
    for j in range(len(cityValue)):
        con = con + 1
        cityValueList = (con, now,
                         messages_json[i].get('provinceShortName'),
                         cityValue[j].get('cityName'),
                         cityValue[j].get('confirmedCount'),
                         cityValue[j].get('suspectedCount'),
                         cityValue[j].get('curedCount'),
                         cityValue[j].get('deadCount'),
                         cityValue[j].get('locationId'))
        cityList.append(cityValueList)

db = pymysql.connect(host='localhost', user='root', password='123456',
                     database='yiqing1', charset='utf8')
cursor = db.cursor()

sql_clean_province = "TRUNCATE TABLE info3"  # wipe the previous snapshot first
sql = "INSERT INTO info3 values (%s,%s,%s,%s,%s,%s,%s,%s,%s)"
value_tuple = tuple(valuesList)
cityTuple = tuple(cityList)

try:
    cursor.execute(sql_clean_province)
    db.commit()
except Exception:
    print('TRUNCATE failed, rolling back')
    db.rollback()
try:
    cursor.executemany(sql, value_tuple)  # province rows
    db.commit()
except Exception:
    print('Province insert failed, rolling back')
    db.rollback()
try:
    cursor.executemany(sql, cityTuple)  # city rows
    db.commit()
except Exception:
    print('City insert failed, rolling back')
    db.rollback()
db.close()
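One fragile spot worth noting: the hard-coded slice account[52:-21] breaks as soon as Dingxiangyuan changes the wrapper around the embedded JSON. A more defensive alternative (my own sketch, not part of the original tutorial) pulls the array out with a regular expression instead of fixed offsets:

import re
import json

# the script tag holds something like: try { window.getAreaStat = [ ... ] }catch(e){}
# grab everything from the first '[' to the last ']' instead of slicing at fixed offsets
script = soup.find('script', attrs={'id': 'getAreaStat'})
match = re.search(r'\[.*\]', script.string, re.S)
messages_json = json.loads(match.group(0)) if match else []

This reuses the soup object from the code above and leaves the rest of the script unchanged.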