上周做了Java Web的数据可视化,但是数据库里面的内容是固定的,不能动态实时地更新。如果人为手动更新,工作量也是巨大的,于是实时更新数据就涉及到了数据的爬取工作。对于数据爬取,我是从零开始的,
一开始去网上查询如何用Java代码实现,但是发现在短时间之内无法掌握,于是看了看Python,发现还是比较容易的。直接上代码,这是对全国各个省、市的新冠肺炎各项数据的爬取。
"""Scrape DXY's COVID-19 statistics page and store the figures in MySQL.

The page embeds a JSON array in a ``<script id="getAreaStat">`` tag. Each
element describes one province and carries a nested ``cities`` list. The
script flattens both levels into 9-column rows and bulk-inserts them.
"""
import json
import time
from os import path  # kept from the original script; currently unused

import numpy as np  # kept from the original script; currently unused

# NOTE: requests, bs4 and pymysql are imported inside the functions that use
# them so that the pure parsing helpers below stay importable (and testable)
# on machines that do not have the scraping / database packages installed.

URL = 'https://ncov.dxy.cn/ncovh5/view/pneumonia?from=timeline&isappinstalled=0'  # request address
HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}  # browser-like request headers
REQUEST_TIMEOUT = 10  # seconds; without a timeout a stalled server hangs the script forever

# NOTE(review): the script truncates ``info3`` but inserts into ``info2``.
# This is preserved from the original; confirm against the schema whether
# the two statements are meant to target the same table.
SQL_CLEAN_PROVINCE = "TRUNCATE TABLE info3"
SQL_INSERT = "INSERT INTO info2 values (%s,%s,%s,%s,%s,%s,%s,%s,%s) "


def parse_area_stat(script_text):
    """Return the JSON array embedded in the ``getAreaStat`` script body.

    The script body wraps the array in JavaScript (an assignment inside a
    ``try``/``catch``). Instead of slicing at fixed character offsets, decode
    one JSON document starting at the first ``[`` and ignore what follows.

    Raises:
        ValueError: if the text contains no ``[`` or the array is not valid
            JSON (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    start = script_text.find('[')
    if start == -1:
        raise ValueError('getAreaStat script does not contain a JSON array')
    data, _ = json.JSONDecoder().raw_decode(script_text[start:])
    return data


def build_rows(provinces, timestamp):
    """Flatten the province list into ``(province_rows, city_rows)``.

    Every row is a 9-tuple: id, timestamp, province short name, city name
    (``None`` for province rows), confirmed, suspected, cured, dead and
    location id. Province ids run from 1; city ids continue after the last
    province id so the two sets never collide.
    """
    province_rows = []
    city_rows = []
    next_city_id = len(provinces)
    for province_id, province in enumerate(provinces, start=1):
        short_name = province.get('provinceShortName')
        province_rows.append((
            province_id, timestamp, short_name, None,
            province.get('confirmedCount'), province.get('suspectedCount'),
            province.get('curedCount'), province.get('deadCount'),
            province.get('locationId'),
        ))
        # A province without a ``cities`` entry contributes no city rows.
        for city in province.get('cities') or []:
            next_city_id += 1
            city_rows.append((
                next_city_id, timestamp, short_name, city.get('cityName'),
                city.get('confirmedCount'), city.get('suspectedCount'),
                city.get('curedCount'), city.get('deadCount'),
                city.get('locationId'),
            ))
    return province_rows, city_rows


def fetch_area_stat():
    """Download the statistics page and return the parsed province list.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        ValueError: if the page has no usable ``getAreaStat`` script.
    """
    import requests
    from bs4 import BeautifulSoup

    response = requests.get(URL, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    soup = BeautifulSoup(response.content.decode('utf-8'), 'html.parser')
    script = soup.find(name='script', attrs={"id": "getAreaStat"})
    if script is None:
        raise ValueError('page does not contain a getAreaStat script tag')
    return parse_area_stat(script.string or '')


def store_rows(province_rows, city_rows):
    """Clear the province table, then insert the province and city rows.

    Each of the three steps is committed on its own. A database error in one
    step is reported and rolled back without preventing the later steps,
    matching the original best-effort behaviour.
    """
    import pymysql

    # Keyword arguments: PyMySQL 1.0+ no longer accepts positional ones.
    db = pymysql.connect(host="localhost", user="root", password="******",
                         database="yiqing", charset='utf8')
    try:
        with db.cursor() as cursor:
            steps = (
                ('执行失败,进入回调1', cursor.execute, (SQL_CLEAN_PROVINCE,)),
                ('执行失败,进入回调3', cursor.executemany, (SQL_INSERT, province_rows)),
                ('执行失败,进入回调4', cursor.executemany, (SQL_INSERT, city_rows)),
            )
            for failure_message, run, args in steps:
                try:
                    run(*args)
                    db.commit()
                except pymysql.MySQLError as err:
                    # Print the cause as well; the bare message alone hid it.
                    print(failure_message, err)
                    db.rollback()
    finally:
        # Always release the connection, even if a step raised unexpectedly.
        db.close()


def main():
    """Fetch the current statistics and write them to the database."""
    provinces = fetch_area_stat()
    # One timestamp per run so every row of a snapshot carries the same time.
    timestamp = time.strftime('%Y-%m-%d %H:%M:%S')
    province_rows, city_rows = build_rows(provinces, timestamp)
    store_rows(province_rows, city_rows)


if __name__ == "__main__":
    main()
其中上图便是和数据库的连接
这是数据库的设计。