Today I mainly studied the basics of using Python to scrape data and store it in a database.
The working environment is Jupyter Notebook, which opens in the browser and lets you edit and run code directly while also writing explanatory notes alongside it.
Basic usage of urllib
from urllib import request

# Add header information -- the most basic anti-scraping countermeasure
url = "http://www.bilibili.com/"
header = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36"
}
req = request.Request(url, headers=header)
res = request.urlopen(req)  # get the response
print(res.info())     # response headers
print(res.getcode())  # status code: 2xx normal, 3xx redirect, 4xx problem with the requested resource, 5xx server problem
print(res.geturl())   # the URL that actually responded
html = res.read()
html = html.decode("utf-8")
print(html)
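Note that urlopen raises an exception on 4xx/5xx responses instead of returning them, so the status-code check above only ever sees successful requests. Below is a minimal sketch of catching those errors; the fetch helper and its timeout value are my own additions, only the urllib.error classes and their attributes are the library's real API:

from urllib import request, error

def fetch(url, headers=None):
    """Hypothetical helper: return (status_code, body) and handle HTTP errors."""
    req = request.Request(url, headers=headers or {})
    try:
        with request.urlopen(req, timeout=10) as res:
            return res.getcode(), res.read().decode("utf-8")
    except error.HTTPError as e:
        # 4xx/5xx responses land here instead of being returned normally
        return e.code, e.reason
    except error.URLError as e:
        # DNS failures, refused connections, timeouts, ...
        return None, str(e.reason)

print(fetch("http://www.bilibili.com/", {"user-agent": "Mozilla/5.0"}))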
Basic usage of requests
import requests

url = "http://www.bilibili.com/"
res = requests.get(url)
print(res.encoding)  # taken from the charset in the Content-Type header; if no charset is set, requests falls back to ISO-8859-1
print(res.headers)
print(res.url)
print(res.text)
print(res.status_code)
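Because of that ISO-8859-1 fallback, Chinese pages sometimes come back as mojibake. A small sketch of the usual workaround is to let requests guess the encoding from the response body via apparent_encoding (the URL is just reused from above):

import requests

res = requests.get("http://www.bilibili.com/")
# apparent_encoding is detected from the body itself, so it is usually
# more reliable than the ISO-8859-1 fallback from the headers
if res.encoding is None or res.encoding.lower() == "iso-8859-1":
    res.encoding = res.apparent_encoding
print(res.encoding)
print(res.text[:200])  # first 200 characters, now decoded correctly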
Parsing content with beautifulsoup4
from bs4 import BeautifulSoup
import requests

url = "http://wsjkw.sc.gov.cn/scwsjkw/gzbd/fyzt.shtml"
res = requests.get(url)
res.encoding = "utf-8"
html = res.text
soup = BeautifulSoup(html, "html.parser")  # specify the parser explicitly to avoid a warning
soup.find("h2").text
a = soup.find("a")      # first <a> tag on the page
print(a)
print(a.attrs)          # the tag's attributes as a dict
print(a.attrs["href"])
url_new = "http://wsjkw.sc.gov.cn" + a.attrs["href"]  # build the absolute URL of the daily bulletin
url_new
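find only returns the first match; a quick sketch of collecting every link on the same page with find_all (find_all and tag.get are standard BeautifulSoup API, the slicing and URL prefix are just for the demo):

# collect all links on the page, not just the first one
links = soup.find_all("a")
for tag in links[:10]:                    # first 10 as a sample
    href = tag.get("href")                # like tag.attrs["href"], but returns None instead of raising
    if href and href.startswith("/"):     # site-relative links need the host prepended
        href = "http://wsjkw.sc.gov.cn" + href
    print(tag.text.strip(), href)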
Parsing with re
import re

res = requests.get(url_new)
res.encoding = "utf-8"
soup = BeautifulSoup(res.text, "html.parser")
context = soup.select("p")
text = context[1].text
print(text)
pattern = r"确诊病例(\d+)例"   # capture the number of confirmed cases
res = re.search(pattern, text)
print(res)
print(res.groups())
print(res.group(0))  # the whole match
print(res.group(1))  # the captured number
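re.search only gives one match; when the bulletin paragraph mentions several figures, re.finditer can pull them all out in one pass. A minimal sketch, assuming the text follows the same "……病例N例" wording (the sample string below is made up for illustration):

import re

# assumed sample text imitating the bulletin's wording
sample = "新增确诊病例3例,新增治愈出院病例10例,新增死亡病例0例"
pattern = r"(?P<kind>确诊|治愈出院|死亡)病例(?P<num>\d+)例"
for m in re.finditer(pattern, sample):
    print(m.group("kind"), int(m.group("num")))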
Scraping the epidemic data
import requests
import json

url = "https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5"
res = requests.get(url)
d = json.loads(res.text)
data_all = json.loads(d["data"])
print(data_all)
print(data_all["areaTree"][0].keys())
print(data_all["areaTree"][0]["name"])
print(data_all["areaTree"][0]["today"])
print(data_all["areaTree"][0]["total"])
print(data_all["areaTree"][0]["children"])
print(len(data_all["areaTree"][0]["children"]))
for i in data_all["areaTree"][0]["children"]:
    print(i["name"])
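To make the nested areaTree easier to store later, it helps to flatten it into one row per city; a short sketch under the assumption that the JSON layout stays exactly as printed above:

# flatten the nested areaTree into flat per-city rows
rows = []
update_time = data_all["lastUpdateTime"]
for province in data_all["areaTree"][0]["children"]:
    for city in province["children"]:
        rows.append([
            update_time,
            province["name"],
            city["name"],
            city["total"]["confirm"],
            city["today"]["confirm"],
            city["total"]["heal"],
            city["total"]["dead"],
        ])
print(len(rows), rows[:3])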
Storing the data into the database
import pymysql
import time
import json
import traceback
import requests


def get_tencent_data():
    """
    :return: the historical data and the current-day detail data
    """
    url = "https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5"
    # disease_h5 now only carries the current-day details, so the history
    # has to come from the separate disease_other interface
    url_his = "https://view.inews.qq.com/g2/getOnsInfo?name=disease_other"
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36",
    }
    r = requests.get(url, headers=headers)
    res = json.loads(r.text)            # JSON string -> dict
    data_all = json.loads(res["data"])

    # the matching request for the historical data
    r_his = requests.get(url_his, headers=headers)
    res_his = json.loads(r_his.text)
    data_his = json.loads(res_his["data"])

    history = {}  # historical data
    # The old version looped over data_all["chinaDayList"] / data_all["chinaDayAddList"],
    # but data_all now only contains the current-day details; the fix is simply to
    # switch the data source of the two loops below to data_his.
    for i in data_his["chinaDayList"]:
        ds = "2020." + i["date"]
        tup = time.strptime(ds, "%Y.%m.%d")
        ds = time.strftime("%Y-%m-%d", tup)  # reformat the date, otherwise the datetime column in the database rejects the insert
        confirm = i["confirm"]
        suspect = i["suspect"]
        heal = i["heal"]
        dead = i["dead"]
        history[ds] = {"confirm": confirm, "suspect": suspect, "heal": heal, "dead": dead}
    for i in data_his["chinaDayAddList"]:
        ds = "2020." + i["date"]
        tup = time.strptime(ds, "%Y.%m.%d")
        ds = time.strftime("%Y-%m-%d", tup)
        confirm = i["confirm"]
        suspect = i["suspect"]
        heal = i["heal"]
        dead = i["dead"]
        history[ds].update({"confirm_add": confirm, "suspect_add": suspect, "heal_add": heal, "dead_add": dead})

    # nothing below needs to change
    details = []  # current-day detail data
    update_time = data_all["lastUpdateTime"]
    data_country = data_all["areaTree"]          # list of 25 countries
    data_province = data_country[0]["children"]  # the Chinese provinces
    for pro_infos in data_province:
        province = pro_infos["name"]  # province name
        for city_infos in pro_infos["children"]:
            city = city_infos["name"]
            confirm = city_infos["total"]["confirm"]
            confirm_add = city_infos["today"]["confirm"]
            heal = city_infos["total"]["heal"]
            dead = city_infos["total"]["dead"]
            details.append([update_time, province, city, confirm, confirm_add, heal, dead])
    return history, details
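get_tencent_data only fetches the data; pymysql is what actually writes it. Below is a minimal sketch of the insert step, assuming a local MySQL database named cov with a details table whose columns match the seven fields in each row -- the connection parameters, database name, and table schema are all assumptions, only the pymysql calls themselves are real API:

def update_details():
    """Hypothetical helper: insert the current-day details into the (assumed) details table."""
    _, details = get_tencent_data()
    sql = ("INSERT INTO details(update_time, province, city, confirm, confirm_add, heal, dead) "
           "VALUES (%s, %s, %s, %s, %s, %s, %s)")
    conn = pymysql.connect(host="127.0.0.1", user="root", password="123456",
                           db="cov", charset="utf8mb4")  # assumed connection parameters
    try:
        with conn.cursor() as cursor:
            cursor.executemany(sql, details)  # one batched insert for all cities
        conn.commit()
    except Exception:
        conn.rollback()
        traceback.print_exc()
    finally:
        conn.close()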