  • First Pair Programming Assignment of the Second Semester of Sophomore Year (Phase 1)

    Today I mainly studied the basics of scraping data with Python and storing it in a database:

    The working environment is Jupyter Notebook, which opens in the browser and lets you edit and run code directly, as well as write explanatory notes alongside it.

    Basic use of urllib

    from urllib import request
    # add header info -- the most basic anti-scraping countermeasure
    url = "http://www.bilibili.com/"
    header = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36"
    }
    req = request.Request(url, headers=header)
    res = request.urlopen(req)  # get the response
    print(res.info())  # response headers
    print(res.getcode())  # status code: 2xx OK, 3xx redirect, 4xx problem with the requested resource, 5xx server error
    print(res.geturl())  # the final response URL
    html = res.read()
    html = html.decode("utf-8")
    print(html)

    Basic use of requests

    import requests
    url = "http://www.bilibili.com/"
    res=requests.get(url)
    print(res.encoding)
    print(res.headers)  # if the Content-Type header carries a charset, that charset becomes res.encoding; otherwise requests falls back to ISO-8859-1
    print(res.url)
    print(res.text)
    print(res.status_code)
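
    Because of that fallback, a page that only declares its charset inside the HTML (and not in the response headers) can come back garbled. A minimal sketch of one common workaround, switching to requests' apparent_encoding when no real charset was sent (the helper name fetch_text is just for illustration, not from the original post):

    import requests

    def fetch_text(url):
        res = requests.get(url)
        # if the server declared no charset, requests falls back to ISO-8859-1;
        # apparent_encoding guesses the real encoding from the response body instead
        if res.encoding is None or res.encoding.lower() == "iso-8859-1":
            res.encoding = res.apparent_encoding
        return res.text

    print(fetch_text("http://www.bilibili.com/")[:200])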

    Parsing content with beautifulsoup4

    from bs4 import BeautifulSoup
    import requests
    url = "http://wsjkw.sc.gov.cn/scwsjkw/gzbd/fyzt.shtml"
    res = requests.get(url)
    res.encoding="utf-8"
    html = res.text
    soup = BeautifulSoup(html, "html.parser")
    soup.find("h2").text
    a = soup.find("a")
    print(a)
    print(a.attrs)
    print(a.attrs["href"])
    
    url_new = "http://wsjkw.sc.gov.cn"+a.attrs["href"]
    url_new
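
    Concatenating the site root only works because this href happens to be root-relative. A slightly more robust sketch (continuing from the cell above, so a is the tag found earlier) uses urllib.parse.urljoin, which resolves relative and absolute hrefs against the page URL:

    from urllib.parse import urljoin

    # urljoin handles relative paths, root-relative paths and absolute URLs alike
    url_new = urljoin(url, a.attrs["href"])
    print(url_new)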

    Parsing with re

    import re
    res = requests.get(url_new)
    res.encoding = "utf-8"
    soup = BeautifulSoup(res.text, "html.parser")
    context = soup.select("p")
    text = context[1].text
    print(text)
    pattern = r"确诊病例(\d+)例"  # \d+ captures the number of confirmed cases
    m = re.search(pattern, text)
    print(m)
    print(m.groups())
    print(m.group(0))
    print(m.group(1))

    Scraping the epidemic data

    import requests
    import json
    url="https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5"
    res=requests.get(url)
    d=json.loads(res.text)
    data_all=json.loads(d["data"])
    print(data_all)
    print(data_all["areaTree"][0].keys())
    print(data_all["areaTree"][0]["name"])
    print(data_all["areaTree"][0]["today"])
    print(data_all["areaTree"][0]["total"])
    print(data_all["areaTree"][0]["children"])
    print(len(data_all["areaTree"][0]["children"]))
    for i in data_all["areaTree"][0]["children"]:
        print(i["name"])

    Storing the data in a database

    import pymysql
    import time
    import json
    import traceback
    import requests
    def get_tencent_data():
        """
        :return: historical data and current-day detail data
        """
        url = 'https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5'
        url_his = 'https://view.inews.qq.com/g2/getOnsInfo?name=disease_other'  # extra endpoint that carries the history data

        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36',
        }
        r = requests.get(url, headers=headers)
        res = json.loads(r.text)  # JSON string -> dict
        data_all = json.loads(res['data'])

        # fetch the matching history response
        r_his = requests.get(url_his, headers=headers)
        res_his = json.loads(r_his.text)
        data_his = json.loads(res_his['data'])
    
        history = {}  # historical data
        # The earlier version of this function looped over data_all["chinaDayList"] and
        # data_all["chinaDayAddList"], but the disease_h5 endpoint now only carries the
        # current-day details, so the loops below read the same fields from the
        # disease_other response (data_his) instead.
        for i in data_his["chinaDayList"]:
            ds = "2020." + i["date"]
            tup = time.strptime(ds, "%Y.%m.%d")
            ds = time.strftime("%Y-%m-%d", tup)  # reformat the date, otherwise the insert fails because the DB column is a datetime
            confirm = i["confirm"]
            suspect = i["suspect"]
            heal = i["heal"]
            dead = i["dead"]
            history[ds] = {"confirm": confirm, "suspect": suspect, "heal": heal, "dead": dead}
        for i in data_his["chinaDayAddList"]:
            ds = "2020." + i["date"]
            tup = time.strptime(ds, "%Y.%m.%d")
            ds = time.strftime("%Y-%m-%d", tup)
            confirm = i["confirm"]
            suspect = i["suspect"]
            heal = i["heal"]
            dead = i["dead"]
            history[ds].update({"confirm_add": confirm, "suspect_add": suspect, "heal_add": heal, "dead_add": dead})
            
        # nothing below needs to change
        details = []  # current-day detail data
        update_time = data_all["lastUpdateTime"]
        data_country = data_all["areaTree"]  # list of 25 countries
        data_province = data_country[0]["children"]  # provinces of China
        for pro_infos in data_province:
            province = pro_infos["name"]  # province name
            for city_infos in pro_infos["children"]:
                city = city_infos["name"]
                confirm = city_infos["total"]["confirm"]
                confirm_add = city_infos["today"]["confirm"]
                heal = city_infos["total"]["heal"]
                dead = city_infos["total"]["dead"]
                details.append([update_time, province, city, confirm, confirm_add, heal, dead])
        return history, details
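
    The function above only collects the data; pymysql is imported but the post stops before the actual insert step. A minimal sketch of how the details rows might be written to MySQL, assuming a database named cov with a details(update_time, province, city, confirm, confirm_add, heal, dead) table; the connection parameters and table layout are assumptions, not from the original post:

    def insert_details():
        """Write the current-day rows returned by get_tencent_data() into MySQL."""
        _, details = get_tencent_data()
        # connection parameters below are placeholders -- adjust to your own setup
        conn = pymysql.connect(host="localhost", user="root", password="123456",
                               db="cov", charset="utf8")
        cursor = conn.cursor()
        try:
            # assumed table: details(update_time, province, city, confirm, confirm_add, heal, dead)
            sql = ("INSERT INTO details(update_time, province, city, confirm, confirm_add, heal, dead) "
                   "VALUES (%s, %s, %s, %s, %s, %s, %s)")
            for row in details:
                cursor.execute(sql, row)
            conn.commit()
        except Exception:
            traceback.print_exc()
            conn.rollback()
        finally:
            cursor.close()
            conn.close()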