zoukankan      html  css  js  c++  java
  • 大二下学期第一次结对作业(第一阶段)

    今日主要学习了使用python爬取数据并存入数据库的基本操作:

    操作环境是 Jupyter Notebook:它以网页形式打开,可以直接在其中编写和运行代码,并撰写说明文档。

    urllib的基本使用

    from urllib import request
    # Send a browser-like User-Agent header: the most basic way past naive
    # anti-scraping checks.
    url = "http://www.bilibili.com/"
    header = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36"
    }
    req = request.Request(url, headers=header)
    # Open the response as a context manager so the underlying connection is
    # closed even if printing or reading fails part-way through.
    with request.urlopen(req) as res:
        print(res.info())  # response headers
        print(res.getcode())  # status code: 2xx OK, 3xx redirect, 4xx problem with the requested resource, 5xx server error
        print(res.geturl())  # URL of the resource that was actually retrieved
        html = res.read()
    # The body arrives as bytes; decode it to text before printing.
    html = html.decode("utf-8")
    print(html)

    requests 的基本使用

    import requests
    url = "http://www.bilibili.com/"
    res = requests.get(url)
    # Print, in order: the encoding requests chose, the response headers, the
    # response URL, the decoded body and the HTTP status code.
    # Encoding note: requests takes the encoding from the charset given in the
    # Content-Type header; when a text response declares no charset it falls
    # back to ISO-8859-1.
    for field in ("encoding", "headers", "url", "text", "status_code"):
        print(getattr(res, field))

    beautifulsoup4 解析内容

    from urllib.parse import urljoin
    from bs4 import BeautifulSoup
    import requests
    url = "http://wsjkw.sc.gov.cn/scwsjkw/gzbd/fyzt.shtml"
    res = requests.get(url)
    # Force UTF-8 decoding of the body instead of the encoding requests picked.
    res.encoding = "utf-8"
    html = res.text
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns), so results can differ
    # between machines.
    soup = BeautifulSoup(html, "html.parser")
    # Bare expression: only shows a value when it is the last line of a
    # notebook cell.
    soup.find("h2").text
    a = soup.find("a")  # first <a> element in the document
    print(a)
    print(a.attrs)
    print(a.attrs["href"])
    
    # Resolve the link against the page URL rather than concatenating strings,
    # so relative and absolute hrefs both produce a valid URL.
    url_new = urljoin(url, a.attrs["href"])
    url_new

    re解析

    import re
    # Fetch the article page found in the previous step and decode it as UTF-8.
    res = requests.get(url_new)
    res.encoding = "utf-8"
    # Explicit parser so the parse does not depend on which parsers are installed.
    soup = BeautifulSoup(res.text, "html.parser")
    context = soup.select("p")
    text = context[1].text  # text of the second <p> element
    print(text)
    # Raw string with \d+ so the group captures the digits; a plain "d+" would
    # only match literal letter d's and the search would return None.
    pattern = r"确诊病例(\d+)例"
    # Use a separate name for the match so the HTTP response is not shadowed.
    match = re.search(pattern, text)
    print(match)
    print(match.groups())  # tuple of all captured groups
    print(match.group(0))  # the whole matched substring
    print(match.group(1))  # the captured number

    爬取疫情数据

    import requests
    import json
    url = "https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5"
    res = requests.get(url)
    # The response body is JSON whose "data" field is itself a JSON-encoded
    # string, so it has to be decoded twice.
    d = json.loads(res.text)
    data_all = json.loads(d["data"])
    print(data_all)
    # Inspect the first entry of the area tree: its keys, then selected fields.
    first_area = data_all["areaTree"][0]
    print(first_area.keys())
    for key in ("name", "today", "total", "children"):
        print(first_area[key])
    # List the names of the entries nested under that first area.
    sub_areas = first_area["children"]
    print(len(sub_areas))
    for sub_area in sub_areas:
        print(sub_area["name"])

    存入数据库

    import pymysql
    import time
    import json
    import traceback
    import requests
    def _fetch_payload(url, headers):
        """Download ``url`` and return its decoded ``data`` payload."""
        # Headers must be passed by keyword: the second positional argument of
        # requests.get is ``params``, which would put them in the query string
        # instead of sending them as HTTP headers.
        response = requests.get(url, headers=headers)
        envelope = json.loads(response.text)  # JSON string -> dict
        # The "data" field is itself a JSON-encoded string, so decode it again.
        return json.loads(envelope['data'])

    def _to_iso_date(month_day):
        """Convert an ``mm.dd`` string into ``YYYY-mm-dd`` using the year 2020."""
        # NOTE(review): the year is hard-coded to 2020, so entries from other
        # years would be mislabelled -- confirm whether the feed carries a year.
        parsed = time.strptime("2020." + month_day, "%Y.%m.%d")
        # Reformatted because, per the original author's note, the database
        # column is a datetime type and rejects the raw format.
        return time.strftime("%Y-%m-%d", parsed)

    def get_tencent_data():
        """
        Fetch the history data and the current per-city detail data.

        :return: a ``(history, details)`` tuple. ``history`` maps a
            ``YYYY-mm-dd`` date string to a dict with the keys ``confirm``,
            ``suspect``, ``heal``, ``dead`` and their ``*_add`` counterparts.
            ``details`` is a list of
            ``[update_time, province, city, confirm, confirm_add, heal, dead]``.
        :raises KeyError: if an expected key is missing from a response, or if
            ``chinaDayAddList`` has a date that ``chinaDayList`` lacks.
        """
        url = 'https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5'
        # Second endpoint that carries the day-by-day history lists.
        url_his = 'https://view.inews.qq.com/g2/getOnsInfo?name=disease_other'

        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36',
        }
        data_all = _fetch_payload(url, headers)
        data_his = _fetch_payload(url_his, headers)

        # The history lists are read from the disease_other payload: per the
        # original author's note, disease_h5 only holds the current-day details.
        history = {}  # history data keyed by date
        for day in data_his["chinaDayList"]:
            history[_to_iso_date(day["date"])] = {
                "confirm": day["confirm"],
                "suspect": day["suspect"],
                "heal": day["heal"],
                "dead": day["dead"],
            }
        for day in data_his["chinaDayAddList"]:
            # Merge the daily increments into the entry created above.
            history[_to_iso_date(day["date"])].update({
                "confirm_add": day["confirm"],
                "suspect_add": day["suspect"],
                "heal_add": day["heal"],
                "dead_add": day["dead"],
            })

        details = []  # current-day detail rows
        update_time = data_all["lastUpdateTime"]
        data_country = data_all["areaTree"]  # list of country-level entries
        # The original author treats the first entry as China and its children
        # as the provinces -- TODO confirm against the live payload.
        data_province = data_country[0]["children"]
        for pro_infos in data_province:
            province = pro_infos["name"]  # province name
            for city_infos in pro_infos["children"]:
                details.append([
                    update_time,
                    province,
                    city_infos["name"],
                    city_infos["total"]["confirm"],
                    city_infos["today"]["confirm"],  # newly confirmed today
                    city_infos["total"]["heal"],
                    city_infos["total"]["dead"],
                ])
        return history, details
  • 相关阅读:
    Python3中的新特性(3)——代码迁移与2to3
    Python3中的新特性(1)——新的语言特性
    Python3中的新特性(2)——常见陷阱
    输入一行字符,统计其中有多少个单词,单词之间用空格分隔开
    scanf(),gets(),gechar()函数小结
    CI控制器调用内部方法并载入相应模板的做法
    script脚本中写不写$(document).ready(function() {});的区别
    CentOS系统时间与现在时间相差8小时解决方法
    Linux下MySQL慢查询分析mysqlsla安装使用
    导入 Mysql 示例数据库 employees
  • 原文地址:https://www.cnblogs.com/fengchuiguobanxia/p/14527272.html
Copyright © 2011-2022 走看看