zoukankan      html  css  js  c++  java
  • 用Python实现数据的爬取

    上周做了java web的数据可视化,但是数据库里面的内容是固定的,不能动态实时地更新,如果人为手动更新,工作量也是巨大的。于是实时更新数据就涉及到了数据的爬取工作,对于数据爬取我是从零开始的。

    一开始去网上查询如何用java代码实现,但是发现在短时间之内无法掌握,于是看了看Python,发现还是比较容易的。直接上代码,这是对全国各个省份和城市的新冠肺炎各项数据的爬取。

      1 from os import path
      2 import requests
      3 from bs4 import BeautifulSoup
      4 import json
      5 import pymysql
      6 import numpy as np
      7 import time
      8 
      # Fetch the DXY COVID-19 page and extract the per-province statistics that
      # the page embeds as JSON inside <script id="getAreaStat">.
      url = 'https://ncov.dxy.cn/ncovh5/view/pneumonia?from=timeline&isappinstalled=0'  # request URL
      headers = {
          'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}  # browser-like request headers
      # A timeout keeps the script from hanging forever on a stalled connection.
      response = requests.get(url, headers=headers, timeout=30)
      # Fail loudly on an HTTP error instead of trying to parse an error page.
      response.raise_for_status()
      content = response.content.decode('utf-8')
      soup = BeautifulSoup(content, 'html.parser')
      listA = soup.find_all(name='script', attrs={"id": "getAreaStat"})
      if not listA or not listA[0].string:
          raise ValueError('script#getAreaStat was not found in the page')
      # The script text wraps a JSON array in some JavaScript (presumably an
      # assignment to window.getAreaStat - confirm against the live page).
      # Cutting from the first '[' to the last ']' does not depend on the exact
      # length of that wrapper, unlike fixed character offsets.
      account = str(listA[0].string)
      array_start = account.find('[')
      array_end = account.rfind(']')
      if array_start == -1 or array_end < array_start:
          raise ValueError('script#getAreaStat does not contain a JSON array')
      messages = account[array_start:array_end + 1]
      messages_json = json.loads(messages)
      # Row buffers filled by the code below: one list for province-level rows,
      # one for city-level rows.
      valuesList = []
      cityList = []
     28 
     # Flatten the parsed statistics into row tuples for the database.
     # Each row is (id, timestamp, province short name, city name or None,
     # confirmed, suspected, cured, dead, location id).
     # One timestamp is taken for the whole snapshot so every row of a single
     # run carries the same time.
     snapshot_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
     # City ids continue after the province ids, which run 1..len(messages_json).
     next_city_id = len(messages_json)
     for province_id, province in enumerate(messages_json, start=1):
         province_short_name = province.get('provinceShortName')
         valuesList.append((
             province_id, snapshot_time, province_short_name, None,
             province.get('confirmedCount'), province.get('suspectedCount'),
             province.get('curedCount'), province.get('deadCount'),
             province.get('locationId')))
         # Entries without a 'cities' key yield None; treat that as "no cities"
         # instead of crashing on len(None).
         for city in province.get('cities') or []:
             next_city_id += 1
             cityList.append((
                 next_city_id, snapshot_time, province_short_name,
                 city.get('cityName'), city.get('confirmedCount'),
                 city.get('suspectedCount'), city.get('curedCount'),
                 city.get('deadCount'), city.get('locationId')))
     # Persist the scraped rows into the local MySQL database "yiqing".
     # connect() is called with keyword arguments: PyMySQL 1.0+ no longer
     # accepts host/user/password/database positionally.
     db = pymysql.connect(host="localhost", user="root", password="******",
                          database="yiqing", charset='utf8')
     cursor = db.cursor()
     # NOTE(review): the script clears info3 but inserts into info2 - confirm
     # which table is intended. Row ids restart at 1 on every run, so info2
     # may reject re-inserted ids if that column is a key.
     sql_clean_province = "TRUNCATE TABLE info3"
     sql = "INSERT INTO info2 values (%s,%s,%s,%s,%s,%s,%s,%s,%s) "


     def _run_step(failure_message, statement, rows=None):
         """Execute one statement and commit it; roll back on a MySQL error.

         failure_message: text printed (followed by the error) on failure.
         statement: SQL to run.
         rows: sequence of parameter tuples for a bulk insert via
             executemany(); None runs the statement once without parameters.

         Only pymysql.MySQLError is caught, so programming errors and
         KeyboardInterrupt are no longer silently swallowed.
         """
         try:
             if rows is None:
                 cursor.execute(statement)
             else:
                 cursor.executemany(statement, rows)
             db.commit()
         except pymysql.MySQLError as err:
             print(failure_message, err)
             db.rollback()


     try:
         # NOTE: MySQL documents TRUNCATE TABLE as causing an implicit commit,
         # so the rollback cannot undo this step.
         _run_step('执行失败,进入回调1', sql_clean_province)
         _run_step('执行失败,进入回调3', sql, valuesList)
         _run_step('执行失败,进入回调4', sql, cityList)
     finally:
         # Always release the cursor and connection, even on an unexpected error.
         cursor.close()
         db.close()

    其中上图便是和数据库的连接

     这是数据库的设计。

  • 相关阅读:
    推荐20个开源项目托管网站
    python 网络编程(网络基础之网络协议篇)
    python 异常处理
    python 内置函数的补充 isinstance,issubclass, hasattr ,getattr, setattr, delattr,str,del 用法,以及元类
    python3 封装之property 多态 绑定方法classmethod 与 非绑定方法 staticmethod
    python3 类 组合
    PYTHON3中 类的继承
    面向对象 与类
    包 与常用模块
    json 与pickle模块(序列化与反序列化))
  • 原文地址:https://www.cnblogs.com/g414056667/p/12462870.html
Copyright © 2011-2022 走看看