zoukankan      html  css  js  c++  java
  • [Python]爬取新型冠状病毒2.2至今的所有数据 python 2020.2.13

    爬取网址http://hu.yixue99.com/2020/kszx_0205/27792.html

    代码如下:

      1 import requests
      2 from bs4 import BeautifulSoup
      3 
      # Address of the overview page that lists the daily bulletins.
      url = "http://hu.yixue99.com/2020/kszx_0205/27792.html"
      # Request headers sent with every download in this script.
      kv = {"user-agent": "Mozilla/5.0"}
      6 
      7 #爬取总览信息
      def content():
          """Scrape the overview table and append its rows to the summary file.

          Downloads the overview page and visits every
          ``<span style="font-size:14px;">`` element. The column-header and
          link labels are removed from each span's text; cells that end up
          empty are skipped. Every run of five remaining cells is joined with
          single spaces, printed, and handed to ``wtire_content`` with a
          trailing newline. A trailing run of fewer than five cells is dropped.
          """
          url = "http://hu.yixue99.com/2020/kszx_0205/27792.html"
          # A timeout keeps the script from hanging forever on a stalled server.
          r = requests.get(url, headers=kv, timeout=30)
          r.encoding = r.apparent_encoding
          soup = BeautifulSoup(r.text, "html.parser")
          print("开始")

          # Labels stripped from every cell, applied in this order.
          labels = ("时间(北京时间)", "确诊", "疑似", "死亡", "治愈", "疫情详情", "点击查看")
          cells_per_row = 5
          row = []
          for s in soup.find_all("span", {"style": "font-size:14px;"}):
              # NOTE(review): ``.string`` is None for a span without a single
              # text child, so str() yields the text "None", which still counts
              # as a cell -- confirm this is wanted.
              text = str(s.string)
              for label in labels:
                  text = text.replace(label, "")
              if text == "":
                  continue
              row.append(text)
              if len(row) == cells_per_row:
                  texts = " ".join(row)
                  print(texts)
                  # NOTE(review): the listing also called replace("", "") here,
                  # which is a no-op as written; an invisible character may have
                  # been lost from the published code.
                  wtire_content(texts + "\n")
                  row = []
     29 
     30 
     31 #爬取链接
     def href():
         """Collect the bulletin links from the overview page.

         Downloads the overview page and, for every
         ``<span style="font-size:14px;">`` element that contains an ``<a>``
         tag, prints the anchor's ``href`` and appends it to the links file
         through ``wtire_href``, one address per line.
         """
         url = "http://hu.yixue99.com/2020/kszx_0205/27792.html"
         # A timeout keeps the script from hanging forever on a stalled server.
         r = requests.get(url, headers=kv, timeout=30)
         r.encoding = r.apparent_encoding
         soup = BeautifulSoup(r.text, "html.parser")
         print("开始")

         for s in soup.find_all("span", {"style": "font-size:14px;"}):
             anchor = s.find("a")
             if anchor is None:
                 continue
             target = anchor.get("href")
             if target is None:
                 # An anchor without an href has no address to record.
                 continue
             link = str(target)
             print(link)
             wtire_href(link + "\n")
     47 
     48 
     49 #爬取内容
     def content_day(url):
         """Scrape one bulletin page and append its rows to the daily file.

         The page's ``<td>`` cells with the fixed zero-padding style are read in
         document order. The first cell supplies the date label: its text with
         the "各省疫情动态(截止至" prefix and the " 10:00)" / " 11:00)" suffixes
         removed. In the remaining cells the column labels and spaces are
         stripped and empty cells are skipped. Every run of five cells is
         printed and passed to ``write_content_day`` as
         ``"<date> <c1> <c2> <c3> <c4> <c5>\\n"``.

         Args:
             url: Address of the bulletin page to download.
         """
         # A timeout keeps the script from hanging forever on a stalled server.
         r = requests.get(url, headers=kv, timeout=30)
         r.encoding = r.apparent_encoding
         soup = BeautifulSoup(r.text, "html.parser")
         print(url)
         print("开始")

         cell_style = {"style": "PADDING-BOTTOM: 0px; PADDING-TOP: 0px; PADDING-LEFT: 0px; MARGIN: 0px; PADDING-RIGHT: 0px"}
         cells = soup.find_all("td", cell_style)
         if not cells:
             # Without the header cell there is no date to attach to the rows;
             # skip the page instead of failing with an AttributeError.
             print("no matching table cells, skipping page")
             return

         date_label = str(cells[0].string)
         for fragment in ("各省疫情动态(截止至", " 10:00)", " 11:00)"):
             date_label = date_label.replace(fragment, "")
         print(date_label)

         # Labels (and spaces) stripped from every data cell, applied in order.
         labels = ("确诊", "疑似", "死亡", "治愈", " ", "省份")
         cells_per_row = 5
         row = []
         # The first cell is the header already consumed for the date label.
         for s in cells[1:]:
             text = str(s.string)
             for label in labels:
                 text = text.replace(label, "")
             if text == "":
                 continue
             row.append(text)
             if len(row) == cells_per_row:
                 texts = " ".join(row)
                 print(date_label + texts)
                 write_content_day(date_label + " " + texts + "\n")
                 row = []
     76 
     77 #写入总览信息
     def wtire_content(contents, path="E:/bingducsv/bingdusum.txt"):
         """Append ``contents`` to the overview summary file.

         Args:
             contents: Text to append; it is written as-is, so the caller
                 supplies any trailing newline.
             path: Destination file. Defaults to the location the script has
                 always used.
         """
         # The context manager closes the file even if the write fails.
         with open(path, "a+", encoding="utf-8") as f:
             f.write(contents)
     82 
     83 #写入每日的链接
     def wtire_href(contents, path="E:/bingducsv/bingduhref.txt"):
         """Append ``contents`` to the file that stores the bulletin links.

         Args:
             contents: Text to append; it is written as-is, so the caller
                 supplies any trailing newline.
             path: Destination file. Defaults to the location the script has
                 always used.
         """
         # The context manager closes the file even if the write fails.
         with open(path, "a+", encoding="utf-8") as f:
             f.write(contents)
     88 
     def read(path="E:/bingducsv/bingduhref.txt"):
         """Scrape every bulletin page listed in the saved links file.

         Each line of the file is treated as one URL and passed to
         ``content_day`` after its trailing newline is removed. Empty lines are
         skipped.

         Args:
             path: Links file to read. Defaults to the location the script has
                 always used.
         """
         # Read-only access is enough, and the context manager closes the file.
         with open(path, "r", encoding="utf-8") as f:
             for line in f:
                 url = line.rstrip("\n")
                 if not url:
                     # A blank line is not a URL; requesting it would fail.
                     continue
                 content_day(url)
     95 
     def write_content_day(contents, path="E:/bingducsv/bingduday.txt"):
         """Append ``contents`` to the per-day data file.

         Args:
             contents: Text to append; it is written as-is, so the caller
                 supplies any trailing newline.
             path: Destination file. Defaults to the location the script has
                 always used.
         """
         # The context manager closes the file even if the write fails.
         with open(path, "a+", encoding="utf-8") as f:
             f.write(contents)
    100 
    101 
    if __name__ == "__main__":
        # Run the stages in order: overview rows, link list, then every
        # linked page read back from the links file.
        for stage in (content, href, read):
            stage()
  • 相关阅读:
    java语言基础001
    Linux 使用硬盘
    Linux 系统运行命令 > 查看系统信息
    Linux rm 命令
    Linux 操作系统目录结构
    JavaScript || 事件基础
    My SQL随记 003 数据表基础操作语法
    My SQL随记 002 登陆
    My SQL随记 001 常用名词/结构化语言
    linux命令学习
  • 原文地址:https://www.cnblogs.com/zlc364624/p/12304119.html
Copyright © 2011-2022 走看看