  • [Web Crawler] Scraping news from the China Meteorological Science Popularization site (qxkp.net) with Python

    The script below fetches the news listings for each disaster-prevention category under http://www.qxkp.net/zhfy/ and appends each article's title, date, link, and category to a file. The code is as follows:

    import requests
    from bs4 import BeautifulSoup
    import News.IO as io  # local helper module News/IO.py (shown below)
    
    
    
    url = "http://www.qxkp.net/zhfy/"
    
    # Request headers: the cookie string was captured from a browser session
    cookie = {
        "cityPy": "UM_distinctid=171f2280ef23fb-02a4939f3c1bd4-335e4e71-144000-171f2280ef3dab; Hm_lvt_" +
                  "ab6a683aa97a52202eab5b3a9042a8d2=1588905651; CNZZDATA1275796416=871124600-1588903268-%7C1588990372; " +
                  "Hm_lpvt_ab6a683aa97a52202eab5b3a9042a8d2=1588994046"}
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 " +
                      "(KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3756.400 QQBrowser/10.5.4039.400"}
    
    # Category slugs under /zhfy/ whose listings are paginated
    Type = ["ghzh", "byhl", "scb", "tffy", "flaq", "hc", "jdtqsj", "gwfh"]
    
    
    def get_url_addition():
        # Two extra category slugs whose listings are not paginated
        for slug in ["bb", "dw"]:
            # Fetch the category's listing page
            html = requests.get(url="http://www.qxkp.net/zhfy/" + slug + "/", headers=header, cookies=cookie)
            # Set the encoding before reading the text
            html.encoding = "utf-8"
            soup = BeautifulSoup(html.text, 'html.parser')
            # Find the news-list containers
            divs = soup.find_all("div", class_='con_list_news_zt')
            for d in divs:
                # The div's text carries the title on its first line and the date on the second
                div = BeautifulSoup(d.text, "html.parser")
                title = str(div).split("\n")[0]
                date = str(div).split("\n")[1]
                # The raw HTML holds href="./....html" target=..., so take the
                # substring between "./" and the closing quote
                href = "http://www.qxkp.net/zhfy/" + slug + "/" + str(d).split("./")[1].split('" tar')[0]
                print(title, date, href, slug)
                io.cw("news", title + " " + date + " " + href + " " + slug + "\n")
    
    
    def get_url(url):
        for slug in Type:
            # Each category's listing is paginated as index_1.html ... index_9.html
            for i in range(1, 10):
                # Fetch one listing page
                html = requests.get(url=url + slug + "/index_" + str(i) + ".html", headers=header, cookies=cookie)
                # Set the encoding before reading the text
                html.encoding = "utf-8"
                soup = BeautifulSoup(html.text, 'html.parser')
                # Find the news-list containers
                divs = soup.find_all("div", class_='con_list_news_zt')
                for d in divs:
                    # The div's text carries the title on its first line and the date on the second
                    div = BeautifulSoup(d.text, "html.parser")
                    title = str(div).split("\n")[0]
                    date = str(div).split("\n")[1]
                    # Take the relative link between "./" and the closing quote of the href
                    href = "http://www.qxkp.net/zhfy/" + slug + "/" + str(d).split("./")[1].split('" tar')[0]
                    print(title, date, href, slug)
                    io.cw("news", title + " " + date + " " + href + " " + slug + "\n")
    
    if __name__ == '__main__':
        get_url(url)
        get_url_addition()
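
    Note that get_url requests index_1.html through index_9.html unconditionally. On static listing sites the first page is usually plain index.html and deeper pages may simply not exist, so a gentler variant would probe each page and stop at the first miss. The sketch below reuses the header and cookie globals from above; the page-naming scheme is an assumption, not something verified against qxkp.net:

    def get_listing_pages(base, slug, max_pages=9):
        # Yield the decoded HTML of each listing page for one category,
        # stopping at the first page that is missing (non-200 status).
        # Assumes page 1 is "index.html" and page N is "index_N.html".
        for i in range(1, max_pages + 1):
            name = "index.html" if i == 1 else "index_" + str(i) + ".html"
            resp = requests.get(base + slug + "/" + name, headers=header, cookies=cookie)
            if resp.status_code != 200:
                break
            resp.encoding = "utf-8"
            yield resp.text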

    The News/IO.py module imported at the top provides the cw helper, which appends a string to a file:

    # News/IO.py -- file-append helper
    def cw(file, con):
        # Open in append mode so repeated calls accumulate records
        with open(file, "a+", encoding='utf-8') as f:
            f.write(con)


    if __name__ == "__main__":
        cw("sss.txt", "aaaaa")  # quick self-test: appends "aaaaa" to sss.txt

  • Original post: https://www.cnblogs.com/zlc364624/p/14612286.html