  • Zhihu Hot List Crawler

    I wrote a crawler that scrapes the Zhihu hot list and saves the results as JSON (update 1) in the current directory, organized by crawl time.
    Replace the cookie with your own and it should work.
    Scraped fields: Rank (current hot-list ranking), Title (question title), Hot (the question's current heat), Url (question link), Tags (the question's tags, visible after opening the question).
    Update 1 adds Ans (the two top answers) and switches the output to proper JSON; a sample record is shown below.
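
    For reference, a single record in the update 1 output looks roughly like this (illustrative values, not real scraped data):

    {
        "Rank": "1",
        "Title": "Example question title",
        "Hot": "1234万热度",
        "Url": "https://www.zhihu.com/question/123456789",
        "Tags": ["Tag A", "Tag B"],
        "Ans": ["First answer text...", "Second answer text..."]
    }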

    Code

    # coding:utf-8
    # author:graykido
    # date:2021.5.25
    
    
    from bs4 import BeautifulSoup
    import requests
    import os
    import random
    import time
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Upgrade-Insecure-Requests': '1',
        'cookie': ''}  # paste your own Zhihu cookie here
    
    url = "https://www.zhihu.com/hot"
    plus = ["", "?list=science", "?list=digital", "?list=sport", "?list=fashion", "?list=film", "?list=school", "?list=car",
            "?list=depth", "?list=focus"]
    plusNameInCn = ["全站", "科学", "数码", "体育", "时尚", "影视", "校园", "汽车", "时事", "国际"]
    
    for i in range(len(plus)):
        myRec2Txt = ""
        new_url = url + plus[i]
        name = plusNameInCn[i]
        print(name + ": ")
        r = requests.get(new_url, headers=headers)
        r.encoding = 'utf-8'
        bs = BeautifulSoup(r.text, "lxml")
        sections = bs.find_all(class_="HotItem")
        for section in sections:
            tags = []
            rank = section.div.div.text
            title = section.a.get('title')
            # the metrics element carries the heat; some cards use the "--bottom" variant
            heat = section.find(class_="HotItem-metrics HotItem-metrics--bottom")
            if heat is None:
                heat = section.find(class_="HotItem-metrics")
            heat_ar = heat.text.split(' ')
            heat = heat_ar[0] + "万热度"  # e.g. "1234 万热度" -> "1234万热度"
            tag_url = section.find(class_="HotItem-content").a['href']
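            # optional courtesy pause before fetching each question page
            # (assumption: a short random delay may help avoid rate limiting)
            time.sleep(random.uniform(0.5, 1.5))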
            r2 = requests.get(tag_url, headers=headers)
            r2.encoding = 'utf-8'
            soup2 = BeautifulSoup(r2.text, 'lxml')
            try:
                tags_divs = soup2.find(class_="QuestionHeader-topics").find_all(class_="Popover")
                for tags_div in tags_divs:
                    tags_true = tags_div.find(id='null-toggle').text
                    tags.append(tags_true)
                myRec = {}
                myRec['Rank'] = rank
                myRec['Title'] = title
                myRec['Hot'] = heat
                myRec['Url'] = tag_url
                myRec['Tags'] = tags
                myRec2Txt += str(myRec) + "\n"
            except Exception:
                print("error occurred while parsing the question page")
        fold_path = './' + name
        # create the output folder if it does not exist yet
        if not os.path.exists(fold_path):
            print("Creating folder...")
            os.makedirs(fold_path)
        filepath = fold_path + '/' + time.strftime("%Y-%m-%d %H_%M_%S", time.localtime()) + ".txt"
        if os.path.exists(filepath):
            print("File already exists")
        else:
            with open(filepath, "w", encoding="utf-8") as f:
                f.write(str(myRec2Txt))
            print("File created")
    print("just like another saturday night,mission finshed!")
    
    

    Update 1

    # coding:utf-8
    # author:graykido
    # date:2021.5.25
    # update1:2021.6.3
    
    from bs4 import BeautifulSoup
    import requests
    import os
    import random
    import time
    import json
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Upgrade-Insecure-Requests': '1',
        'cookie': ''}  # paste your own Zhihu cookie here
    
    url = "https://www.zhihu.com/hot"
    plus = ["", "?list=science", "?list=digital", "?list=sport", "?list=fashion", "?list=film", "?list=school", "?list=car",
            "?list=depth", "?list=focus"]
    plusNameInCn = ["全站", "科学", "数码", "体育", "时尚", "影视", "校园", "汽车", "时事", "国际"]
    
    for i in range(len(plus)):
        myRec2Json = []
        new_url = url + plus[i]
        name = plusNameInCn[i]
        r = requests.get(new_url, headers=headers)
        r.encoding = 'utf-8'
        bs = BeautifulSoup(r.text, "lxml")
        sections = bs.find_all(class_="HotItem")
        for section in sections:
            tags = []
            rank = section.div.div.text
            title = section.a.get('title')
            # the metrics element carries the heat; some cards use the "--bottom" variant
            heat = section.find(class_="HotItem-metrics HotItem-metrics--bottom")
            if heat is None:
                heat = section.find(class_="HotItem-metrics")
            heat_ar = heat.text.split(' ')
            heat = heat_ar[0] + "万热度"  # e.g. "1234 万热度" -> "1234万热度"
            tag_url = section.find(class_="HotItem-content").a['href']
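            # optional courtesy pause before fetching each question page
            # (assumption: a short random delay may help avoid rate limiting)
            time.sleep(random.uniform(0.5, 1.5))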
            r2 = requests.get(tag_url, headers=headers)
            r2.encoding = 'utf-8'
            soup2 = BeautifulSoup(r2.text, 'lxml')
            try:
                tags_divs = soup2.find(class_="QuestionHeader-topics").find_all(class_="Popover")
                for tags_div in tags_divs:
                    tags_true = tags_div.find(id='null-toggle').text
                    tags.append(tags_true)
                resOfQues = []
                resBs = soup2.find_all(class_="RichContent-inner")
                # keep the two top answers shown on the page
                for ans in resBs[:2]:
                    resOfQues.append(ans.text)
                myRec = {}
                myRec['Rank'] = rank
                myRec['Title'] = title
                myRec['Hot'] = heat
                myRec['Url'] = tag_url
                myRec['Tags'] = tags
                myRec['Ans'] = resOfQues
                myRec2Json.append(myRec)
            except Exception:
                print("error occurred while parsing the question page")
        fold_path = './' + time.strftime("%Y/%m/%d")
        # create the output folder if it does not exist yet
        if not os.path.exists(fold_path):
            print("Creating folder...")
            os.makedirs(fold_path)
        filepath = fold_path + '/' + name + ".json"
        if os.path.exists(filepath):
            print("File already exists")
        else:
            with open(filepath, "w", encoding="utf-8") as f:
                json.dump(myRec2Json, f, ensure_ascii=False)
            print("File created")
    print("just like another saturday night,mission finshed!")
    
    
  • Original article: https://www.cnblogs.com/graytido/p/14810206.html