zoukankan      html  css  js  c++  java
  • Python爬虫实现统计博客园博文数量、阅读量、评论数

    如何使用

    只需要将代码中的headurl替换为以下格式的链接即可。其中只需要修改链接中的博客名部分(下例中的 -wenli),把它换成你自己的博客园博客名。

    类似:
    https://www.cnblogs.com/-wenli/default.html?page=

    原理

    先使用requests爬取网页,再使用BeautifulSoup解析网页并提取数据;对提取到的文本做预处理(去掉换行和空格)之后,用正则表达式匹配出需要的阅读量和评论数。

    最后的数据使用一个大字典存储。

    爬取网页

    爬取网页这里做了异常处理。

    def get_one_page(url, headers, timeout=10):
        """Download ``url`` and return the response body as text.

        Args:
            url: Address of the page to fetch.
            headers: HTTP request headers handed to ``requests.get``.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            The response text when the status code is 200; ``None`` when the
            request raises ``RequestException`` or any other status is returned.
        """
        try:
            # Without a timeout, requests can wait indefinitely on a stalled server.
            response = requests.get(url, headers=headers, timeout=timeout)
        except RequestException:
            return None
        if response.status_code == 200:
            return response.text
        return None

    解析网页

    def parse_one_page(html):
        """Collect the read and comment counts of every post on one listing page.

        Results go into the module-level ``item`` dict, keyed by post title; each
        value is ``{"阅读量": <digits>, "评论量": <digits>}`` with the counts kept as
        strings.  When there is nothing to parse (no HTML, or no ``div.day``
        blocks) the module-level ``flag`` is set to ``False``.

        Args:
            html: Markup of the listing page, or ``None``/empty if the download
                failed.

        Returns:
            ``""`` when the page yielded nothing, otherwise ``None``.
        """
        global item, flag
        # A failed download hands us None; treat it like an empty page instead of
        # passing it on to the parser.
        if not html:
            flag = False
            return ""
        soup = BeautifulSoup(html, 'lxml')
        divs = soup.find_all('div', class_='day')
        if not divs:
            flag = False
            return ""
        for day in divs:
            title_divs = day.find_all('div', class_='postTitle')
            desc_divs = day.find_all('div', class_='postDesc')
            # Titles and descriptions are paired by position within the day block.
            for title_div, desc_div in zip(title_divs, desc_divs):
                link = title_div.find('a', class_='postTitle2')
                if link is None:
                    continue
                # Strip line breaks and spaces from the title text.
                title = link.get_text().replace('\n', '').replace(' ', '')
                info = desc_div.get_text().replace('\n', '').replace(' ', '')
                result = re.match(r'^.*阅读.(\d+)..*评论.(\d+)..*编辑$', info)
                if result is None:
                    # Description text is not in the expected form; skip the post.
                    continue
                item[title] = {"阅读量": result.group(1), "评论量": result.group(2)}
        return None

    统计数据

    def statistics():
        """Print the number of posts and the summed read and comment counts.

        Reads the module-level ``item`` dict; each value supplies its counts under
        the keys ``'阅读量'`` and ``'评论量'``, which are converted with ``int``.
        """
        reads = 0
        comments = 0
        for counts in item.values():
            reads += int(counts['阅读量'])
            comments += int(counts['评论量'])
        posts = len(item)
        print('总博文量:', posts)
        print('总阅读量:', reads)
        print('总评论量:', comments)

    源码

    from bs4 import BeautifulSoup
    import requests
    from requests.exceptions import RequestException
    import re
    import time
    
    def get_one_page(url, headers, timeout=10):
        """Download ``url`` and return the response body as text.

        Args:
            url: Address of the page to fetch.
            headers: HTTP request headers handed to ``requests.get``.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            The response text when the status code is 200; ``None`` when the
            request raises ``RequestException`` or any other status is returned.
        """
        try:
            # Without a timeout, requests can wait indefinitely on a stalled server.
            response = requests.get(url, headers=headers, timeout=timeout)
        except RequestException:
            return None
        if response.status_code == 200:
            return response.text
        return None
    def parse_one_page(html):
        """Collect the read and comment counts of every post on one listing page.

        Results go into the module-level ``item`` dict, keyed by post title; each
        value is ``{"阅读量": <digits>, "评论量": <digits>}`` with the counts kept as
        strings.  When there is nothing to parse (no HTML, or no ``div.day``
        blocks) the module-level ``flag`` is set to ``False``.

        Args:
            html: Markup of the listing page, or ``None``/empty if the download
                failed.

        Returns:
            ``""`` when the page yielded nothing, otherwise ``None``.
        """
        global item, flag
        # A failed download hands us None; treat it like an empty page instead of
        # passing it on to the parser.
        if not html:
            flag = False
            return ""
        soup = BeautifulSoup(html, 'lxml')
        divs = soup.find_all('div', class_='day')
        if not divs:
            flag = False
            return ""
        for day in divs:
            title_divs = day.find_all('div', class_='postTitle')
            desc_divs = day.find_all('div', class_='postDesc')
            # Titles and descriptions are paired by position within the day block.
            for title_div, desc_div in zip(title_divs, desc_divs):
                link = title_div.find('a', class_='postTitle2')
                if link is None:
                    continue
                # Strip line breaks and spaces from the title text.
                title = link.get_text().replace('\n', '').replace(' ', '')
                info = desc_div.get_text().replace('\n', '').replace(' ', '')
                result = re.match(r'^.*阅读.(\d+)..*评论.(\d+)..*编辑$', info)
                if result is None:
                    # Description text is not in the expected form; skip the post.
                    continue
                item[title] = {"阅读量": result.group(1), "评论量": result.group(2)}
        return None
    def statistics():
        """Print the number of posts and the summed read and comment counts.

        Reads the module-level ``item`` dict; each value supplies its counts under
        the keys ``'阅读量'`` and ``'评论量'``, which are converted with ``int``.
        """
        reads = 0
        comments = 0
        for counts in item.values():
            reads += int(counts['阅读量'])
            comments += int(counts['评论量'])
        posts = len(item)
        print('总博文量:', posts)
        print('总阅读量:', reads)
        print('总评论量:', comments)
    
    def kind():
        """Placeholder with an empty body; always returns ``None``."""
        return None
    
    def main():
        """Fetch the blog's listing pages in order, then print the totals.

        Pages are requested starting at 1 for as long as the module-level ``flag``
        is truthy.  Each page's HTML is passed to ``parse_one_page``, and
        ``statistics`` is called once the loop ends.
        """
        headurl = 'https://www.cnblogs.com/-wenli/default.html?page='
        headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'}
        page = 1
        while flag:
            url = headurl + str(page)
            print(url)
            # Download the page source and feed it straight to the parser.
            parse_one_page(get_one_page(url, headers))
            page += 1

        # Report the aggregated numbers.
        statistics()
    
    
    if __name__ == '__main__':
        # Module-level state shared through ``global`` by the functions above.
        item = {}    # post title -> {"阅读量": ..., "评论量": ...}
        flag = True  # main() keeps requesting pages while this stays truthy
        main()

     演示结果:

  • 相关阅读:
    防火墙实践
    提高工作效率的小技巧
    网络系统参数配置
    linux 文件截取
    python (1) 还不是大全的小问题
    iptables 命令记录
    网络常用端口
    HTTP时间指标
    错误集锦
    access数据库调用
  • 原文地址:https://www.cnblogs.com/-wenli/p/12464545.html
Copyright © 2011-2022 走看看