zoukankan html css js c++ java

Python爬虫实现统计博客园博文数量、阅读量、评论数

如何使用

只需要将代码中的 headurl 替换为以下格式的链接，其中只需把链接里的用户名部分（下例中的 -wenli）改成你自己的博客园用户名即可。

类似：
https://www.cnblogs.com/-wenli/default.html?page=

原理

使用 requests 爬取网页，再使用 BeautifulSoup 解析网页并提取数据；对数据做预处理（去掉换行和空格）后，用正则表达式匹配出需要的阅读量和评论数。

最后的数据使用一个大字典存储。

爬取网页

爬取网页这里做了异常处理。

def get_one_page(url, headers, timeout=10):
    """Download a page and return its HTML text, or None on failure.

    Args:
        url: Address of the page to fetch.
        headers: HTTP headers passed through to ``requests.get``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text when the status code is 200; ``None`` for
        any other status code or when the request raises
        ``RequestException``.
    """
    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    # Make the non-200 outcome explicit instead of falling off the end.
    return None

解析网页

def parse_one_page(html):
    """Extract per-post read and comment counts from one listing page.

    Results are written into the module-level ``item`` dict, keyed by post
    title, as ``{"阅读量": <str>, "评论量": <str>}``. When the page has no
    ``div.day`` blocks (or there is no HTML at all) the module-level ``flag``
    is set to ``False`` so the caller's paging loop stops.

    Args:
        html: HTML text of a listing page, or ``None`` if the download failed.

    Returns:
        ``""`` when there was nothing to parse, otherwise ``None``.
    """
    global item, flag
    if not html:
        # A failed download yields None; treat it like an empty page.
        flag = False
        return ""
    soup = BeautifulSoup(html, 'lxml')
    days = soup.find_all('div', class_='day')
    if not days:
        flag = False
        return ""
    for day in days:
        # Collect the cleaned titles of every post inside this day block.
        titles = []
        for title_div in day.find_all('div', class_='postTitle'):
            links = title_div.find_all('a', class_='postTitle2')
            text = links[0].get_text()
            # Strip line breaks and spaces from the title.
            titles.append(text.replace('\n', '').replace(' ', ''))
        descriptions = day.find_all('div', class_='postDesc')
        # The n-th description belongs to the n-th title of the same day.
        for title, description in zip(titles, descriptions):
            info = description.get_text()
            info = info.replace('\n', '').replace(' ', '')
            # Raw string so that \d means "digit" rather than the letter d.
            result = re.match(r'^.*阅读.(\d+)..*评论.(\d+)..*编辑$', info)
            if result is None:
                # Footer does not have the expected layout; skip this post
                # instead of failing on result.group().
                continue
            item[title] = {
                "阅读量": result.group(1),
                "评论量": result.group(2),
            }

统计数据

def statistics():
    """Print the post count and the summed read/comment counts.

    Reads the module-level ``item`` dict, whose values are dicts holding the
    keys ``'阅读量'`` and ``'评论量'``; both values are converted with
    ``int`` before summing.
    """
    # One (reads, comments) pair per post, converted in the stored order.
    counts = [
        (int(stats['阅读量']), int(stats['评论量']))
        for stats in item.values()
    ]
    total_reads = sum(reads for reads, _ in counts)
    total_comments = sum(comments for _, comments in counts)
    print('总博文量：', len(counts))
    print('总阅读量：', total_reads)
    print('总评论量：', total_comments)

源码

from bs4 import BeautifulSoup
import requests
from requests.exceptions import RequestException
import re
import time

def get_one_page(url, headers, timeout=10):
    """Download a page and return its HTML text, or None on failure.

    Args:
        url: Address of the page to fetch.
        headers: HTTP headers passed through to ``requests.get``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text when the status code is 200; ``None`` for
        any other status code or when the request raises
        ``RequestException``.
    """
    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    # Make the non-200 outcome explicit instead of falling off the end.
    return None
def parse_one_page(html):
    """Extract per-post read and comment counts from one listing page.

    Results are written into the module-level ``item`` dict, keyed by post
    title, as ``{"阅读量": <str>, "评论量": <str>}``. When the page has no
    ``div.day`` blocks (or there is no HTML at all) the module-level ``flag``
    is set to ``False`` so the caller's paging loop stops.

    Args:
        html: HTML text of a listing page, or ``None`` if the download failed.

    Returns:
        ``""`` when there was nothing to parse, otherwise ``None``.
    """
    global item, flag
    if not html:
        # A failed download yields None; treat it like an empty page.
        flag = False
        return ""
    soup = BeautifulSoup(html, 'lxml')
    days = soup.find_all('div', class_='day')
    if not days:
        flag = False
        return ""
    for day in days:
        # Collect the cleaned titles of every post inside this day block.
        titles = []
        for title_div in day.find_all('div', class_='postTitle'):
            links = title_div.find_all('a', class_='postTitle2')
            text = links[0].get_text()
            # Strip line breaks and spaces from the title.
            titles.append(text.replace('\n', '').replace(' ', ''))
        descriptions = day.find_all('div', class_='postDesc')
        # The n-th description belongs to the n-th title of the same day.
        for title, description in zip(titles, descriptions):
            info = description.get_text()
            info = info.replace('\n', '').replace(' ', '')
            # Raw string so that \d means "digit" rather than the letter d.
            result = re.match(r'^.*阅读.(\d+)..*评论.(\d+)..*编辑$', info)
            if result is None:
                # Footer does not have the expected layout; skip this post
                # instead of failing on result.group().
                continue
            item[title] = {
                "阅读量": result.group(1),
                "评论量": result.group(2),
            }
def statistics():
    """Print the post count and the summed read/comment counts.

    Reads the module-level ``item`` dict, whose values are dicts holding the
    keys ``'阅读量'`` and ``'评论量'``; both values are converted with
    ``int`` before summing.
    """
    # One (reads, comments) pair per post, converted in the stored order.
    counts = [
        (int(stats['阅读量']), int(stats['评论量']))
        for stats in item.values()
    ]
    total_reads = sum(reads for reads, _ in counts)
    total_comments = sum(comments for _, comments in counts)
    print('总博文量：', len(counts))
    print('总阅读量：', total_reads)
    print('总评论量：', total_comments)

def kind():
    """Placeholder with no behaviour yet; always returns None."""
    return None

def main():
    """Crawl the blog's listing pages one by one, then print the totals.

    Pages are requested as ``headurl + <page number>`` starting from 1. The
    loop runs while the module-level ``flag`` is true (``parse_one_page``
    clears it when a page has no posts) and also stops as soon as a page
    cannot be downloaded.
    """
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'}
    headurl = 'https://www.cnblogs.com/-wenli/default.html?page='
    page = 1
    while flag:
        url = headurl + str(page)
        print(url)
        # Fetch the page source.
        html = get_one_page(url, headers)
        if html is None:
            # get_one_page returns None on a failed request; stop paging
            # rather than handing None to the HTML parser.
            break
        # Parse the page source into the shared result dict.
        parse_one_page(html)
        page += 1

    # Report the aggregated numbers.
    statistics()


if __name__ == "__main__":
    # Module-level state shared by the functions above:
    # `item` collects the per-post results, `flag` keeps the paging loop
    # in main() running while it is true.
    item = dict()
    flag = True
    main()

演示结果:

查看全文

相关阅读:
反击黑客之对网站攻击者的IP追踪
 如何使用Nginx对抗DDoS攻击？
nginx网站攻击防护
 Ora-01536：超出了表空间users的空间限量
 ASP.Net请求处理机制初步探索之旅
 ASP.Net请求处理机制初步探索之旅
 ASP.Net请求处理机制初步探索之旅
 自己动手写工具：百度图片批量下载器
 自己动手写游戏：坦克撕逼大战
 【大型网站技术实践】初级篇：海量图片的分布式存储设计与实现

原文地址：https://www.cnblogs.com/-wenli/p/12464545.html