如何使用
只需要将代码中的 headurl 替换为以下格式的链接即可:你只需要把链接中的博客地址部分(原文中以阴影标出,即下面示例中的 -wenli)换成你自己的博客园地址,末尾的 default.html?page= 保持不变,程序会在其后自动拼接页码。
类似: https://www.cnblogs.com/-wenli/default.html?page=
原理
先使用 requests 爬取网页,再使用 BeautifulSoup 解析网页并提取数据,然后对数据做预处理(去掉换行和空白字符),最后使用正则表达式匹配出需要的数据(阅读量和评论量)。
最终的数据存储在一个大字典 item 中:键为博文标题,值为包含“阅读量”和“评论量”的字典。
爬取网页
爬取网页时做了异常处理:请求抛出 RequestException 时返回 None,只有状态码为 200 时才返回网页源码。
def get_one_page(url, headers, timeout=10):
    """Download ``url`` and return its body as text.

    Args:
        url: Address of the page to fetch.
        headers: HTTP headers sent with the request (e.g. a User-Agent).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        The response text when the server answers with HTTP 200; ``None``
        for any other status code or when the request raises
        ``RequestException`` (which includes timeouts).
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    # Make the "not OK" result explicit instead of falling off the end.
    return None
解析网页
def parse_one_page(html):
    """Extract per-post read and comment counts from one blog list page.

    For every post found, an entry ``title -> {"阅读量": str, "评论量": str}``
    is written into the module-level ``item`` dict. When the page cannot be
    used (no HTML, or no ``div.day`` sections), the module-level ``flag`` is
    set to ``False``.

    Args:
        html: Page source as text, or ``None``/empty when the download failed.

    Returns:
        ``""`` when ``flag`` was cleared; otherwise ``None``.
    """
    global item, flag

    if not html:
        # A failed download yields None; clear the flag instead of crashing
        # inside the HTML parser.
        flag = False
        return ""

    soup = BeautifulSoup(html, 'lxml')
    days = soup.find_all('div', class_='day')
    if not days:
        flag = False
        return ""

    # \d+ captures the numbers that follow "阅读" and "评论" in the post footer.
    stats_pattern = re.compile(r'^.*阅读.(\d+)..*评论.(\d+)..*编辑$')

    for day in days:
        title_divs = day.find_all('div', class_='postTitle')
        desc_divs = day.find_all('div', class_='postDesc')
        # Pair each title with its footer; zip avoids indexing past the end
        # when the two lists differ in length.
        for title_div, desc_div in zip(title_divs, desc_divs):
            links = title_div.find_all('a', class_='postTitle2')
            if not links:
                continue
            # Drop every whitespace character (newlines, spaces, ...).
            title = "".join(links[0].get_text().split())
            info = "".join(desc_div.get_text().split())
            match = stats_pattern.match(info)
            if match is None:
                # Footer not in the expected format; skip this post.
                continue
            item[title] = {"阅读量": match.group(1), "评论量": match.group(2)}
统计数据
def statistics():
    """Print the post count and the summed read and comment counts.

    Reads the module-level ``item`` mapping. Each value is expected to be a
    dict with the keys '阅读量' and '评论量', both convertible with ``int``.
    """
    reads = 0
    comments = 0
    for stats in item.values():
        reads += int(stats['阅读量'])
        comments += int(stats['评论量'])
    posts = len(item)

    print('总博文量:', posts)
    print('总阅读量:', reads)
    print('总评论量:', comments)
源码
"""Count the posts, reads and comments of a cnblogs.com blog.

The script walks the blog's paginated list pages, extracts each post's title
together with its read and comment counters, and prints the totals.
"""
import re
import time

import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException

# Shared crawl state. Defined at module level so the functions below also work
# when this file is imported instead of being run as a script.
flag = True   # cleared by parse_one_page() once a page yields no posts
item = {}     # post title -> {"阅读量": str, "评论量": str}


def get_one_page(url, headers, timeout=10):
    """Download ``url`` and return its body as text.

    Args:
        url: Address of the page to fetch.
        headers: HTTP headers sent with the request (e.g. a User-Agent).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        The response text when the server answers with HTTP 200; ``None``
        for any other status code or when the request raises
        ``RequestException`` (which includes timeouts).
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None


def parse_one_page(html):
    """Extract per-post read and comment counts from one blog list page.

    For every post found, an entry ``title -> {"阅读量": str, "评论量": str}``
    is written into the module-level ``item`` dict. When the page cannot be
    used (no HTML, or no ``div.day`` sections), the module-level ``flag`` is
    set to ``False`` so that the paging loop in ``main`` stops.

    Args:
        html: Page source as text, or ``None``/empty when the download failed.

    Returns:
        ``""`` when ``flag`` was cleared; otherwise ``None``.
    """
    global item, flag

    if not html:
        # get_one_page() returns None on failure; stop paging instead of
        # crashing inside the HTML parser.
        flag = False
        return ""

    soup = BeautifulSoup(html, 'lxml')
    days = soup.find_all('div', class_='day')
    if not days:
        flag = False
        return ""

    # \d+ captures the numbers that follow "阅读" and "评论" in the post footer.
    stats_pattern = re.compile(r'^.*阅读.(\d+)..*评论.(\d+)..*编辑$')

    for day in days:
        title_divs = day.find_all('div', class_='postTitle')
        desc_divs = day.find_all('div', class_='postDesc')
        # Pair each title with its footer; zip avoids indexing past the end
        # when the two lists differ in length.
        for title_div, desc_div in zip(title_divs, desc_divs):
            links = title_div.find_all('a', class_='postTitle2')
            if not links:
                continue
            # Drop every whitespace character (newlines, spaces, ...).
            title = "".join(links[0].get_text().split())
            info = "".join(desc_div.get_text().split())
            match = stats_pattern.match(info)
            if match is None:
                # Footer not in the expected format; skip this post.
                continue
            item[title] = {"阅读量": match.group(1), "评论量": match.group(2)}


def statistics():
    """Print the post count and the summed read and comment counts.

    Reads the module-level ``item`` mapping; each value holds the keys
    '阅读量' and '评论量', both converted with ``int``.
    """
    reads = 0
    comments = 0
    for stats in item.values():
        reads += int(stats['阅读量'])
        comments += int(stats['评论量'])

    print('总博文量:', len(item))
    print('总阅读量:', reads)
    print('总评论量:', comments)


def kind():
    """Placeholder with no implementation."""
    pass


def main():
    """Crawl the list pages one by one until ``flag`` is cleared, then print totals."""
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'}
    headurl = 'https://www.cnblogs.com/-wenli/default.html?page='
    page = 1
    while flag:
        url = headurl + str(page)
        print(url)
        # Fetch the page source.
        html = get_one_page(url, headers)
        # Parse it; this clears ``flag`` when there is nothing left to read.
        parse_one_page(html)
        page += 1
    # Report the totals.
    statistics()


if __name__ == '__main__':
    main()