问题描述
爬取博客园首页的数据(URL:https://home.cnblogs.com/blog/page/1/),然后写入自己的 Excel 文件中。
环境:
OS:Windows 10
python:3.7
代码
import requests
import os
from bs4 import BeautifulSoup
import xlwt
import xlrd
from xlutils.copy import copy
import threading
import datetime
class BlogHome:
    """Scrape the cnblogs home feed and write each post into an Excel sheet.

    The crawl walks the paginated feed at ``self.url``, extracts a handful
    of fields from every ``post_block`` element and writes them as one row
    per post into the first sheet of the workbook at ``EXCEL_PATH``.

    ``all_page`` reads a module-level ``thread_lock`` semaphore, which must
    be defined before the crawl is started.
    """

    # Workbook that is read, filled and saved back in place.
    # NOTE(review): this path (and ``self.path`` below) contains no
    # backslashes even though it is a raw string; the separators look like
    # they were lost when the code was pasted. Left unchanged -- confirm the
    # intended location before running.
    EXCEL_PATH = r"C:UserspeiqiangDesktopaaa.xls"

    # Seconds to wait for the server; without a timeout ``requests.get``
    # can block forever on a stalled connection.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        """Set the page URL template and the working directory path."""
        self.url = "https://home.cnblogs.com/blog/page/{}/"
        self.path = r"C:pythonProjectBlog"

    def _page_url(self, param):
        """Return the feed URL for page ``param``."""
        return self.url.format(param)

    @staticmethod
    def _strip_brackets(name):
        """Return ``name`` with every ``[`` and ``]`` character removed."""
        return name.replace("[", "").replace("]", "")

    def request(self, param):
        """Download feed page ``param`` and return the response body as text.

        The request goes to the formatted page URL; previously the raw
        template (still containing ``{}``) was requested, so the page
        number was ignored.
        """
        r = requests.get(self._page_url(param), timeout=self.REQUEST_TIMEOUT)
        return r.text

    def all_page(self, maxpage):
        """Crawl pages ``1 .. maxpage - 1`` and save the rows to the workbook.

        The existing workbook is opened with its formatting, copied into a
        writable workbook, filled starting at row index 4 and saved back to
        the same path once all pages have been processed.

        NOTE(review): ``range(1, maxpage)`` excludes ``maxpage`` itself;
        confirm whether the last page is meant to be included.
        """
        wbk = xlrd.open_workbook(self.EXCEL_PATH, formatting_info=True)
        wbCopy = copy(wbk)
        sheet = wbCopy.get_sheet(0)
        row = 4
        for page in range(1, maxpage):
            # ``with`` releases the semaphore even when the download or the
            # parsing raises, which the manual acquire/release pair did not.
            with thread_lock:
                req = self.request(page)
                row = self.getdata(req, sheet, row)
        wbCopy.save(self.EXCEL_PATH)
        print("書き込みました")

    def getdata(self, req, sheet, row):
        """Write every post found in ``req`` to ``sheet`` and return the next row.

        Args:
            req: Markup of one feed page.
            sheet: Writable worksheet; must provide ``write(row, col, value)``.
            row: Index of the first row to fill.

        Returns:
            The row index following the last row written.

        Each post occupies one row, columns 1-6: user, title, comment
        count, view count, publication date and summary. The recommendation
        count is not extracted. ``Tag.string`` is ``None`` for elements
        with more than one child, so such a value is passed on as ``None``.
        """
        # NOTE(review): the page is parsed with the "xml" parser although it
        # is fetched from an HTML site; kept as in the original -- confirm.
        soup = BeautifulSoup(req, "xml")
        for post in soup.find_all(class_="post_block"):
            # The first link in the title holds the bracketed user name,
            # the second one the post title.
            links = post.find(class_="entry_title").find_all("a")
            fields = [
                ("user:", self._strip_brackets(links[0].string)),
                ("title:", links[1].string),
                ("評論個数:", post.find(class_="post_comment").string),
                ("読込個数:", post.find(class_="post_view").string),
                ("発表日付:", post.find(class_="postdate").string),
                ("詳細取得:", post.find(class_="entry_summary").string),
            ]
            for col, (label, value) in enumerate(fields, start=1):
                print(label, value)
                sheet.write(row, col, value)
            row += 1
        return row

    def writeExcel(self, row, col, data):
        """Save a new workbook containing only ``data`` at ``(row, col)``.

        A fresh workbook with a single "Data" sheet is created on every
        call and saved to ``EXCEL_PATH``.
        """
        wbk = xlwt.Workbook()
        sheet = wbk.add_sheet("Data", cell_overwrite_ok=True)
        sheet.write(row, col, data)
        wbk.save(self.EXCEL_PATH)
        print("書き込みました")

    def mkdir(self):
        """Create the directory ``self.path`` if it does not exist yet.

        Returns:
            True when the directory was created, False when the path
            already existed.
        """
        path = self.path.strip()
        if os.path.exists(path):
            print(path, '文件夹已经存在了,不再创建')
            return False
        print('创建名字叫做', path, '的文件夹')
        os.makedirs(path)
        print('创建成功!')
        return True

    def getBlog(self):
        """Run the crawl via ``all_page(10)`` and print start, end and duration."""
        startTime = datetime.datetime.now()
        print("開始", startTime)
        self.all_page(10)
        endTime = datetime.datetime.now()
        print("実行時間:", (endTime - startTime).seconds)
        print("開始", startTime)
        print("終了", endTime)
# Module-level semaphore (at most 10 concurrent holders) that
# BlogHome.all_page acquires around each page fetch; it has to stay at
# module scope because the class reads it as a global name.
thread_lock = threading.BoundedSemaphore(value=10)

if __name__ == "__main__":
    # Start the crawl only when the file is executed as a script, so that
    # importing the module does not trigger network and file access.
    blogHome = BlogHome()
    blogHome.getBlog()
执行上面的代码

Excel上面的数据
