zoukankan html css js c++ java

python 爬虫实例（三）

问题描述

爬取博客园首页的文章列表数据（URL：https://home.cnblogs.com/blog/page/1/ ），然后写入自己的 Excel 文件中

环境：

OS：Windows 10

python：3.7

代码

import requests
import os
from bs4 import BeautifulSoup
import xlwt
import xlrd
from xlutils.copy import copy
import threading
import datetime

class BlogHome:
    """Scrape the cnblogs home-page post listing into an existing .xls workbook.

    Each listing page is downloaded from ``self.url`` (a template with one
    ``{}`` placeholder for the page number), every ``post_block`` element is
    turned into one worksheet row, and the workbook is saved back to
    ``self.excel_path``.

    The class relies on a module-level ``thread_lock`` object (acquired in
    :meth:`all_page`) being defined before :meth:`all_page` is called.
    """

    # NOTE(review): both Windows path literals in this class contain no path
    # separators; the backslashes were presumably lost when the code was
    # published. They are kept verbatim -- confirm the intended locations or
    # pass ``excel_path`` explicitly.
    DEFAULT_EXCEL_PATH = r"C:UserspeiqiangDesktopaaa.xls"

    # Zero-based worksheet row that receives the first scraped post.
    FIRST_DATA_ROW = 4

    def __init__(self, excel_path=None):
        """Set up the URL template and the output locations.

        Args:
            excel_path: Workbook to read and overwrite. Defaults to
                ``DEFAULT_EXCEL_PATH`` when omitted.
        """
        self.url = "https://home.cnblogs.com/blog/page/{}/"
        self.path = r"C:pythonProjectBlog"
        if excel_path is None:
            excel_path = self.DEFAULT_EXCEL_PATH
        self.excel_path = excel_path

    def page_url(self, param):
        """Return the listing URL for page number ``param``."""
        return self.url.format(param)

    def request(self, param):
        """Download listing page ``param`` and return the response body as text.

        The formatted URL is requested; previously the unformatted template
        (still containing ``{}``) was sent for every page.
        """
        r = requests.get(self.page_url(param))
        return r.text

    def all_page(self, maxpage):
        """Scrape pages ``1 .. maxpage - 1`` and save them into the workbook.

        Args:
            maxpage: Exclusive upper bound of the page numbers to fetch
                (``range(1, maxpage)``).

        The workbook at ``self.excel_path`` is opened, copied into a writable
        workbook, filled starting at row ``FIRST_DATA_ROW`` of its first
        sheet, and written back to the same path.
        """
        wbk = xlrd.open_workbook(self.excel_path, formatting_info=True)
        wbCopy = copy(wbk)
        sheet = wbCopy.get_sheet(0)
        row = self.FIRST_DATA_ROW
        for page in range(1, maxpage):
            # ``with`` releases the semaphore even if the download or the
            # parsing raises, so a failure cannot leak a slot.
            with thread_lock:
                req = self.request(page)
                row = self.getdata(req, sheet, row)

        wbCopy.save(self.excel_path)
        print("書き込みました")

    @staticmethod
    def _text(tag):
        """Return ``tag.string``, or ``None`` when ``tag`` itself is ``None``."""
        return None if tag is None else tag.string

    def getdata(self, req, sheet, row):
        """Write one worksheet row per ``post_block`` found in ``req``.

        Args:
            req: Markup of one listing page.
            sheet: Writable worksheet exposing ``write(row, col, value)``.
            row: Zero-based row index for the first post on this page.

        Returns:
            The row index following the last row written.

        Columns start at index 1 and hold, in order: user, title, comment
        count, view count, publication date and summary. A field whose
        element is missing (or has no single string) is written as ``None``
        instead of raising.
        """
        # NOTE(review): the "xml" parser is used on what looks like an HTML
        # page; kept as in the original -- confirm it is intentional.
        soup = BeautifulSoup(req, "xml")
        for post in soup.find_all(class_="post_block"):
            # The first link carries the user name, the second the post title.
            entry_title = post.find(class_="entry_title")
            links = entry_title.find_all("a") if entry_title is not None else []
            user = self._text(links[0]) if len(links) > 0 else None
            if user is not None:
                # The user name is rendered as "[name]"; drop the brackets.
                user = user.replace("[", "").replace("]", "")
            title = self._text(links[1]) if len(links) > 1 else None

            # (console label, value) pairs in worksheet column order.
            fields = [
                ("user：", user),
                ("title：", title),
                # Number of comments
                ("評論個数:", self._text(post.find(class_="post_comment"))),
                # Number of views
                ("読込個数:", self._text(post.find(class_="post_view"))),
                # Publication date
                ("発表日付:", self._text(post.find(class_="postdate"))),
                # Summary text
                ("詳細取得:", self._text(post.find(class_="entry_summary"))),
            ]
            for col, (label, value) in enumerate(fields, start=1):
                print(label, value)
                sheet.write(row, col, value)
            row += 1
        return row

    def writeExcel(self, row, col, data):
        """Create a fresh workbook holding ``data`` in one cell and save it.

        A new workbook with a single sheet named "Data" is created, ``data``
        is written at (``row``, ``col``), and the result is saved to
        ``self.excel_path``.
        """
        wbk = xlwt.Workbook()
        sheet = wbk.add_sheet("Data", cell_overwrite_ok=True)
        sheet.write(row, col, data)
        wbk.save(self.excel_path)
        print("書き込みました")

    def mkdir(self):
        """Create the directory ``self.path`` if it does not exist yet.

        Returns:
            ``True`` when the directory was created, ``False`` when the path
            already existed.
        """
        path = self.path.strip()
        if os.path.exists(path):
            print(path, '文件夹已经存在了，不再创建')
            return False

        print('创建名字叫做', path, '的文件夹')
        os.makedirs(path)
        print('创建成功！')
        return True

    def getBlog(self):
        """Run :meth:`all_page` for ``maxpage=10`` and print timing information."""
        startTime = datetime.datetime.now()
        print("開始", startTime)
        self.all_page(10)
        endTime = datetime.datetime.now()
        print("実行時間：", (endTime - startTime).seconds)
        print("開始", startTime)
        print("終了", endTime)


# Module-level semaphore (initial value 10) that BlogHome.all_page acquires
# around each page fetch.
thread_lock = threading.BoundedSemaphore(10)

# Script entry: build the scraper and run it immediately.
blogHome = BlogHome()
blogHome.getBlog()

　　执行上面的代码

Excel上面的数据

查看全文

相关阅读:
JVM运行时数据区--堆
 ES检索服务搜索结果高亮
 SpringBoot 设置编码UTF-8
response.setContentType()的作用及参数
 将 vue.js 获取的 html 文本转化为纯文本
 SpringBoot读取properties文件配置项
 关于Java的编译执行与解释执行
 Java沙箱安全机制介绍【转载】
JVM运行时数据区--本地方法栈
 JVM--先说本地方法接口

原文地址：https://www.cnblogs.com/killclock048/p/10136560.html