zoukankan      html  css  js  c++  java
  • python 爬虫实例(三)

    问题描述

    爬取博客园首页【https://home.cnblogs.com/blog/page/1/】的数据,并将结果写入本地的 Excel 文件。

    环境:

    OS:Windows 10

    python:3.7

    代码

    import requests
    import os
    from bs4 import BeautifulSoup
    import xlwt
    import xlrd
    from xlutils.copy import copy
    import threading
    import datetime
    
    class BlogHome:
        """Scrape post listings from the cnblogs home feed into an Excel sheet.

        Each listing page is downloaded, every ``post_block`` element on it is
        turned into one spreadsheet row (author, title, comment count, view
        count, publish date, summary), and the rows are appended to an existing
        ``.xls`` workbook.
        """

        # Workbook that is read, extended and saved back in place.
        # NOTE(review): the published listing showed this path with no
        # separators at all (``C:UserspeiqiangDesktopaaa.xls``); the backslashes
        # were presumably stripped when the article was extracted -- confirm.
        EXCEL_PATH = r"C:\Users\peiqiang\Desktop\aaa.xls"
        # First (0-based) sheet row that receives scraped data.
        FIRST_DATA_ROW = 4
        # Seconds to wait for the server before giving up on a page request.
        REQUEST_TIMEOUT = 30

        def __init__(self):
            # URL template; ``{}`` is replaced by the page number.
            self.url = "https://home.cnblogs.com/blog/page/{}/"
            # Directory used by mkdir().
            # NOTE(review): this literal also appears to have lost its path
            # separators, but where they belong is ambiguous, so it is left
            # untouched -- confirm the intended directory.
            self.path = r"C:pythonProjectBlog"

        def _page_url(self, param):
            """Return the listing URL for page number *param*."""
            return self.url.format(param)

        def request(self, param):
            """Download listing page *param* and return the response body as text.

            The formatted page URL is requested; fetching ``self.url`` directly
            would send the raw ``{}`` template for every page.
            """
            r = requests.get(self._page_url(param), timeout=self.REQUEST_TIMEOUT)
            return r.text

        def all_page(self, maxpage):
            """Scrape pages ``1 .. maxpage - 1`` and append them to the workbook.

            The workbook at ``EXCEL_PATH`` must already exist: it is opened with
            xlrd, copied into a writable workbook, filled starting at
            ``FIRST_DATA_ROW`` on the first sheet and saved back to the same path.

            NOTE(review): ``range(1, maxpage)`` excludes ``maxpage`` itself; kept
            as in the original -- confirm whether the last page should be included.
            """
            wbk = xlrd.open_workbook(self.EXCEL_PATH, formatting_info=True)
            wb_copy = copy(wbk)
            sheet = wb_copy.get_sheet(0)
            row = self.FIRST_DATA_ROW
            for page in range(1, maxpage):
                # ``with`` releases the module-level semaphore even when the
                # download or the parsing raises.
                with thread_lock:
                    req = self.request(page)
                    row = self.getdata(req, sheet, row)

            wb_copy.save(self.EXCEL_PATH)
            print("書き込みました")

        @staticmethod
        def _text(tag):
            """Return the text of *tag*, or ``""`` when *tag* is ``None``.

            ``tag.string`` is ``None`` for tags with nested markup, so fall back
            to the concatenated text of all descendants in that case.
            """
            if tag is None:
                return ""
            text = tag.string
            if text is None:
                text = tag.get_text()
            return str(text)

        def getdata(self, req, sheet, row):
            """Write one sheet row per post found in the HTML text *req*.

            Columns 1-6 receive author, title, comment count, view count,
            publish date and summary. Elements missing from a post produce an
            empty cell instead of an exception.

            Returns the index of the next free row.
            """
            # The page is HTML, so use the HTML parser rather than the XML one.
            soup = BeautifulSoup(req, "html.parser")
            labels = ("user:", "title:", "評論個数:", "読込個数:", "発表日付:", "詳細取得:")
            for post in soup.find_all(class_="post_block"):
                # The first link in the title holds the author, the second the
                # post title.
                header = post.find(class_="entry_title")
                links = header.find_all("a") if header is not None else []
                user = self._text(links[0]) if links else ""
                post_title = self._text(links[1]) if len(links) > 1 else ""
                values = (
                    # The author is rendered as "[name]"; drop the brackets.
                    user.replace("[", "").replace("]", ""),
                    post_title,
                    self._text(post.find(class_="post_comment")),
                    self._text(post.find(class_="post_view")),
                    self._text(post.find(class_="postdate")),
                    self._text(post.find(class_="entry_summary")),
                )
                for col, (label, value) in enumerate(zip(labels, values), start=1):
                    print(label, value)
                    sheet.write(row, col, value)
                row += 1
            return row

        def writeExcel(self, row, col, data):
            """Create a fresh workbook holding *data* at (*row*, *col*) and save it.

            This replaces whatever file is at ``EXCEL_PATH`` with a workbook that
            has a single "Data" sheet containing only this one cell.
            """
            wbk = xlwt.Workbook()
            sheet = wbk.add_sheet("Data", cell_overwrite_ok=True)
            sheet.write(row, col, data)
            wbk.save(self.EXCEL_PATH)
            print("書き込みました")

        def mkdir(self):
            """Create the directory ``self.path`` if it does not exist yet.

            Returns ``True`` when the directory was created and ``False`` when
            the path already existed.
            """
            path = self.path.strip()
            if os.path.exists(path):
                print(path, '文件夹已经存在了,不再创建')
                return False
            print('创建名字叫做', path, '的文件夹')
            os.makedirs(path)
            print('创建成功!')
            return True

        def getBlog(self):
            """Run the scrape via ``all_page(10)`` and print timing information."""
            startTime = datetime.datetime.now()
            print("開始", startTime)
            self.all_page(10)
            endTime = datetime.datetime.now()
            # total_seconds() covers the whole interval; ``.seconds`` alone
            # silently drops whole days.
            print("実行時間:", int((endTime - startTime).total_seconds()))
            print("開始", startTime)
            print("終了", endTime)
    
    
    # Module-level semaphore that BlogHome.all_page acquires around each page fetch.
    thread_lock = threading.BoundedSemaphore(10)
    # Script entry: build the scraper and start the crawl right away.
    blogHome = BlogHome()
    blogHome.getBlog()
    

      执行上面的代码

    Excel上面的数据

  • 相关阅读:
    题解 DTOJ #1438. 矮人排队(lineup)
    题解 DTOJ #4423. 「THUSC2019」塔
    题解 DTOJ #4123.「2019冬令营提高组」全连
    题解 DTOJ #4016.辉夜的夜空明珠(moon)
    题解 DTOJ #2498.大步小步(babystep)
    题解 DTOJ #3326.组队(group)
    题解 DTOJ #1515.三塔合一
    题解 DTOJ #2305.Bazarek
    【code】Splay 模板
    寻找乱序数组中第K大的数
  • 原文地址:https://www.cnblogs.com/killclock048/p/10136560.html
Copyright © 2011-2022 走看看