  • Python: scraping Douban books by tag into an Excel spreadsheet

    # -*- coding: utf-8 -*-
    import requests
    import xlwt
    from bs4 import BeautifulSoup
    from collections import OrderedDict
    
    
    class DouBanBookSpider(object):
    
        def __init__(self, book_type, quantity):
            self.book_type = book_type      # Douban tag to crawl, e.g. '中国历史'
            self.quantity = quantity        # rough number of books to fetch
            self.url_list = []
            self.book_dict = OrderedDict()
            self.count = 0
    
        # Build the list of paginated tag URLs
        def get_url(self):
            count = 0
            while count <= self.quantity:
                url = 'https://book.douban.com/tag/%s?start=%d&type=S' % (self.book_type, count)
                self.url_list.append(url)
                # each page lists 20 books, so the start parameter advances by 20
                count += 20
            return self.url_list
    
        # Crawl one listing page and collect the books on it
        def main_spider(self, url):
            rsp = requests.get(url)
            tag_bf = BeautifulSoup(rsp.text, 'lxml')
            content = tag_bf.find_all('li', class_='subject-item')
            if not content:
                return
            for item in content:
                self.count += 1
                book_name = item.h2.a.get_text(strip=True)
                # the 'pub' div holds author / publisher / publication date
                pub_info = item.find('div', class_='pub').get_text(strip=True)
                comment_info = item.find('div', class_='star clearfix')
                grade = comment_info.find('span', class_='rating_nums') if comment_info else None
                if grade:
                    grade = grade.string
                comment_count = comment_info.find('span', class_='pl') if comment_info else None
                if comment_count:
                    comment_count = comment_count.get_text(strip=True)
                self.book_dict[str(self.count)] = {
                    'No.': self.count,
                    'Title': book_name,
                    'Rating': grade,
                    'Ratings count': comment_count,
                    'Publication info': pub_info,
                }
    
        # Run the crawler over every page URL
        def do_spider(self):
            for url in self.get_url():
                self.main_spider(url)
    
        # Write the collected data to an Excel sheet
        def write_excel(self):
            wb = xlwt.Workbook(encoding='utf-8')    # utf-8 so non-ASCII titles survive
            ws = wb.add_sheet(self.book_type)
            style = xlwt.XFStyle()
            font = xlwt.Font()
            font.name = 'Times New Roman'
            font.bold = True
            style.font = font
            row0 = ['No.', 'Title', 'Rating', 'Ratings count', 'Publication info']
            for i in range(len(row0)):
                ws.write(0, i, row0[i], style)
            for k, v in self.book_dict.items():
                for j, value in enumerate(v.values()):
                    ws.write(int(k), j, value)
            # xlwt produces the legacy .xls format, so save with a matching extension
            wb.save('%s.xls' % self.book_type)
    
    
    if __name__ == "__main__":
        ds = DouBanBookSpider('中国历史', 2000)   # '中国历史' is the Douban tag for Chinese history
        ds.do_spider()
        ds.write_excel()
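
Douban tends to reject bare requests that carry no browser-like User-Agent, and the script above fires every page request back to back. Below is a minimal sketch of a politer fetch helper; the header string, the fetch name, and the one-second delay are illustrative assumptions, not part of the original script:

    import time
    
    import requests
    
    # Illustrative browser-like header; any reasonable User-Agent string works here.
    HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
    
    
    def fetch(url, delay=1.0):
        """Fetch one tag page with a User-Agent, a timeout, and a polite pause."""
        rsp = requests.get(url, headers=HEADERS, timeout=10)
        rsp.raise_for_status()   # fail loudly on 403/404 instead of parsing an error page
        time.sleep(delay)        # wait between pages to avoid hammering the site
        return rsp.text

Inside main_spider, the bare requests.get(url) call could then be swapped for this helper, with the returned text fed straight to BeautifulSoup.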
  • Original post: https://www.cnblogs.com/LouisZJ/p/8663166.html