zoukankan      html  css  js  c++  java
  • 8.04-book

    import requests
    from lxml import etree
    from bs4 import BeautifulSoup
    import json
    
    class BookSpider(object):
        """Scrape book listings from allitebooks.com and save them to a JSON file.

        Workflow: build listing-page URLs -> fetch each page -> parse the book
        entries (XPath or BeautifulSoup variant) -> dump everything to 04book.json.
        """

        def __init__(self):
            # Page-number placeholder is filled in by get_url_list().
            self.base_url = 'http://www.allitebooks.com/page/{}'
            # Desktop Chrome UA so the site serves the normal HTML layout.
            self.headers = {
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'}
            # Accumulates one dict per book across all parsed pages.
            self.data_list = []

        @staticmethod
        def _first(values):
            """Return the first item of *values*, or '' when the list is empty.

            Guards the XPath lookups so one page with a slightly different
            layout doesn't abort the whole crawl with an IndexError.
            """
            return values[0] if values else ''

        # 1. Build all listing-page URLs.
        def get_url_list(self):
            """Return the URLs for listing pages 1 through 9."""
            return [self.base_url.format(i) for i in range(1, 10)]

        # 2. Send the request.
        def send_request(self, url):
            """Fetch *url* and return the decoded response body as text."""
            data = requests.get(url, headers=self.headers).content.decode()
            print(url)
            return data

        # 3. Parse the page data with XPath.
        def parse_xpath_data(self, data):
            """Extract every book on the page via lxml XPath into self.data_list."""
            parse_data = etree.HTML(data)

            # Each <article> under the main content area is one book.
            book_list = parse_data.xpath('//div[@class="main-content-inner clearfix"]/article')

            for book in book_list:
                book_dict = {
                    # Book title.
                    'book_name': self._first(book.xpath('.//h2[@class="entry-title"]//text()')),
                    # Cover image URL.
                    'book_img_url': self._first(book.xpath('div[@class="entry-thumbnail hover-thumb"]/a/img/@src')),
                    # Author line.
                    'book_author': self._first(book.xpath('.//h5[@class="entry-author"]//text()')),
                    # Short description paragraph.
                    'book_info': self._first(book.xpath('.//div[@class="entry-summary"]/p/text()')),
                }
                self.data_list.append(book_dict)

        def parse_bs4_data(self, data):
            """Extract every book on the page via BeautifulSoup into self.data_list."""
            bs4_data = BeautifulSoup(data, 'lxml')
            # 1. Each <article> tag is one book.
            book_list = bs4_data.select('article')

            # 2. Pull the fields of each book; select_one returns None when a
            #    selector misses, so guard each access instead of crashing.
            for book in book_list:
                name_tag = book.select_one('.entry-title')
                # NOTE(review): assumes the cover <img> carries the class
                # "attachment-post-thumbnail" — verify against the live markup.
                img_tag = book.select_one('.attachment-post-thumbnail')
                author_tag = book.select_one('.entry-author')
                info_tag = book.select_one('.entry-summary p')
                book_dict = {
                    'book_name': name_tag.get_text() if name_tag else '',
                    'book_img_url': img_tag.get('src') if img_tag else '',
                    # Slice off the leading "By " prefix of the author line.
                    'book_author': author_tag.get_text()[3:] if author_tag else '',
                    'book_info': info_tag.get_text() if info_tag else '',
                }
                print(book_dict)
                self.data_list.append(book_dict)

        # 4. Save the data.
        def save_data(self):
            """Write all collected book records to 04book.json.

            Uses a context manager so the file handle is closed and flushed
            (the original leaked an open handle), and writes UTF-8 with
            ensure_ascii=False so non-ASCII text stays human-readable.
            """
            with open("04book.json", 'w', encoding='utf-8') as f:
                json.dump(self.data_list, f, ensure_ascii=False)

        # Orchestrate the whole crawl.
        def start(self):
            """Fetch every listing page, parse it, then persist the results."""
            url_list = self.get_url_list()

            # Request and parse each page in turn.
            for url in url_list:
                data = self.send_request(url)
                # self.parse_xpath_data(data)
                self.parse_bs4_data(data)

            self.save_data()
    
    
    # Guard the crawl behind the main-module check so importing this file
    # for its BookSpider class does not trigger network requests.
    if __name__ == "__main__":
        BookSpider().start()
  • 相关阅读:
    机器学习算法(SVM)公开课4月25日开讲
    手把手教你做文本挖掘
    ActiveReports公开课开启报名,学习如何解决中国式复杂报表难题
    DevExpress免费公开课,讲解即将发布的16.2新版功能
    Stimulsoft入门视频
    免费公开课,讲解强大的文档集成组件Aspose,现在可报名
    中国式商业智能报表ActiveReports免费公开课,10月20日开讲
    JavaScript图表FusionCharts免费在线公开课,由印度原厂技术工程师主讲,10月13日发车
    LoadRunner免费公开课,惠普金牌讲师亲授
    DevExpress VCL v16.1.3发布
  • 原文地址:https://www.cnblogs.com/hankleo/p/10626478.html
Copyright © 2011-2022 走看看