zoukankan      html  css  js  c++  java
  • 学习笔记(爬虫):爬取笔趣阁小说

    # -*- coding: utf-8 -*-
    import requests
    from lxml import etree
    
    class BookSpider(object):
        """Scrape every chapter listed on the novel's index page into one text file.

        The index page is fetched, chapter links and titles are read from the
        ``<dl class="chapterlist">`` element, and each chapter's paragraphs are
        appended to ``剑来.txt`` in the current working directory.
        """

        # Seconds to wait for the server before giving up on a request, so a
        # stalled connection cannot hang the whole download.
        request_timeout = 10

        # Leftover markup/entity fragments removed from chapter text if present.
        _JUNK_FRAGMENTS = (
            '<p>',
            '</p>',
            '<script type="text/javascript" src="/tb.js"></script>',
            '<br />',
            '&#8212;',
        )

        def __init__(self):
            self.url = "http://www.jianlaixiaoshuo.com/"
            self.base_url = "http://www.jianlaixiaoshuo.com/"
            # The header name must be exactly "User-Agent"; any other spelling is
            # sent as an unrelated custom header and the default agent is used.
            self.headers = {
                "User-Agent": "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50"}

        def get_html(self, url):
            """Fetch ``url`` and return the response body decoded as UTF-8 text."""
            response = requests.get(
                url, headers=self.headers, timeout=self.request_timeout)
            return response.content.decode()

        def get_xpath(self, html, pattern):
            """Parse ``html`` and return the list of matches for XPath ``pattern``."""
            tree = etree.HTML(html)
            return tree.xpath(pattern)

        def save_data(self, data):
            """Append ``data`` to the output file ``剑来.txt`` (UTF-8)."""
            with open('剑来.txt', 'a', encoding='utf-8') as f:
                f.write(data)

        @staticmethod
        def _format_chapter(book_name, paragraphs):
            """Return the chapter title, a newline, the joined cleaned text, a newline."""
            book_data = ''.join(paragraphs)
            for fragment in BookSpider._JUNK_FRAGMENTS:
                book_data = book_data.replace(fragment, '')
            return book_name + '\n' + book_data + '\n'

        def down_load(self, url):
            """Download all chapters linked from the index page at ``url``.

            Falls back to ``self.url`` when ``url`` is empty or ``None``. Each
            chapter is printed and appended to the output file as it is fetched.
            """
            # Imported here so the method works even if the module header is
            # not updated alongside this class.
            from urllib.parse import urljoin

            html = self.get_html(url or self.url)
            pattern1 = '//dl[@class="chapterlist"]/dd/a/@href'
            pattern2 = '//dl[@class="chapterlist"]/dd/a/text()'
            # Link address of every chapter.
            book_lists = self.get_xpath(html, pattern1)
            # Title of every chapter.
            book_name_lists = self.get_xpath(html, pattern2)
            print(book_lists)
            for book_name, chapter_href in zip(book_name_lists, book_lists):
                # urljoin copes with relative, root-relative and absolute hrefs
                # without producing a doubled slash.
                book_url = urljoin(self.base_url, chapter_href)
                book_html = self.get_html(book_url)
                # Text of each paragraph inside the chapter body.
                pattern = '//div[@id="BookText"]/p/text()'
                paragraphs = self.get_xpath(book_html, pattern)
                book_text = self._format_chapter(book_name, paragraphs)
                print('正在下载', book_name)
                print(book_text)
                self.save_data(book_text)

        def run(self):
            """Run the spider against the configured index page."""
            self.down_load(self.url)
    
    if __name__ == "__main__":
        # Script entry point: build the spider and start the download.
        spider = BookSpider()
        spider.run()
  • 相关阅读:
    省选模板_简单数学
    省选模板大杂烩
    省选_简单算法
    省选_简单图论
    省选_简单数据结构
    BZOJ4545: DQS的trie 广义后缀自动机 + LCT
    BZOJ 4229: 选择 LCT + 独创方法 + 边双
    luoguP2742 【模板】二维凸包 / [USACO5.1]圈奶牛 二维凸包
    python面向过程编程小程序 -ATM(里面用了终端打印)
    从7点到9点写的小程序(用了模块导入,python终端颜色显示,用了点局部和全局可变和不可变作用域,模块全是自定义)
  • 原文地址:https://www.cnblogs.com/maxxu11/p/12631126.html
Copyright © 2011-2022 走看看