  • Novel scraping with python + urllib + lxml

    from urllib import parse
    from urllib import request
    from lxml import etree
    import time
    
    class Novel:
        def __init__(self, *args):
            self.name = args[0]
            self.dict = args[1]
            self.txt = ''
            # Keys are stringified page indices; sort numerically so that
            # '10' does not come before '2'
            for key in sorted(self.dict, key=int):
                self.txt = self.txt + self.dict[key]

        def write(self):
            f = open(self.name + '.txt', 'w', encoding='utf-8')
            f.write(self.txt)
            f.close()
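
    # A minimal usage sketch (hypothetical values, not from the original):
    #     n = Novel('demo', {'0': 'Chapter 1 ...\n', '1': 'Chapter 2 ...\n'})
    #     n.write()   # writes demo.txt to the current directory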
    
    # Fetch a page's HTML source
    def get_http_page(url, **kw):
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"
        }
        req = request.Request(url, headers=headers)
        response = request.urlopen(req)
        page = response.read()
        # Chapter pages on this site are gbk-encoded; callers may override
        encoding = kw.get('encoding', 'gbk')
        page = page.decode(encoding)
        return page
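
    # Usage sketch: chapter pages decode with the gbk default, while the
    # search page below is fetched with an explicit override, e.g.
    #     page = get_http_page(url, encoding='utf-8')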
    
    # Get the book's table of contents (the list of index page URLs)
    def get_comics_directory(url):
        url_list = []
        # The search page is utf-8, unlike the gbk chapter pages
        page = get_http_page(url, encoding='utf-8')
        html = etree.HTML(page)
        # The first search result links to the book's detail page
        result = html.xpath('/html/body/div[2]/div/div[2]/h3/a')
        url2 = None
        element_select = None
        if len(result):
            url2 = result[0].get('href')
        if url2:
            page = get_http_page(url2)
            html = etree.HTML(page)
            # The index is paginated through a <select> whose <option>
            # values hold the URL of each index page
            element_select = html.xpath('/html/body/div[4]/div[9]/span[2]/select')
            if len(element_select):
                result_option = element_select[0].findall('option')
                for option in result_option:
                    url_list.append('https://m.wenxuemi6.com{}'.format(option.get('value')))
        return url_list
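
    # lxml sketch (self-contained, illustrative markup) showing the calls
    # used above: etree.HTML parses a string, xpath() returns a list.
    #     demo = etree.HTML('<select><option value="/a_1/">1</option></select>')
    #     opts = demo.xpath('//select')[0].findall('option')
    #     opts[0].get('value')   # -> '/a_1/'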
    
    def download_txt(url_list, **kw):
        # Default: crawl every index page; optional start/stop narrows the range
        count = 0
        count_max = len(url_list)
        if kw:
            start = int(kw['start'])
            stop = int(kw['stop'])
            if 0 <= start < stop <= len(url_list):
                count = start
                count_max = stop
        print('Crawling the table of contents and chapter URLs, please wait...')
        d = {}
        while count < count_max:
            url = url_list[count]
            page = get_http_page(url)
            html = etree.HTML(page)
            # Chapter links on the current index page
            result = html.xpath('/html/body/div[4]/ul[2]/li/a')
            txt = ''
            if type(result).__name__ == 'list':
                for l in result:
                    url = 'https://m.wenxuemi6.com{}'.format(l.get('href'))
                    print('Download chapters by URL:{}'.format(url))
                    page = get_http_page(url)
                    html = etree.HTML(page)
                    url_next = html.xpath('//*[@id="pb_next"]')  # unused
                    t = html.xpath('//*[@id="nr1"]/text()')
                    # A <p> inside #nr1 marks a chapter split over two pages
                    t2 = html.xpath('//*[@id="nr1"]/p')
                    txt_title = ''
                    txt_title_list = html.xpath('//*[@id="nr_title"]/text()')
                    if type(txt_title_list).__name__ == 'list':
                        if len(txt_title_list) == 1:
                            txt_title = txt_title_list[0]
                    txt = txt + txt_title + '\n'
                    for l2 in t:
                        txt = txt + l2 + '\n'
                    if type(t2).__name__ == 'list':
                        if len(t2) == 1:
                            # Second half of the chapter: xxx.html -> xxx_2.html
                            url = 'https://m.wenxuemi6.com{}'.format(l.get('href')[:-5] + '_2.html')
                            print('Download chapters by URL:{}'.format(url))
                            page = get_http_page(url)
                            html = etree.HTML(page)
                            t = html.xpath('//*[@id="nr1"]/text()')
                            for l2 in t:
                                txt = txt + l2 + '\n'
                    # Be polite between chapter requests
                    time.sleep(1)
            # Everything from this index page is stored under one key
            d['{}'.format(count)] = txt
            count += 1
        return d
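
    # download_txt returns {str(index page number): concatenated chapter text};
    # Novel then joins the values in numeric key order to build the book.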
    
    
    
    if __name__ == '__main__':
        txt_name = input("Enter the title of the book to search for: ")
        url = 'https://m.wenxuemi6.com/search.php?keyword={}'.format(parse.quote(txt_name))
        referer = url
        url_list = get_comics_directory(url)
        # Download only the chapters listed on the first index page
        d = download_txt(url_list, start=0, stop=1)
        n1 = Novel(txt_name, d)
        # Write [txt_name].txt to the current directory
        n1.write()
    
        # Download the whole book (no start/stop, so every index page is crawled)
        d2 = download_txt(url_list)
        n2 = Novel(txt_name, d2)
        # Write [txt_name].txt to the current directory, overwriting the partial file above
        n2.write()
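
    One possible hardening, sketched under the same imports (the helper name
    and retry counts are illustrative choices, not part of the original
    script): retry a fetch a few times so a transient network error does not
    abort a long crawl.

        def get_http_page_with_retry(url, retries=3, delay=2, **kw):
            # Try the fetch several times; the last failure propagates
            for attempt in range(retries):
                try:
                    return get_http_page(url, **kw)
                except Exception:
                    if attempt == retries - 1:
                        raise
                    time.sleep(delay)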