zoukankan      html  css  js  c++  java
  • 小说爬取 python + urllib + lxml

    from urllib import parse
    from urllib import request
    from lxml import etree
    import time
    
    class Novel:
        """Plain-text novel assembled from a mapping of page index to page text.

        Positional arguments (kept as ``*args`` for backward compatibility):
            args[0]: book name; also used as the stem of the output file name.
            args[1]: dict mapping a page index (e.g. ``'0'``, ``'1'``) to its text.

        Attributes:
            name: the book name.
            dict: the mapping passed in.
            txt: all page texts joined in page order.
        """

        def __init__(self,*args):
            self.name = args[0]
            self.dict = args[1]
            # Join once instead of growing the string page by page.
            self.txt = ''.join(
                self.dict[key] for key in sorted(self.dict, key=self._order_key)
            )

        @staticmethod
        def _order_key(key):
            """Return a sort key that orders numeric indexes by value.

            Plain string sorting would place '10' before '2'; non-numeric keys
            are sorted after the numeric ones, alphabetically.
            """
            text = str(key)
            if text.isdecimal():
                return (0, int(text), '')
            return (1, 0, text)

        def write(self):
            """Write the assembled text to ``<name>.txt`` in the current directory."""
            # Explicit UTF-8 so the result does not depend on the platform locale
            # and cannot fail on characters the locale encoding lacks.
            with open(self.name+'.txt','w',encoding='utf-8') as f:
                f.write(self.txt)
    
    # Fetch the source of a web page
    def get_http_page(url,**kw):
        """Download ``url`` and return its body decoded to ``str``.

        Args:
            url: address of the page to fetch.
            **kw: optional ``encoding`` naming the charset used to decode the
                response body; defaults to ``'gbk'``. Other keywords are ignored.

        Returns:
            The decoded page text.
        """
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"
        }
        req = request.Request(url,headers=headers)
        # The context manager closes the connection even if read() fails.
        with request.urlopen(req) as response:
            page = response.read()
        # .get() avoids a KeyError when keywords other than 'encoding' are given.
        encoding = kw.get('encoding', 'gbk')
        return page.decode(encoding)
    
    # Get the novel's directory (table-of-contents) page URLs
    def get_comics_directory(url):
        """Return the directory page URLs of the first book on a search page.

        The search page at ``url`` is decoded as UTF-8. The first matching link
        is followed (decoded with ``get_http_page``'s default encoding), and the
        ``value`` of every ``<option>`` in that page's pager ``<select>`` is
        turned into an absolute URL.

        Args:
            url: search-result page address.

        Returns:
            A list of directory page URLs; empty when the search has no hit, the
            hit has no ``href``, or the pager ``<select>`` is not found.
        """
        url_list = []
        page = get_http_page(url,encoding='utf-8')
        html = etree.HTML(page)
        result = html.xpath('/html/body/div[2]/div/div[2]/h3/a')
        # Return early on an empty search; otherwise url2 would be unbound below.
        if not result:
            return url_list
        url2 = result[0].get('href')
        if not url2:
            return url_list
        page = get_http_page(url2)
        html = etree.HTML(page)
        elment_select = html.xpath('/html/body/div[4]/div[9]/span[2]/select')
        if elment_select:
            for option in elment_select[0].findall('option'):
                url_list.append('https://m.wenxuemi6.com{}'.format(option.get('value')))
        return url_list
    
    def downdload_txt(url_list,**kw):
        """Download the chapters linked from the given directory pages.

        Args:
            url_list: directory (table-of-contents) page URLs.
            **kw: optional ``start`` (inclusive) and ``stop`` (exclusive) indexes
                into ``url_list``. They default to the whole list and are
                clamped to its bounds, so an out-of-range or empty range yields
                an empty result instead of an error.

        Returns:
            dict mapping ``str(index)`` of each processed directory page to the
            concatenated text (title line plus body lines) of all its chapters.
            Pages on which no chapter link was found get no entry.
        """
        base_url = 'https://m.wenxuemi6.com'
        total = len(url_list)
        start = max(int(kw.get('start', 0)), 0)
        stop = min(int(kw.get('stop', total)), total)
        print('正在爬取目录和章节地址,请稍等……')
        d = {}
        # range() advances the index itself, so the loop always terminates.
        for count in range(start, stop):
            directory = etree.HTML(get_http_page(url_list[count]))
            # Collect pieces and join once instead of repeated concatenation.
            parts = []
            for link in directory.xpath('/html/body/div[4]/ul[2]/li/a'):
                href = link.get('href')
                if not href:
                    continue
                url = '{}{}'.format(base_url, href)
                print('Download chapters by URL:{}'.format(url))
                chapter = etree.HTML(get_http_page(url))
                titles = chapter.xpath('//*[@id="nr_title"]/text()')
                # The title is used only when it is unambiguous.
                parts.append(titles[0] if len(titles) == 1 else '')
                parts.append('\n')
                for line in chapter.xpath('//*[@id="nr1"]/text()'):
                    parts.append(line)
                    parts.append('\n')
                # Exactly one <p> inside #nr1 triggers fetching the "_2" page;
                # presumably it marks a chapter continued on a second page
                # -- TODO confirm against the site markup.
                if len(chapter.xpath('//*[@id="nr1"]/p')) == 1:
                    url = '{}{}'.format(base_url, href[:-5] + '_2.html')
                    print('Download chapters by URL:{}'.format(url))
                    continued = etree.HTML(get_http_page(url))
                    for line in continued.xpath('//*[@id="nr1"]/text()'):
                        parts.append(line)
                        parts.append('\n')
                # Pause between chapters to avoid hammering the server.
                time.sleep(1)
            if parts:
                d['{}'.format(count)] = ''.join(parts)
        return d
    
    
    
    if __name__ == '__main__':
        # Ask for a book title and build the site's search URL from it.
        txt_name = input("请输入要搜索的书名:")
        url = 'https://m.wenxuemi6.com/search.php?keyword={}'.format(parse.quote(txt_name))
        url_list = get_comics_directory(url)
        # Download the chapters listed on the first directory page only
        d = downdload_txt(url_list,start=0,stop=1)
        n1 = Novel(txt_name,d)
        # Write [txt_name].txt to the current directory
        n1.write()

        # Download the whole novel: without start/stop every directory page is
        # processed (the previous call only covered the first one).
        d2 = downdload_txt(url_list)
        n2 = Novel(txt_name,d2)
        # Write [txt_name].txt to the current directory (replaces the file above)
        n2.write()
  • 相关阅读:
    GRUB引导Win8,Win7,Ubuntu
    The vim syntax of systemd unit file
    Win8蓝屏(WHEA_UNCORRECTABLE_ERROR)
    C#生成Excel
    IE中使用IFrame或Frameset导致session丢失的问题
    Apache 配置详解 ( 最好的 APACHE 配置教程 )
    关于(enctype="multipart/formdata") post 提交时中文乱码解决方案(使用jspsmartupload时)
    Java获取当前时间
    windows中定时操作(SetTimer函数用法)
    _RecordsetPtr的 open函数
  • 原文地址:https://www.cnblogs.com/Dmail/p/11615049.html
Copyright © 2011-2022 走看看