zoukankan      html  css  js  c++  java
  • 爬虫 爬小说

    import requests as r
    import re,encodings
    import time
    from lxml import etree
    def pa(url, name):
        """Download one chapter page and append its title and text to a file.

        The chapter title and body are extracted from the page with XPath and
        appended to ``name`` as UTF-8 text, title first.

        Args:
            url: Absolute URL of the chapter page.
            name: Path of the output text file; it is opened in append mode.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            IndexError: If no chapter title is found on the page.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'
        }
        # A timeout keeps one stalled connection from hanging the whole crawl.
        z = r.get(url, headers=headers, timeout=30)
        z.raise_for_status()
        z.encoding = 'UTF-8'
        html = etree.HTML(z.text)
        # Locate the chapter title.
        titles = html.xpath('//*[@id="wrapper"]/div[3]/div/div[2]/h1/text()')
        if not titles:
            # Same exception type as indexing an empty list, but with context.
            raise IndexError("no chapter title found at %s" % url)
        zhangjie = titles[0]

        print(zhangjie)
        # Collect the text nodes of the chapter body, one per output line.
        content = '\n'.join(html.xpath('//*[@id="content"]/text()'))
        with open(name, 'a+', encoding="UTF-8") as txt:
            txt.write(zhangjie + "\n")
            # Trailing newline so the next chapter's title starts on its own
            # line instead of being glued to the end of this chapter's text.
            txt.write(content + "\n")
        print(zhangjie + ":\t写入成功")
    
    
    
    if __name__ == '__main__':
        # Script entry: read the book's index page, derive the output file
        # name from its <h1> title, then download the listed chapters one by
        # one into that file.
        mulu_url = 'http://www.yuetutu.com/cbook_22694/'
        # Send the browser User-Agent with the index request too; previously
        # it was a bare string expression that had no effect.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'
        }
        s = r.get(mulu_url, headers=headers, timeout=30)
        s.raise_for_status()
        s.encoding = 'utf-8'
        text = s.text
        html = etree.HTML(text)
        # The first <h1> of the index page becomes the output file name.
        title_match = re.search('<h1>(.*?)</h1>', text)
        if title_match is None:
            raise SystemExit("no <h1> title found at %s" % mulu_url)
        name = "./%s.txt" % title_match.group(1)
        mulu = html.xpath('//*[@id="list"]/dl/dd/a/@href')

        print(name)
        print(mulu)
        # The first 8 links are skipped -- presumably a "latest chapters"
        # block that repeats entries from the full list; TODO confirm
        # against the live page.
        for i in mulu[8:]:
            pa('http://www.yuetutu.com' + i, name)
            # Pause between requests so the crawl stays gentle on the server.
            time.sleep(1)
    

      

  • 相关阅读:
    js动态获取地址栏后的参数
    html页面保存数的两种方式
    微信开发之八 页面获取周围beacon设备
    【摄影】田子坊
    最好的时光在路上,最美的风景在远方
    【前端统计图】echarts实现简单柱状图
    js实现计时功能
    luogu 电车
    cogs luogu 砍树
    cogs 通往奥格瑞玛的道路 WD
  • 原文地址:https://www.cnblogs.com/TTTAO/p/13199766.html
Copyright © 2011-2022 走看看