zoukankan      html  css  js  c++  java
  • python 爬虫小说

    思路:

    1.获取第一章内容

    2.判断请求方式

    3.对URL存在回车进行处理

    4.正则匹配

    5.写入文件中

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    # @Time : 2021/8/27 20:34
    # @Author : Lhtester
    # @Site : 
    # @File : book.py
    # @Software: PyCharm
    import requests
    import re
    import time
    import random
    import sys
    
    sys.setrecursionlimit(16000)#设置递归深度
    class Book_deaill():
        """Crawl a serialized novel from m.xyshuge.com one page at a time.

        Starting from a seed chapter URL, each page's title and paragraphs are
        appended to ../image/book2.txt, then the "next page / next chapter"
        link is followed recursively until neither link is present.
        """

        def __init__(self):
            # Seed chapter URL — the first page to crawl.
            self.url = 'https://m.xyshuge.com/k3nl5/19/19364/55418253.html'
            # self.url = 'https://m.xyshuge.com/k3nl5/19/19364/55728792_2.html'
            # Plain browser UA so the site does not reject the request outright.
            self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36"}

        def data_get(self, url=None):
            """Download one page, append its title and body text to the output
            file, then hand the HTML to start_analysis to find the next page.

            :param url: page to fetch; defaults to the seed URL on first call.
            """
            if url is None:  # fix: identity comparison, not ==
                url = self.url
            # timeout added so a stalled connection cannot hang the crawl forever
            result = requests.get(url=url, headers=self.headers, timeout=30).text
            # Chapter title lives in <div class="nr_title" id="nr_title">…</div>.
            title = re.findall(r'<div class="nr_title" id="nr_title">(.*?)</div>', result)
            print(title)
            with open('../image/book2.txt', 'a+', encoding='utf-8') as f:
                for t in title:
                    f.write(t)
                    f.write('\n')  # fix: literal newline restored as '\n' escape
            print('write title complete')  # fix: typo "titie"
            # Body paragraphs are <p class='c_detail'> elements.
            text = re.findall(r"<p class='c_detail'>(.*?)</p>", result)
            with open('../image/book2.txt', 'a+', encoding='utf-8') as f:
                for n in text:
                    n = n.replace("&nbsp;", "")
                    # Strip the site's self-promotion watermark from the text.
                    n = n.replace("阅书阁『wWw.xyshuge.Com』,全文免费阅读.", "")
                    f.write('\n')  # fix: literal newline restored as '\n' escape
                    f.write(n)
            # Random pause between pages to avoid anti-scraping detection.
            time.sleep(random.randint(1, 5))
            print('write text complete')

            self.start_analysis(result)

        def start_analysis(self, result):
            """Locate the next-page (or next-chapter) link in *result* and
            recurse into data_get; stop when neither link exists.

            :param result: raw HTML of the page just fetched.
            """
            new_url = 'https://m.xyshuge.com/k3nl5'
            # "↓一页" = next page of the current chapter.
            # fix: pattern restored to a single line — the href never contains a newline.
            get_next_page = re.findall(r'<a id="pb_next" href="/k3nl5(.*?)">↓一页</a>', result)
            if len(get_next_page) == 0:
                # No further page: look for the next chapter ("↓一章") instead.
                get_next_page = re.findall(r'<a id="pb_next" href="/k3nl5(.*?)">↓一章</a>', result)
                print('下一章:', get_next_page)
            if len(get_next_page) == 0:  # still nothing: the final chapter was reached
                print('爬虫结束')
            else:
                new_url = new_url + get_next_page[0]
                print(new_url)
                self.data_get(new_url)  # follow the assembled next-page URL

        def start_get_data(self):
            """Public entry point: kick off the crawl from the seed URL."""
            print('start get data ')
            self.data_get()
    
    
    
    if __name__ == '__main__':
        # Script entry point: build the crawler and start fetching chapters.
        crawler = Book_deaill()
        crawler.start_get_data()
  • 相关阅读:
    74.Interesting Sequence(有趣的数列)(拓扑排序)
    CODEVS 1746 贪吃的九头龙
    NYOJ 110 剑客决斗
    CODEVS 2451 互不侵犯
    洛谷 P1896 互不侵犯King
    洛谷 P1066 2^k进制数
    洛谷 P1656 炸铁路
    洛谷 P1830 轰炸Ⅲ
    CODEVS 1051 接龙游戏
    POJ 3461 Oulipo
  • 原文地址:https://www.cnblogs.com/anhao-world/p/15196630.html
Copyright © 2011-2022 走看看