zoukankan      html  css  js  c++  java
  • python3爬虫-使用requests爬取起点小说

    import requests
    from lxml import etree
    from urllib import parse
    import os, time
    
    
    def get_page_html(url):
        '''Fetch *url* with the shared session.

        Returns the Response on HTTP 200, otherwise None (including on
        timeout / connection errors).
        '''
        # The request itself is what raises (timeout, DNS, connection reset),
        # so it must be inside the try — the original wrapped only the
        # status check, letting network errors propagate uncaught.
        try:
            response = session.get(url, headers=headers, timeout=timeout)
        except requests.RequestException:
            return None
        if response.status_code == 200:
            return response
        return None  # non-200: explicit, was implicit in the original
    
    
    def get_next_url(resoponse):
        '''Extract the absolute URL of the "next chapter" link.

        Returns None when the response is falsy or the link is absent
        (e.g. the last chapter of the book).
        '''
        if not resoponse:
            return None
        tree = etree.HTML(resoponse.text)
        hrefs = tree.xpath("//a[@id='j_chapterNext']/@href")
        if not hrefs:
            # No next-chapter anchor on the page — end of the book.
            return None
        # The href may be protocol-relative / relative; resolve against
        # the page we actually fetched.
        return parse.urljoin(resoponse.url, hrefs[0])
    
    
    def xs_content(resoponse):
        '''Return (chapter title, list of paragraph strings) from a chapter page.

        Returns None when the response is falsy or the title element is
        missing — consistent with get_next_url, instead of raising an
        unguarded IndexError on a malformed page.
        '''
        if not resoponse:
            return None
        selector = etree.HTML(resoponse.text)
        titles = selector.xpath("//h3[@class='j_chapterName']/text()")
        if not titles:
            return None  # page layout changed or anti-bot page served
        content_xpath = selector.xpath(
            "//div[contains(@class,'read-content') and contains(@class,'j_readContent')]//p/text()")
        return titles[0], content_xpath
    
    
    def write_to_txt(info_tuple: tuple):
        '''Write one chapter to "<BASE_PATH>/<title>.txt", one paragraph per line.

        *info_tuple* is (title, iterable of paragraph strings); a falsy
        value is a no-op. An already-existing file is left untouched
        (simple resume/dedup behaviour).
        '''
        if not info_tuple:
            return
        # Bug fix: the original tested os.path.exists(path) but wrote to
        # path + ".txt", so the dedup check could never match the file it
        # created. Build the final filename first, then test it.
        path = os.path.join(BASE_PATH, info_tuple[0]) + ".txt"
        if not os.path.exists(path):
            with open(path, "wt", encoding="utf-8") as f:
                for line in info_tuple[1]:
                    f.write(line + "\n")
            # no explicit flush needed: the with-block closes (and flushes) f
    
    
    def run(url):
        '''Crawl chapters starting at *url*, writing each to disk, until
        there is no next chapter or a fetch/parse fails.

        Rewritten as a loop: the original recursed once per chapter, which
        hits Python's recursion limit (~1000 frames) on any full-length novel.
        '''
        while url:
            html = get_page_html(url)
            next_url = get_next_url(html)
            info_tupe = xs_content(html)
            if not (next_url and info_tupe):
                break  # last chapter, fetch failure, or parse failure
            print("正在写入")
            write_to_txt(info_tupe)
            # Throttle between requests to reduce load on the server.
            time.sleep(sleep_time)
            print("正在爬取%s" % info_tupe[0])
            print("正在爬取%s" % next_url)
            url = next_url
    
    
    if __name__ == '__main__':
        # One shared HTTP session so all chapter requests reuse the
        # same connection pool and cookies.
        session = requests.Session()
        sleep_time = 5  # seconds to pause between chapter requests
        timeout = 5  # per-request timeout, in seconds
        BASE_PATH = r"D:图片LSZJ"  # directory the chapter files are written to
        # NOTE(review): this path looks garbled by the page scrape — probably
        # meant r"D:\图片\LSZJ" (backslashes lost); confirm before running.
        url = "https://read.qidian.com/chapter/8iw8dkb_ZTxrZK4x-CuJuw2/fWJwrOiObhn4p8iEw--PPw2"  # first-chapter URL of the novel to crawl (here: Battle Through the Heavens, chapter 1)
        headers = {
            "Referer": "read.qidian.com",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
        }
        print('开始运行爬虫')
        run(url)
  • 相关阅读:
    要学习TINY框架要有什么前提条件?
    如何获取最新的代码?
    python 反射的用法
    面试题1
    Process多进程的创建方法
    异常捕捉
    用type动态创建Form
    ModelForm的使用
    git 常见命令
    TVTK库的安装
  • 原文地址:https://www.cnblogs.com/zhuchunyu/p/10765939.html
Copyright © 2011-2022 走看看