zoukankan      html  css  js  c++  java
  • python 爬虫入门

    import requests
    import re
    
    # TODO: download the index-page URL of every novel.

    # TODO: wrap the steps below in an outer loop over those novels.

    # Seconds to wait on any single HTTP request. requests applies no timeout
    # by default, so without this a stalled server would hang the crawl.
    REQUEST_TIMEOUT = 10

    # 1. Download the novel's chapter-index page.
    novel_url = 'http://www.jingcaiyuedu.com/book/15205/list.html'
    response = requests.get(novel_url, timeout=REQUEST_TIMEOUT)
    # Set the encoding explicitly rather than relying on requests' guess.
    response.encoding = 'utf-8'
    html = response.text

    # 2. Extract the novel title and the chapter links (non-greedy matching).
    title = re.findall(r'<meta name="keywords" content="《(.*?)》', html)[0]
    # The page contains two <dl id="list"> elements; the second one is used.
    # re.S lets '.' cross line breaks, since the list spans several lines.
    dl = re.findall(r'<dl id="list">.*?</dl>', html, re.S)[1]
    # Each entry is a (href, link text) tuple.
    chapter_info_list = re.findall(r'<a.*?href="(.*?)".*?>(.*?)</a>', dl)

    # Persist the novel to "<title>.txt". The with-block guarantees the file is
    # flushed and closed even if a request or a match fails part-way through.
    with open('%s.txt' % title, 'w', encoding='utf-8') as fb:
        # 3. Visit every chapter in turn and extract its text.
        for chapter_info in chapter_info_list:
            chapter_url = chapter_info[0]
            chapter_title = chapter_info[1]
            # Turn a site-relative URL into an absolute one. Test the scheme
            # prefix, not mere substring presence, so that a relative path
            # which happens to contain "http" is still resolved.
            if not chapter_url.startswith(('http://', 'https://')):
                chapter_url = 'http://www.jingcaiyuedu.com%s' % chapter_url
            # Download the chapter page.
            chapter_response = requests.get(chapter_url, timeout=REQUEST_TIMEOUT)
            chapter_response.encoding = "utf-8"
            chapter_html = chapter_response.text
            # The chapter body sits between two inline script markers. The
            # parentheses must be escaped: unescaped they are empty regex
            # groups, which never match the literal "a1();" text.
            content_matches = re.findall(
                r'<script>a1\(\);</script>(.*?)<script>a2\(\);</script>',
                chapter_html,
                re.S,
            )
            if not content_matches:
                # Skip a page without the expected markers instead of aborting
                # the whole crawl with an IndexError.
                print('no content found, skipped: %s' % chapter_url)
                continue
            chapter_content = content_matches[0]
            # Clean the text: drop spaces, line-break tags and HTML entities.
            chapter_content = chapter_content.replace(' ', '')
            chapter_content = chapter_content.replace('<br/>', '')
            chapter_content = chapter_content.replace('<br>', '')
            chapter_content = chapter_content.replace('&nbsp;', '')
            # Write the chapter: title line, then body line.
            fb.write(chapter_title)
            fb.write('\n')
            fb.write(chapter_content)
            fb.write('\n')
            # Progress indicator.
            print(chapter_url)
  • 相关阅读:
    Linux命令之用户与组管理
    ubantu命令安装banner
    Linux命令之必杀绝技Vi文本编辑的使用
    Linux命令初步了解
    C#中的一些技巧
    .net对文件的操作之对文件目录的操作
    Linux学习一些在Terminal可以用到的快捷键及Shell常用的通配符
    【初识】正则表达式
    .net对文件的操作之文件读写
    分享一次在Windows Server2012 R2中安装SQL Server2008
  • 原文地址:https://www.cnblogs.com/stono/p/8861710.html
Copyright © 2011-2022 走看看