zoukankan      html  css  js  c++  java
  • 20 古诗文网站诗文爬取(正则方法)

     1 """古诗文网爬虫"""
     2 
     3 
     4 import re
     5 import requests
     6 
     7 def parse_page(url):
     8     headers = {
     9         'User-Agent': 'Mozilla/5.0',
    10     }
    11 
    12     response = requests.get(url, headers)
    13     # print(response.text)
    14     text = response.text
    15 
    16     # re解析
    17     titles = re.findall(r'<divsclass="cont">.*?<b>(.*?)</b>', text, re.DOTALL)     # .本不会匹配
    ,加上参数re.DOTALL即对任何字符都有效
    18     # print(titles)
    19     dynasties = re.findall(r'<p class="source">.*?<a.*?>(.*?)</a>', text, re.DOTALL)
    20     # print(dynasties)
    21     authors = re.findall(r'<p class="source">.*?<a.*?>.*?<a.*?>(.*?)</a>', text, re.DOTALL)
    22     # print(authors)
    23     content_tags = re.findall(r'<div class="contson" .*?>(.*?)</div>', text, re.DOTALL)
    24     # print(content_tags)
    25     contents = []
    26     for content in content_tags:
    27         x = re.sub(r'<.*>', "", content).strip()
    28         contents.append(x)
    29     poems = []
    30     for value in zip(titles, dynasties, authors, contents):
    31         title, dynasty, author, content = value
    32         poem = {
    33             'title': title,
    34             'dynasty': dynasty,
    35             'author': author,
    36             'content': content
    37         }
    38         poems.append(poem)
    39     
    40     # 输出诗文记录
    41     for poem in poems:
    42         print(poem)
    43 
    44 
    45 def main():
    46     url = "https://www.gushiwen.org/default_{}.aspx"
    47     for x in range(1, 11):
    48         newurl = url.format(x)
    49         parse_page(newurl)
    50 
    51 if __name__ == '__main__':
    52     main()
  • 相关阅读:
    mybatis-spring 集成
    maven 插件深入了解
    maven 常用插件3
    maven 插件2
    <转载> maven 详解 http://www.cnblogs.com/binyue/p/4729134.html
    linux base shell 基础语法
    浏览器运行原理
    <转载> js 闭包
    dubbo 学习资料
    HTTP
  • 原文地址:https://www.cnblogs.com/sruzzg/p/13128526.html
Copyright © 2011-2022 走看看