zoukankan      html  css  js  c++  java
  • 20 古诗文网站诗文爬取(正则方法)

     1 """古诗文网爬虫"""
     2 
     3 
     4 import re
     5 import requests
     6 
     7 def parse_page(url):
     8     headers = {
     9         'User-Agent': 'Mozilla/5.0',
    10     }
    11 
    12     response = requests.get(url, headers)
    13     # print(response.text)
    14     text = response.text
    15 
    16     # re解析
    17     titles = re.findall(r'<divsclass="cont">.*?<b>(.*?)</b>', text, re.DOTALL)     # .本不会匹配
    ,加上参数re.DOTALL即对任何字符都有效
    18     # print(titles)
    19     dynasties = re.findall(r'<p class="source">.*?<a.*?>(.*?)</a>', text, re.DOTALL)
    20     # print(dynasties)
    21     authors = re.findall(r'<p class="source">.*?<a.*?>.*?<a.*?>(.*?)</a>', text, re.DOTALL)
    22     # print(authors)
    23     content_tags = re.findall(r'<div class="contson" .*?>(.*?)</div>', text, re.DOTALL)
    24     # print(content_tags)
    25     contents = []
    26     for content in content_tags:
    27         x = re.sub(r'<.*>', "", content).strip()
    28         contents.append(x)
    29     poems = []
    30     for value in zip(titles, dynasties, authors, contents):
    31         title, dynasty, author, content = value
    32         poem = {
    33             'title': title,
    34             'dynasty': dynasty,
    35             'author': author,
    36             'content': content
    37         }
    38         poems.append(poem)
    39     
    40     # 输出诗文记录
    41     for poem in poems:
    42         print(poem)
    43 
    44 
    45 def main():
    46     url = "https://www.gushiwen.org/default_{}.aspx"
    47     for x in range(1, 11):
    48         newurl = url.format(x)
    49         parse_page(newurl)
    50 
    51 if __name__ == '__main__':
    52     main()
  • 相关阅读:
    os
    linux常用命令
    css-基础知识
    awk命令详解
    文献综述
    微信JSAPI支付
    SNMP详解
    SNMP进阶
    SNMP协议入门
    SNMP简单网络管理协议
  • 原文地址:https://www.cnblogs.com/sruzzg/p/13128526.html
Copyright © 2011-2022 走看看