zoukankan      html  css  js  c++  java
  • 20 古诗文网站诗文爬取(正则方法)

     1 """古诗文网爬虫"""
     2 
     3 
     4 import re
     5 import requests
     6 
     7 def parse_page(url):
     8     headers = {
     9         'User-Agent': 'Mozilla/5.0',
    10     }
    11 
    12     response = requests.get(url, headers)
    13     # print(response.text)
    14     text = response.text
    15 
    16     # re解析
    17     titles = re.findall(r'<divsclass="cont">.*?<b>(.*?)</b>', text, re.DOTALL)     # .本不会匹配
    ,加上参数re.DOTALL即对任何字符都有效
    18     # print(titles)
    19     dynasties = re.findall(r'<p class="source">.*?<a.*?>(.*?)</a>', text, re.DOTALL)
    20     # print(dynasties)
    21     authors = re.findall(r'<p class="source">.*?<a.*?>.*?<a.*?>(.*?)</a>', text, re.DOTALL)
    22     # print(authors)
    23     content_tags = re.findall(r'<div class="contson" .*?>(.*?)</div>', text, re.DOTALL)
    24     # print(content_tags)
    25     contents = []
    26     for content in content_tags:
    27         x = re.sub(r'<.*>', "", content).strip()
    28         contents.append(x)
    29     poems = []
    30     for value in zip(titles, dynasties, authors, contents):
    31         title, dynasty, author, content = value
    32         poem = {
    33             'title': title,
    34             'dynasty': dynasty,
    35             'author': author,
    36             'content': content
    37         }
    38         poems.append(poem)
    39     
    40     # 输出诗文记录
    41     for poem in poems:
    42         print(poem)
    43 
    44 
    45 def main():
    46     url = "https://www.gushiwen.org/default_{}.aspx"
    47     for x in range(1, 11):
    48         newurl = url.format(x)
    49         parse_page(newurl)
    50 
# Start the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
  • 相关阅读:
    BZOJ 2212/BZOJ 3702
    BZOJ 4761 Cow Navigation
    BZOJ 3209 花神的数论题
    BZOJ 4760 Hoof, Paper, Scissors
    BZOJ 3620 似乎在梦中见过的样子
    BZOJ 3940 Censoring
    BZOJ 3942 Censoring
    BZOJ 3571 画框
    BZOJ 1937 最小生成树
    BZOJ 1058 报表统计
  • 原文地址:https://www.cnblogs.com/sruzzg/p/13128526.html
Copyright © 2011-2022 走看看