"""Crawler for the gushiwen.org classical Chinese poetry site.

Downloads the first ten listing pages, extracts each poem's title,
dynasty, author and text with regular expressions, and prints one
dictionary per poem.
"""


import re
import requests

# By default "." does not match a newline; re.DOTALL makes it match any
# character, so one pattern can span the multi-line HTML of a poem card.
_TITLE_RE = re.compile(r'<div\s+class="cont">.*?<b>(.*?)</b>', re.DOTALL)
_DYNASTY_RE = re.compile(r'<p class="source">.*?<a.*?>(.*?)</a>', re.DOTALL)
_AUTHOR_RE = re.compile(
    r'<p class="source">.*?<a.*?>.*?<a.*?>(.*?)</a>', re.DOTALL
)
_CONTENT_RE = re.compile(r'<div class="contson" .*?>(.*?)</div>', re.DOTALL)
# Matches one tag at a time. A greedy "<.*>" would swallow the verse text
# sitting between the first and the last tag on a line.
_TAG_RE = re.compile(r'<[^>]+>')


def extract_poems(text):
    """Extract poem records from the HTML of one listing page.

    Args:
        text: HTML source of a listing page.

    Returns:
        A list of dicts with the keys ``title``, ``dynasty``, ``author``
        and ``content``, in page order. Empty when nothing matches.
    """
    titles = _TITLE_RE.findall(text)
    dynasties = _DYNASTY_RE.findall(text)
    authors = _AUTHOR_RE.findall(text)
    contents = [
        _TAG_RE.sub("", fragment).strip()
        for fragment in _CONTENT_RE.findall(text)
    ]

    # zip() stops at the shortest list, so a page on which one of the
    # patterns misses an entry yields fewer (possibly misaligned) records.
    return [
        {
            'title': title,
            'dynasty': dynasty,
            'author': author,
            'content': content,
        }
        for title, dynasty, author, content
        in zip(titles, dynasties, authors, contents)
    ]


def parse_page(url):
    """Download one listing page, print its poems and return them.

    Args:
        url: Address of the listing page to fetch.

    Returns:
        The list of poem dicts produced by ``extract_poems``.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0',
    }

    # headers must be passed by keyword: the second positional argument of
    # requests.get() is ``params`` (the query string), not the headers.
    # The timeout keeps a stalled connection from hanging the crawl.
    response = requests.get(url, headers=headers, timeout=10)
    poems = extract_poems(response.text)

    # Print the poem records.
    for poem in poems:
        print(poem)
    return poems


def main():
    """Crawl listing pages 1 through 10."""
    url = "https://www.gushiwen.org/default_{}.aspx"
    for page in range(1, 11):
        parse_page(url.format(page))


if __name__ == '__main__':
    main()