zoukankan      html  css  js  c++  java
  • Python3 爬虫:爬取古诗文(数据来源:古诗文网)

     1 # -*- coding: utf-8 -*-
     2 #author:zxy
     3 #Date:2018-10-19
     4 
     5 
     6 import requests
     7 import re
     8 HEADERS={
     9     "User-Agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
    10                  "Chrome/69.0.3497.100 Safari/537.36"
    11 }
    12 
    13 
    14 def parse_url(url):
    15     response=requests.get(url,headers=HEADERS)
    16     text=response.text
    17     titles=re.findall(r'<divsclass="cont">.*?<b>(.*?)</b>',text,re.DOTALL) #r raw
    18     dynasties=re.findall(r'<psclass="source">.*?<a.*?>(.*?)</a>',text,re.DOTALL)
    19     authors=re.findall(r'<psclass="source">.*?<a.*?<a.*?>(.*?)</a>',text,re.DOTALL)
    20     content_tags=re.findall(r'<divsclass="contson".*?>(.*?)</div>',text,re.DOTALL)
    21     contents=[]
    22     for content_tag in content_tags:
    23         x=re.sub('<.*?>','',content_tag)
    24         xx=re.sub('', '',x)
    25         contents.append(xx.strip())
    26     poems=[]
    27     for value in zip(titles,dynasties,authors,contents):
    28         title,dynasty,author,content=value
    29         poem={
    30             "title":title,
    31             "dynasty":dynasty,
    32             "author":author,
    33             "content":content
    34         }
    35         poems.append(poem)
    36 
    37     with open('poems.txt','w',encoding="utf-8") as f:
    38         for poem in poems:
    39             for (key,value) in poem.items():
    40                 if(key=="title"):
    41                     f.write("{}
    ".format(value))
    42                 if (key == "dynasty"):
    43                     f.write("	{}
    ".format(value))
    44                 if(key=="author"):
    45                     str="	{}
    "
    46                     f.write(str.format(value))
    47                 if(key=="content"):
    48                     print(value)
    49                     f.write("{}
    
    
    ".format(value))
    50                     # print(x+"{}
    
    
    ".format(value))
    51 
    52 if __name__ == '__main__':
    53     url="https://www.gushiwen.org/default_1.aspx"
    54     parse_url(url)
  • 相关阅读:
    移动端UI
    jQuery 下拉框三级联动
    jQuery基础与学习资源
    jQuery
    TCP、UDP、HTTP、SOCKET之间的区别
    Socket 通信原理(Android客户端和服务器以TCP&&UDP方式互通)
    android 中 任务、进程和线程的区别
    android中不同手机分辨率适配问题
    经验分享:CSS浮动(float,clear)通俗讲解(真的很通俗)
    关于各种排列组合java算法
  • 原文地址:https://www.cnblogs.com/z-712/p/9815253.html
Copyright © 2011-2022 走看看