zoukankan      html  css  js  c++  java
  • 爬虫15-正则表达式爬取中国诗词网

    import requests
    import re
    from  lxml import  etree
    # HTTP headers sent with every request made by parse_page. The User-Agent
    # string is that of a desktop Chrome browser (presumably so the site serves
    # the script the same page a browser would get — TODO confirm still needed).
    headers={
    "User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
    }
    def _extract_poems(text):
        """Extract poem records from the HTML text of one listing page.

        Args:
            text: HTML source of the page as a string.

        Returns:
            A list of dicts with the keys 'title', 'dynasty', 'author' and
            'content'. The four fields are matched independently and paired
            up by position, so the list is as long as the shortest of the
            four match lists.
        """
        # re.DOTALL lets '.' match newlines too, because the tags of interest
        # are spread over several lines of markup.
        # NOTE: the patterns assume the page markup uses <div class="cont">,
        # <p class="source"> and <div class="contson" ...>; verify against the
        # live site if nothing is extracted.
        titles = re.findall(r'<div\sclass="cont">.*?<b>(.*?)</b>', text, re.DOTALL)
        # First link inside the "source" paragraph is taken as the dynasty ...
        dynasties = re.findall(
            r'<p\sclass="source">.*?<a.*?>(.*?)</a>', text, re.DOTALL)
        # ... and the second link as the author. The quantifiers must stay
        # non-greedy, otherwise a single match would swallow several links.
        authors = re.findall(
            r'<p\sclass="source">.*?<a.*?>.*?<a.*?>(.*?)</a>', text, re.DOTALL)
        content_tags = re.findall(
            r'<div\sclass="contson".*?>(.*?)</div>', text, re.DOTALL)

        # Strip any inline tags (e.g. <br />) left inside the poem body.
        contents = [
            re.sub(r'<.*?>', "", fragment).strip() for fragment in content_tags
        ]

        return [
            {
                'title': title,
                'dynasty': dynasty,
                'author': author,
                'content': content,
            }
            for title, dynasty, author, content
            in zip(titles, dynasties, authors, contents)
        ]


    def parse_page(url):
        """Download one listing page and print every poem found on it.

        Args:
            url: Address of the page to fetch.

        Each poem is printed as a dict with the keys 'title', 'dynasty',
        'author' and 'content'. Nothing is returned.

        Raises:
            requests.exceptions.RequestException: if the request fails or
                does not complete within the timeout.
        """
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url, headers=headers, timeout=10)
        for poem in _extract_poems(response.text):
            print(poem)
    def main():
        """Run parse_page on listing pages 1 through 10, in order."""
        url_template = "https://www.gushiwen.org/default_%s.aspx"
        page_urls = [url_template % page_number for page_number in range(1, 11)]
        for page_url in page_urls:
            parse_page(page_url)
    
    # Start the scrape only when this file is run as a script, not on import.
    if __name__ == '__main__':
        main()
    

      

  • 相关阅读:
    Permutations II
    N-Queens II
    Palindrome Number
    Minimum Path Sum
    JS的DOM操作2
    JS 的DOM操作
    函数概念
    JavaScript数组
    JavaScript循环及练习
    JS语言
  • 原文地址:https://www.cnblogs.com/wcyMiracle/p/12490166.html
Copyright © 2011-2022 走看看