zoukankan      html  css  js  c++  java
  • 第7课-正则表达式实现爬虫实战

    1、古诗文网爬虫

    import requests,re
    
    
    # HTTP request headers passed to requests.get() in get_poetics; the
    # user-agent value is a desktop Chrome identification string
    # (presumably so the site serves its normal browser HTML -- confirm).
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"
    }
    
    # Module-level accumulator: get_poetics appends one dict per poem here,
    # and the __main__ block prints its contents after all pages are fetched.
    my_poetic_list = []
    
    
    def get_poetics(my_url):
        """Scrape one listing page and append its poems to ``my_poetic_list``.

        The page at ``my_url`` is downloaded and four parallel lists are pulled
        out of the HTML with regular expressions:

        * the first ``<b>`` text following each ``<div class="cont">``
          (stored as ``"title"``),
        * the first link text inside each ``<p class="source">``
          (stored as ``"year"``),
        * the second link text inside each ``<p class="source">``
          (stored as ``"pote"``),
        * the content of each ``<div class="contson" ...>``, with tags, line
          breaks and ideographic spaces removed (stored as ``"poetics"``).

        Args:
            my_url: URL of the page to fetch.

        Returns:
            None. One dict per poem is appended to the module-level
            ``my_poetic_list``.

        Raises:
            requests.exceptions.RequestException: if the download fails or
                exceeds the timeout.
        """
        # Without a timeout a stalled server would block the crawl forever.
        text = requests.get(url=my_url, headers=headers, timeout=10).text

        titles = re.findall(r'<div class="cont">.*?<b>(.*?)</b>', text, re.DOTALL)
        years = re.findall(r'<p class="source">.*?<a.*?>(.*?)</a>', text, re.DOTALL)
        poets = re.findall(
            r'<p class="source">.*?<a.*?>.*?</a>.*?<a.*?>(.*?)</a>', text, re.DOTALL
        )
        poetic = re.findall(r'<div class="contson".*?>(.*?)</div>', text, re.DOTALL)

        # Strip inline tags, then drop newlines and U+3000 (ideographic space).
        # The escapes must be real escape sequences: "\n" and "\u3000".
        poetic_list = [
            re.sub(r"<.*?>", '', body).replace("\n", '').replace("\u3000", '')
            for body in poetic
        ]

        # zip() stops at the shortest list, so a page where one pattern matched
        # fewer times yields fewer records instead of raising IndexError.
        for title, year, poet, content in zip(titles, years, poets, poetic_list):
            my_poetic_list.append({
                "title": title,
                "year": year,
                "pote": poet,
                "poetics": content,
            })
    
    if __name__ == '__main__':
        # Crawl listing pages 0 through 10, then dump every collected record.
        page_template = "https://www.gushiwen.org/default_{}.aspx"
        for page_no in range(11):
            get_poetics(page_template.format(page_no))
        for record in my_poetic_list:
            print(record)
    

    2、糗事百科案例

    import re,requests
    
    
    # HTTP request headers passed to requests.get() in get_acticles; the
    # User-Agent value is a desktop Chrome identification string
    # (presumably so the site serves its normal browser HTML -- confirm).
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"
    }
    # Module-level accumulator: get_acticles appends one dict per article here.
    my_lovehhy = []
    def get_acticles(url):
        """Scrape one page and append its articles to ``my_lovehhy``.

        The page at ``url`` is downloaded; link texts inside ``<h3>`` elements
        are taken as titles and the contents of ``<div id="endtext">`` as
        article bodies. Each body has its HTML tags and ideographic spaces
        (U+3000) removed before being stored.

        Args:
            url: URL of the page to fetch.

        Returns:
            None. One dict with keys ``"title"`` and ``"content"`` is appended
            to the module-level ``my_lovehhy`` for each title/article pair.

        Raises:
            requests.exceptions.RequestException: if the download fails or
                exceeds the timeout.
        """
        # Without a timeout a stalled server would block the crawl forever.
        text = requests.get(url=url, headers=headers, timeout=10).text

        titles = re.findall(r'<h3.*?><a.*?>(.*?)</a>', text, re.DOTALL)
        articles = re.findall(r'<div id="endtext">(.*?)</div>', text, re.DOTALL)

        # zip() pairs titles with bodies and stops at the shorter list, so a
        # page with fewer titles than bodies no longer raises IndexError.
        for title, article in zip(titles, articles):
            # "\u3000" must be a real escape; the bare text "u3000" would
            # never match the ideographic space it is meant to remove.
            content = re.sub(r"<.*?>", '', article).replace("\u3000", '')
            my_lovehhy.append({"title": title, "content": content})
    
    if __name__ == '__main__':
        # Fetch detail pages 0 through 9, then print every collected article.
        detail_template = "http://www.lovehhy.net/Joke/Detail/QSBK/{}"
        for page_no in range(10):
            get_acticles(detail_template.format(page_no))
        for joke in my_lovehhy:
            print(joke)
  • 相关阅读:
    初识spring boot
    javascript的console命令
    (转)三角函数计算,Cordic 算法入门
    (原+转)ROC曲线
    (转)(VS2013 )由于应用程序配置不正确,程序未能启动”--原因及解决方法
    (转)最小二乘法拟合圆公式推导及vc实现[r]
    (原)Eclipse中将JNI生成的so打包成jar的步骤
    (原)Eclipse的java中文件读写
    (原)Microsoft Source Reader的简单使用
    (转)android ndk 给结构体赋值的方法
  • 原文地址:https://www.cnblogs.com/win0211/p/12091295.html
Copyright © 2011-2022 走看看