zoukankan      html  css  js  c++  java
  • re,xpath,BeautifulSoup三种方法爬取古诗词网上诗歌

    对比 re、xpath、bs4 三种方法对同一个页面的解析速度,
    发现 re 比 xpath 快接近 10 倍,xpath 又比 bs4 快接近 10 倍。
    可见要想追求极致速度,使用正则表达式解析非常重要。

    1、re解析的代码

    # 使用正则表达式解析网页元素
    # 关键点:直接找每个个体里面相同位置的元素,用findall一次提取出来到列表中
    import requests
    import re
    # Module-level accumulator: reParser appends one dict per parsed poem here,
    # and the __main__ section sorts it by like count before printing.
    DATA = []
    def getHTMLtext(url, headers, timeout=10):
        """Download ``url`` and return the response body decoded as UTF-8.

        Args:
            url: Address of the page to fetch.
            headers: HTTP headers forwarded to ``requests.get``.
            timeout: Timeout in seconds forwarded to ``requests.get``.

        Returns:
            The response text, or ``''`` when the request fails or the
            server answers with an HTTP error status.
        """
        try:
            resp = requests.get(url, headers=headers, timeout=timeout)
            # raise_for_status has to be *called*; the bare attribute access
            # used previously was a no-op, so 4xx/5xx pages were returned as
            # if they were valid results.
            resp.raise_for_status()
        except requests.RequestException:
            # Callers treat the empty string as "fetch failed".
            return ''
        resp.encoding = 'utf-8'
        return resp.text
    def reParser(text):
        """Extract poems from a page's HTML using regular expressions.

        Each field (title, dynasty, author, body, like count) is collected
        into its own list with a single ``re.findall`` pass over the page;
        the lists are then zipped together positionally and one dict per
        poem is appended to the module-level ``DATA`` list.

        Args:
            text: HTML source of one listing page.

        Returns:
            None. Results are appended to ``DATA``.
        """
        name_list = re.findall(r'<div class="yizhu".*?<b>(.*?)</b>', text, re.S)  # re.S == re.DOTALL

        dynasty_list = re.findall(r'<p class="source">.*?target="_blank">(.*?)</a>', text, re.S)

        author_list = re.findall(r'<p class="source">.*?target="_blank">.*?</a>.*?target="_blank">(.*?)</a>', text, re.S)

        row_content_list = re.findall(r'<div class="contson".*?>(.*?)</div>', text, re.S)
        # Strip nested tags from each poem body. The pattern must stay
        # non-greedy, otherwise everything between the first '<' and the
        # last '>' would be removed in one go.
        content_list = [re.sub(r'<.*?>', '', raw).strip() for raw in row_content_list]

        # \d (digit), not the literal letter 'd': without the backslash this
        # pattern never matched a like counter, so zip() below produced
        # nothing and no poem was ever stored.
        # NOTE(review): the character after <span> is kept as a plain space;
        # confirm against the live page whether it should be '&nbsp;'.
        likes_list = re.findall(r'<span> (\d*?)</span>', text, re.S)

        for name, dynasty, author, content, likes in zip(
                name_list, dynasty_list, author_list, content_list, likes_list):
            DATA.append({
                '诗词名': name,
                '朝代': dynasty,
                '作者': author,
                '内容': content,
                '点赞数': likes,
            })
             
    def print_poetry(data):
        """Print each poem in ``data`` followed by a separator line.

        Args:
            data: Iterable of dicts with the keys '诗词名', '朝代', '作者',
                '内容' and '点赞数'.
        """
        for every_poetry in data:
            print(every_poetry['诗词名'])
            print(every_poetry['朝代'] + ':' + every_poetry['作者'])
            print(every_poetry['内容'])
            print('有{}人喜欢这首诗(词)哦'.format(every_poetry["点赞数"]))
            # The "\n" escapes had been turned into physical line breaks,
            # leaving unterminated string literals; restored here.
            print("\n" + '*' * 50 + "\n")
             
    if __name__ == '__main__':
        # Crawl the requested number of listing pages, collect every poem
        # into DATA, then print the ten poems with the most likes.
        row_url = 'https://www.gushiwen.org/default_{}.aspx'
        headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36'}
        num = input('请输入要爬取的页数(1-100):')
        # int() instead of eval(): raw user input must never be executed as
        # code. A non-numeric answer now raises ValueError.
        for i in range(int(num)):
            url = row_url.format(i + 1)  # page numbers start at 1
            text = getHTMLtext(url, headers)
            if text == '':
                print('url: {} 访问失败'.format(url))
            else:
                reParser(text)
        # Like counts are stored as strings, hence the int() in the sort key.
        DATA.sort(key=lambda x: int(x['点赞数']), reverse=True)
        TOP10 = DATA[:10]
        print_poetry(TOP10)

    2、Xpath版本
    from lxml import etree
    # Module-level accumulator: xpathParser appends one dict per parsed poem
    # here, and the __main__ section sorts it by like count before printing.
    DATA = []
    def getHTMLtext(url, headers, timeout=10):
        """Fetch ``url`` and return its body as UTF-8 text.

        Args:
            url: Address of the page to fetch.
            headers: HTTP headers forwarded to ``requests.get``.
            timeout: Timeout in seconds forwarded to ``requests.get``.

        Returns:
            The response text, or ``''`` if the request fails or the server
            replies with an HTTP error status.
        """
        try:
            resp = requests.get(url, headers=headers, timeout=timeout)
            # Must be a call: referencing the method without parentheses
            # does nothing, which let error pages through as valid HTML.
            resp.raise_for_status()
        except requests.RequestException:
            # The empty string is the failure signal the caller checks for.
            return ''
        resp.encoding = 'utf-8'
        return resp.text
    def xpathParser(text):
        """Extract poems from a page's HTML using lxml XPath queries.

        Title, dynasty, author, body and like count are each queried into a
        list, zipped together positionally, and one dict per poem is
        appended to the module-level ``DATA`` list.

        Args:
            text: HTML source of one listing page.

        Returns:
            None. Results are appended to ``DATA``.
        """
        htmlElement = etree.HTML(text)
        name_list = htmlElement.xpath('/html/body/div[2]/div[1]/div/div[1]/p[1]/a/b/text()')
        dynasty_list = htmlElement.xpath('/html/body/div[2]/div[1]/div/div[1]/p[2]/a[1]/text()')
        author_list = htmlElement.xpath('/html/body/div[2]/div[1]/div/div[1]/p[2]/a[2]/text()')

        # One element per poem body.
        poetries = htmlElement.xpath('//div[@class="contson" and contains(@id,"contson")]')
        content_list = []
        for poetry in poetries:
            # The leading '.' keeps the query relative to this element;
            # without it the search would start from the document root.
            row_content = ''.join(poetry.xpath('.//text()'))
            # The '\n' escape had been turned into a physical line break,
            # leaving an unterminated string literal; restored here.
            content_list.append(row_content.replace('\n', ''))

        row_likes_list = htmlElement.xpath('//a[contains(@id,"agood")]/span/text()')
        likes_list = [int(like.strip()) for like in row_likes_list]

        for name, dynasty, author, content, likes in zip(
                name_list, dynasty_list, author_list, content_list, likes_list):
            DATA.append({
                '诗词名': name,
                '朝代': dynasty,
                '作者': author,
                '内容': content,
                '点赞数': likes,
            })
     
    def print_poetry(data):
        """Write every poem in ``data`` to stdout, each followed by a separator.

        Args:
            data: Iterable of dicts with the keys '诗词名', '朝代', '作者',
                '内容' and '点赞数'.
        """
        for every_poetry in data:
            print(every_poetry['诗词名'])
            print(every_poetry['朝代'] + ':' + every_poetry['作者'])
            print(every_poetry['内容'])
            print('有{}人喜欢这首诗(词)哦'.format(every_poetry["点赞数"]))
            # Restored the "\n" escapes: they had become physical line
            # breaks inside the literals, which does not parse.
            print("\n" + '*' * 50 + "\n")
             
    if __name__ == '__main__':
        # Crawl the requested number of listing pages, collect every poem
        # into DATA, then print the ten poems with the most likes.
        row_url = 'https://www.gushiwen.org/default_{}.aspx'
        headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36'}
        num = input('请输入要爬取的页数(1-100):')
        # int() instead of eval(): raw user input must never be executed as
        # code. A non-numeric answer now raises ValueError.
        for i in range(int(num)):
            url = row_url.format(i + 1)  # page numbers start at 1
            text = getHTMLtext(url, headers)
            if text == '':
                print('url: {} 访问失败'.format(url))
            else:
                xpathParser(text)
        DATA.sort(key=lambda x: int(x['点赞数']), reverse=True)
        TOP10 = DATA[:10]
        print_poetry(TOP10)
    3、bs4版本
    # 使用bs4提取网页,先利用find_all解析
    import requests
    from bs4 import BeautifulSoup
    # Module-level list kept for parity with the other two scripts; nothing in
    # the bs4 code shown below reads or writes it.
    DATA = []
    def getHTMLtext(url, headers, timeout=10):
        """Request ``url`` and return the page source decoded as UTF-8.

        Args:
            url: Address of the page to fetch.
            headers: HTTP headers forwarded to ``requests.get``.
            timeout: Timeout in seconds forwarded to ``requests.get``.

        Returns:
            The response text, or ``''`` when the request fails or the
            server responds with an HTTP error status.
        """
        try:
            resp = requests.get(url, headers=headers, timeout=timeout)
            # raise_for_status is a method; without the parentheses the
            # status check never ran and error pages were treated as valid.
            resp.raise_for_status()
        except requests.RequestException:
            # The caller compares the result against '' to detect failure.
            return ''
        resp.encoding = 'utf-8'
        return resp.text
    def bs4_find_all_Parser(text):
        """Parse a listing page with BeautifulSoup and print each poem.

        For each of the first ten ``<div class="sons">`` elements, prints
        the title, the dynasty/author line, the poem body and the like
        count, followed by a separator line.

        Args:
            text: HTML source of one listing page.
        """
        soup = BeautifulSoup(text, 'lxml')
        # find_all also returns 'sons' divs that are not poems; per the
        # original author's note only the first ten entries are needed.
        sons = soup.find_all('div', class_="sons")[:10]
        for son in sons:
            name = son.find('b').string
            print(name)
            dynasty_author = son.find('p', class_="source").get_text()
            print(dynasty_author)
            content = son.find('div', class_="contson").get_text().strip()
            print(content)
            # The like counter is read from the second <span> of the entry.
            like = son.find_all('span')[1].string.strip()
            print('点赞数:' + like)
            # Restored the '\n' escapes: they had become physical line
            # breaks inside the literals, which does not parse.
            print('\n' + '*' * 30 + '\n')
             
      
    if __name__ == '__main__':
        # Fetch only the first listing page and print the poems found on it.
        headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36'}
        url = 'https://www.gushiwen.org/default_1.aspx'
        text = getHTMLtext(url, headers)
        if text != '':
            bs4_find_all_Parser(text)
        else:
            # getHTMLtext signals a failed download with an empty string.
            print('url: {} 访问失败'.format(url))
  • 相关阅读:
    英语生活箴言
    Javascript中最常用的55个经典技巧
    深刻理解Java编程的7个例子
    定制Apache索引样式
    【Androidin全球首发】国产Android Broncho A1 评测,第一印象
    系统程序员成长计划写得又快又好的秘诀(五)
    让adb logcat打印内核调试信息
    系统程序员成长计划写得又快又好的秘诀(三)
    Projects owned by limodev.cn
    Apache Direcotry Indexes目录列表显示样式定制
  • 原文地址:https://www.cnblogs.com/valorchang/p/11582565.html
Copyright © 2011-2022 走看看