zoukankan      html  css  js  c++  java
  • 一、使用 BeautifulSoup抓取网页信息

     

    一、解析网页信息

    from bs4 import BeautifulSoup

    # Local copy of the sample page used throughout this walkthrough.
    html_path = 'C:/Users/michael/Desktop/Plan-for-combating-master/week1/1_2/1_2code_of_video/web/new_index.html'
    with open(html_path, 'r') as web_data:
        # Parse the open file with the lxml backend, then dump the tree to
        # confirm the page was loaded.
        Soup = BeautifulSoup(web_data, 'lxml')
        print(Soup)

    二、获取要爬取元素的位置

    浏览器右键-》审查元素-》copy-》selector

      # Selectors copied from the browser inspector for the first list item
      # (kept as a bare string so they stay next to the code that uses them).
      """
        body > div.main-content > ul > li:nth-child(1) > div.article-info > h3 > a
        body > div.main-content > ul > li:nth-child(1) > div.article-info > p.meta-info > span:nth-child(2)
        body > div.main-content > ul > li:nth-child(1) > div.article-info > p.description
        body > div.main-content > ul > li:nth-child(1) > div.rate > span
        body > div.main-content > ul > li:nth-child(1) > img
        """
      # Try the copied image selector as-is.
      images = Soup.select('body > div.main-content > ul > li:nth-child(1) > img')
      print(images)

    修改成(旧版本 BeautifulSoup 的 select() 不支持 nth-child,只支持 nth-of-type):

      # nth-of-type(1) restricts the match to the first <li> of the list,
      # so a single <img> is returned.
      images = Soup.select('body > div.main-content > ul > li:nth-of-type(1) > img')
      print(images)

    这时候能获取到一个

        # Without a positional pseudo-class the selector matches the <img>
        # directly under every <li>, not just the first one.
        images = Soup.select('body > div.main-content > ul > li > img')
        print(images)

     获取到了所有图片

        # Same approach for the remaining fields: one selector per field,
        # each matched across every <li>.
        titles = Soup.select('body > div.main-content > ul > li > div.article-info > h3 > a')
        descs = Soup.select('body > div.main-content > ul > li > div.article-info > p.description')
        rates = Soup.select(' body > div.main-content > ul > li > div.rate > span')
        cates = Soup.select(' body > div.main-content > ul > li > div.article-info > p.meta-info > span')
        # Print the result lists one after another, separated by a dashed line.
        print(images, titles, descs, rates, cates, sep='\n-----------\n')

    获取到了其他信息

    三、获取标签中的文本信息(get_text())及属性(get())

    # get_text() yields the text content of each matched <a> tag.
    for anchor in titles:
        print(anchor.get_text())

    封装成字典:

    # Walk the five result lists in lockstep and bundle each row into a dict:
    # text for the textual fields, the src attribute for the image.
    for title, image, desc, rate, cate in zip(titles, images, descs, rates, cates):
        data = dict(
            title=title.get_text(),
            rate=rate.get_text(),
            desc=desc.get_text(),
            cate=cate.get_text(),
            image=image.get('src'),
        )
        print(data)

    因为cates有多个属性,需要上升到父节点

    # Select the parent <p class="meta-info"> instead of its <span> children,
    # so each article contributes one element whose strings can be collected
    # together.
    cates = Soup.select(' body > div.main-content > ul > li > div.article-info > p.meta-info')
    # Rows are accumulated here so the rating filter below has data to read.
    info = []
    for title, image, desc, rate, cate in zip(titles, images, descs, rates, cates):
        data = {
            'title': title.get_text(),
            'rate': rate.get_text(),
            'desc': desc.get_text(),
            # stripped_strings yields every whitespace-stripped string inside the tag.
            'cate': list(cate.stripped_strings),
            'image': image.get('src'),
        }
        print(data)
        info.append(data)

    # Find the articles rated above 3.
    for article in info:
        if float(article['rate']) > 3:
            print(article['title'], article['cate'])

    四、完整代码

    from bs4 import BeautifulSoup

    # One dict per article, filled by the loop below.
    info = []

    # NOTE: the file is opened with the platform's default text encoding;
    # pass encoding=... to open() if the page is saved in a different one.
    with open('C:/Users/michael/Desktop/Plan-for-combating-master/week1/1_2/1_2code_of_video/web/new_index.html', 'r') as web_data:
        Soup = BeautifulSoup(web_data, 'lxml')
        # print(Soup)

        # Selectors as copied from the browser inspector for the first <li>:
        #   body > div.main-content > ul > li:nth-child(1) > div.article-info > h3 > a
        #   body > div.main-content > ul > li:nth-child(1) > div.article-info > p.meta-info > span:nth-child(2)
        #   body > div.main-content > ul > li:nth-child(1) > div.article-info > p.description
        #   body > div.main-content > ul > li:nth-child(1) > div.rate > span
        #   body > div.main-content > ul > li:nth-child(1) > img
        # The positional pseudo-class is dropped below so every <li> is matched.
        images = Soup.select('body > div.main-content > ul > li > img')
        titles = Soup.select('body > div.main-content > ul > li > div.article-info > h3 > a')
        descs = Soup.select('body > div.main-content > ul > li > div.article-info > p.description')
        rates = Soup.select(' body > div.main-content > ul > li > div.rate > span')
        # The parent <p> is selected (not its <span> children) so that each
        # article yields exactly one element.
        cates = Soup.select(' body > div.main-content > ul > li > div.article-info > p.meta-info')
        # print(images, titles, descs, rates, cates, sep='\n-----------\n')

    # Walk the result lists in lockstep and build one dict per article.
    for title, image, desc, rate, cate in zip(titles, images, descs, rates, cates):
        data = {
            'title': title.get_text(),
            'rate': rate.get_text(),
            'desc': desc.get_text(),
            # stripped_strings yields every whitespace-stripped string inside the tag.
            'cate': list(cate.stripped_strings),
            'image': image.get('src'),
        }
        # Add the row to the list.
        info.append(data)

    # Find the articles rated above 3.
    for article in info:
        if float(article['rate']) > 3:
            print(article['title'], article['cate'])
  • 相关阅读:
    Css3 常见鼠标滑过效果集合
    HTML5 Media事件
    HTML 5 Audio/Video DOM buffered 属性
    Cocos2d-x 3.X 事件分发机制
    在 WPF 程序中使用 MVVM 模式
    Windows Phone 版 Cocos2d-x 程序的结构
    转载:Cocos2D-x 游戏接入 Windows 设备所需做的六件事
    使用 Cocos2d-x 3.1.1 创建 Windows Phone 8 游戏开发环境
    转载:Windows Phone 8.1 投影我的屏幕使用教程
    NHibernate 中使用 nvarchar(max) 类型
  • 原文地址:https://www.cnblogs.com/Michael2397/p/7747231.html
Copyright © 2011-2022 走看看