zoukankan      html  css  js  c++  java
  • 使用Python自带的库和正则表达式爬取熊猫直播主播观看人气

    • 主要是体现代码的规范性
      from urllib import request
      import re
      
      
      class Spider:
          """Scrape a Panda TV category page and print streamers ranked by viewers.

          The pipeline is: download the page, cut out every ``video-info`` block,
          pull the streamer name and viewer-count text out of each block, sort by
          viewer count (descending) and print the ranking.
          """

          url = 'https://www.panda.tv/cate/lol'
          # ``[\s\S]*?`` lazily matches any run of characters, newlines included.
          # Raw strings keep the backslashes intact for the regex engine.
          root_pattern = r'<div class="video-info">([\s\S]*?)</div>'
          name_pattern = r'</i>([\s\S]*?)</span>'
          number_pattern = r'<span class="video-number">([\s\S]*?)</span>'

          def __fetch_content(self):
              """Download ``Spider.url`` and return the body decoded as UTF-8 text."""
              # The context manager closes the HTTP response even if read() fails.
              with request.urlopen(Spider.url) as response:
                  raw = response.read()
              return str(raw, encoding='utf-8')

          def __analysis(self, htmls):
              """Extract the raw name/number matches from every video-info block.

              Args:
                  htmls: The page source as text.

              Returns:
                  A list with one dict per ``root_pattern`` match; ``"name"`` and
                  ``"number"`` each hold the list of strings matched inside that
                  block by ``name_pattern`` and ``number_pattern``.
              """
              return [
                  {
                      "name": re.findall(Spider.name_pattern, block),
                      "number": re.findall(Spider.number_pattern, block),
                  }
                  for block in re.findall(Spider.root_pattern, htmls)
              ]

          def __refine(self, anchors):
              """Flatten the raw match lists into plain name/number strings.

              Args:
                  anchors: Dicts as produced by ``__analysis``.

              Returns:
                  A list of ``{"name": str, "number": str}`` dicts with surrounding
                  whitespace stripped. Entries with fewer than two ``name_pattern``
                  matches are skipped instead of raising ``IndexError``.
              """
              # NOTE(review): the viewer count is taken from the SECOND
              # ``name_pattern`` match, which presumably relies on the count span
              # also containing an ``</i>`` icon -- confirm against the live markup.
              # The "number" list collected by __analysis is not used here.
              return [
                  {
                      "name": anchor['name'][0].strip(),
                      'number': anchor['name'][1].strip(),
                  }
                  for anchor in anchors
                  if len(anchor['name']) >= 2
              ]

          def __sort(self, anchors):
              """Return a new list of anchors ordered by viewer count, highest first."""
              return sorted(anchors, key=self.__sort_seed, reverse=True)

          def __sort_seed(self, anchor):
              """Convert an anchor's viewer-count text into a float sort key.

              The first (optionally decimal) number found in ``anchor["number"]``
              is used; it is multiplied by 10000 when the text contains '万'.
              Text without any digits yields ``0.0``.
              """
              found = re.search(r'\d+(?:\.\d+)?', anchor['number'])
              if found is None:
                  return 0.0
              number = float(found.group())
              if '万' in anchor['number']:
                  number = number * 10000
              return number

          def __show(self, anchors):
              """Print one ranking line per anchor, numbered from 1."""
              for rank, anchor in enumerate(anchors, start=1):
                  print("排名:" + str(rank) + "  主播:" + anchor['name'] +
                        "--------" + "观看人数:" +
                        anchor['number'])

          def go(self):
              """Run the whole pipeline: fetch, parse, refine, sort and print."""
              htmls = self.__fetch_content()
              anchors = self.__analysis(htmls)
              anchors = list(self.__refine(anchors))
              anchors = self.__sort(anchors)
              self.__show(anchors)
              # Final line of output: how many streamers were ranked.
              print(len(anchors))
              print(len(anchors))
              # print(anchors)
      
      
      if __name__ == "__main__":
          # Only crawl when executed as a script, so that importing this module
          # does not trigger a network request as a side effect.
          spider = Spider()
          spider.go()
      

        

  • 相关阅读:
    Siteserver平台搭建
    Android快速入门
    Android Studio/IntelliJ IDEA使用手记
    Nook 2 Root
    Spring 小记
    DeepinXP Lite 6.2 精简版220M 安装IIS
    Rom Modified [Galaxy 3 Tested]
    Windows Thin PC体验 & 语言包更改(win 7 included)
    重装系统后恢复wubi安装的Ubuntu(未实测)
    20180822-Java接口
  • 原文地址:https://www.cnblogs.com/longbigbeard/p/10473411.html
Copyright © 2011-2022 走看看