zoukankan      html  css  js  c++  java
  • 使用Python自带的库和正则表达式爬取熊猫直播主播观看人气

    • 主要是体现代码的规范性
      from urllib import request
      import re
      
      
      class Spider():
          """Scrape a panda.tv category page and print its streamers ranked by viewers.

          The pipeline run by :meth:`go` is: download the page, cut it into
          per-streamer cards with ``root_pattern``, pull the name and viewer
          count out of each card, sort by viewer count (descending) and print
          the ranking.
          """

          # Category page that is downloaded.
          url = 'https://www.panda.tv/cate/lol'
          # ``[\s\S]*?`` lazily matches any character including newlines, so a
          # capture may span several lines of markup.
          root_pattern = r'<div class="video-info">([\s\S]*?)</div>'
          name_pattern = r'</i>([\s\S]*?)</span>'
          number_pattern = r'<span class="video-number">([\s\S]*?)</span>'
          # First decimal number inside a viewer-count string such as "1.5万".
          count_pattern = r'\d+(?:\.\d+)?'

          def __fetch_content(self):
              """Download ``url`` and return the response body decoded as UTF-8."""
              # The context manager closes the connection even if read() raises.
              with request.urlopen(self.url) as response:
                  return str(response.read(), encoding='utf-8')

          def __analysis(self, htmls):
              """Collect the raw regex captures for every streamer card in ``htmls``.

              Returns a list of ``{"name": [...], "number": [...]}`` dicts, one
              per ``root_pattern`` match; each value is the list of captures
              found inside that card (possibly empty).
              """
              return [
                  {
                      "name": re.findall(self.name_pattern, card),
                      "number": re.findall(self.number_pattern, card),
                  }
                  for card in re.findall(self.root_pattern, htmls)
              ]

          def __refine(self, anchors):
              """Reduce the raw captures to ``{"name": str, "number": str}`` dicts.

              The first capture of each field is kept and stripped of
              surrounding whitespace. Cards in which either field was not
              found are skipped instead of raising ``IndexError``.
              """
              refined = []
              for anchor in anchors:
                  names, numbers = anchor['name'], anchor['number']
                  if not names or not numbers:
                      continue
                  refined.append({
                      "name": names[0].strip(),
                      "number": numbers[0].strip(),
                  })
              return refined

          def __sort(self, anchors):
              """Return ``anchors`` as a new list ordered by viewer count, highest first."""
              return sorted(anchors, key=self.__sort_seed, reverse=True)

          def __sort_seed(self, anchor):
              """Return the numeric viewer count of ``anchor`` for use as a sort key.

              The leading decimal number of ``anchor["number"]`` is parsed and
              multiplied by 10000 when the text contains '万'. Text without any
              digits yields ``0.0`` so that such entries sort last.
              """
              found = re.search(self.count_pattern, anchor["number"])
              if found is None:
                  return 0.0
              number = float(found.group())
              if '万' in anchor['number']:
                  number *= 10000
              return number

          def __show(self, anchors):
              """Print one line per entry: 1-based rank, name and viewer-count text."""
              for rank, anchor in enumerate(anchors, start=1):
                  print("排名:{}  主播:{}--------观看人数:{}".format(
                      rank, anchor['name'], anchor['number']))

          def go(self):
              """Run the whole pipeline and print the ranking and the entry count."""
              htmls = self.__fetch_content()
              anchors = self.__analysis(htmls)
              anchors = self.__refine(anchors)
              anchors = self.__sort(anchors)
              self.__show(anchors)
              print(len(anchors))
      
      
      # Script entry: build a Spider and run its pipeline once. go() downloads
      # the page, parses it, sorts the entries and prints the ranking.
      spider = Spider()
      spider.go()
      

        

  • 相关阅读:
    es学习-java操作 2.4.0版本
    es学习-基础增删改查
    mongodb 查询条件
    mongodb-查询
    mysql 性能优化
    mysql 存储过程学习(总)
    MySQL 存储过程 -流程控制的使用
    MySQL 存储过程 -光标的使用
    maven的聚合和继承
    maven的依赖传递和排除依赖
  • 原文地址:https://www.cnblogs.com/longbigbeard/p/10473411.html
Copyright © 2011-2022 走看看