zoukankan      html  css  js  c++  java
  • Python入门学习笔记11:原生爬虫

     1 """
     2 原生爬虫
     3 
     4  爬虫前奏:
     5  明确目的
     6  找到数据对应的网页
     7  分析网页的结构找到数据所在的标签位置
     8 
     9  模拟HTTP请求,向服务器发送这个请求,获取到服务器返回给我们的HTML
    10  用正则表达式提取我们要的数据(名字、人气<热度>)
    11 
    12  参考文档:
    13 https://blog.csdn.net/qq_38151401/article/details/93018656
    14 
    15 思路:
    16 
    17 (1)获取网页内容
    18 
    19 (2)分析所要获取的数据格式
    20 
    21 (3)获取相应的数据
    22 
    23 (4)将数据转化为所需要的格式
    24 
    25 (5)数据展现
    26 """
      1 #样例:原生爬虫爬取虎牙的王者荣耀板块,进行主播人气排序
      2 #拓展爬虫框架:BeautifulSoup,Scrapy
      3 # 爬虫、反爬虫、反反爬虫  ip容易被封,代理IP库
      4 import re
      5 from urllib import request
      6 import ssl
      7 #断点调试
      8 class Spider():
      9     #定义链接、截取字段
     10     url = 'https://www.huya.com/g/wzry'                 #爬虫获取的网站
     11     root_pattern = '<span class="txt">([sS]*?)</li>'  #爬虫获取的节点
     12     #root_pattern2 = '<li class="game-live-item"[sS]*?</li>'
     13     name_pattern = '<i class="nick" title="([sS]*?)">'#爬虫获取的名字(正则)
     14     number_pattern = '<i class="js-num">([sS]*?)</i>' #爬虫获取的人气值(正则)
     15 
     16     #获取网站的代码
     17     def __fetch_content(self):
     18         ssl._create_default_https_context = ssl._create_unverified_context#创建免验证的ssl
     19         r = request.urlopen(Spider.url)#获取地址
     20         htmls = r.read()               #读取代码
     21         htmls = str(htmls, encoding='utf-8')#变为可阅读的字符串格式
     22         return htmls
     23 #
     24     #分析代码(使用正则获取字段) 将取得的字段放入列表(数组)中
     25     def __analysis(self, htmls):
     26         root_html = re.findall(Spider.root_pattern,htmls)
     27         print(root_html[0])
     28         #root_html2 = re.findall(Spider.root_pattern2,htmls)
     29         anchors = []
     30         for html in root_html:
     31             name = re.findall(Spider.name_pattern,html)
     32             number = re.findall(Spider.number_pattern,html)
     33             anchor = {'name':name,'number':number} #   {'name': ['Dae-心态'], 'number': ['<i class="js-num">473.4万</i>']}
     34             anchors.append(anchor)
     35         #print(anchors[0])
     36         a = 1
     37         return anchors
     38 
     39     #处理所获取数组中多余的符号等
     40     def __refine(self,anchors):
     41         l = lambda anchor:{
     42             'name':anchor['name'][0].strip(),
     43             'number': anchor['number'][0]
     44             }
     45         return map(l,anchors)
     46 
     47     #排序
     48     def __sort(self,anchors):
     49         #filter
     50         anchors = sorted(anchors,key=self.__sort_seed,reverse=True)
     51         return anchors
     52     #排序的条件
     53     def __sort_seed(self,anchor):
     54 
     55         #r = re.findall('[1-9]d[^,]*.d*|0.d*[1-9]d*|[^,]',anchor['number'])
     56         #r = re.findall('[1-9][^,]d*.d*|0.d*[1-9][^,]d*', '1,816.1万')
     57         # print(anchor['number'],list(r),r[0])
     58         number = float(str(anchor['number']).replace('', ''))
     59 
     60         if ',' in anchor['number']:
     61             number = float(str('1,816.1万').replace(',','').replace('',''))
     62         elif '' in anchor['number']:
     63             number *= 10000
     64         return number
     65 
     66     #展示
     67     def __show(self,anchors):
     68         for rank in range(0,len(anchors)):
     69             #print(anchor['name']+'-----'+anchor['number'])
     70             print('rank  ' + str(rank + 1)
     71                   + ':' + anchors[rank]['name']
     72                   + '    ' + anchors[rank]['number'])
     73     #公共方法区调用私有方法
     74     def go(self):
     75         htmls = self.__fetch_content()          #获取网站的代码
     76         anchors = self.__analysis(htmls)        #分析代码(使用正则获取字段) 将取得的字段放入列表(数组)中
     77         anchors = list(self.__refine(anchors))  #处理所获取数组中多余的符号等
     78         anchors = self.__sort(anchors)          #排序
     79         self.__show(anchors)                    #展示
     80         #print(list(anchors))
     81 
     82 spider = Spider()
     83 spider.go()
     84 
     85 """
     86       <li class="game-live-item" gid="2336" data-lp="1259515661837">
     87         <a href="https://www.huya.com/688" class="video-info " target="_blank">
     88         <img class="pic" data-original="//live-cover.msstatic.com/huyalive/1259515661837-1259515661837-4682562792811659264-2519031447130-10057-A-0-1/20200723205252.jpg?x-oss-process=image/resize,limit_0,m_fill,w_338,h_190/sharpen,80/format,jpg/interlace,1/quality,q_90" src="//live-cover.msstatic.com/huyalive/1259515661837-1259515661837-4682562792811659264-2519031447130-10057-A-0-1/20200723205252.jpg?x-oss-process=image/resize,limit_0,m_fill,w_338,h_190/sharpen,80/format,jpg/interlace,1/quality,q_90" data-default-img="338x190" alt="张大仙的直播" title="张大仙的直播">
     89 
     90                                 <em class="tag tag-recommend">超级明星</em>
     91                             
     92         <div class="item-mask"></div>
     93         <i class="btn-link__hover_i"></i>
     94         <p class="tag-right">
     95 
     96             <!-- 手机开播 -->
     97                         
     98             <!-- VR直播 -->
     99             
    100                 <!-- 无损音质 || 蓝光 -->
    101                                     <em class="tag-blue">蓝光8M</em>
    102                                 
    103                         
    104 
    105                     </p>
    106     </a>
    107     <a href="https://www.huya.com/688" class="title" title="大仙来啦" target="_blank">大仙来啦</a>
    108     <span class="txt">=============================================================================
    109         <span class="avatar fl">
    110             <img data-original="https://huyaimg.msstatic.com/avatar/1016/b9/b6824c9d5593f03f5b5c4f71189023_180_135.jpg" src="https://huyaimg.msstatic.com/avatar/1016/b9/b6824c9d5593f03f5b5c4f71189023_180_135.jpg" data-default-img="84x84" alt="张大仙" title="张大仙">
    111             <i class="nick" title="张大仙">张大仙</i>
    112         </span>
    113                 <span class="num">
    114                 <i class="num-icon"></i>
    115                 <i class="js-num">1,404.5万</i></span>
    116     </span>
    117 </li>
    118 
    119 """
  • 相关阅读:
    Node入门
    Java try-catch、throw和throws的几点想法
    SpringMVC的四种HandlerMapping
    枚举
    MyBatis表和实体关联
    MyBatis xml和dao层接口组合使用
    Spring Task 定时器
    (转)Spring定时任务的几种实现
    Struts2+Spring发送邮件
    Struts2文件上传
  • 原文地址:https://www.cnblogs.com/liuxiaoming123/p/13375309.html
Copyright © 2011-2022 走看看