zoukankan      html  css  js  c++  java
  • 原生爬虫实例

     1 # coding=utf-8
     2 from  urllib import request
     3 import requests
     4 import re
     5 # 断点调试
     6 # class Spider():
     7 #     url='https://www.panda.tv/cate/lol'
     8 #     root_pattern='<div class="video-info">[sS]*?</div>'#?是贪婪,非贪婪,现在是非贪婪
     9 #     def __fetch_countent(self): #打开要解析的网页
    10 #         r=request.urlopen(Spider.url) #这里Spider.url是一个实例的意思
    11 #         htmls= r.read()
    12 #         htmls=str(htmls,encoding='utf-8')
    13 #         print(htmls)
    14 #         return htmls
    15 #         a=1
    16 #
    17 #     def __analysis(self,htmls): #具体分析
    18 #         root_html=re.findall (Spider.root_pattern,htmls)
    19 #         print(root_html)
    20 #         a=1
    21 #     def go(self):
    22 #         htmls=self.__fetch_countent()
    23 #         self.__analysis(htmls)
    24 #
    25 #
    26 # youtube=Spider()
    27 # youtube.go()
    28 
    class Spider():
        """Scrape a streaming category page and print streamers ranked by popularity.

        The pipeline run by go() is: fetch the page, cut it into one HTML
        fragment per streamer card, extract the name and viewer count from
        each fragment, sort by viewer count (descending) and print the ranking.
        """

        url = 'https://www.panda.tv/cate/lol'
        # Raw strings keep the regex escapes intact.  [\s\S] matches any
        # character including newlines, and *? makes the match non-greedy so
        # each pattern stops at the first closing tag.
        root_pattern = r'<div class="video-info">([\s\S]*?)</div>'
        name_pattern = r'</i>([\s\S]*?)</span>'
        number_patter = r'<span class="video-number">([\s\S]*?)</span>'

        def __fetch_content(self):
            """Download the page at ``url`` and return it as UTF-8 decoded text.

            Raises:
                requests.RequestException: on timeout, connection failure or a
                    non-success HTTP status.
            """
            # A timeout prevents the crawler from hanging forever on a dead host.
            response = requests.get(self.url, timeout=10)
            # Fail loudly instead of silently parsing an error page.
            response.raise_for_status()
            return response.content.decode("utf-8")

        def __analysis(self, htmls):
            """Split the page into streamer cards and pull the raw fields out.

            Args:
                htmls: the full page source as a string.

            Returns:
                A list with one dict per card; 'name' and 'number' each hold
                the list of raw regex matches found inside that card (possibly
                empty).
            """
            return [
                {
                    'name': re.findall(self.name_pattern, fragment),
                    'number': re.findall(self.number_patter, fragment),
                }
                for fragment in re.findall(self.root_pattern, htmls)
            ]

        def __refine(self, list_renqi):
            """Reduce each raw entry to a single stripped name and number string.

            Entries where either field had no match are skipped instead of
            raising IndexError.

            Returns:
                A list of {'name': str, 'number': str} dicts.
            """
            refined = []
            for entry in list_renqi:
                names = entry['name']
                numbers = entry['number']
                if not names or not numbers:
                    continue
                refined.append({
                    'name': names[0].strip(),
                    'number': numbers[0].strip(),
                })
            return refined

        def __sort(self, list_renqi):
            """Return a new list ordered by viewer count, largest first."""
            return sorted(list_renqi, key=self.__sort_seed, reverse=True)

        def __sort_seed(self, dic_renqi):
            """Convert an entry's 'number' text into a float usable as a sort key.

            The first (optionally decimal) number in the text is used; if the
            text contains the unit '万' (ten thousand) the value is multiplied
            by 10000.  Text without any digits yields 0.0.
            """
            text = dic_renqi['number']
            found = re.search(r'\d+(?:\.\d+)?', text)
            if found is None:
                return 0.0
            number = float(found.group())
            if '万' in text:
                number *= 10000
            return number

        def __show(self, list_renqi):
            """Print one 'rank N:name     number' line per entry, in list order."""
            for rank, entry in enumerate(list_renqi, start=1):
                print('rank ' + str(rank)
                      + ':' + entry['name']
                      + '     ' + entry['number'])

        def go(self):
            """Run the whole pipeline and print the ranking.

            Returns:
                The refined entries sorted by popularity (descending).
            """
            htmls = self.__fetch_content()
            list_renqi = self.__analysis(htmls)
            list_renqi = self.__refine(list_renqi)
            list_renqi = self.__sort(list_renqi)
            # __show only prints; keep the sorted list instead of rebinding it
            # to the None that __show returns.
            self.__show(list_renqi)
            return list_renqi
    88 
    # Script entry point: build the crawler and run its pipeline once.
    spider = Spider()
    spider.go()

      

  • 相关阅读:
    Android的数据存储
    Servlet第一天
    JavaScript高级程序设计读书笔记(3)
    Interesting Papers on Face Recognition
    Researchers Study Ear Biometrics
    IIS 发生意外错误 0x8ffe2740
    Father of fractal geometry, Benoit Mandelbrot has passed away
    Computer vision scientist David Mumford wins National Medal of Science
    Pattern Recognition Review Papers
    盒模型bug的解决方法
  • 原文地址:https://www.cnblogs.com/PYlog/p/9048199.html
Copyright © 2011-2022 走看看