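"""
Plain (framework-free) web crawler.

Before writing a crawler:
    1. Be clear about the goal.
    2. Find the web page that holds the data.
    3. Analyse the page structure to locate the tags that contain the data.

How it works:
    Simulate an HTTP request, send it to the server and receive the HTML
    the server returns, then use regular expressions to extract the data
    we want (name, popularity).

Reference:
    https://blog.csdn.net/qq_38151401/article/details/93018656

Approach:
    (1) Fetch the page content
    (2) Analyse the format of the data to extract
    (3) Extract the data
    (4) Convert the data into the required format
    (5) Present the data
"""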
# Example: a plain crawler that scrapes the "Honor of Kings" section of Huya
# and ranks the streamers by popularity.
# Further reading on crawler frameworks: BeautifulSoup, Scrapy.
# Crawling / anti-crawling / anti-anti-crawling: an IP is easily banned, so a
# pool of proxy IPs may be needed for heavier use.
import re
import ssl
from urllib import request


class Spider():
    """Scrape a Huya game page and print its streamers ranked by popularity.

    The page is fetched with ``urllib`` and parsed with the regular
    expressions stored in the class attributes below; ``go()`` is the only
    public entry point.
    """

    # Page to crawl.
    url = 'https://www.huya.com/g/wzry'
    # Captures the part of one list item that holds the streamer name and
    # the popularity figure.  ``[\s\S]`` is used instead of ``.`` so that the
    # match can span line breaks.
    root_pattern = r'<span class="txt">([\s\S]*?)</li>'
    # Captures the streamer name from the ``title`` attribute.
    name_pattern = r'<i class="nick" title="([\s\S]*?)">'
    # Captures the popularity text, e.g. ``1,404.5万``.
    number_pattern = r'<i class="js-num">([\s\S]*?)</i>'

    def __fetch_content(self):
        """Download ``Spider.url`` and return the body decoded as UTF-8."""
        # NOTE(review): certificate verification is switched off, as in the
        # original script, so the connection is open to man-in-the-middle
        # attacks.  The relaxed context is used for this request only
        # instead of being installed as the process-wide default.
        context = ssl.create_default_context()
        context.check_hostname = False  # must be cleared before CERT_NONE
        context.verify_mode = ssl.CERT_NONE
        with request.urlopen(Spider.url, context=context) as response:
            return response.read().decode('utf-8')

    def __analysis(self, htmls):
        """Extract the raw regex matches for every streamer in ``htmls``.

        Returns a list of dicts shaped like
        ``{'name': ['Dae'], 'number': ['473.4万']}``; each value is the list
        returned by ``re.findall`` and is empty when the pattern did not
        match.  An input without any list item yields an empty list.
        """
        return [
            {
                'name': re.findall(Spider.name_pattern, html),
                'number': re.findall(Spider.number_pattern, html),
            }
            for html in re.findall(Spider.root_pattern, htmls)
        ]

    def __refine(self, anchors):
        """Flatten the match lists produced by ``__analysis``.

        Returns a list of ``{'name': str, 'number': str}`` dicts with the
        name stripped of surrounding whitespace.  Entries for which either
        pattern found nothing are skipped instead of raising ``IndexError``.
        """
        return [
            {
                'name': anchor['name'][0].strip(),
                'number': anchor['number'][0],
            }
            for anchor in anchors
            if anchor['name'] and anchor['number']
        ]

    def __sort(self, anchors):
        """Return ``anchors`` ordered from most to least popular."""
        return sorted(anchors, key=self.__sort_seed, reverse=True)

    def __sort_seed(self, anchor):
        """Return the popularity of ``anchor`` as a float, for sorting.

        Thousands separators are removed and a ``万`` suffix multiplies the
        value by 10000, so ``'1,404.5万'`` becomes ``14045000.0``.  Text that
        still cannot be parsed counts as ``0.0`` so that one odd entry sorts
        last instead of aborting the whole run.
        """
        text = str(anchor['number']).strip().replace(',', '')
        multiplier = 1
        if '万' in text:
            text = text.replace('万', '')
            multiplier = 10000
        try:
            return float(text) * multiplier
        except ValueError:
            return 0.0

    def __show(self, anchors):
        """Print one ``rank N:name number`` line per anchor."""
        for rank, anchor in enumerate(anchors, start=1):
            print('rank ' + str(rank)
                  + ':' + anchor['name']
                  + ' ' + anchor['number'])

    def go(self):
        """Run the whole pipeline: fetch, parse, clean, sort and print."""
        htmls = self.__fetch_content()
        anchors = self.__analysis(htmls)
        anchors = list(self.__refine(anchors))
        anchors = self.__sort(anchors)
        self.__show(anchors)


if __name__ == '__main__':
    # Only crawl when executed as a script, not when the module is imported.
    spider = Spider()
    spider.go()

# Sample of one list item from the crawled page, kept for reference when
# adjusting the patterns above (the row of "=" marks where root_pattern
# starts capturing).
"""
<li class="game-live-item" gid="2336" data-lp="1259515661837">
    <a href="https://www.huya.com/688" class="video-info " target="_blank">
        <img class="pic" data-original="//live-cover.msstatic.com/huyalive/1259515661837-1259515661837-4682562792811659264-2519031447130-10057-A-0-1/20200723205252.jpg?x-oss-process=image/resize,limit_0,m_fill,w_338,h_190/sharpen,80/format,jpg/interlace,1/quality,q_90" src="//live-cover.msstatic.com/huyalive/1259515661837-1259515661837-4682562792811659264-2519031447130-10057-A-0-1/20200723205252.jpg?x-oss-process=image/resize,limit_0,m_fill,w_338,h_190/sharpen,80/format,jpg/interlace,1/quality,q_90" data-default-img="338x190" alt="张大仙的直播" title="张大仙的直播">

        <em class="tag tag-recommend">超级明星</em>

        <div class="item-mask"></div>
        <i class="btn-link__hover_i"></i>
        <p class="tag-right">

            <!-- 手机开播 -->

            <!-- VR直播 -->

            <!-- 无损音质 || 蓝光 -->
            <em class="tag-blue">蓝光8M</em>



        </p>
    </a>
    <a href="https://www.huya.com/688" class="title" title="大仙来啦" target="_blank">大仙来啦</a>
    <span class="txt">=============================================================================
        <span class="avatar fl">
            <img data-original="https://huyaimg.msstatic.com/avatar/1016/b9/b6824c9d5593f03f5b5c4f71189023_180_135.jpg" src="https://huyaimg.msstatic.com/avatar/1016/b9/b6824c9d5593f03f5b5c4f71189023_180_135.jpg" data-default-img="84x84" alt="张大仙" title="张大仙">
            <i class="nick" title="张大仙">张大仙</i>
        </span>
        <span class="num">
            <i class="num-icon"></i>
            <i class="js-num">1,404.5万</i></span>
    </span>
</li>

"""