爬取虎牙直播分类页面的主播的头像,名字,人气
今天学习了 Python 3 爬虫,上课闲着无聊,自己写了一个爬虫,
就顺着老师的思路,
爬取了虎牙直播分类页面中主播的头像、名字和人气。
HuYaCateScrapy.py
#!/usr/bin/python
# -*- coding: utf-8 -*-

"""
Scrape a Huya live-streaming category page.

@ author: happy_code
@ contact: happy_code@foxmail.com
@ software: crawler
@ desc: Fetch the streamer names and popularity figures from a Huya category
        page, download every avatar and name the file "<name>-<popularity>".
"""

import os
import re
import urllib.parse

import requests


class Spider:
    """Scraper for a single Huya category page.

    Typical use::

        spider = Spider("https://www.huya.com/g/lol")
        spider.saveinfo(spider.gethtmlinfo())
    """

    # Target category-page URL; overwritten per instance by __init__.
    url = ""

    # Directory the avatar images are written to.
    myrobot = "D:/scinfo/"

    # Pattern for one streamer entry.  Capture groups, in order:
    # avatar URL, streamer name, popularity.
    part = (
        r'<span class="txt">\s*'
        r'\s*<span class="avatar fl">\s*'
        r'\s*<img.*data-original="(.*?)".*>\s*'
        r'\s*<i.*>(.*?)</i>\s*'
        r'\s*</span>\s*'
        r'\s*<span.*><i.*></i><i class="js-num">(.*?)</i></span>\s*'
        r'\s*</span>'
    )

    # Characters that cannot appear in a Windows file name (the default
    # save directory is a Windows path) plus stray control whitespace.
    _UNSAFE_CHARS = re.compile(r'[\\/:*?"<>|\r\n\t]')

    def __init__(self, url):
        """Remember the category-page URL to scrape."""
        self.url = url

    def gethtml(self):
        """Download the category page and return its HTML decoded as UTF-8.

        Raises:
            requests.RequestException: if the request fails or times out.
        """
        # A timeout keeps a stalled connection from hanging the script.
        res = requests.get(self.url, timeout=30)
        res.encoding = "UTF-8"
        return res.text

    def gethtmlinfo(self):
        """Return a list of (avatar_url, name, popularity) string tuples.

        The list is empty when the page contains no entry matching ``part``.
        """
        return re.findall(self.part, self.gethtml())

    @staticmethod
    def _safe_filename(name):
        """Replace characters that are illegal in file names with ``_``."""
        return Spider._UNSAFE_CHARS.sub("_", name).strip() or "_"

    def downloadimg(self, url, name=None):
        """Download ``url`` into the ``myrobot`` directory.

        Args:
            url: Image URL.  Backslashes are removed before it is used.
            name: Optional file name without extension; the extension is
                taken from the URL.  When omitted, the last path component
                of the URL is used as the file name.

        Returns:
            True if the file is on disk afterwards (saved now or already
            present), False if it could not be saved.  Up to three download
            attempts are made.
        """
        # Clean the URL before deriving anything from it, so the file name
        # and extension never contain the stray backslashes.
        # NOTE(review): presumably the page embeds URLs with escaped
        # slashes ("\/") -- confirm against a live page.
        url = url.replace("\\", "")
        if url.startswith("//"):
            # Defensive: requests rejects scheme-less URLs.
            url = "https:" + url

        # Use only the path part so a query string cannot leak into the
        # file name or extension.
        url_path = urllib.parse.urlsplit(url).path
        if name is not None:
            filename = self._safe_filename(name) + os.path.splitext(url_path)[1]
        else:
            filename = os.path.basename(url_path)
        if not filename:
            print("异常", "no file name in URL: " + url)
            return False

        path = os.path.join(self.myrobot, filename)
        if os.path.exists(path):
            # Nothing to do; checking first avoids a pointless download
            # (and pointless retries).
            print('文件已经存在')
            return True

        for _ in range(3):
            try:
                r = requests.get(url, timeout=30)
                r.raise_for_status()
                os.makedirs(self.myrobot, exist_ok=True)
                with open(path, 'wb') as f:
                    f.write(r.content)
            except (requests.RequestException, OSError) as err:
                # Network or file-system failure: report it and retry.
                print("异常", err)
                continue
            print(path + ' 文件保存成功')
            return True
        return False

    def saveinfo(self, data):
        """Download every avatar in ``data``.

        Args:
            data: Iterable of (avatar_url, name, popularity) tuples as
                returned by ``gethtmlinfo``.  Each image is saved as
                "<name>-<popularity>.<ext>".
        """
        for avatar_url, name, popularity in data:
            self.downloadimg(avatar_url, name + "-" + str(popularity))


if __name__ == "__main__":
    # URL of the LoL category page.
    s = Spider("https://www.huya.com/g/lol")
    data = s.gethtmlinfo()
    s.saveinfo(data)
只需要在 main 中给出分类页面的 url 即可,
然后修改一下保存路径(Spider 类中的 myrobot)就好了。
结果如下: