# NOTE: Douyu has since changed the page's HTML source, so this scraper no
# longer works as-is. For an updated image-scraping approach, see:
# https://www.cnblogs.com/summer1019/p/10388348.html
1 import requests
2 from bs4 import BeautifulSoup
3 from urllib import request
4 # import threading
5 import gevent
6 from gevent import monkey
7
8 monkey.patch_all()
9
10 def get_html_text(url):
11 try:
12 r = requests.get(url, timeout=10)
13 r.raise_for_status()
14 r.encoding = r.apparent_encoding
15 return r.text
16 except Exception as result:
17 print('错误类型:', result)
18
19
20 def html_text_parser(img_list, html):
21 soup = BeautifulSoup(html, 'html.parser')
22 for link in soup.find_all('img'):
23 Link = link.get('data-original') #link.get('key')--->>获取属性值
24 if Link:
25 img_list.append(Link)
26 return img_list
27
28
29 def get_douyu_img(Img_list):
30 for i,j in enumerate(Img_list):
31 # name = j.split('.')[-1]
32 r = request.urlopen(j)
33 ima_content = r.read()
34 path = str(i)
35 with open(path, 'wb') as f:
36 f.write(ima_content)
37
38 def main():
39 url = 'https://www.douyu.com/g_yz'
40 html = get_html_text(url)
41 img_list = list()
42 Img_list = html_text_parser(img_list, html)
43 # print(Img_list)
44 #t1 = threading.Thread(target=get_html_text, args=(url,))
45 #t2 = threading.Thread(target=html_text_parser, args=(img_list,html))
46 #t3 = threading.Thread(target=get_douyu_img, args=(Img_list,))
47 #t1.start()
48 #t2.start()
49 #t3.start()
50 gevent.joinall([
51 gevent.spawn(get_html_text, url),
52 gevent.spawn(html_text_parser, img_list, html),
53 gevent.spawn(get_douyu_img, Img_list)
54 ])
55
56
57 if __name__ == '__main__':
58 main()
# Remarks:
# One run can grab ~239 streamer images; different streamers appear at
# different times of day. Written purely for fun and self-study practice —
# shared as-is, corrections welcome.