1.线程通信方式--共享变量
缺点:多个线程同时读写同一个共享变量时,必须加锁,才能保证得到我们想要的结果。
#!/usr/bin/env python
# -*- coding:utf-8 -*-

# Thread communication, approach 1: a shared variable (a plain list).
# For I/O-bound work, multithreading and multiprocessing perform about the same.
import time
import threading

# The shared list: the producer appends URLs to it, the consumers pop from it.
detail_url_list = []
# One module-wide lock. It makes the consumers' "is there a URL? then take it"
# step atomic; without it two threads can both see one remaining item and the
# slower one raises IndexError on pop().
_detail_url_lock = threading.Lock()


def get_detail_html(detail_url_list, max_items=None, delay=2, poll_interval=0.05):
    """Consume URLs from the shared list and simulate fetching each detail page.

    Args:
        detail_url_list: List shared with the producer; URLs are popped from
            its end.
        max_items: Number of URLs to handle before returning. ``None`` (the
            default) keeps the original behaviour of looping forever.
        delay: Seconds slept per URL to simulate the download.
        poll_interval: Seconds slept when the list is empty, so an idle
            consumer does not spin at full CPU.
    """
    handled = 0
    while max_items is None or handled < max_items:
        # Check and pop under the same lock so no other consumer can empty
        # the list between the two steps.
        with _detail_url_lock:
            has_url = bool(detail_url_list)
            if has_url:
                url = detail_url_list.pop()
        if not has_url:
            time.sleep(poll_interval)
            continue
        # A real crawler would download `url` here; the sleep stands in for it.
        print('get detail html started')
        time.sleep(delay)
        print('get detail html end')
        handled += 1


def get_detail_url(detail_url_list, rounds=None, delay=2):
    """Produce detail-page URLs into the shared list, 20 per round.

    Args:
        detail_url_list: List shared with the consumers; URLs are appended.
        rounds: Number of list pages to "crawl" before returning. ``None``
            (the default) keeps the original behaviour of looping forever.
        delay: Seconds slept per round to simulate fetching the list page.
    """
    completed = 0
    while rounds is None or completed < rounds:
        # Simulate crawling one article-list page.
        print('get detail url started')
        time.sleep(delay)
        new_urls = ['http://projectsedu.com/{id}'.format(id=i) for i in range(20)]
        with _detail_url_lock:
            detail_url_list.extend(new_urls)
        print('get detail url end')
        completed += 1


# 1. Thread communication via a shared variable.
if __name__ == '__main__':
    start_time = time.time()
    thread_detail_url = threading.Thread(target=get_detail_url, args=(detail_url_list,))
    thread_detail_url.start()
    for _ in range(10):
        thread_detail_html = threading.Thread(target=get_detail_html, args=(detail_url_list,))
        thread_detail_html.start()

    # No thread is joined, so this only reports how long starting them took.
    print('last time: {}'.format(time.time() - start_time))
ps:不要直接运行这段代码:两个函数里都是 while True 死循环,线程也没有设置为守护线程,程序不会自行退出。
2.通过queue的方式进行线程间的通信
#!/usr/bin/env python
# -*- coding:utf-8 -*-

# Thread communication, approach 2: a queue.
from queue import Queue
import time
import threading

# Not used by the queue-based functions below; kept so the module still
# defines the name.
detail_url_list = []


def get_detail_html(queue, max_items=None, delay=2):
    """Consume URLs from the queue and simulate fetching each detail page.

    Args:
        queue: Queue the producer puts URLs into.
        max_items: Number of URLs to handle before returning. ``None`` (the
            default) keeps the original behaviour of looping forever.
        delay: Seconds slept per URL to simulate the download.
    """
    handled = 0
    while max_items is None or handled < max_items:
        # Queue.get() blocks while the queue is empty, so no polling or
        # explicit locking is needed here.
        url = queue.get()
        # A real crawler would download `url` here; the sleep stands in for it.
        print('get detail html started')
        time.sleep(delay)
        print('get detail html end')
        handled += 1


def get_detail_url(queue, rounds=None, delay=2):
    """Produce detail-page URLs into the queue, 20 per round.

    Args:
        queue: Queue the consumers read URLs from.
        rounds: Number of list pages to "crawl" before returning. ``None``
            (the default) keeps the original behaviour of looping forever.
        delay: Seconds slept per round to simulate fetching the list page.
    """
    completed = 0
    while rounds is None or completed < rounds:
        # Simulate crawling one article-list page.
        print('get detail url started')
        time.sleep(delay)
        for i in range(20):
            # Queue.put() blocks while a bounded queue is full.
            queue.put('http://projectsedu.com/{id}'.format(id=i))
        print('get detail url end')
        completed += 1


# 2. Thread communication via a queue.
if __name__ == '__main__':
    # Give the queue an upper bound; an unbounded queue can grow until it
    # hurts memory usage.
    detail_url_queue = Queue(maxsize=1000)
    start_time = time.time()
    thread_detail_url = threading.Thread(target=get_detail_url, args=(detail_url_queue,))
    thread_detail_url.start()
    for _ in range(10):
        thread_detail_html = threading.Thread(target=get_detail_html, args=(detail_url_queue,))
        thread_detail_html.start()

    # No thread is joined, so this only reports how long starting them took.
    print('last time: {}'.format(time.time() - start_time))