在网络爬虫、数据库操作等IO密集型的应用场景下,想要提高Python的并发能力,可以使用多线程或者协程。下面对比Python单线程、多线程和协程在网络爬虫场景下的速度。
一,单线程。
单线程代码
#!/usr/bin/env python
# coding:utf8
# Author: hz_oracle
"""Single-threaded baseline: read picture URLs from MySQL and download them one by one."""

import MySQLdb
import gevent
import requests
import time


class DbHandler(object):
    """Holds MySQL connection settings and reads picture URLs from `picurltable`."""

    def __init__(self, host, port, user, pwd, dbname):
        self.host = host
        self.port = port
        self.user = user
        self.pwd = pwd
        self.db = dbname
        # Both are set by db_conn(); None means "not connected".
        self.conn = None
        self.cursor = None

    def db_conn(self):
        """Open the connection and a cursor. Return 1 on success, 0 on failure."""
        try:
            self.conn = MySQLdb.connect(host=self.host, port=self.port,
                                        user=self.user, passwd=self.pwd,
                                        db=self.db, charset="utf8")
            self.cursor = self.conn.cursor()
        except Exception as e:
            # Report why the connection failed instead of swallowing the error.
            print(u"数据库连接失败:%s" % e)
            return 0
        return 1

    def get_urls(self, limitation):
        """Return up to `limitation` picture URLs as a list, or [] if the query fails."""
        sql = "select pic from picurltable limit %s"
        try:
            # The driver substitutes the value; it is not formatted into the SQL text.
            self.cursor.execute(sql, (int(limitation),))
            urls_list = [row[0] for row in self.cursor.fetchall()]
            print(len(urls_list))
        except Exception as e:
            print(u"数据库查询失败:%s" % e)
            return []
        return urls_list

    def db_close(self):
        """Close the connection if one is open; safe to call when not connected."""
        if self.conn is not None:
            self.conn.close()
            self.conn = None
            self.cursor = None


def get_pic(url):
    """Download `url` and save it as ./picture/<name>.jpg.

    <name> is the second-to-last '/'-separated component of the URL.
    Returns "ok" after the file is written, or "" if the request raised.
    """
    try:
        pic_obj = requests.get(url).content
    except Exception:
        print(u"图片出错")
        return ""
    filename = url.split('/')[-2]
    file_path = "./picture/" + filename + '.jpg'
    # `with` closes the file even if the write raises.
    with open(file_path, 'wb') as fp:
        fp.write(pic_obj)
    return "ok"


def main():
    """Fetch 100 URLs, download them sequentially and print the elapsed seconds."""
    start_time = time.time()
    db_obj = DbHandler(host='127.0.0.1', port=3306, user='root', pwd='123456', dbname='pic')
    try:
        # Skip the query when the connection could not be opened.
        url_list = db_obj.get_urls(100) if db_obj.db_conn() else []
    finally:
        # The connection is only needed for the URL query; release it before downloading.
        db_obj.db_close()
    # Plain loop: the calls are made for their side effects, not for a result list.
    for url in url_list:
        get_pic(url)
    costtime = time.time() - start_time
    print(costtime)
    print("download END")


if __name__ == "__main__":
    main()
运行结果
100
45.1282339096
download END
单线程情况下,下载100张图片花了45秒。
再来看多线程的情况。
#!/usr/bin/env python
# coding:utf8
# Author: hz_oracle
"""Multi-threaded variant: one worker thread per queued picture URL."""

import MySQLdb
import gevent
import requests
import time
import threading
import Queue

# `lock1` and `urls_list` are not used by this script; kept so the module's names are unchanged.
lock1 = threading.RLock()
# Shared work queue: get_urls() fills it, each MyThread takes one URL from it.
url_queue = Queue.Queue()
urls_list = list()


class DbHandler(object):
    """Holds MySQL connection settings and queues picture URLs from `picurltable`."""

    def __init__(self, host, port, user, pwd, dbname):
        self.host = host
        self.port = port
        self.user = user
        self.pwd = pwd
        self.db = dbname
        # Both are set by db_conn(); None means "not connected".
        self.conn = None
        self.cursor = None

    def db_conn(self):
        """Open the connection and a cursor. Return 1 on success, 0 on failure."""
        try:
            self.conn = MySQLdb.connect(host=self.host, port=self.port,
                                        user=self.user, passwd=self.pwd,
                                        db=self.db, charset="utf8")
            self.cursor = self.conn.cursor()
        except Exception as e:
            # Report why the connection failed instead of swallowing the error.
            print(u"数据库连接失败:%s" % e)
            return 0
        return 1

    def get_urls(self, limitation):
        """Put up to `limitation` picture URLs on `url_queue`. Return 1 on success, 0 on failure."""
        sql = "select pic from picurltable limit %s"
        try:
            # The driver substitutes the value; it is not formatted into the SQL text.
            self.cursor.execute(sql, (int(limitation),))
            for row in self.cursor.fetchall():
                url_queue.put(row[0])
        except Exception as e:
            print(u"数据库查询失败:%s" % e)
            return 0
        return 1

    def db_close(self):
        """Close the connection if one is open; safe to call when not connected."""
        if self.conn is not None:
            self.conn.close()
            self.conn = None
            self.cursor = None


class MyThread(threading.Thread):
    """Worker thread that downloads a single URL taken from `url_queue`."""

    def __init__(self):
        super(MyThread, self).__init__()

    def run(self):
        """Take one URL from the queue and save it as ./picture/<name>.jpg."""
        try:
            # Non-blocking get: a worker with nothing to do exits instead of waiting forever.
            url = url_queue.get_nowait()
        except Queue.Empty:
            return
        try:
            pic_obj = requests.get(url).content
        except Exception:
            print(u"图片出错")
            return
        filename = url.split('/')[-2]
        file_path = "./picture/" + filename + '.jpg'
        # `with` closes the file even if the write raises.
        with open(file_path, 'wb') as fp:
            fp.write(pic_obj)


def main():
    """Queue 100 URLs, download them with one thread each and print the elapsed seconds."""
    start_time = time.time()
    db_obj = DbHandler(host='127.0.0.1', port=3306, user='root', pwd='123456', dbname='pic')
    try:
        # Skip the query when the connection could not be opened.
        if db_obj.db_conn():
            db_obj.get_urls(100)
    finally:
        # The connection is only needed for the URL query; release it before downloading.
        db_obj.db_close()
    # One thread per queued URL (100 when the table returns 100 rows).
    workers = [MyThread() for _ in range(url_queue.qsize())]
    for worker in workers:
        worker.start()
    # join() blocks until each worker ends, without spinning the CPU.
    for worker in workers:
        worker.join()
    costtime = time.time() - start_time
    print(costtime)
    print("download END")


if __name__ == "__main__":
    main()
运行结果
15.408192873
download END
启用100个线程发现只要花15秒即可完成任务,100个线程可能不是最优的方案,但较单线程有很明显的提升。接着再来看协程。
协程代码
#!/usr/bin/env python
# coding:utf8
# Author: hz_oracle
"""Coroutine variant: one gevent greenlet per picture URL."""

# Patch the standard library before anything else is imported, so modules
# loaded below see the cooperative versions of socket/ssl/threading.
from gevent import monkey
monkey.patch_all()

import MySQLdb
import requests
import time
import threading
import Queue
import gevent


class DbHandler(object):
    """Holds MySQL connection settings and reads picture URLs from `picurltable`."""

    def __init__(self, host, port, user, pwd, dbname):
        self.host = host
        self.port = port
        self.user = user
        self.pwd = pwd
        self.db = dbname
        # Both are set by db_conn(); None means "not connected".
        self.conn = None
        self.cursor = None

    def db_conn(self):
        """Open the connection and a cursor. Return 1 on success, 0 on failure."""
        try:
            self.conn = MySQLdb.connect(host=self.host, port=self.port,
                                        user=self.user, passwd=self.pwd,
                                        db=self.db, charset="utf8")
            self.cursor = self.conn.cursor()
        except Exception as e:
            # Report why the connection failed instead of swallowing the error.
            print(u"数据库连接失败:%s" % e)
            return 0
        return 1

    def get_urls(self, limitation):
        """Return up to `limitation` picture URLs as a list, or [] if the query fails."""
        sql = "select pic from picurltable limit %s"
        try:
            # The driver substitutes the value; it is not formatted into the SQL text.
            self.cursor.execute(sql, (int(limitation),))
            urls_list = [row[0] for row in self.cursor.fetchall()]
        except Exception as e:
            print(u"数据库查询失败:%s" % e)
            return []
        return urls_list

    def db_close(self):
        """Close the connection if one is open; safe to call when not connected."""
        if self.conn is not None:
            self.conn.close()
            self.conn = None
            self.cursor = None


def get_pic(url):
    """Download `url` and save it as ./picture/<name>.jpg.

    <name> is the second-to-last '/'-separated component of the URL.
    Returns "ok" after the file is written, or "" if the request raised.
    """
    try:
        pic_obj = requests.get(url).content
    except Exception:
        print(u"图片出错")
        return ""
    filename = url.split('/')[-2]
    file_path = "./picture/" + filename + '.jpg'
    # `with` closes the file even if the write raises.
    with open(file_path, 'wb') as fp:
        fp.write(pic_obj)
    return "ok"


def main():
    """Fetch 100 URLs, download them in greenlets and print the elapsed seconds."""
    start_time = time.time()
    db_obj = DbHandler(host='127.0.0.1', port=3306, user='root', pwd='123456', dbname='pic')
    try:
        # Skip the query when the connection could not be opened.
        url_list = db_obj.get_urls(100) if db_obj.db_conn() else []
    finally:
        # The connection is only needed for the URL query; release it before downloading.
        db_obj.db_close()
    # Spawn one greenlet per URL and wait for all of them to finish.
    jobs = [gevent.spawn(get_pic, url) for url in url_list]
    gevent.joinall(jobs)
    costtime = time.time() - start_time
    print(costtime)
    print("download END")


if __name__ == "__main__":
    main()
运行结果
10.6234440804
download END
使用协程只花了10秒多,是三种方法中最快的。
总结:
三种方法中,单线程最慢,多线程次之,而协程最快。 不过如果对多线程进行优化,也可能变快,这里不讨论。