单线程爬虫:
"""Single-threaded crawler.

Downloads the brand-filter page of three Amazon storefronts one after
another and prints every (refinementLink text, narrowValue text) pair found
in the returned HTML.
"""
import re
import time

import requests

# One browser-like User-Agent string shared by all three header dicts.
_USER_AGENT = (
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/50.0.2661.86 Safari/537.36'
)

url_EB = 'http://www.amazon.com/gp/search/other/ref=sr_sa_p_4?me=A22XNR713HGDVG&rh=n%3A9063592011%2Ck%3Aprojector&bbn=9063592011&keywords=projector&pickerToList=brandtextbin&ie=UTF8&qid=1461902521'
headers_EB = {'User-Agent': _USER_AGENT}

# The original triple-quoted literal carried stray whitespace in the middle of
# the query string (after "merchant-items"); the URL is now one clean literal.
url_AML = 'https://www.amazon.com/gp/search/other/ref=sr_sa_p_4?me=A3UJI9WWE6PRP5&rh=i%3Amerchant-items&pickerToList=brandtextbin&ie=UTF8&qid=1461899728'
headers_AML = {'User-Agent': _USER_AGENT}

url_DL = 'https://www.amazon.com/gp/search/other/ref=sr_sa_p_4?me=AS7ZU4MN0FPOY&rh=i%3Amerchant-items&pickerToList=brandtextbin&ie=UTF8&qid=1461901862'
headers_DL = {'User-Agent': _USER_AGENT}

# Display names of the three stores, used only in the progress banners.
name = {'a': 'ExclusiveBulbs',
        'b': 'Amazing Lamps',
        'c': 'Dynamic Lamps'}

# Captures the text of a "refinementLink" span and of the "narrowValue" span
# that immediately follows it. Compiled once instead of on every call.
_PAIR_RE = re.compile(
    r'<span class="refinementLink">(.*?)</span><span class="narrowValue">(.*?)</span'
)


def foo_one(url, headers, name, timeout=30):
    """Fetch one page and print every (refinementLink, narrowValue) pair.

    Args:
        url: Address of the page to download.
        headers: HTTP headers sent with the request.
        name: Store name shown in the start banner.
        timeout: Seconds handed to ``requests.get`` so a stalled connection
            raises instead of blocking forever.

    Raises:
        requests.exceptions.RequestException: if the request fails or
            times out.
    """
    print('--------------------------开始爬去{0}at{1}---------------------------'.format(name, time.ctime()))
    response = requests.get(url, headers=headers, timeout=timeout)
    for pair in _PAIR_RE.findall(response.text):
        print(pair)
    print('--------------------------爬去完毕at{}----------------------------'.format(time.ctime()))
    # Short pause before the caller moves on to the next store.
    time.sleep(1)


if __name__ == '__main__':
    foo_one(url_EB, headers_EB, name['a'])
    foo_one(url_AML, headers_AML, name['b'])
    foo_one(url_DL, headers_DL, name['c'])
输出:00:25:33开始,00:26:02结束 耗时29秒
--------------------------开始爬去ExclusiveBulbsatSat Apr 30 00:25:33 2016--------------------------- ('A.Shine', ' (97)') ('AmpacElectronics', ' (1,644)') ('AuraBeam', ' (33,084)') ('AWO', ' (1,206)') ('Battery1inc', ' (694)') ('Comoze Lamps', ' (6,172)') ('Compatible Lamp', ' (317)') ('Corgi Lamps', ' (2,124)') ('CTLAMP', ' (3,499)') ('Dell', ' (191)') ('Diamond Lamps', ' (966)') ('Dynamic', ' (4)') ('Eiki', ' (460)') ('ePharos', ' (2,592)') ('Epson', ' (1,456)') ('EREPLACEMENT', ' (115)') ('eReplacements', ' (814)') ('eWo's', ' (120)') ('eWorldlamp', ' (354)') ('FI Lamps', ' (5,707)') ('FL Projector Lamp For Mitsubishi', ' (1)') ('For Epson', ' (3)') ('Generic', ' (9,769)') ('Good Lamp', ' (819)') ('HCDZ', ' (2,746)') ('Hitachi', ' (935)') ('IET Lamps', ' (2,144)') ('InFocus', ' (44)') ('JVC', ' (326)') ('KCL', ' (3,781)') ('Lampedia', ' (618)') ('Lutema', ' (1,956)') ('Mitsubishi', ' (1,006)') ('Mogobe', ' (1,335)') ('MyProjectorLamps', ' (473)') ('NEC', ' (446)') ('Nec Computers', ' (13)') ('Optoma', ' (956)') ('Osram Sylvania', ' (78)') ('Panasonic', ' (820)') ('Philips', ' (7,502)') ('Powerwarehouse', ' (9,971)') ('Projector Lamps World', ' (112)') ('Pureglare', ' (369)') ('Samsung', ' (1,078)') ('Sharp', ' (426)') ('Shopforbattery', ' (2,510)') ('SMART BOARD', ' (66)') ('Sony', ' (990)') ('TVLampsforless', ' (14)') ('Unknown', ' (722)') --------------------------爬去完毕atSat Apr 30 00:25:57 2016---------------------------- --------------------------开始爬去Amazing LampsatSat Apr 30 00:25:58 2016--------------------------- ('AWO', ' (1)') ('Comoze Lamps', ' (2)') ('DNGO', ' (8)') ('Electrified', ' (9)') ('ELECTRIFIED', ' (10)') ('Electrified Discounters', ' (5)') ('ELECTRIFIED LAMPS', ' (1,177)') ('ELECTRIFIED PRINTHEAD', ' (24)') ('ELECTRIFIED PRINTHEADS', ' (2)') ('FI Lamps', ' (2)') ('Generic', ' (34)') ('GloWatt', ' (1)') ('KCL', ' (1)') ('OEM', ' (1)') ('Powerwarehouse', ' (7)') ('SKU', ' (5)') ('Top Lamp', ' (1)') ('Unknown', ' (1)') ('USOM', ' (3)') 
--------------------------爬去完毕atSat Apr 30 00:26:00 2016---------------------------- --------------------------开始爬去Dynamic LampsatSat Apr 30 00:26:01 2016--------------------------- ('Battery1inc', ' (85)') ('BenQ', ' (237)') ('Buslink', ' (31)') ('Calumet', ' (2)') ('Comoze Lamps', ' (405)') ('CTLAMP', ' (615)') ('Dell', ' (82)') ('Divine Lighting', ' (36)') ('DNGO', ' (63)') ('Dynamic', ' (4)') ('Eiko', ' (140)') ('Electrified', ' (2)') ('ELECTRIFIED LAMPS', ' (24)') ('Electronix Xpress', ' (418)') ('ePharos', ' (502)') ('Epson', ' (631)') ('eReplacements', ' (119)') ('FI Lamps', ' (505)') ('FL Projector Lamp For Mitsubishi', ' (1)') ('G-lamps', ' (43)') ('GE', ' (248)') ('GE Lighting', ' (152)') ('General Electric', ' (53)') ('Generic', ' (1,671)') ('Genie', ' (101)') ('GLAMPS', ' (2)') ('Impact', ' (7)') ('Industrial Lighting Solutions', ' (9)') ('KCL', ' (280)') ('Kodak', ' (1)') ('Lampedia', ' (63)') ('M-Wave', ' (830)') ('Mitsubishi', ' (406)') ('Mitsubishi DLP TV Bulbs', ' (29)') ('Mocpinc', ' (10)') ('MyProjectorLamps', ' (344)') ('Nec', ' (19)') ('Optoma', ' (161)') ('Osram', ' (1,295)') ('Panasonic', ' (245)') ('Philips', ' (988)') ('Powerwarehouse', ' (239)') ('Projector Lamps World', ' (45)') ('Pureglare', ' (107)') ('Samsung', ' (323)') ('ShopJimmy', ' (3)') ('Sony', ' (141)') ('Sylvania', ' (115)') ('Technical Precision', ' (10)') ('Unknown', ' (167)') ('Welch Allyn Compatible', ' (1)') --------------------------爬去完毕atSat Apr 30 00:26:02 2016----------------------------
多线程爬虫:00:32:37开始,00:32:39结束 耗时2秒
"""Multi-threaded crawler.

Downloads the brand-filter page of three Amazon storefronts concurrently,
one thread per store, and prints every (refinementLink text, narrowValue
text) pair found in each page.
"""
import re
import threading
import time
from time import ctime, sleep

import requests

# One browser-like User-Agent string shared by all three header dicts.
_USER_AGENT = (
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/50.0.2661.86 Safari/537.36'
)

url_EB = 'http://www.amazon.com/gp/search/other/ref=sr_sa_p_4?me=A22XNR713HGDVG&rh=n%3A9063592011%2Ck%3Aprojector&bbn=9063592011&keywords=projector&pickerToList=brandtextbin&ie=UTF8&qid=1461902521'
headers_EB = {'User-Agent': _USER_AGENT}

# The original triple-quoted literal carried stray whitespace in the middle of
# the query string (after "merchant-items"); the URL is now one clean literal.
url_AML = 'https://www.amazon.com/gp/search/other/ref=sr_sa_p_4?me=A3UJI9WWE6PRP5&rh=i%3Amerchant-items&pickerToList=brandtextbin&ie=UTF8&qid=1461899728'
headers_AML = {'User-Agent': _USER_AGENT}

url_DL = 'https://www.amazon.com/gp/search/other/ref=sr_sa_p_4?me=AS7ZU4MN0FPOY&rh=i%3Amerchant-items&pickerToList=brandtextbin&ie=UTF8&qid=1461901862'
headers_DL = {'User-Agent': _USER_AGENT}

# Display names of the three stores, used only in the progress banners.
name = {'a': 'ExclusiveBulbs',
        'b': 'Amazing Lamps',
        'c': 'Dynamic Lamps'}

# Captures the text of a "refinementLink" span and of the "narrowValue" span
# that immediately follows it. Compiled once instead of on every call.
_PAIR_RE = re.compile(
    r'<span class="refinementLink">(.*?)</span><span class="narrowValue">(.*?)</span'
)


def foo_one(url, headers, name, timeout=30):
    """Fetch one page and print every (refinementLink, narrowValue) pair.

    Args:
        url: Address of the page to download.
        headers: HTTP headers sent with the request.
        name: Store name shown in the start and finish banners.
        timeout: Seconds handed to ``requests.get`` so a stalled connection
            raises instead of blocking its thread forever.

    Raises:
        requests.exceptions.RequestException: if the request fails or
            times out.
    """
    print('--------------------------开始爬去{0}at{1}---------------------------'.format(name, time.ctime()))
    response = requests.get(url, headers=headers, timeout=timeout)
    for pair in _PAIR_RE.findall(response.text):
        print(pair)
    print('--------------------------爬去完毕{0}at{1}----------------------------'.format(name, time.ctime()))


# One worker thread per store; they are started under the main guard below.
threads = []
t1 = threading.Thread(target=foo_one, args=(url_EB, headers_EB, name['a']))
threads.append(t1)
t2 = threading.Thread(target=foo_one, args=(url_AML, headers_AML, name['b']))
threads.append(t2)
t3 = threading.Thread(target=foo_one, args=(url_DL, headers_DL, name['c']))
threads.append(t3)

if __name__ == '__main__':
    for t in threads:
        # Attribute form of the deprecated setDaemon(True).
        t.daemon = True
        t.start()
    # Wait for every worker, not only the last one started: the workers are
    # daemon threads, so any of them still running when the main thread
    # exits would be terminated and its results lost.
    for t in threads:
        t.join()
    print("all over %s" % ctime())
输出(注意:下面的输出中没有 Amazing Lamps 的抓取结果,该线程很可能在主线程退出时尚未完成——守护线程会随主线程退出而被终止——因此"耗时2秒"并不包含全部三个请求):
--------------------------开始爬去ExclusiveBulbsatSat Apr 30 00:32:37 2016--------------------------- --------------------------开始爬去Amazing LampsatSat Apr 30 00:32:37 2016--------------------------- --------------------------开始爬去Dynamic LampsatSat Apr 30 00:32:37 2016--------------------------- ('A.Shine', ' (97)') ('AmpacElectronics', ' (1,645)') ('AuraBeam', ' (33,088)') ('AWO', ' (1,209)') ('Battery1inc', ' (694)') ('Comoze Lamps', ' (6,172)') ('Compatible Lamp', ' (317)') ('Corgi Lamps', ' (2,123)') ('CTLAMP', ' (3,501)') ('Dell', ' (191)') ('Diamond Lamps', ' (966)') ('Dynamic', ' (4)') ('Eiki', ' (457)') ('ePharos', ' (2,592)') ('Epson', ' (1,456)') ('EREPLACEMENT', ' (115)') ('eReplacements', ' (813)') ('eWo's', ' (120)') ('eWorldlamp', ' (354)') ('FI Lamps', ' (5,710)') ('FL Projector Lamp For Mitsubishi', ' (1)') ('For Epson', ' (3)') ('Generic', ' (9,771)') ('Good Lamp', ' (819)') ('HCDZ', ' (2,748)') ('Hitachi', ' (935)') ('IET Lamps', ' (2,137)') ('InFocus', ' (44)') ('JVC', ' (326)') ('KCL', ' (3,783)') ('Lampedia', ' (618)') ('Lutema', ' (1,955)') ('Mitsubishi', ' (1,006)') ('Mogobe', ' (1,336)') ('MyProjectorLamps', ' (473)') ('NEC', ' (450)') ('Nec Computers', ' (13)') ('Optoma', ' (956)') ('Osram Sylvania', ' (78)') ('Panasonic', ' (820)') ('Philips', ' (7,502)') ('Powerwarehouse', ' (9,972)') ('Projector Lamps World', ' (112)') ('Pureglare', ' (369)') ('Samsung', ' (1,078)') ('Sharp', ' (426)') ('Shopforbattery', ' (2,511)') ('SMART BOARD', ' (66)') ('Sony', ' (990)') ('TVLampsforless', ' (14)') ('Unknown', ' (722)') --------------------------爬去完毕ExclusiveBulbsatSat Apr 30 00:32:38 2016---------------------------- ('Battery1inc', ' (85)') ('BenQ', ' (237)') ('Buslink', ' (31)') ('Calumet', ' (2)') ('Comoze Lamps', ' (405)') ('CTLAMP', ' (615)') ('Dell', ' (82)') ('Divine Lighting', ' (36)') ('DNGO', ' (63)') ('Dynamic', ' (4)') ('Eiko', ' (140)') ('Electrified', ' (2)') ('ELECTRIFIED LAMPS', ' (24)') ('Electronix Xpress', ' (418)') ('ePharos', ' 
(502)') ('Epson', ' (631)') ('eReplacements', ' (119)') ('FI Lamps', ' (505)') ('FL Projector Lamp For Mitsubishi', ' (1)') ('G-lamps', ' (43)') ('GE', ' (248)') ('GE Lighting', ' (152)') ('General Electric', ' (53)') ('Generic', ' (1,671)') ('Genie', ' (101)') ('GLAMPS', ' (2)') ('Impact', ' (7)') ('Industrial Lighting Solutions', ' (9)') ('KCL', ' (280)') ('Kodak', ' (1)') ('Lampedia', ' (63)') ('M-Wave', ' (830)') ('Mitsubishi', ' (406)') ('Mitsubishi DLP TV Bulbs', ' (29)') ('Mocpinc', ' (10)') ('MyProjectorLamps', ' (344)') ('Nec', ' (19)') ('Optoma', ' (161)') ('Osram', ' (1,295)') ('Panasonic', ' (245)') ('Philips', ' (988)') ('Powerwarehouse', ' (239)') ('Projector Lamps World', ' (45)') ('Pureglare', ' (107)') ('Samsung', ' (323)') ('ShopJimmy', ' (3)') ('Sony', ' (141)') ('Sylvania', ' (115)') ('Technical Precision', ' (10)') ('Unknown', ' (167)') ('Welch Allyn Compatible', ' (1)') --------------------------爬去完毕Dynamic LampsatSat Apr 30 00:32:39 2016---------------------------- all over Sat Apr 30 00:32:39 2016