目前网上有很多关于scrapy的文章,这里我主要介绍一下我在开发中遇到问题及一些技巧:
1,以登录状态去爬取(带cookie)
-安装内容:
brew install phantomjs (MAC上)
pip install selenium
-代码:
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import json
import time

# PhantomJS lets us override request headers, including the User-Agent.
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.userAgent"] = (
    "Mozilla/5.0 (Linux; U; Android 2.3.6; en-us; Nexus S Build/GRK39F) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1"
)


def get_cookie_from_aicoin_login(account, password):
    """Log in to aicoin.net.cn with a headless PhantomJS browser.

    The captcha is screenshotted to ``aa.png`` and must be typed in by a
    human at the console.

    Returns a JSON string of the session cookies on success, and ``'{}'``
    on failure, so the return type is always ``str``.
    (NOTE(review): scrapy's ``Request(cookies=...)`` expects a dict, so
    callers probably want ``json.loads`` on this value -- confirm.)
    """
    browser = webdriver.PhantomJS(executable_path='/usr/local/bin/phantomjs',
                                  desired_capabilities=dcap)
    try:
        browser.get("https://www.aicoin.net.cn/sign_in")
        # Keep retrying the form while we are still on the sign-in page.
        while 'Sign in to AIcoin' in browser.title:
            username = browser.find_element_by_name("user_account")
            username.clear()
            username.send_keys(account)

            psd = browser.find_element_by_name("user_password")
            psd.clear()
            psd.send_keys(password)

            code = browser.find_element_by_name("user_verify")
            code.clear()
            # Some pages render a broken captcha; click to refresh it first.
            code_verify = browser.find_element_by_xpath("//button[@class='verify_code']")
            code_verify.click()
            time.sleep(1)

            # Screenshot the login page so a human can read the captcha.
            browser.save_screenshot("aa.png")
            code_txt = input("请查看路径下新生成的aa.png,然后输入验证码:")
            code.send_keys(code_txt)

            commit = browser.find_element_by_xpath("//div[@class='sure_btn']/button[@type='submit']")
            commit.click()
            time.sleep(3)

        cookie = {}
        for elem in browser.get_cookies():
            cookie[elem["name"]] = elem["value"]

        # A successful login redirects to the home page; verify via the title.
        if 'AICoin - Leader Of Global Cryptocurrency Tickers Application' in browser.title:
            return json.dumps(cookie)
        # BUGFIX: the original returned a dict {} here but a JSON string on
        # success; keep the return type consistent.
        return json.dumps({})
    finally:
        # BUGFIX: always release the PhantomJS process (resource leak).
        browser.quit()
※特别提示:当需要爬取动态内容(js加载的内容)时,也会用到PHANTOMJS
※运行爬虫(scrapy crawl yourspider)需要cd到该爬虫主目录下,即包含scrapy.cfg的目录; 另外调试的时候可以直接使用scrapy shell yoururl 进行代码测试;
2,递归爬取内容
-在scrapy中对应的spider文件中添加如下代码(下面的代码是爬取股吧的帖子和评论)
import scrapy  # BUGFIX: scrapy.spiders.Spider is referenced but was never imported
from functools import reduce  # BUGFIX: reduce is not a builtin on Python 3

from scrapy.http import Request
from gubaspider.items import PostItem, CommentItem


class GubaSpider(scrapy.spiders.Spider):
    """Recursively crawl guba.eastmoney.com: list page -> post page -> comment pages."""

    name = "guba"
    allowed_domains = ["eastmoney.com"]

    start_urls = [
        "http://guba.eastmoney.com/default_551215.html"
    ]

    def parse(self, response):
        """Scrape the post list and follow each post URL, carrying the
        list-page fields along in ``meta``."""
        tmp_list = []
        for i in response.xpath('//ul[@class="newlist"]/li'):
            tmp_list.append({
                'title': i.xpath('span/a[2]/text()').extract()[0],
                'ar_url': i.xpath('span/a[2]/@href').extract()[0],
                'group': i.xpath('span/a[1]/text()').extract()[0],
                'comment_sum': i.xpath('cite[2]/text()').extract()[0],
                'read_sum': i.xpath('cite[1]/text()').extract()[0],
                'author': i.xpath('cite[3]/a/text()').extract()[0],
            })

        for z in tmp_list:
            # Follow the post URL found on the list page; meta forwards the
            # scraped fields and cookies keeps the logged-in session.
            # NOTE(review): `user`, `pwd` and get_cookie_from_aicoin_login
            # are assumed to be defined elsewhere in the project -- confirm.
            yield Request('http://guba.eastmoney.com' + z.pop('ar_url'),
                          callback=self.parse_article, meta=z,
                          cookies=get_cookie_from_aicoin_login(user, pwd))

    def parse_article(self, response):
        """Scrape one post page, emit a PostItem, then schedule the
        remaining comment pages."""
        title = response.meta['title']
        group = response.meta['group']
        comment_sum = response.meta['comment_sum']
        read_sum = response.meta['read_sum']
        author = response.meta['author']

        content = response.xpath('//div[@id="zwcontent"]/div[@class="zwcontentmain"]/div[@id="zwconbody"]/div[@class="stockcodec"]').extract()
        # get_node_value() is defined elsewhere; it appears to return 0 when
        # the node list is empty -- TODO confirm.
        post_time = self.get_node_value(response.xpath('//div[@id="zwcontent"]/div[@id="zwcontt"]/div[@id="zwconttb"]/div[@class="zwfbtime"]/text()').extract())
        if post_time != 0:
            post_type = post_time.split(' ')[-1]  # trailing token, e.g. the "posted via" marker
            post_time = post_time[4:24]           # strip the prefix, keep the timestamp
        good_sum = self.get_node_value(response.xpath('//div[@id="zwcontent"]/div[@class="zwconbtns clearfix"]/div[@id="zwconbtnsi_z"]/span[@id="zwpraise"]/a/span/text()').extract())
        transmit_sum = self.get_node_value(response.xpath('//div[@id="zwcontent"]/div[@class="zwconbtns clearfix"]/div[@id="zwconbtnsi_zf"]/a/span/text()').extract())
        comments = response.xpath('//div[@id="zwlist"]/div[@class="zwli clearfix"]/div[@class="zwlitx"]/div[@class="zwlitxt"]/div[@class="zwlitext stockcodec"]/text()').extract()
        cm_name = response.xpath('//div[@id="zwlist"]/div[@class="zwli clearfix"]/div[@class="zwlitx"]/div[@class="zwlitxt"]/div[@class="zwlianame"]/span[@class="zwnick"]/a/text()').extract()
        # Renamed from `time` to avoid shadowing the stdlib module.
        cm_times = response.xpath('//div[@id="zwlist"]/div[@class="zwli clearfix"]/div[@class="zwlitx"]/div[@class="zwlitxt"]/div[@class="zwlitime"]/text()').extract()
        page_info = response.xpath('//div[@id="zwlist"]/div[@class="pager talc zwpager"]/span[@id="newspage"]/@data-page').extract()

        item = PostItem()
        item['Author'] = author            # post author
        item['Title'] = title              # post title
        item['Content'] = content          # post body
        item['PubTime'] = post_time        # publish time
        item['PostWay'] = post_time if post_time == 0 else post_type  # posted via (web, ...)
        item['Url'] = response.url         # post URL
        item['Group'] = group              # forum the post belongs to
        item['Like'] = good_sum            # like count
        item['Transmit'] = transmit_sum    # repost count
        item['Comment_Num'] = comment_sum  # comment count
        item['Tour'] = read_sum            # view count

        cm_list = []  # BUGFIX: was used without ever being initialised (NameError)
        for x in range(len(cm_name)):
            # BUGFIX: the original nested an identical duplicate of this
            # condition; a single test is equivalent.
            if comments[x] == ' ':
                # Image-only comment: collect the img title attributes instead.
                s = response.xpath('//div[@id="zwlist"]/div[' + str(x + 1) + ']/div[@class="zwlitx"]/div[@class="zwlitxt"]/div[@class="zwlitext stockcodec"]/img/@title').extract()
                comment = reduce(lambda a, b: a + '|' + b, s) if len(s) > 0 else ''
            else:
                comment = comments[x]
            cm_list.append({'name': cm_name[x], 'time': cm_times[x][4:], 'comment': comment})
        item['Comments'] = cm_list  # replies
        yield item  # handed to the pipeline (stored in the DB)

        if len(page_info) > 0:
            # data-page looks like "prefix|total|page_size"; schedule the
            # remaining comment pages.
            page_info = page_info[0].split('|')
            sumpage = int(int(page_info[1]) / int(page_info[2])) + 1
            for p in range(1, sumpage):
                cm_url = 'http://guba.eastmoney.com/' + page_info[0] + str(p + 1) + '.html'
                # parse_comment is defined elsewhere in the project.
                yield Request(cm_url, callback=self.parse_comment)
3,将数据存入mongodb
-pipelines文件中添加自定义的pipeline类:
import pymongo


class MongoPipeline(object):
    """Item pipeline that stores scraped items in MongoDB, one collection
    per item class."""

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # Pull the connection settings from the project's settings.py.
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE'),
        )

    def open_spider(self, spider):
        # One client per spider run; closed again in close_spider().
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # Collection is named after the item class (PostItem, CommentItem, ...).
        collection_name = item.__class__.__name__
        # BUGFIX: Collection.insert() is deprecated (removed in PyMongo 4);
        # insert_one() is the supported single-document API.
        self.db[collection_name].insert_one(dict(item))
        return item
-items中定义自己item:
from scrapy import Item,Field


class PostItem(Item):
    # A forum post scraped from guba.eastmoney.com.
    Author = Field()       # post author
    Title = Field()        # post title
    Content = Field()      # post body
    PubTime = Field()      # publish time
    # Top = Field()        # whether the post is pinned
    PostWay = Field()      # posted via (web, etc.)
    Url = Field()          # post URL
    Group = Field()        # forum the post belongs to
    Like = Field()         # like count
    Transmit = Field()     # repost count
    Comment_Num = Field()  # comment count
    Tour = Field()         # view count
    Comments = Field()     # replies

class CommentItem(Item):
    # A page of comments belonging to a post.
    Url = Field()          # url
    Comments = Field()     # comments
-settings中添加ITEM_PIPELINES
# Enable the MongoDB pipeline; the value (0-1000) sets the order in which
# pipelines run (lower runs first).
ITEM_PIPELINES = {
    'gubaspider.pipelines.MongoPipeline': 300,
}
4,添加代理和Agent
-在middlewares中添加你定义的中间件类:
from user_agents import agents  # project-local module holding the User-Agent pool
import random


class UserAgentMiddleware(object):
    """Downloader middleware that rotates the User-Agent header and sends
    every request through a fixed HTTP proxy."""

    def process_request(self, request, spider):
        # Pick a fresh User-Agent for each outgoing request.
        request.headers["User-Agent"] = random.choice(agents)
        # Route all traffic through the proxy below.
        request.meta['proxy'] = "http://proxy.yourproxy:8001"
-在settings中进行中间配置
# Register the custom downloader middleware; 543 is its priority in the
# middleware chain.
DOWNLOADER_MIDDLEWARES = {
    'gubaspider.middlewares.UserAgentMiddleware' : 543
}
-user_agents文件包含一个agent列表:
""" User-Agents """
# Pool of real-browser User-Agent strings; UserAgentMiddleware picks one at
# random per request to make the crawler look less uniform.
agents = [
    "Mozilla/5.0 (Linux; U; Android 2.3.6; en-us; Nexus S Build/GRK39F) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Avant Browser/1.2.789rel1 (http://www.avantbrowser.com)",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5",
    "Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.9 (KHTML, like Gecko) Chrome/5.0.310.0 Safari/532.9",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.514.0 Safari/534.7",
    "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/9.0.601.0 Safari/534.14",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/10.0.601.0 Safari/534.14",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.20 (KHTML, like Gecko) Chrome/11.0.672.2 Safari/534.20",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.27 (KHTML, like Gecko) Chrome/12.0.712.0 Safari/534.27",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.24 Safari/535.1",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.120 Safari/535.2",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7",
    "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.10) Gecko/2009042316 Firefox/3.0.10",
    "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-GB; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11 (.NET CLR 3.5.30729)",
    "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 GTB5",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; tr; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8 ( .NET CLR 3.5.30729; .NET4.0E)",
    "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0a2) Gecko/20110622 Firefox/6.0a2",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0.1) Gecko/20100101 Firefox/7.0.1",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0b4pre) Gecko/20100815 Minefield/4.0b4pre",
    "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT 5.0 )",
    "Mozilla/4.0 (compatible; MSIE 5.5; Windows 98; Win 9x 4.90)",
    "Mozilla/5.0 (Windows; U; Windows XP) Gecko MultiZilla/1.6.1.0a",
    "Mozilla/2.02E (Win95; U)",
    "Mozilla/3.01Gold (Win95; I)",
    "Mozilla/4.8 [en] (Windows NT 5.1; U)",
    "Mozilla/5.0 (Windows; U; Win98; en-US; rv:1.4) Gecko Netscape/7.1 (ax)",
    "HTC_Dream Mozilla/5.0 (Linux; U; Android 1.5; en-ca; Build/CUPCAKE) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.2; U; de-DE) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/234.40.1 Safari/534.6 TouchPad/1.0",
    "Mozilla/5.0 (Linux; U; Android 1.5; en-us; sdk Build/CUPCAKE) AppleWebkit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 2.1; en-us; Nexus One Build/ERD62) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 1.5; en-us; htc_bahamas Build/CRB17) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 2.1-update1; de-de; HTC Desire 1.19.161.5 Build/ERE27) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Sprint APA9292KT Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 1.5; de-ch; HTC Hero Build/CUPCAKE) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; ADR6300 Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 2.1; en-us; HTC Legend Build/cupcake) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 1.5; de-de; HTC Magic Build/PLAT-RC33) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1 FirePHP/0.3",
    "Mozilla/5.0 (Linux; U; Android 1.6; en-us; HTC_TATTOO_A3288 Build/DRC79) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 1.0; en-us; dream) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2",
    "Mozilla/5.0 (Linux; U; Android 1.5; en-us; T-Mobile G1 Build/CRB43) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari 525.20.1",
    "Mozilla/5.0 (Linux; U; Android 1.5; en-gb; T-Mobile_G2_Touch Build/CUPCAKE) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 2.0; en-us; Droid Build/ESD20) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Droid Build/FRG22D) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 2.0; en-us; Milestone Build/ SHOLS_U2_01.03.1) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.0.1; de-de; Milestone Build/SHOLS_U2_01.14.0) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2",
    "Mozilla/5.0 (Linux; U; Android 0.5; en-us) AppleWebKit/522 (KHTML, like Gecko) Safari/419.3",
    "Mozilla/5.0 (Linux; U; Android 1.1; en-gb; dream) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2",
    "Mozilla/5.0 (Linux; U; Android 2.0; en-us; Droid Build/ESD20) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.1; en-us; Nexus One Build/ERD62) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Sprint APA9292KT Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; ADR6300 Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-ca; GT-P1000M Build/FROYO) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 3.0.1; fr-fr; A500 Build/HRI66) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
    "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2",
    "Mozilla/5.0 (Linux; U; Android 1.6; es-es; SonyEricssonX10i Build/R1FA016) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 1.6; en-us; SonyEricssonX10i Build/R1AA056) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
]
※ 以上部分代码参考https://github.com/LiuXingMing/SinaSpider
ITEM_PIPELINES