使用 selenium 模块爬取带验证码的页面。selenium 模块需要另外安装,这里不讲环境配置;我有一篇博客专门讲在 Ubuntu 下安装和配置模拟浏览器的开发环境。
spider的代码
# -*- coding: utf-8 -*-
"""Douban login spider: drives a real Chrome window so a human operator can
read the captcha image and type it in, then prints the logged-in username."""
from time import sleep

import scrapy
from scrapy.selector import Selector
from selenium import webdriver
from selenium.webdriver.common.by import By  # Selenium 4 removed find_element_by_xpath


class MydoubanSpider(scrapy.Spider):
    """Log in to douban.com through Selenium and print the account's display name.

    The captcha cannot be solved automatically, so ``parse`` blocks on console
    input until the operator transcribes it from the visible browser window.
    """

    name = "mydouban_moni"

    def __init__(self):
        super(MydoubanSpider, self).__init__()
        self.start_urls = ['https://www.douban.com/']
        # A real (non-headless) Chrome is required so the captcha image is
        # actually rendered for the operator to read.
        self.driver = webdriver.Chrome()
        self.driver.get("https://accounts.douban.com/login")
        sleep(1)  # crude wait for the login page to finish rendering

    def parse(self, response):
        """Fill the login form (captcha typed by the operator) and print the username.

        NOTE(review): the XPaths below match douban's login page as of the
        original post — verify them against the current page before relying on this.
        """
        # raw_input() was Python 2 only; input() is the Python 3 equivalent.
        yanzhengma = input('请输入验证码:')
        name = self.driver.find_element(By.XPATH, '//*[@id="email"]')
        name.send_keys('username用户名')
        password = self.driver.find_element(By.XPATH, '//*[@id="password"]')
        password.send_keys('password密码')
        key = self.driver.find_element(By.XPATH, '//*[@id="captcha_field"]')
        key.send_keys(yanzhengma)
        submit = self.driver.find_element(By.XPATH, '//*[@id="lzform"]/div[7]/input')
        submit.click()
        sleep(1)  # crude wait for the post-login page to load
        sel = Selector(text=self.driver.page_source)
        # BUG FIX: Selector has no ``.response`` attribute — the original
        # ``sel.response.xpath(...)`` raised AttributeError. Query directly.
        myname = sel.xpath('//*[@id="db-global-nav"]/div/div[1]/ul/li[2]/a/span[1]//text()').extract()
        print(''.join(myname))
        print('=====================')

    def closed(self, reason):
        """Called by Scrapy when the spider finishes; quit Chrome so it doesn't leak."""
        self.driver.quit()
如果出现 “HTTP status code is not handled or not allowed” 错误,说明 spider 的请求头还需要配置;可以在 settings.py 中设置伪装的 User-Agent(伪装代理)来解决。
github完整代码地址:https://github.com/sea1234/myyangzhengma