1. Configuration
# MongoDB connection settings
MONGO_URI = 'localhost'
MONGO_DB = 'taobao'

# Search keywords
KEYWORDS = ['小米手机', '华为手机']

# Maximum number of result pages to crawl per keyword
MAX_PAGE = 2

# Page-load timeout (seconds) used by the Selenium middleware
SELENIUM_TIMEOUT = 20

# Ignore robots.txt
ROBOTSTXT_OBEY = False

# Downloader middleware
DOWNLOADER_MIDDLEWARES = {
    'taobaoSpider.middlewares.SeleniumMiddleware': 300,
}

# Item pipelines
ITEM_PIPELINES = {
    # 'taobaoSpider.pipelines.TaobaospiderPipeline': 300,
    'taobaoSpider.pipelines.MongoPipeline': 400,
}
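These values are ordinary Scrapy settings, so they can be inspected outside a crawl as well. A minimal sanity check (a sketch, assuming it is run from the project root, next to scrapy.cfg):

# Load the project's settings.py and print the crawl parameters
from scrapy.utils.project import get_project_settings

settings = get_project_settings()
print(settings.get('KEYWORDS'))       # ['小米手机', '华为手机']
print(settings.getint('MAX_PAGE'))    # 2
print(settings.get('MONGO_URI'), settings.get('MONGO_DB'))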
2. Item
import scrapy


class TaobaospiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    price = scrapy.Field()
    sales = scrapy.Field()
    shop = scrapy.Field()
    location = scrapy.Field()
    image = scrapy.Field()
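TaobaospiderItem behaves like a dict, but only the declared fields can be assigned; an undeclared key raises KeyError, which is how field-name typos get caught early. A small usage sketch (assuming the project package is importable as taobaoSpider; the values are made up):

from taobaoSpider.items import TaobaospiderItem

item = TaobaospiderItem()
item['title'] = '小米手机 示例'    # hypothetical values, just to show the dict-like API
item['price'] = '1999.00'
print(dict(item))                 # {'title': '小米手机 示例', 'price': '1999.00'}
# item['img'] = '...'             # would raise KeyError: undeclared field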
3. Spider
# -*- coding: utf-8 -*-
from urllib.parse import quote

import scrapy

from taobaoSpider.items import TaobaospiderItem


class TaobaoSpider(scrapy.Spider):
    name = 'taobao'
    allowed_domains = ['taobao.com']
    base_url = 'https://s.taobao.com/search?q='

    def start_requests(self):
        # Read the crawl parameters from settings via self.settings
        for keyword in self.settings.get('KEYWORDS'):
            for page in range(1, self.settings.getint('MAX_PAGE') + 1):
                url = self.base_url + quote(keyword)
                yield scrapy.Request(url=url,
                                     callback=self.parse,
                                     meta={'page': page},  # pass the page number to the middleware
                                     dont_filter=True)     # same URL for every page, so disable dedup

    def parse(self, response):
        products = response.xpath('//*[@id="mainsrp-itemlist"]//div[contains(@class, "item J_MouserOnverReq")]')
        for product in products:
            item = TaobaospiderItem()
            # Relative XPaths (.//) so each field is scoped to the current product node
            item['title'] = ''.join(product.xpath('.//div[contains(@class, "title")]//text()').extract()).strip()
            item['location'] = ''.join(product.xpath('.//div[contains(@class, "location")]//text()').extract()).strip()
            item['shop'] = ''.join(product.xpath('.//div[contains(@class, "shop")]//text()').extract()).strip()
            item['price'] = ''.join(product.xpath('.//div[contains(@class, "price")]//text()').extract()).strip()
            item['sales'] = ''.join(product.xpath('.//div[contains(@class, "deal-cnt")]//text()').extract()).strip()
            item['image'] = ''.join(product.xpath('.//div[@class="pic"]//img[contains(@class, "img")]/@data-src').extract()).strip()
            yield item
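Note that the page number never appears in the URL: every request for one keyword points at the same search URL, and the actual pagination happens later in the Selenium middleware via meta['page']. A quick sketch of the URLs that start_requests yields with the settings above:

from urllib.parse import quote

base_url = 'https://s.taobao.com/search?q='
for keyword in ['小米手机', '华为手机']:
    for page in range(1, 2 + 1):            # MAX_PAGE = 2
        print(base_url + quote(keyword), '-> page', page)
# https://s.taobao.com/search?q=%E5%B0%8F%E7%B1%B3%E6%89%8B%E6%9C%BA -> page 1
# https://s.taobao.com/search?q=%E5%B0%8F%E7%B1%B3%E6%89%8B%E6%9C%BA -> page 2
# ...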
4. Middleware
from logging import getLogger

from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


class SeleniumMiddleware(object):
    def __init__(self, timeout=None):
        self.logger = getLogger(__name__)
        self.timeout = timeout
        # Headless mode (optional):
        # options = webdriver.ChromeOptions()
        # options.add_argument('--headless')
        # self.browser = webdriver.Chrome(options=options)
        self.browser = webdriver.Chrome()
        # self.browser.set_window_size(1400, 700)
        self.browser.set_page_load_timeout(self.timeout)
        self.wait = WebDriverWait(self.browser, self.timeout)

    def __del__(self):
        self.browser.close()

    def process_request(self, request, spider):
        """Render the request in Chrome, jump to the requested page number,
        and return the rendered HTML to Scrapy."""
        self.logger.debug('Selenium is running: %s', request.url)
        # Page number passed from the spider via meta (an int, defaults to 1)
        page = request.meta.get('page', 1)
        try:
            self.browser.get(request.url)
            if page > 1:
                # From page 2 onwards, paginate through the pager form.
                # presence_of_element_located: wait until the element exists in the DOM
                input = self.wait.until(EC.presence_of_element_located(
                    (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > input')))
                # element_to_be_clickable: wait until the element can be clicked
                submit = self.wait.until(EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit')))
                input.clear()
                input.send_keys(page)  # type the target page number
                submit.click()
                # text_to_be_present_in_element: wait until the active pager item shows the page number
                self.wait.until(EC.text_to_be_present_in_element(
                    (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.active > span'), str(page)))
            # presence_of_element_located: wait until the product items have rendered
            self.wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, '#mainsrp-itemlist .m-itemlist .grid.g-clearfix .item')))
            return HtmlResponse(url=request.url, body=self.browser.page_source,
                                request=request, encoding='utf-8', status=200)
        except TimeoutException:
            return HtmlResponse(url=request.url, status=500, request=request)

    # Called by Scrapy to build the middleware; reads the timeout from settings
    @classmethod
    def from_crawler(cls, crawler):
        return cls(timeout=crawler.settings.get('SELENIUM_TIMEOUT'))
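The commented-out headless branch in __init__ can be enabled to crawl without a visible browser window; with recent Selenium versions the keyword is options= rather than the deprecated chrome_options=. A minimal sketch:

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('--headless')       # run Chrome without a visible window
browser = webdriver.Chrome(options=options)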
5. Pipeline (storing to MongoDB)
import pymongo


class TaobaospiderPipeline(object):
    def process_item(self, item, spider):
        return item


# Store items in MongoDB
class MongoPipeline(object):
    # MongoDB connection parameters
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    # Read the connection parameters from settings
    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB'),
        )

    # Open the database connection when the spider starts
    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    # Close the connection when the spider finishes
    def close_spider(self, spider):
        self.client.close()

    # Store each item, using the item's class name as the collection name
    def process_item(self, item, spider):
        name = item.__class__.__name__  # e.g. 'TaobaospiderItem'
        self.db[name].insert_one(dict(item))
        return item
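Because the collection is named after the item class, the scraped products end up in the TaobaospiderItem collection of the taobao database. A quick check with pymongo (a sketch, assuming MongoDB is running locally with the settings from section 1):

import pymongo

client = pymongo.MongoClient('localhost')
db = client['taobao']
collection = db['TaobaospiderItem']      # collection named after the item class

print(collection.count_documents({}))    # number of stored products
for doc in collection.find().limit(3):
    print(doc['title'], doc['price'])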