  • scrapy + selenium: crawl Taobao product listings and store them in MongoDB

    1. Configuration (settings.py)

    # MongoDB connection settings
    MONGO_URI = 'localhost'
    MONGO_DB = 'taobao'

    # Search keywords
    KEYWORDS = ['小米手机', '华为手机']
    # Maximum number of result pages to crawl per keyword
    MAX_PAGE = 2
    # Selenium response timeout (seconds)
    SELENIUM_TIMEOUT = 20

    ROBOTSTXT_OBEY = False  # ignore robots.txt

    # Downloader middleware
    DOWNLOADER_MIDDLEWARES = {
       'taobaoSpider.middlewares.SeleniumMiddleware': 300,
    }


    # Item pipelines
    ITEM_PIPELINES = {
       # 'taobaoSpider.pipelines.TaobaospiderPipeline': 300,
        'taobaoSpider.pipelines.MongoPipeline': 400,
    }
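
    A quick way to confirm these custom settings are picked up is to load them with Scrapy's standard helper. The sketch below is not part of the original post; it assumes it is run from the project root (next to scrapy.cfg) and only prints the values defined above.

    from scrapy.utils.project import get_project_settings

    settings = get_project_settings()           # loads settings.py of the active project
    print(settings.get('KEYWORDS'))             # ['小米手机', '华为手机']
    print(settings.getint('MAX_PAGE'))          # 2
    print(settings.get('MONGO_URI'), settings.get('MONGO_DB'))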
    2. Item (items.py)
    import scrapy
    
    
    class TaobaospiderItem(scrapy.Item):
        # define the fields for your item here like:
        # name = scrapy.Field()
        title = scrapy.Field()
        price = scrapy.Field()
        sales = scrapy.Field()
        shop = scrapy.Field()
        location = scrapy.Field()
        image = scrapy.Field()
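
    The item behaves like a dict keyed by the fields above; a minimal usage sketch (the values are made up for illustration, and dict(item) is exactly what the MongoDB pipeline stores later):

    item = TaobaospiderItem()
    item['title'] = '小米手机 example'   # hypothetical values
    item['price'] = '1999.00'
    print(dict(item))                    # {'title': '小米手机 example', 'price': '1999.00'}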

    3. Spider (spiders/taobao.py)

    # -*- coding: utf-8 -*-
    from urllib.parse import quote  # URL-encode the search keyword

    import scrapy

    from taobaoSpider.items import TaobaospiderItem
    
    class TaobaoSpider(scrapy.Spider):
        name = 'taobao'
        allowed_domains = ['taobao.com']
        base_url = 'https://s.taobao.com/search?q='
    
        def start_requests(self):
            # Read the keyword list and page count from settings via self.settings.get()
            for keyword in self.settings.get('KEYWORDS'):
                for page in range(1, self.settings.get('MAX_PAGE') + 1):
                    url = self.base_url + quote(keyword)
                    yield scrapy.Request(url=url,
                                         callback=self.parse,
                                         meta={'page': page},  # pass the page number to the middleware
                                         dont_filter=True)     # the same URL is requested once per page, so skip dedup
    
    
        def parse(self, response):

            # One node per product card in the rendered search results
            products = response.xpath('//div[@id="mainsrp-itemlist"]//div[contains(@class,"item J_MouserOnverReq")]')

            for product in products:
                item = TaobaospiderItem()
                # Relative XPaths (".//") keep each field scoped to the current product node
                item['title'] = ''.join(product.xpath('.//div[contains(@class,"title")]/text()').extract()).strip()
                item['location'] = ''.join(product.xpath('.//div[contains(@class,"location")]/text()').extract()).strip()
                item['shop'] = ''.join(product.xpath('.//div[contains(@class,"shop")]/text()').extract()).strip()
                item['price'] = ''.join(product.xpath('.//div[contains(@class,"price")]/text()').extract()).strip()
                item['sales'] = ''.join(product.xpath('.//div[contains(@class,"deal-cnt")]/text()').extract()).strip()
                item['image'] = ''.join(product.xpath('.//div[@class="pic"]/img[contains(@class,"img")]/@data-src').extract()).strip()
                yield item
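
    The project is normally started with "scrapy crawl taobao" from the project root. As an alternative, a minimal sketch of a hypothetical run.py that launches the same spider from a script, assuming the standard layout created by scrapy startproject taobaoSpider and a spider module at taobaoSpider/spiders/taobao.py:

    # run.py, placed next to scrapy.cfg
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    from taobaoSpider.spiders.taobao import TaobaoSpider

    process = CrawlerProcess(get_project_settings())  # picks up the middleware and pipeline from settings.py
    process.crawl(TaobaoSpider)
    process.start()                                    # blocks until the crawl finishes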

    4. Downloader middleware (middlewares.py)

    from selenium import webdriver
    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from scrapy.http import HtmlResponse
    from logging import getLogger
    
    
    class SeleniumMiddleware(object):
        # def __init__(self, timeout=None, service_args=[]):
        def __init__(self, timeout=None):
            self.logger = getLogger(__name__)
            self.timeout = timeout
            # self.browser = webdriver.PhantomJS(service_args=service_args)
            # Headless mode:
            # self.options = webdriver.ChromeOptions()
            # self.options.add_argument('--headless')
            # self.browser = webdriver.Chrome(chrome_options=self.options)
            self.browser = webdriver.Chrome()
            # self.browser.set_window_size(1400, 700)
            self.browser.set_page_load_timeout(self.timeout)
            self.wait = WebDriverWait(self.browser, self.timeout)
            print('timeout:', self.timeout)
    
        def __del__(self):
            self.browser.quit()  # shut down the browser and the chromedriver process
    
        def process_request(self, request, spider):
            '''
            Load the request in the Selenium-driven browser, navigate to the page
            number carried in request.meta, and return the rendered page.
            :param request:
            :param spider:
            :return: HtmlResponse built from the rendered page source
            '''
            self.logger.debug('Selenium is Running')
            # page is an int taken from the request meta
            page = request.meta.get('page', 1)
            try:
                self.browser.get(request.url)
                print(10*'-', request.url, 10*'-')
                if page > 1:
                    # From the second page onwards, wait for the pager to load
                    # presence_of_element_located: wait until the element exists in the DOM
                    input = self.wait.until(
                        EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > input')))
                    # element_to_be_clickable: wait until the element can be clicked
                    submit = self.wait.until(
                        EC.element_to_be_clickable((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit')))
                    input.clear()
                    # Type the target page number and submit
                    input.send_keys(page)
                    submit.click()
                # text_to_be_present_in_element: wait until the active pager item shows the requested page number
                self.wait.until(EC.text_to_be_present_in_element((
                    By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.active > span'), str(page)))
                # presence_of_element_located: wait until the product items have been rendered
                self.wait.until(EC.presence_of_element_located((
                    By.CSS_SELECTOR, '#mainsrp-itemlist .m-itemlist .grid.g-clearfix .item')))
                return HtmlResponse(url=request.url, body=self.browser.page_source, request=request, encoding='utf-8', status=200)
            except TimeoutException:
                return HtmlResponse(url=request.url, status=500, request=request)
    
        # Classmethod Scrapy uses to build the middleware:
        # it reads the timeout from settings and passes it to __init__
        @classmethod
        def from_crawler(cls, crawler):
            return cls(timeout=crawler.settings.get('SELENIUM_TIMEOUT'),)
                       # service_args=crawler.settings.get('PHANTOMJS_SERVICE_ARGS'))
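
    The headless option is only sketched in comments in __init__ above. A minimal example of enabling it with the current Selenium API (newer Selenium releases use the options= keyword instead of the deprecated chrome_options=):

    from selenium import webdriver

    options = webdriver.ChromeOptions()
    options.add_argument('--headless')            # run Chrome without opening a window
    browser = webdriver.Chrome(options=options)   # drop-in replacement for webdriver.Chrome() in __init__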

    5. Pipeline (pipelines.py, stores items in MongoDB)

    class Pipeline(object):
        def process_item(self, item, spider):
            return item
    
    import pymongo
    
    # Store scraped items in MongoDB
    class MongoPipeline(object):

        # MongoDB connection parameters
        def __init__(self, mongo_uri, mongo_db):
            self.mongo_uri = mongo_uri
            self.mongo_db = mongo_db

        # Read the parameters from settings (names must match settings.py: MONGO_URI / MONGO_DB)
        @classmethod
        def from_crawler(cls, crawler):
            return cls(mongo_uri=crawler.settings.get('MONGO_URI'),
                       mongo_db=crawler.settings.get('MONGO_DB'))

        # Open the database connection when the spider starts
        def open_spider(self, spider):
            self.client = pymongo.MongoClient(self.mongo_uri)
            self.db = self.client[self.mongo_db]

        # Close the database connection when the spider finishes
        def close_spider(self, spider):
            self.client.close()

        # Store each item in a collection named after its item class
        def process_item(self, item, spider):
            # item.__class__.__name__ is the item class name, e.g. TaobaospiderItem
            name = item.__class__.__name__
            print('---------------name', name, '-------------------')
            self.db[name].insert_one(dict(item))
            return item
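
    After a crawl finishes, the stored documents can be checked directly with pymongo. A small sketch assuming the settings above (local MongoDB, database taobao, collection named after the item class):

    import pymongo

    client = pymongo.MongoClient('localhost')
    collection = client['taobao']['TaobaospiderItem']    # MongoPipeline uses the item class name as the collection
    print(collection.count_documents({}))                # number of stored products
    for doc in collection.find().limit(3):               # peek at a few documents
        print(doc.get('title'), doc.get('price'))
    client.close()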
  • Original article: https://www.cnblogs.com/ray-mmss/p/9388390.html