  • scrapy + selenium: crawl Taobao product data and store it in MongoDB

    Taobao search results are rendered in the browser, so a Selenium downloader middleware loads and paginates each results page; the spider then parses the rendered HTML, and an item pipeline writes the parsed products to MongoDB.

    1. Settings

    # MongoDB connection parameters
    MONGO_URI = 'localhost'
    MONGO_DB = 'taobao'

    # Search keywords
    KEYWORDS = ['小米手机', '华为手机']
    # Maximum number of result pages to crawl per keyword
    MAX_PAGE = 2
    # Timeout (seconds) for Selenium page loads and explicit waits
    SELENIUM_TIMEOUT = 20

    ROBOTSTXT_OBEY = False  # ignore robots.txt

    # Downloader middleware: route requests through Selenium
    DOWNLOADER_MIDDLEWARES = {
       'taobaoSpider.middlewares.SeleniumMiddleware': 300,
    }


    # Item pipelines
    ITEM_PIPELINES = {
       # 'taobaoSpider.pipelines.TaobaospiderPipeline': 300,
        'taobaoSpider.pipelines.MongoPipeline': 400,
    }
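
    Nothing above is specific to Scrapy yet, so the MongoDB side can be sanity-checked on its own before the crawl is run. A minimal sketch, assuming pymongo is installed and a MongoDB server is listening on the default local port:

    import pymongo

    MONGO_URI = 'localhost'
    MONGO_DB = 'taobao'

    client = pymongo.MongoClient(MONGO_URI)          # connects lazily
    print(client.server_info()['version'])           # forces a round trip; raises if the server is unreachable
    print(client[MONGO_DB].list_collection_names())  # collections created so far (empty before the first crawl)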
    2. Item
    import scrapy
    
    
    class TaobaospiderItem(scrapy.Item):
        # define the fields for your item here like:
        # name = scrapy.Field()
        title = scrapy.Field()
        price = scrapy.Field()
        sales = scrapy.Field()
        shop = scrapy.Field()
        location = scrapy.Field()
        image = scrapy.Field()
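
    A Scrapy item behaves like a dict with a fixed key set: assigning to a field that is not declared above raises a KeyError, which is why the spider below sticks to these six names. A small sketch, assuming the project package is importable as taobaoSpider (the path used in the settings above):

    from taobaoSpider.items import TaobaospiderItem

    item = TaobaospiderItem()
    item['title'] = '小米手机'
    item['price'] = '1999.00'
    print(dict(item))       # {'title': '小米手机', 'price': '1999.00'}
    # item['deal'] = '100'  # would raise KeyError: 'deal' is not a declared field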

    3. Spider

    # -*- coding: utf-8 -*-
    from urllib.parse import quote

    import scrapy

    from taobaoSpider.items import TaobaospiderItem


    class TaobaoSpider(scrapy.Spider):
        name = 'taobao'
        allowed_domains = ['taobao.com']
        base_url = 'https://s.taobao.com/search?q='

        def start_requests(self):
            # Read KEYWORDS and MAX_PAGE from the settings via self.settings.get()
            for keyword in self.settings.get('KEYWORDS'):
                for page in range(1, self.settings.get('MAX_PAGE') + 1):
                    # The page number is not part of the URL; the Selenium
                    # middleware reads meta['page'] and clicks through the pager
                    url = self.base_url + quote(keyword)
                    yield scrapy.Request(url=url,
                                         callback=self.parse,
                                         meta={'page': page},  # pass the page number along
                                         dont_filter=True)     # the same URL is requested once per page, so skip dedup

        def parse(self, response):
            # One node per product card, mirroring the selector the middleware waits for
            products = response.xpath('//*[@id="mainsrp-itemlist"]//div[contains(@class, "item J_MouserOnverReq")]')

            for product in products:
                item = TaobaospiderItem()
                # Relative paths (.//) so every field is read from the current product node
                item['title'] = ''.join(product.xpath('.//div[contains(@class, "title")]//text()').extract()).strip()
                item['location'] = ''.join(product.xpath('.//div[contains(@class, "location")]//text()').extract()).strip()
                item['shop'] = ''.join(product.xpath('.//div[contains(@class, "shop")]//text()').extract()).strip()
                item['price'] = ''.join(product.xpath('.//div[contains(@class, "price")]//text()').extract()).strip()
                item['sales'] = ''.join(product.xpath('.//div[contains(@class, "deal-cnt")]//text()').extract()).strip()
                item['image'] = ''.join(product.xpath('.//div[@class="pic"]//img[contains(@class, "img")]/@data-src').extract()).strip()
                yield item
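
    With the example settings (KEYWORDS = ['小米手机', '华为手机'], MAX_PAGE = 2), start_requests yields four requests: each keyword's search URL twice, with meta['page'] set to 1 and 2. The URL encoding done by quote() can be previewed outside Scrapy, as a quick illustration:

    from urllib.parse import quote

    base_url = 'https://s.taobao.com/search?q='
    for keyword in ['小米手机', '华为手机']:
        for page in range(1, 2 + 1):
            print(page, base_url + quote(keyword))
    # 1 https://s.taobao.com/search?q=%E5%B0%8F%E7%B1%B3%E6%89%8B%E6%9C%BA
    # 2 https://s.taobao.com/search?q=%E5%B0%8F%E7%B1%B3%E6%89%8B%E6%9C%BA
    # ... and the same two lines for the second keyword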

    4. Downloader middleware

    from selenium import webdriver
    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from scrapy.http import HtmlResponse
    from logging import getLogger
    
    
    class SeleniumMiddleware(object):
        # def __init__(self, timeout=None, service_args=[]):
        def __init__(self, timeout=None):
            self.logger = getLogger(__name__)
            self.timeout = timeout
            # self.browser = webdriver.PhantomJS(service_args=service_args)
            # Headless mode (kept commented out here):
            # self.options = webdriver.ChromeOptions()
            # self.options.add_argument('--headless')
            # self.browser = webdriver.Chrome(chrome_options=self.options)
            self.browser = webdriver.Chrome()
            # self.browser.set_window_size(1400, 700)
            self.browser.set_page_load_timeout(self.timeout)
            self.wait = WebDriverWait(self.browser, self.timeout)
            print('timeout:', self.timeout)
    
        def __del__(self):
            # quit() shuts down the driver process along with the browser window
            self.browser.quit()
    
        def process_request(self, request, spider):
            '''
            Fetch request.url in the shared browser; for page > 1, type the page
            number into the pager and submit, wait until the item list has been
            rendered, then return the page source wrapped in an HtmlResponse.
            '''
            self.logger.debug('Selenium is Running')
            # the page number (an int) set by the spider in request.meta
            page = request.meta.get('page', 1)
            try:
                self.browser.get(request.url)
                print(10*'-', request.url,10*'-')
                if page > 1:
                    # From the second page on, drive the pager instead of re-requesting the URL
                    # presence_of_element_located: wait until the page-number input exists in the DOM
                    input = self.wait.until(
                        EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > input')))
                    # element_to_be_clickable: wait until the submit button is visible and enabled
                    submit = self.wait.until(
                        EC.element_to_be_clickable((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit')))
                    input.clear()
                    # type the target page number and submit it
                    input.send_keys(page)
                    submit.click()
                # text_to_be_present_in_element: wait until the active pager item shows the requested page number
                self.wait.until(EC.text_to_be_present_in_element((
                    By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.active > span'), str(page)))
                # presence_of_element_located on the item selector:
                # wait until the product cards have been rendered
                self.wait.until(EC.presence_of_element_located((
                    By.CSS_SELECTOR, '#mainsrp-itemlist .m-itemlist .grid.g-clearfix .item')))
                return HtmlResponse(url=request.url, body=self.browser.page_source, request=request, encoding='utf-8', status=200)
            except TimeoutException:
                return HtmlResponse(url=request.url, status=500, request=request)
    
        # from_crawler is how Scrapy instantiates the middleware:
        # it reads SELENIUM_TIMEOUT from the settings and passes it to __init__
        @classmethod
        def from_crawler(cls, crawler):
            return cls(timeout=crawler.settings.get('SELENIUM_TIMEOUT'),)
                       # service_args=crawler.settings.get('PHANTOMJS_SERVICE_ARGS'))
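
    The commented-out headless setup in __init__ uses the chrome_options keyword, which recent Selenium releases have removed in favour of options. A minimal sketch of the same idea, assuming Selenium 4 and a reasonably current Chrome:

    from selenium import webdriver

    options = webdriver.ChromeOptions()
    options.add_argument('--headless=new')        # '--headless' on older Chrome builds
    options.add_argument('--window-size=1400,700')
    browser = webdriver.Chrome(options=options)   # then use browser exactly as above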

    5. Pipeline (storing items in MongoDB)

    import pymongo


    class TaobaospiderPipeline(object):
        # Default pass-through pipeline (left disabled in ITEM_PIPELINES)
        def process_item(self, item, spider):
            return item


    # Store items in MongoDB
    class MongoPipeline(object):

        # MongoDB connection parameters
        def __init__(self, mongo_uri, mongo_db):
            self.mongo_uri = mongo_uri
            self.mongo_db = mongo_db

        # Read MONGO_URI / MONGO_DB from the settings and build the pipeline
        @classmethod
        def from_crawler(cls, crawler):
            return cls(mongo_uri=crawler.settings.get('MONGO_URI'),
                       mongo_db=crawler.settings.get('MONGO_DB'))

        # Open the database connection when the spider starts
        def open_spider(self, spider):
            self.client = pymongo.MongoClient(self.mongo_uri)
            self.db = self.client[self.mongo_db]

        # Close the connection when the spider finishes
        def close_spider(self, spider):
            self.client.close()

        # Write each item to a collection named after its item class
        def process_item(self, item, spider):
            # item.__class__.__name__ is the item's class name, e.g. 'TaobaospiderItem'
            name = item.__class__.__name__
            print('---------------name', name, '-------------------')
            self.db[name].insert_one(dict(item))
            return item
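
    With everything in place, the crawl is started from the project directory with "scrapy crawl taobao". A quick way to look at what landed in MongoDB afterwards, as a sketch that assumes the collection name MongoPipeline derives from the item class:

    import pymongo

    db = pymongo.MongoClient('localhost')['taobao']
    for doc in db['TaobaospiderItem'].find().limit(3):
        print(doc.get('title'), doc.get('price'), doc.get('location'))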