zoukankan      html  css  js  c++  java
  • scrapy 抓取动态网页

    # -*- coding: utf-8 -*-

    '''
    gouwu.sogou.com Spider, Created on Dec, 2014

    version: 1.0

    author: chenqx @http://chenqx.github.com

    See more: http://doc.scrapy.org/en/latest/index.html
    '''
    import time
    from scrapy.selector import Selector
    from scrapy.http import Request
    from scrapy.contrib.spiders import CrawlSpider
    from scrapy.contrib.loader import ItemLoader
    from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
    from etao.items import EtaoItem
    from etao.lstData import lstData
    from selenium import webdriver
    class etaoSpider(CrawlSpider):
        """Spider for gouwu.sogou.com search-result pages rendered by JavaScript.

        Each start URL is a shop search for one entry of ``lstData().lst``
        (presumably search keywords -- confirm against ``etao.lstData``).
        Every fetched page is re-opened in a Selenium-driven browser so that
        the XPath queries in ``_x_query`` run against the rendered DOM rather
        than the raw HTML that Scrapy downloaded.

        NOTE(review): Scrapy's documentation warns against overriding
        ``parse`` on a ``CrawlSpider`` because the rule machinery uses it.
        No ``rules`` are defined here, so the override is harmless, but a
        plain ``Spider`` base class would express the intent better.
        """

        # name of spiders
        name = 'Spider'
        # Scrapy's offsite filtering reads ``allowed_domains``; the original
        # spelling is kept as an alias in case other code refers to it.
        allowed_domains = ['gouwu.sogou.com']
        allow_domain = allowed_domains
        start_urls = [
            ('http://gouwu.sogou.com/shop?query=' + searchWord)
            for searchWord in lstData().lst
        ]

        # Raw strings with escaped metacharacters: ``\d`` matches digits and
        # ``\?`` / ``\.`` match the literal characters in the URL.
        link_extractor = {
            'page': SgmlLinkExtractor(allow=r'/detail/\d+\.html.+'),
            # Could be narrowed with:
            #   restrict_xpaths='//a[@class = "pagination-next"]'
            'page_down': SgmlLinkExtractor(allow=r'/shop\?query=.+'),
        }

        # XPath expressions used to fill the item fields of the same name.
        _x_query = {
            'title': '//p[@class="title"]/a/@title',
            'name': '//span[@class="floatR hui61 mt1"]/text()',
            'price': '//span[@class="shopprice font17"]/text()',
        }

        def __init__(self, *args, **kwargs):
            """Initialise the spider and start the browser used for rendering.

            Positional and keyword arguments are forwarded to ``CrawlSpider``
            so that spider arguments supplied by Scrapy keep working.
            """
            CrawlSpider.__init__(self, *args, **kwargs)
            # use any browser you wish
            self.browser = webdriver.Firefox()

        def __del__(self):
            """Shut the browser down when the spider object is collected."""
            # ``browser`` is missing if ``__init__`` failed before creating it.
            browser = getattr(self, 'browser', None)
            if browser is not None:
                # quit() ends the whole driver session, not just the window.
                browser.quit()

        def parse(self, response):
            """Follow result-page links and extract one item from the page.

            Yields a ``Request`` (handled by this same method) for every link
            matched by the ``page_down`` extractor, then one ``EtaoItem``
            populated from the browser-rendered version of ``response.url``.
            """
            # crawl all display page
            for link in self.link_extractor['page_down'].extract_links(response):
                yield Request(url=link.url, callback=self.parse)

            # start browser
            self.browser.get(response.url)
            # loading time interval
            # NOTE(review): this blocks the crawler for the whole wait; an
            # explicit Selenium wait on a known element would be tighter.
            time.sleep(5)

            # Query the DOM as rendered by the browser; the raw ``response``
            # body does not contain content added by JavaScript.
            rendered = Selector(text=self.browser.page_source)

            # get the data and write it to scrapy items
            etaoItem_loader = ItemLoader(item=EtaoItem(), selector=rendered)
            etaoItem_loader.add_value('url', str(response.url))
            for field in ('title', 'name', 'price'):
                etaoItem_loader.add_xpath(field, self._x_query[field])
            yield etaoItem_loader.load_item()

  • 相关阅读:
    (转) [组合数学] 第一类,第二类Stirling数,Bell数
    The Heaviest Non-decreasing Subsequence Problem
    高斯枚举自由元小板子 (待测试)
    gperftools::TCMalloc
    tinyxml2
    pugixml
    std::weak_ptr
    enable_shared_from_this
    mfc字符转码
    std::multiset
  • 原文地址:https://www.cnblogs.com/yunpiao111/p/6044392.html
Copyright © 2011-2022 走看看