  • Scraping dynamic web pages with Scrapy

    The spider below combines Scrapy's CrawlSpider with a Selenium-driven Firefox browser, so pages whose content is rendered by JavaScript can be loaded in a real browser before the data is extracted.

    # -*- coding: utf-8 -*-
    '''
    gouwu.sogou.com Spider, created on Dec, 2014

    version: 1.0

    author: chenqx @http://chenqx.github.com

    See more: http://doc.scrapy.org/en/latest/index.html
    '''
    import time

    from scrapy.selector import Selector
    from scrapy.http import Request
    from scrapy.contrib.spiders import CrawlSpider
    from scrapy.contrib.loader import ItemLoader
    from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
    from selenium import webdriver

    from etao.items import EtaoItem
    from etao.lstData import lstData


    class etaoSpider(CrawlSpider):
        # name of the spider
        name = 'Spider'
        allowed_domains = ['gouwu.sogou.com']
        # one start URL per search word supplied by lstData
        start_urls = [('http://gouwu.sogou.com/shop?query=' + searchWord) for searchWord in lstData().lst]
        link_extractor = {
            'page':      SgmlLinkExtractor(allow=r'/detail/\d+\.html.+'),
            'page_down': SgmlLinkExtractor(allow=r'/shop\?query=.+'),  # restrict_xpaths='//a[@class="pagination-next"]'
        }
        _x_query = {
            'title': '//p[@class="title"]/a/@title',
            'name':  '//span[@class="floatR hui61 mt1"]/text()',   # or: '//li[2]/a/div[@class="ruyitao-market-name ruyitao-market-name-hightlight"]/text()'
            'price': '//span[@class="shopprice font17"]/text()',   # or: '//span[@class="price"]/text()'
        }

        def __init__(self):
            CrawlSpider.__init__(self)
            # use any browser you wish
            self.browser = webdriver.Firefox()

        def __del__(self):
            self.browser.close()

        def parse(self, response):
            # follow every result-list page
            for link in self.link_extractor['page_down'].extract_links(response):
                yield Request(url=link.url, callback=self.parse)
            # open the page in the browser
            self.browser.get(response.url)
            # give the page (and its JavaScript) time to load
            time.sleep(5)
            # get the data and write it to scrapy items
            etaoItem_loader = ItemLoader(item=EtaoItem(), response=response)
            url = str(response.url)
            etaoItem_loader.add_value('url', url)
            etaoItem_loader.add_xpath('title', self._x_query['title'])
            etaoItem_loader.add_xpath('name', self._x_query['name'])
            etaoItem_loader.add_xpath('price', self._x_query['price'])
            yield etaoItem_loader.load_item()
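
    The spider depends on two project modules that the post does not show: etao.items.EtaoItem and etao.lstData.lstData. Below is a minimal sketch of what they might look like, inferred only from how they are used above; the field names follow the loader calls, and the concrete search words are made up for illustration.

    # etao/items.py -- item with the fields the loader fills in
    from scrapy.item import Item, Field

    class EtaoItem(Item):
        url = Field()
        title = Field()
        name = Field()
        price = Field()

    # etao/lstData.py -- supplies the list of search words used to build start_urls
    class lstData(object):
        def __init__(self):
            # hypothetical examples; the original word list is not included in the post
            self.lst = ['iphone', 'ipad', 'macbook']

    Note also that the ItemLoader above is built from the original Scrapy response rather than from the Selenium-rendered page; if the target fields are only filled in by JavaScript, one could instead parse self.browser.page_source after the sleep, for example with Selector(text=self.browser.page_source), assuming a Scrapy version that supports the text argument.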

  • Original article: https://www.cnblogs.com/yunpiao111/p/6044392.html