  • Scrapy: crawling dynamic web pages

    # -*- coding: utf-8 -*-
    '''
    gouwu.sogou.com Spider, Created on Dec, 2014

    version: 1.0

    author: chenqx @http://chenqx.github.com

    See more: http://doc.scrapy.org/en/latest/index.html
    '''
    import time
    from scrapy.selector import Selector
    from scrapy.http import Request
    from scrapy.contrib.spiders import CrawlSpider
    from scrapy.contrib.loader import ItemLoader
    from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
    from etao.items import EtaoItem
    from etao.lstData import lstData
    from selenium import webdriver

    class etaoSpider(CrawlSpider):
        # name of the spider
        name = 'Spider'
        allowed_domains = ['gouwu.sogou.com']
        # one start URL per search word supplied by lstData
        start_urls = [('http://gouwu.sogou.com/shop?query=' + searchWord) for searchWord in lstData().lst]
        link_extractor = {
            'page': SgmlLinkExtractor(allow='/detail/\d+\.html.+'),
            'page_down': SgmlLinkExtractor(allow='/shop\?query=.+'),  # restrict_xpaths='//a[@class="pagination-next"]'
        }
        _x_query = {
            'title': '//p[@class="title"]/a/@title',
            'name': '//span[@class="floatR hui61 mt1"]/text()',  # or '//li[2]/a/div[@class="ruyitao-market-name ruyitao-market-name-hightlight"]/text()'
            'price': '//span[@class="shopprice font17"]/text()',  # or '//span[@class="price"]/text()'
        }

        def __init__(self):
            CrawlSpider.__init__(self)
            # use any browser you wish
            self.browser = webdriver.Firefox()

        def __del__(self):
            self.browser.close()

        def parse(self, response):
            # crawl all listing pages
            for link in self.link_extractor['page_down'].extract_links(response):
                yield Request(url=link.url, callback=self.parse)
            # render the page in the browser
            self.browser.get(response.url)
            # give the page's JavaScript time to load
            time.sleep(5)
            # get the data and write it to Scrapy items
            etaoItem_loader = ItemLoader(item=EtaoItem(), response=response)
            url = str(response.url)
            etaoItem_loader.add_value('url', url)
            etaoItem_loader.add_xpath('title', self._x_query['title'])
            etaoItem_loader.add_xpath('name', self._x_query['name'])
            etaoItem_loader.add_xpath('price', self._x_query['price'])
            yield etaoItem_loader.load_item()
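
    The spider above imports two project modules that are not shown in the post (etao.items.EtaoItem and etao.lstData.lstData), and although it opens every page in Firefox it still hands the original Scrapy response to the ItemLoader, so the XPaths run against the raw HTML rather than the JavaScript-rendered DOM. The sketch below is a hypothetical reconstruction, not the author's code: it defines an item with the four fields the loader fills, a minimal lstData holder for the search words, and a small helper that wraps the Selenium-rendered page source in a Scrapy Selector so the same XPath queries can be evaluated against the rendered DOM.

        # -*- coding: utf-8 -*-
        # Hypothetical supporting code; the field names and the lstData shape
        # are assumptions inferred from the spider above, not the original project.
        import time

        from scrapy.item import Item, Field
        from scrapy.selector import Selector


        class EtaoItem(Item):
            # fields inferred from the ItemLoader calls in the spider
            url = Field()
            title = Field()
            name = Field()
            price = Field()


        class lstData(object):
            # assumed shape: exposes a .lst attribute holding the search words
            def __init__(self):
                self.lst = ['shoes', 'phone', 'camera']  # placeholder keywords


        def selector_from_browser(browser, url, wait_seconds=5):
            '''Load url in the Selenium browser, give its JavaScript time to run,
            and wrap the rendered HTML in a Scrapy Selector.'''
            browser.get(url)
            time.sleep(wait_seconds)  # crude fixed wait, mirroring the spider
            return Selector(text=browser.page_source)

        # Possible use inside parse(), so the queries see the rendered page:
        #   sel = selector_from_browser(self.browser, response.url)
        #   title = sel.xpath(self._x_query['title']).extract()

    With the project modules in place, the spider can be run in the usual way (scrapy crawl Spider, assuming a standard Scrapy project layout). A fixed five-second sleep is the simplest wait; Selenium's WebDriverWait gives more reliable control over when the dynamic content has finished loading.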

  • Original article: https://www.cnblogs.com/yunpiao111/p/6044392.html