zoukankan      html  css  js  c++  java
  • Crawley框架

    crawley startproject crawley_spider

    cd crawley_spider

    # Models:相当于 Scrapy 中的 Item
    from crawley.persistance import Entity, UrlEntity, Field, Unicode
    
    class Package(Entity):
        """Persisted record for one row of the scraped package listing.

        Instances are created by ``pypiScraper.scrape`` with the text of the
        "updated", "package" and "description" cells of a table row.
        """

        #add your table fields here
        # Each field is declared as Unicode(255) -- presumably a Unicode text
        # column limited to 255 characters (TODO confirm against Crawley docs).
        updated = Field(Unicode(255))
        package = Field(Unicode(255))
        description = Field(Unicode(255))
    # Crawlers:
    from crawley.crawlers import BaseCrawler
    from crawley.scrapers import BaseScraper
    from crawley.extractors import XPathExtractor
    from models import *
    
    class pypiScraper(BaseScraper):
        """Scraper that turns the rows of the package listing table into
        ``Package`` entities."""

        # URLs this scraper may handle. "%" presumably acts as a match-all
        # wildcard -- TODO confirm against the Crawley documentation.
        matching_urls = ["%"]

        def scrape(self, response):
            """Extract one ``Package`` per data row of the listing table.

            ``response.html`` is queried with an absolute XPath for the table.
            If the table is absent, or a row does not have the expected
            shape, that page or row is skipped instead of raising
            ``IndexError`` and aborting the whole scrape.
            """

            # The absolute path is tied to the page layout; an empty result
            # means the layout differs from what this scraper expects.
            tables = response.html.xpath("/html/body/div[5]/div/div/div[3]/table")
            if not tables:
                return
            table = tables[0]

            # Rows 1 to n-1: the first and the last child of the table are
            # deliberately left out, exactly as in the original loop.
            for tr in table[1:-1]:

                # A usable row needs at least three cells:
                # updated / package / description.
                if len(tr) < 3:
                    continue

                td_updated = tr[0]
                td_package = tr[1]
                td_description = tr[2]

                # The package name is the text of the first child (the link)
                # of the package cell; skip rows where the cell has none.
                if len(td_package) == 0:
                    continue
                package_link = td_package[0]

                # Instantiating the entity is what records the row; the
                # instance itself is intentionally not kept.
                Package(
                    updated=td_updated.text,
                    package=package_link.text,
                    description=td_description.text,
                )
    
    
    class pypiCrawler(BaseCrawler):
        """Crawler configuration: where to start, which scrapers to run, how
        deep to go and which HTML extractor to use.

        All values are plain class attributes; how ``BaseCrawler`` consumes
        them is not visible in this file.
        """

        # Add your starting URLs here.
        start_urls = ["http://pypi.python.org/pypi"]

        # Add your scraper classes here.
        scrapers = [pypiScraper]

        # Specify your maximum crawling depth level.
        # NOTE(review): 0 presumably restricts the crawl to the start URLs
        # themselves -- confirm against the Crawley documentation.
        max_depth = 0

        # Select your favourite HTML parsing tool.
        extractor = XPathExtractor

    pypiScraper 类内部定义了 scrape 方法:它使用 XPath 从解析后的 HTML 中提取数据,然后将提取的数据存储在 Packages 表中。

    # settings.py
    import os
    # Absolute path of the directory that contains this settings module.
    PATH = os.path.abspath(os.path.dirname(__file__))

    # Leave the project name as-is unless the project has been renamed.
    PROJECT_NAME = "pypi"
    PROJECT_ROOT = os.path.join(PATH, PROJECT_NAME)

    # Database settings: engine and database name are set explicitly ...
    DATABASE_ENGINE = "sqlite"
    DATABASE_NAME = "pypi"
    # ... while user, password, host and port are all left empty.
    DATABASE_USER = DATABASE_PASSWORD = DATABASE_HOST = DATABASE_PORT = ""

    SHOW_DEBUG_INFO = True

    运行爬虫:crawley  run

  • 相关阅读:
    sqlserver2008导出表结构和数据
    使用adb命令对手机进行截屏保存到电脑
    android中控制多点同时触发时间
    使用Androi自带模拟器7.0版本无法安装apk解决
    Android library使用butterknife配置
    使用RadioGroup和fragment搭建项目框架填坑
    【转】BaseAdapter&DataSetObserver通知机制
    【转】读BaseAdapter的一点感悟
    使用Rxjava和Retrofit报错--01
    使用LeakCanary检测内存泄漏
  • 原文地址:https://www.cnblogs.com/xuezhihao/p/11671986.html
Copyright © 2011-2022 走看看