zoukankan      html  css  js  c++  java
  • Crawley框架

    crawley startproject crawley_spider

    cd crawley_spider

    # Models:相当于 Scrapy 中的 Item,用于定义要保存的数据结构
    from crawley.persistance import Entity, UrlEntity, Field, Unicode
    
    class Package(Entity):
        """Crawley persistence entity describing one scraped package row.

        Each attribute is declared as ``Field(Unicode(255))``. The scraper in
        this walkthrough creates one instance per table row it extracts.
        """
    
        # Text of the row's first cell (the "updated" column of the table).
        updated = Field(Unicode(255))
        # Text of the link found inside the row's second cell.
        package = Field(Unicode(255))
        # Text of the row's third cell.
        # NOTE(review): 255 is presumably a maximum length -- confirm how
        # longer descriptions are handled by the configured database.
        description = Field(Unicode(255))
    # Crawlers:
    from crawley.crawlers import BaseCrawler
    from crawley.scrapers import BaseScraper
    from crawley.extractors import XPathExtractor
    from models import *
    
    class pypiScraper(BaseScraper):
        """Scraper that creates one ``Package`` per row of the package table.

        The table is located with an absolute XPath, so any change in the page
        layout makes the lookup return nothing; such pages are skipped rather
        than aborting the crawl with an ``IndexError``.
        """

        # Specify the urls that can be scraped by this class.
        # NOTE(review): "%" is presumably a match-everything wildcard --
        # confirm against the Crawley documentation.
        matching_urls = ["%"]

        def scrape(self, response):
            """Extract the package rows from ``response`` and store them.

            Args:
                response: the crawled page. ``response.html`` must provide an
                    ``xpath`` method returning a list of matching nodes.

            Returns:
                None. One ``Package`` is created for every well-formed row.
                Pages without the expected table, and rows that lack any of
                the three cells or the link inside the package cell, are
                skipped silently.
            """
            # Getting the html table; the result is empty when the page does
            # not have the expected layout.
            tables = response.html.xpath("/html/body/div[5]/div/div/div[3]/table")
            if not tables:
                return
            table = tables[0]

            # For rows 1 to n-1: the first and the last row are left out
            # (presumably header and footer rows -- verify against the page).
            for tr in table[1:-1]:
                # Obtaining the searched html inside the row. A row with
                # fewer than three cells, or a package cell without a child
                # link, raises IndexError and is skipped.
                try:
                    td_updated = tr[0]
                    td_package = tr[1]
                    package_link = td_package[0]
                    td_description = tr[2]
                except IndexError:
                    continue

                # Storing data in the Packages table: per the original note,
                # creating the entity is what records the row.
                Package(
                    updated=td_updated.text,
                    package=package_link.text,
                    description=td_description.text,
                )
    
    
    class pypiCrawler(BaseCrawler):
        """Crawler configuration for the PyPI package index walkthrough.

        Only class-level settings are declared here; the crawling behaviour
        itself comes from ``BaseCrawler``.
        """
    
        # URLs the crawl starts from.
        start_urls = ["http://pypi.python.org/pypi"]
    
        # Scraper classes applied to the crawled pages.
        scrapers = [pypiScraper]
    
        # Maximum crawling depth level.
        # NOTE(review): 0 presumably means only the start URLs are fetched and
        # no links are followed -- confirm against the Crawley documentation.
        max_depth = 0
    
        # HTML parsing tool used for responses; the scraper above calls
        # ``response.html.xpath(...)``, which matches the XPath extractor.
        extractor = XPathExtractor

    pypiScraper 类内部定义了 scrape 方法。该方法使用 XPath 从解析后的 HTML 中定位表格并逐行提取数据,然后将提取的数据通过 Package 实体存储到对应的数据表中。

    # settings.py
    import os
    # Absolute path of the directory that contains this settings module.
    PATH = os.path.split(os.path.abspath(__file__))[0]

    # Leave the project name alone unless the project has been renamed;
    # PROJECT_ROOT is derived from it.
    # NOTE(review): the walkthrough creates the project as "crawley_spider" --
    # confirm that "pypi" is the intended name here.
    PROJECT_NAME = "pypi"
    PROJECT_ROOT = os.path.join(PATH, PROJECT_NAME)

    # Database settings: engine and database name are set, the remaining
    # connection fields are left empty.
    DATABASE_ENGINE = "sqlite"
    DATABASE_NAME = "pypi"
    DATABASE_USER = DATABASE_PASSWORD = DATABASE_HOST = DATABASE_PORT = ""

    # Debug output switch.
    SHOW_DEBUG_INFO = True

    运行爬虫:crawley  run

  • 相关阅读:
    九九乘法表及双色球
    错误 “SCRIPT7002: XMLHttpRequest: 网络错误 0x2ef3, ie浏览器兼容问题
    隐藏ie input的X和眼睛图标
    vue-cli解决兼容ie的es6+api问题
    git 本地tag和远程tag对应不上 vscode里pull不下代码
    git 计算commit
    git 查看对比的方法log diff
    git 版本回退方法
    git rebase的使用
    git 常规操作
  • 原文地址:https://www.cnblogs.com/xuezhihao/p/11671986.html
Copyright © 2011-2022 走看看