  • Python Scrapy series (Part 3)

    1. Create the project (cmd)

    scrapy startproject xxxx
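
    This creates the standard project skeleton:

    xxxx/
        scrapy.cfg            # deploy configuration file
        xxxx/                 # the project's Python module
            __init__.py
            items.py
            middlewares.py
            pipelines.py
            settings.py
            spiders/
                __init__.py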

    2. Write the item file (items.py)

    # -*- coding: utf-8 -*-
    
    # Define here the models for your scraped items
    #
    # See documentation in:
    # https://docs.scrapy.org/en/latest/topics/items.html
    
    from scrapy import Field, Item
    
    
    class YouyuanItem(Item):
        # define the fields for your item here like:
        # name = scrapy.Field()
        # username
        username = Field()
        # age
        age = Field()
        # avatar image URL
        header_url = Field()
        # album image URLs
        image_url = Field()
        # self-introduction
        content = Field()
        # hometown
        place_from = Field()
        # education
        education = Field()
        # hobbies
        hobby = Field()
        # profile page URL
        source_url = Field()
        # data source
        source = Field()
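
    An Item behaves like a dict whose keys are restricted to the declared
    fields; a minimal usage sketch:

    from youyuan.items import YouyuanItem

    item = YouyuanItem()
    item['username'] = 'test'    # declared fields can be set like dict keys
    # item['nickname'] = 'x'     # undeclared fields raise KeyError
    print(item['username'])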

    3. Write the settings file (settings.py)

    # -*- coding: utf-8 -*-
    
    # Scrapy settings for youyuan project
    #
    # For simplicity, this file contains only settings considered important or
    # commonly used. You can find more settings consulting the documentation:
    #
    #     https://docs.scrapy.org/en/latest/topics/settings.html
    #     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
    #     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
    
    BOT_NAME = 'youyuan'
    
    SPIDER_MODULES = ['youyuan.spiders']
    NEWSPIDER_MODULE = 'youyuan.spiders'
    
    
    # Crawl responsibly by identifying yourself (and your website) on the user-agent
    #USER_AGENT = 'youyuan (+http://www.yourdomain.com)'
    
    # Obey robots.txt rules
    #ROBOTSTXT_OBEY = True
    
    # Configure maximum concurrent requests performed by Scrapy (default: 16)
    #CONCURRENT_REQUESTS = 32
    
    # Configure a delay for requests for the same website (default: 0)
    # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
    # See also autothrottle settings and docs
    DOWNLOAD_DELAY = 3
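    # Note: RANDOMIZE_DOWNLOAD_DELAY is enabled by default, so the actual wait
    # is a random value between 0.5x and 1.5x of DOWNLOAD_DELAY.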
    # The download delay setting will honor only one of:
    #CONCURRENT_REQUESTS_PER_DOMAIN = 16
    #CONCURRENT_REQUESTS_PER_IP = 16
    
    # Disable cookies (enabled by default)
    #COOKIES_ENABLED = False
    
    # Disable Telnet Console (enabled by default)
    #TELNETCONSOLE_ENABLED = False
    
    # Override the default request headers:
    #DEFAULT_REQUEST_HEADERS = {
    #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    #   'Accept-Language': 'en',
    #}
    
    # Enable or disable spider middlewares
    # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
    #SPIDER_MIDDLEWARES = {
    #    'youyuan.middlewares.YouyuanSpiderMiddleware': 543,
    #}
    
    # Enable or disable downloader middlewares
    # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
    #DOWNLOADER_MIDDLEWARES = {
    #    'youyuan.middlewares.YouyuanDownloaderMiddleware': 543,
    #}
    
    # Enable or disable extensions
    # See https://docs.scrapy.org/en/latest/topics/extensions.html
    #EXTENSIONS = {
    #    'scrapy.extensions.telnet.TelnetConsole': None,
    #}
    
    # Configure item pipelines
    # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
    ITEM_PIPELINES = {
        'youyuan.pipelines.YouyuanPipeline': 300,
    }
    
    # Enable and configure the AutoThrottle extension (disabled by default)
    # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
    #AUTOTHROTTLE_ENABLED = True
    # The initial download delay
    #AUTOTHROTTLE_START_DELAY = 5
    # The maximum download delay to be set in case of high latencies
    #AUTOTHROTTLE_MAX_DELAY = 60
    # The average number of requests Scrapy should be sending in parallel to
    # each remote server
    #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
    # Enable showing throttling stats for every response received:
    #AUTOTHROTTLE_DEBUG = False
    
    # Enable and configure HTTP caching (disabled by default)
    # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
    #HTTPCACHE_ENABLED = True
    #HTTPCACHE_EXPIRATION_SECS = 0
    #HTTPCACHE_DIR = 'httpcache'
    #HTTPCACHE_IGNORE_HTTP_CODES = []
    #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
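
    The settings above enable youyuan.pipelines.YouyuanPipeline, which this
    post does not show. A minimal sketch that writes each item as one JSON
    line (the output file name is just an illustration) could look like:

    # pipelines.py
    import json


    class YouyuanPipeline(object):
        def open_spider(self, spider):
            self.file = open('youyuan.json', 'w', encoding='utf-8')

        def process_item(self, item, spider):
            # serialize the item as one JSON object per line
            self.file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
            return item

        def close_spider(self, spider):
            self.file.close()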

    4. Create a custom spider in the spiders directory (cmd)

    scrapy genspider demo 'www.xxxx.com'
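
    Since the spider below extends CrawlSpider, it can also be generated from
    the crawl template directly:

    scrapy genspider -t crawl yy xxxx.com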

    Then edit the generated spider file:

    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule
    from youyuan.items import YouyuanItem
    import re
    
    
    class YySpider(CrawlSpider):
        name = 'yy'
        allowed_domains = ['xxxx.com']
        start_urls = ['http://www.xxxx.com/find/beijing/mm18-25/advance-0-0-0-0-0-0-0/p1/']
    
        # link extractor for the pagination links
        page_links = LinkExtractor(allow=(r"xxxx.com/find/beijing/mm18-25/advance-0-0-0-0-0-0-0/p\d+/"))
        # link extractor for each profile page
        profile_links = LinkExtractor(allow=(r"xxxx.com/\d+-profile/"))
    
        rules = (
            # no callback, so matched pages are only followed for more links
            Rule(page_links),
            # each matched profile page is parsed by parse_item
            Rule(profile_links, callback='parse_item'),
        )
    
        def parse_item(self, response):
            item = YouyuanItem()
            # username
            item['username'] = self.get_username(response)
            # age
            item['age'] = self.get_age(response)
            # avatar image URL
            item['header_url'] = self.get_header_url(response)
            # album image URLs
            item['image_url'] = self.get_image_url(response)
            # self-introduction
            item['content'] = self.get_content(response)
            # hometown
            item['place_from'] = self.get_place_from(response)
            # education
            item['education'] = self.get_education(response)
            # hobbies
            item['hobby'] = self.get_hobby(response)
            # profile page URL
            item['source_url'] = response.url
            # data source
            item['source'] = "youyuan"
    
            yield item
    
        def get_username(self, response):
            username = response.xpath("//dl[@class='personal_cen']//div[@class='main']/strong/text()").extract()
            if len(username):
                username = username[0]
            else:
                username = "NULL"
            return username.strip()
    
        def get_age(self, response):
            age = response.xpath("//dl[@class='personal_cen']//dd/p/text()").extract()
            if len(age):
                # pull the "NN岁" (age) fragment out of the text
                matched = re.findall(r"\d+岁", age[0])
                age = matched[0] if matched else "NULL"
            else:
                age = "NULL"
            return age.strip()
    
        def get_header_url(self, response):
            header_url = response.xpath("//dl[@class='personal_cen']//dt/img/@src").extract()
            if len(header_url):
                header_url = header_url[0]
            else:
                header_url = "NULL"
            return header_url.strip()
    
        def get_image_url(self, response):
            # keep the full list of album image URLs
            image_url = response.xpath("//div[@class='ph_show']/ul/li/a/img/@src").extract()
            if not image_url:
                image_url = "NULL"
            return image_url
    
        def get_content(self, response):
            content = response.xpath("//div[@class='pre_data']/ul/li/p/text()").extract()
            if len(content):
                content = content[0]
            else:
                content = "NULL"
            return content.strip()
    
        def get_place_from(self, response):
            place_from = response.xpath("//div[@class='pre_data']/ul/li[2]//ol[1]/li[1]/span/text()").extract()
            if len(place_from):
                place_from = place_from[0]
            else:
                place_from = "NULL"
            return place_from.strip()
    
        def get_education(self, response):
            education = response.xpath("//div[@class='pre_data']/ul/li[3]//ol[2]/li[2]/span/text()").extract()
            if len(education):
                education = education[0]
            else:
                education = "NULL"
            return education.strip()
    
        def get_hobby(self, response):
            hobby = response.xpath("//dl[@class='personal_cen']//ol/li/text()").extract()
            if len(hobby):
                hobby = ",".join(hobby).replace(" ", "")
            else:
                hobby = "NULL"
            return hobby.strip()
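
    Tip: when adapting the XPath expressions to the real page structure,
    scrapy shell lets you test them interactively against a live response
    (the URL below is a placeholder):

    scrapy shell "http://www.xxxx.com/some-profile/"
    >>> response.xpath("//dl[@class='personal_cen']//dd/p/text()").extract()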

    5. Run the crawl (cmd)

    scrapy crawl yy
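
    The yielded items can also be exported directly via a feed export flag,
    e.g. as JSON:

    scrapy crawl yy -o items.json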

    OVER!
