  • Learning Python: creating a spider with the Scrapy framework

    A quick overview of Scrapy

    Scrapy is a framework.
            The difference between a framework and a third-party library:
            a library is something you simply call from your own code,
            while a framework drives the program itself and does much of the work automatically; you only fill in your own logic.
    Commands:
    
    Create a project:
    
    cd into the directory where the project should be created,
    
    scrapy startproject stock_spider
    
    where stock_spider is the project name
    
    
    Create a spider
    
    cd  ./stock_spider/spiders
    
    scrapy genspider tonghuashun "http://basic.10jqka.com.cn/600004/company.html"
    
    where tonghuashun is the spider's name,
    
    and "http://basic.10jqka.com.cn/600004/company.html" is the start URL for the spider

    Commands to run

    1. Create a project:

    cd into the directory where the project should be created
    
    scrapy startproject my_spider

    2. Create a simple spider

    cd  ./stock_spider/spiders
    
    scrapy genspider tonghuashun "http://basic.10jqka.com.cn/600004/company.html"
    
    where tonghuashun is the spider's name,
    
    and "http://basic.10jqka.com.cn/600004/company.html" is the start URL for the spider

    tonghuashun.py code:

    import scrapy
    
    
    class TonghuashunSpider(scrapy.Spider):
        name = 'tonghuashun'
        allowed_domains = ['basic.10jqka.com.cn']  # only the domain, not a full URL
        start_urls = ['http://basic.10jqka.com.cn/600004/company.html']
    
        def parse(self, response):
    
            # //*[@id="maintable"]/tbody/tr[1]/td[2]/a
            # res_selector = response.xpath('//*[@id="maintable"]/tbody/tr[1]/td[2]/a')
            # print(res_selector)
    
            # /Users/eddy/PycharmProjects/helloWord/stock_spider/stock_spider/spiders
    
            res_selector = response.xpath('//*[@id="ml_001"]/table/tbody/tr[1]/td[1]/a/text()')
    
            name = res_selector.extract()
    
            print(name)
    
            tc_names = response.xpath('//*[@class="tc name"]/a/text()').extract()
    
            for tc_name in tc_names:
                print(tc_name)
    
            positions = response.xpath('//*[@class="tl"]/text()').extract()
    
            for position in positions:
                print(position)
    
            pass
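
    To actually run the spider, call Scrapy's crawl command from the project directory (the optional -o flag writes the scraped items to a file):

    scrapy crawl tonghuashun
    # or, dumping the items to JSON:
    scrapy crawl tonghuashun -o tonghuashun.json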

    XPath notes:

    '''
    xpath
    /   selects from the root node
    //  selects matching nodes anywhere in the document, regardless of their position
    .   selects the current node
    ..  selects the parent of the current node
    @   selects an attribute
    
    body/div    selects all div elements that are children of body
    //div       selects all div elements, no matter where they appear in the HTML
    
    @lang  selects all attributes named lang
    
    Wildcards
    
    *  matches any element node
    @* matches any attribute node
    
    //* selects all elements in the document
    
    //title[@*]  selects all title elements that have at least one attribute
    
    |
    in XPath, | is the union operator: it combines the results of two selections
    
    //body/div | //body/li  selects all div elements and all li elements inside body
    
    
    '''
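
    A small standalone sketch of these XPath rules, using Scrapy's Selector on an invented HTML snippet (the HTML below is made up purely for illustration):

    from scrapy.selector import Selector

    html = """
    <html><body>
      <div lang="zh"><a href="/a">first</a></div>
      <div><a href="/b">second</a></div>
      <li>item</li>
    </body></html>
    """

    sel = Selector(text=html)

    print(sel.xpath("//div/a/text()").extract())          # ['first', 'second'] -- // searches the whole document
    print(sel.xpath("/html/body/li/text()").extract())    # ['item']            -- / walks down from the root
    print(sel.xpath("//div/@lang").extract())             # ['zh']              -- @ selects an attribute
    print(sel.xpath("//body/div | //body/li").extract())  # | unions the two selections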
    Using the scrapy shell:
    '''
    Using the scrapy shell
    
    it lets you see directly, in the console, what your selectors match
    
    command:
    scrapy shell http://basic.10jqka.com.cn/600004/company.html
    
    
    inspect a specific element:
    response.xpath('//*[@id="ml_001"]/table/tbody/tr[1]/td[1]/a/text()').extract()
    
    
    all elements with class="tc name":
    response.xpath('//*[@class="tc name"]').extract()
    
    the text of the a tags under all elements with class="tc name":
    response.xpath('//*[@class="tc name"]/a/text()').extract()
    
    ['邱嘉臣', '刘建强', '马心航', '张克俭', '关易波', '许汉忠', '毕井双', '饶品贵', '谢泽煌', '梁慧', '袁海文', '邱嘉臣', '戚耀明', '武宇', '黄浩', '王晓勇', '于洪才', '莫名贞', '谢冰心']
    
    
    '''
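
    As a side note, the same query can be written with Scrapy's CSS selectors, which sidesteps the quote-nesting problem (class="tc name" means the element carries both the tc and name classes):

    response.css('.tc.name a::text').extract()   # usually equivalent to the XPath version above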

    Using the Scrapy framework in a crawler

    In the project created above, cd into the spiders directory, which is the package that holds the spider classes.

    Example 2:
    cd  ./stock_spider/spiders
    
    scrapy genspider stock "pycs.greedyai.com"

    stock.py:
    # -*- coding: utf-8 -*-
    import scrapy
    import re
    
    from urllib import parse
    from ..items import MySpiderItem2
    
    class StockSpider(scrapy.Spider):
        name = 'stock'
        allowed_domains = ['pycs.greedyai.com']
        start_urls = ['http://pycs.greedyai.com']
    
        def parse(self, response):
            hrefs = response.xpath("//a/@href").extract()
    
            for href in hrefs:
                yield scrapy.Request(url= parse.urljoin(response.url, href), callback=self.parse_detail, dont_filter=True)
    
    
        def parse_detail(self,response):
    
            stock_item = MySpiderItem2()
    
            # board member names
            stock_item["names"] = self.get_tc(response)
    
            # gender of each member
            stock_item["sexes"] = self.get_sex(response)
    
            # age of each member
            stock_item["ages"] = self.get_age(response)
    
            # stock code of the page
            stock_item["codes"] = self.get_cod(response)
    
            # position / title of each member
            stock_item["leaders"] = self.get_leader(response, len(stock_item["names"]))
    
            yield stock_item
            # the yielded item is passed on to the item pipelines for processing
    
    
        def get_tc(self, response):
            names = response.xpath('//*[@class="tc name"]/a/text()').extract()
            return names
    
        def get_sex(self, response):
            # //*[@id="ml_001"]/table/tbody/tr[1]/td[1]/div/table/thead/tr[2]/td[1]
            infos = response.xpath('//*[@class="intro"]/text()').extract()
            sex_list = []
            for info in infos:
                try:
                    sex = re.findall("[男女]", info)[0]
                    sex_list.append(sex)
                except IndexError:
                    continue
    
            return sex_list
    
        def get_age(self, response):
            infos = response.xpath('//*[@class="intro"]/text()').extract()
            age_list = []
            for info in infos:
                try:
                    age = re.findall(r"\d+", info)[0]
                    age_list.append(age)
                except IndexError:
                    continue
    
            return age_list
    
        def get_cod(self, response):
            codes = response.xpath("/html/body/div[3]/div[1]/div[2]/div[1]/h1/a/@title").extract()
            code_list = []
            for info in codes:
                code = re.findall(r"\d+", info)[0]
                code_list.append(code)
    
            return code_list
    
        def get_leader(self, response, length):
            tc_leaders = response.xpath('//*[@class="tl"]/text()').extract()
            tc_leaders = tc_leaders[0 : length]
            return tc_leaders
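
    To see what get_sex and get_age extract, here is a tiny standalone check of the same regular expressions on a made-up intro string (the sample text is invented, not scraped):

    import re

    info = "邱嘉臣:男,56岁,硕士学历"   # invented sample in the style of the class="intro" cells

    print(re.findall("[男女]", info))   # ['男'] -> the first match becomes the gender
    print(re.findall(r"\d+", info))     # ['56'] -> the first run of digits becomes the age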
    items.py:
    import scrapy
    
    
    class MySpiderItem(scrapy.Item):
        # define the fields for your item here like:
        # name = scrapy.Field()
        pass
    
    class MySpiderItem2(scrapy.Item):
        names = scrapy.Field()
        sexes = scrapy.Field()
        ages = scrapy.Field()
        codes = scrapy.Field()
        leaders = scrapy.Field()
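
    Scrapy Items behave like dictionaries restricted to their declared fields, which is why the spider can fill them with item["names"] = ...; a quick illustration:

    item = MySpiderItem2()
    item["names"] = ["邱嘉臣", "刘建强"]
    item["codes"] = ["600004"]
    print(item["names"])   # ['邱嘉臣', '刘建强']
    print(dict(item))      # an Item converts cleanly to a plain dict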

    Notes:

    The fields of the MySpiderItem2 class in items.py hold the data that the StockSpider class in stock.py scrapes; each filled item is then handed to MySpiderPipeline2 in pipelines.py for processing.
    For that to happen, the pipeline has to be registered in settings.py (in ITEM_PIPELINES, where a lower number means the pipeline runs earlier):
    # -*- coding: utf-8 -*-
    
    # Scrapy settings for my_spider project
    #
    # For simplicity, this file contains only settings considered important or
    # commonly used. You can find more settings consulting the documentation:
    #
    #     https://doc.scrapy.org/en/latest/topics/settings.html
    #     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
    #     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
    
    BOT_NAME = 'my_spider'
    
    SPIDER_MODULES = ['my_spider.spiders']
    NEWSPIDER_MODULE = 'my_spider.spiders'
    
    
    # Crawl responsibly by identifying yourself (and your website) on the user-agent
    #USER_AGENT = 'my_spider (+http://www.yourdomain.com)'
    
    # Obey robots.txt rules
    ROBOTSTXT_OBEY = True
    
    # Configure maximum concurrent requests performed by Scrapy (default: 16)
    #CONCURRENT_REQUESTS = 32
    
    # Configure a delay for requests for the same website (default: 0)
    # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
    # See also autothrottle settings and docs
    #DOWNLOAD_DELAY = 3
    # The download delay setting will honor only one of:
    #CONCURRENT_REQUESTS_PER_DOMAIN = 16
    #CONCURRENT_REQUESTS_PER_IP = 16
    
    # Disable cookies (enabled by default)
    #COOKIES_ENABLED = False
    
    # Disable Telnet Console (enabled by default)
    #TELNETCONSOLE_ENABLED = False
    
    # Override the default request headers:
    #DEFAULT_REQUEST_HEADERS = {
    #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    #   'Accept-Language': 'en',
    #}
    
    # Enable or disable spider middlewares
    # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
    #SPIDER_MIDDLEWARES = {
    #    'my_spider.middlewares.MySpiderSpiderMiddleware': 543,
    #}
    
    # Enable or disable downloader middlewares
    # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
    #DOWNLOADER_MIDDLEWARES = {
    #    'my_spider.middlewares.MySpiderDownloaderMiddleware': 543,
    #}
    
    # Enable or disable extensions
    # See https://doc.scrapy.org/en/latest/topics/extensions.html
    #EXTENSIONS = {
    #    'scrapy.extensions.telnet.TelnetConsole': None,
    #}
    
    # Configure item pipelines
    # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
    ITEM_PIPELINES = {
       'my_spider.pipelines.MySpiderPipeline': 300,
       'my_spider.pipelines.MySpiderPipeline2': 1,
    }
    
    # Enable and configure the AutoThrottle extension (disabled by default)
    # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
    #AUTOTHROTTLE_ENABLED = True
    # The initial download delay
    #AUTOTHROTTLE_START_DELAY = 5
    # The maximum download delay to be set in case of high latencies
    #AUTOTHROTTLE_MAX_DELAY = 60
    # The average number of requests Scrapy should be sending in parallel to
    # each remote server
    #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
    # Enable showing throttling stats for every response received:
    #AUTOTHROTTLE_DEBUG = False
    
    # Enable and configure HTTP caching (disabled by default)
    # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
    #HTTPCACHE_ENABLED = True
    #HTTPCACHE_EXPIRATION_SECS = 0
    #HTTPCACHE_DIR = 'httpcache'
    #HTTPCACHE_IGNORE_HTTP_CODES = []
    #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
    pipelines.py:
    # -*- coding: utf-8 -*-
    
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
    
    import os
    
    class MySpiderPipeline(object):
        def process_item(self, item, spider):
            return item
    
    
    
    class MySpiderPipeline2(object):
    
        '''
        # When the class is instantiated, open (or create) the output file.
    
        # Check whether the file is empty:
        empty:     write the header row first (name, gender, age, stock code, position)
        not empty: just append the data rows
    
        '''
    
        def __init__(self):
    
            # "a+" appends; utf-8 keeps the Chinese text readable on every platform
            self.file = open("executive_prep.csv", "a+", encoding="utf-8")
    
    
        def process_item(self, item, spider):
    
            if os.path.getsize("executive_prep.csv") == 0:
                # empty file: write the header row first
                self.file.write("高管姓名,性别,年龄,股票代码,职位\n")
    
            # then append this item's rows
            self.write_content(item)
    
            self.file.flush()
            return item
    
    
        def write_content(self, item):
    
            names = item["names"]
            sexes = item["sexes"]
            ages = item["ages"]
            codes = item["codes"]
            leaders = item["leaders"]
    
            # one CSV line per board member; codes[0] is the page's single stock code
            for i in range(len(names)):
                line = names[i] + "," + sexes[i] + "," + ages[i] + "," + codes[0] + "," + leaders[i] + "\n"
                self.file.write(line)

    The generated file can be viewed in the same directory.
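
    Run the crawl from the project directory with scrapy crawl stock to produce it. As a side note, a minimal sketch of the same pipeline written with Python's csv module (an alternative sketch, not the author's code) avoids hand-joining strings and handles quoting automatically:

    import csv
    import os

    class MySpiderCsvPipeline(object):
        """Alternative to MySpiderPipeline2: same fields, but csv.writer does the formatting."""

        def open_spider(self, spider):
            need_header = (not os.path.exists("executive_prep.csv")
                           or os.path.getsize("executive_prep.csv") == 0)
            self.file = open("executive_prep.csv", "a+", newline="", encoding="utf-8")
            self.writer = csv.writer(self.file)
            if need_header:
                self.writer.writerow(["高管姓名", "性别", "年龄", "股票代码", "职位"])

        def close_spider(self, spider):
            self.file.close()

        def process_item(self, item, spider):
            # one row per board member; codes[0] is the page's single stock code
            for i in range(len(item["names"])):
                self.writer.writerow([item["names"][i], item["sexes"][i], item["ages"][i],
                                      item["codes"][0], item["leaders"][i]])
            return item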
