zoukankan      html  css  js  c++  java
  • scrapy 爬取股票

    scrapy 爬取股票

    stock.py

    # -*- coding: utf-8 -*-
    import scrapy
    from items import StockstarItem, StockstarItemLoader
    
    
    class StockSpider(scrapy.Spider):
        """Crawl the A-share ranking pages on quote.stockstar.com, one row per item."""
        name = 'stock'
        allowed_domains = ['quote.stockstar.com']
        start_urls = ['http://quote.stockstar.com/stock/ranklist_a_3_1_1.html']

        # (item field, CSS selector relative to a table row)
        _FIELD_SELECTORS = (
            ("code", "td:nth-child(1) a::text"),
            ("abbr", "td:nth-child(2) a::text"),
            ("last_trade", "td:nth-child(3) span::text"),
            ("chg_ratio", "td:nth-child(4) span::text"),
            ("chg_amt", "td:nth-child(5) span::text"),
            ("chg_ratio_5min", "td:nth-child(6) span::text"),
            ("volumn", "td:nth-child(7)::text"),
            ("turn_over", "td:nth-child(8)::text"),
        )

        def parse(self, response):
            """Yield one StockstarItem per table row, then request the next page."""
            # The page number is the last "_"-separated token of the URL,
            # e.g. ".../ranklist_a_3_1_1.html" -> 1.
            current_page = int(response.url.split("_")[-1].split(".")[0])

            rows = response.css('#datalist tr')
            for row in rows:
                loader = StockstarItemLoader(item=StockstarItem(), selector=row)
                for field, css in self._FIELD_SELECTORS:
                    loader.add_css(field, css)
                yield loader.load_item()

            # Keep paginating as long as the current page produced rows.
            if rows:
                next_url = response.url.replace(
                    "{0}.html".format(current_page),
                    "{0}.html".format(current_page + 1),
                )
                yield scrapy.Request(url=next_url, callback=self.parse)
    

    items.py:

    import scrapy
    from scrapy.loader import ItemLoader
    from scrapy.loader.processors import TakeFirst
    
    
    class StockstarItemLoader(ItemLoader):
        """Custom ItemLoader that keeps only the first value extracted per field."""
        default_output_processor = TakeFirst()
    
    class StockstarItem(scrapy.Item):
        """One row of the A-share ranking table."""
        code = scrapy.Field()            # stock code
        abbr = scrapy.Field()            # stock short name
        last_trade = scrapy.Field()      # latest price
        chg_ratio = scrapy.Field()       # change ratio
        chg_amt = scrapy.Field()         # change amount
        chg_ratio_5min = scrapy.Field()  # 5-minute change ratio
        volumn = scrapy.Field()          # trading volume (name kept as-is: spider references it)
        turn_over = scrapy.Field()       # turnover
    

    middlewares.py:

    from scrapy import signals
    
    
    class StockstarSpiderMiddleware(object):
        """Pass-through spider middleware; its only real effect is logging spider start."""

        @classmethod
        def from_crawler(cls, crawler):
            # Scrapy calls this to build the middleware and wire up signals.
            middleware = cls()
            crawler.signals.connect(middleware.spider_opened,
                                    signal=signals.spider_opened)
            return middleware

        def process_spider_input(self, response, spider):
            # None lets the response continue through the middleware chain.
            return None

        def process_spider_output(self, response, result, spider):
            # Forward every item/request unchanged.
            yield from result

        def process_spider_exception(self, response, exception, spider):
            # No special handling; defer to other middlewares / errbacks.
            pass

        def process_start_requests(self, start_requests, spider):
            yield from start_requests

        def spider_opened(self, spider):
            spider.logger.info('Spider opened: %s' % spider.name)
    
    

    pipelines.py:

    class StockstarPipeline(object):
        """No-op pipeline: hands every item back to the engine unchanged."""

        def process_item(self, item, spider):
            return item
    

    settings.py:

    from scrapy.exporters import JsonLinesItemExporter
    
    # By default scrapy's JSON exporters escape non-ASCII text (\uXXXX), which
    # makes Chinese output unreadable; disable ensure_ascii so raw UTF-8 is
    # written to the feed instead.
    class CustomJsonLinesItemExporter(JsonLinesItemExporter):
        """JsonLinesItemExporter that writes non-ASCII characters verbatim."""

        def __init__(self, file, **kwargs):
            # setdefault instead of a hard-coded keyword: the original raised
            # TypeError (duplicate 'ensure_ascii' keyword) if a caller passed
            # ensure_ascii explicitly; an explicit value now takes precedence.
            kwargs.setdefault('ensure_ascii', False)
            super(CustomJsonLinesItemExporter, self).__init__(file, **kwargs)
    
    # 启用新定义的Exporter类
    # Route the built-in 'json' feed format through the exporter subclass defined
    # above, so `scrapy crawl stock -o items.json` emits readable UTF-8.
    FEED_EXPORTERS = {
        'json': 'stockstar.settings.CustomJsonLinesItemExporter',
    }
    
    BOT_NAME = 'stockstar'
    
    SPIDER_MODULES = ['stockstar.spiders']
    NEWSPIDER_MODULE = 'stockstar.spiders'
    
    # NOTE(review): robots.txt is honoured here; if quote.stockstar.com's
    # robots.txt disallows these ranking pages the spider will fetch nothing —
    # confirm against the live site.
    ROBOTSTXT_OBEY = True
    

    main.py

    from scrapy.cmdline import execute

    # Guard the entry point: the original called execute() at import time, so
    # merely importing this module would launch (and never return from) a crawl.
    if __name__ == "__main__":
        # Equivalent to running `scrapy crawl stock -o items.json` from a shell.
        execute(["scrapy", "crawl", "stock", "-o", "items.json"])
    

    scrapy.cfg

    [settings]
    default = stockstar.settings
    
    [deploy]
    #url = http://localhost:6800/
    project = stockstar
    

    问题:

    编码问题:
    
        import requests

        # Download the HeWeather city list (plain UTF-8 text, one city per line).
        url = 'https://cdn.heweather.com/china-city-list.txt'
        response = requests.get(url)
        # The payload is UTF-8, but requests may guess a different charset from
        # the response headers, so pin the encoding before touching .text.
        response.encoding = 'utf8'
        data = response.text
        # Fix: the original split on a literal (raw) newline pasted inside the
        # string literal, which is a Python syntax error; use the escape '\n'.
        data_1 = data.split('\n')
        print(data_1)
        # Drop the 3 header lines. The original looped remove(data_1[0]), which
        # shifts the whole list on every call; a single slice delete is O(n) once.
        del data_1[:3]
        # Print the city-ID column (first 11 characters of each line).
        for item in data_1:
            print(item[0:11])
    
  • 相关阅读:
    SQL注入的一般步骤及防范方法
    防止SQL注入的五种方法
    document.getElementById("orderform").submit() 提交给了谁?
    页面调试-F12
    rs.last()续
    rs.last()
    14课后习题
    HashMap
    链表
    习题
  • 原文地址:https://www.cnblogs.com/star-py-blog/p/13740300.html
Copyright © 2011-2022 走看看