zoukankan      html  css  js  c++  java
  • 使用scrapy爬取百度股票

    1.spiders文件夹下的爬虫文件

    # -*- coding: utf-8 -*-
    import scrapy
    import re
    import random
    from scrapy.http.response.html import HtmlResponse
    class StocksSpider(scrapy.Spider):
        """Crawl the Eastmoney stock list and scrape each stock's Baidu quote page.

        ``parse`` pulls stock codes (``sh``/``sz`` followed by six digits) out of
        every link on the start page and schedules one request per code.
        ``parse_stock`` turns the ``.stock-bets`` panel of a quote page into a
        flat ``dict`` of label -> value and yields it as the item.
        """

        name = 'stocks'
        start_urls = ['http://quote.eastmoney.com/stocklist.html']

        def __init__(self, *args, **kwargs):
            """Pick one random User-Agent and build the shared request headers.

            Positional and keyword arguments are forwarded to ``scrapy.Spider``
            so the spider can still be built with ``-a`` arguments or a custom
            ``name``.
            """
            super().__init__(*args, **kwargs)
            self.user_agent_list = [
                # The comma after this first entry was missing, which silently
                # fused the first two strings into one invalid User-Agent.
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
                "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
                "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
                "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
                "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
                "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
                "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
                "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
                "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
                "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
                "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
                "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
                "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
            ]
            # One User-Agent is chosen per spider instance and reused for every
            # stock-page request issued by parse().
            self.ua = random.choice(self.user_agent_list)
            self.header = {
                'Accept-Encoding': 'gzip, deflate, sdch, br',
                'Accept-Language': 'zh-CN,zh;q=0.8',
                'Connection': 'keep-alive',
                'Referer': 'https://gupiao.baidu.com/',
                'User-Agent': self.ua,
            }

        def parse(self, response):
            """Yield one request per stock code found in the page's links.

            Links whose ``href`` contains no ``sh``/``sz`` + six-digit code are
            skipped.
            """
            for href in response.css('a::attr(href)').extract():
                # \d was written as a literal "d", so no real code ever matched.
                code_match = re.search(r's[hz]\d{6}', href)
                if code_match is None:
                    continue
                url = 'http://gupiao.baidu.com/stock/' + code_match.group(0) + '.html'
                yield scrapy.Request(url, callback=self.parse_stock, headers=self.header)

        def parse_stock(self, response):
            """Yield a dict of the quote fields shown in the ``.stock-bets`` panel.

            Each ``<dt>`` supplies a key and the ``<dd>`` at the same position
            supplies its value; a missing or non-numeric value is recorded as
            ``'--'``. The stock name and code are stored under ``'股票名称'``.
            Pages without a ``.bets-name`` element produce no item.
            """
            name_nodes = response.css('.stock-bets .bets-name').extract()
            if not name_nodes:
                # Delisted or unknown codes render a page without the quote panel.
                self.logger.debug('No stock info found on %s', response.url)
                return
            name_html = name_nodes[0]

            key_nodes = response.css('.stock-bets dt').extract()
            value_nodes = response.css('.stock-bets dd').extract()

            info = {}
            for index, key_html in enumerate(key_nodes):
                # Text between the first ">" and the closing </dt> on one line.
                key_match = re.search(r'>(.*)</dt>', key_html)
                if key_match is None:
                    continue
                # Tolerate fewer <dd> than <dt>: the value falls back to '--'.
                value_html = value_nodes[index] if index < len(value_nodes) else ''
                # From the first digit up to (not including) the closing </dd>.
                value_match = re.search(r'(\d+\.?.*)</dd>', value_html)
                info[key_match.group(1)] = value_match.group(1) if value_match else '--'

            # Name is the first token before "(", code is the text inside the
            # inner tag. The "(" must be escaped; unescaped it is an invalid
            # pattern that makes re raise on every page.
            title_match = re.search(r'\s.*\(', name_html)
            code_match = re.search(r'>(.*)<', name_html)
            if title_match is not None and code_match is not None:
                info['股票名称'] = title_match.group(0).split()[0] + code_match.group(1)
            else:
                self.logger.debug('Could not parse stock name on %s', response.url)
            yield info

    2.pipelines.py

    # -*- coding: utf-8 -*-
    
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
    
    
    class BaidustocksPipeline:
        """Default pass-through pipeline: hands every item on unchanged."""

        def process_item(self, item, spider):
            """Return ``item`` as received; ``spider`` is not used."""
            return item
    
    
    class BaiduStocksInfoPipeline(object):
        """Append every scraped item to ``BaiduStockInfo1.txt``, one dict per line."""

        def open_spider(self, spider):
            """Open (and truncate) the UTF-8 output file when the spider starts."""
            self.f = open('BaiduStockInfo1.txt', 'w', encoding='utf-8')

        def close_spider(self, spider):
            """Close the output file when the spider finishes.

            Scrapy only calls the hook named ``close_spider``; the original
            misspelling meant the file was never closed or flushed explicitly.
            """
            self.f.close()

        # Backward-compatible alias for the original misspelled method name.
        colse_spider = close_spider

        def process_item(self, item, spider):
            """Write ``str(dict(item))`` plus a newline, then pass the item on.

            Writing is best effort: a failure is logged and the item is still
            returned so later pipelines keep running.
            """
            import logging

            try:
                line = str(dict(item)) + '\n'
                self.f.write(line)
            except (OSError, TypeError, ValueError):
                # ValueError covers writes to an already-closed file;
                # TypeError/ValueError cover items that cannot become a dict.
                logging.getLogger(__name__).exception(
                    'Failed to write item to BaiduStockInfo1.txt')
            return item
  • 相关阅读:
    js 中基本数据类型和引用数据类型 ,,,, js中对象和函数的关系
    something
    js 的constructor属性
    js 的prototype 属性和用法,外加__proto__
    js 获取一下url里面的一些内容
    js 终于明白变量提升的概念了
    动态设置小程序的 标题
    Vue axios调用第三方接口跨域解决
    css translate的一些问题 这其实可以用在,不知道div宽高的情况,从而让其上下,左右都居中。
    用css3写一个可以无限旋转的div或者图片
  • 原文地址:https://www.cnblogs.com/ldq1996/p/8306060.html
Copyright © 2011-2022 走看看