zoukankan      html  css  js  c++  java
  • 贪心学院 scrapy爬虫

    生成爬虫

    scrapy genspider 爬虫名 网址
    

    打开调试用shell

    scrapy shell 网址
    

    主体 stock.py

    # -*- coding: utf-8 -*-
    import re
    from urllib import parse
    import scrapy
    from stock_spider.items import StockItem
    
    class StockSpider(scrapy.Spider):
        """Spider that follows every link on the start page and scrapes each one.

        ``parse`` schedules one request per ``<a href>`` found on the start
        page; ``parse_detail`` turns each fetched page into a single
        ``StockItem`` carrying the board-member names, the stock code and the
        member positions.
        """

        name = 'stock'
        # Entries must be bare host names: with a trailing slash the value can
        # never equal a request's host.
        allowed_domains = ['pycs.greedyai.com']
        start_urls = ['http://pycs.greedyai.com/']

        def parse(self, response):
            """Yield a request for every link found on the page."""
            for href in response.xpath("//a/@href").extract():
                # Links may be relative; resolve them against the page URL so
                # the request gets an absolute, fetchable address.
                yield scrapy.Request(
                    url=parse.urljoin(response.url, href),
                    callback=self.parse_detail,
                    dont_filter=True,
                )

        def parse_detail(self, response):
            """Build one ``StockItem`` from a detail page and yield it."""
            stock_item = StockItem()
            # Board members.
            stock_item['names'] = self.get_name(response)

            # Gender is intentionally not collected: some members have no
            # gender information, so the list would be shorter than ``names``
            # and indexing it per member would run out of range.

            # Stock code.
            stock_item['codes'] = self.get_code(response)

            # Member positions.
            stock_item['positions'] = self.get_position(response)
            yield stock_item

        def get_name(self, response):
            """Return the text of every link inside a ``td.tc.name`` cell."""
            # Single-quoted literal so the double quotes required by the XPath
            # attribute test do not terminate the Python string.
            return response.xpath('//td[@class="tc name"]/a/text()').extract()

        def get_sex(self, response):
            """Return the first gender marker (男 or 女) of each intro text node.

            Text nodes containing neither marker are skipped, so the result may
            be shorter than the list of names.
            """
            intro_texts = response.xpath('//td[@class="intro"]/text()').extract()
            sex_list = []
            for intro in intro_texts:
                match = re.search("男|女", intro)
                if match is None:
                    # Filler text around the useful data carries no marker.
                    continue
                sex_list.append(match.group())
            return sex_list

        def get_code(self, response):
            """Return the digit runs found in the page heading's link title.

            When several titles match, the digits of the last one are returned;
            when none match, an empty list is returned instead of failing on an
            unassigned variable.
            """
            titles = response.xpath(
                "/html/body/div[3]/div[1]/div[2]/div[1]/h1/a/@title"
            ).extract()
            code = []
            for title in titles:
                # Raw string: ``\d+`` matches digit runs (a plain "d+" would
                # only match the letter d).
                code = re.findall(r"\d+", title)
            return code

        def get_position(self, response):
            """Return the text nodes of every ``td.tl`` cell."""
            return response.xpath('//td[@class="tl"]/text()').extract()
    

    main.py

    from scrapy.cmdline import execute  #调试用
    
    import sys
    import os
    
    # Make the directory containing this script importable, then run the
    # "stock" spider exactly as the command line `scrapy crawl stock` would.
    project_dir = os.path.dirname(os.path.abspath(__file__))
    sys.path.append(project_dir)
    crawl_command = ["scrapy", "crawl", "stock"]
    execute(crawl_command)
    
    

    items.py

    # -*- coding: utf-8 -*-
    
    import scrapy
    
    class StockSpiderItem(scrapy.Item):
        """Placeholder item that declares no fields.

        Fields would be declared here in the form ``name = scrapy.Field()``.
        """
    
    class StockItem(scrapy.Item):
        """Item filled by the spider's ``parse_detail`` for one scraped page."""

        # Board-member names.
        names = scrapy.Field()
        # The gender field (``sexs``) is deliberately not declared; the spider
        # does not populate it.
        # Stock code.
        codes = scrapy.Field()
        # Member positions.
        positions = scrapy.Field()
    
    

    pipelines.py

    # -*- coding: utf-8 -*-
    
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
    
    import os
    
    class StockSpiderPipeline:
        """Pass-through pipeline: every item is handed on unchanged."""

        def process_item(self, item, spider):
            """Return ``item`` as received; ``spider`` is not used."""
            unchanged = item
            return unchanged
    
    class StockPipeline(object):
        """Pipeline that appends each item's members to ``executive_prep.csv``.

        One CSV row is written per board member: name, stock code, position.
        A header row is written first whenever the file is still empty.
        """

        def __init__(self):
            # "a+" appends to an existing file and creates it when missing.
            # The encoding is explicit because the header and the scraped
            # values are Chinese text and the platform default may not be
            # able to encode them.
            self.file = open("executive_prep.csv", "a+", encoding="utf-8")

        def process_item(self, item, spider):
            """Write ``item`` to the CSV file and return it.

            The header is emitted only when the file has no content yet, and
            the item's rows are always written afterwards, so the first item
            is not dropped. The item is returned so that any pipeline running
            after this one still receives it.
            """
            if not os.path.getsize("executive_prep.csv"):
                # The header lists exactly the three columns the rows contain
                # (name, stock code, position); gender is not written.
                self.file.write("姓名,股票代码,职位\n")
            self.write_content(item)
            # Flush so the size check above sees this data on the next call.
            self.file.flush()
            return item

        def write_content(self, item):
            """Write one ``name,code,position`` line per member of ``item``.

            Names and positions are paired in order; if the two lists differ
            in length the surplus entries are ignored rather than raising
            ``IndexError``. A missing stock code is written as an empty field.
            """
            names = item['names']
            codes = item['codes']
            positions = item['positions']
            code = codes[0] if codes else ""
            self.file.writelines(
                f"{name},{code},{position}\n"
                for name, position in zip(names, positions)
            )

        def close_spider(self, spider):
            """Close the output file when the spider finishes."""
            self.file.close()
    
    

    settings.py

    # -*- coding: utf-8 -*-
    
    # Project (bot) name.
    BOT_NAME = "stock_spider"

    # Spider package settings; NEWSPIDER_MODULE was added for this project.
    SPIDER_MODULES = ["stock_spider.spiders"]
    NEWSPIDER_MODULE = "stock_spider.spiders"

    # Item pipelines, keyed by import path, each with its integer order value.
    # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
    ITEM_PIPELINES = {
        "stock_spider.pipelines.StockSpiderPipeline": 300,
        # Added for this project.
        "stock_spider.pipelines.StockPipeline": 300,
    }
    
    
  • 相关阅读:
    区块链在零售业和银行业的广泛应用
    云存储平台产品浅析
    LINUX操作系统知识:进程与线程详解
    hibernate实现分页
    Hibernate 映射文件的配置 核心文件的配置 一对一 一对多 多对多 hibernate检索策略 Hibernate中session的关闭问题总结
    留言系统项目总结
    jquery 进行dom操作
    数据库 的outfile 备份与还原 视图 事物 触发器 mysql函数和自定义函数
    数据库的子查询、连接查询
    三 级城市,部门,员工,列表联动的问题解决,获取列表的被选中option对象问题
  • 原文地址:https://www.cnblogs.com/j-c-y/p/11461677.html
Copyright © 2011-2022 走看看