zoukankan      html  css  js  c++  java
  • python 分层抽样

    import numpy as np
    import pandas as pd
    
    # Folder that contains the workbook; the file name is appended to it below.
    PATH_DES = '/Users/linxianli/Desktop/'
    workbook_path = PATH_DES + '工作簿1.xlsx'
    df = pd.read_excel(workbook_path)

    # Peek at the first rows; the result is neither assigned nor printed here.
    df.head()

    # Stratified sampling with scikit-learn.
    from sklearn.model_selection import train_test_split

    # 'TYPE' is the column of df whose class proportions the split preserves.
    labels = df['TYPE']
    X_train, X_test, y_train, y_test = train_test_split(
        df,
        labels,
        test_size=0.2,      # fraction of rows assigned to the test split
        stratify=labels,
    )

    # Report the size of the train split, then of the test split.
    for split in (X_train, X_test):
        print(split.shape)
    '''
    (885, 4)
    (222, 4)
    '''
    
    
    # Manual stratified sampling without scikit-learn.
    #
    # For every distinct value of df['TYPE'], int(0.2 * group size) rows are
    # drawn at random into `test`; the remaining rows of that group go into
    # `train`. int() truncates per group, so `test` can end up slightly below
    # 20% of all rows (compare the sample output at the end of this block).
    tags = df['TYPE'].unique().tolist()  # labels to sample proportionally from

    # Gather the per-label pieces and concatenate them once after the loop.
    # Calling pd.concat on every iteration re-copies all previously collected
    # rows, and starting from an empty pd.DataFrame() may trigger a
    # FutureWarning about empty entries on newer pandas versions.
    test_parts = []
    train_parts = []

    for tag in tags:
        # Rows that carry the current label.
        data = df[df['TYPE'] == tag]

        # Randomly pick 20% of them (rounded down) for the test set.
        sample = data.sample(int(0.2 * len(data)))

        # Whatever was not sampled stays in the training set. drop() works on
        # index labels, so this is a label-based selection, not a positional one.
        residue = data.drop(sample.index)

        test_parts.append(sample)
        train_parts.append(residue)

    if tags:
        test = pd.concat(test_parts, ignore_index=True)
        train = pd.concat(train_parts, ignore_index=True)
    else:
        # pd.concat raises ValueError on an empty list; with no labels at all
        # both splits are simply empty frames.
        test = pd.DataFrame()
        train = pd.DataFrame()

    print(test.shape)
    print(train.shape)
    '''
    (221, 4)
    (886, 4)
    '''
  • 相关阅读:
    01 Windows编程——Hello World
    图像处理基础知识
    集成IDE anaconda
    Python中的正则表达式
    Introduction of Machine Learning
    Linux命令——diff、patch
    sed & awk 概述
    Linux行编辑器——ed
    Linux命令——w、who、whoami、lastlog、last
    【问题】统计系统上有多少个用户
  • 原文地址:https://www.cnblogs.com/LXL616/p/14121674.html
Copyright © 2011-2022 走看看