zoukankan      html  css  js  c++  java
  • 训练集、测试集(train_test_split)


    训练集 & 测试集

    如果拿所有原始数据来训练,存在的问题:

    1. 模型很差无法调整;
    2. 真实环境难以拿到真实 label;

    所以将数据区分为 训练数据 和 测试数据(train test split);
    用训练数据来训练模型;然后用测试数据测试模型;

    使用这种方式也存在问题;


    python 原生分离 iris 数据集

    import numpy as np
    from sklearn import datasets
    import matplotlib.pyplot as plt
     
    # Manual train/test split of the iris dataset using a shuffled index array.
    # The bare triple-quoted strings below are notebook outputs kept for reference.
    iris = datasets.load_iris()
     
    # Feature matrix and label vector taken from the loaded dataset.
    X = iris.data 
    y = iris.target
     
    X
    ''' 
        (array([[5.1, 3.5, 1.4, 0.2],
                [4.9, 3. , 1.4, 0.2],  ...
                [6.2, 3.4, 5.4, 2.3],
                [5.9, 3. , 5.1, 1.8]])
    '''
    
    # The labels shown below are sorted by class, which is why the data is
    # shuffled before slicing off a test portion.
    y 
    '''
        array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
               0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
               1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
               1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
               2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
               2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
    
    '''
    
    X.shape, y.shape  # ((150, 4), (150,))
     
    # shuffle 
    shuffle_indexes = np.random.permutation(len(X))  # random permutation of the row indexes of X
    shuffle_indexes
    # array([ 22,   4, 142,  24,   7, 146,  ... 9,  95, 130,  29, 124])
     
    # Fraction of the samples reserved for testing; int() truncates toward zero.
    test_ratio = 0.2
    test_size = int(len(X) * test_ratio)
    test_size # 30
     
    # The first `test_size` shuffled indexes form the test set, the rest the training set.
    test_indexes = shuffle_indexes[:test_size]
    train_indexes = shuffle_indexes[test_size:]
     
    test_indexes  
    ''' 
        array([ 22,   4, 142,  24,   7, 146,  70,  77, 144,  14,  40, 119,  46, 85,  74,  87,  86,  60,  91, 120,  78,  45,  65, 105, 113,  39, 83,  80, 134,  16])
    '''
    
    # Index X and y with the same index arrays so each sample keeps its label.
    X_train = X[train_indexes]
    y_train = y[train_indexes]
    
    X_test = X[test_indexes]
    y_test = y[test_indexes]
      
    X_test.shape, X_train.shape # ((30, 4), (120, 4)) 
    
    

    封装 train_test_split 函数

    
    def train_test_split(X, y, test_ratio=0.2, seed=None):
        """Randomly split X and y into training and test subsets.

        The rows of X and the entries of y are shuffled with one shared
        permutation, so every sample stays paired with its label.

        Args:
            X: array of samples, indexed along its first axis.
            y: array of labels with the same first-axis length as X.
            test_ratio: fraction of samples placed in the test set, in
                [0.0, 1.0]. The test size is int(len(X) * test_ratio),
                i.e. truncated toward zero.
            seed: optional seed for numpy's global random state. Any value
                other than None (including 0) is applied before shuffling.

        Returns:
            A tuple (X_train, y_train, X_test, y_test) -- note the order.

        Raises:
            AssertionError: if X and y differ in length or test_ratio is
                outside [0.0, 1.0].
        """
        assert X.shape[0] == y.shape[0], "the size of X must be equal to the size of y"
        assert 0.0 <= test_ratio <= 1.0, "test_ratio must be valid"
        # Compare against None: 0 is a legitimate seed and must not be skipped.
        if seed is not None:
            np.random.seed(seed)

        shuffle_indexes = np.random.permutation(len(X))
        test_size = int(len(X) * test_ratio)

        # The first `test_size` shuffled indexes go to the test set, the rest to training.
        test_indexes = shuffle_indexes[:test_size]
        train_indexes = shuffle_indexes[test_size:]

        X_train = X[train_indexes]
        y_train = y[train_indexes]

        X_test = X[test_indexes]
        y_test = y[test_indexes]

        return X_train, y_train, X_test, y_test
    

    sklearn 中的 train_test_split

    # NOTE: this import rebinds the name `train_test_split`, replacing the
    # hand-written helper of the same name defined earlier in this file.
    from sklearn.model_selection import train_test_split
    # The result is unpacked as (X_train, X_test, y_train, y_test), whereas the
    # hand-written helper returns (X_train, y_train, X_test, y_test).
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    
    

    # General call form; `arrays` and `options` are placeholders, not defined names.
    train_test_split(*arrays, **options)

    # Example call passing an explicit test_size and random_state.
    X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.33, random_state=42)


  • 相关阅读:
    Maven(二)Maven项目的创建(命令、myeclipse)及生命周期
    Maven(一)初识Maven
    MySQL(十一)之触发器
    MySQL(十)之视图
    MySQL(九)之数据表的查询详解(SELECT语法)二
    MySQL(九)之数据表的查询详解(SELECT语法)一
    关于oracle的锁表解决session marked for kill
    shell脚本清空redis库缓存
    Java 数组拷贝方法 System.arraycopy
    oracle 替换字符 replace
  • 原文地址:https://www.cnblogs.com/fldev/p/14360135.html
Copyright © 2011-2022 走看看