zoukankan      html  css  js  c++  java
  • 『Sklearn』数据划分方法

    原理介绍

    K折交叉验证:

    KFold,GroupKFold,StratifiedKFold,

    留一法:

    LeaveOneGroupOut,LeavePGroupsOut,LeaveOneOut,LeavePOut,

    随机划分法:

    ShuffleSplit,GroupShuffleSplit,StratifiedShuffleSplit,

     

     

    代码实现

    流程:

    实例化划分器(交叉验证迭代器) -> 调用 .split() 迭代得到每一组训练/测试索引

    KFold(n_splits=2)

    # KFold: cut the samples into n_splits folds; every fold serves as the
    # test set once while the remaining folds form the training set.
    import numpy as np
    from sklearn.model_selection import KFold
    X = np.arange(1, 13).reshape(6, 2)   # six samples, two features each
    y = np.arange(1, 7)                  # one target value per sample
    splitter = KFold(n_splits=2)         # n_splits = number of folds
    # splitter.get_n_splits(X) would report that number of folds
    print(splitter)
    for fold_train, fold_test in splitter.split(X):
        print("Train Index:", fold_train, ",Test Index:", fold_test)
        # split() yields integer index arrays; use them to slice the data.
        X_train = X[fold_train]
        X_test = X[fold_test]
        y_train = y[fold_train]
        y_test = y[fold_test]
    

     GroupKFold(n_splits=2)

    # GroupKFold: K-fold variant driven by a per-sample group label; samples
    # sharing a group label are kept together, so one group never ends up in
    # both the training and the test side of the same split.
    # Here every sample has its own group, so the grouping has no visible effect.
    import numpy as np
    from sklearn.model_selection import GroupKFold
    X = np.arange(1, 13).reshape(6, 2)
    y = np.arange(1, 7)
    groups = np.arange(1, 7)             # one distinct group label per sample
    grouped_splitter = GroupKFold(n_splits=2)
    grouped_splitter.get_n_splits(X, y, groups)
    print(grouped_splitter)
    for fold_train, fold_test in grouped_splitter.split(X, y, groups):
        print("Train Index:", fold_train, ",Test Index:", fold_test)
        X_train = X[fold_train]
        X_test = X[fold_test]
        y_train = y[fold_train]
        y_test = y[fold_test]
    # Output recorded by the original author:
    # GroupKFold(n_splits=2)
    # Train Index: [0 2 4] ,Test Index: [1 3 5]
    # Train Index: [1 3 5] ,Test Index: [0 2 4]
    

     

    StratifiedKFold(n_splits=3)

    # StratifiedKFold: K-fold that keeps the class proportions of y (as far as
    # possible) the same in every fold.
    import numpy as np
    from sklearn.model_selection import StratifiedKFold
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]])
    y = np.array([1, 1, 1, 2, 2, 2])     # two classes, three samples each
    skf = StratifiedKFold(n_splits=3)
    skf.get_n_splits(X, y)
    print(skf)
    # The class labels y are required so the split can be stratified.
    for train_index, test_index in skf.split(X, y):
        print("Train Index:", train_index, ",Test Index:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
    # Output (n_splits=3 yields three folds, each test set holding one sample
    # of every class):
    # StratifiedKFold(n_splits=3, random_state=None, shuffle=False)
    # Train Index: [1 2 4 5] ,Test Index: [0 3]
    # Train Index: [0 2 3 5] ,Test Index: [1 4]
    # Train Index: [0 1 3 4] ,Test Index: [2 5]

     

    LeaveOneOut()

    # LeaveOneOut: each split holds out exactly one sample as the test set and
    # trains on all the others.
    import numpy as np
    from sklearn.model_selection import LeaveOneOut
    X = np.arange(1, 13).reshape(6, 2)
    y = np.arange(1, 7)
    one_out = LeaveOneOut()
    one_out.get_n_splits(X)
    print(one_out)
    for kept, held_out in one_out.split(X, y):
        print("Train Index:", kept, ",Test Index:", held_out)
        X_train = X[kept]
        X_test = X[held_out]
        y_train = y[kept]
        y_test = y[held_out]
    # Output recorded by the original author (one split per sample):
    # LeaveOneOut()
    # Train Index: [1 2 3 4 5] ,Test Index: [0]
    # Train Index: [0 2 3 4 5] ,Test Index: [1]
    # Train Index: [0 1 3 4 5] ,Test Index: [2]
    # Train Index: [0 1 2 4 5] ,Test Index: [3]
    # Train Index: [0 1 2 3 5] ,Test Index: [4]
    # Train Index: [0 1 2 3 4] ,Test Index: [5]
    

     

    LeavePOut(p=3)

    # LeavePOut: every split holds out p samples as the test set; all possible
    # combinations of p samples are enumerated (here C(6, 3) = 20 splits).
    import numpy as np
    from sklearn.model_selection import LeavePOut
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]])
    y = np.array([1, 2, 3, 4, 5, 6])
    lpo = LeavePOut(p=3)                 # p = size of each test set
    lpo.get_n_splits(X)
    print(lpo)
    for train_index, test_index in lpo.split(X, y):
        print("Train Index:", train_index, ",Test Index:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
    # Output recorded by the original author:
    # LeavePOut(p=3)
    # Train Index: [3 4 5] ,Test Index: [0 1 2]
    # Train Index: [2 4 5] ,Test Index: [0 1 3]
    # Train Index: [2 3 5] ,Test Index: [0 1 4]
    # Train Index: [2 3 4] ,Test Index: [0 1 5]
    # Train Index: [1 4 5] ,Test Index: [0 2 3]
    # Train Index: [1 3 5] ,Test Index: [0 2 4]
    # Train Index: [1 3 4] ,Test Index: [0 2 5]
    # Train Index: [1 2 5] ,Test Index: [0 3 4]
    # Train Index: [1 2 4] ,Test Index: [0 3 5]
    # Train Index: [1 2 3] ,Test Index: [0 4 5]
    # Train Index: [0 4 5] ,Test Index: [1 2 3]
    # Train Index: [0 3 5] ,Test Index: [1 2 4]
    # Train Index: [0 3 4] ,Test Index: [1 2 5]
    # Train Index: [0 2 5] ,Test Index: [1 3 4]
    # Train Index: [0 2 4] ,Test Index: [1 3 5]
    # Train Index: [0 2 3] ,Test Index: [1 4 5]
    # Train Index: [0 1 5] ,Test Index: [2 3 4]
    # Train Index: [0 1 4] ,Test Index: [2 3 5]
    # Train Index: [0 1 3] ,Test Index: [2 4 5]
    # Train Index: [0 1 2] ,Test Index: [3 4 5]
    

     

    ShuffleSplit(n_splits=3,test_size=.25,random_state=0)

    # ShuffleSplit: shuffle the sample indices, then cut off a training part
    # and a test part; repeated n_splits times.
    # The part sizes come from train_size / test_size, and the two fractions
    # are allowed to add up to less than 1 (some samples are then unused).
    import numpy as np
    from sklearn.model_selection import ShuffleSplit
    X = np.arange(1, 13).reshape(6, 2)
    y = np.arange(1, 7)
    shuffler = ShuffleSplit(n_splits=3, test_size=0.25, random_state=0)
    shuffler.get_n_splits(X)
    print(shuffler)
    for picked_train, picked_test in shuffler.split(X, y):
        print("Train Index:", picked_train, ",Test Index:", picked_test)
        X_train = X[picked_train]
        X_test = X[picked_test]
        y_train = y[picked_train]
        y_test = y[picked_test]
    print("==============================")
    # Same settings, but with an explicit training fraction as well.
    shuffler = ShuffleSplit(n_splits=3, train_size=0.5, test_size=0.25, random_state=0)
    shuffler.get_n_splits(X)
    print(shuffler)
    for picked_train, picked_test in shuffler.split(X, y):
        print("Train Index:", picked_train, ",Test Index:", picked_test)
    # Output recorded by the original author:
    # ShuffleSplit(n_splits=3, random_state=0, test_size=0.25, train_size=None)
    # Train Index: [1 3 0 4] ,Test Index: [5 2]
    # Train Index: [4 0 2 5] ,Test Index: [1 3]
    # Train Index: [1 2 4 0] ,Test Index: [3 5]
    # ==============================
    # ShuffleSplit(n_splits=3, random_state=0, test_size=0.25, train_size=0.5)
    # Train Index: [1 3 0] ,Test Index: [5 2]
    # Train Index: [4 0 2] ,Test Index: [1 3]
    # Train Index: [1 2 4] ,Test Index: [3 5]
    

     StratifiedShuffleSplit(n_splits=3,test_size=.5,random_state=0)

    # StratifiedShuffleSplit: like ShuffleSplit (shuffle, then cut off a
    # training and a test part whose fractions may add up to less than 1), but
    # the split also keeps the class proportions of y the same in each part.
    import numpy as np
    from sklearn.model_selection import StratifiedShuffleSplit
    X = np.arange(1, 13).reshape(6, 2)
    y = np.tile([1, 2], 3)               # alternating class labels 1,2,1,2,1,2
    strat_shuffler = StratifiedShuffleSplit(n_splits=3, test_size=0.5, random_state=0)
    strat_shuffler.get_n_splits(X, y)
    print(strat_shuffler)
    for picked_train, picked_test in strat_shuffler.split(X, y):
        print("Train Index:", picked_train, ",Test Index:", picked_test)
        X_train = X[picked_train]
        X_test = X[picked_test]
        y_train = y[picked_train]
        y_test = y[picked_test]
    # Output recorded by the original author:
    # StratifiedShuffleSplit(n_splits=3, random_state=0, test_size=0.5,train_size=None)
    # Train Index: [5 4 1] ,Test Index: [3 2 0]
    # Train Index: [5 2 3] ,Test Index: [0 4 1]
    # Train Index: [5 0 4] ,Test Index: [3 1 2]
    
  • 相关阅读:
    docker进入容器命令
    docker复制文件到容器内以及从容器内复制文件到宿主机
    在idea中创建maven父子工程,子工程无法导入父工程依赖的问题
    maven merge 其他分支比如master的方法
    Maven 右边的maven 项目为空 pom文件
    Spark Streaming集成Kafka调优
    spark sql/hive小文件问题
    CompletableFuture详解
    样式绑定styleBinding
    jsonArray图片数组实例
  • 原文地址:https://www.cnblogs.com/hellcat/p/7045585.html
Copyright © 2011-2022 走看看