zoukankan      html  css  js  c++  java
  • 模型评估---交叉验证

    1.原始交叉验证

    # Import the linear regression class
    from sklearn.linear_model import LinearRegression
    # Sklearn also has a helper that makes it easy to do cross validation
    from sklearn.cross_validation import KFold
    import numpy as np
    
    # The columns we'll use to predict the target
    predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
    
    # Initialize our algorithm class
    alg = LinearRegression()
    # Generate cross validation folds for the titanic dataset.  It return the row indices corresponding to train and test.
    # We set random_state to ensure we get the same splits every time we run this.
    kf = KFold(titanic.shape[0], n_folds=3, random_state=1)
    
    predictions = []
    for train, test in kf:
        # The predictors we're using the train the algorithm.  Note how we only take the rows in the train folds.
        train_predictors = (titanic[predictors].iloc[train,:])
        # The target we're using to train the algorithm.
        train_target = titanic["Survived"].iloc[train]
        # Training the algorithm using the predictors and target.
        alg.fit(train_predictors, train_target)
        # We can now make predictions on the test fold
        test_predictions = alg.predict(titanic[predictors].iloc[test,:])
        predictions.append(test_predictions)
    
    
    # The predictions are in three separate numpy arrays.  Concatenate them into one.  
    # We concatenate them on axis 0, as they only have one axis.
    predictions = np.concatenate(predictions, axis=0)
    
    # Map predictions to outcomes (only possible outcomes are 1 and 0)
    predictions[predictions > .5] = 1
    predictions[predictions <=.5] = 0
    accuracy = sum(predictions[predictions == titanic["Survived"]]) / len(predictions)
    print accuracy
    0.783389450056

    2.cross_validation交叉验证

    from sklearn import cross_validation
    from sklearn.linear_model import LogisticRegression
    # Initialize our algorithm
    alg = LogisticRegression(random_state=1)
    # Compute the accuracy score for all the cross validation folds.  (much simpler than what we did before!)
    scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=3)
    # Take the mean of the scores (because we have one for each fold)
    print(scores.mean())
    0.787878787879

    from sklearn import cross_validation
    from sklearn.ensemble import RandomForestClassifier
    
    predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
    
    # Initialize our algorithm with the default paramters
    # n_estimators is the number of trees we want to make
    # min_samples_split is the minimum number of rows we need to make a split
    # min_samples_leaf is the minimum number of samples we can have at the place where a tree branch ends (the bottom points of the tree)
    alg = RandomForestClassifier(random_state=1, n_estimators=10, min_samples_split=2, min_samples_leaf=1)
    # Compute the accuracy score for all the cross validation folds.  (much simpler than what we did before!)
    kf = cross_validation.KFold(titanic.shape[0], n_folds=3, random_state=1)
    scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=kf)
    
    # Take the mean of the scores (because we have one for each fold)
    print(scores.mean())
    0.785634118967


    3. 多模型投票分类
    predictors = ["Pclass", "Sex", "Age", "Fare", "Embarked", "FamilySize", "Title"]
    
    algorithms = [
        [GradientBoostingClassifier(random_state=1, n_estimators=25, max_depth=3), predictors],
        [LogisticRegression(random_state=1), ["Pclass", "Sex", "Fare", "FamilySize", "Title", "Age", "Embarked"]]
    ]
    
    full_predictions = []
    for alg, predictors in algorithms:
        # Fit the algorithm using the full training data.
        alg.fit(titanic[predictors], titanic["Survived"])
        # Predict using the test dataset.  We have to convert all the columns to floats to avoid an error.
        predictions = alg.predict_proba(titanic_test[predictors].astype(float))[:,1]
        full_predictions.append(predictions)
    
    # The gradient boosting classifier generates better predictions, so we weight it higher.
    predictions = (full_predictions[0] * 3 + full_predictions[1]) / 4
    predictions

    交叉验证中混合模型分类

    from sklearn.ensemble import GradientBoostingClassifier
    import numpy as np
    
    # The algorithms we want to ensemble.
    # We're using the more linear predictors for the logistic regression, and everything with the gradient boosting classifier.
    algorithms = [
        [GradientBoostingClassifier(random_state=1, n_estimators=25, max_depth=3), ["Pclass", "Sex", "Age", "Fare", "Embarked", "FamilySize", "Title",]],
        [LogisticRegression(random_state=1), ["Pclass", "Sex", "Fare", "FamilySize", "Title", "Age", "Embarked"]]
    ]
    
    # Initialize the cross validation folds
    kf = KFold(titanic.shape[0], n_folds=3, random_state=1)
    
    predictions = []
    for train, test in kf:
        train_target = titanic["Survived"].iloc[train]
        full_test_predictions = []
        # Make predictions for each algorithm on each fold
        for alg, predictors in algorithms:
            # Fit the algorithm on the training data.
            alg.fit(titanic[predictors].iloc[train,:], train_target)
            # Select and predict on the test fold.  
            # The .astype(float) is necessary to convert the dataframe to all floats and avoid an sklearn error.
            test_predictions = alg.predict_proba(titanic[predictors].iloc[test,:].astype(float))[:,1]
            full_test_predictions.append(test_predictions)
        # Use a simple ensembling scheme -- just average the predictions to get the final classification.
        test_predictions = (full_test_predictions[0] + full_test_predictions[1]) / 2
        # Any value over .5 is assumed to be a 1 prediction, and below .5 is a 0 prediction.
        test_predictions[test_predictions <= .5] = 0
        test_predictions[test_predictions > .5] = 1
        predictions.append(test_predictions)
    
    # Put all the predictions together into one array.
    predictions = np.concatenate(predictions, axis=0)
    
    # Compute accuracy by comparing to the training data.
    accuracy = sum(predictions[predictions == titanic["Survived"]]) / len(predictions)
    print(accuracy)
  • 相关阅读:
    CSS
    HTML
    MySQL:PyMySQL&ORM
    MySQL:SQL进阶
    03-body标签中相关标签
    02-body标签中相关标签
    01-html介绍和head标签
    并发编程
    异常处理、网络编程
    面向对象进阶和模块
  • 原文地址:https://www.cnblogs.com/itbuyixiaogong/p/9848761.html
Copyright © 2011-2022 走看看