zoukankan      html  css  js  c++  java
  • 数据分析关键代码汇总

    # Header imports (notebook-style setup; `%matplotlib inline` is an
    # IPython magic and only works inside Jupyter/IPython, not plain Python).
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    %matplotlib inline
    import seaborn as sns
    import warnings
    warnings.filterwarnings("ignore")
    import missingno as mso
    import pandas_profiling
    # Display settings: show up to 100 columns when printing DataFrames
    pd.set_option("display.max_columns",100)
    # Load the data (GBK-encoded CSV; the filename means "training data")
    data_init=pd.read_csv("训练数据.csv",encoding="gbk")
    # Preprocessing
    # BUG FIX: `data_filer` was used throughout this section but never
    # defined; derive it from the loaded data so the snippets are runnable.
    data_filer=data_init.copy()
    # 1. Inspect the distinct values of a single column
    data_filer["BufferCounter"].unique()
    # 2. List all columns
    list(data_filer.columns)
    # Type conversion.  BUG FIX: the original assigned three columns from a
    # two-column selection (missing 'CpuUsage' on the right-hand side), which
    # raises a shape mismatch; use the same three columns on both sides.
    data_filer[['RamUsage', 'CpuUsage', 'VideoTotleTraffic']]=data_filer[['RamUsage', 'CpuUsage', 'VideoTotleTraffic']].apply(pd.to_numeric,errors="ignore")
    # Drop rows with missing Latitude (the original comment said "delete
    # column", but dropna(subset=...) removes rows, not columns).
    data_filer=data_filer.dropna(subset=["Latitude"])
    # Missing-value handling.  BUG FIX: missingno was imported as `mso`
    # above, not `ms`.
    mso.matrix(data_filer)
    RX=np.mean(data_filer["RX"])
    data_filer["RX"].fillna(RX,inplace=True)
    # Pairwise correlations between numeric columns
    data_filer.corr()
    # One-hot encode the City column
    X_City=pd.get_dummies(X["City"])
    X=pd.concat([X,X_City],axis=1)
    X=X.drop(["City"],axis=1)
    # Timestamp handling: parse, then expand into calendar components
    data_filer["VideoTestTime"]=data_filer["VideoTestTime"].astype(np.datetime64)
    X["year"]=X["VideoTestTime"].apply(lambda x:x.year)
    X["month"]=X["VideoTestTime"].apply(lambda x:x.month)
    X["Day"]=X["VideoTestTime"].apply(lambda x:x.day)
    X["hour"]=X["VideoTestTime"].apply(lambda x:x.hour)
    X["minute"]=X["VideoTestTime"].apply(lambda x:x.minute)
    X_data=X.drop(["VideoTestTime"],axis=1)
    # Per-column handling / sorting (NOTE(review): uses a different dataset,
    # `data_base`, which is not defined in this file)
    data2=data_base[data_base["p_date"]==data_base["dateBefore"]].sort_values(by="enodebid")
    import re
    def f(x):
        """Extract a rough phone-brand token from a raw PhoneType string.

        Returns the first run of letters that ends in a literal 's' if one
        exists, otherwise the first run of letters, otherwise "other2" when
        the string contains no letters at all, and "other" when `x` is not
        a string (e.g. a NaN float from pandas).
        """
        try:
            # Prefer a letter run ending in 's' (e.g. "Samsung" -> "Sams").
            # FIX: raw strings for the regexes, and do not shadow the
            # builtin `str` as the original did.
            match = re.search(r"[a-zA-Z]+s", x)
            if match:
                return match.group()
            # Fall back to any run of letters.
            match2 = re.search(r"[a-zA-Z]+", x)
            if match2:
                return match2.group()
            return "other2"
        except TypeError:
            # re.search raises TypeError for non-string input (NaN etc.);
            # the original used a bare `except:` with the same outcome.
            return "other"
    # Map each raw PhoneType string to a coarse brand token via `f` above
    data_pho["PhoneTypenew"]=data_pho["PhoneType"].apply(f)
    # Age column handling
    
    # Fill missing Age values with the per-title mean age, iterating the
    # grouped Series item by item.
    Age_Pre=data[["Age","NameTitle"]].groupby("NameTitle")["Age"].mean()
    type(Age_Pre)
    for index,value in Age_Pre.items():
        data.loc[(data.Age.isnull())&(data.NameTitle==index),"Age"]=Age_Pre[index]
    # Mapping Age into ordinal buckets (0-4)
    dataset.loc[ dataset['Age'] <= 16, 'Age']= 0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
    dataset.loc[ dataset['Age'] > 64, 'Age'] = 4 ;
    # Encode Sex as 0/1
    sex={"male":0,"female":1}
    dataset["Sex"]=dataset["Sex"].map(sex)

    # Feature engineering
    players['birth_date'] = pd.to_datetime(players.birthday, format='%d.%m.%Y')
    # Age in fractional years as of 2013-01-01
    players['age_years'] = ((pd.to_datetime("2013-01-01") - players['birth_date']).dt.days)/365.25
    players['age_years']

    # Handle the discrete `position` category: create higher-level categories
    position_types = players.position.unique()
    position_types
    # BUG FIX: the sample output below was wrapped in Chinese curly quotes
    # (“”“ ... ”“”), which is a syntax error in Python; keep it as comments:
    # array(['Center Back', 'Attacking Midfielder', 'Right Midfielder',
    #        'Center Midfielder', 'Goalkeeper', 'Defensive Midfielder',
    #        'Left Fullback', nan, 'Left Midfielder', 'Right Fullback',
    #        'Center Forward', 'Left Winger', 'Right Winger'], dtype=object)

    defense = ['Center Back','Defensive Midfielder', 'Left Fullback', 'Right Fullback', ]
    midfield = ['Right Midfielder', 'Center Midfielder', 'Left Midfielder',]
    forward = ['Attacking Midfielder', 'Left Winger', 'Right Winger', 'Center Forward']
    keeper = 'Goalkeeper'

    # modifying dataframe -- adding the aggregated position categorical position_agg
    players.loc[players['position'].isin(defense), 'position_agg'] = "Defense"
    players.loc[players['position'].isin(midfield), 'position_agg'] = "Midfield"
    players.loc[players['position'].isin(forward), 'position_agg'] = "Forward"
    players.loc[players['position'].eq(keeper), 'position_agg'] = "Keeper"
    
    
    
    # Assemble the feature matrix and the target column for modelling
    X=data_filer[['RamUsage', 'CpuUsage', 'Longitude', 'Latitude', 'City', 'Source',
           'NetType', 'APN/SSID', 'RX', 'L_SINR', 'LteRsrq', 'CI', 'VideoAvgSpeed',
           'VideoPeakSpeed', 'VideoTestTime',
           'VideoTotleTraffic']]
    y=data_filer["BufferCounter"]
    # Visualisation: top-20 phone types.
    # BUG FIX: `ata_pho` was a typo for `data_pho`.
    data_pho["PhoneType"].value_counts()[0:20].plot(kind="bar")
    # # 下面针对多个模型进行集成操作
    from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
    from sklearn.kernel_approximation import Nystroem
    from sklearn.kernel_approximation import RBFSampler
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import roc_auc_score
    from sklearn.model_selection import train_test_split
    from sklearn.naive_bayes import GaussianNB
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.neural_network import MLPClassifier
    from sklearn.pipeline import make_pipeline
    from sklearn.svm import SVC, LinearSVC

    Ensemble algorithms

    SEED=666
    def get_models():
        """Build the library of base learners used by the ensemble.

        Returns a dict mapping a display name to an unfitted estimator;
        insertion order matters for downstream column labelling.
        """
        return {
            'svm': SVC(C=100, probability=True),
            'knn': KNeighborsClassifier(n_neighbors=3),
            'naive bayes': GaussianNB(),
            'mlp-nn': MLPClassifier((80, 10), early_stopping=False,
                                    random_state=SEED),
            'random forest': RandomForestClassifier(
                n_estimators=10, max_features=3, random_state=SEED),
            'gbm': GradientBoostingClassifier(
                n_estimators=100, random_state=SEED),
            'logistic': LogisticRegression(C=100, random_state=SEED),
        }
    def train_predict(model_list):
        """Fit every model in `model_list` on the training set and return a
        DataFrame of positive-class test-set probabilities, one column per
        model.

        NOTE(review): relies on module-level X_train, y_train, X_test,
        y_test being defined before the call.
        """
        P = pd.DataFrame(np.zeros((y_test.shape[0], len(model_list))))
    
        print("Fitting models.")
        cols = list()
        # BUG FIX: the original iterated the *global* `models`, silently
        # ignoring the `model_list` argument.
        for i, (name, m) in enumerate(model_list.items()):
            print("%s..." % name, end=" ", flush=False)
            m.fit(X_train, y_train)
            # With two classes, only the probability of class 1 is needed.
            P.iloc[:, i] = m.predict_proba(X_test)[:, 1]
            cols.append(name)
            print("done")
    
        P.columns = cols
        # BUG FIX: the original string literal contained a raw line break
        # (a syntax error in the scraped source); use an explicit escape.
        print("Done.\n")
        return P
    def score_models(P, y):
        """Print the ROC-AUC of each column of prediction DataFrame `P`
        against the true labels `y`.

        NOTE(review): requires `roc_auc_score` (sklearn.metrics), which the
        original file never imported.
        """
        print("Scoring models.")
        for m in P.columns:
            score = roc_auc_score(y, P.loc[:, m])
            print("%-26s: %.3f" % (m, score))
        # BUG FIX: the original string literal contained a raw line break
        # (a syntax error in the scraped source); use an explicit escape.
        print("Done.\n")
    # Run the base classifiers defined earlier and score them on the test set
    models = get_models()
    P = train_predict(models)
    score_models(P, y_test)
    # 绘制ROC曲线
    from sklearn.metrics import roc_curve
    
    def plot_roc_curve(ytest, P_base_learners, P_ensemble, labels, ens_label):
        """Plot ROC curves for every base learner and for the ensemble."""
        plt.figure(figsize=(10, 8))
        # Diagonal chance line.
        plt.plot([0, 1], [0, 1], 'k--')

        n_learners = P_base_learners.shape[1]
        # One rainbow colour per learner, plus one (index 0) for the ensemble.
        palette = [plt.cm.rainbow(t)
                   for t in np.linspace(0, 1.0, n_learners + 1)]

        for idx in range(n_learners):
            fpr, tpr, _ = roc_curve(ytest, P_base_learners[:, idx])
            plt.plot(fpr, tpr, label=labels[idx], c=palette[idx + 1])

        fpr, tpr, _ = roc_curve(ytest, P_ensemble)
        plt.plot(fpr, tpr, label=ens_label, c=palette[0])

        plt.xlabel('False positive rate')
        plt.ylabel('True positive rate')
        plt.title('ROC curve')
        plt.legend(frameon=False)
        plt.show()
    
    # Plot ROC curves for the base learners and their simple average
    plot_roc_curve(y_test, P.values, P.mean(axis=1), list(P.columns), "ensemble")
    # Drop the worst learner (mlp-nn) and rescore the averaged ensemble.
    # BUG FIX: the original jammed two statements onto one line (a syntax
    # error) and used `ytest` where this section defines `y_test`.
    include = [c for c in P.columns if c not in ["mlp-nn"]]
    print("Truncated ensemble ROC-AUC score: %.3f" % roc_auc_score(y_test, P.loc[:, include].mean(axis=1)))
    # Visualise each model's predicted class share
    p = P.apply(lambda x: 1*(x >= 0.5).value_counts(normalize=True))
    p.index = ["DEM", "REP"]
    p.loc["REP", :].sort_values().plot(kind="bar")
    plt.axhline(0.25, color="k", linewidth=0.5)
    plt.text(0., 0.23, "True share republicans")
    plt.show()

    Stacking model

    1. Define the base models

    # Step 1: instantiate the library of base learners
    base_learners = get_models()

    2. Define the weighting (meta) model — the second-layer architecture

    # Step 2: the meta learner (second stage) that weights the base
    # learners' predictions.  Many estimators + small learning rate +
    # subsampling give a smooth, regularised fit.
    meta_learner = GradientBoostingClassifier(
        n_estimators=1000,
        loss="exponential",
        max_features=4,
        max_depth=3,
        subsample=0.5,
        learning_rate=0.005, 
        random_state=SEED
    )

    3. Split the base-model training data into two halves, reserving one mainly for the second layer

    # Step 3: split the training data in half — one half fits the base
    # learners, the other generates their predictions for the meta learner.
    # NOTE(review): `train_test_split` (sklearn.model_selection) is never
    # imported anywhere in this file — it must be imported before running.
    xtrain_base, xpred_base, ytrain_base, ypred_base = train_test_split(
        xtrain, ytrain, test_size=0.5, random_state=SEED)

    4. Train the base models

    def train_base_learners(base_learners, inp, out, verbose=True):
        """Fit every base learner in `base_learners` on (`inp`, `out`).

        Mutates the estimators in place; returns None.
        """
        if verbose: print("Fitting models.")
        # FIX: the original wrapped this in enumerate() but never used the
        # index — iterate the items directly.
        for name, m in base_learners.items():
            if verbose: print("%s..." % name, end=" ", flush=False)
            m.fit(inp, out)
            if verbose: print("done")
    # Step 4: fit the base learners on their half of the training data
    train_base_learners(base_learners, xtrain_base, ytrain_base)

    5. Prepare the training data for the second-stage weighting classifier

    def predict_base_learners(pred_base_learners, inp, verbose=True):
        """Build a prediction matrix: one column per base learner holding
        the positive-class probability for each row of `inp`."""
        n_rows, n_models = inp.shape[0], len(pred_base_learners)
        P = np.zeros((n_rows, n_models))

        if verbose: print("Generating base learner predictions.")
        for col, (name, model) in enumerate(pred_base_learners.items()):
            if verbose: print("%s..." % name, end=" ", flush=False)
            proba = model.predict_proba(inp)
            # With two classes, need only predictions for one class
            P[:, col] = proba[:, 1]
            if verbose: print("done")

        return P
    # Step 5: meta-learner training data = base predictions on the held-out half
    P_base = predict_base_learners(base_learners, xpred_base)

    6. Train the second stage and obtain the classification result

    # Step 6: fit the meta learner on the base-learner predictions
    meta_learner.fit(P_base, ypred_base)
    def ensemble_predict(base_learners, meta_learner, inp, verbose=True):
        """Generate predictions from the ensemble.

        Returns (P_pred, p): the base-learner prediction matrix and the
        meta learner's positive-class probability computed from it.
        """
        P_pred = predict_base_learners(base_learners, inp, verbose=verbose)
        return P_pred, meta_learner.predict_proba(P_pred)[:, 1]
    P_pred, p = ensemble_predict(base_learners, meta_learner, xtest)
    # BUG FIX: the original print literal contained a raw line break (a
    # syntax error in the scraped source); use an explicit escape.
    print("\nEnsemble ROC-AUC score: %.3f" % roc_auc_score(ytest, p))

    The procedure above sacrifices part of the data set, so below we switch to cross-validation.

    from sklearn.base import clone
    
    def stacking(base_learners, meta_learner, X, y, generator):
        """Simple training routine for stacking.

        Parameters:
            base_learners: dict of name -> unfitted estimator.
            meta_learner: estimator fit on cross-validated base predictions.
            X, y: full training arrays (numpy; X is indexed as X[idx, :]).
            generator: CV splitter providing .split(X), e.g. KFold.

        Returns (base_learners, meta_learner) with all models fitted.
        """
    
        # Train final base learners for test time
        print("Fitting final base learners...", end="")
        train_base_learners(base_learners, X, y, verbose=False)
        print("done")
    
        # Generate predictions for training meta learners
        # Outer loop: one pass per CV fold; per-fold clones keep the final
        # base learners fitted above untouched.
        print("Generating cross-validated predictions...")
        cv_preds, cv_y = [], []
        for i, (train_idx, test_idx) in enumerate(generator.split(X)):
    
            fold_xtrain, fold_ytrain = X[train_idx, :], y[train_idx]
            fold_xtest, fold_ytest = X[test_idx, :], y[test_idx]
    
            # Inner loop: step 4 and 5
            fold_base_learners = {name: clone(model)
                                  for name, model in base_learners.items()}
            train_base_learners(
                fold_base_learners, fold_xtrain, fold_ytrain, verbose=False)
    
            fold_P_base = predict_base_learners(
                fold_base_learners, fold_xtest, verbose=False)
    
            cv_preds.append(fold_P_base)
            cv_y.append(fold_ytest)
            print("Fold %i done" % (i + 1))
    
        print("CV-predictions done")
        
        # Be careful to get rows in the right order: predictions and labels
        # are stacked fold by fold, so they stay aligned with each other.
        cv_preds = np.vstack(cv_preds)
        cv_y = np.hstack(cv_y)
    
        # Train meta learner
        print("Fitting meta learner...", end="")
        meta_learner.fit(cv_preds, cv_y)
        print("done")
    
        return base_learners, meta_learner
    from sklearn.model_selection import KFold
    
    # Train with stacking (2-fold CV builds the meta-learner training set)
    cv_base_learners, cv_meta_learner = stacking(
        get_models(), clone(meta_learner), xtrain.values, ytrain.values, KFold(2))
    
    P_pred, p = ensemble_predict(cv_base_learners, cv_meta_learner, xtest, verbose=False)
    # BUG FIX: the original print literal contained a raw line break (a
    # syntax error in the scraped source); use an explicit escape.
    print("\nEnsemble ROC-AUC score: %.3f" % roc_auc_score(ytest, p))

    Using parallelism to improve efficiency:

    from mlens.ensemble import SuperLearner
    
    # Instantiate the ensemble with 10 folds
    sl = SuperLearner(
        folds=10,
        random_state=SEED,
        verbose=2,
        backend="multiprocessing"
    )
    
    # Add the base learners and the meta learner
    sl.add(list(base_learners.values()), proba=True) 
    sl.add_meta(meta_learner, proba=True)
    
    # Train the ensemble
    sl.fit(xtrain, ytrain)
    
    # Predict the test set
    p_sl = sl.predict_proba(xtest)
    
    # BUG FIX: the original print literal contained a raw line break (a
    # syntax error in the scraped source); use an explicit escape.
    print("\nSuper Learner ROC-AUC score: %.3f" % roc_auc_score(ytest, p_sl[:, 1]))
  • 相关阅读:
    http://blog.csdn.net/steveguoshao/article/details/38414145
    http://www.tuicool.com/articles/EjMJNz
    http://jingyan.baidu.com/article/7f41ecec1b7a2e593d095ce6.html
    Linux 查看当前时间和修改系统时间
    http://m.blog.csdn.net/article/details?id=49132747
    http://www.cnblogs.com/nick-huang/p/4848843.html
    javaScript事件(一)事件流
    jQuery选择器
    超链接a的target属性
    html基础总结版
  • 原文地址:https://www.cnblogs.com/wangzhenghua/p/11240359.html
Copyright © 2011-2022 走看看