zoukankan      html  css  js  c++  java
  • 数据挖掘竞赛常用代码段

    常用库

    import gc
    import os
    import csv
    import time
    import math
    import datetime
    import collections
    import pandas as pd
    import numpy as np
    from tqdm import tqdm, tqdm_notebook, trange
    from sklearn import preprocessing
    
    import lightgbm as lgb
    import xgboost as xgb
    import catboost as cb
    from sklearn.model_selection import train_test_split
    from sklearn.model_selection import StratifiedKFold
    from sklearn.metrics import roc_curve, auc, roc_auc_score
    import matplotlib.pyplot as plt
    import seaborn as sns
    

    降低内存

    def reduce_mem_usage(df):
        """Downcast every column of *df* to the smallest dtype that can hold
        its observed value range, to reduce memory usage.

        Integer columns are narrowed based on their min/max (int64 -> int8/16/32),
        float columns likewise (float64 -> float16/32), and object columns are
        converted to 'category'.  Memory usage is printed before and after.

        NOTE: float16 carries only ~3 significant decimal digits, so the float
        downcast can lose precision — remove that branch if exact values matter.
        Columns are modified in place; the same frame is also returned.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame to optimize (mutated in place).

        Returns
        -------
        pandas.DataFrame
            The same frame with downcast dtypes.
        """
        start_mem = df.memory_usage().sum() / 1024**2
        print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

        for col in df.columns:
            col_type = df[col].dtype

            if col_type != object:
                c_min = df[col].min()
                c_max = df[col].max()
                if str(col_type)[:3] == 'int':
                    # BUG FIX: the original used strict '<'/'>' comparisons, so a
                    # column whose extremes exactly equal a dtype's limits (e.g.
                    # max == 127) skipped the tightest dtype.  Inclusive bounds
                    # also make the final int64 branch unconditional.
                    if c_min >= np.iinfo(np.int8).min and c_max <= np.iinfo(np.int8).max:
                        df[col] = df[col].astype(np.int8)
                    elif c_min >= np.iinfo(np.int16).min and c_max <= np.iinfo(np.int16).max:
                        df[col] = df[col].astype(np.int16)
                    elif c_min >= np.iinfo(np.int32).min and c_max <= np.iinfo(np.int32).max:
                        df[col] = df[col].astype(np.int32)
                    else:
                        df[col] = df[col].astype(np.int64)
                else:
                    # NaN compares False against both bounds, so all-NaN or
                    # NaN-extreme columns fall through to float64 unchanged.
                    if c_min >= np.finfo(np.float16).min and c_max <= np.finfo(np.float16).max:
                        df[col] = df[col].astype(np.float16)
                    elif c_min >= np.finfo(np.float32).min and c_max <= np.finfo(np.float32).max:
                        df[col] = df[col].astype(np.float32)
                    else:
                        df[col] = df[col].astype(np.float64)
            else:
                # Object columns (strings) become categoricals, which store each
                # distinct value once.
                df[col] = df[col].astype('category')

        end_mem = df.memory_usage().sum() / 1024**2
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

        return df
    

    常用统计特征

    一阶

     tmp_df = use_Mb_info_n.groupby("user_id").agg({
            "x1": ["sum","max","min","mean"],
            "x2": ["count","nunique"],
            "x3":["nunique"],
            "x4":["sum"]
        })
    tmp_df.columns = ['_'.join(str(xx) for xx in x) for x in tmp_df.columns.ravel()]
    tmp_df = tmp_df.reset_index()
    feature_df = pd.merge(feature_df, tmp_df, how='left', on='user_id')
    

    二阶

    # Second-order statistics: aggregate per (user_id, date), then pivot the
    # date level into columns so each user becomes a single wide row.
    tmp_df = train_data.groupby(["user_id", "date"]).agg({
            "x1": ["sum", "mean", "max", "skew", pd.DataFrame.kurt],
            "x2": ["nunique"],
            "x3": ["sum"],
            "x4": ["sum"]
        })
    # Flatten (column, statistic) MultiIndex -> "col_stat".
    tmp_df.columns = ["_".join((str(xx) for xx in x)) for x in tmp_df.columns.ravel()]
    # Move the date index level into the columns (one column per stat per date).
    tmp_df = tmp_df.unstack(level=-1)
    # Flatten again: "col_stat_date".
    tmp_df.columns = ["_".join((str(xx) for xx in x)) for x in tmp_df.columns.ravel()]
    tmp_df = tmp_df.reset_index()
    # BUG FIX: after reset_index() the key column is "user_id" (the groupby
    # key), not "uid" — merging on "uid" raises KeyError since tmp_df has no
    # such column.  This also matches the first-order snippet above.
    feature_df = pd.merge(feature_df, tmp_df, how='left', on='user_id')
    

    画图

    协方差

    def correlation_heatmap(df):
        """Plot an annotated heatmap of the pairwise Pearson correlations
        between the columns of *df*."""
        _, axis = plt.subplots(figsize=(50, 50))
        # Diverging palette: red/blue extremes around a neutral midpoint.
        palette = sns.diverging_palette(220, 10, as_cmap=True)

        sns.heatmap(
            df.corr(),
            cmap=palette,
            square=True,
            cbar_kws={'shrink': .9},
            ax=axis,
            annot=True,
            linewidths=0.1,
            vmax=1.0,
            linecolor='white',
            annot_kws={'fontsize': 12},
        )

        plt.title('Pearson Correlation of Features', y=1.05, size=15)

    correlation_heatmap(your_df)
    

    正常显示中文

    plt.rcParams['font.sans-serif']=['SimHei'] # use the SimHei font so CJK labels render correctly
    plt.rcParams['axes.unicode_minus']=False #render the minus sign correctly when a CJK font is active
    

    大小

    plt.rcParams['figure.figsize'] = (10,5)  # default figure size in inches (width, height)
    plt.rcParams['figure.dpi'] = 200  # default resolution in dots per inch
    

    count条状图

    sns.countplot(y="店铺名称", data=df, color="c")  # horizontal count bars, one per value of the "店铺名称" (shop name) column
    

    训练

    lgb

    n_splits = 5    # number of CV folds
    seed = 19950115     # random seed so the fold split is reproducible
    gbm = None
    # LightGBM parameters
    lgb_params = {
        "learning_rate": 0.005,
        "lambda_l2": 0.15,
        "max_depth": 9,
        "objective": "binary",
        "verbose": -1,
        # 'feature_fraction': 0.9,
        # "min_split_gain": 0.1,
        "boosting_type": "gbdt",
        "subsample": 0.75,
        "colsample_bytree": 0.75,
        # "colsample_bylevel": 0.9,
        "scale_pos_weight": 16,
        'metric': ['auc'],  # evaluation metric
    }

    # Every column except the label and id columns is used as a feature.
    df_train_columns = [c for c in data.columns if c not in ["label", "uid", "user_id"]]
    label = data['label']

    # Out-of-fold predictions: each row is predicted exactly once, by the model
    # of the fold in which it was held out.
    # BUG FIX: the original accumulated `y_pred.T[0] / skf.n_splits`; for the
    # 1-D probability array a binary objective returns, `.T[0]` selects a single
    # scalar (the first validation row), so `predictions` ended up a meaningless
    # scalar instead of per-row out-of-fold scores.
    predictions = np.zeros(len(data))
    feature_importance_df = pd.DataFrame()
    skf = StratifiedKFold(n_splits=n_splits, random_state=seed, shuffle=True)

    for fold_, (trn_idx, val_idx) in enumerate(skf.split(data, label.values)):
        print("fold {}".format(fold_))
        trn_data = lgb.Dataset(data.iloc[trn_idx][df_train_columns], label=label.iloc[trn_idx])
        val_data = lgb.Dataset(data.iloc[val_idx][df_train_columns], label=label.iloc[val_idx])

        # Train with early stopping on the validation fold.
        gbm = lgb.train(lgb_params,
                        trn_data,
                        # init_model=gbm,   # warm-start from the previous fold if desired
                        num_boost_round=150000,
                        valid_sets=[trn_data, val_data],
                        early_stopping_rounds=200,
                        verbose_eval=200)
        # joblib.dump(gbm, "model/lgb_{}.m".format(fold_))    # save model
        # gbm = joblib.load("model/lgb_{}.m".format(fold_))   # load model
        # gbm.save_model(MODEL_PATH+'/lgb_more_fea.model', num_iteration=gbm.best_iteration)
        y_pred = gbm.predict(data.iloc[val_idx][df_train_columns], num_iteration=gbm.best_iteration)

        # Per-fold feature importances, stacked for later averaging/plotting.
        fold_importance_df = pd.DataFrame()
        fold_importance_df["Feature"] = df_train_columns
        fold_importance_df["importance"] = gbm.feature_importance()
        fold_importance_df["fold"] = fold_ + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

        predictions[val_idx] = y_pred
    

    画特征重要性

    # Average each feature's importance over the folds, keep the top 1000.
    mean_importance = (feature_importance_df[["Feature", "importance"]]
                       .groupby("Feature")
                       .mean())
    cols = mean_importance.sort_values(by="importance", ascending=False)[:1000].index

    # All per-fold rows for those top features (so the barplot shows spread).
    best_features = feature_importance_df.loc[feature_importance_df.Feature.isin(cols)]

    plt.figure(figsize=(14, 26))
    sns.barplot(x="importance",
                y="Feature",
                data=best_features.sort_values(by="importance", ascending=False))
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    

    画树

    # Render one tree of the trained booster on a very large canvas so the
    # node labels stay legible.
    fig, ax = plt.subplots(figsize=(100, 100))
    lgb.plot_tree(gbm, ax=ax)
    

    xgb

    # Gradient-boosted classifier via XGBoost's sklearn wrapper.
    # BUG FIX: the original passed `sub_sample=0.75`; XGBoost's parameter is
    # spelled `subsample`, so the typo was silently ignored (newer versions
    # reject it) and row subsampling never actually happened.
    xgb1 = xgb.XGBClassifier(max_depth=9,
                             learning_rate=0.005,
                             n_estimators=10000,
                             colsample_bytree=0.75,
                             subsample=0.75,
                             reg_lambda=0.15,
                             n_jobs=4,
                             random_state=3,
                             scale_pos_weight=16)

    # Every column except the label and id columns is used as a feature.
    df_train_columns = [c for c in data.columns if c not in ["label", "uid", "user_id"]]
    label = data['label']

    n_splits = 5    # number of CV folds
    seed = 19950115     # random seed so the fold split is reproducible
    skf = StratifiedKFold(n_splits=n_splits, random_state=seed, shuffle=True)

    for fold_, (trn_idx, val_idx) in enumerate(skf.split(data, label.values)):
        print("fold {}".format(fold_))
        X_train = data.iloc[trn_idx][df_train_columns]
        y_train = label.iloc[trn_idx]
        X_valid = data.iloc[val_idx][df_train_columns]
        y_valid = label.iloc[val_idx]

        # Both splits are evaluated; early stopping watches the last entry.
        watchlist = [(X_train, y_train), (X_valid, y_valid)]

        xbm = xgb1.fit(X=X_train,
                       y=y_train,
                       eval_set=watchlist,
                       early_stopping_rounds=200,
                       verbose=100,
                       eval_metric='auc')     # train
    

    画特征重要性

    # Plot the fitted model's feature importances as horizontal bars.
    fig, ax = plt.subplots(figsize=(14, 26))
    xgb.plot_importance(xbm, ax=ax, height=0.3)
    

    画树

    # BUG FIX: the original called plot_tree on `clf`, a name never defined in
    # this document — the fitted model from the training loop above is `xbm`.
    # NOTE(review): fmap='xgb.fmap' assumes a feature-map file exists at that
    # path; omit the argument to use the model's own feature names.
    xgb.plot_tree(xbm, num_trees=0, fmap='xgb.fmap')
    fig = plt.gcf()
    fig.set_size_inches(150, 100)
    plt.show()
    
  • 相关阅读:
    C#中的Singleton模式
    CodeLib
    Google Chats 居然和Gmail集成了...
    Windows中OSG环境搭建
    Socket中winsock.h和winsock2.h的不同
    高斯日记 蓝桥杯
    MATLAB矩阵处理
    马虎的算式 蓝桥杯
    MATLAB基础
    矩阵相乘的一维数组实现
  • 原文地址:https://www.cnblogs.com/harrylyx/p/14026844.html
Copyright © 2011-2022 走看看