zoukankan      html  css  js  c++  java
  • 数据挖掘竞赛常用代码段

    常用库

    import gc
    import os
    import csv
    import time
    import math
    import datetime
    import collections
    import pandas as pd
    import numpy as np
    from tqdm import tqdm, tqdm_notebook, trange
    from sklearn import preprocessing
    
    import lightgbm as lgb
    import xgboost as xgb
    import catboost as cb
    from sklearn.model_selection import train_test_split
    from sklearn.model_selection import StratifiedKFold
    from sklearn.metrics import roc_curve, auc, roc_auc_score
    import matplotlib.pyplot as plt
    import seaborn as sns
    

    降低内存

    def reduce_mem_usage(df):
        """ iterate through all the columns of a dataframe and modify the data type
            to reduce memory usage.        
        """
        start_mem = df.memory_usage().sum() / 1024**2
        print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
        
        for col in df.columns:
            col_type = df[col].dtype
            
            if col_type != object:
                c_min = df[col].min()
                c_max = df[col].max()
                if str(col_type)[:3] == 'int':
                    if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                        df[col] = df[col].astype(np.int8)
                    elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                        df[col] = df[col].astype(np.int16)
                    elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                        df[col] = df[col].astype(np.int32)
                    elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                        df[col] = df[col].astype(np.int64)  
                else:
                    if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                        df[col] = df[col].astype(np.float16)
                    elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                        df[col] = df[col].astype(np.float32)
                    else:
                        df[col] = df[col].astype(np.float64)
            else:
                df[col] = df[col].astype('category')
    
        end_mem = df.memory_usage().sum() / 1024**2
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
        
        return df
    

    常用统计特征

    一阶

     tmp_df = use_Mb_info_n.groupby("user_id").agg({
            "x1": ["sum","max","min","mean"],
            "x2": ["count","nunique"],
            "x3":["nunique"],
            "x4":["sum"]
        })
    tmp_df.columns = ['_'.join(str(xx) for xx in x) for x in tmp_df.columns.ravel()]
    tmp_df = tmp_df.reset_index()
    feature_df = pd.merge(feature_df, tmp_df, how='left', on='user_id')
    

    二阶

    tmp_df = train_data.groupby(["user_id", "date"]).agg({
            "x1": ["sum", "mean", "max", "skew", pd.DataFrame.kurt],
            "x2": ["nunique"],
            "x3": ["sum"],
            "x4": ["sum"]
        })
    tmp_df.columns = ["_".join((str(xx) for xx in x)) for x in tmp_df.columns.ravel()]
    tmp_df = tmp_df.unstack(level=-1)
    tmp_df.columns = ["_".join((str(xx) for xx in x)) for x in tmp_df.columns.ravel()]
    tmp_df = tmp_df.reset_index()
    feature_df = pd.merge(feature_df, tmp_df, how='left', on='uid')
    

    画图

    协方差

    def correlation_heatmap(df):
        _ , ax = plt.subplots(figsize =(50, 50))
        colormap = sns.diverging_palette(220, 10, as_cmap = True)
        
        _ = sns.heatmap(
            df.corr(), 
            cmap = colormap,
            square=True, 
            cbar_kws={'shrink':.9 }, 
            ax=ax,
            annot=True, 
            linewidths=0.1,vmax=1.0, linecolor='white',
            annot_kws={'fontsize':12 }
        )
        
        plt.title('Pearson Correlation of Features', y=1.05, size=15)
    
    correlation_heatmap(your_df)
    

    正常显示中文

plt.rcParams['font.sans-serif']=['SimHei'] # use the SimHei font so Chinese labels render correctly
plt.rcParams['axes.unicode_minus']=False # render the minus sign correctly when using a CJK font
    

    大小

plt.rcParams['figure.figsize'] = (10,5)  # default figure size in inches (width, height)
plt.rcParams['figure.dpi'] = 200  # default figure resolution in dots per inch
    

    count条状图

# Horizontal count bar chart; y names the column to count (here a Chinese
# column meaning "shop name") and color "c" is matplotlib cyan.
sns.countplot(y="店铺名称", data=df, color="c")
    

    训练

    lgb

    n_splits = 5    # 分为5折
    seed = 19950115     # 随机种子
    gbm=None
    # lgb 参数
    lgb_params = {
        "learning_rate": 0.005,
        "lambda_l2": 0.15,
        "max_depth": 9,
        "objective": "binary",
        "verbose": -1,
        # 'feature_fraction': 0.9,
        # "min_split_gain": 0.1,
        "boosting_type": "gbdt",
        "subsample": 0.75,
        "colsample_bytree": 0.75,
        # "colsample_bylevel": 0.9,
        "scale_pos_weight": 16,
        'metric': ['auc'],  # 评估函数
    }
    
    df_train_columns = [c for c in data.columns if c not in ["label", "uid", "user_id"]]
    label = data['label']
    
    predictions = 0
    feature_importance_df = pd.DataFrame()
    skf = StratifiedKFold(n_splits=n_splits, random_state=seed, shuffle=True)
    
    for fold_, (trn_idx, val_idx) in enumerate(skf.split(data, label.values)):
        print("fold {}".format(fold_))
        trn_data = lgb.Dataset(data.iloc[trn_idx][df_train_columns], label=label.iloc[trn_idx])
        val_data = lgb.Dataset(data.iloc[val_idx][df_train_columns], label=label.iloc[val_idx])
    
        gbm = lgb.train(lgb_params, 
                        trn_data, 
                        # init_model=gbm,  
                        num_boost_round=150000, 
                        valid_sets=[trn_data, val_data],
                        early_stopping_rounds=200, 
                        verbose_eval=200)     # 训练
        # clf = joblib.load("model/lgb_{}.m".format(index))     # 保存模型
        # joblib.dump(clf, "model/lgb_{}.m".format(index))      # 加载模型
        # gbm.save_model(MODEL_PATH+'/lgb_more_fea.model', num_iteration=gbm.best_iteration)
        y_pred = gbm.predict(data.iloc[val_idx][df_train_columns], num_iteration=gbm.best_iteration)
        # qauc_score = qauc(y_pred, data.iloc[val_idx][df_train_columns], label.iloc[val_idx])
        # print("qauc: ", qauc_score)
        # y_score.append(qauc_score)  # 计算auc值
        
        fold_importance_df = pd.DataFrame()
        fold_importance_df["Feature"] = df_train_columns
        fold_importance_df["importance"] = gbm.feature_importance()
        fold_importance_df["fold"] = fold_ + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        
        predictions += y_pred.T[0] / skf.n_splits
    

    画特征重要性

    cols = (feature_importance_df[["Feature", "importance"]]
            .groupby("Feature")
            .mean()
            .sort_values(by="importance", ascending=False)[:1000].index)
    
    best_features = feature_importance_df.loc[feature_importance_df.Feature.isin(cols)]
    
    plt.figure(figsize=(14,26))
    sns.barplot(x="importance",
                y="Feature",
                data=best_features.sort_values(by="importance",
                                               ascending=False))
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    

    画树

# Render one tree of the trained LightGBM model on a very large canvas.
fig, ax = plt.subplots(figsize=(100, 100))
lgb.plot_tree(gbm, ax=ax)
    

    xgb

    xgb1 = xgb.XGBClassifier(max_depth=9,
                           learning_rate=0.005,
                           n_estimators=10000,
                           colsample_bytree=0.75,
                           sub_sample=0.75,
                           reg_lambda=0.15,
                           n_jobs=4,
                           random_state=3,
                           scale_pos_weight = 16)
    
    df_train_columns = [c for c in data.columns if c not in ["label", "uid", "user_id"]]
    label = data['label']
    
    n_splits = 5    # 分为5折
    seed = 19950115     # 随机种子
    skf = StratifiedKFold(n_splits=n_splits, random_state=seed, shuffle=True)
    
    for fold_, (trn_idx, val_idx) in enumerate(skf.split(data, label.values)):
        print("fold {}".format(fold_))
        X_train = data.iloc[trn_idx][df_train_columns]
        y_train = label.iloc[trn_idx]
        X_valid = data.iloc[val_idx][df_train_columns]
        y_valid = label.iloc[val_idx]
        
        watchlist = [(X_train,y_train),(X_valid,y_valid)]
    
        xbm = xgb1.fit(
                    X=X_train,
                   y=y_train,
                    eval_set  = watchlist,
                    early_stopping_rounds=200,
                    verbose =100,
                    eval_metric='auc',
                    )     # 训练
    

    画特征重要性

# Plot the fitted XGBoost model's feature importances on a tall canvas.
fig, ax = plt.subplots(figsize=(14, 26))
xgb.plot_importance(xbm, ax=ax, height=0.3)
    

    画树

    xgb.plot_tree(clf, num_trees=0, fmap='xgb.fmap')
    fig = plt.gcf()
    fig.set_size_inches(150, 100)
    plt.show()
    
  • 相关阅读:
    Codeforces 992C(数学)
    Codeforces 990C (思维)
    Codeforces 989C (构造)
    POJ 1511 Invitation Cards(链式前向星,dij,反向建边)
    Codeforces 1335E2 Three Blocks Palindrome (hard version)(暴力)
    POJ 3273 Monthly Expense(二分)
    POJ 2566 Bound Found(尺取前缀和)
    POJ 1321 棋盘问题(dfs)
    HDU 1506 Largest Rectangle in a Histogram(单调栈)
    POJ 2823 Sliding Window(单调队列)
  • 原文地址:https://www.cnblogs.com/harrylyx/p/14026844.html
Copyright © 2011-2022 走看看