  • Kaggle: NFL Big Data Bowl

    neural networks + feature engineering for the win

    Import the required libraries

    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    import matplotlib as mpl
    import seaborn as sns
    import datetime
    from kaggle.competitions import nflrush
    import tqdm
    import re
    from string import punctuation
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.preprocessing import StandardScaler
    import keras
    from keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, EarlyStopping
    from keras.utils import plot_model
    import keras.backend as K
    import tensorflow as tf
    
    sns.set_style('darkgrid')
    mpl.rcParams['figure.figsize'] = [15,10]
    
    env = nflrush.make_env()
    
    train = pd.read_csv('../input/nfl-big-data-bowl-2020/train.csv', dtype={'WindSpeed': 'object'})
    

    Overall analysis

    train.head()
    

    Feature engineering

    #from https://www.kaggle.com/prashantkikani/nfl-starter-lgb-feature-engg
    train['DefendersInTheBox_vs_Distance'] = train['DefendersInTheBox'] / train['Distance']
    

    Categorical features

    cat_features = []
    for col in train.columns:
        if train[col].dtype =='object':
            cat_features.append((col, len(train[col].unique())))
    
    cat_features
    

    Let's preprocess some of these features.

    Stadium type

    train['StadiumType'].value_counts()
    

    We can already see some typos; let's fix them.

    def clean_StadiumType(txt):
        if pd.isna(txt):
            return np.nan
        txt = txt.lower()
        txt = ''.join([c for c in txt if c not in punctuation])
        txt = re.sub(' +', ' ', txt)
        txt = txt.strip()
        txt = txt.replace('outside', 'outdoor')
        txt = txt.replace('outdor', 'outdoor')
        txt = txt.replace('outddors', 'outdoor')
        txt = txt.replace('outdoors', 'outdoor')
        txt = txt.replace('oudoor', 'outdoor')
        txt = txt.replace('indoors', 'indoor')
        txt = txt.replace('ourdoor', 'outdoor')
        txt = txt.replace('retractable', 'rtr.')
        return txt
    
    train['StadiumType'] = train['StadiumType'].apply(clean_StadiumType)
    

    Following the Pareto principle, we only care about these words: outdoor, indoor, closed and open.

    def transform_StadiumType(txt):
        if pd.isna(txt):
            return np.nan
        if 'outdoor' in txt or 'open' in txt:
            return 1
        if 'indoor' in txt or 'closed' in txt:
            return 0
        return np.nan
    
    train['StadiumType'] = train['StadiumType'].apply(transform_StadiumType)
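
    As an optional sanity check, the encoded column should now only contain 1, 0 and NaN:

    train['StadiumType'].value_counts(dropna=False)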
    

    Turf

    #from https://www.kaggle.com/c/nfl-big-data-bowl-2020/discussion/112681#latest-649087
    Turf = {'Field Turf':'Artificial', 
            'A-Turf Titan':'Artificial', 
            'Grass':'Natural', 
            'UBU Sports Speed S5-M':'Artificial', 
            'Artificial':'Artificial', 
            'DD GrassMaster':'Artificial', 
            'Natural Grass':'Natural', 
            'UBU Speed Series-S5-M':'Artificial', 
            'FieldTurf':'Artificial', 
            'FieldTurf 360':'Artificial', 
            'Natural grass':'Natural', 
            'grass':'Natural', 
            'Natural':'Natural', 
            'Artifical':'Artificial', 
            'FieldTurf360':'Artificial', 
            'Naturall Grass':'Natural', 
            'Field turf':'Artificial', 
            'SISGrass':'Artificial', 
            'Twenty-Four/Seven Turf':'Artificial', 
            'natural grass':'Natural'} 
    
    train['Turf'] = train['Turf'].map(Turf)
    train['Turf'] = train['Turf'] == 'Natural'
    

    Possession team

    train[(train['PossessionTeam']!=train['HomeTeamAbbr']) & (train['PossessionTeam']!=train['VisitorTeamAbbr'])][['PossessionTeam', 'HomeTeamAbbr', 'VisitorTeamAbbr']]
    

    There are some issues with team abbreviations such as BLT vs. BAL and ARZ vs. ARI. Let's fix them.

    sorted(train['HomeTeamAbbr'].unique()) == sorted(train['VisitorTeamAbbr'].unique())
    
    diff_abbr = []
    for x,y  in zip(sorted(train['HomeTeamAbbr'].unique()), sorted(train['PossessionTeam'].unique())):
        if x!=y:
            print(x + " " + y)
    

    There are three mismatches here; let's fix them.

    map_abbr = {'ARI': 'ARZ', 'BAL': 'BLT', 'CLE': 'CLV', 'HOU': 'HST'}
    for abb in train['PossessionTeam'].unique():
        map_abbr[abb] = abb
    
    train['PossessionTeam'] = train['PossessionTeam'].map(map_abbr)
    train['HomeTeamAbbr'] = train['HomeTeamAbbr'].map(map_abbr)
    train['VisitorTeamAbbr'] = train['VisitorTeamAbbr'].map(map_abbr)
    
    train['HomePossesion'] = train['PossessionTeam'] == train['HomeTeamAbbr']
    
    train['Field_eq_Possession'] = train['FieldPosition'] == train['PossessionTeam']
    train['HomeField'] = train['FieldPosition'] == train['HomeTeamAbbr']
    

    Offense formation

    off_form = train['OffenseFormation'].unique()
    train['OffenseFormation'].value_counts()
    

    Since I don't have any domain knowledge about formations, I'll just one-hot encode this feature.

    train = pd.concat([train.drop(['OffenseFormation'], axis=1), pd.get_dummies(train['OffenseFormation'], prefix='Formation')], axis=1)
    dummy_col = train.columns
    

    Game clock

    Game clock is supposed to be a numerical feature.

    train['GameClock'].value_counts()
    

    We already have the Quarter feature, so the game clock only needs to describe the time within a quarter, which normally lasts 15 minutes.

    def strtoseconds(txt):
        txt = txt.split(':')
        ans = int(txt[0])*60 + int(txt[1]) + int(txt[2])/60
        return ans
    
    train['GameClock'] = train['GameClock'].apply(strtoseconds)
    
    sns.distplot(train['GameClock'])
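
    As a quick illustration (a sketch, assuming the MM:SS:ss clock format seen above), '14:00:00' maps to 14*60 + 0 + 0/60 = 840 seconds:

    strtoseconds('14:00:00')  # 840.0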
    

    Player height

    train['PlayerHeight']
    

    1 ft = 12 in, so:

    train['PlayerHeight'] = train['PlayerHeight'].apply(lambda x: 12*int(x.split('-')[0])+int(x.split('-')[1]))
    
    train['PlayerBMI'] = 703*(train['PlayerWeight']/(train['PlayerHeight'])**2)
    

    Time handoff, time snap and player birth date

    train['TimeHandoff']
    
    train['TimeHandoff'] = train['TimeHandoff'].apply(lambda x: datetime.datetime.strptime(x, "%Y-%m-%dT%H:%M:%S.%fZ"))
    train['TimeSnap'] = train['TimeSnap'].apply(lambda x: datetime.datetime.strptime(x, "%Y-%m-%dT%H:%M:%S.%fZ"))
    
    train['TimeDelta'] = train.apply(lambda row: (row['TimeHandoff'] - row['TimeSnap']).total_seconds(), axis=1)
    
    train['PlayerBirthDate'] = train['PlayerBirthDate'].apply(lambda x: datetime.datetime.strptime(x, "%m/%d/%Y"))
    

    Let's use the handoff time to compute each player's age.

    seconds_in_year = 60*60*24*365.25
    train['PlayerAge'] = train.apply(lambda row: (row['TimeHandoff']-row['PlayerBirthDate']).total_seconds()/seconds_in_year, axis=1)
    
    train = train.drop(['TimeHandoff', 'TimeSnap', 'PlayerBirthDate'], axis=1)
    

    Wind speed and wind direction

    train['WindSpeed'].value_counts()
    

    Some values are not standardized (e.g. 12mph), so let's strip 'mph' from all of them.

    train['WindSpeed'] = train['WindSpeed'].apply(lambda x: x.lower().replace('mph', '').strip() if not pd.isna(x) else x)
    
    train['WindSpeed'].value_counts()
    
    #let's replace the ones that has x-y by (x+y)/2
    # and also the ones with x gusts up to y
    train['WindSpeed'] = train['WindSpeed'].apply(lambda x: (int(x.split('-')[0])+int(x.split('-')[1]))/2 if not pd.isna(x) and '-' in x else x)
    train['WindSpeed'] = train['WindSpeed'].apply(lambda x: (int(x.split()[0])+int(x.split()[-1]))/2 if not pd.isna(x) and type(x)!=float and 'gusts up to' in x else x)
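
    A quick check of the two rules above, with made-up values (a sketch):

    # '10-20'             -> (10 + 20) / 2 = 15.0
    # '15 gusts up to 25' -> (15 + 25) / 2 = 20.0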
    
    def str_to_float(txt):
        try:
            return float(txt)
        except:
            return -1
    
    train['WindSpeed'] = train['WindSpeed'].apply(str_to_float)
    
    train['WindDirection'].value_counts()
    
    def clean_WindDirection(txt):
        if pd.isna(txt):
            return np.nan
        txt = txt.lower()
        txt = ''.join([c for c in txt if c not in punctuation])
        txt = txt.replace('from', '')
        txt = txt.replace(' ', '')
        txt = txt.replace('north', 'n')
        txt = txt.replace('south', 's')
        txt = txt.replace('west', 'w')
        txt = txt.replace('east', 'e')
        return txt
    
    train['WindDirection'] = train['WindDirection'].apply(clean_WindDirection)
    
    train['WindDirection'].value_counts()
    
    def transform_WindDirection(txt):
        if pd.isna(txt):
            return np.nan
        if txt=='n':
            return 0
        if txt=='nne' or txt=='nen':
            return 1/8
        if txt=='ne':
            return 2/8
        if txt=='ene' or txt=='nee':
            return 3/8
        if txt=='e':
            return 4/8
        if txt=='ese' or txt=='see':
            return 5/8
        if txt=='se':
            return 6/8
        if txt=='ses' or txt=='sse':
            return 7/8
        if txt=='s':
            return 8/8
        if txt=='ssw' or txt=='sws':
            return 9/8
        if txt=='sw':
            return 10/8
        if txt=='sww' or txt=='wsw':
            return 11/8
        if txt=='w':
            return 12/8
        if txt=='wnw' or txt=='nww':
            return 13/8
        if txt=='nw':
            return 14/8
        if txt=='nwn' or txt=='nnw':
            return 15/8
        return np.nan
    
    train['WindDirection'] = train['WindDirection'].apply(transform_WindDirection)
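
    An equivalent, more compact way to build this mapping, shown only as a sketch (the misspelled variants such as 'nen', 'nee', 'see', 'ses', 'sws', 'sww', 'nww' and 'nwn' would still need separate aliases):

    compass = ['n', 'nne', 'ne', 'ene', 'e', 'ese', 'se', 'sse',
               's', 'ssw', 'sw', 'wsw', 'w', 'wnw', 'nw', 'nnw']
    wind_map = {d: i/8 for i, d in enumerate(compass)}  # 'n' -> 0.0, 'e' -> 0.5, 's' -> 1.0, ...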
    

    PlayDirection

    train['PlayDirection'].value_counts()
    
    train['PlayDirection'] = train['PlayDirection'].apply(lambda x: x.strip() == 'right')
    

    Team

    train['Team'] = train['Team'].apply(lambda x: x.strip()=='home')
    

    Game weather

    train['GameWeather'].unique()
    

    We apply the following preprocessing:

    • convert everything to lowercase
    • group N/A Indoor, N/A (Indoors) and Indoor together => indoor
    • coudy and clouidy => cloudy
    • party => partly
    • clear and sunny => sunny and clear
    • skies and mostly => ""

    train['GameWeather'] = train['GameWeather'].str.lower()
    indoor = "indoor"
    train['GameWeather'] = train['GameWeather'].apply(lambda x: indoor if not pd.isna(x) and indoor in x else x)
    train['GameWeather'] = train['GameWeather'].apply(lambda x: x.replace('coudy', 'cloudy').replace('clouidy', 'cloudy').replace('party', 'partly') if not pd.isna(x) else x)
    train['GameWeather'] = train['GameWeather'].apply(lambda x: x.replace('clear and sunny', 'sunny and clear') if not pd.isna(x) else x)
    train['GameWeather'] = train['GameWeather'].apply(lambda x: x.replace('skies', '').replace("mostly", "").strip() if not pd.isna(x) else x)
    
    train['GameWeather'].unique()
    

    Let's look at the most common words in the weather descriptions.

    from collections import Counter
    weather_count = Counter()
    for weather in train['GameWeather']:
        if pd.isna(weather):
            continue
        for word in weather.split():
            weather_count[word]+=1
            
    weather_count.most_common()[:15]
    

    To encode the weather, we apply the following mapping:

    • climate controlled or indoor => 3, sunny or sun => 2, clear => 1, cloudy => -1, rain => -2, snow => -3, others => 0
    • partly => multiply by 0.5

    def map_weather(txt):
        ans = 1
        if pd.isna(txt):
            return 0
        if 'partly' in txt:
            ans*=0.5
        if 'climate controlled' in txt or 'indoor' in txt:
            return ans*3
        if 'sunny' in txt or 'sun' in txt:
            return ans*2
        if 'clear' in txt:
            return ans
        if 'cloudy' in txt:
            return -ans
        if 'rain' in txt or 'rainy' in txt:
            return -2*ans
        if 'snow' in txt:
            return -3*ans
        return 0
    
    train['GameWeather'] = train['GameWeather'].apply(map_weather)
    

    NflId NflIdRusher

    train['IsRusher'] = train['NflId'] == train['NflIdRusher']
    
    train.drop(['NflId', 'NflIdRusher'], axis=1, inplace=True)
    

    PlayDirection problems

    There is a problem: some features such as X and Y depend on the play direction. Let's fix that.

    X, Orientation and Dir

    train['X'] = train.apply(lambda row: row['X'] if row['PlayDirection'] else 120-row['X'], axis=1)
    
    #from https://www.kaggle.com/scirpus/hybrid-gp-and-nn
    def new_orientation(angle, play_direction):
        if play_direction == 0:
            new_angle = 360.0 - angle
            if new_angle == 360.0:
                new_angle = 0.0
            return new_angle
        else:
            return angle
    train['Orientation'] = train.apply(lambda row: new_orientation(row['Orientation'], row['PlayDirection']), axis=1)
    train['Dir'] = train.apply(lambda row: new_orientation(row['Dir'], row['PlayDirection']), axis=1)
    

    YardsLeft

    Let's compute how many yards are left to the end zone.

    train['YardsLeft'] = train.apply(lambda row: 100-row['YardLine'] if row['HomeField'] else row['YardLine'], axis=1)
    train['YardsLeft'] = train.apply(lambda row: row['YardsLeft'] if row['PlayDirection'] else 100-row['YardsLeft'], axis=1)
    
    ((train['YardsLeft']<train['Yards'])|(train['YardsLeft']-100>train['Yards'])).mean()
    

    Clearly we should have Yards <= YardsLeft and YardsLeft - 100 <= Yards, so let's drop the rows that violate this.

    train.drop(train.index[(train['YardsLeft']<train['Yards']) | (train['YardsLeft']-100>train['Yards'])], inplace=True)
    

    Baseline model

    Let's drop the categorical features and fit a simple baseline model.

    train = train.sort_values(by=['PlayId', 'Team', 'IsRusher', 'JerseyNumber']).reset_index()
    
    train.drop(['GameId', 'PlayId', 'index', 'IsRusher', 'Team'], axis=1, inplace=True)
    
    cat_features = []
    for col in train.columns:
        if train[col].dtype =='object':
            cat_features.append(col)       
    train = train.drop(cat_features, axis=1)
    

    Now let's build one big row for each play, where the last player is the rusher.

    train.fillna(-999, inplace=True)
    
    players_col = []
    for col in train.columns:
        if train[col][:22].std()!=0:
            players_col.append(col)
    
    X_train = np.array(train[players_col]).reshape(-1, len(players_col)*22)
    
    play_col = train.drop(players_col+['Yards'], axis=1).columns
    X_play_col = np.zeros(shape=(X_train.shape[0], len(play_col)))
    for i, col in enumerate(play_col):
        X_play_col[:, i] = train[col][::22]
    
    X_train = np.concatenate([X_train, X_play_col], axis=1)
    y_train = np.zeros(shape=(X_train.shape[0], 199))
    for i,yard in enumerate(train['Yards'][::22]):
        y_train[i, yard+99:] = np.ones(shape=(1, 100-yard))
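
    Each target row is the cumulative (CDF-style) encoding of the gained yards over the range -99..99: for a play that gained yard yards, every entry from index yard + 99 onwards is set to 1. A tiny illustration (a sketch, with a made-up example array):

    # A 3-yard gain becomes a step function: 0 for indices 0..101, 1 for indices 102..198,
    # i.e. P(Yards <= k) for k = -99..99
    example = np.zeros(199)
    example[3 + 99:] = 1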
    
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    
    batch_size=64
    
    class RAdam(keras.optimizers.Optimizer):
        """RAdam optimizer.
        # Arguments
            learning_rate: float >= 0. Learning rate.
            beta_1: float, 0 < beta < 1. Generally close to 1.
            beta_2: float, 0 < beta < 1. Generally close to 1.
            epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
            decay: float >= 0. Learning rate decay over each update.
            weight_decay: float >= 0. Weight decay for each param.
            amsgrad: boolean. Whether to apply the AMSGrad variant of this
                algorithm from the paper "On the Convergence of Adam and
                Beyond".
            total_steps: int >= 0. Total number of training steps. Enable warmup by setting a positive value.
            warmup_proportion: 0 < warmup_proportion < 1. The proportion of increasing steps.
            min_lr: float >= 0. Minimum learning rate after warmup.
        # References
            - [Adam - A Method for Stochastic Optimization](https://arxiv.org/abs/1412.6980v8)
            - [On the Convergence of Adam and Beyond](https://openreview.net/forum?id=ryQu7f-RZ)
            - [On the Variance of the Adaptive Learning Rate and Beyond](https://arxiv.org/pdf/1908.03265v1.pdf)
        """
    
        def __init__(self, learning_rate=0.001, beta_1=0.9, beta_2=0.999,
                     epsilon=None, decay=0., weight_decay=0., amsgrad=False,
                     total_steps=0, warmup_proportion=0.1, min_lr=0., **kwargs):
            learning_rate = kwargs.pop('lr', learning_rate)
            super(RAdam, self).__init__(**kwargs)
            with K.name_scope(self.__class__.__name__):
                self.iterations = K.variable(0, dtype='int64', name='iterations')
                self.learning_rate = K.variable(learning_rate, name='learning_rate')
                self.beta_1 = K.variable(beta_1, name='beta_1')
                self.beta_2 = K.variable(beta_2, name='beta_2')
                self.decay = K.variable(decay, name='decay')
                self.weight_decay = K.variable(weight_decay, name='weight_decay')
                self.total_steps = K.variable(total_steps, name='total_steps')
                self.warmup_proportion = K.variable(warmup_proportion, name='warmup_proportion')
                self.min_lr = K.variable(min_lr, name='min_lr')
            if epsilon is None:
                epsilon = K.epsilon()
            self.epsilon = epsilon
            self.initial_decay = decay
            self.initial_weight_decay = weight_decay
            self.initial_total_steps = total_steps
            self.amsgrad = amsgrad
    
        def get_updates(self, loss, params):
            grads = self.get_gradients(loss, params)
            self.updates = [K.update_add(self.iterations, 1)]
            lr = self.lr
    
            if self.initial_decay > 0:
                lr = lr * (1. / (1. + self.decay * K.cast(self.iterations, K.dtype(self.decay))))
            t = K.cast(self.iterations, K.floatx()) + 1
    
            if self.initial_total_steps > 0:
                warmup_steps = self.total_steps * self.warmup_proportion
                decay_steps = K.maximum(self.total_steps - warmup_steps, 1)
                decay_rate = (self.min_lr - lr) / decay_steps
                lr = K.switch(
                    t <= warmup_steps,
                    lr * (t / warmup_steps),
                    lr + decay_rate * K.minimum(t - warmup_steps, decay_steps),
                )
            ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p), name='m_' + str(i)) for (i, p) in enumerate(params)]
            vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p), name='v_' + str(i)) for (i, p) in enumerate(params)]
            if self.amsgrad:
                vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p), name='vhat_' + str(i)) for (i, p) in enumerate(params)]
            else:
                vhats = [K.zeros(1, name='vhat_' + str(i)) for i in range(len(params))]
    
            self.weights = [self.iterations] + ms + vs + vhats
    
            beta_1_t = K.pow(self.beta_1, t)
            beta_2_t = K.pow(self.beta_2, t)
            sma_inf = 2.0 / (1.0 - self.beta_2) - 1.0
            sma_t = sma_inf - 2.0 * t * beta_2_t / (1.0 - beta_2_t)
    
            for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
                m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
                v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
    
                m_corr_t = m_t / (1.0 - beta_1_t)
                if self.amsgrad:
                    vhat_t = K.maximum(vhat, v_t)
                    v_corr_t = K.sqrt(vhat_t / (1.0 - beta_2_t))
                    self.updates.append(K.update(vhat, vhat_t))
                else:
                    v_corr_t = K.sqrt(v_t / (1.0 - beta_2_t))
    
                r_t = K.sqrt((sma_t - 4.0) / (sma_inf - 4.0) *
                             (sma_t - 2.0) / (sma_inf - 2.0) *
                             sma_inf / sma_t)
                p_t = K.switch(sma_t >= 5, r_t * m_corr_t / (v_corr_t + self.epsilon), m_corr_t)
                if self.initial_weight_decay > 0:
                    p_t += self.weight_decay * p
                p_t = p - lr * p_t
                self.updates.append(K.update(m, m_t))
                self.updates.append(K.update(v, v_t))
                new_p = p_t
    
                # Apply constraints.
                if getattr(p, 'constraint', None) is not None:
                    new_p = p.constraint(new_p)
                self.updates.append(K.update(p, new_p))
            return self.updates
        @property
        def lr(self):
            return self.learning_rate
        @lr.setter
        def lr(self, learning_rate):
            self.learning_rate = learning_rate
        def get_config(self):
            config = {
                'learning_rate': float(K.get_value(self.learning_rate)),
                'beta_1': float(K.get_value(self.beta_1)),
                'beta_2': float(K.get_value(self.beta_2)),
                'decay': float(K.get_value(self.decay)),
                'weight_decay': float(K.get_value(self.weight_decay)),
                'epsilon': self.epsilon,
                'amsgrad': self.amsgrad,
                'total_steps': float(K.get_value(self.total_steps)),
                'warmup_proportion': float(K.get_value(self.warmup_proportion)),
                'min_lr': float(K.get_value(self.min_lr)),
            }
            base_config = super(RAdam, self).get_config()
            return dict(list(base_config.items()) + list(config.items()))
    
    #from https://www.kaggle.com/davidcairuz/nfl-neural-network-w-softmax
    def crps(y_true, y_pred):
        return K.mean(K.square(y_true - K.cumsum(y_pred, axis=1)), axis=1)
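
    The network outputs a softmax over the 199 possible yard values, so K.cumsum turns it into a predicted CDF before it is compared to the step-function target built above; the loss is the mean squared difference between the two CDFs. A NumPy version of the same computation, as a sketch for intuition only:

    def crps_numpy(y_true, y_pred):
        # y_pred holds per-yard probabilities; its running sum along axis 1 is the predicted CDF
        return np.mean(np.square(y_true - np.cumsum(y_pred, axis=1)), axis=1)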
    
    def get_model():
        x = keras.layers.Input(shape=[X_train.shape[1]])
        fc1 = keras.layers.Dense(units=450, input_shape=[X_train.shape[1]])(x)
        act1 = keras.layers.PReLU()(fc1)
        bn1 = keras.layers.BatchNormalization()(act1)
        dp1 = keras.layers.Dropout(0.55)(bn1)
        gn1 = keras.layers.GaussianNoise(0.15)(dp1)
        concat1 = keras.layers.Concatenate()([x, gn1])
        fc2 = keras.layers.Dense(units=600)(concat1)
        act2 = keras.layers.PReLU()(fc2)
        bn2 = keras.layers.BatchNormalization()(act2)
        dp2 = keras.layers.Dropout(0.55)(bn2)
        gn2 = keras.layers.GaussianNoise(0.15)(dp2)
        concat2 = keras.layers.Concatenate()([concat1, gn2])
        fc3 = keras.layers.Dense(units=400)(concat2)
        act3 = keras.layers.PReLU()(fc3)
        bn3 = keras.layers.BatchNormalization()(act3)
        dp3 = keras.layers.Dropout(0.55)(bn3)
        gn3 = keras.layers.GaussianNoise(0.15)(dp3)
        concat3 = keras.layers.Concatenate()([concat2, gn3])
        output = keras.layers.Dense(units=199, activation='softmax')(concat3)
        model = keras.models.Model(inputs=[x], outputs=[output])
        return model
    def train_model(X_train, y_train, X_val, y_val):
        model = get_model()
        model.compile(optimizer=RAdam(warmup_proportion=0.1, min_lr=1e-7), loss=crps)
        er = EarlyStopping(patience=20, min_delta=1e-4, restore_best_weights=True, monitor='val_loss')
        model.fit(X_train, y_train, epochs=200, callbacks=[er], validation_data=[X_val, y_val], batch_size=batch_size)
        return model
    
    from sklearn.model_selection import RepeatedKFold
    rkf = RepeatedKFold(n_splits=5, n_repeats=5)
    models = []
    for tr_idx, vl_idx in rkf.split(X_train, y_train):
        x_tr, y_tr = X_train[tr_idx], y_train[tr_idx]
        x_vl, y_vl = X_train[vl_idx], y_train[vl_idx]   
        model = train_model(x_tr, y_tr, x_vl, y_vl)
        models.append(model)
    
    def make_pred(df, sample, env, models):
        df['StadiumType'] = df['StadiumType'].apply(clean_StadiumType)
        df['StadiumType'] = df['StadiumType'].apply(transform_StadiumType)
        df['DefendersInTheBox_vs_Distance'] = df['DefendersInTheBox'] / df['Distance']
        df['OffenseFormation'] = df['OffenseFormation'].apply(lambda x: x if x in off_form else np.nan)
        df = pd.concat([df.drop(['OffenseFormation'], axis=1), pd.get_dummies(df['OffenseFormation'], prefix='Formation')], axis=1)
        missing_cols = set( dummy_col ) - set( df.columns )-set('Yards')
        for c in missing_cols:
            df[c] = 0
        df = df[dummy_col]
        df.drop(['Yards'], axis=1, inplace=True)
        df['Turf'] = df['Turf'].map(Turf)
        df['Turf'] = df['Turf'] == 'Natural'
        df['PossessionTeam'] = df['PossessionTeam'].map(map_abbr)
        df['HomeTeamAbbr'] = df['HomeTeamAbbr'].map(map_abbr)
        df['VisitorTeamAbbr'] = df['VisitorTeamAbbr'].map(map_abbr)
        df['HomePossesion'] = df['PossessionTeam'] == df['HomeTeamAbbr']
        df['Field_eq_Possession'] = df['FieldPosition'] == df['PossessionTeam']
        df['HomeField'] = df['FieldPosition'] == df['HomeTeamAbbr']
        df['GameClock'] = df['GameClock'].apply(strtoseconds)
        df['PlayerHeight'] = df['PlayerHeight'].apply(lambda x: 12*int(x.split('-')[0])+int(x.split('-')[1]))
        df['PlayerBMI'] = 703*(df['PlayerWeight']/(df['PlayerHeight'])**2)
        df['TimeHandoff'] = df['TimeHandoff'].apply(lambda x: datetime.datetime.strptime(x, "%Y-%m-%dT%H:%M:%S.%fZ"))
        df['TimeSnap'] = df['TimeSnap'].apply(lambda x: datetime.datetime.strptime(x, "%Y-%m-%dT%H:%M:%S.%fZ"))
        df['TimeDelta'] = df.apply(lambda row: (row['TimeHandoff'] - row['TimeSnap']).total_seconds(), axis=1)
        df['PlayerBirthDate'] = df['PlayerBirthDate'].apply(lambda x: datetime.datetime.strptime(x, "%m/%d/%Y"))
        seconds_in_year = 60*60*24*365.25
        df['PlayerAge'] = df.apply(lambda row: (row['TimeHandoff']-row['PlayerBirthDate']).total_seconds()/seconds_in_year, axis=1)
        df['WindSpeed'] = df['WindSpeed'].apply(lambda x: x.lower().replace('mph', '').strip() if not pd.isna(x) else x)
        df['WindSpeed'] = df['WindSpeed'].apply(lambda x: (int(x.split('-')[0])+int(x.split('-')[1]))/2 if not pd.isna(x) and '-' in x else x)
        df['WindSpeed'] = df['WindSpeed'].apply(lambda x: (int(x.split()[0])+int(x.split()[-1]))/2 if not pd.isna(x) and type(x)!=float and 'gusts up to' in x else x)
        df['WindSpeed'] = df['WindSpeed'].apply(str_to_float)
        df['WindDirection'] = df['WindDirection'].apply(clean_WindDirection)
        df['WindDirection'] = df['WindDirection'].apply(transform_WindDirection)
        df['PlayDirection'] = df['PlayDirection'].apply(lambda x: x.strip() == 'right')
        df['Team'] = df['Team'].apply(lambda x: x.strip()=='home')
        indoor = "indoor"
        df['GameWeather'] = df['GameWeather'].apply(lambda x: indoor if not pd.isna(x) and indoor in x else x)
        df['GameWeather'] = df['GameWeather'].apply(lambda x: x.lower().replace('coudy', 'cloudy').replace('clouidy', 'cloudy').replace('party', 'partly').replace('clear and sunny', 'sunny and clear').replace('skies', '').replace("mostly", "").strip() if not pd.isna(x) else x)
        df['GameWeather'] = df['GameWeather'].apply(map_weather)
        df['IsRusher'] = df['NflId'] == df['NflIdRusher']
        df['X'] = df.apply(lambda row: row['X'] if row['PlayDirection'] else 120-row['X'], axis=1)
        df['Orientation'] = df.apply(lambda row: new_orientation(row['Orientation'], row['PlayDirection']), axis=1)
        df['Dir'] = df.apply(lambda row: new_orientation(row['Dir'], row['PlayDirection']), axis=1)
        df['YardsLeft'] = df.apply(lambda row: 100-row['YardLine'] if row['HomeField'] else row['YardLine'], axis=1)
        df['YardsLeft'] = df.apply(lambda row: row['YardsLeft'] if row['PlayDirection'] else 100-row['YardsLeft'], axis=1)
        df = df.sort_values(by=['PlayId', 'Team', 'IsRusher', 'JerseyNumber']).reset_index()
        df = df.drop(['TimeHandoff', 'TimeSnap', 'PlayerBirthDate', 'NflId', 'NflIdRusher', 'GameId', 'PlayId', 'index', 'IsRusher', 'Team'], axis=1)
        cat_features = []
        for col in df.columns:
            if df[col].dtype =='object':
                cat_features.append(col)
    
        df = df.drop(cat_features, axis=1)
        df.fillna(-999, inplace=True)
        X = np.array(df[players_col]).reshape(-1, len(players_col)*22)
        play_col = df.drop(players_col, axis=1).columns
        X_play_col = np.zeros(shape=(X.shape[0], len(play_col)))
        for i, col in enumerate(play_col):
            X_play_col[:, i] = df[col][::22]
        X = np.concatenate([X, X_play_col], axis=1)
        X = scaler.transform(X)
        y_pred = np.mean([np.cumsum(model.predict(X), axis=1) for model in models], axis=0)
        yardsleft = np.array(df['YardsLeft'][::22])
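        # The loop below clips the predicted CDF to the field geometry: a play cannot
        # lose more than 100 - YardsLeft yards (CDF forced to 0 there) nor gain more
        # than YardsLeft yards (CDF forced to 1 from that index on).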
        
        for i in range(len(yardsleft)):
            y_pred[i, :yardsleft[i]-1] = 0
            y_pred[i, yardsleft[i]+100:] = 1
        env.predict(pd.DataFrame(data=y_pred.clip(0,1),columns=sample.columns))
        return y_pred
    
    for test, sample in tqdm.tqdm(env.iter_test()):
         make_pred(test, sample, env, models)
    
    env.write_submission_file()
    
  • Original post: https://www.cnblogs.com/wwj99/p/12210048.html