zoukankan      html  css  js  c++  java
  • 机器学习算法讲堂(一) 十分钟入门机器学习算法竞赛

    机器学习算法讲堂(一) 十分钟入门机器学习算法竞赛

    比赛地址:https://www.kaggle.com/c/new-york-city-taxi-fare-prediction

    import pandas as pd
    import numpy as np 
    # http://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html
    
    # Load only the first million rows of the training set.
    file = pd.read_csv('./data/train.csv', nrows=1000000)
    
    # Show the first rows and the (rows, columns) shape of what was loaded.
    for preview in (file.head(), file.shape):
        print(preview)
    
    # Discard every row that contains at least one missing value.
    file = file.dropna(axis='rows', how='any')
    #Clean dataset
    def clean_df(df):
        """
        Return the rows of ``df`` that have a positive fare and 1-9 passengers.

        The result is an independent copy of the selected rows, so callers can
        add or overwrite columns on it without pandas emitting
        ``SettingWithCopyWarning`` and without touching ``df``.
        """
        # A coordinate bounding-box filter was tried and left disabled:
        # (df.pickup_longitude > -80) & (df.pickup_longitude < -70) &
        # (df.pickup_latitude > 35) & (df.pickup_latitude < 45) &
        # (df.dropoff_longitude > -80) & (df.dropoff_longitude < -70) &
        # (df.dropoff_latitude > 35) & (df.dropoff_latitude < 45) &
        valid_fare = df.fare_amount > 0
        valid_passengers = (df.passenger_count > 0) & (df.passenger_count < 10)
        return df[valid_fare & valid_passengers].copy()
    # Apply the sanity filter, then report how many rows survived it.
    file = clean_df(file)
    remaining_rows = len(file)
    print(remaining_rows)
    print(file.shape)
    
    def sphere_dist(pickup_lat, pickup_lon, dropoff_lat, dropoff_lon):
        """
        Return the great-circle (haversine) distance in kilometres between
        pickup and dropoff coordinates.

        All four arguments are in decimal degrees. They are passed straight to
        NumPy ufuncs, so scalars, NumPy arrays and pandas Series all work and
        are combined element-wise.
        """
        # Earth radius used for the conversion to kilometres
        R_earth = 6371
        # Convert degrees to radians
        pickup_lat, pickup_lon, dropoff_lat, dropoff_lon = map(
            np.radians, [pickup_lat, pickup_lon, dropoff_lat, dropoff_lon])
        # Differences along the latitude and longitude dimensions
        dlat = dropoff_lat - pickup_lat
        dlon = dropoff_lon - pickup_lon

        # Haversine term: squared half-chord length between the two points
        a = np.sin(dlat/2.0)**2 + np.cos(pickup_lat) * np.cos(dropoff_lat) * np.sin(dlon/2.0)**2

        return 2 * R_earth * np.arcsin(np.sqrt(a))
    
    def add_airport_dist(dataset):
        """
        Add landmark-distance, coordinate-offset and airport-flag columns.

        ``dataset`` must contain ``pickup_latitude``, ``pickup_longitude``,
        ``dropoff_latitude`` and ``dropoff_longitude``. It is modified in place
        and also returned. Columns are added in this order:

        * ``jfk_dist``, ``ewr_dist``, ``lga_dist``, ``washington_dist``:
          minimum of the pickup and dropoff distance (via ``sphere_dist``) to
          JFK: John F. Kennedy International Airport,
          EWR: Newark Liberty International Airport,
          LGA: LaGuardia Airport, and Washington Square.
        * ``longitude_distance``, ``latitude_distance``: absolute coordinate
          differences between pickup and dropoff, in degrees.
        * ``distance_travelled`` and four sin/cos transforms.
        * ``jfk``, ``lga``, ``ewr``: 1 when the pickup or the dropoff lies
          inside the airport's bounding box, otherwise 0.
        """
        # (latitude, longitude) in decimal degrees.
        # Washington Square was previously (40.4351, -73.5951), i.e. the
        # degrees/minutes/seconds value 40°43'51"N 73°59'51"W typed as if it
        # were decimal, which lands in the Atlantic. Converted here.
        landmarks = (
            ('jfk_dist', (40.639722, -73.778889)),
            ('ewr_dist', (40.6925, -74.168611)),
            ('lga_dist', (40.77725, -73.872611)),
            ('washington_dist', (40.730833, -73.9975)),
        )

        pickup_lat = dataset['pickup_latitude']
        dropoff_lat = dataset['dropoff_latitude']
        pickup_lon = dataset['pickup_longitude']
        dropoff_lon = dataset['dropoff_longitude']

        for column, (lat, lon) in landmarks:
            from_pickup = sphere_dist(pickup_lat, pickup_lon, lat, lon)
            to_dropoff = sphere_dist(lat, lon, dropoff_lat, dropoff_lon)
            # Whichever end of the trip is closer to the landmark.
            dataset[column] = pd.concat([from_pickup, to_dropoff], axis=1).min(axis=1)

        dataset['longitude_distance'] = abs(pickup_lon - dropoff_lon)
        dataset['latitude_distance'] = abs(pickup_lat - dropoff_lat)
        lon_sq = dataset['longitude_distance'] ** 2
        lat_sq = dataset['latitude_distance'] ** 2

        # Straight distance (in degrees, not kilometres)
        dataset['distance_travelled'] = (lon_sq + lat_sq) ** .5
        # NOTE(review): the sin/cos features use the PRODUCT of the squared
        # offsets, not their sum as above -- kept as-is; confirm it is intended.
        product_term = (lon_sq * lat_sq) ** .5
        dataset['distance_travelled_sin'] = np.sin(product_term)
        dataset['distance_travelled_cos'] = np.cos(product_term)
        dataset['distance_travelled_sin_sqrd'] = np.sin(product_term) ** 2
        dataset['distance_travelled_cos_sqrd'] = np.cos(product_term) ** 2

        # dataset["fare_to_dist_ratio"] = dataset["fare_amount"] / ( dataset["distance_travelled"]+0.0001)
        # dataset["fare_npassenger_to_dist_ratio"] = (dataset["fare_amount"] / dataset["passenger_count"]) /( dataset["distance_travelled"]+0.0001)

        # flag column -> ((lon_min, lon_max), (lat_min, lat_max)) bounding box
        airport_boxes = (
            ('jfk', (-73.7841, -73.7721), (40.6213, 40.6613)),
            ('lga', (-73.8870, -73.8580), (40.7680, 40.7800)),
            ('ewr', (-74.192, -74.172), (40.676, 40.708)),
        )
        for flag, (lon_min, lon_max), (lat_min, lat_max) in airport_boxes:
            dataset[flag] = 0
            for end in ('pickup', 'dropoff'):
                lon = dataset[end + '_longitude']
                lat = dataset[end + '_latitude']
                inside = ((lon >= lon_min) & (lon <= lon_max) &
                          (lat <= lat_max) & (lat >= lat_min))
                dataset.loc[inside, flag] = 1

        return dataset
    
    def add_datetime_info(dataset):
        """
        Parse ``pickup_datetime`` and add calendar columns derived from it.

        The ``pickup_datetime`` column is replaced by its parsed datetime form
        and ``hour``, ``day``, ``month``, ``weekday`` and ``year`` columns are
        added, in that order. ``dataset`` is modified in place and returned.
        """
        # Timestamps are expected to look like "2015-01-27 13:08:24 UTC".
        stamps = pd.to_datetime(dataset['pickup_datetime'], format="%Y-%m-%d %H:%M:%S UTC")
        dataset['pickup_datetime'] = stamps

        # One column per calendar component; the order fixes the column order.
        for part in ('hour', 'day', 'month', 'weekday', 'year'):
            dataset[part] = getattr(stamps.dt, part)

        return dataset
    
    # Feature engineering for the training frame: calendar columns first,
    # then the landmark/airport columns.
    file = add_airport_dist(add_datetime_info(file))
    # The parsed timestamp itself is dropped once its parts are extracted.
    # Other columns that were considered for removal:
    # 'distance_travelled_sin_sqrd','passenger_count','distance_travelled_cos_sqrd'
    file = file.drop(columns=['pickup_datetime'])
    train_coords = [file[name] for name in ('pickup_latitude', 'pickup_longitude',
                                            'dropoff_latitude', 'dropoff_longitude')]
    # Great-circle trip length.
    file['distance'] = sphere_dist(*train_coords)
    
    # Bare expression: displays the frame in a notebook, no effect in a script.
    file.head()
    
    # Load the test set and run it through the same feature steps as the
    # training frame so both end up with matching columns.
    test_file = add_airport_dist(add_datetime_info(pd.read_csv('./data/test.csv')))
    # Other columns that were considered for removal:
    # 'distance_travelled_sin_sqrd','passenger_count','distance_travelled_cos_sqrd'
    test_file = test_file.drop(columns=['pickup_datetime'])
    
    test_coords = [test_file[name] for name in ('pickup_latitude', 'pickup_longitude',
                                                'dropoff_latitude', 'dropoff_longitude')]
    # Great-circle trip length.
    test_file['distance'] = sphere_dist(*test_coords)
    
    
    # Bare expression: displays the frame in a notebook, no effect in a script.
    test_file.head()
    
    
    import datetime as dt
    from sklearn.model_selection import train_test_split
    import xgboost as xgb
    import os
    
    
    # Target first, then everything except the target as the feature matrix.
    y = file['fare_amount']
    train_x = file.drop(columns=['fare_amount'])
    # Alias of test_file, not a copy: both names refer to the same frame.
    new_test = test_file
    
    from sklearn.preprocessing import LabelEncoder
    # Integer-encode every datetime/object column. Each encoder is fitted on the
    # train and test values together so both frames share one code per value.
    for col in train_x.columns:
        col_dtype = train_x[col].dtype
        if not (col_dtype == 'datetime64[ns]' or col_dtype == 'object'):
            continue
        train_values = list(train_x[col].values)
        test_values = list(test_file[col].values)
        encoder = LabelEncoder()
        encoder.fit(train_values + test_values)
        train_x[col] = encoder.transform(train_values)
        test_file[col] = encoder.transform(test_values)
    print(test_file.head())
    # Hold out 1% of the training rows; the fixed seed keeps the split repeatable.
    x_train, x_test, y_train, y_test = train_test_split(
        train_x, y, test_size=0.01, random_state=0)
    
    # Disabled experiment kept as a bare string literal, so it is never executed:
    # it rebuilt each pickup_datetime from its digit characters only, falling
    # back to 0 on any error.
    '''
    
    for x in range(0,len(x_train['pickup_datetime'])):
        try:
            time = ''
            for time_ac in str(x_train['pickup_datetime'].loc[x]):
                if time_ac <= '9' and time_ac >= '0':
                    time = time + time_ac
            x_train['pickup_datetime'].loc[x] = time
        except:
            x_train['pickup_datetime'].loc[x] = 0
    x_train['pickup_datetime'].astype('int64')
    '''
    # Inspect the column dtypes and the first rows of the training split.
    print(x_train.dtypes)
    # head must be called; printing the bare attribute shows the bound method.
    print(x_train.head())
    
    # Disabled XGBoost baseline kept as a bare string literal, so it is never
    # executed: it trained a gbtree regressor on x_train with early stopping
    # against the x_test hold-out.
    '''
    
    dtrain = xgb.DMatrix(x_train, y_train)
    dtest = xgb.DMatrix(x_test, y_test)
    eta = 0.1
    max_depth = 8
    subsample = 0.8
    colsample_bytree = 0.8
    
    print('XGBoost params. ETA: {}, MAX_DEPTH: {}, SUBSAMPLE: {}, COLSAMPLE_BY_TREE: {}'.format(eta, max_depth, subsample, colsample_bytree))
    params = {
        "objective": "reg:linear",
        "booster" : "gbtree",
        "eval_metric": "rmse",
        "eta": eta,
        "max_depth": max_depth,
        "subsample": subsample,
        "colsample_bytree": colsample_bytree,
        "silent": 1,
        "seed": 19960429
    }
    
    watchlist  = [(dtrain,'train'),(dtest,'val')]
    num_round = 3000
    early_stopping_rounds=50
    bst = xgb.train(params, dtrain, num_round, watchlist,early_stopping_rounds=early_stopping_rounds)
    '''
    import lightgbm as lgb
    from sklearn.model_selection import train_test_split
    from sklearn import metrics
    from sklearn.metrics import mean_squared_error
    from sklearn.model_selection import KFold
    
    import matplotlib.pylab as plt
    
    # Keep Relevant Variables..
    trainshape = train_x.shape
    testshape = test_file.shape
    
    # print("
    Train DF..")
    # train = reduce_mem_usage(train)
    # print("
    Test DF..")
    # test_df = reduce_mem_usage(test_df)
    
    # Wrap the full training matrix for LightGBM. free_raw_data=False asks
    # LightGBM to keep the raw frame after the Dataset is constructed.
    dtrain = lgb.Dataset(train_x, label=y, free_raw_data=False)
    
    print("Light Gradient Boosting Regressor: ")
    # Gradient-boosted regression trees scored by RMSE.
    lgbm_params = dict(
        task='train',
        boosting_type='gbdt',
        objective='regression',
        metric='rmse',
        max_depth=7,
        learning_rate=.1,
        subsample=0.8,
        colsample_bytree=0.8,
    )
    
    # 5-fold shuffled cross-validation with a fixed seed.
    folds = KFold(n_splits=5, shuffle=True, random_state=1)
    # Out-of-fold predictions: one slot per training row.
    oof_preds = np.zeros(trainshape[0])
    # Running average of the fold models' test predictions: one slot per test row.
    fold_preds = np.zeros(testshape[0])
    dtrain.construct()
    
    # Fit 5 Folds
    # `time` is imported here because the script's own `import time` only
    # appears after this loop, so time.time() would otherwise be a NameError.
    import time
    
    modelstart = time.time()
    for trn_idx, val_idx in folds.split(file):
        # NOTE(review): LightGBM >= 4.0 dropped the early_stopping_rounds and
        # verbose_eval keywords in favour of callbacks -- confirm the installed
        # version before upgrading.
        clf = lgb.train(
            params=lgbm_params,
            train_set=dtrain.subset(trn_idx),
            valid_sets=dtrain.subset(val_idx),
            num_boost_round=17000, 
            early_stopping_rounds=250,
            verbose_eval=500
        )
        # Out-of-fold predictions for the rows this model never saw.
        oof_preds[val_idx] = clf.predict(dtrain.data.iloc[val_idx])
        # Each fold contributes an equal share to the averaged test prediction.
        fold_preds += clf.predict(test_file) / folds.n_splits
        # RMSE of this fold on its validation rows.
        print(mean_squared_error(y.iloc[val_idx], oof_preds[val_idx]) ** .5)
        # lgb.plot_importance(clf, max_num_features=30)
        
    print("Model Runtime: %0.2f Minutes"%((time.time() - modelstart)/60))
    
    
    import time
    # The submission uses the fold-averaged LightGBM predictions. The unused
    # xgb.DMatrix built here for the disabled `bst.predict(Ttest)` path was
    # dropped; rebuild it only if that XGBoost model is re-enabled.
    ypred = fold_preds
    # Re-read the raw test file for the original `key` values: object columns of
    # test_file were label-encoded earlier (presumably including `key`).
    new_test = pd.read_csv('./data/test.csv')
    output = pd.DataFrame({ 'key' : new_test['key'], 'fare_amount': ypred })
    print(output.head())
    # Timestamped file name so successive runs do not overwrite each other.
    # Stored under its own name so the `datetime as dt` alias is not rebound.
    timestamp = time.strftime('%Y%m%d%H%M%S',time.localtime())
    output.to_csv('.//data//ans'+timestamp+'.csv', index = False)
  • 相关阅读:
    《剑指offer》-判断平衡二叉树
    《剑指offer》-前n项和不准用通解和各种判断
    《剑指offer》-统计整数二进制表示中1的个数
    《剑指offer》-双栈实现队列
    《剑指offer》-数组乘积,不使用除法
    《剑指offer》-青蛙跳台阶II
    gradle入门(1-8)gradle 的依赖查看、依赖排除和指定版本(需要验证!)
    groovy入门(2-1)Groovy的Maven插件安装:Plugin execution not covered by lifecycle configuration
    zuul入门(4)zuul的注解@EnableZuulServer和@EnableZuulProxy
    zuul入门(2)zuul的过滤器分类和加载
  • 原文地址:https://www.cnblogs.com/qscqesze/p/9570578.html
Copyright © 2011-2022 走看看