zoukankan      html  css  js  c++  java
  • Python数据挖掘入门与实践---用决策树预测获胜球队

    数据集来源:1.  2013-14 NBA Schedule and Results

                            2.2013年 NBA 赛季排名情况

    参考书籍:《Python数据挖掘入门与实践》

    1.加载数据集:

    使用pandas加载数据集,有1319行数据, 8个特征, 查看前5项数据集,并查找是否有重复数据

    #coding=gbk
    # Predict the winning NBA team with a decision tree.
    import time
    # time.clock() was removed in Python 3.8. Prefer it where it still exists so
    # the reading stays comparable with any other time.clock() reading taken in
    # this script; otherwise fall back to time.perf_counter().
    start = (getattr(time, 'clock', None) or time.perf_counter)()

    # Load the dataset.
    import pandas as pd
    # NOTE(review): the published path had lost its backslashes
    # ('D:datasetsNBA_2014_games.csv'); restored here on the assumption that the
    # file lives in D:\datasets -- adjust to your local copy.
    file_name = r'D:\datasets\NBA_2014_games.csv'
    data = pd.read_csv(file_name)
    print(data.head())  # first five rows of the dataset
    #               Date Unnamed: 1       Visitor/Neutral  PTS         Home/Neutral  .....
    # 0  Tue Oct 29 2013  Box Score         Orlando Magic   87       Indiana Pacers   
    # 1  Tue Oct 29 2013  Box Score  Los Angeles Clippers  103   Los Angeles Lakers   
    # 2  Tue Oct 29 2013  Box Score         Chicago Bulls   95           Miami Heat   
    # 3  Wed Oct 30 2013  Box Score         Brooklyn Nets   94  Cleveland Cavaliers   
    # 4  Wed Oct 30 2013  Box Score         Atlanta Hawks  109     Dallas Mavericks  
    print(data.shape)   # (1319, 8)
    print(data[data.duplicated()])  # Empty DataFrame: there are no duplicated rows

    数据集清洗:1.第一列数据日期是字符串格式,改为日期格式; 2.修改表头。

    # Reload the file with the 'Date' column parsed as datetimes, then replace
    # the header with descriptive column names.
    column_names = [
        'Date', 'Score Type', 'Visitor Team', 'VisitorPts',
        'Home Team', 'HomePts', 'OT?', 'Notes',
    ]
    data = pd.read_csv(file_name, parse_dates=['Date'])
    data.columns = column_names
    print(data.head())  # header is now renamed
    #         Date Score Type          Visitor Team  VisitorPts  ...
    # 0 2013-10-29  Box Score         Orlando Magic          87   
    # 1 2013-10-29  Box Score  Los Angeles Clippers         103   
    # 2 2013-10-29  Box Score         Chicago Bulls          95   
    # 3 2013-10-30  Box Score         Brooklyn Nets          94   
    # 4 2013-10-30  Box Score         Atlanta Hawks         109 
    print('-----')
    # print(data.ix[1] )  # would print the second row

    提取新特征:通过现有的数据抽取特征。首先确定类别:篮球只有胜负之分,不像足球还有平局,因此以 1(True)代表主场球队取胜,0(False)代表主场球队失败。

    # Extract new features.

    # Class label: True when the home team scored more than the visitor.
    data['HomeWin'] = data['VisitorPts'] < data['HomePts']
    y_true = data['HomeWin'].values
    print(y_true[:5])   # [ True  True  True  True  True] -- a numpy array
    # print(data.head())

    # Two new features: whether each team won its previous game.
    # The dict maps team name -> result of that team's most recent game;
    # defaultdict(int) yields 0 for a team that has not played yet.
    from collections import defaultdict
    won_last = defaultdict(int)

    # Collect the feature values in lists and assign each column once.
    # DataFrame.ix was removed in pandas 1.0, and writing a whole row back on
    # every iteration is slow and can change the dtypes of unrelated columns.
    home_last_win = []
    visitor_last_win = []
    for home_team, visitor_team, home_win in zip(
            data['Home Team'], data['Visitor Team'], data['HomeWin']):
        # Record the outcome of each team's previous game *before* updating the
        # dict with the result of the current one.
        home_last_win.append(won_last[home_team])
        visitor_last_win.append(won_last[visitor_team])
        won_last[home_team] = home_win
        won_last[visitor_team] = not home_win
    data['HomeLastWin'] = home_last_win
    data['VisitorLastWin'] = visitor_last_win

    print('----')
    # print(data.loc[20:25])
    #              Home Team  HomePts  OT? Notes  HomeWin HomeLastWin VisitorLastWin  
    # 20      Boston Celtics       98  NaN   NaN    False       False          False  
    # 21       Brooklyn Nets      101  NaN   NaN     True       False          False  
    # 22   Charlotte Bobcats       90  NaN   NaN     True       False           True  
    # 23      Denver Nuggets       98  NaN   NaN    False       False          False  
    # 24     Houston Rockets      113  NaN   NaN     True        True           True  
    # 25  Los Angeles Lakers       85  NaN   NaN    False       False           True  
        

    一些练习测试代码:defaultdict 和 iterrows()的使用方法

    # Scratch code showing how defaultdict and DataFrame.iterrows() behave.
    won_last['jj'] = 12
    # Looking up a missing key in a defaultdict does not raise KeyError; the
    # key is inserted with the factory's default value and that value returned.
    default_value = won_last['Indiana Pacers']
    print(default_value)   # 0
    print(won_last) #  defaultdict(<class 'int'>, {'Indiana Pacers': 0, 'jj': 12})


    dataset = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    print(dataset)
    for row_label, row_values in dataset.iterrows():
        # First the row label (0, 1, 2), then every element of that row.
        print(row_label, row_values, sep='\n')
    
    

    2.使用决策树

    决策树原理参考:

    这里直接使用决策树, 没有刻意地去调参数,可能是作者为了对比不同特征的优劣吧。

    从数据集中构建有效的特征, (Feature Engineering 特征工程)是数据挖掘的难点所在, 好的特征直接关系到结果的正确率, -------甚至比选择合适的算法更重要。

    # Baseline model: a decision tree trained on the two "won last game" features.
    import numpy as np
    from sklearn.model_selection import cross_val_score  # cross-validated scoring
    from sklearn.tree import DecisionTreeClassifier

    # The two newly created features are the model input.
    X_previousWins = data[['HomeLastWin', 'VisitorLastWin']].values
    # Fixed seed for reproducibility (the original author noted that results
    # still differed between runs).
    clf = DecisionTreeClassifier(random_state=14)

    scores = cross_val_score(clf, X_previousWins, y_true, scoring='accuracy')
    mean_score = scores.mean() * 100
    accuracy_text = 'the accuracy is %0.2f' % mean_score
    print(accuracy_text + '%')   # recorded output: the accuracy is 57.47%

    使用另一数据集:13年NBA 排名情况

    # Read the 2013 team standings.
    # NOTE(review): the published path had lost its backslashes
    # ('D:datasetsNBA_2013_stangdings.csv'); restored here on the assumption that
    # the file lives in D:\datasets -- adjust to your local copy. The file name
    # spelling ('stangdings') is kept exactly as the author wrote it.
    file_name2 = r'D:\datasets\NBA_2013_stangdings.csv'
    standings = pd.read_csv(file_name2)
    # print(standings.head())
    #    Rk                   Team Overall  Home   Road      E      W     A     C  ....
    # 0   1             Miami Heat   66-16  37-4  29-12  41-11   25-5  14-4  12-6   
    # 1   2  Oklahoma City Thunder   60-22  34-7  26-15   21-9  39-13   7-3   8-2   
    # 2   3      San Antonio Spurs   58-24  35-6  23-18   25-5  33-19   8-2   9-1   
    # 3   4         Denver Nuggets   57-25  38-3  19-22  19-11  38-14   5-5  10-0   
    # 4   5   Los Angeles Clippers   56-26  32-9  24-17   21-9  35-17   7-3   8-2   
    # print(standings.shape)  # (30, 24): there are 30 teams

    创建一个新特征值, 主场球队是否比对手排名高。然后使用创建的3个特征去 fit 模型

    # New feature: does the home team rank higher than its opponent in the
    # 2013 standings?

    # Teams whose name in the game data differs from the name used in the
    # standings file.
    renamed_teams = {'New Orleans Pelicans': 'New Orleans Hornets'}
    # Build the team -> rank lookup once instead of scanning the standings
    # DataFrame twice for every game.
    rank_by_team = dict(zip(standings['Team'], standings['Rk']))

    home_ranks_higher = []
    for home_team, visitor_team in zip(data['Home Team'], data['Visitor Team']):
        home_rank = rank_by_team[renamed_teams.get(home_team, home_team)]
        visitor_rank = rank_by_team[renamed_teams.get(visitor_team, visitor_team)]
        # Rk 1 is the best team, so the *smaller* number is the higher rank.
        # (The original compared with '>', which flagged the lower-ranked home
        # team as "higher".)
        home_ranks_higher.append(int(home_rank < visitor_rank))
    # Assign the column in one go; DataFrame.ix, used by the original per-row
    # writeback, was removed in pandas 1.0.
    data['HomeTeamRanksHigher'] = home_ranks_higher

    X_homehigher = data[['HomeLastWin', 'VisitorLastWin', 'HomeTeamRanksHigher']].values
    # clf1 = DecisionTreeClassifier(random_state=14)
    # scores = cross_val_score(clf1, X_homehigher, y_true, scoring='accuracy')
    # mean_score1 = np.mean(scores) *100
    # print('the new accuracy is %.2f'%mean_score1 + '%') #the new accuracy is 59.67%

    再创建新特征:这两支球队上一次交手时,本场的主场球队是否获胜

    # New feature: did the current home team win the previous meeting between
    # these two teams?
    # Maps the alphabetically sorted (team, team) pair -> winner of their most
    # recent game; defaultdict(int) yields 0 for a pair that has not met yet,
    # which never equals a team name.
    last_match_winner = defaultdict(int)

    home_team_won_last = []
    for home_team, visitor_team, home_win in zip(
            data['Home Team'], data['Visitor Team'], data['HomeWin']):
        # Sort the names so the key is the same whichever side is at home.
        teams = tuple(sorted([home_team, visitor_team]))
        # Read the previous result before recording the current game's winner.
        home_team_won_last.append(int(last_match_winner[teams] == home_team))
        last_match_winner[teams] = home_team if home_win else visitor_team
    # Assign the column in one go; DataFrame.ix, used by the original per-row
    # writeback, was removed in pandas 1.0.
    data['HomeTeamWonLast'] = home_team_won_last

    X_lastwinner = data[['HomeTeamWonLast', 'HomeTeamRanksHigher']]
    # clf2 = DecisionTreeClassifier(random_state=14)
    # scores = cross_val_score(clf2, X_lastwinner, y_true, scoring='accuracy')
    # mean_score2 = np.mean(scores) *100
    # print('the accuracy is %.2f'%mean_score2 + '%') #  the accuracy is 57.85% 

    观察决策树在训练数据量很大的情况下能否得到有效的模型:使用球队名称作为特征,并对其进行编码。

    编码可以参考

    # Use LabelEncoder to turn the string team names into integers.
    from sklearn.preprocessing import LabelEncoder
    encoding = LabelEncoder()
    # Fit on the home-team names, then encode both columns with the same
    # mapping so a team gets the same integer whether it plays home or away.
    encoding.fit(data['Home Team'].values)
    home_teams = encoding.transform(data['Home Team'].values)
    visitor_teams = encoding.transform(data['Visitor Team'].values)

    # One row per game: [home team id, visitor team id].
    X_teams = np.vstack([home_teams, visitor_teams]).T 
    from sklearn.preprocessing import OneHotEncoder
    onehot = OneHotEncoder()
    # .toarray() returns a plain ndarray. The original .todense() returned an
    # np.matrix, which recent scikit-learn versions refuse as estimator input.
    X_teams_expanded = onehot.fit_transform(X_teams).toarray()
    clf3 = DecisionTreeClassifier(random_state=14)
    # scores = cross_val_score(clf3, X_teams_expanded, y_true, scoring='accuracy')
    # mean_score3 = np.mean(scores) *100
    # print('the accuracy is %.2f'%mean_score3+'%')   #  the accuracy is 59.52%

    3.使用随机森林

    随机森林是一种集成学习的算法

    print('----rf-----')
    # Predict with a random forest.
    from sklearn.ensemble import RandomForestClassifier
    # rf = RandomForestClassifier(random_state = 14, n_jobs =-1)  # tuning the tree parameters is advisable
    # rf_scores = cross_val_score(rf, X_teams, y_true, scoring='accuracy')
    # mean_rf_score = np.mean(rf_scores) *100
    # print('the randforestclassifier accuracy is %.2f'%mean_rf_score+'%')    #the randforestclassifier accuracy is 58.38%

    # Use several features together: the three engineered features placed
    # side by side with the two integer-encoded team columns.
    print('使用多个参数')
    X_all = np.concatenate((X_homehigher, X_teams), axis=1)
    # rf_clf2 = RandomForestClassifier(random_state = 14, n_jobs=-1)
    # rf_scores2 = cross_val_score(rf_clf2, X_all, y_true, scoring='accuracy')
    # mean_rf_score2 = np.mean(rf_scores2) *100
    # print('the accuracy is %.2f'%mean_rf_score2+'%')    # the accuracy is 57.62%

    使用网格搜索查找最佳的模型, 并查看使用的参数。

    # Tune the random forest's hyper-parameters with a grid search.
    from sklearn.model_selection import GridSearchCV
    param_grid = {
        # 'auto' was removed from RandomForestClassifier in recent scikit-learn
        # releases; for classifiers it meant the same thing as 'sqrt'.
        'max_features': [2, 3, 'sqrt'],
        'n_estimators': [100, 110, 120],
        'criterion': ['gini', 'entropy'],
        'min_samples_leaf': [2, 4, 6],
    }
    clf = RandomForestClassifier(random_state=14, n_jobs=-1)
    grid = GridSearchCV(clf, param_grid)
    grid.fit(X_all, y_true)
    score = grid.best_score_ *100
    print('the accuracy is %.2f'%score +'%')    #the accuracy is 62.02%
    print(grid.best_estimator_)  # the best model found by the grid search
    print(grid.best_params_)    # the parameter combination that scored best
    # Output recorded by the original author (numbers may differ with other
    # library versions):
    # the accuracy is 62.02%
    # RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
    #             max_depth=None, max_features=3, max_leaf_nodes=None,
    #             min_impurity_decrease=0.0, min_impurity_split=None,
    #             min_samples_leaf=2, min_samples_split=2,
    #             min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
    #             oob_score=False, random_state=14, verbose=0, warm_start=False)
    # {'n_estimators': 100, 'criterion': 'entropy', 'max_features': 3, 'min_samples_leaf': 2}
    # elapsed time : 117.93s
    
    
    # Report the total run time of the script.
    # time.clock() was removed in Python 3.8. Prefer it where it still exists so
    # the reading stays comparable with any other time.clock() reading taken in
    # this script; otherwise fall back to time.perf_counter().
    end = (getattr(time, 'clock', None) or time.perf_counter)()
    # Use a separate name: the original rebound 'time', shadowing the module.
    elapsed = end - start
    print('所花费的时间 : %.2f'%elapsed + 's')  

  • 相关阅读:
    web安全性测试用例
    国内可用的网络时间服务器
    selenium需要的浏览器驱动程序下载
    杂齐杂八
    检查是否网络端口占用问题
    python入到到实战--第十章----文件
    python入到到实战--第九章
    python入到到实战--第八章
    python入到到实战--第七章
    python入到到实战--第六章
  • 原文地址:https://www.cnblogs.com/junge-mike/p/12761198.html
Copyright © 2011-2022 走看看