zoukankan      html  css  js  c++  java
  • XGBoost实战

    • XGBoost自动读取数据,判断蘑菇是否有毒 二分类
      # /usr/bin/python 
      # -*- encoding:utf-8 -*-
      
      # 判断蘑菇是否有毒二分类
      
      import xgboost as xgb
       import numpy as np
      
      # 1、xgBoost的基本使用
      # 2、自定义损失函数的梯度和二阶导
      # 3、binary:logistic/logitraw
      
      
      # 定义f: theta * x 
      def log_reg(y_hat, y):
          """Return the gradient and Hessian of the logistic loss.

          Shaped for use as a custom objective (``obj=``) with ``xgb.train``.

          :param y_hat: raw (pre-sigmoid) scores as a NumPy array.
          :param y: object exposing ``get_label()`` that returns the true labels.
          :return: ``(gradient, hessian)``, both element-wise over ``y_hat``.
          """
          # Squash the raw scores into probabilities with the sigmoid.
          prob = 1.0 / (1.0 + np.exp(-y_hat))
          # First derivative of the log loss w.r.t. the raw score.
          grad = prob - y.get_label()
          # Second derivative of the log loss w.r.t. the raw score.
          hess = prob * (1.0 - prob)
          return grad, hess
      
      #错误率
      def error_rate(y_hat, y):
          """Return the fraction of misclassified samples as ``('error', rate)``.

          A prediction counts as positive when its value exceeds 0.5.

          :param y_hat: predictions as a NumPy array.
          :param y: object exposing ``get_label()`` that returns the true labels.
          :return: tuple of the metric name ``'error'`` and the error rate.
          """
          labels = y.get_label()
          wrong = labels != (y_hat > 0.5)
          n_wrong = float(sum(wrong))
          return 'error', n_wrong / len(y_hat)
      
      
      if __name__ == "__main__":
          # Load the training and test sets straight from their text files.
          data_train = xgb.DMatrix('12.agaricus_train.txt')
          data_test = xgb.DMatrix('12.agaricus_test.txt')

          # Booster parameters:
          #   max_depth - maximum depth of each tree (2)
          #   eta       - shrinkage applied to each new tree (1)
          #   silent    - 1 suppresses XGBoost's training messages
          #               NOTE(review): newer XGBoost releases use 'verbosity'
          #               instead - confirm against the installed version
          #   objective - 'binary:logitraw': binary classification whose
          #               predictions are raw (pre-sigmoid) scores
          param = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logitraw'}
          # Data sets whose metrics are reported after every boosting round.
          watchlist = [(data_test, 'eval'), (data_train, 'train')]
          # Three boosting rounds, i.e. three trees.
          n_round = 3
          bst = xgb.train(param, data_train, num_boost_round=n_round, evals=watchlist)

          # Alternative: train with the custom objective and metric defined above.
          # bst = xgb.train(param, data_train, num_boost_round=n_round, evals=watchlist, obj=log_reg, feval=error_rate)

          # Evaluate on the test set. Predictions are raw scores, so the decision
          # boundary is 0 rather than 0.5.
          y_hat = bst.predict(data_test)
          y = data_test.get_label()
          error = sum(y != (y_hat > 0))
          # Named ``test_error_rate`` so the ``error_rate`` metric function defined
          # above is not overwritten by a float.
          test_error_rate = float(error) / len(y_hat)
          print('样本总数:	', len(y_hat))
          print('错误数目:	%4d' % error)
          print('错误率:	%.5f%%' % (100 * test_error_rate))
    • 判断蘑菇是否有毒   手动读取数据
      # /usr/bin/python
      # -*- coding:utf-8 -*-
      
      import xgboost as xgb
      import numpy as np
      import scipy.sparse
      from sklearn.model_selection import train_test_split
      from sklearn.linear_model import LogisticRegression
      
      #手动读取数据
      def read_data(path):
          """Parse a ``label index:value ...`` text file into dense arrays.

          Each non-blank line holds an integer label followed by whitespace-
          separated ``column:value`` pairs. Blank lines are skipped.

          :param path: path of the text file to read.
          :return: ``(x, y)`` where ``x`` is a dense 2-D float array with one row
              per sample and ``y`` is a 1-D integer array of labels.
          """
          labels = []   # one label per sample
          rows = []     # row index of every stored value
          cols = []     # column index of every stored value
          values = []   # the stored values; rows/cols/values line up one-to-one

          # ``with`` guarantees the file handle is closed, even on a parse error.
          with open(path) as handle:
              for line in handle:
                  fields = line.split()
                  if not fields:
                      # Tolerate blank lines (e.g. a trailing newline at EOF).
                      continue
                  row_index = len(labels)
                  labels.append(int(fields[0]))
                  for item in fields[1:]:
                      key, value = item.split(':')
                      rows.append(row_index)
                      cols.append(int(key))
                      values.append(float(value))

          # Give the shape explicitly so samples without any feature still get a
          # row and ``x`` always has as many rows as ``y`` has labels.
          n_cols = max(cols) + 1 if cols else 0
          x = scipy.sparse.csr_matrix(
              (values, (rows, cols)), shape=(len(labels), n_cols)).toarray()
          y = np.array(labels)
          return x, y
      
      
      def show_accuracy(a, b, tip):
          """Print the element-wise match mask of two arrays and their accuracy.

          :param a: predicted labels (NumPy array, any shape).
          :param b: reference labels (NumPy array with the same number of items).
          :param tip: text printed in front of the accuracy figure.
          """
          hits = a.ravel() == b.ravel()
          print(hits)
          ratio = float(hits.sum()) / a.size
          print(tip + '正确率:	', ratio)
      
      
      if __name__ == '__main__':
          # x holds one row of features per sample; y holds the labels.
          x, y = read_data('12.agaricus_train.txt')
          # Split into training and test sets (60% of the rows go to training).
          x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1, train_size=0.6)

          # Baseline: L2-regularised logistic regression.
          lr = LogisticRegression(penalty='l2')
          lr.fit(x_train, y_train.ravel())
          y_hat = lr.predict(x_test)
          show_accuracy(y_hat, y_test, 'Logistic回归 ')

          # XGBoost
          # Relabel class 3 as 0 in place, because XGBoost class ids start at 0.
          # NOTE(review): this relabeling and num_class=3 look carried over from the
          # wine example; the page heading describes this data set as a binary
          # problem - confirm whether any label equals 3 here.
          y_train[y_train == 3] = 0
          y_test[y_test == 3] = 0
          # Wrap the training and test arrays as DMatrix objects.
          data_train = xgb.DMatrix(x_train, label=y_train)
          data_test = xgb.DMatrix(x_test, label=y_test)
          # Data sets whose metrics are reported after every boosting round.
          watch_list = [(data_test, 'eval'), (data_train, 'train')]
          # Booster parameters: depth-3 trees, eta 1, multi-class softmax over 3 classes.
          param = {'max_depth': 3, 'eta': 1, 'silent': 0, 'objective': 'multi:softmax', 'num_class': 3}
          # Train for four boosting rounds.
          bst = xgb.train(param, data_train, num_boost_round=4, evals=watch_list)
          # Predict on the test set.
          y_hat = bst.predict(data_test)
          # Report the accuracy.
          show_accuracy(y_hat, y_test, 'XGBoost ')
    • 鸢尾花数据判断 多分类

      # /usr/bin/python
      # -*- encoding:utf-8 -*-
      
      #鸢尾花数据判断  多分类
      
      import xgboost as xgb
      import numpy as np
      from sklearn.model_selection import train_test_split   # cross_validation
      
      
      def iris_type(s):
          """Map an iris species name to its integer class id.

          Accepts the name as ``bytes`` or ``str`` so it works as a
          ``np.loadtxt`` converter whichever of the two the installed NumPy
          passes in.

          :param s: species name, e.g. ``b'Iris-setosa'`` or ``'Iris-setosa'``.
          :return: 0, 1 or 2.
          :raises KeyError: if the name is not one of the three known species.
          """
          it = {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}
          if isinstance(s, bytes):
              s = s.decode('utf-8')
          return it[s]
      
      
      if __name__ == "__main__":
          # Data file, looked up relative to the working directory. A plain
          # relative name works on every platform; the former '.\8...' form
          # relied on a Windows path separator and an invalid escape sequence.
          path = '8.iris.data'
          # Load the CSV; column 4 (species name) is converted to a class id.
          data = np.loadtxt(path, dtype=float, delimiter=',', converters={4: iris_type})

          # x is the first four columns, y is everything after them.
          x, y = np.split(data, (4,), axis=1)
          # Hold out 50 samples for testing; the rest are used for training.
          x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1, test_size=50)

          # Wrap features and labels as DMatrix objects.
          data_train = xgb.DMatrix(x_train, label=y_train)
          data_test = xgb.DMatrix(x_test, label=y_test)
          # Data sets whose metrics are reported after every boosting round.
          watch_list = [(data_test, 'eval'), (data_train, 'train')]
          # Depth-3 trees; 'multi:softmax' makes predict() return class ids.
          param = {'max_depth': 3, 'eta': 0.3, 'silent': 1, 'objective': 'multi:softmax', 'num_class': 3}

          # Train for six boosting rounds.
          bst = xgb.train(param, data_train, num_boost_round=6, evals=watch_list)
          y_hat = bst.predict(data_test)
          # y_test is a column vector; flatten it so it lines up with y_hat.
          result = y_test.ravel() == y_hat
          print('正确率:	', float(np.sum(result)) / len(y_hat))
          print('END.....\n')
    • 葡萄酒的分类问题

      # /usr/bin/python
      # -*- encoding:utf-8 -*-
      
      #葡萄酒的分类问题
      
      import xgboost as xgb
      import numpy as np
      from sklearn.model_selection import train_test_split   # cross_validation
      from sklearn.linear_model import LogisticRegression
      from sklearn.preprocessing import StandardScaler
      
      
      def show_accuracy(a, b, tip):
          """Print the fraction of positions where two label arrays agree.

          :param a: predicted labels (NumPy array, any shape).
          :param b: reference labels (NumPy array with the same number of items).
          :param tip: text printed in front of the accuracy figure.
          """
          hits = a.ravel() == b.ravel()
          ratio = float(hits.sum()) / a.size
          print(tip + '正确率:	', ratio)
      
      
      if __name__ == "__main__":
          # Load the comma-separated data file as floats.
          data = np.loadtxt('12.wine.data', dtype=float, delimiter=',')
          # The first column is the label; the remaining columns are the features.
          y, x = np.split(data, (1,), axis=1)
          # Split into training and test halves.
          x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1, test_size=0.5)

          # Baseline: L2-regularised logistic regression.
          lr = LogisticRegression(penalty='l2')
          lr.fit(x_train, y_train.ravel())
          y_hat = lr.predict(x_test)
          show_accuracy(y_hat, y_test, 'Logistic回归 ')

          # XGBoost
          # Relabel class 3 as 0 in place, because XGBoost class ids start at 0.
          # This runs after the logistic-regression evaluation above, which
          # therefore used the unmodified labels.
          y_train[y_train == 3] = 0
          y_test[y_test == 3] = 0
          # Wrap the training and test arrays as DMatrix objects.
          data_train = xgb.DMatrix(x_train, label=y_train)
          data_test = xgb.DMatrix(x_test, label=y_test)
          # Data sets whose metrics are reported after every boosting round.
          watch_list = [(data_test, 'eval'), (data_train, 'train')]
          # Booster parameters: depth-3 trees, eta 1, multi-class softmax over 3 classes.
          param = {'max_depth': 3, 'eta': 1, 'silent': 0, 'objective': 'multi:softmax', 'num_class': 3}
          # Train for four boosting rounds.
          bst = xgb.train(param, data_train, num_boost_round=4, evals= watch_list)
          # Predict on the test set.
          y_hat = bst.predict(data_test)
          # Report the accuracy.
          show_accuracy(y_hat, y_test, ' XGBoost ' )
      • 泰坦尼克号问题
        # /usr/bin/python 
        # -*- encoding:utf-8 -*-
        
        # 泰坦尼克号
        
        import xgboost as xgb
         import numpy as np
         from sklearn.linear_model import LogisticRegression
         from sklearn.model_selection import train_test_split
         from sklearn.ensemble import RandomForestRegressor
         from sklearn.ensemble import RandomForestClassifier
         import pandas as pd
         import csv
        
        
        def show_accuracy(a, b, tip):
            """Return the percentage of positions where two label arrays agree.

            :param a: predicted labels (NumPy array, any shape).
            :param b: reference labels (NumPy array with the same number of items).
            :param tip: caption for the figure; not used in the computation.
            :return: accuracy as a percentage in the range 0-100.
            """
            hits = a.ravel() == b.ravel()
            n_hits = float(hits.sum())
            return 100 * n_hits / a.size
        
        
        def _impute_missing_age(data, predictors, show_known=False):
            """Fill missing ``Age`` values in place using a random-forest regressor.

            The regressor is fitted on the rows whose age is known and then
            predicts the age of the remaining rows from the ``predictors`` columns.

            :param data: passenger DataFrame; its ``Age`` column is modified in place.
            :param predictors: names of the columns used to predict the age.
            :param show_known: when true, print the rows whose age is known.
            """
            missing = data['Age'].isnull()
            known = data.loc[~missing, ['Age'] + predictors]
            if show_known:
                print(" age_exist:\n ", known)
            if not missing.any():
                # Nothing to fill in; skip fitting and predicting on an empty selection.
                return
            rfr = RandomForestRegressor(n_estimators=1000)
            rfr.fit(known[predictors].values, known['Age'].values)
            data.loc[missing, 'Age'] = rfr.predict(data.loc[missing, predictors].values)


        def load_data(file_name, is_train):
            """Load a Titanic CSV file and turn it into numeric feature arrays.

            Cleaning steps: encode ``Sex`` as 0/1, fill missing fares with the
            median fare of the passenger's class, fill missing ages with a
            random-forest prediction, fill missing embarkation ports with ``'S'``
            and one-hot encode the port. The resulting rows are then stacked five
            times.

            :param file_name: path of a CSV file with a header row.
            :param is_train: True for the training file (has a ``Survived``
                column), False for the test file.
            :return: ``(x, y)`` for training data, where ``x`` is a float array of
                shape ``(5 * n, 9)`` and ``y`` has shape ``(5 * n, 1)``; for test
                data ``(x, passenger_id)``, where ``passenger_id`` is the
                un-stacked ``PassengerId`` column of length ``n``.
            """
            data = pd.read_csv(file_name)

            # Sex: female -> 0, male -> 1.
            data['Sex'] = data['Sex'].map({'female': 0, 'male': 1}).astype(int)

            # Fare: replace missing values with the median fare of the same class.
            data['Fare'] = data['Fare'].fillna(
                data.groupby('Pclass')['Fare'].transform('median'))

            # Age: predict missing values with a random forest. The test file has
            # no 'Survived' column, so it cannot be used as a predictor there.
            if is_train:
                print(' 随机森林预测缺失年龄:--start-- ')
                _impute_missing_age(
                    data, ['Survived', 'Fare', 'Parch', 'SibSp', 'Pclass'],
                    show_known=True)
                print(' 随机森林预测缺失年龄:--over-- ')
            else:
                print(' 随机森林预测缺失年龄2:--start-- ')
                _impute_missing_age(data, ['Fare', 'Parch', 'SibSp', 'Pclass'])
                print(' 随机森林预测缺失年龄2:- -over-- ')

            # Embarked: missing ports become 'S' (per the original author's note,
            # the port most passengers left from), then one-hot encode the column.
            data['Embarked'] = data['Embarked'].fillna('S')
            embarked_data = pd.get_dummies(data['Embarked'])
            embarked_data = embarked_data.rename(columns=lambda c: 'Embarked_' + str(c))
            embarked_cols = ['Embarked_C', 'Embarked_Q', 'Embarked_S']
            for col in embarked_cols:
                if col not in embarked_data:
                    # A port that never occurs in this file still needs its column.
                    embarked_data[col] = 0
            data = pd.concat([data, embarked_data], axis=1)

            # Select the cleaned features. Forcing float avoids an object-dtype
            # array when the dummy columns are boolean.
            feature_cols = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare'] + embarked_cols
            x = np.asarray(data[feature_cols], dtype=float)

            # Stack five copies of the rows to enlarge the data set.
            # NOTE(review): for training data the copies end up on both sides of
            # a later train/test split, which inflates test accuracy - confirm
            # this is intended.
            x = np.tile(x, (5, 1))
            if not is_train:
                return x, data['PassengerId']

            y = np.array(data['Survived']).reshape(-1, 1)
            y = np.tile(y, (5, 1))
            return x, y


        def write_result(c, c_type):
            """Predict survival for the test file and write ``Prediction_<c_type>.csv``.

            :param c: fitted model exposing ``predict``.
            :param c_type: integer model id used in the output file name; 3 means
                ``c`` is an XGBoost booster, whose input must be a ``DMatrix``.
            """
            file_name = '12.Titanic.test.csv'
            x, passenger_id = load_data(file_name, False)
            # load_data stacks five copies of the rows; only the first copy lines
            # up with passenger_id.
            x = x[:len(passenger_id)]
            if c_type == 3:
                x = xgb.DMatrix(x)
            y = c.predict(x)
            # Threshold at 0.5 and store integer 0/1 labels.
            y = (y > 0.5).astype(int)
            # Text mode with newline='' is what csv.writer expects; ``with``
            # closes the file even if writing fails.
            with open("Prediction_%d.csv" % c_type, "w", newline="") as predictions_file:
                open_file_object = csv.writer(predictions_file)
                open_file_object.writerow(["PassengerId", "Survived"])
                open_file_object.writerows(zip(passenger_id, y))


        if __name__ == "__main__":
            # Load the training data.
            x, y = load_data('12.Titanic.train.csv', True)
            # Split into training and test halves.
            x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.5, random_state=1)

            # Logistic regression (labels flattened to 1-D for scikit-learn).
            lr = LogisticRegression(penalty='l2')
            lr.fit(x_train, y_train.ravel())
            y_hat = lr.predict(x_test)
            lr_rate = show_accuracy(y_hat, y_test, ' Logistic回归')

            # Random forest with 100 trees.
            rfc = RandomForestClassifier(n_estimators=100)
            rfc.fit(x_train, y_train.ravel())
            y_hat = rfc.predict(x_test)
            rfc_rate = show_accuracy(y_hat, y_test, ' 随机森林')

            # XGBoost
            data_train = xgb.DMatrix(x_train, label=y_train)
            data_test = xgb.DMatrix(x_test, label=y_test)
            # Data sets whose metrics are reported after every boosting round.
            watch_list = [(data_test, 'eval'), (data_train, 'train')]
            # Binary classification with probability output.
            # NOTE(review): newer XGBoost releases use 'verbosity' instead of
            # 'silent' - confirm against the installed version.
            param = {'max_depth': 3, 'eta': 0.1, 'silent': 1, 'objective': 'binary:logistic'}
            bst = xgb.train(param, data_train, num_boost_round=100, evals=watch_list)
            y_hat = bst.predict(data_test)
            # Probabilities above 0.5 become class 1, everything else class 0.
            y_hat = np.where(y_hat > 0.5, 1, 0)
            xgb_rate = show_accuracy(y_hat, y_test, ' XGBoost ')

            print(' Logistic回归:%.3f%% ' % lr_rate)
            print(' 随机森林:%.3f%% ' % rfc_rate)
            print(' XGBoost:%.3f%% ' % xgb_rate)
  • 相关阅读:
    期末总结
    虚拟存储器学习记录
    实验报告
    并发编程学习记录
    进程&信号&管道实践学习记录
    异常控制流学习记录
    系统级IO实践学习记录
    系统级I/O学习记录
    Arduino小车学习与研究
    期中总结
  • 原文地址:https://www.cnblogs.com/xiaochi/p/11354702.html
Copyright © 2011-2022 走看看