  • Hyperparameter Tuning with a Genetic Algorithm [Decision Trees]

    Background

    I recently took on a project to tune a decision tree with a genetic algorithm. I had always used grid search for tuning and hadn't thought a GA could do the job too. Since I had already written quite a few GA implementations, I accepted the project. I expected it to take a while, but if you know the pieces it only takes two to three hours.

    Algorithm

    For a real project you obviously use a library (nobody writes the model from scratch), so the decision tree is sklearn's. The GA flow itself is clear enough that I wrote it by hand. Below I walk through how each GA step is implemented.
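
    Before the details, here is the overall shape of one run, using the function names defined in the full code at the end of this post: initialize a population of trees, then repeat selection, crossover, and mutation for a fixed number of generations.

        forest = init()                         # initial population
        for i in range(epochs):
            ada = adaption(X, Y, forest)        # fitness -> cumulative probabilities
            forest = choose_trees(forest, ada)  # roulette-wheel selection
            forest = cross(forest)              # single-point parameter crossover
            forest = variation(forest)          # +/-1 mutation on one parameter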

    Initialization

    Pick three of the more important decision-tree parameters, "max_depth", "min_samples_split", and "max_leaf_nodes", and build the initial population by enumerating combinations of their possible values:

        def init():
            forest = []
            # enumerate every combination of the three parameter ranges
            for max_depth in range(5, 31, 3):
                for min_samples_split in range(5, 25, 5):
                    for max_leaf_nodes in range(5, 25, 5):
                        forest.append(make_tree([max_depth, min_samples_split, max_leaf_nodes]))
            return forest

    Selection

    Use cross-validated accuracy as the fitness score, then turn the scores into cumulative probabilities:

        def tree_score(X, Y, clf):
            # fitness: mean accuracy over 5-fold cross-validation
            kf = KFold(n_splits=5)
            score = []
            for train_index, valid_index in kf.split(X):
                clf.fit(X[train_index], Y[train_index])
                pred = clf.predict(X[valid_index])
                score.append(accuracy_score(y_true=Y[valid_index], y_pred=pred))
            return np.mean(score)

        def adaption(X, Y, forest):
            score = []
            for t in forest:
                score.append(tree_score(X, Y, t))
            # remember the best individual for the elitist step later
            best_pos = np.argmax(score)
            global BEST_TREE
            BEST_TREE = copy.deepcopy(forest[best_pos])
            # normalize the scores and accumulate them into a cumulative distribution
            sm = np.sum(score)
            ada = np.array(score) / sm
            for i in range(1, len(ada)):
                ada[i] = ada[i] + ada[i - 1]
            ada[-1] = 1.0  # guard against floating-point drift so a pick is always made
            return ada
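
    The cumulative array ada then drives roulette-wheel selection: draw a uniform random number and keep the first individual whose cumulative probability covers it. This is the choose_trees function from the full code below:

        def choose_trees(forest, ada):
            sz = len(forest)
            result = []
            for i in range(sz):
                r = random.random()
                # pick the first individual whose cumulative probability reaches r
                for j in range(len(ada)):
                    if r <= ada[j]:
                        result.append(copy.deepcopy(forest[j]))
                        break
            return result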

    One thing worth noting about selection: you can use an elitist strategy, i.e. copy the single best individual of the current generation straight into the next one. This helps a lot with the stability of the algorithm.
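
    In the full code this elitist step lives in evaluate_forest: the best tree saved by adaption replaces the weakest individual of the freshly produced generation. The core is just:

        worse_pos = np.argmin(score)   # weakest individual of the new generation
        forest[worse_pos] = BEST_TREE  # replaced by the elite saved in adaption()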

    Crossover

    Crossover operates on the parameter vectors: given two trees clf1 and clf2, pick a random crossover position p and swap the parameter tails after p:

        def _cross_2_tree(t1, t2):
            sz = len(param)

            # read the current parameter values off the two estimators
            t1_param_value = _dict_get_value_list(t1.__dict__, param)
            t2_param_value = _dict_get_value_list(t2.__dict__, param)
            # single-point crossover: cut after position pos and swap the tails
            pos = random.randint(0, sz - 1)
            t1_left = t1_param_value[0:pos + 1]
            t1_right = t1_param_value[pos + 1:]

            t2_left = t2_param_value[0:pos + 1]
            t2_right = t2_param_value[pos + 1:]

            t1_left.extend(t2_right)
            t2_left.extend(t1_right)
            return [make_tree(t1_left), make_tree(t2_left)]


        def cross(forest):
            # cross consecutive pairs; an odd-sized population drops its last individual
            result = []
            sz = len(forest)
            for i in range(1, sz, 2):
                result.extend(_cross_2_tree(forest[i - 1], forest[i]))
            return result

    Mutation

    This step uses a deliberately simple strategy: add or subtract 1 on one randomly chosen parameter. Note that with the check below an individual is kept unchanged with probability VAR_P and mutated with probability 1 - VAR_P.

        def variation(forest):
            result = []
            for t in forest:
                r = random.random()
                # keep the individual unchanged with probability VAR_P
                if r < VAR_P:
                    result.append(t)
                    continue

                # mutate: +/-1 on one randomly chosen parameter
                sz = len(param)
                pos = random.randint(0, sz - 1)
                val = t.__dict__[param[pos]]
                up = random.random()

                if up > 0.5:
                    val = val + 1
                else:
                    val = val - 1

                # clamp to 2, the smallest value sklearn accepts for these parameters
                if val < 2:
                    val = 2
                t.__dict__[param[pos]] = val
                result.append(t)
            return result

    Full code

        import pandas as pd
        import numpy as np
        from sklearn.tree import DecisionTreeClassifier
        from sklearn.model_selection import train_test_split
        from sklearn.model_selection import KFold
        from sklearn.metrics import accuracy_score
        import random
        import copy
        import matplotlib.pyplot as plt

        # the three hyperparameters the GA optimizes
        param = ["max_depth", "min_samples_split", "max_leaf_nodes"]
        epochs = 300
        VAR_P = 0.4  # probability of keeping an individual unchanged in variation()
        BEST_TREE = None


        def make_tree(param_value):
            p = dict(zip(param, param_value))
            return DecisionTreeClassifier(**p)


        def init():
            # initial population: enumerate combinations of the three parameter ranges
            forest = []
            for max_depth in range(5, 31, 3):
                for min_samples_split in range(5, 25, 5):
                    for max_leaf_nodes in range(5, 25, 5):
                        forest.append(make_tree([max_depth, min_samples_split, max_leaf_nodes]))
            return forest


        def tree_score(X, Y, clf):
            # fitness: mean accuracy over 5-fold cross-validation
            kf = KFold(n_splits=5)
            score = []
            for train_index, valid_index in kf.split(X):
                clf.fit(X[train_index], Y[train_index])
                pred = clf.predict(X[valid_index])
                score.append(accuracy_score(y_true=Y[valid_index], y_pred=pred))
            return np.mean(score)


        def evaluate_forest(X, Y, forest):
            # elitism: replace the worst individual of the new generation
            # with the best tree remembered in adaption()
            score = []
            for t in forest:
                score.append(tree_score(X, Y, t))
            worse_pos = np.argmin(score)
            global BEST_TREE
            forest[worse_pos] = BEST_TREE
            score[worse_pos] = tree_score(X, Y, BEST_TREE)

            score.sort(reverse=True)
            return score, np.mean(score)


        def adaption(X, Y, forest):
            score = []
            for t in forest:
                score.append(tree_score(X, Y, t))
            # remember the best individual for the elitist step
            best_pos = np.argmax(score)
            global BEST_TREE
            BEST_TREE = copy.deepcopy(forest[best_pos])
            # normalize and accumulate into a cumulative distribution
            sm = np.sum(score)
            ada = np.array(score) / sm
            for i in range(1, len(ada)):
                ada[i] = ada[i] + ada[i - 1]
            ada[-1] = 1.0  # guard against floating-point drift so a pick is always made
            return ada


        def choose_trees(forest, ada):
            # roulette-wheel selection on the cumulative distribution
            sz = len(forest)
            result = []
            for i in range(sz):
                r = random.random()
                for j in range(len(ada)):
                    if r <= ada[j]:
                        result.append(copy.deepcopy(forest[j]))
                        break
            return result


        def _dict_get_value_list(mp, key_list):
            value_list = []
            for key in key_list:
                value_list.append(mp.get(key))
            return value_list


        def _cross_2_tree(t1, t2):
            sz = len(param)

            t1_param_value = _dict_get_value_list(t1.__dict__, param)
            t2_param_value = _dict_get_value_list(t2.__dict__, param)
            # single-point crossover on the parameter vectors
            pos = random.randint(0, sz - 1)
            t1_left = t1_param_value[0:pos + 1]
            t1_right = t1_param_value[pos + 1:]

            t2_left = t2_param_value[0:pos + 1]
            t2_right = t2_param_value[pos + 1:]

            t1_left.extend(t2_right)
            t2_left.extend(t1_right)
            return [make_tree(t1_left), make_tree(t2_left)]


        def cross(forest):
            result = []
            sz = len(forest)
            for i in range(1, sz, 2):
                result.extend(_cross_2_tree(forest[i - 1], forest[i]))
            return result


        def variation(forest):
            result = []
            for t in forest:
                r = random.random()
                # keep the individual unchanged with probability VAR_P
                if r < VAR_P:
                    result.append(t)
                    continue

                # mutate: +/-1 on one randomly chosen parameter
                sz = len(param)
                pos = random.randint(0, sz - 1)
                val = t.__dict__[param[pos]]
                up = random.random()

                if up > 0.5:
                    val = val + 1
                else:
                    val = val - 1

                if val < 2:
                    val = 2
                t.__dict__[param[pos]] = val
                result.append(t)
            return result


        df = pd.read_csv("../dataset/data.csv", index_col=0)
        X = df.iloc[:, 1:].values
        Y = df.iloc[:, 0].values
        forest = init()

        mean_score_arr = []

        for i in range(epochs):
            ada = adaption(X, Y, forest)
            forest = choose_trees(forest, ada)
            forest = cross(forest)
            forest = variation(forest)
            score, mean = evaluate_forest(X, Y, forest)
            mean_score_arr.append(mean)

            print(i, "/", epochs, ":")
            print("mean:", mean)

        plt.plot(np.arange(len(mean_score_arr)), mean_score_arr)
        plt.show()

    Summary

    Honestly, GA-based tuning feels underwhelming here; plain grid search would get there faster. But it is a technique worth knowing as an idea.
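
    For comparison, handing the same search space to sklearn's grid search takes only a few lines. This is a minimal sketch, assuming the same X and Y as above:

        from sklearn.model_selection import GridSearchCV
        from sklearn.tree import DecisionTreeClassifier

        # the same ranges init() enumerates for the GA's starting population
        grid = {
            "max_depth": list(range(5, 31, 3)),
            "min_samples_split": list(range(5, 25, 5)),
            "max_leaf_nodes": list(range(5, 25, 5)),
        }
        search = GridSearchCV(DecisionTreeClassifier(), grid, cv=5, scoring="accuracy")
        search.fit(X, Y)
        print(search.best_params_, search.best_score_)

    Note the two searches don't explore exactly the same space: the GA can drift off this lattice through its +/-1 mutations, while grid search only ever visits the enumerated points.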

    I recently set up a QQ group for AI discussion: 831852635. Feel free to join if you're interested!
