zoukankan      html  css  js  c++  java
  • 决策树转为hivesql语句部署

    首先生成一个决策树,然后画出决策树,再将决策树条件转化成hiveSQL语句进行部署,并非所有公司有部署决策树的环境,这时候可以使用SQL去部署

    首先创建一个dt_main.py,主要是决策树方面的,代码如下:

    #!/usr/bin/env python
    # coding: utf-8
    
    # In[ ]:
    
    #导入模块
    from sklearn import tree
    import pandas as pd
    import numpy as np
    import os
    import toad
    from pydotplus import graph_from_dot_data
    from sklearn.tree import export_graphviz
    from six import StringIO
    #卡方分箱决策树
    class auto_DecisionTreeRegressor(object):
    
        def __init__(self,datasets,ex_lis,dep='bad_ind',min_samples=0.05,min_samples_leaf=200,min_samples_split=20,max_depth=4,is_bin=True):
    
            '''
            datasets:数据集 dataframe格式
            ex_lis:不参与建模的特征,如id,时间切片等。 list格式
            min_samples:分箱时最小箱的样本占总比 numeric格式
            max_depth:决策树最大深度 numeric格式
            min_samples_leaf:决策树子节点最小样本个数 numeric格式
            min_samples_split:决策树划分前,父节点最小样本个数 numeric格式
            is_bin:是否进行卡方分箱 bool格式(True/False)
            '''
            self.datasets = datasets
            self.ex_lis = ex_lis
            self.dep = dep
            self.max_depth = max_depth
            self.min_samples = min_samples
            self.min_samples_leaf = min_samples_leaf
            self.min_samples_split = min_samples_split
            self.is_bin = is_bin
    
            self.bins = 0
            
        def fit_plot(self):
            os.environ["PATH"] += os.pathsep + 'D:/Program Files'
            dtree = tree.DecisionTreeRegressor(max_depth=self.max_depth,
                                               min_samples_leaf=self.min_samples_leaf, 
                                               min_samples_split=self.min_samples_split)
            
            x = self.datasets.drop(self.ex_lis,axis=1)
            y = self.datasets[self.dep]
            
            combiner = 0
            
            if self.is_bin:
                #分箱
                combiner = toad.transform.Combiner()
                combiner.fit(x,y,method='chi',min_samples = self.min_samples)
    
                x_bin= combiner.transform(x)
                self.bins = combiner.export()        
            else:
                x_bin = x.copy()
                
            dtree = dtree.fit(x_bin,y) 
            
            df_bin = x_bin.copy()
            
            df_bin[self.dep] = y
            
            dot_data = StringIO()
            tree.export_graphviz(dtree, out_file=dot_data,
                                     feature_names=x_bin.columns,
                                     class_names=[self.dep],
                                     filled=True, rounded=True,
                                     special_characters=True)
    
            graph = graph_from_dot_data(dot_data.getvalue()) 
            png = graph.create_png()
            
            #返回字典
            dct = {
                    'df_bin': df_bin,
                    'bins':self.bins,
                    'combiner':combiner,
                    'graph':png,
                    'dtree':dtree,
                    'feature_names':x_bin.columns
                  }
                
            return dct
    View Code

    再创建一个Decision_utils.py,找到决策树的叶子节点并抽取决策路径到SQL中。因为是一个重新写的demo,如果公司实际生产环境,还需要在脚本中使用cast(features as flaot) 之类的函数,就自己添加

    代码如下:

    #!/usr/bin/env python
    # coding: utf-8
    
    
    
    import toad
    import pandas as pd
    import numpy as np
    import pydotplus
    from IPython.display import Image
    from six import StringIO
    import os
    from sklearn import tree  
    
    def DTR_TO_SQL(tree, feature_names, when_else=-1):
        import numpy as np
        
        left = tree.tree_.children_left
        right = tree.tree_.children_right
        threshold = tree.tree_.threshold
        features = [feature_names[i] for i in tree.tree_.feature]   
    
        le='<='               
        g ='>'
        
        idx = np.argwhere(left == -1)[:,0]     
    
        def find_value(tree):
            value_lis=[]
            tree_ = tree.tree_
            def recurse(node, depth):
                from sklearn.tree import _tree
                if tree_.feature[node] != _tree.TREE_UNDEFINED:
                    recurse(tree_.children_left[node], depth + 1)
                    recurse(tree_.children_right[node], depth + 1)
                else:
                    value_lis.append(tree_.value[node][0][0])
            recurse(0, 1)
            return value_lis
    
        value_lis = find_value(tree)
        
        
        def recurse(left, right, child, lineage=None):          
            if lineage is None:
                lineage = [child]
            
            if child in left:
                parent = np.where(left == child)[0].item()
                split = 'l'
            else:
                parent = np.where(right == child)[0].item()
                split = 'r'
            
            lineage.append((parent, split, threshold[parent], features[parent]))
            
            if parent == 0:
                lineage.reverse()
                return lineage
            else:
                return recurse(left, right, parent, lineage)
    
        print('case ')
        for j,child in enumerate(idx):
            clause=' when '
            for node in recurse(left, right, child):
                if len(str(node))<3:
                    continue
                i=node
                if i[1]=='l':  
                    sign=le 
                else: 
                    sign=g
                clause=clause+i[3]+sign+str(i[2])+' and '
            clause=clause[:-4]+' then '+ str(value_lis[j])
            print(clause)
        print('else %s end as clusters'%(when_else))
    View Code

    将这二者放在同一文件夹里面,然后再在此文件夹打开一个notebook,运行下面代码

    import pandas as pd
    import sys
    sys.path.append('/函数/automl/')
    
    from sklearn.datasets import make_classification
    from dt_main import auto_DecisionTreeRegressor
    from Decision_utils import DTR_TO_SQL
    from IPython.display import Image
    
    X, y = make_classification(n_samples=1000,n_features=30,n_classes=2,random_state=328)
    data = pd.DataFrame(X)
    data['bad_ind'] = y
    data['imei'] = [i for i in range(len(data))]
    data.columns = ['f0_radius','f0_texture','f0_perimeter','f0_area','f0_smoothness',
                    'f0_compactness','f0_concavity','f0_concave_points','f0_symmetry',
                    'f0_fractal_dimension','f1_radius_error','f1_texture_error','f1_perimeter_error',
                    'f2_area_error','f2_smoothness_error','f2_compactness_error','f2_concavity_error',
                    'f2_concave_points_error','f2_symmetry_error','f2_fractal_dimension_error',
                    'f3_radius','f3_texture','f3_perimeter','f3_area','f3_smoothness',
                    'f3_compactness','f3_concavity','f3_concave_points','f3_symmetry',
                    'f3_fractal_dimension','bad_ind','imei']
    
    
    uid, dep = "imei", "bad_ind"
    var_names = list(data.columns)
    var_names.remove(dep)
    var_names.remove(uid)
    
    #指定不参与建模的变量,包含标签bad_ind。
    ex_lis = [uid, dep]
    
    #调用决策树函数
    dct = auto_DecisionTreeRegressor(datasets = data,ex_lis = ex_lis,is_bin=False,
                                     dep = dep,min_samples=0.01,max_depth=4,
                                     min_samples_leaf=50,min_samples_split=50).fit_plot()
    #展示图像
    Image(dct['graph'])

     如果需要查看SQL:

    #查看SQL代码
    DTR_TO_SQL(dct['dtree'],dct['feature_names'],when_else=0)
    case 
     when f0_symmetry<=-0.028780914843082428 and f0_symmetry<=-0.6725731492042542 and f2_smoothness_error<=0.06367828324437141  then 0.08433734939759036
     when f0_symmetry<=-0.028780914843082428 and f0_symmetry<=-0.6725731492042542 and f2_smoothness_error>0.06367828324437141 and f2_concave_points_error<=0.9470688104629517  then 0.0
     when f0_symmetry<=-0.028780914843082428 and f0_symmetry<=-0.6725731492042542 and f2_smoothness_error>0.06367828324437141 and f2_concave_points_error>0.9470688104629517  then 0.03773584905660377
     when f0_symmetry<=-0.028780914843082428 and f0_symmetry>-0.6725731492042542  then 0.2857142857142857
     when f0_symmetry>-0.028780914843082428 and f3_symmetry<=0.34790952503681183 and f2_symmetry_error<=-1.2111823558807373  then 0.96
     when f0_symmetry>-0.028780914843082428 and f3_symmetry<=0.34790952503681183 and f2_symmetry_error>-1.2111823558807373 and f2_smoothness_error<=0.37725594639778137  then 1.0
     when f0_symmetry>-0.028780914843082428 and f3_symmetry<=0.34790952503681183 and f2_symmetry_error>-1.2111823558807373 and f2_smoothness_error>0.37725594639778137  then 0.9770114942528736
     when f0_symmetry>-0.028780914843082428 and f3_symmetry>0.34790952503681183  then 0.26
    else 0 end as clusters

    文章参考:https://zhuanlan.zhihu.com/p/188759754

    修改了一些错误,使得整个程序得以顺利运行

  • 相关阅读:
    linux知识笔记4
    linux知识笔记3
    linux知识笔记2
    linux常用命令笔记1
    计算机网络
    软件测试理论5
    软件测试理论4
    软件测试理论3
    Yarn 常用命令
    mac shell终端编辑命令行快捷键
  • 原文地址:https://www.cnblogs.com/cgmcoding/p/14028655.html
Copyright © 2011-2022 走看看