我能从决策树中的训练树中提取基本的决策规则(或“决策路径”)作为文本列表吗?

喜欢的东西:

if A>0.4 then if B<0.2 then if C>0.8 then class='X'

当前回答

这是您需要的代码

我已经修改了顶部喜欢的代码缩进在一个jupyter笔记本python 3正确

import numpy as np
from sklearn.tree import _tree

def tree_to_code(tree, feature_names):
    tree_ = tree.tree_
    feature_name = [feature_names[i] 
                    if i != _tree.TREE_UNDEFINED else "undefined!" 
                    for i in tree_.feature]
    print("def tree({}):".format(", ".join(feature_names)))

    def recurse(node, depth):
        indent = "    " * depth
        if tree_.feature[node] != _tree.TREE_UNDEFINED:
            name = feature_name[node]
            threshold = tree_.threshold[node]
            print("{}if {} <= {}:".format(indent, name, threshold))
            recurse(tree_.children_left[node], depth + 1)
            print("{}else:  # if {} > {}".format(indent, name, threshold))
            recurse(tree_.children_right[node], depth + 1)
        else:
            print("{}return {}".format(indent, np.argmax(tree_.value[node])))

    recurse(0, 1)

其他回答

下面是我以一种可以直接在sql中使用的形式提取决策规则的方法,这样数据就可以按节点分组。(根据之前海报的做法)

结果将是后续的CASE子句,可以复制到sql语句,例如。

SELECT COALESCE(*CASE WHEN <conditions> THEN > <NodeA>*, >* CASE WHEN <条件> THEN <NodeB>*, > ....)* > FROM <表或视图>


import numpy as np

import pickle
feature_names=.............
features  = [feature_names[i] for i in range(len(feature_names))]
clf= pickle.loads(trained_model)
impurity=clf.tree_.impurity
importances = clf.feature_importances_
SqlOut=""

#global Conts
global ContsNode
global Path
#Conts=[]#
ContsNode=[]
Path=[]
global Results
Results=[]

def print_decision_tree(tree, feature_names, offset_unit=''    ''):    
    left      = tree.tree_.children_left
    right     = tree.tree_.children_right
    threshold = tree.tree_.threshold
    value = tree.tree_.value

    if feature_names is None:
        features  = [''f%d''%i for i in tree.tree_.feature]
    else:
        features  = [feature_names[i] for i in tree.tree_.feature]        

    def recurse(left, right, threshold, features, node, depth=0,ParentNode=0,IsElse=0):
        global Conts
        global ContsNode
        global Path
        global Results
        global LeftParents
        LeftParents=[]
        global RightParents
        RightParents=[]
        for i in range(len(left)): # This is just to tell you how to create a list.
            LeftParents.append(-1)
            RightParents.append(-1)
            ContsNode.append("")
            Path.append("")


        for i in range(len(left)): # i is node
            if (left[i]==-1 and right[i]==-1):      
                if LeftParents[i]>=0:
                    if Path[LeftParents[i]]>" ":
                        Path[i]=Path[LeftParents[i]]+" AND " +ContsNode[LeftParents[i]]                                 
                    else:
                        Path[i]=ContsNode[LeftParents[i]]                                   
                if RightParents[i]>=0:
                    if Path[RightParents[i]]>" ":
                        Path[i]=Path[RightParents[i]]+" AND not " +ContsNode[RightParents[i]]                                   
                    else:
                        Path[i]=" not " +ContsNode[RightParents[i]]                     
                Results.append(" case when  " +Path[i]+"  then ''" +"{:4d}".format(i)+ " "+"{:2.2f}".format(impurity[i])+" "+Path[i][0:180]+"''")

            else:       
                if LeftParents[i]>=0:
                    if Path[LeftParents[i]]>" ":
                        Path[i]=Path[LeftParents[i]]+" AND " +ContsNode[LeftParents[i]]                                 
                    else:
                        Path[i]=ContsNode[LeftParents[i]]                                   
                if RightParents[i]>=0:
                    if Path[RightParents[i]]>" ":
                        Path[i]=Path[RightParents[i]]+" AND not " +ContsNode[RightParents[i]]                                   
                    else:
                        Path[i]=" not "+ContsNode[RightParents[i]]                      
                if (left[i]!=-1):
                    LeftParents[left[i]]=i
                if (right[i]!=-1):
                    RightParents[right[i]]=i
                ContsNode[i]=   "( "+ features[i] + " <= " + str(threshold[i])   + " ) "

    recurse(left, right, threshold, features, 0,0,0,0)
print_decision_tree(clf,features)
SqlOut=""
for i in range(len(Results)): 
    SqlOut=SqlOut+Results[i]+ " end,"+chr(13)+chr(10)

我相信这个答案比这里的其他答案更正确:

from sklearn.tree import _tree

def tree_to_code(tree, feature_names):
    tree_ = tree.tree_
    feature_name = [
        feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
        for i in tree_.feature
    ]
    print "def tree({}):".format(", ".join(feature_names))

    def recurse(node, depth):
        indent = "  " * depth
        if tree_.feature[node] != _tree.TREE_UNDEFINED:
            name = feature_name[node]
            threshold = tree_.threshold[node]
            print "{}if {} <= {}:".format(indent, name, threshold)
            recurse(tree_.children_left[node], depth + 1)
            print "{}else:  # if {} > {}".format(indent, name, threshold)
            recurse(tree_.children_right[node], depth + 1)
        else:
            print "{}return {}".format(indent, tree_.value[node])

    recurse(0, 1)

这将打印出一个有效的Python函数。下面是一个树的输出示例,它试图返回它的输入,一个0到10之间的数字。

def tree(f0):
  if f0 <= 6.0:
    if f0 <= 1.5:
      return [[ 0.]]
    else:  # if f0 > 1.5
      if f0 <= 4.5:
        if f0 <= 3.5:
          return [[ 3.]]
        else:  # if f0 > 3.5
          return [[ 4.]]
      else:  # if f0 > 4.5
        return [[ 5.]]
  else:  # if f0 > 6.0
    if f0 <= 8.5:
      if f0 <= 7.5:
        return [[ 7.]]
      else:  # if f0 > 7.5
        return [[ 8.]]
    else:  # if f0 > 8.5
      return [[ 9.]]

以下是我在其他答案中看到的一些绊脚石:

使用tree_。用阈值== -2来判断节点是否是叶节点不是一个好主意。如果它是一个阈值为-2的真实决策节点呢?相反,你应该看看树。Feature or tree.children_*。 对于tree_中的i,行features = [feature_names[i]。我的sklearn版本崩溃了,因为树。树_。特征为-2(特别是叶节点)。 递归函数中不需要有多个if语句,一个就可以了。

从这个答案中,您可以得到一个可读且高效的表示:https://stackoverflow.com/a/65939892/3746632

输出如下所示。X为一维向量,表示单个实例的特征。

from numba import jit,njit
@njit
def predict(X):
    ret = 0
    if X[0] <= 0.5: # if w_pizza <= 0.5
        if X[1] <= 0.5: # if w_mexico <= 0.5
            if X[2] <= 0.5: # if w_reusable <= 0.5
                ret += 1
            else:  # if w_reusable > 0.5
                pass
        else:  # if w_mexico > 0.5
            ret += 1
    else:  # if w_pizza > 0.5
        pass
    if X[0] <= 0.5: # if w_pizza <= 0.5
        if X[1] <= 0.5: # if w_mexico <= 0.5
            if X[2] <= 0.5: # if w_reusable <= 0.5
                ret += 1
            else:  # if w_reusable > 0.5
                pass
        else:  # if w_mexico > 0.5
            pass
    else:  # if w_pizza > 0.5
        ret += 1
    if X[0] <= 0.5: # if w_pizza <= 0.5
        if X[1] <= 0.5: # if w_mexico <= 0.5
            if X[2] <= 0.5: # if w_reusable <= 0.5
                ret += 1
            else:  # if w_reusable > 0.5
                ret += 1
        else:  # if w_mexico > 0.5
            ret += 1
    else:  # if w_pizza > 0.5
        pass
    if X[0] <= 0.5: # if w_pizza <= 0.5
        if X[1] <= 0.5: # if w_mexico <= 0.5
            if X[2] <= 0.5: # if w_reusable <= 0.5
                ret += 1
            else:  # if w_reusable > 0.5
                ret += 1
        else:  # if w_mexico > 0.5
            pass
    else:  # if w_pizza > 0.5
        ret += 1
    if X[0] <= 0.5: # if w_pizza <= 0.5
        if X[1] <= 0.5: # if w_mexico <= 0.5
            if X[2] <= 0.5: # if w_reusable <= 0.5
                ret += 1
            else:  # if w_reusable > 0.5
                pass
        else:  # if w_mexico > 0.5
            pass
    else:  # if w_pizza > 0.5
        pass
    if X[0] <= 0.5: # if w_pizza <= 0.5
        if X[1] <= 0.5: # if w_mexico <= 0.5
            if X[2] <= 0.5: # if w_reusable <= 0.5
                ret += 1
            else:  # if w_reusable > 0.5
                pass
        else:  # if w_mexico > 0.5
            ret += 1
    else:  # if w_pizza > 0.5
        ret += 1
    if X[0] <= 0.5: # if w_pizza <= 0.5
        if X[1] <= 0.5: # if w_mexico <= 0.5
            if X[2] <= 0.5: # if w_reusable <= 0.5
                ret += 1
            else:  # if w_reusable > 0.5
                pass
        else:  # if w_mexico > 0.5
            pass
    else:  # if w_pizza > 0.5
        ret += 1
    if X[0] <= 0.5: # if w_pizza <= 0.5
        if X[1] <= 0.5: # if w_mexico <= 0.5
            if X[2] <= 0.5: # if w_reusable <= 0.5
                ret += 1
            else:  # if w_reusable > 0.5
                pass
        else:  # if w_mexico > 0.5
            pass
    else:  # if w_pizza > 0.5
        pass
    if X[0] <= 0.5: # if w_pizza <= 0.5
        if X[1] <= 0.5: # if w_mexico <= 0.5
            if X[2] <= 0.5: # if w_reusable <= 0.5
                ret += 1
            else:  # if w_reusable > 0.5
                pass
        else:  # if w_mexico > 0.5
            pass
    else:  # if w_pizza > 0.5
        pass
    if X[0] <= 0.5: # if w_pizza <= 0.5
        if X[1] <= 0.5: # if w_mexico <= 0.5
            if X[2] <= 0.5: # if w_reusable <= 0.5
                ret += 1
            else:  # if w_reusable > 0.5
                pass
        else:  # if w_mexico > 0.5
            pass
    else:  # if w_pizza > 0.5
        pass
    return ret/10

因为每个人都很乐于助人,所以我将对Zelazny7和Daniele的漂亮解决方案进行修改。这是针对python 2.7的,使用tab使其更具可读性:

def get_code(tree, feature_names, tabdepth=0):
    left      = tree.tree_.children_left
    right     = tree.tree_.children_right
    threshold = tree.tree_.threshold
    features  = [feature_names[i] for i in tree.tree_.feature]
    value = tree.tree_.value

    def recurse(left, right, threshold, features, node, tabdepth=0):
            if (threshold[node] != -2):
                    print '\t' * tabdepth,
                    print "if ( " + features[node] + " <= " + str(threshold[node]) + " ) {"
                    if left[node] != -1:
                            recurse (left, right, threshold, features,left[node], tabdepth+1)
                    print '\t' * tabdepth,
                    print "} else {"
                    if right[node] != -1:
                            recurse (left, right, threshold, features,right[node], tabdepth+1)
                    print '\t' * tabdepth,
                    print "}"
            else:
                    print '\t' * tabdepth,
                    print "return " + str(value[node])

    recurse(left, right, threshold, features, 0)

这是基于@paulkernfeld的回答。如果你有一个包含特征的数据框架X和一个包含共振的目标数据框架y,你想知道哪个y值结束于哪个节点(并相应地绘制它),你可以做以下工作:

    def tree_to_code(tree, feature_names):
        from sklearn.tree import _tree
        codelines = []
        codelines.append('def get_cat(X_tmp):\n')
        codelines.append('   catout = []\n')
        codelines.append('   for codelines in range(0,X_tmp.shape[0]):\n')
        codelines.append('      Xin = X_tmp.iloc[codelines]\n')
        tree_ = tree.tree_
        feature_name = [
            feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
            for i in tree_.feature
        ]
        #print "def tree({}):".format(", ".join(feature_names))

        def recurse(node, depth):
            indent = "      " * depth
            if tree_.feature[node] != _tree.TREE_UNDEFINED:
                name = feature_name[node]
                threshold = tree_.threshold[node]
                codelines.append ('{}if Xin["{}"] <= {}:\n'.format(indent, name, threshold))
                recurse(tree_.children_left[node], depth + 1)
                codelines.append( '{}else:  # if Xin["{}"] > {}\n'.format(indent, name, threshold))
                recurse(tree_.children_right[node], depth + 1)
            else:
                codelines.append( '{}mycat = {}\n'.format(indent, node))

        recurse(0, 1)
        codelines.append('      catout.append(mycat)\n')
        codelines.append('   return pd.DataFrame(catout,index=X_tmp.index,columns=["category"])\n')
        codelines.append('node_ids = get_cat(X)\n')
        return codelines
    mycode = tree_to_code(clf,X.columns.values)

    # now execute the function and obtain the dataframe with all nodes
    exec(''.join(mycode))
    node_ids = [int(x[0]) for x in node_ids.values]
    node_ids2 = pd.DataFrame(node_ids)

    print('make plot')
    import matplotlib.cm as cm
    colors = cm.rainbow(np.linspace(0, 1, 1+max( list(set(node_ids)))))
    #plt.figure(figsize=cm2inch(24, 21))
    for i in list(set(node_ids)):
        plt.plot(y[node_ids2.values==i],'o',color=colors[i], label=str(i))  
    mytitle = ['y colored by node']
    plt.title(mytitle ,fontsize=14)
    plt.xlabel('my xlabel')
    plt.ylabel(tagname)
    plt.xticks(rotation=70)       
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, 1.00), shadow=True, ncol=9)
    plt.tight_layout()
    plt.show()
    plt.close 

不是最优雅的版本,但它做到了…