0

the decision tree rule right now that I am having

{
  "id": -1,
  "rule": "TOTAL_REVENUE <= 300",
  "left": {
        "id": "0",
        "rule": "TOTAL_DATA_DUR <= 39.5794",
        
        "left": {
          "id": "1",
          "rule": "TOTAL_DATA_DUR <= 0.7408",
         
          "left": null,
          "right": {
            "id": "3",
            "rule": "TOTAL_PACKAGE_REVENUE <= 15.1350",
           
            "left": {
              "id": "4",
              "rule": "TOTAL_PACKAGE_REVENUE_14DAYS <= 12.5000",
             
              "left": null,
              "right": {
                "id": "6",
              
                "value": 84.62
                
              }
            },
            "right": null
          }
        },
        "right": {
          "id": "8",
          "rule": "TOTAL_DATA_DUR <= 301.6211",
        
          "left": null,
          "right": {
            "id": "10",
            "rule": "TOTAL_DATA_DUR <= 6898.9146",
           
            "left": {
              "id": "11",
              "rule": "TOTAL_PACKAGE_REVENUE <= 14.5000",
             
              "left": null,
              "right": {
                "id": "13",
                "rule": "TOTAL_PACKAGE_REVENUE <= 16.0000",
             
                "left": {
                  "id": "14",
                 
                 
                  "value": 84.96
                  
                },
                "right": {
                  "id": "15",
                  "rule": "TOTAL_PACKAGE_REVENUE <= 19.5000",
                  "left": null,
                  "right": {
                    "id": "17",
                   
                   
                    "value": 70.8
                    
                  }
                }
              }
            },
            "right": null
          }
        }
      }
}

the output that I am trying to get is

path1 -> TOTAL_REVENUE <= 300 and TOTAL_DATA_DUR <= 39.5794 TOTAL_DATA_DUR > 0.7408 and TOTAL_PACKAGE_REVENUE <= 15.1350 and TOTAL_PACKAGE_REVENUE_14DAYS > 12.5000

here you can see TOTAL_DATA_DUR > 0.7408 and TOTAL_PACKAGE_REVENUE_14DAYS > 12.5000 is being reversed as it is traversing through the right the rest of the condition is <= because its going through left

path2 -> TOTAL_REVENUE <= 300 and TOTAL_DATA_DUR > 39.5794 and TOTAL_DATA_DUR >  301.6211 and TOTAL_DATA_DUR <= 6898.9146 and TOTAL_PACKAGE_REVENUE > 14.5000 and TOTAL_PACKAGE_REVENUE <= 16.0000

path3 -> TOTAL_REVENUE <= 300 and TOTAL_DATA_DUR > 39.5794 and TOTAL_DATA_DUR >  301.6211 and TOTAL_DATA_DUR <= 6898.9146 and TOTAL_PACKAGE_REVENUE > 14.5000 and TOTAL_PACKAGE_REVENUE > 16.0000 and TOTAL_PACKAGE_REVENUE > 19.5000

I am fairly new to coding how will I get the required output using recursion

the code that I am working on right now

from collections import deque
import json
def isLeaf1(node):
    return node.get('left') is None and node.get('right') is None



all_paths = []


def printRootToLeafPaths1(node, path, node_type=None):
    # base case
    if node is None:

        return

    # include the current node to the path
    if node.get('rule') is not None:
        path.append(node.get('rule'))

    # if a leaf node is found, print the path
    if isLeaf1(node):
        all_paths.append(list(path))


    # recur for the left and right subtree
    printRootToLeafPaths1(node.get('left'), path, 'left')
    printRootToLeafPaths1(node.get('right'), path, 'right')

    # backtrack: remove the current node after the left, and right subtree are done
    path.pop()



# The main function to print paths from the root node to every leaf node
def printRootToLeafPath1(root):
    # list to store root-to-leaf path
    path = deque()
    printRootToLeafPaths1(root, path)

json_rule ='{"id":-1,"rule":"TOTAL_REVENUE <= 300","left":{"id":"0","rule":"TOTAL_DATA_DUR <= 39.5794","left":{"id":"1","rule":"TOTAL_DATA_DUR <= 0.7408","left":null,"right":{"id":"3","rule":"TOTAL_PACKAGE_REVENUE <= 15.1350","left":{"id":"4","rule":"TOTAL_PACKAGE_REVENUE_14DAYS <= 12.5000","left":null,"right":{"id":"6","value":84.62}},"right":null}},"right":{"id":"8","rule":"TOTAL_DATA_DUR <= 301.6211","left":null,"right":{"id":"10","rule":"TOTAL_DATA_DUR <= 6898.9146","left":{"id":"11","rule":"TOTAL_PACKAGE_REVENUE <= 14.5000","left":null,"right":{"id":"13","rule":"TOTAL_PACKAGE_REVENUE <= 16.0000","left":{"id":"14","value":84.96},"right":{"id":"15","rule":"TOTAL_PACKAGE_REVENUE <= 19.5000","impurity":"0.47643265235457066","samples":"304","left":null,"right":{"id":"17","value":70.8}}}},"right":null}}}}'

printRootToLeafPath1(json.loads(json_rule))

can anyone tell me what changes should I make in this code in order to obtains the output paths ?

aziz shaw
  • 144
  • 1
  • 12
  • *"output that I am trying to get is..."*: it seems to be a single string, yet your code seems to work with a list `all_paths`. So are you sure you just want that single string as output? Why that one, and not another "path"? Currently your function never returns anything. What is the expected data type of the returned value? A list of strings? – trincot Feb 07 '22 at 08:23
  • @trincot all_paths contains list of list strings each list in all_paths will be path and the output can be made by joining each element in a list with "and" . please suggest it there is a better way to do it – aziz shaw Feb 07 '22 at 08:28
  • What should happen with the "value" in the leaf node? You don't use it? – trincot Feb 07 '22 at 09:20
  • @trincot thanks you . actually from sklearn decision tree I formed the rule for in the form of json for the train data . then I pruned the leaf nodes having probability <50 % that's children's of some nodes are null . . the value is actually the probability value – aziz shaw Feb 07 '22 at 10:28
  • OK, but is your path generating algorithm not going to use it? – trincot Feb 07 '22 at 10:29
  • @trincot yes so after I got the query conditions I am using the query to back test on previous months data . and check for the control responders . the project is for a recommendation system – aziz shaw Feb 07 '22 at 10:34
  • OK, but is your path generating algorithm not going to *collect* these values, and are you going to visit the tree *again* when you need those values? – trincot Feb 07 '22 at 10:36
  • @trincot yes in order to show average probability , or probability of paths . may need to collect these value – aziz shaw Feb 07 '22 at 10:38
  • OK, if you can make that work I have no further questions ;-). if you need this in my answer, then please edit the question to show how the value should be incorporated into the output and I'll update my answer. – trincot Feb 07 '22 at 10:43
  • Let us [continue this discussion in chat](https://chat.stackoverflow.com/rooms/241783/discussion-between-aziz-shaw-and-trincot). – aziz shaw Feb 07 '22 at 11:21

1 Answers1

1

Some remarks:

  • Instead of printing in the recursive function, yield. That way, the caller can decide what they do with the paths: print them, or something else.

  • It is bad practice to use a global variable and let the function mutate that variable all_paths. Instead aim to make the function "pure", so it does not need that and can produce the paths through yielding them (or returning them).

  • The pop() call is unconditional, while the append() call is conditional. This looks as it there is a possibility to pop something when it was not appended in the same function, so it will be error prone. There is a better way to do recursion -- next point:

  • Instead of passing a partial path to the recursive call, let the recursive call give the paths it has found "as if" it was the root, and then prefix the current node's rule to it. This is the better practice for recursion.

  • Apparently, when there is no "rule" key in the node, it is a leaf-node. In that case it is not needed to look for left/right anymore, as it is understood that a node has either a "rule" key and children, or has no "rule" key and no children. In the case of a leaf, just yield an empty path which the caller can extend.

  • There is no logic in your code that negates a rule. For that you can use list of pairs that maps each operator to its opposite. And if none of those match the rule, then just apply a default not (rule) format. This logic assumes that a rule uses just one operator.

  • Your algorithm makes no use of the "value" properties in the leaves of the tree. I think you will need to use that information at some point, but I will ignore it for now.

Here is a possible implementation, taking the above points into account:

def negate(rule):
    mapper = (("<=", ">"), (">=", "<"), (">", "<="), ("<", ">="))
    for operator, negated in mapper:
        if operator in rule:
            return rule.replace(operator, negated)
    return "not (" + rule + ")"  # Default when operator not recognised

def nodeToLeafPaths(node):
    if not node:  # base case: nothing in this direction
        return
    rule = node.get('rule')
    if rule is None:  # base case: a leaf with a value
        yield []  # empty path
        return

    negated_rule = negate(rule)
    for path in nodeToLeafPaths(node.get('left')):
        yield [rule, *path]  # Extend path with current rule
    for path in nodeToLeafPaths(node.get('right')):
        yield [negated_rule, *path]

# Transform paths (lists) to AND-rules (strings):
def rootToLeafConjugations(root):
    return [" AND ".join(path) for path in nodeToLeafPaths(root)]

The main driver code could look like this:

import json

json_rule ='{"id":-1,"rule":"TOTAL_REVENUE <= 300","left":{"id":"0","rule":"TOTAL_DATA_DUR <= 39.5794","left":{"id":"1","rule":"TOTAL_DATA_DUR <= 0.7408","left":null,"right":{"id":"3","rule":"TOTAL_PACKAGE_REVENUE <= 15.1350","left":{"id":"4","rule":"TOTAL_PACKAGE_REVENUE_14DAYS <= 12.5000","left":null,"right":{"id":"6","value":84.62}},"right":null}},"right":{"id":"8","rule":"TOTAL_DATA_DUR <= 301.6211","left":null,"right":{"id":"10","rule":"TOTAL_DATA_DUR <= 6898.9146","left":{"id":"11","rule":"TOTAL_PACKAGE_REVENUE <= 14.5000","left":null,"right":{"id":"13","rule":"TOTAL_PACKAGE_REVENUE <= 16.0000","left":{"id":"14","value":84.96},"right":{"id":"15","rule":"TOTAL_PACKAGE_REVENUE <= 19.5000","impurity":"0.47643265235457066","samples":"304","left":null,"right":{"id":"17","value":70.8}}}},"right":null}}}}'

for rule in rootToLeafConjugations(json.loads(json_rule)):
    print(rule)
trincot
  • 317,000
  • 35
  • 244
  • 286