Find data elements in a data frame that pass a rule for a node in a tree model?

So, I used the rpart package to create a tree model, and I found an interesting rule and wondered if there is an easy way to see what observations in this data frame pass this rule.

It seems very tedious to use path.rpart to find the path taken through the tree and then enter those filters by hand to search the data frame. Is there a way to pass a tree and/or a node together with a data frame and get back all the rows of the data frame that ended up in that node?

+4
source share
2 answers

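Modeled on path.rpart, the function below returns the rows of a data frame that fall in a given node. Pass the node number(s) explicitly, or omit them and pick a node by clicking on the plotted tree, just as with path.rpart.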

# Return the rows of `df` that fall in the given terminal node(s) of an rpart
# tree.
#
# Arguments:
#   tree   an object inheriting from "rpart".
#   df     a data frame with exactly one row per entry of tree$where
#          (presumably the data the tree was fitted on; only the row count
#          is checked).
#   nodes  node number(s) as they appear in the row names of tree$frame.
#          When omitted, one node is picked interactively by clicking on the
#          tree plot that is currently open.
#
# Value: a data frame holding the rows of `df` whose tree$where entry points
# at one of the selected nodes. It has zero rows when nothing is clicked or
# when none of the requested nodes exists in the tree. tree$where refers to
# leaf rows of tree$frame, so asking for an internal node selects no rows.
#
# NOTE(review): the name looks like an S3 method for subset() on "rpart"
# objects although the signature differs from the generic; kept unchanged
# for existing callers.
subset.rpart <- function (tree, df, nodes) {
    if (!inherits(tree, "rpart")) {
        stop("Not a legitimate \"rpart\" object")
    }
    stopifnot(nrow(df) == length(tree$where))
    node <- as.numeric(row.names(tree$frame))

    if (missing(nodes)) {
        # rpartco() is internal to rpart; it yields the plot coordinates of
        # the nodes, in the same order as the rows of tree$frame.
        xy <- rpart:::rpartco(tree)
        i <- identify(xy, n = 1L, plot = FALSE)
        # identify() returns integer(0) when no point was chosen, which
        # must not reach a bare `if`.
        if (length(i) == 0L || i <= 0L) {
            return(df[0L, , drop = FALSE])
        }
        return(df[tree$where == i, , drop = FALSE])
    }

    # Translate node numbers into row positions of tree$frame, which is what
    # tree$where stores. Unknown node numbers are reported and ignored.
    idx <- match(nodes, node, nomatch = 0L)
    unknown <- nodes[idx == 0L]
    if (length(unknown) > 0L) {
        warning("supplied nodes ", paste(unknown, collapse = ","),
                " are not in this tree")
    }
    # drop = FALSE keeps a data frame even when `df` has a single column.
    df[tree$where %in% idx[idx > 0L], , drop = FALSE]
}

# Fit a tree on the kyphosis data and draw it with labels, so that its nodes
# can be inspected (and clicked on) afterwards.
library(rpart)  # provides rpart() and the kyphosis data set

fit <- rpart(Kyphosis ~ Age + Number + Start, data = kyphosis)
plot(fit)
text(fit)

rpart tree plot

Then, with the plot still open, call the function without a node number and click on a node:

# No `nodes` argument: the node is chosen by clicking on the open tree plot.
subset.rpart(fit, kyphosis)

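Click on a node and the function returns a data.frame with the observations that fall in that node. Alternatively, if you already know the node number, for example from path.rpart, pass it explicitly: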

# Find a node of interest first; path.rpart() prints the rule leading to it:
# path.rpart(fit)
#  node number: 10  ---> looks interesting
#    root
#    Start>=8.5
#    Start< 14.5
#    Age< 55

# Then pass that node number to get the matching rows of the data frame.
subset.rpart(fit, kyphosis, 10)
#    Kyphosis Age Number Start
# 14   absent   1      4    12
# 20   absent  27      4     9
# 26   absent   9      5    13
# 37   absent   1      3     9
# 39   absent  20      6     9
# 42   absent  35      3    13
# 57   absent   2      3    13
# 59   absent  51      7     9
# 66   absent  17      4    10
# 69   absent  18      4    11
# 78   absent  26      7    13
# 81   absent  36      4    13
+9
#' Logical index of the observations that fall under a node of an rpart tree
#'
#' Marks every observation whose leaf is `node` itself or one of its
#' descendants. The children of node `k` are numbered `2k` and `2k + 1`, so
#' integer-halving a node number walks up to its parent. Only exact integer
#' arithmetic is used; comparing fractional parts of base-2 logarithms is
#' sensitive to floating-point rounding.
#'
#' @param tree rpart model; only `tree$frame` (whose row names are the node
#'   numbers) and `tree$where` (row of `tree$frame` for each observation) are
#'   read.
#' @param node which node/leaf? A single node number, at least 1.
#' @return logical vector with one element per entry of `tree$where`.
#' @export
subset_rpart <- function (tree, node) {
  stopifnot(is.numeric(node), length(node) == 1L, !is.na(node), node >= 1)
  ids <- as.numeric(rownames(tree$frame))

  # Walk every node number up towards the root until it is no longer larger
  # than `node`. The chain of ancestors is strictly decreasing, so a node
  # number ends exactly on `node` if and only if it is `node` or lies below it.
  climb <- ids
  repeat {
    deeper <- climb > node
    if (!any(deeper)) {
      break
    }
    climb[deeper] <- climb[deeper] %/% 2
  }

  # tree$where holds row positions of tree$frame, hence which().
  tree$where %in% which(climb == node)
}



#' Subset a data frame to the observations under a node of an rpart tree
#'
#' Thin wrapper that indexes `df` with the logical vector returned by
#' `subset_rpart(tree, node)`.
#'
#' @param tree rpart model
#' @param node node number
#' @param df data frame with one row per element of that logical vector
#' @return the selected rows of `df`, always as a data frame
#' @export
subset.rpart <- function(tree, node, df) {
  keep <- subset_rpart(tree, node)
  # A shorter logical index would be recycled silently and pick wrong rows.
  stopifnot(nrow(df) == length(keep))
  # drop = FALSE keeps a data frame even when `df` has a single column.
  df[keep, , drop = FALSE]
}
0

Source: https://habr.com/ru/post/1542405/


All Articles