How to create a new column in a data frame based on permutations of other columns?

Suppose I have a dataframe that looks like this:

    var1   var2   var3   var4  
a   TRUE   FALSE  TRUE   FALSE
b   TRUE   TRUE   TRUE   FALSE
c   FALSE  TRUE   FALSE  TRUE
d   TRUE   FALSE  FALSE  FALSE
e   TRUE   FALSE  TRUE   FALSE
f   FALSE  TRUE   FALSE  TRUE

I want to create a new column that assigns rows a to f to categories, based on which permutation of TRUE and FALSE each row has for the variables on top.

In this simplified example, the result will look like this:

    var1   var2   var3   var4    category
a   TRUE   FALSE  TRUE   FALSE      A
b   TRUE   TRUE   TRUE   FALSE      B
c   FALSE  TRUE   FALSE  TRUE       C
d   TRUE   FALSE  FALSE  FALSE      D
e   TRUE   FALSE  TRUE   FALSE      A
f   FALSE  TRUE   FALSE  TRUE       C

Please note that each unique permutation of TRUE and FALSE becomes a different category, and since a and e have the same permutation, they fall into the same category (A).

, , , , TRUE FALSE, , /

+4
3

-

## paste each row's values together, giving one key string per row
## (unname() keeps a column called e.g. "sep" or "collapse" from being
## taken as one of paste()'s own arguments)
x <- do.call(paste, unname(df))
## number the keys by order of first appearance and map the numbers to
## 'LETTERS'. Matching against unique(x) rather than x itself matters:
## match(x, x) returns the *row position* of the first occurrence, which
## skips letters whenever a duplicate row precedes a new permutation and
## yields NA once a new permutation first appears after row 26.
## NOTE: 'LETTERS' still only covers 26 distinct permutations.
df$category <- LETTERS[match(x, unique(x))]
df
#    var1  var2  var3  var4 category
# a  TRUE FALSE  TRUE FALSE        A
# b  TRUE  TRUE  TRUE FALSE        B
# c FALSE  TRUE FALSE  TRUE        C
# d  TRUE FALSE FALSE FALSE        D
# e  TRUE FALSE  TRUE FALSE        A
# f FALSE  TRUE FALSE  TRUE        C

, . .

## single-expression variant: bind the pasted row keys to `x` locally, then
## number them by first appearance (match against unique(x), not x, so the
## letters stay consecutive and do not depend on row positions)
df$category <- LETTERS[
  with(list(x = do.call(paste, unname(df))), match(x, unique(x)))
]

Data:

## sample data: six rows named a-f, four logical columns var1-var4
df <- data.frame(
  var1 = c(TRUE, TRUE, FALSE, TRUE, TRUE, FALSE),
  var2 = c(FALSE, TRUE, TRUE, FALSE, FALSE, TRUE),
  var3 = c(TRUE, TRUE, FALSE, FALSE, TRUE, FALSE),
  var4 = c(FALSE, FALSE, TRUE, FALSE, FALSE, TRUE),
  row.names = c("a", "b", "c", "d", "e", "f")
)
+7
# Example data: six rows, four logical columns, default row names
mydata <- data.frame(
  V1 = c(TRUE, TRUE, FALSE, TRUE, TRUE, FALSE),
  V2 = c(FALSE, TRUE, TRUE, FALSE, FALSE, TRUE),
  V3 = c(TRUE, TRUE, FALSE, FALSE, TRUE, FALSE),
  V4 = c(FALSE, FALSE, TRUE, FALSE, FALSE, TRUE)
)

# One key string per row. unname() keeps column names from being passed to
# paste() as argument names.
row_key <- do.call(paste, unname(mydata))
# Label rows by the order in which their key first appears, so the first row
# is always "A". Going through as.factor() instead would number the keys in
# sorted order and give C D A B C A for this data.
# NOTE: LETTERS only covers 26 distinct permutations.
mydata$category <- LETTERS[match(row_key, unique(row_key))]
+2

, , "". ( "Data.frame" df RichScriven.)

:

# row permutation that sorts `df` by all of its columns, left to right;
# unname() stops a column called e.g. "decreasing" or "method" from being
# taken as one of order()'s own named arguments
o <- do.call(order, unname(df))

, :

# For every column, walk its values in sorted order (`o`) and flag each
# position whose value differs from the one just before it. The first
# position has no predecessor and is always flagged.
starts_new_elt <- lapply(df, function(x) {
  xo <- x[o]
  n <- length(xo)
  # zero rows: return no flags at all (xo[-0] would select nothing and the
  # leading TRUE would leave a stray length-1 result)
  if (n == 0L) {
    return(logical(0))
  }
  c(TRUE, xo[-1L] != xo[-n])
})

TRUE/FALSE, , . , , "data.frame" :

# a sorted row counts as new as soon as any one of its columns changed:
# fold the per-column flag vectors together with element-wise OR
starts_new_row <- Reduce(`|`, starts_new_elt)

"data.frame" , , TRUE , a FALSE .

# flags in sorted row order: TRUE where at least one column differs from
# the sorted row just before it (the first sorted row is always TRUE)
starts_new_row
#[1]  TRUE FALSE  TRUE  TRUE FALSE  TRUE

# the rows of `df` in that same sorted order, to read against the flags
df[o, ]
#   var1  var2  var3  var4
#c FALSE  TRUE FALSE  TRUE
#f FALSE  TRUE FALSE  TRUE
#d  TRUE FALSE FALSE FALSE
#a  TRUE FALSE  TRUE FALSE
#e  TRUE FALSE  TRUE FALSE
#b  TRUE  TRUE  TRUE FALSE

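Finally, cumsum gives the group id of each row of the ordered "data.frame"; the ids are put back into the original row order with order(o):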

# cumsum() turns the "new row" flags into running group ids in sorted order;
# assigning them through `o` puts each id back at its original row position
gr <- integer(length(o))
gr[o] <- cumsum(starts_new_row)
gr
#[1] 3 4 1 2 3 1

:

# renumber the group ids by order of first appearance, then map to letters
LETTERS[match(gr, gr[!duplicated(gr)])]
#[1] "A" "B" "C" "D" "A" "C"

A convenient alternative to the above is based on the function grouping(), which has been added to newer versions of R from the data.table package. grouping() returns the order as before, but with some convenient attributes:

# ordering permutation over all columns of `df`, with group attributes
o2 <- do.call(grouping, df)
# "ends" attribute: used below as the end position of each group in the
# sorted order
ends <- attr(o2, "ends")
# group sizes are the successive differences of the end positions;
# prepending 0 covers the first group and keeps the expression valid when
# `ends` is empty (ends[1] would be NA there). Indexing by order(o2) returns
# the ids to the original row order.
gr2 <- rep(seq_along(ends), diff(c(0L, ends)))[order(o2)]

gr2
#[1] 3 4 1 2 3 1
LETTERS[match(gr2, unique(gr2))]
#[1] "A" "B" "C" "D" "A" "C"
+1
source

Source: https://habr.com/ru/post/1666659/


All Articles