R: Create a new column that flags whether a row is the last record of its type for a user

I am trying to create a new column, presumably using mutate, which will determine whether a row matches several criteria. Basically, for each user I want to identify the final row (in time) for a particular DataCode. Only some DataCodes are applicable (1000 and 2000 in the example below), while others should return NA (3000 here). I'm trying to work this out in my head, and all I can think of is a very long mutate statement with a series of if statements. Is there a more elegant way?

The IsFinal column below shows the desired output.

 User Time  DataCode Data     IsFinal
 101  10    1000     50       0
 101  20    2000     300      1
 101  30    3000     150      NA
 101  40    1000     250      1
 101  50    3000     300      NA
 102  10    2000     50       0
 102  20    1000     150      0
 102  30    1000     150      0
 102  40    2000     350      1
 102  50    3000     150      NA
 102  60    1000     50       1
+4
3

One option is to combine merge with the dplyr package:

library(dplyr)

# Step 1: flag the last row (in time) of every User/DataCode combination.
# Rows with DataCode 3000 are not applicable, so they are removed here and
# get IsFinal = NA when they are brought back by the merge in step 2.
new.tab <- query.tab %>%
  filter(DataCode != 3000) %>%
  group_by(User, DataCode) %>%
  arrange(Time) %>%
  mutate(IsFinal = ifelse(row_number() == n(), 1, 0)) %>%
  # Drop the grouping so it does not leak into the merge or later verbs.
  ungroup()

# Step 2: right outer join back onto the full table. Every row of query.tab
# is kept (all.y = TRUE); rows missing from new.tab receive IsFinal = NA.
fin.tab <- merge(new.tab, query.tab, all.x = FALSE, all.y = TRUE)

Alternatively, using dplyr only:

# Flag the last row (in time) of each User/DataCode combination in one pass.
# Rows with the non-applicable DataCode 3000 get NA instead of a 0/1 flag.
fin.tab <- query.tab %>%
  group_by(User, DataCode) %>%
  arrange(User, Time) %>%
  mutate(
    IsFinal = ifelse(
      DataCode == 3000,
      # Typed NA keeps the column numeric even for groups that are all 3000.
      NA_real_,
      ifelse(row_number() == n(), 1, 0)
    )
  ) %>%
  # Return an ungrouped table so later verbs are not silently applied by group.
  ungroup()

Output:

> fin.tab

#    User Time DataCode Data IsFinal 
# 1   101   10     1000   50       0 
# 2   101   20     2000  300       1 
# 3   101   30     3000  150      NA 
# 4   101   40     1000  250       1 
# 5   101   50     3000  300      NA 
# 6   102   10     2000   50       0 
# 7   102   20     1000  150       0 
# 8   102   30     1000  150       0 
# 9   102   40     2000  350       1 
# 10  102   50     3000  150      NA 
# 11  102   60     1000   50       1

Data:

# Example data: eleven rows across two users, listed by User and then Time.
# All columns are integer.
query.tab <- data.frame(
  User = c(101L, 101L, 101L, 101L, 101L, 102L, 102L, 102L, 102L, 102L, 102L),
  Time = c(10L, 20L, 30L, 40L, 50L, 10L, 20L, 30L, 40L, 50L, 60L),
  DataCode = c(
    1000L, 2000L, 3000L, 1000L, 3000L,
    2000L, 1000L, 1000L, 2000L, 3000L, 1000L
  ),
  Data = c(50L, 300L, 150L, 250L, 300L, 50L, 150L, 150L, 350L, 150L, 50L)
)

. . , .

+3

? if.

# Applicable codes; any test code outside this set is flagged NA.
codes <- c("2000", "1000")
# Put them in order so the end code can be read off the tail.
goodcodes <- codes[order(codes)]
# The last item of the ordered codes is treated as the end code.
endcode <- goodcodes[length(goodcodes)]

testcodes <- c("0500", "1000", "2000", "3000")
n <- length(testcodes)

# Vectorised flagging instead of an index loop. This also works when
# testcodes is empty, where `for (i in 1:n)` would iterate over c(1, 0).
#   0  -> applicable code that is not the end code
#   1  -> the end code
#   NA -> code that is not applicable
IsFinal <- rep(0, n)
IsFinal[testcodes %in% endcode] <- 1
IsFinal[!(testcodes %in% goodcodes)] <- NA

> IsFinal
[1] NA  0  1 NA
> 
+2

Here is a base R method using ave and duplicated with fromLast = TRUE; the NA values are filled in afterwards (see also @masoud's answer).

# Build the IsFinal flag: within each user, mark the last occurrence of every
# DataCode value (in current row order), then blank out non-applicable codes.
query.tab$IsFinal <- local({
  # ave() stores the logical result back into a copy of DataCode, which is
  # why the flag prints as 0/1 rather than FALSE/TRUE.
  flag <- ave(
    query.tab$DataCode,
    query.tab$User,
    FUN = function(codes) !duplicated(codes, fromLast = TRUE)
  )
  # Codes listed here are not applicable and carry NA instead of a flag.
  flag[query.tab$DataCode %in% c(3000)] <- NA
  flag
})

query.tab
   User Time DataCode Data IsFinal
1   101   10     1000   50       0
2   101   20     2000  300       1
3   101   30     3000  150      NA
4   101   40     1000  250       1
5   101   50     3000  300      NA
6   102   10     2000   50       0
7   102   20     1000  150       0
8   102   30     1000  150       0
9   102   40     2000  350       1
10  102   50     3000  150      NA
11  102   60     1000   50       1

Note that this assumes the data are ordered by user and time. This can be achieved by calling order before running the code above:

# Sort the rows by User, then by Time within each user.
query.tab <- query.tab[with(query.tab, order(User, Time)), ]
+2
source

Source: https://habr.com/ru/post/1679992/


All Articles