RowMean if the string passes the test

I am working on a dataset where the source name is indicated by the abbreviation of the two letters before the variable. Thus, all variables from source AA begin with AA_var1, and source bb has bb_variable_name_2. There are actually many sources and many variable names, but I leave only 2 as a minimal example.

I want to create an average variable for any row where the number of sources, that is, the number of unique prefixes for which the data in this row is not NA, is greater than 1. If there is only one source, I want the general variable to be NA.

So, for example, my data looks like this:

> head(df)
  AA_var1 AA_var2   myid   bb_meow bb_A_v1
1      NA      NA 123456        10      12
2      NA      10 194200        12      NA
3      12      10 132200        NA      NA
4      12      NA 132201        NA      12
5      NA      NA 132202        NA      NA
6      12      13 132203        14      NA

And I want the following:

> head(df)
  AA_var1 AA_var2   myid   bb_meow bb_A_v1  rowMeanIfDiverseData
1      NA      NA 123456        10      12                    NA #has only bb
2      NA      10 194200        12      NA                    11 #has AA and bb
3      12      10 132200        NA      NA                    NA #has only AA
4      12      NA 132201        NA      12                    12 #has AA and bb
5      NA      NA 132202        NA      NA                    NA #has neither
6      12      13 132203        14      NA                    13 #has AA and bb

rowMeans() . , / /, , .

:

mynames <- names(df[!names(df) %in% c("myid")])
tmp <- str_extract(mynames, perl("[A-Za-z]{2}(?=_)"))
uniq <- unique(tmp[!is.na(tmp)])

,

> uniq
[1] "AA" "bb"

, , df :

multiSource <- function(x){
    nm = names(x[!names(x) %in% badnames])           # exclude c("myid")
    tmp <- str_extract(nm, perl("[A-Za-z]{2}(?=_)")) # get prefixes
    uniq <- unique(tmp[!is.na(tmp)])                 # ensure unique and not NA
    if (length(uniq) > 1){
        return(T)
    } else {
        return(F)
    }
 }

, ..

> lapply(df,multiSource)
$AA_var1
[1] FALSE

$AA_var2
[1] FALSE

$bb_meow
[1] FALSE

$bb_A_v1
[1] FALSE

...

> apply(df,MARGIN=1,FUN=multiSource)

TRUE .

...

df$rowMean <- rowMeans(df, na.rm=T)

# so, in this case
rowMeansIfTest <- function(X,test) {
   # is this row muliSource True?
   # if yes, return(rowMeans(X))
   # else return(NA)
}

df$rowMeanIfDiverseData <- rowMeansIfTest(df, test=multiSource)

, - .

+4
5

, , , , NA. rowsums, , , NA, , cbind.

, AA_varXXX, myid. , , , .

df.dat <- df[!names(df) == "myid"]
diverse.rows <- rowSums(
  sapply(
    split.default(df.dat, gsub("^([A-Z]{2})_var.*", "\\1", names(df.dat))), 
    function(x) apply(x, 1, function(y) any(!is.na(y)))
) ) > 1
cbind(df, div.mean=ifelse(diverse.rows, rowMeans(df.dat, na.rm=T), NA))

:

  AA_var1 AA_var2   myid BB_var3 BB_var4 div.mean
1      NA      NA 123456      10      12       NA
2      NA      10 194200      12      NA       11
3      12      10 132200      NA      NA       NA
4      12      NA 132201      NA      12       12
5      NA      NA 132202      NA      NA       NA
6      12      13 132203      14      NA       13
+3

, , , , .

# Here your data:
df <- data.frame(AA_var1 = c(NA,NA,12,12,NA,12),
                 AA_var2 = c(NA,10,10,NA,NA,13),
                 BB_var3 = c(10,12,NA,NA,NA,14),
                 BB_var4 = c(12,NA,NA,12,NA,NA))

# calculate rowMeans for each subset of variables
a <- rowMeans(df[,grepl('AA',names(df))], na.rm=TRUE)
b <- rowMeans(df[,grepl('BB',names(df))], na.rm=TRUE)

# count non-missing values for each subset of variables
a2 <- rowSums(!is.na(df[,grepl('AA',names(df))]), na.rm=TRUE)
b2 <- rowSums(!is.na(df[,grepl('BB',names(df))]), na.rm=TRUE)

# calculate means:
rowSums(cbind(a*a2,b*b2)) /
    rowSums(!is.na(df[,grepl('[AA]|[BB]',names(df))]), na.rm=TRUE)

:

> df$rowMeanIfDiverseData <- rowSums(cbind(a*a2,b*b2)) /
+         rowSums(!is.na(df[,grepl('[AA]|[BB]',names(df))]), na.rm=TRUE)
> df
  AA_var1 AA_var2 BB_var3 BB_var4 rowMeanIfDiverseData
1      NA      NA      10      12                  NaN
2      NA      10      12      NA                   11
3      12      10      NA      NA                  NaN
4      12      NA      NA      12                   12
5      NA      NA      NA      NA                  NaN
6      12      13      14      NA                   13

, :

> df$rowMeanIfDiverseData[is.nan(df$rowMeanIfDiverseData)] <- NA
> df
  AA_var1 AA_var2 BB_var3 BB_var4 rowMeanIfDiverseData
1      NA      NA      10      12                   NA
2      NA      10      12      NA                   11
3      12      10      NA      NA                   NA
4      12      NA      NA      12                   12
5      NA      NA      NA      NA                   NA
6      12      13      14      NA                   13
+1

, ...

dat<-data.frame(AA_var1=c(NA,NA,12,12,NA,12),
                    AA_var2=c(NA,10,10,NA,NA,13),
                    myid=1:6,
                    BB_var3=c(10,12,NA,NA,NA,14),
                    BB_var4=c(12,NA,NA,12,NA,NA))

#what columns are associated with variables used in our mean
varcols<-grep("*var[1-9]",names(dat),value=T)

#which rows have the requisite diversification of non-nulls
#i assume these columns will start with capitals and folloowed by underscore
meanrow<-apply(!is.na(dat[,varcols]),1,function(x){n<-varcols[x]
                              1<length(unique(regmatches(n,regexpr("[A-Z]+_",n))))
                                            })
#do the row mean for all 
dat$meanval<-rowMeans(dat[,varcols],na.rm=T)

#null out for those without diversification (i.e. !meanrow)
dat[!meanrow,"meanval"]<-NA
+1
source

I think some of the answers make this more complicated than that. This will do it:

df$means = ifelse(rowSums(!is.na(df[, grep('AA_var', names(df))])) &
                    rowSums(!is.na(df[, grep('BB_var', names(df))])),
                  rowMeans(df[, grep('_var', names(df))], na.rm = T), NA)
#  AA_var1 AA_var2   myid BB_var3 BB_var4 means
#1      NA      NA 123456      10      12    NA
#2      NA      10 194200      12      NA    11
#3      12      10 132200      NA      NA    NA
#4      12      NA 132201      NA      12    12
#5      NA      NA 132202      NA      NA    NA
#6      12      13 132203      14      NA    13

Here's a generalization of the above, given the comment, subject to a unique identifier (if not, create a unique index):

library(data.table)
library(reshape2)

dt = data.table(df)
setkey(dt, myid) # not strictly necessary, but makes life easier

# find the conditional
cond = melt(dt, id.var = 'myid')[,
         sum(!is.na(value)), by = list(myid, sub('_var.*', '', variable))][,
         all(V1 != 0), keyby = myid]$V1

# fill in the means (could also do a join, but will rely on ordering instead)
dt[cond, means := rowMeans(.SD, na.rm = T), .SDcols = grep('_var', names(dt))]

dt
#   AA_var1 AA_var2   myid BB_var3 BB_var4 means
#1:      NA      NA 123456      10      12    NA
#2:      12      10 132200      NA      NA    NA
#3:      12      NA 132201      NA      12    12
#4:      NA      NA 132202      NA      NA    NA
#5:      12      13 132203      14      NA    13
#6:      NA      10 194200      12      NA    11
+1
source
fun <- function(x) {
    MEAN <- mean(c(x[1], x[2], x[4], x[5]), na.rm=TRUE)
    CHECK <- sum(!is.na(c(x[1], x[2]))) > 0 & sum(!is.na(c(x[4], x[5])) > 0)
    MEAN * ifelse(CHECK, 1, NaN)
}
df$rowMeanIfDiverseData <- apply(df, 1, fun)
df

  AA_var1 AA_var2   myid BB_var3 BB_var4 rowMeanIfDiverseData
1      NA      NA 123456      10      12                  NaN
2      NA      10 194200      12      NA                   11
3      12      10 132200      NA      NA                  NaN
4      12      NA 132201      NA      12                   12
5      NA      NA 132202      NA      NA                  NaN
6      12      13 132203      14      NA                   13
0
source

Source: https://habr.com/ru/post/1530083/


All Articles