Smooth a nested list by averaging vectors

Say I have a nested list of vectors.

lst1 <- list(`A`=c(a=1,b=1), `B`=c(a=1), `C`=c(b=1), `D`=c(a=1,b=1,c=1)) lst2 <- list(`A`=c(b=1), `B`=c(a=1,b=1), `C`=c(a=1,c=1), `D`=c(a=1,c=1)) lstX <- list(lst1, lst2) 

As can be seen, each vector A,B,C,D occurs twice with a,b,c present at different frequencies.

What will be the most efficient way to smooth lists so that a,b,c summed or averaged over A,B,C,D over nested lists, as shown below. A real list contains several hundred thousand nested lists.

 #summed abc A 1 2 NA B 2 1 NA C 1 1 1 D 2 1 2 #averaged abc A 0.5 1 NA B 1 0.5 NA C 0.5 0.5 0.5 D 1 0.5 1 
+5
source share
3 answers

Here's a simple basic R solution (which will return 0 instead of NA (not sure if good enough)

 temp <- unlist(lstX) res <- data.frame(do.call(rbind, strsplit(names(temp), "\\.")), value = temp) 

Amount

 xtabs(value ~ X1 + X2, res) # X2 # X1 abc # A 1 2 0 # B 2 1 0 # C 1 1 1 # D 2 1 2 

Funds

 xtabs(value ~ X1 + X2, res) / length(lstX) # X2 # X1 abc # A 0.5 1.0 0.0 # B 1.0 0.5 0.0 # C 0.5 0.5 0.5 # D 1.0 0.5 1.0 

Alternatively, a more flexible data.table solution

 library(data.table) #V1.9.6+ temp <- unlist(lstX) res <- data.table(names(temp))[, tstrsplit(V1, "\\.")][, value := temp] 

Amount

 dcast(res, V1 ~ V2, sum, value.var = "value", fill = NA) # V1 abc # 1: A 1 2 NA # 2: B 2 1 NA # 3: C 1 1 1 # 4: D 2 1 2 

Funds

 dcast(res, V1 ~ V2, function(x) sum(x)/length(lstX), value.var = "value", fill = NA) # V1 abc # 1: A 0.5 1.0 NA # 2: B 1.0 0.5 NA # 3: C 0.5 0.5 0.5 # 4: D 1.0 0.5 1.0 

In general, you can use almost any function with dcast

+5
source

We could also try

 library(data.table) DT1 <- rbindlist(lapply(do.call('c', lstX), as.data.frame.list), fill=TRUE, idcol=TRUE) DT1[, lapply(.SD, sum, na.rm=TRUE), .id] # .id abc #1: A 1 2 0 #2: B 2 1 0 #3: C 1 1 1 #4: D 2 1 2 DT1[, lapply(.SD, function(x) sum(x, na.rm=TRUE)/.N), .id] # .id abc #1: A 0.5 1.0 0.0 #2: B 1.0 0.5 0.0 #3: C 0.5 0.5 0.5 #4: D 1.0 0.5 1.0 
+2
source

This is not the shortest answer or the fastest, but we can try something like this:

 ### Get all the vector names names <- lapply(lstX, function(l) lapply(l, names)) names <- unique(unlist(names)) names ## [1] "a" "b" "c" ## Check if a name is missing, for example setdiff(names, names(lstX[[1]][[1]])) ## [1] "c" ## Now we will check for every vectors within each list ## and fill the missing names with NA and order the results lstX <- lapply(lstX, function(l) { lapply(l, function(v) { v[setdiff(names, names(v))] <- NA v[order(names(v))] ## order by names to bind it without errors }) }) lstX ## [[1]] ## [[1]]$A ## abc ## 1 1 NA ## [[1]]$B ## abc ## 1 NA NA ## [[1]]$C ## abc ## NA 1 NA ## [[1]]$D ## abc ## 1 1 1 ## [[2]] ## [[2]]$A ## abc ## NA 1 NA ## [[2]]$B ## abc ## 1 1 NA ## [[2]]$C ## abc ## 1 NA 1 ## [[2]]$D ## abc ## 1 NA 1 ### Now we can bind it matlist <- lapply(lstX, function(l) do.call(rbind, l)) matlist ## [[1]] ## abc ## A 1 1 NA ## B 1 NA NA ## C NA 1 NA ## D 1 1 1 ## [[2]] ## abc ## A NA 1 NA ## B 1 1 NA ## C 1 NA 1 ## D 1 NA 1 mysum <- apply(simplify2array(matlist), c(1, 2), function(x) ifelse(all(is.na(x)), NA, sum(x, na.rm = TRUE))) mysum ## abc ## A 1 2 NA ## B 2 1 NA ## C 1 1 1 ## D 2 1 2 ### Average over list mysum / length(res) ## abc ## A 0.5 1.0 NA ## B 1.0 0.5 NA ## C 0.5 0.5 0.5 ## D 1.0 0.5 1.0 

EDIT

Thanks to @CathG you can quickly create a matlist like this

 matlist <- lapply(lstX, function(x) { t(sapply(x, function(y) { y <- y[names] names(y) <- names y })) }) 
+1
source

Source: https://habr.com/ru/post/1236701/


All Articles