Efficient construction of a large (~200 million row) data frame

I am trying to build a large (~200 million row) data frame in R. Each record in the data frame will be a number with approximately 10 digits (e.g. 1234.12345). The code moves through the list, subtracting the element at position [i] from each element after [i], but not from the elements before [i] (if I put the output into a matrix, it would be a triangular matrix). The code is simple and works fine on small lists, but I wonder whether there is a faster or more efficient way to do this. I assume the first part of the answer will be "don't use a nested loop", but I'm not sure what the alternatives are.

The idea is that this will be an edge list for a social network analysis graph. Once I have the outlist, I will cut down the number of edges based on some criteria (<, >, ==), so the final list (and graph) won't be so heavy. (A sketch of that filtering step appears after the loop below.)

#Fake data of same approximate dimensions as real data
dlist<-sample(1:20,20, replace=FALSE) 
#purge the output list before running the loop
rm(outlist)
outlist<-data.frame()

for (i in 1:(length(dlist) - 1)) {
    for (j in (i + 1):length(dlist)) {
        outlist <- rbind(outlist, c(dlist[i], dlist[j], dlist[j] - dlist[i]))
    }
}
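
The pruning step mentioned above might then look like this (a minimal sketch; the 0.5 cutoff and the absolute-value criterion are made-up examples, not part of the question):

# Hypothetical edge filter: keep only pairs whose difference exceeds a cutoff
threshold <- 0.5                                  # made-up value for illustration
edges <- outlist[abs(outlist[[3]]) > threshold, ]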
+4
4 answers

IIUC, your final data set will be ~200 million rows by 3 columns, all of type numeric, which takes up a total of:

200e6 (rows) * 3 (cols) * 8 (bytes) / (1024 ^ 3)
# ~ 4.5GB

This is pretty big data, so copying should be avoided wherever possible.

First, a data.table approach that uses its internal (unexported) vecseq function (written in C, so fast and memory-efficient) and :=, which adds the difference column by reference, without copying:

fn1 <- function(x) {
    require(data.table) ## 1.9.2
    lx = length(x)
    vx = as.integer(lx * (lx-1)/2)
    # R v3.1.0 doesn't copy on list(.), so this should be even faster there
    ans = setDT(list(v1 = rep.int(head(x,-1L), (lx-1L):1L), 
                v2=x[data.table:::vecseq(2:lx, (lx-1L):1, vx)]))
    ans[, v3 := v2-v1]
}
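
A quick look at what vecseq generates (it is an unexported data.table internal; the three arguments, as used above, appear to be the start values, the run lengths, and the total output length):

lx <- 5L
# indices of the "later" element of each pair: 2:5, 3:5, 4:5, 5
data.table:::vecseq(2:lx, (lx - 1L):1L, as.integer(lx * (lx - 1L) / 2))
# [1] 2 3 4 5 3 4 5 4 5 5

# base-R equivalent (R >= 4.0.0), shown only for comparison
sequence((lx - 1L):1L, from = 2:lx)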

Benchmarks:

Note: these timings were taken on R v3.0.2; fn1() should be even faster on R v3.1.0, since list(.) doesn't copy there. The other answers here, wrapped as functions for comparison:

fn2 <- function(x) {
    diffmat <- outer(x, x, "-")
    ss <- which(upper.tri(diffmat), arr.ind = TRUE)
    data.frame(v1 = x[ss[,1]], v2 = x[ss[,2]], v3 = diffmat[ss])
}

fn3 <- function(x) {
    idx <- combn(seq_along(x), 2)
    out2 <- data.frame(v1=x[idx[1, ]], v2=x[idx[2, ]])
    out2$v3 <- out2$v2-out2$v1
    out2
}

set.seed(45L)
x = runif(20e3L)
system.time(ans1 <- fn1(x))  ## 18  seconds + ~8GB  (peak) memory usage
system.time(ans2 <- fn2(x))  ## 158 seconds + ~19GB (peak) memory usage
system.time(ans3 <- fn3(x))  ## 809 seconds + ~12GB (peak) memory usage
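
A check that is not in the original answer but is easy to run on a small input: fn1() and fn3() produce identical rows, while fn2() returns the same pairs in column-major order and with v3 = v1 - v2, i.e. the opposite sign.

set.seed(1L)
y  <- runif(10L)
a1 <- as.data.frame(fn1(y))
a3 <- fn3(y)
all.equal(a1, a3, check.attributes = FALSE)   # TRUE -- identical rows

a2 <- fn2(y)                                  # same pairs, different order/sign
a1 <- a1[order(a1$v1, a1$v2), ]
a2 <- a2[order(a2$v1, a2$v2), ]
all.equal(a1$v3, -a2$v3)                      # TRUE after aligning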

The reason fn2() takes so long is mainly the huge memory requirement of outer (>= 19 GB peak) compared to fn1(); fn3() is slow because combn is implemented in pure R.

+7

You can vectorize this with combn:

#Sample Data
N <- 20
set.seed(15) #for reproducibility
dlist <- sample(1:N,N, replace=FALSE) 

idx <- combn(1:N,2)
out2 <- data.frame(i=dlist[idx[1, ]], j=dlist[idx[2, ]])
out2$dist <- out2$j-out2$i

combn generates all the index pairs in one shot, so the data.frame is built in a single step rather than grown one row at a time with rbind. For comparison, here is the original loop:

out1 <- data.frame()
for(i in 1:(length(dlist)-1)){
    for(j in (i+1):length(dlist)){
        out1<-rbind(out1, c(dlist[i],dlist[j], dlist[j]-dlist[i]))
    }
}

Both give the same result:

all(out1==out2)
# [1] TRUE

Timing the two versions with the microbenchmark package (each wrapped in a function) gives:
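
The loops() and combdata() wrappers are not defined in the answer; plausible definitions, assuming they simply bundle the two snippets above, would be:

library(microbenchmark)

loops <- function() {
    out1 <- data.frame()
    for (i in 1:(length(dlist) - 1)) {
        for (j in (i + 1):length(dlist)) {
            out1 <- rbind(out1, c(dlist[i], dlist[j], dlist[j] - dlist[i]))
        }
    }
    out1
}

combdata <- function() {
    idx <- combn(1:N, 2)
    out2 <- data.frame(i = dlist[idx[1, ]], j = dlist[idx[2, ]])
    out2$dist <- out2$j - out2$i
    out2
}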

microbenchmark(loops(), combdata())
# Unit: microseconds
#        expr       min        lq     median         uq       max neval
#     loops() 30888.403 32230.107 33764.7170 34821.2850 82891.166   100
#  combdata()   684.316   800.384   873.5015   940.9215  4285.627   100

That is roughly a 40x speed-up (median 33765 vs 874 microseconds) even at N = 20, and the gap widens as the data grows, since rbind-ing in a loop copies the accumulated data.frame on every iteration.

+2

You can also get this with outer:

vec <- 1:10

diffmat <- outer(vec,vec,"-")
ss <- which(upper.tri(diffmat),arr.ind = TRUE)

data.frame(one = vec[ss[,1]], 
           two = vec[ss[,2]], 
           diff = diffmat[ss])
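
One caveat (mine, not the answerer's): here diff = one - two, which is the opposite sign from the dlist[j] - dlist[i] in the question's loop. If the sign matters, compute it directly:

# same pairs, with diff = two - one to match the question's loop
data.frame(one  = vec[ss[, 1]],
           two  = vec[ss[, 2]],
           diff = vec[ss[, 2]] - vec[ss[, 1]])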
+1

The same idea again: compute the full matrix of pairwise differences, then set the entries you don't need (the diagonal and the lower triangle) to NA.

0

Source: https://habr.com/ru/post/1546423/
