Faster method than while loop to find the infection chain in R

I analyze large tables (300,000 - 500,000 rows) that store data obtained using the disease simulation model. In the model, animals on the landscape infect other animals. For example, in the example below, animal a1 infects each animal in the landscape, and the infection passes from animal to animal, separating from the "chains" of the infection.

In my example below, I want to take a table that stores information about each animal (in my example below, table = allanimals) and cut out only information about the animals the infection chain (I selected the chain in green), so I can calculate the average value of the environment for this chain of infection. d2 d2

Although my while loop works, it is slow, like molasses, when hundreds of thousands of rows are stored in a table and the chain has 40-100 members.

Any ideas on how to speed this up? We hope for a solution tidyverse. I know this "looks fast enough" with my sample data set, but with my data it is very slow ...

Scheme:

enter image description here

Desired conclusion from the example below:

   AnimalID InfectingAnimal habitat
1        d2              d1       1
2        d1              c3       1
3        c3              c2       3
4        c2              c1       2
5        c1              b3       3
6        b3              b2       6
7        b2              b1       5
8        b1              a2       4
9        a2              a1       2
10       a1               x       1

Code example:

library(tidyverse)

# make some data
allanimals <- structure(list(AnimalID = c("a1", "a2", "a3", "a4", "a5", "a6", "a7", "a8",
"b1", "b2", "b3", "b4", "b5", "c1", "c2", "c3", "c4", "d1", "d2", "e1", "e2",
"e3", "e4", "e5", "e6", "f1", "f2", "f3", "f4", "f5", "f6", "f7"),
InfectingAnimal = c("x", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "a2", "b1",
"b2", "b3", "b4", "b3", "c1", "c2", "c3", "c3", "d1", "b1", "e1", "e2", "e3",
"e4", "e5", "e1", "f1", "f2", "f3", "f4", "f5", "f6"), habitat = c(1L, 2L, 1L,
2L, 2L, 1L, 3L, 2L, 4L, 5L, 6L, 1L, 2L, 3L, 2L, 3L, 2L, 1L, 1L, 2L, 5L, 4L,
1L, 1L, 1L, 1L, 4L, 5L, 4L, 5L, 4L, 3L)), .Names = c("AnimalID",
"InfectingAnimal", "habitat"), class = "data.frame", row.names = c(NA, -32L))

# check it out
head(allanimals)

# Start with animal I'm interested in - say, d2
Focal.Animal <- "d2"

# Make a 1-row data.frame with d2 information
Focal.Animal <- allanimals %>% 
  filter(AnimalID == Focal.Animal)

# This is the animal we start with
Focal.Animal

# Make a new data.frame to store our results of the while loop in
Chain <- Focal.Animal

# make a condition to help while loop
InfectingAnimalInTable <- TRUE

# time it 
ptm <- proc.time()

# Run loop until you find an animal that isn't in the table, then stop
while(InfectingAnimalInTable == TRUE){
    # Who is the next infecting animal?
    NextAnimal <- Chain %>% 
      slice(n()) %>% 
      select(InfectingAnimal) %>% 
      unlist()

    NextRow <- allanimals %>% 
      filter(AnimalID == NextAnimal)


    # If there is an infecting animal in the table, 
    if (nrow(NextRow) > 0) {
      # Add this to the Chain table
      Chain[(nrow(Chain)+1),] <- NextRow
      #Otherwise, if there is no infecting animal in the  table, 
      # define the Infecting animal follows, this will stop the loop.
    } else {InfectingAnimalInTable <- FALSE}
  }

proc.time() - ptm

# did it work? Check out the Chain data.frame
Chain
+4
source share
2 answers

So, the problem here is in your data structure. You will need a vector that stores those who are infected by those who (saving as integers):

allanimals_ID <- unique(c(allanimals$AnimalID, allanimals$InfectingAnimal))

infected <- rep(NA_integer_, length(allanimals_ID))
infected[match(allanimals$AnimalID, allanimals_ID)] <-
  match(allanimals$InfectingAnimal, allanimals_ID)

path <- rep(NA_integer_, length(allanimals_ID))
curOne <- match("d2", allanimals_ID)
i <- 1
while (!is.na(nextOne <- infected[curOne])) {
  path[i] <- curOne
  i <- i + 1
  curOne <- nextOne
}

allanimals[path[seq_len(i - 1)], ]

Recode this loop with Rcpp to further increase performance: ')

+2
source
 library(data.tree)

  path = function(data,vec=data[,1]){
    S = data.tree::FromDataFrameNetwork(A,"d2")$
      Get(function(x) data[rev(which(data[,1]%in%
        unlist(strsplit(x$pathString,"/")))),],simplify = FALSE)
    S[names(S)%in%vec]
   }

Conclusion:

      path(allanimals,"d2") 
$d2
    AnimalID InfectingAnimal habitat
  19       d2              d1       1
  18       d1              c3       1
  16       c3              c2       3
  15       c2              c1       2
  14       c1              b3       3
  11       b3              b2       6
  10       b2              b1       5
  9        b1              a2       4
  2        a2              a1       2
  1        a1               x       1

  path(allanimals,c("d2","e2"))
 $d2
     AnimalID InfectingAnimal habitat
  19       d2              d1       1
  18       d1              c3       1
  16       c3              c2       3
  15       c2              c1       2
  14       c1              b3       3
  11       b3              b2       6
  10       b2              b1       5
  9        b1              a2       4
  2        a2              a1       2
  1        a1               x       1

  $e2
      AnimalID InfectingAnimal habitat
   21       e2              e1       5
   20       e1              b1       2
   9        b1              a2       4
   2        a2              a1       2
   1        a1               x       1     
+2

Source: https://habr.com/ru/post/1684394/


All Articles