Calculating cumulative variance in R

I am working with a dataset that looks like this:

team runs_scored       date
LAN           3        2014-03-22
ARI           1        2014-03-22
LAN           7        2014-03-23
ARI           5        2014-03-23
LAN           1        2014-03-30
SDN           3        2014-03-30

I am trying to test a predictive model on this dataset, and one of the input parameters is the variance of runs_scored at t-1. In other words, to predict the outcome variable for the 4th observation, I need the variance for LAN based on the previous observations in the dataset.

I can calculate cumulative means and sums, but I cannot figure out how to calculate a cumulative variance in a dataset. I do most of my data manipulation in dplyr, but I do not mind using an alternative solution if it gets me what I need.

+2
source share
3 answers

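Since the sample variance can be written as (sum(x^2)-length(x)*mean(x)^2)/(length(x)-1), the cumulative version can be built from cumulative sums and cumulative means (cummean is provided by dplyr), for example: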

library(dplyr)
# Cumulative sample variance of a numeric vector.
#
# Element i of the result is the sample variance (denominator i - 1) of the
# first i values of x, obtained by applying the identity
#   var(x) = (sum(x^2) - n * mean(x)^2) / (n - 1)
# to running sums, so the whole result is computed with vectorised calls.
#
# x: numeric vector.
# Returns a numeric vector the same length as x. The first element is NaN
# (0 / 0), because one observation has no sample variance.
#
# NOTE(review): this sum-of-squares form can lose precision through
# cancellation when the mean is large relative to the spread of the data;
# confirm the inputs are well scaled before relying on it.
cum_var <- function(x) {
  n <- seq_along(x)
  # Running mean in base R; gives the same values as dplyr::cummean(x).
  # as.numeric() keeps the running sum from overflowing for integer input.
  running_mean <- cumsum(as.numeric(x)) / n
  (cumsum(x^2) - n * running_mean^2) / (n - 1)
}

Here is a comparison with @MrFlick's cumvar function:

# Draw one million standard-normal values to use as benchmark input.
x <- rnorm(1e6)
# Check that the two implementations agree (all.equal compares with a
# numeric tolerance rather than exactly).
all.equal(cum_var(x), cumvar(x))
#[1] TRUE
# Time cumvar(); [3] selects the "elapsed" entry of the timing result.
system.time(cumvar(x))[3]
elapsed 
   5.52 
system.time(cum_var(x))[3]
elapsed 
   0.04 
+10

, - . , , , .

x<-c(3,1,7,5,1,3)

# Cumulative sample variance using Welford's one-pass update.
#
# For each prefix of x the running mean and the running sum of squared
# deviations are updated incrementally, which avoids the cancellation that
# the sum-of-squares formula can suffer from.
#
# x: numeric vector.
# Returns a numeric vector the same length as x; element i is the sample
# variance of the first i values. The first element is NaN (0 / 0).
cumvar <- function(x) {
  n_obs <- length(x)
  sq_dev <- numeric(n_obs)  # running sum of squared deviations per prefix
  running_mean <- 0
  acc <- 0
  for (k in seq_len(n_obs)) {
    delta <- x[[k]] - running_mean
    running_mean <- running_mean + delta / k
    # Uses the deviation from the old mean times the deviation from the
    # updated mean, as in Welford's recurrence.
    acc <- acc + delta * (x[[k]] - running_mean)
    sq_dev[k] <- acc
  }
  sq_dev / (seq_along(x) - 1)
}
cumvar(x)
# [1]      NaN 2.000000 9.333333 6.666667 6.800000 5.466667

,

# Reference implementation of the cumulative sample variance.
#
# Calls var() on every prefix of x, so the work grows quadratically with
# length(x); useful mainly for checking faster implementations.
#
# x: numeric vector.
# Returns a numeric vector the same length as x; the first element is NA
# because var() of a single observation is NA. Empty input yields
# numeric(0).
cumvar2 <- function(x) {
  vapply(
    seq_along(x),
    function(i) var(x[seq_len(i)]),
    FUN.VALUE = numeric(1)
  )
}
cumvar2(x)
# [1]       NA 2.000000 9.333333 6.666667 6.800000 5.466667

# Benchmark the two implementations on the same reproducible input.
set.seed(15)
x <- rpois(100, 5)
# microbenchmark() is an exported function, so `::` is the right operator;
# `:::` is meant for reaching non-exported internals.
microbenchmark::microbenchmark(cumvar(x), cumvar2(x))

# Unit: microseconds
#        expr      min        lq      mean   median       uq      max neval cld
#   cumvar(x)  272.502  297.2425  335.2058  315.490  339.625  957.728   100  a 
#  cumvar2(x) 1672.323 1793.0960 2089.8104 1865.838 1956.208 6386.863   100   b

, wiki, , .

dplyr

# Rebuild the sample data from the question as a data frame.
# header = TRUE is spelled out because `T` is an ordinary variable that can
# be reassigned.
dd <- read.table(text = "team runs_scored       date
LAN           3        2014-03-22
ARI           1        2014-03-22
LAN           7        2014-03-23
ARI           5        2014-03-23
LAN           1        2014-03-30
SDN           3        2014-03-30", header = TRUE)

# Shift the cumulative variance down one row with lag(), so each row only
# carries the variance of the observations that came before it.
dd %>%
  mutate(cvar = lag(cumvar(runs_scored)))

#   team runs_scored       date     cvar
# 1  LAN           3 2014-03-22       NA
# 2  ARI           1 2014-03-22      NaN
# 3  LAN           7 2014-03-23 2.000000
# 4  ARI           5 2014-03-23 9.333333
# 5  LAN           1 2014-03-30 6.666667
# 6  SDN           3 2014-03-30 6.800000
+5

? , :

# For every row, store the sample variance of runs_scored over all earlier
# rows. Rows 1 and 2 stay NA: row 1 has no earlier rows, and var() of the
# single earlier value for row 2 is NA.
data$vars <- rep(NA_real_, nrow(data))
# seq_len(n)[-1] is 2..n, and is empty when the frame has fewer than two
# rows (2:nrow(data) would count backwards in that case).
for (i in seq_len(nrow(data))[-1]) {
  data$vars[i] <- var(data$runs_scored[seq_len(i - 1)])
}

  team runs_scored      date     vars
1  LAN           3 3/22/2014       NA
2  ARI           1 3/22/2014       NA
3  LAN           7 3/23/2014 2.000000
4  ARI           5 3/23/2014 9.333333
5  LAN           1 3/30/2014 6.666667
6  SDN           3 3/30/2014 6.800000

edit: , :

data$vars <- NA
# Sample variance of runs_scored over the rows before `position`.
#
# position: 1-based row index; only rows 1..(position - 1) are used.
# df: data frame with a numeric runs_scored column.
# Returns a single numeric value (NA when fewer than two earlier rows exist).
#
# The data are read from the `df` argument, so the function no longer
# depends on a global object named `data`.
cumVar <- function(position, df) {
  var(df$runs_scored[seq_len(position - 1)])
}

sapply, :

# Rows 3..n are the first ones with at least two earlier observations.
# Negative indexing keeps this empty for short frames, where 3:nrow(data)
# would count backwards.
position <- seq_len(nrow(data))[-(1:2)]
# vapply guarantees a numeric result even when `position` is empty.
results <- c(
  NA, NA,
  vapply(position, cumVar, FUN.VALUE = numeric(1), df = data)
)
# Trim the NA padding for frames with fewer than two rows, then store the
# result in the `vars` column that was initialised earlier (the original
# wrote to a different column, `var`).
data$vars <- results[seq_len(nrow(data))]

, 30000 , for 10,5 7,5 .

+2

Source: https://habr.com/ru/post/1618914/


All Articles