Aggregating a data.table with sum, length and grep

Let's create a data.table:

# Example data: three numeric x columns, two vessels, four years per vessel.
dt <- data.table(
  x.1 = 1:8,
  x.2 = 1:8,
  x.3 = 2:9,
  vessel = rep(c("a", "b"), each = 4),
  Year = rep(2012:2015, times = 2)
)
dt
   x.1 x.2 x.3 vessel Year
1:   1   1   2      a 2012
2:   2   2   3      a 2013
3:   3   3   4      a 2014
4:   4   4   5      a 2015
5:   5   5   6      b 2012
6:   6   6   7      b 2013
7:   7   7   8      b 2014
8:   8   8   9      b 2015

I can aggregate it using the length and sum functions to get the sum of each x column per year and the number of unique vessels per year, as follows:

# Per-year totals of each x column plus the number of distinct vessels.
dt[,
   .(x.1 = sum(x.1),
     x.2 = sum(x.2),
     x.3 = sum(x.3),
     vessels = length(unique(vessel))),
   by = .(Year)]

   Year x.1 x.2 x.3 vessels
1: 2012   6   6   8       2
2: 2013   8   8  10       2
3: 2014  10  10  12       2
4: 2015  12  12  14       2

This is what I want, but I have a lot of columns in my real data, so I would like to select them with grep or %like%, but I cannot get it to work. I was thinking of something along these lines:

# Select the columns whose names contain "x"; with = FALSE makes j act as a
# column selector instead of an expression to evaluate.
dt[, grep("x", colnames(dt)), with = FALSE]

But how do I combine this with the aggregation?

+4
source share
4 answers

You can use lapply to apply a function to all columns (.SD) or to a subset of columns (selected with .SDcols):

# Sum only the columns listed in .SDcols, separately for each Year.
dt[,
   lapply(.SD, sum),
   by = Year,
   .SDcols = c("x.1", "x.2")]

To sum every column whose name starts with "x" and also count the unique vessels per year:

# .SD holds only the columns whose names start with "x"; vessel is still
# referenced by name in j to count its distinct values per Year.
dt[,
   c(lapply(.SD, sum), vessel = uniqueN(vessel)),
   by = Year,
   .SDcols = grepl("^x", names(dt))]
+5

Alternatively, the data can be reshaped with melt() and aggregated with dcast():

# Reshape to long format: one row per (Year, vessel, x column) combination.
molten <- melt(
  dt,
  id.vars = c("Year", "vessel")
)

molten
#    Year vessel variable value
# 1: 2012      a      x.1     1
# 2: 2013      a      x.1     2
# 3: 2014      a      x.1     3
# 4: 2015      a      x.1     4
# 5: 2012      b      x.1     5
# ...
#19: 2014      a      x.3     4
#20: 2015      a      x.3     5
#21: 2012      b      x.3     6
#22: 2013      b      x.3     7
#23: 2014      b      x.3     8
#24: 2015      b      x.3     9
#    Year vessel variable value

# Back to wide format, summing the values of each x column within every Year.
dcast(molten, Year ~ variable, fun.aggregate = sum, value.var = "value")
#   Year x.1 x.2 x.3
#1: 2012   6   6   8
#2: 2013   8   8  10
#3: 2014  10  10  12
#4: 2015  12  12  14 

# Number of distinct vessels in each Year.
dt[, .(vessels = uniqueN(vessel)), by = Year]
#   Year vessels
#1: 2012       2
#2: 2013       2
#3: 2014       2
#4: 2015       2

Finally, the two results can be joined:

# Join the per-year sums with the per-year vessel counts on Year.
dcast(molten, Year ~ variable, fun.aggregate = sum, value.var = "value")[
  dt[, .(vessels = uniqueN(vessel)), by = Year],
  on = "Year"
]
#   Year x.1 x.2 x.3 vessels
#1: 2012   6   6   8       2
#2: 2013   8   8  10       2
#3: 2014  10  10  12       2
#4: 2015  12  12  14       2

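  • The measure.vars parameter of melt() can be used to select the columns to aggregate.
  • The subset parameter of dcast() can be used to restrict which rows are cast.
  • dcast() accepts a list of several aggregation functions.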

For example:

# Apply several aggregation functions to the x.2 rows only, then join the
# per-year vessel counts on Year.
dcast(
  molten,
  Year ~ variable,
  fun.aggregate = list(mean, sum, max),
  subset = .(variable == "x.2"),
  value.var = "value"
)[dt[, .(vessels = uniqueN(vessel)), by = Year], on = "Year"]
#   Year value_mean_x.2 value_sum_x.2 value_max_x.2 vessels
#1: 2012              3             6             5       2
#2: 2013              4             8             6       2
#3: 2014              5            10             7       2
#4: 2015              6            12             8       2
+1

Another option, written as a single chain:

# One chained data.table expression (console transcript): counts the vessels
# per Year, attaches that count to the data, then sums every remaining column
# per Year via melt/dcast.
> dt[, .SD
     # count the rows for each (vessel, Year) pair
     ][, .N, .(vessel, Year)
     # count those pairs per Year, i.e. the number of distinct vessels per Year
     ][, .N, .(Year)
     # update-join the counts onto a copy of dt as a new column `vessels`
     ][, copy(dt)[.SD, vessels := i.N, on='Year']
     # drop the vessel column so that only Year, vessels and the x columns remain
     ][, vessel := NULL
     # long format: one row per (Year, vessels, variable)
     ][, melt(.SD, id.vars=c('Year', 'vessels'))
     # sum the values of each variable within each Year
     ][, .(value=sum(value)), .(Year, vessels, variable)
     # back to wide format: one column per variable
     ][, dcast(.SD, ... ~ variable, value.var='value')
     # move `vessels` to the last column position
     ][, setcolorder(.SD, c(setdiff(colnames(.SD), 'vessels'), 'vessels'))
     # sort the result by Year
     ][order(Year)
     ]

   Year x.1 x.2 x.3 vessels
1: 2012   6   6   8       2
2: 2013   8   8  10       2
3: 2014  10  10  12       2
4: 2015  12  12  14       2
> 
0

I do not understand your question, but what you want to do with grep can be solved with something like this.

# Build the example as a plain data.frame.
dt <- data.frame(x.1=1:8, x.2=1:8, x.3=2:9, vessel=rep(letters[1:2], each=4), Year=rep(2012:2015, 2))
dt
# grepl() is vectorised over the column names, so no lapply()/unlist() wrapper
# is needed; a logical vector inside single brackets selects matching columns.
dt[grepl("x", colnames(dt))]

Then you can do what you want with the filtered data frame.

-1
source

Source: https://habr.com/ru/post/1677158/


All Articles