Number of firms per year using dplyr or datatable

Say I have a data frame:

# Sample data: one row per firm, with the city, the year it was created and
# the year it was closed (NA when no closing year is recorded).
df <- data.frame(
  City = rep(c("NY", "LA"), times = c(5, 4)),
  YearFrom = c(
    "2001", "2003", "2002", "2006", "2008",  # NY
    "2004", "2005", "2005", "2002"           # LA
  ),
  YearTo = c(
    NA, "2005", NA, NA, "2009",              # NY
    NA, "2008", NA, NA                       # LA
  )
)

where YearFrom is the year when, for example, the company was created, and YearTo is the year when it was closed. If YearTo is NA, the company is still operating.

I would like to calculate the number of firms for each year.

The table should look like this:

City    |"Year"   |"Count"
"NY"    |2001       1
"NY"    |2002       2
"NY"    |2003       3
"NY"    |2004       3
"NY"    |2005       2
"NY"    |2006       3
"NY"    |2007       3
"NY"    |2008       4
"NY"    |2009       3
"LA"    |2001       0
"LA"    |2002       1
"LA"    |2003       1
"LA"    |2004       2
"LA"    |2005       4
"LA"    |2006       4
"LA"    |2007       4
"LA"    |2008       2
"LA"    |2009       2

I would like to solve this with the dplyr or data.table package, but I cannot figure out how to do it.

+6
source share
4 answers

A shorter solution using the tidyverse.

# First, some data prep
df <- mutate(df,
    # Years may be stored as factor or character; going through character
    # yields the year itself rather than a factor's internal level code.
    YearFrom = as.numeric(as.character(YearFrom)),
    YearTo = as.numeric(as.character(YearTo)),
    # Firms without an end year are still active. The expansion step treats
    # YearTo as exclusive (it uses YearTo - 1), so fill with the last observed
    # year + 1; filling with the last year itself would drop that year from
    # the counts for every active firm.
    YearTo = coalesce(YearTo, max(c(YearFrom, YearTo), na.rm = TRUE) + 1))

# Count active firms per city and year.
# Expects YearFrom / YearTo to be numeric with no NA left in YearTo
# (see the data prep step); YearTo is treated as exclusive.
df %>%
  # A firm closed in the year it was created has no counted year. Without this
  # filter `:` would run backwards (e.g. 2005:2004) and count two bogus years.
  filter(YearTo > YearFrom) %>%
  mutate(Years = map2(YearFrom, YearTo - 1, `:`)) %>%          # All active years per firm
  unnest(Years) %>%                                            # One row per firm-year
  count(Years, City) %>%                                       # Firms per year and city
  complete(City, Years, fill = list(n = 0))                    # Add zero rows for missing combinations
+7
source

Clean the data first ...

# Attach data.table before anything else so the snippet runs in a fresh session.
library(data.table)

# Current calendar year, used as the end year for firms that are still active.
# Derived with base R so it does not depend on a year() helper being attached.
curr_year <- as.integer(format(Sys.Date(), "%Y"))

setDT(df)
# Go through character so that factor columns yield the year, not the level code.
df[, YearTo := as.integer(as.character(YearTo))]
df[, YearFrom := as.integer(as.character(YearFrom))]
# quasiYearTo is the end year used for joining: the real end year when known,
# otherwise the current year. YearTo itself keeps its NA values.
df[, quasiYearTo := YearTo]
df[is.na(YearTo), quasiYearTo := curr_year]

Then the non-equi join:

# Build every (City, Year) combination over the observed year range, then for
# each combination count the firms with YearFrom <= Year < quasiYearTo.
# by = .EACHI yields one count per grid row, including zero when nothing matches.
df[
  CJ(
    City = City,
    Year = seq(min(YearFrom), max(YearTo, na.rm = TRUE)),
    unique = TRUE
  ),
  .N,
  by = .EACHI,
  on = .(City, YearFrom <= Year, quasiYearTo > Year),
  allow.cartesian = TRUE
][
  # After the non-equi join the YearFrom column carries the grid's Year value.
  , list(City, Year = YearFrom, N)
]

    City Year N
 1:   LA 2001 0
 2:   LA 2002 1
 3:   LA 2003 1
 4:   LA 2004 2
 5:   LA 2005 4
 6:   LA 2006 4
 7:   LA 2007 4
 8:   LA 2008 3
 9:   LA 2009 3
10:   NY 2001 1
11:   NY 2002 2
12:   NY 2003 3
13:   NY 2004 3
14:   NY 2005 2
15:   NY 2006 3
16:   NY 2007 3
17:   NY 2008 4
18:   NY 2009 3
+6
source

Here is a solution using data.table.

# Expand every firm into one row per year of operation. YearTo is treated as
# exclusive, so the last counted year is YearTo - 1.
# NOTE(review): assumes YearTo > YearFrom for every row; otherwise seq() counts
# downward instead of returning an empty range.
cityList <- lapply(
  seq_len(nrow(df)),
  function(row_idx) {
    df[row_idx, list(City, Year = seq(YearFrom, YearTo - 1))]
  }
)

# Stack the per-firm tables and count the firms for each city and year.
dfNew <- rbindlist(cityList)[, list(Count = .N), by = list(City, Year)]

Or, as a single chained statement,

# Same computation in one expression: expand each firm into its years of
# operation (YearTo exclusive), stack the pieces, then count per city and year.
rbindlist(
  lapply(
    seq_len(nrow(df)),
    function(row_idx) df[row_idx, list(City, Year = seq(YearFrom, YearTo - 1))]
  )
)[, list(Count = .N), by = list(City, Year)]

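Here, lapply runs through each row and builds the sequence of years in which the firm operated. Missing values of YearTo must be filled in first; here they are set to 2018, so that 2017 is the last year counted.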

lapply returns a list of data.tables, which are combined into one with rbindlist. After that, the counts are computed with .N.

    City Year Count
 1:   NY 2001     1
 2:   NY 2002     2
 3:   NY 2003     3
 4:   NY 2004     3
 5:   NY 2005     2
 6:   NY 2006     3
 7:   NY 2007     3
  ...
26:   LA 2012     3
27:   LA 2013     3
28:   LA 2014     3
29:   LA 2015     3
30:   LA 2016     3
31:   LA 2017     3
32:   LA 2002     1
33:   LA 2003     1

Data preparation (run this before the code above):

setDT(df)
# convert string years to integers
# Column names are looked up once and reused for both the assignment target
# and .SDcols, so the two can never disagree.
year_cols <- grep("Year", names(df), value = TRUE)
df[, (year_cols) := lapply(.SD, function(x) as.integer(as.character(x))),
   .SDcols = year_cols]
# replace NA values with 2018 (to include 2017 in count)
# 2018L keeps the column integer without relying on double-to-integer coercion.
df[is.na(YearTo), YearTo := 2018L]
+4

A solution using dplyr and tidyr.

library(dplyr)
library(tidyr)

# Count the firms operating in each city and year, up to 2009.
# Result columns: City, Year (numeric), Count (integer).
df %>%
  # Change YearFrom and YearTo to numeric (via character, so factors yield the
  # year itself rather than the level code)
  mutate(YearFrom = as.numeric(as.character(YearFrom)),
         YearTo = as.numeric(as.character(YearTo))) %>%
  # Replace NA with 2017 in YearTo
  mutate(YearTo = ifelse(is.na(YearTo), 2017, YearTo)) %>%
  # All number in YearTo minus 1 to exclude the year of cancellation
  mutate(YearTo = YearTo - 1) %>%
  # Drop firms closed in the year they were created: they have no counted year,
  # and seq() would otherwise run backwards for them
  filter(YearTo >= YearFrom) %>%
  # List every year of operation for each firm, then expand to one row per
  # firm-year
  mutate(Year = Map(seq, YearFrom, YearTo)) %>%
  unnest(Year) %>%
  # Count the number of each City and Year
  count(City, Year) %>%
  # Rename the column n to Count
  rename(Count = n) %>%
  # Add the implicitly missing combinations (e.g. LA, 2001) with a count of 0.
  # Unlike a spread/gather round trip, this keeps Year numeric, so the sort
  # and the filter below compare numbers rather than strings.
  complete(City, Year, fill = list(Count = 0L)) %>%
  # Arrange the data
  arrange(desc(City), Year) %>%
  # Filter the data until 2009
  filter(Year <= 2009)
+2

Source: https://habr.com/ru/post/1016114/


All Articles