I played with competitor data from Kaggle West-Nile Virus as a means of practicing the installation of spatio-temporal GAM. The first few lines (several processed from the CSV source data) weather are given below (plus the first 20 lines a dput() ed output at the end of the question).
> head(weather) Station Date Tmax Tmin Tavg Depart DewPoint WetBulb Heat Cool Sunrise 1 1 2007-05-01 83 50 67 14 51 56 0 2 448 2 2 2007-05-01 84 52 68 NA 51 57 0 3 NA 3 1 2007-05-02 59 42 51 -3 42 47 14 0 447 4 2 2007-05-02 60 43 52 NA 42 47 13 0 NA 5 1 2007-05-03 66 46 56 2 40 48 9 0 446 6 2 2007-05-03 67 48 58 NA 40 50 7 0 NA Sunset CodeSum Depth Water1 SnowFall PrecipTotal StnPressure SeaLevel 1 1849 <NA> 0 NA 0 0 29.10 29.82 2 NA <NA> NA NA NA 0 29.18 29.82 3 1850 BR 0 NA 0 0 29.38 30.09 4 NA BR HZ NA NA NA 0 29.44 30.08 5 1851 <NA> 0 NA 0 0 29.39 30.12 6 NA HZ NA NA NA 0 29.46 30.12 ResultSpeed ResultDir AvgSpeed 1 1.7 27 9.2 2 2.7 25 9.6 3 13.0 4 13.4 4 13.3 2 13.4 5 11.7 7 11.9 6 12.9 6 13.2
Pay attention to the CodeSum variable. Each CodeSum element is an observation of significant weather events. Some observations are missing ( NA ), some of them have no data, but they are not enough, some of them have one type of significant weather conditions, and others have several important meteorological observations on the same day.
I want to create a new data frame with n new binary variables (n will be the number of unique values ββin CodeSum ) with NA if absent, and 1 is the observed weather indicator; a 0 if not observed.
At first I tried tidyr::separate() , but it either required that all indicators be present for all observations, or they would process them in order; the first indicator, regardless of what indicator it is, has always been tied to the first binary variable.
I have a solution:
expandLevs <- function(x, set) { m <- matrix(0, ncol = length(set), nrow = 1L) colnames(m) <- set nax <- is.na(x) m[, nax] <- NA if (!all(nax)) { idx <- x[!nax] m[, idx] <- 1 } m } cs <- with(weather, strsplit(as.character(CodeSum), " ")) levs <- with(weather, sort(unique(unlist(strsplit(levels(CodeSum), " "))))) cs <- lapply(cs, expandLevs, set = levs) cs <- do.call("rbind", cs) cs <- data.frame(cs, check.names = FALSE) cs <- lapply(cs, factor, levels = c(0,1)) cs <- data.frame(cs, check.names = FALSE)
What gives
> cs BR HZ RA 1 <NA> <NA> <NA> 2 <NA> <NA> <NA> 3 1 0 0 4 1 1 0 5 <NA> <NA> <NA> 6 0 1 0 7 0 0 1 8 <NA> <NA> <NA> 9 <NA> <NA> <NA> 10 <NA> <NA> <NA> 11 <NA> <NA> <NA> 12 <NA> <NA> <NA> 13 0 0 1 14 <NA> <NA> <NA> 15 1 0 0 16 0 1 0 17 1 1 0 18 1 1 0 19 1 0 0 20 1 1 0
for 20 lines of data in weather (below).
But at best, this seems awkward.
Am I missing an easier way to create binary variables?
The expected result is also included as dput()ed code at the end.
weather <- structure(list(Station = c(1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), Date = structure(c(13634, 13634, 13635, 13635, 13636, 13636, 13637, 13637, 13638, 13638, 13639, 13639, 13640, 13640, 13641, 13641, 13642, 13642, 13643, 13643), class = "Date"), Tmax = c(83L, 84L, 59L, 60L, 66L, 67L, 66L, 78L, 66L, 66L, 68L, 68L, 83L, 84L, 82L, 80L, 77L, 76L, 84L, 83L), Tmin = c(50L, 52L, 42L, 43L, 46L, 48L, 49L, 51L, 53L, 54L, 49L, 52L, 47L, 50L, 54L, 60L, 61L, 63L, 56L, 59L), Tavg = c(67, 68, 51, 52, 56, 58, 58, NA, 60, 60, 59, 60, 65, 67, 68, 70, 69, 70, 70, 71), Depart = c(14, NA, -3, NA, 2, NA, 4, NA, 5, NA, 4, NA, 10, NA, 12, NA, 13, NA, 14, NA), DewPoint = c(51L, 51L, 42L, 42L, 40L, 40L, 41L, 42L, 38L, 39L, 30L, 30L, 41L, 39L, 58L, 57L, 59L, 60L, 52L, 52L), WetBulb = c(56, 57, 47, 47, 48, 50, 50, 50, 49, 50, 46, 46, 54, 53, 62, 63, 63, 63, 60, 61), Heat = c(0, 0, 14, 13, 9, 7, 7, NA, 5, 5, 6, 5, 0, 0, 0, 0, 0, 0, 0, 0), Cool = c(2, 3, 0, 0, 0, 0, 0, NA, 0, 0, 0, 0, 0, 2, 3, 5, 4, 5, 5, 6), Sunrise = c(448, NA, 447, NA, 446, NA, 444, NA, 443, NA, 442, NA, 441, NA, 439, NA, 438, NA, 437, NA), Sunset = c(1849, NA, 1850, NA, 1851, NA, 1852, NA, 1853, NA, 1855, NA, 1856, NA, 1857, NA, 1858, NA, 1859, NA), CodeSum = structure(c(NA, NA, 2L, 3L, NA, 19L, 23L, NA, NA, NA, NA, NA, 23L, NA, 2L, 19L, 3L, 3L, 2L, 3L), .Label = c("BCFG BR", "BR", "BR HZ", "BR HZ FU", "BR HZ VCFG", "BR VCTS", "DZ", "DZ BR", "DZ BR HZ", "FG BR HZ", "FG+", "FG+ BCFG BR", "FG+ BR", "FG+ BR HZ", "FG+ FG BR", "FG+ FG BR HZ", "FG+ MIFG BR", "FU", "HZ", "HZ FU", "HZ VCTS", "MIFG BCFG BR", "RA", "RA BCFG BR", "RA BR", "RA BR FU", "RA BR HZ", "RA BR HZ FU", "RA BR HZ VCFG", "RA BR HZ VCTS", "RA BR SQ", "RA BR VCFG", "RA BR VCTS", "RA DZ", "RA DZ BR", "RA DZ BR HZ", "RA DZ FG+ BCFG BR", "RA DZ FG+ BR", "RA DZ FG+ BR HZ", "RA DZ FG+ FG BR", "RA DZ SN", "RA FG BR", "RA FG+ BR", "RA FG+ MIFG BR", "RA HZ", "RA SN", "RA SN BR", "RA VCTS", "TS", "TS BR", "TS BR HZ", "TS HZ", "TS RA", "TS RA BR", "TS RA BR HZ", "TS RA FG+ FG BR", "TS TSRA", "TS TSRA BR", "TS TSRA BR HZ", "TS TSRA GR RA BR", "TS TSRA HZ", "TS TSRA RA", "TS TSRA RA BR", "TS TSRA RA BR HZ", "TS TSRA RA BR HZ VCTS", "TS TSRA RA BR VCTS", "TS TSRA RA FG BR", "TS TSRA RA FG BR HZ", "TS TSRA RA HZ", "TS TSRA RA VCTS", "TS TSRA VCFG", "TSRA", "TSRA BR", "TSRA BR HZ", "TSRA BR HZ FU", "TSRA BR HZ VCTS", "TSRA BR SQ", "TSRA DZ BR HZ", "TSRA DZ FG+ FG BR HZ", "TSRA FG+ BR", "TSRA FG+ BR HZ", "TSRA HZ", "TSRA RA", "TSRA RA BR", "TSRA RA BR HZ", "TSRA RA BR HZ VCTS", "TSRA RA BR VCTS", "TSRA RA DZ BR", "TSRA RA DZ BR HZ", "TSRA RA FG BR", "TSRA RA FG+ BR", "TSRA RA FG+ FG BR", "TSRA RA FG+ FG BR HZ", "TSRA RA HZ", "TSRA RA HZ FU", "TSRA RA VCTS", "VCTS"), class = "factor"), Depth = c(0, NA, 0, NA, 0, NA, 0, NA, 0, NA, 0, NA, 0, NA, 0, NA, 0, NA, 0, NA), Water1 = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), SnowFall = c(0, NA, 0, NA, 0, NA, 0, NA, 0, NA, 0, NA, 0, NA, 0, NA, 0, NA, 0, NA), PrecipTotal = c(0, 0, 0, 0, 0, 0, 0.005, 0, 0.005, 0.005, 0, 0, 0.005, 0, 0, 0.005, 0.13, 0.02, 0, 0), StnPressure = c(29.1, 29.18, 29.38, 29.44, 29.39, 29.46, 29.31, 29.36, 29.4, 29.46, 29.57, 29.62, 29.38, 29.44, 29.29, 29.36, 29.21, 29.28, 29.2, 29.26), SeaLevel = c(29.82, 29.82, 30.09, 30.08, 30.12, 30.12, 30.05, 30.04, 30.1, 30.09, 30.29, 30.28, 30.12, 30.12, 30.03, 30.02, 29.94, 29.93, 29.92, 29.91), ResultSpeed = c(1.7, 2.7, 13, 13.3, 11.7, 12.9, 10.4, 10.1, 11.7, 11.2, 14.4, 13.8, 8.6, 8.5, 2.7, 2.5, 3.9, 3.9, 0.7, 2), ResultDir = c(27L, 25L, 4L, 2L, 7L, 6L, 8L, 7L, 7L, 7L, 11L, 10L, 18L, 17L, 11L, 8L, 9L, 7L, 17L, 9L), AvgSpeed = c(9.2, 9.6, 13.4, 13.4, 11.9, 13.2, 10.8, 10.4, 12, 11.5, 15, 14.5, 10.5, 9.9, 5.8, 5.4, 6.2, 5.9, 4.1, 3.9)), .Names = c("Station", "Date", "Tmax", "Tmin", "Tavg", "Depart", "DewPoint", "WetBulb", "Heat", "Cool", "Sunrise", "Sunset", "CodeSum", "Depth", "Water1", "SnowFall", "PrecipTotal", "StnPressure", "SeaLevel", "ResultSpeed", "ResultDir", "AvgSpeed"), row.names = c(NA, 20L), class = "data.frame") output <- structure(list(BR = structure(c(NA, NA, 2L, 2L, NA, 1L, 1L, NA, NA, NA, NA, NA, 1L, NA, 2L, 1L, 2L, 2L, 2L, 2L), .Label = c("0", "1"), class = "factor"), HZ = structure(c(NA, NA, 1L, 2L, NA, 2L, 1L, NA, NA, NA, NA, NA, 1L, NA, 1L, 2L, 2L, 2L, 1L, 2L), .Label = c("0", "1"), class = "factor"), RA = structure(c(NA, NA, 1L, 1L, NA, 1L, 2L, NA, NA, NA, NA, NA, 2L, NA, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("0", "1"), class = "factor")), .Names = c("BR", "HZ", "RA"), row.names = c(NA, -20L), class = "data.frame")