Reading the data in should take just a single line:
# Attach haven, which provides read_sas().
library(haven)

# Read the data file together with its format catalog.
sas <- read_sas("nats2012.sas7bdat", "formats.sas7bcat")

# Cross-tabulate the two variables; naming the arguments keeps the variable
# names as the table's dimension headers.
table(
  SMOKSTATUS_R = sas[["SMOKSTATUS_R"]],
  RACEETHNIC = sas[["RACEETHNIC"]]
)
haven reads in the data and also attaches useful attributes to each column, namely the variable label and the value labels:
# Inspect the metadata attached to one column.
attributes(sas[["SMOKSTATUS_R"]])
# $label
# [1] "SMOKER STATUS (4-level)"
#
# $class
# [1] "labelled"
#
# $labels
#  Current everyday smoker Current some days smoker            Former smoker
#                        1                        2                        3
#             Never smoker                  Unknown
#                        4                        5
#
# $is_na
# [1] FALSE FALSE FALSE FALSE FALSE
You can easily wrap this in a function so that it can be used more widely:
#' Replace coded values with their value labels (SAS-format style)
#'
#' @param x A vector carrying a named `labels` attribute, or a one-column
#'   data frame / list holding such a vector.
#' @param fmt Optional value labels to use instead of the attribute: a named
#'   vector, or a list wrapping one, whose names are the label text and whose
#'   values are the codes.
#' @return A character vector of labels, one per value in `x`. Values with no
#'   matching label keep their raw value (as text) instead of becoming `NA`;
#'   missing values stay `NA`. If no usable labels are found, or formatting
#'   fails, `x` is returned unchanged.
do_fmt <- function(x, fmt) {
  # Labels live on the column itself, so look through a one-column data frame
  # or list (e.g. `df["col"]`, or `tbl[, "col"]` on a tibble, which never
  # drops to a vector).
  col <- if (is.list(x) && length(x) == 1L) x[[1L]] else x

  lbl <- if (missing(fmt)) {
    attr(col, "labels")
  } else if (is.list(fmt)) {
    # Drop the outer list name so unlist() does not prefix each label with it.
    unlist(unname(fmt))
  } else {
    # A plain named vector is already in the right shape; unname() would
    # strip the very names that hold the label text.
    fmt
  }

  # Without names there is nothing to map the codes to.
  if (is.null(lbl) || is.null(names(lbl))) {
    return(x)
  }

  tryCatch(
    {
      vals <- unlist(x, use.names = FALSE)
      out <- names(lbl)[match(vals, lbl)]
      # Unlabelled codes are shown as their raw value rather than silently
      # turned into NA.
      unmatched <- is.na(out) & !is.na(vals)
      out[unmatched] <- as.character(vals[unmatched])
      out
    },
    error = function(e) {
      var_label <- attr(col, "label")
      if (is.null(var_label)) {
        var_label <- "<unlabelled variable>"
      }
      message(
        sprintf("formatting failed for %s: %s", var_label, conditionMessage(e)),
        domain = NA
      )
      x
    }
  )
}

# `[[` extracts the column vector on both base data frames and tibbles.
table(do_fmt(sas[["SMOKSTATUS_R"]]), do_fmt(sas[["RACEETHNIC"]]))
And apply it to the entire dataset:
# Run do_fmt() over every column, keeping the data frame's structure.
sas[] <- lapply(sas, do_fmt)

# Show the first four formatted values of one column.
sas[["SMOKSTATUS_R"]][seq_len(4)]
# [1] "Never smoker"  "Former smoker" "Former smoker" "Never smoker"
However, sometimes the labels are not attached, as shown below. Something seems to go wrong in the haven package:
# Look up the value labels attached to SMOKTYPE, if any.
attr(sas[["SMOKTYPE"]], "labels")
So, instead, you can parse the format .sas file yourself with a few simple expressions:
#' Last observation carried forward
#'
#' Empty strings are treated as missing. Each missing entry is replaced by the
#' closest non-missing entry above it; entries before the first non-missing
#' value stay `NA`.
#'
#' @param x A vector (or anything `data.frame()` accepts).
#' @return The filled first column of `x`.
locf <- function(x) {
  filled <- data.frame(x, stringsAsFactors = FALSE)
  filled[filled == ""] <- NA
  observed <- !is.na(filled)

  # Fill one column: index the observed values by a running count of how many
  # have been seen so far.
  carry_forward <- function(col_idx) {
    seen <- observed[, col_idx]
    last_seen <- cumsum(seen)
    last_seen[last_seen == 0] <- NA
    filled[, col_idx][seen][last_seen]
  }

  filled[] <- lapply(seq_along(filled), carry_forward)
  filled[, 1]
}

# Read the SAS format definitions as raw lines of text.
fmt <- readLines("~/desktop/2012-2013-NATS-Format/2012-2013-NATS-Format.sas")
For example, the smoke type format (one of those that was not applied above) is parsed as follows:
# Show the parsed entry for the A5_ format: names are the label text, values
# are the codes.
# NOTE(review): `sp` is not defined in the visible part of this file;
# presumably it is the list of formats parsed from `fmt` -- confirm.
sp['A5_']
# $A5_
#       'INAPPLICABLE'            'REFUSED'                 'DK'
#                 "-1"                 "-7"                 "-8"
#    'NOT ASCERTAINED' 'PREMADE CIGARETTES'      'ROLL-YOUR-OWN'               'BOTH'
#                 "-9"                  "1"                  "2"                  "3"
And then you can use the `do_fmt` function again to apply it to the data:
# Tabulate SMOKTYPE after mapping its codes through the A5_ format. Because
# `fmt` is supplied explicitly, do_fmt() uses it instead of the column's
# `labels` attribute.
# NOTE(review): relies on `sp`, which is not defined in the visible part of
# this file -- confirm it is built before this line runs.
table(do_fmt(sas['SMOKTYPE'], sp['A5_']))