dplyr: filtering on a nested data frame

Question:

How can I filter rows based on a column inside a nested data frame using dplyr::filter?


Problem: The following code provides an example dataset to include a working example.

Using the sample code, I can subset with which, but I am having trouble using dplyr due to nested data frames.

Now I appreciate that I could flatten the data frame using jsonlite; however, I am interested to know if and how I can use dplyr without flattening the data frame.

All help is gratefully received and appreciated.

# Packages needed by this example, in the order they will be attached.
# plyr is listed before dplyr on purpose: attaching plyr after dplyr masks
# dplyr verbs (summarise, mutate, ...), and dplyr warns about that order.
requiredPackages <- c("devtools", "plyr", "dplyr", "tidyr", "data.table",
                      "ggplot2", "ggvis", "RMySQL", "jsonlite", "psych",
                      "knitr")

# Install whichever of the requested packages are missing, then attach all
# of them.
#
# Args:
#   pkg: character vector of package names.
#
# Returns:
#   A logical vector named by `pkg`; TRUE where the package was attached.
ipak <- function(pkg)
{
  new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
  if (length(new.pkg) > 0) {
    install.packages(new.pkg, dependencies = TRUE)
  }
  # require() is used deliberately: it reports failure as FALSE rather than
  # stopping, which is what gives the per-package status vector.
  # vapply() guarantees a logical result even when `pkg` is empty
  # (sapply() would return a list there).
  vapply(pkg, require, logical(1), character.only = TRUE)
}

ipak(requiredPackages)

dataDir         <- "./data"
fileUrl         <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/yelp_dataset_challenge_academic_dataset.zip"
filePath        <- file.path(dataDir)

# Does the data directory exist? If it doesn't, create it.
# dir.exists() is used instead of file.exists() so that a regular file that
# happens to be called "data" is not mistaken for the directory.
if (!dir.exists(dataDir)) {
  dir.create(dataDir, recursive = TRUE)
}

# Fetch the dataset only once: if
# "./data/yelp_dataset_challenge_academic_dataset" is not there yet, download
# the zip archive to a temporary file, unpack it under the data directory and
# then remove the temporary file.

if (!file.exists(file.path(dataDir, "yelp_dataset_challenge_academic_dataset"))) {
  temp <- tempfile()
  download.file(url = fileUrl, destfile = temp, method = "curl", mode = "wb")
  unzip(zipfile = temp, exdir = dataDir)
  unlink(x = temp)
}

# Load the Yelp business data into `yelpBusinessData` unless it is already
# present in the session. A cached .rds copy is used when available;
# otherwise the raw JSON is parsed (nested data frames are kept,
# flatten = FALSE), two `attributes` columns are renamed and the result is
# cached for the next run.
if ( !exists("yelpBusinessData") )
{
  if (file.exists( file.path(dataDir,"yelpBusinessData.rds"))) {
    yelpBusinessData <- readRDS(file.path(dataDir,"yelpBusinessData.rds"))
  } else {
    yelpBusinessDataFilePath <- file.path(dataDir, 
                                          "yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_business.json")
    # Join the file's lines with commas and wrap them in [] so that they
    # parse as a single JSON array.
    yelpBusinessData <- fromJSON(sprintf("[%s]",
                                         paste(readLines(yelpBusinessDataFilePath),
                                               collapse = ",")),
                                 flatten = FALSE)
    # str()'s depth argument is spelled `max.level`; the previous
    # `max_level` is not a str() argument, so the limit did not take effect.
    str(yelpBusinessData, max.level = 1)
    # Fix the column name duplication issue.
    # If and when you flatten the data you create two columns with the same
    # column id,
    #
    # i.e. yelpBusinessData$attributes.Good.for.kids
    #
    # This fixes the issue by renaming the columns by position.
    #
    colnames(yelpBusinessData$attributes)[6] <- "Price_Range"
    colnames(yelpBusinessData$attributes)[7] <- "Good_For_Kids"
    saveRDS( yelpBusinessData, file.path(dataDir, "yelpBusinessData.rds"))
  }
}

The above code loads an example data frame.

Using `which`, I can subset the data as shown below. How can I do the same with dplyr::filter?

# Build the Phoenix / Restaurants / Price_Range == 4 subset with base R.
# which() turns each logical test into row positions, so rows where the test
# is NA are dropped instead of producing NA rows.
yelpBusinessData.PA <- yelpBusinessData[
  which(with(yelpBusinessData, city == "Phoenix")), ]
yelpBusinessData.PA.rest <- yelpBusinessData.PA[
  which(with(yelpBusinessData.PA, grepl("Restaurants", categories))), ]
# Price_Range lives inside the nested `attributes` data frame.
Exp <- yelpBusinessData.PA.rest[
  which(yelpBusinessData.PA.rest[["attributes"]][["Price_Range"]] == 4), ]
dim(Exp)

- : -)

> dim(Exp)
[1]  4 15

: dplyr?

# The same three-step subset as the `which` version, written as a dplyr
# pipeline. The last step filters on Price_Range, which sits inside the
# nested `attributes` data frame rather than being a top-level column.
yelpBusinessData.PA.rest <- yelpBusinessData %>% 
  filter(city == "Phoenix") %>%
  filter(grepl("Restaurants", categories)) %>%
  filter(attributes$Price_Range == 4)

... , , , ...

Note: after flattening, the nested column "attributes$Price_Range" becomes the top-level column "attributes.Price_Range".

# Flatten the nested data frames into top-level columns. The call is
# namespace-qualified because purrr also exports a flatten(); whichever
# package is attached last would otherwise win.
yelpBusinessData2 <- jsonlite::flatten(yelpBusinessData, recursive = TRUE)
dim(yelpBusinessData2)

# With the flattened frame, the nested Price_Range column is addressable as
# the ordinary column `attributes.Price_Range`.
Exp2 <- yelpBusinessData2 %>% 
  filter(city == "Phoenix") %>%
  filter(grepl("Restaurants", categories)) %>%
  filter(attributes.Price_Range == 4)
dim(Exp2)

- , , .

I.E → ** dplyr ? **

?: -)

, , [[]], , dplyr...

?

# Same pipeline on the un-flattened data, but reaching into the nested
# `attributes` data frame by position: column 6 is the one renamed to
# Price_Range when the data was loaded.
Exp2 <- yelpBusinessData %>% 
  filter(city == "Phoenix") %>%
  filter(grepl("Restaurants", categories)) %>%
  filter( attributes[[6]][] == 4)

" $Price_range" . i.e Price_Range - 6- dataframe...

> sessionInfo()
R version 3.2.2 (2015-08-14)
Platform: x86_64-apple-darwin13.4.0 (64-bit)
Running under: OS X 10.11.2 (El Capitan)

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] knitcitations_1.0.6 pander_0.5.2        plyr_1.8.3          jsonlite_0.9.16     ggvis_0.4.2.9000   
 [6] tidyr_0.2.0         devtools_1.8.0      qmap_1.0-3          fitdistrplus_1.0-4  knitr_1.11         
[11] dplyr_0.4.3.9000    data.table_1.9.4    psych_1.5.6         mapproj_1.2-4       maptools_0.8-36    
[16] rworldmap_1.3-1     sp_1.1-1            maps_2.3-11         ggmap_2.5.2         ggplot2_1.0.1      
[21] RMySQL_0.10.5       DBI_0.3.1           setwidth_1.0-4      colorout_1.1-1      vimcom_1.2-3       

loaded via a namespace (and not attached):
 [1] httr_1.0.0           splines_3.2.2        shiny_0.12.2         assertthat_0.1       highr_0.5           
 [6] yaml_2.1.13          lattice_0.20-33      chron_2.3-47         digest_0.6.8         RefManageR_0.8.63   
[11] colorspace_1.2-6     htmltools_0.2.6      httpuv_1.3.3         XML_3.98-1.3         bibtex_0.4.0        
[16] xtable_1.7-4         scales_0.3.0         jpeg_0.1-8           git2r_0.11.0         lazyeval_0.1.10.9000
[21] mnormt_1.5-3         proto_0.3-10         survival_2.38-3      RJSONIO_1.3-0        magrittr_1.5        
[26] mime_0.3             memoise_0.2.1        evaluate_0.7.2       MASS_7.3-43          xml2_0.1.1          
[31] foreign_0.8-66       ggthemes_2.2.1       rsconnect_0.4.1.4    tools_3.2.2          geosphere_1.4-3     
[36] RgoogleMaps_1.2.0.7  formatR_1.2          stringr_1.0.0        munsell_0.4.2        rversions_1.0.2     
[41] grid_3.2.2           RCurl_1.95-4.7       rstudioapi_0.3.1     rjson_0.2.15         spam_1.0-1          
[46] bitops_1.0-6         labeling_0.3         rmarkdown_0.7        gtable_0.1.2         curl_0.9.3          
[51] reshape2_1.4.1       R6_2.1.1             lubridate_1.3.3      stringi_0.5-5        parallel_3.2.2      
[56] Rcpp_0.12.0          fields_8.2-1         png_0.1-7  
+4
1

3 , , , ( ) SO.

:

  1. "" data.frame R/dplyr?

    , "" data.frame, , -, -.

  2. "" data.frame R/dplyr?
  3. , ?

"" R/dplyr?

, , . , , , , .

:

  1. , ,
  2. ,
  3. ,

: , . , . , , "" (№ 2 № 3 ). , , .

data.frames. ( , dplyr, , ).

, dplyr::select_if:

# Keep only the columns that are not themselves data frames, then filter on
# an ordinary column.
yelpBusinessData %>%
  dplyr::select_if(purrr::negate(is.data.frame)) %>% 
  dplyr::filter(city == 'Phoenix')

dplyr , , data.frames (, attribute s). ..

"" data.frame R/dplyr?

"" , flatten .

attributes, jsonlite::flatten - :

# Drop the nested data-frame columns, bind the flattened `attributes` columns
# back on as ordinary columns, then filter on them like any other column.
yelpBusinessData %>%
  dplyr::select_if(purrr::negate(is.data.frame)) %>% 
  # TRUE rather than T: T is an ordinary variable that can be reassigned.
  dplyr::bind_cols(jsonlite::flatten(yelpBusinessData$attributes, recursive = TRUE)) %>%
  dplyr::filter(city == 'Phoenix') %>%
  dplyr::filter(grepl("Restaurants", categories)) %>%
  dplyr::filter(Price_Range == 4)

hours, , -. hours data.frame data.frame ("" ""). purrr::map , data.frame .

# For every nested data frame in `hours`, combine its `open` and `close`
# columns into a single "open - close" string, unlist that to a plain vector,
# and collect the resulting vectors as the columns of a tibble.
hours <- 
  yelpBusinessData$hours %>% 
  purrr::map(. %>% 
               dplyr::transmute(hours = stringr::str_c(open, close, sep = ' - ')) %>% 
               unlist()) %>%
  tibble::as_tibble()

data.frame data.frame :

> str(hours)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame':   61184 obs. of  7 variables:
 $ Tuesday  : chr  "08:00 - 17:00" NA NA "10:00 - 21:00" ...
 $ Friday   : chr  "08:00 - 17:00" NA NA "10:00 - 21:00" ...
 $ Monday   : chr  "08:00 - 17:00" NA NA "10:00 - 21:00" ...
 $ Wednesday: chr  "08:00 - 17:00" NA NA "10:00 - 21:00" ...
 $ Thursday : chr  "08:00 - 17:00" NA NA "10:00 - 21:00" ...
 $ Sunday   : chr  NA NA NA "11:00 - 18:00" ...
 $ Saturday : chr  NA NA NA "10:00 - 21:00" ...

map2_dfc ( bind_cols bind_cols ), :

# For each nested data frame in `hours` (.x) together with its name (.y),
# prefix every column name with that name and an underscore, then column-bind
# the renamed data frames into one (map2_dfc).
# NOTE(review): funs() is deprecated in recent dplyr releases; confirm it is
# still available in the installed version.
hours <- yelpBusinessData$hours %>% 
  purrr::map2_dfc(.x = .,
                  .y = names(.),
                  .f = ~ .x %>% 
                    dplyr::rename_all(funs(stringr::str_c(.y, ., sep = '_'))))

data.frame :

> str(hours)
'data.frame':   61184 obs. of  14 variables:
 $ Tuesday_close  : chr  "17:00" NA NA "21:00" ...
 $ Tuesday_open   : chr  "08:00" NA NA "10:00" ...
 $ Friday_close   : chr  "17:00" NA NA "21:00" ...
 $ Friday_open    : chr  "08:00" NA NA "10:00" ...
 $ Monday_close   : chr  "17:00" NA NA "21:00" ...
 $ Monday_open    : chr  "08:00" NA NA "10:00" ...
 $ Wednesday_close: chr  "17:00" NA NA "21:00" ...
 $ Wednesday_open : chr  "08:00" NA NA "10:00" ...
 $ Thursday_close : chr  "17:00" NA NA "21:00" ...
 $ Thursday_open  : chr  "08:00" NA NA "10:00" ...
 $ Sunday_close   : chr  NA NA NA "18:00" ...
 $ Sunday_open    : chr  NA NA NA "11:00" ...
 $ Saturday_close : chr  NA NA NA "21:00" ...
 $ Saturday_open  : chr  NA NA NA "10:00" ...

, , "" , :

> purrr::flatten_dfr(yelpBusinessData$hours, .id = 'day')
# A tibble: 61,184 x 3
   day   close open 
   <chr> <chr> <chr>
 1 1     NA    NA   
 2 1     NA    NA   
 3 1     NA    NA   
 4 1     21:00 10:00
 5 1     16:00 10:00
 6 1     NA    NA   
 7 1     NA    NA   
 8 1     NA    NA   
 9 1     NA    NA   
10 1     02:00 08:00
# ... with 61,174 more rows

, ?

, . data.frame R , data.frame of data.frames. .

, , data.frame . purrr, , / , data.frame .

, simplify ( ) data.frame, jsonlite. , . .

:

## read in yelpBusinessData without converting to data.frame
# Parse the JSON but keep the result as a plain nested list.
# The argument is spelled out as `simplifyVector`: the abbreviated `simplify`
# partially matches simplifyVector, simplifyDataFrame and simplifyMatrix at
# once, which R rejects as ambiguous. The other two default to the value of
# simplifyVector, so one FALSE disables all simplification.
# NOTE(review): yelpBusinessDataFilePath is only assigned in the load step
# when the .rds cache is missing; make sure it is defined before running this.
yelpBusinessData2 <- fromJSON(sprintf("[%s]",
                                         paste(readLines(yelpBusinessDataFilePath),
                                               collapse = ",")),
                                 flatten = FALSE,
                                 simplifyVector = FALSE)

# filter to Phoenix cities _before_ converting to a data.frame
> yelpBusinessData2 %>% 
    purrr::keep(~ .$'city' == 'Phoenix'
                  && stringr::str_detect(.$categories, pattern = 'Restaurants')) %>% 
    jsonlite:::simplify(., flatten = T) %>% 
    dplyr::select(business_id, full_address, contains('kids')) %>% 
    str()
'data.frame':   8410 obs. of  5 variables:
 $ business_id                              : chr  "vcNAWiLM4dR7D2nwwJ7nCA" "x5Mv61CnZLohZWxfCVCPTQ" "2ZnCITVa0abGce4gZ6RhIw" "EmzaQR5hQlF0WIl24NxAZA" ...
 $ full_address                             : chr  "4840 E Indian School Rd\nSte 101\nPhoenix, AZ 85018" "2819 N Central Ave\nPhoenix, AZ 85004" "1850 N Central Ave\nPhoenix, AZ 85004" "132 E Washington St\nPhoenix, AZ 85004" ...
 $ attributes.Good for Kids                 : logi  NA FALSE TRUE FALSE NA NA ...
 $ attributes.Good For Kids                 : logi  NA NA NA NA NA NA ...
 $ attributes.Hair Types Specialized In.kids: logi  NA NA NA NA NA NA ...

, , - , R, clean_names() . , Excel.

0

Source: https://habr.com/ru/post/1615923/


All Articles