Perform multiple operations on multiple data.tables.

I really need to learn this because I have been doing it the terribly painful way for far too long. I have a list of datasets that I want to:

  • convert to data.table
  • perform three functions on
  • output to a new list for plyr join_all

What is the best way to do this automatically, so that I don't have to write out these steps for every single data frame? One tricky part for me is accessing the names in the habitat_names vector while working on a list of data.tables. Here is my existing code:

# Load each IUCN habitat export into its own data.table.
iucn_1_4 <- data.table(read.csv("~/Projects/ClimateChange/Random/IUCN/1.4 Temperate Forest.csv"))
iucn_1_6 <- data.table(read.csv("~/Projects/ClimateChange/Random/IUCN/1.6 Subtropical_Tropical Moist Lowland Forest.csv"))
iucn_1_7 <- data.table(read.csv("~/Projects/ClimateChange/Random/IUCN/1.7 Subtropical_Tropical Mangrove Forest Vegetation Above High Tide Level.csv"))
# NOTE(review): iucn_1_8 is read from the "1.9 ... Moist Montane Forest" file,
# while habitat_names[4] below is "1.8 ... Swamp Forest" -- confirm which one
# is intended.
iucn_1_8 <- data.table(read.csv("~/Projects/ClimateChange/Random/IUCN/1.9 Subtropical_Tropical Moist Montane Forest.csv"))
iucn_4_4 <- data.table(read.csv("~/Projects/ClimateChange/Random/IUCN/4.4 Temperate Grassland.csv"))
iucn_4_5 <- data.table(read.csv("~/Projects/ClimateChange/Random/IUCN/4.5 Subtropical_Tropical Dry Lowland Grassland_Colombia_Venezuela and Bolivia.csv"))

# Column labels, one per table above, in the same order.
habitat_names <- c("1.4 Temperate Forest","1.6 Subtropical/Tropical Moist Lowland Forest","1.7 Subtropical/Tropical Mangrove Forest Vegetation Above High Tide Level","1.8 Subtropical/Tropical Swamp Forest","4.4 Temperate Grassland","4.5 Subtropical/Tropical Dry Lowland Grassland")

# The same three steps are repeated for every table:
#   1. build SCI_NAME as "<Genus> <Species>",
#   2. add a column named after the habitat, set to TRUE on every row,
#   3. keep only SCI_NAME and that habitat column.
# paste() already separates its arguments with one space; passing " " as a
# third argument (as before) appended two trailing spaces to every SCI_NAME.
iucn_1_4[, SCI_NAME := paste(Genus, Species)]
iucn_1_4[, habitat_names[1] := TRUE]
iucn_1_4.out <- iucn_1_4[, c("SCI_NAME", habitat_names[1]), with = FALSE]

iucn_1_6[, SCI_NAME := paste(Genus, Species)]
iucn_1_6[, habitat_names[2] := TRUE]
iucn_1_6.out <- iucn_1_6[, c("SCI_NAME", habitat_names[2]), with = FALSE]

iucn_1_7[, SCI_NAME := paste(Genus, Species)]
iucn_1_7[, habitat_names[3] := TRUE]
iucn_1_7.out <- iucn_1_7[, c("SCI_NAME", habitat_names[3]), with = FALSE]

iucn_1_8[, SCI_NAME := paste(Genus, Species)]
iucn_1_8[, habitat_names[4] := TRUE]
iucn_1_8.out <- iucn_1_8[, c("SCI_NAME", habitat_names[4]), with = FALSE]

iucn_4_4[, SCI_NAME := paste(Genus, Species)]
iucn_4_4[, habitat_names[5] := TRUE]
iucn_4_4.out <- iucn_4_4[, c("SCI_NAME", habitat_names[5]), with = FALSE]

iucn_4_5[, SCI_NAME := paste(Genus, Species)]
iucn_4_5[, habitat_names[6] := TRUE]
iucn_4_5.out <- iucn_4_5[, c("SCI_NAME", habitat_names[6]), with = FALSE]
+4
source share
4 answers

You can also use `Map` to avoid a for loop. Example code:

library(data.table)

# Toy inputs standing in for the real tables and their labels.
dt1 <- data.table(v1 = 1:10, v2 = c("a", "b")) # eg iucn_1_4
dt2 <- data.table(v1 = 1:10, v2 = c("a", "b", "c", "d", "e")) # eg iucn_1_6
names <- c("name1", "name2") # replace by habitat_names

# Adds SCI_NAME and a TRUE-valued column called `name` to `dt` (by reference),
# then returns just those two columns.
f <- function(dt, name) {
  # replace v1, v2 by Genus, Species
  dt[, SCI_NAME := paste0(v1, v2)]
  dt[, (name) := TRUE]
  keep <- c("SCI_NAME", name)
  dt[, keep, with = FALSE]
}

# Pair each table with its label and apply f to every pair.
res <- Map(f, list(dt1, dt2), names)

Then combine the results by calling `join_all(res, ...)` from plyr.

+4
# One CSV path per habitat, in the same order as habitat_names (add the rest).
files <- c("file1.csv", "file2.csv")

DT.list <- lapply(files, fread)
# Preallocate the result list instead of growing it inside the loop.
DT.out <- vector("list", length(DT.list))

# seq_along() is safe when the list is empty.
for (i in seq_along(DT.list)) {
  # paste() separates with a single space by default; an extra " " argument
  # would append trailing spaces to every name.
  DT.list[[i]][, SCI_NAME := paste(Genus, Species)]
  DT.list[[i]][, (habitat_names[i]) := TRUE]
  DT.out[[i]] <- DT.list[[i]][, c("SCI_NAME", habitat_names[i]), with = FALSE]
}

# Stack the tables; the habitat columns have different names, so match by name
# and fill the missing ones with NA.
rbindlist(DT.out, use.names = TRUE, fill = TRUE)

# or: one row per species. merge() joins two tables at a time, so fold it over
# the list (do.call(merge, DT.out) breaks with more than two tables).
Reduce(function(x, y) merge(x, y, by = "SCI_NAME", all = TRUE), DT.out)

Both `fread` and `rbindlist` come from the data.table package.

+1

Untested code, but maybe something like this?

# File names of the habitat CSVs to load, relative to the IUCN data directory.
namestoread <- c(
  "1.4 Temperate Forest.csv",
  "1.6 Subtropical_Tropical Moist Lowland Forest.csv",
  "1.7 Subtropical_Tropical Mangrove Forest Vegetation Above High Tide Level.csv",
  "1.9 Subtropical_Tropical Moist Montane Forest.csv",
  "4.4 Temperate Grassland.csv",
  "4.5 Subtropical_Tropical Dry Lowland Grassland_Colombia_Venezuela and Bolivia.csv"
)

# Needs data.table (for `:=`) and plyr (for join_all) to be attached.
# Preallocate one slot per file.
listofdatasets <- vector(mode = "list", length = length(namestoread))

for (i in seq_along(namestoread)) {
  # read.csv() returns a plain data.frame, on which `:=` fails, so convert it
  # to a data.table before modifying it.
  dataset <- data.table(
    read.csv(paste0("~/Projects/ClimateChange/Random/IUCN/", namestoread[i]))
  )
  # paste() already separates with one space; an extra " " argument would
  # append trailing spaces to every name.
  dataset[, SCI_NAME := paste(Genus, Species)]
  # Flag column named after the file, TRUE on every row.
  dataset[, (namestoread[i]) := TRUE]
  # Keep only the join key and the flag, so SCI_NAME is the single column the
  # tables share when they are joined below.
  listofdatasets[[i]] <- dataset[, c("SCI_NAME", namestoread[i]), with = FALSE]
}

# join_all() defaults to a left join on all shared columns; pass type = "full"
# to keep species that are missing from the first table.
join_all(listofdatasets)
0
source

Good practice in R is that as soon as you do something more than once, you write a function to handle the repeated steps:

eg:

#get names
# Habitat labels, one per input file, in file order. The vector must not end
# with a trailing comma: c("a", ) is an error in R.
habitat_names <- c(
  "1.4 Temperate Forest",
  "1.6 Subtropical/Tropical Moist Lowland Forest",
  "1.7 Subtropical/Tropical Mangrove Forest Vegetation Above High Tide Level",
  "1.8 Subtropical/Tropical Swamp Forest",
  "4.4 Temperate Grassland",
  "4.5 Subtropical/Tropical Dry Lowland Grassland"
)
#a simple function
#' Read one habitat CSV and flag every species listed in it.
#'
#' Requires data.table to be attached.
#'
#' @param path Path to a CSV file; its `Genus` and `Species` columns are used.
#' @param index Position in `Names` of the habitat label for this file.
#' @param Names Character vector of habitat labels.
#' @return A data.table with two columns: `SCI_NAME` ("<Genus> <Species>") and
#'   a column named `Names[index]` that is `TRUE` on every row.
getSpecies <- function(path, index, Names) {
  habitat <- Names[index]
  species <- data.table(read.csv(path))
  # paste() separates with a single space by default; a third " " argument
  # would append trailing spaces to every name.
  species[, SCI_NAME := paste(Genus, Species)]
  # Parentheses make data.table use the value of `habitat` as the column name.
  species[, (habitat) := TRUE]
  species[, c("SCI_NAME", habitat), with = FALSE]
}

 #call function:
 # Argument names are case-sensitive: the parameter is `Names`, not `names`.
 iucn_1_4 <- getSpecies(path = "~/Projects/ClimateChange/Random/IUCN/1.4 Temperate Forest.csv",
                        index = 1,
                        Names = habitat_names)
0
source

Source: https://habr.com/ru/post/1542671/


All Articles