Extracting the last word from many columns of a data frame (R)

I have a data frame that contains 3 columns. The data is as follows:

V1                V2               V3
Auto = Chevy      Engine = V6      Trans = Auto
Auto = Chevy      Engine = V8      Trans = Manual
Auto = Chevy      Engine = V10     Trans = Manual

I want the dataframe to look like this:

Auto       Engine  Trans
Chevy      V6      Auto
Chevy      V8      Manual
Chevy      V10     Manual

In other words, take the last word after "=" in each cell as the value, and use the first word of each column as that column's heading. Alternatively, a way to simply replace each cell with the last word after "=", without adding new columns, would also work.

Can this be done in R? Many thanks!

+6
source share
3 answers

Well, if you don't mind just using the old (pre-Hadley) R style, here is the solution:

> x <- as.data.frame(list(c('Auto = Chevy', 'Auto = Chevy', 'Auto = Chevy'),
+ c('Engine = V6', 'Engine = V8', 'Engine = V10'),
+ c('Trans = Auto', 'Trans = Manual', 'Trans = Manual')),
+ stringsAsFactors=FALSE)
> values <- lapply(x, gsub, pattern='.*= ', replacement='')
> new.names <- lapply(x, gsub, pattern=' =.*', replacement='')
> new.names <- lapply(new.names, unique)
> names(values) <- new.names
> new.frame <- as.data.frame(values, stringsAsFactors = FALSE)
> new.frame
   Auto Engine  Trans
1 Chevy     V6   Auto
2 Chevy     V8 Manual
3 Chevy    V10 Manual

It will not work for a multi-column data frame, but it will work for a narrow multi-row data frame.

+5

If you don't mind using a package, this can be done with stringr or stringi (stringr is a wrapper around stringi):

library(stringi)
library(dplyr)

# Build the example data frame: three character columns of "key = value" cells.
df <- read.table(text='V1,V2,V3
"Auto = Chevy","Engine = V6","Trans = Auto"
"Auto = Chevy","Engine = V8","Trans = Manual"
"Auto = Chevy","Engine = V10","Trans = Manual"',
sep=",", header=TRUE, stringsAsFactors=FALSE)

# Replace every cell with its last word. The function is passed directly;
# the funs() wrapper is deprecated in dplyr and is not needed here.
mutate_all(df, stri_extract_last_words)
##      V1  V2     V3
## 1 Chevy  V6   Auto
## 2 Chevy  V8 Manual
## 3 Chevy V10 Manual

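If you also want the column names set from the first word of each column, tidyverse style, then in an R script you could do the following: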

library(stringi)
library(dplyr)
library(purrr)

# Build the example data frame: three character columns of "key = value" cells.
df <- read.table(text='V1,V2,V3
"Auto = Chevy","Engine = V6","Trans = Auto"
"Auto = Chevy","Engine = V8","Trans = Manual"
"Auto = Chevy","Engine = V10","Trans = Manual"',
sep=",", header=TRUE, stringsAsFactors=FALSE)

# Values: the last word of every cell.
# Names: the first word of every cell, collapsed to one row with distinct()
# and flattened to a plain character vector. Base unlist() is used because
# purrr's flatten_chr() is deprecated, and funs() is deprecated in dplyr.
mutate_all(df, stri_extract_last_words) %>%
  setNames(mutate_all(df, stri_extract_first_words) %>%
             distinct() %>%
             unlist(use.names = FALSE))

Or, using tidyverse together with stringi to split each cell into its key and value, then in an R script you could do the following:

library(stringi)
library(tidyverse)

# Build the example data frame: three character columns of "key = value" cells.
df <- read.table(text='V1,V2,V3
"Auto = Chevy","Engine = V6","Trans = Auto"
"Auto = Chevy","Engine = V8","Trans = Manual"
"Auto = Chevy","Engine = V10","Trans = Manual"',
sep=",", header=TRUE, stringsAsFactors=FALSE)

# Process the data frame one row at a time. by_row() is no longer part of
# purrr (it moved to the purrrlyr package), so the rows are iterated with
# map() and the one-row results are stacked with bind_rows().
seq_len(nrow(df)) %>%
  map(function(i) {
    cells <- unlist(df[i, ], use.names = FALSE)
    # Match matrix columns: 1 = whole match, 2 = key, 3 = value.
    parts <- stri_match_first_regex(cells, "(.*) = (.*)")
    # One-row tibble whose column names are the keys found in this row.
    as_tibble(as.list(setNames(parts[, 3], parts[, 2])))
  }) %>%
  bind_rows()
## # A tibble: 3 × 3
##    Auto Engine  Trans
##   <chr>  <chr>  <chr>
## 1 Chevy     V6   Auto
## 2 Chevy     V8 Manual
## 3 Chevy    V10 Manual
+4

base R

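1) With scan and sub - We remove the "=" (and the whitespace after it) with sub after converting the data.frame to a matrix, then scan the result into a vector. The alternating elements (c(FALSE, TRUE)) of "v1" are assigned to "df2", and the unique elements of "v1", after subsetting with the logical vector c(TRUE, FALSE), become the column names.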

# Strip "=" plus the whitespace after it from every cell, then tokenise on
# single spaces: v1 alternates key, value, key, value, ... in column order.
v1 <- scan(
  text = sub("=\\s+", "", as.matrix(df1)),
  what = "",
  sep = " ",
  quiet = TRUE
)
df2 <- df1
# Even positions hold the values; they fill df2 column by column.
df2[] <- v1[seq_along(v1) %% 2 == 0]
# Odd positions hold the keys; the distinct keys become the headings.
colnames(df2) <- unique(v1[seq_along(v1) %% 2 == 1])
df2
#   Auto Engine  Trans
#1 Chevy     V6   Auto
#2 Chevy     V8 Manual
#3 Chevy    V10 Manual

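2) With sub - We match everything up to the last word, capture that word as a group, and replace the whole string with the backreference (\\1), looping over the columns (lapply(df1, ..)).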

# For each column, keep only the final word of every cell: the pattern
# captures the last run of word characters and the replacement is that group.
df2[] <- lapply(df1, sub, pattern = ".*\\b(\\w+)$", replacement = "\\1")

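3) With strsplit - We split each column on "=" followed by one or more spaces ("=\\s+") and take the last element of each split (tail, 1), looping over the columns as in 2).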

# Split every cell on "=" plus the whitespace after it and keep the last piece.
# vapply() with character(1) is used instead of sapply() so that each cell must
# yield exactly one string; a cell that splits to nothing raises an error
# rather than silently turning the column into a list.
df2[] <- lapply(df1, function(x) {
  vapply(strsplit(x, "=\\s+"), function(parts) tail(parts, 1), character(1))
})

For the 2nd and 3rd options, the column names still have to be changed; we do that with sub on the unlisted first row:

# Headings come from the first row: drop everything from the whitespace
# before "=" onwards, leaving only the key of each cell.
colnames(df2) <- sub(
  "\\s+=.*",
  "",
  unlist(df1[1, ], use.names = FALSE)
)

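1) With str_extract - We extract the word (\\w+) at the end ($) of each string by looping over the columns with lapply, and assign the resulting list back to 'df2'. The column names are taken from the first word of each cell in the unlisted first row.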

library(stringr)
# Keep only the trailing word of every cell, column by column.
df2[] <- lapply(df1, str_extract, pattern = "\\w+$")
# Headings are the first word of each cell in the first row.
colnames(df2) <- word(unlist(df1[1, ]), start = 1)
df2
#   Auto Engine  Trans
#1 Chevy     V6   Auto
#2 Chevy     V8 Manual
#3 Chevy    V10 Manual

2) tidyverse

library(dplyr)
library(tidyr)
# Reshape to long form, split each "key = value" cell into header and value,
# then spread back to wide form with one column per header.
df1 %>%
  gather(key = "key", value = "value") %>%
  separate(col = value, into = c("header", "value")) %>%
  # Number the rows within each original column so spread() can line them up.
  group_by(key) %>%
  mutate(i1 = seq_len(n())) %>%
  ungroup() %>%
  # The original column name must go before spreading, or rows will not merge.
  select(-key) %>%
  spread(key = header, value = value) %>%
  select(-i1)
# A tibble: 3 × 3
#   Auto Engine  Trans
#* <chr>  <chr>  <chr>
#1 Chevy     V6   Auto
#2 Chevy     V8 Manual
#3 Chevy    V10 Manual

# Example input: three character columns whose cells are "key = value" strings.
df1 <- data.frame(
  V1 = rep("Auto = Chevy", 3),
  V2 = c("Engine = V6", "Engine = V8", "Engine = V10"),
  V3 = c("Trans = Auto", "Trans = Manual", "Trans = Manual"),
  stringsAsFactors = FALSE
)
+2

Source: https://habr.com/ru/post/1014275/


All Articles