R is the difference between two sets in the data frame

I have 2 factor columns, I want to create a third column that tells me that the second has what the first does not. It is very similar to this post , but I have problems using dfusing the function setdiff().
For instance:

library(dplyr)
y1 <- c("a.b.","a.","b.c.d.")
y2 <- c("a.b.c.","a.b.","b.c.d.")
df <- data.frame(y1,y2)

The column y1has a.b., and the column y2has a.b.c.. I want a third party column to return c.or just c.

> df
      y1     y2  col3
1   a.b.  a.b.c.  c.
2     a.    a.b.  b.
3 b.c.d.  b.c.d.  

I think it should be a combination of strsplitand setdiff, but I can't get it to work.

factor character, strsplit() , . , , setdiff()

#convert factor to character
df <- df %>% mutate_if(is.factor, as.character)
lapply(df$y1,function(x)(strsplit(x,split = "[.]")))

> lapply(df$y1,function(x)(strsplit(x,split = "[.]")))
[[1]]
[[1]][[1]]
[1] "a" "b"


[[2]]
[[2]][[1]]
[1] "a"


[[3]]
[[3]][[1]]
[1] "b" "c" "d"
+4
3

, 1 , . , paste . unlist.

df$col3 <- mapply(function(x, y) paste0(setdiff(y, x), collapse = ""),
   strsplit(as.character(df$y1), "\\."), strsplit(as.character(df$y2), "\\."))

mapply "." strsplit, setdiff.

df$col3 <- mapply(function(x, y) setdiff(y, x),
       strsplit(as.character(df$y1), "\\."), strsplit(as.character(df$y2), "\\."))

df
#     y1     y2 col3
#1   a.b. a.b.c.    c
#2     a.   a.b.    b
#3 b.c.d. b.c.d.     

col3 , unlist, , , unlist character(0). , . .

unlist(lapply(df$col3,function(x) if(identical(x,character(0))) ' ' else x))

#[1] "c" "b" " "
+5

purrr:map2:

df %>%
    mutate_if(is.factor, as.character) %>%
    mutate(col3 = map2(strsplit(y2, "\\."), strsplit(y1, "\\."), setdiff))
#      y1     y2 col3
#1   a.b. a.b.c.    c
#2     a.   a.b.    b
#3 b.c.d. b.c.d.    

: factor character, setdiff "." -split y2 y1. , col3 list.


Update

, unnest character list. , col3 list character, :

df %>%
    mutate_if(is.factor, as.character) %>%
    mutate(col3 = map2(strsplit(y2, "\\."), strsplit(y1, "\\."), setdiff)) %>%
    rowwise() %>%
    mutate(col3 = paste(col3, collapse = "."))
## A tibble: 3 x 3
#  y1     y2     col3
#  <chr>  <chr>  <chr>
#1 a.b.   a.b.c. c
#2 a.     a.b.   b
#3 b.c.d. b.c.d. ""

- col3 ( ); rowwise() - paste.

:

y1 <- c("a.b.","a.","b.c.d.")
y2 <- c("a.b.c.e.","a.b.","b.c.d.")
df <- data.frame(y1,y2)
df %>%
    mutate_if(is.factor, as.character) %>%
    mutate(col3 = map2(strsplit(y2, "\\."), strsplit(y1, "\\."), setdiff)) %>%
    rowwise() %>%
    mutate(col3 = paste(col3, collapse = "."))
## A tibble: 3 x 3
#  y1     y2       col3
#  <chr>  <chr>    <chr>
#1 a.b.   a.b.c.e. c.e
#2 a.     a.b.     b
#3 b.c.d. b.c.d.   ""
+4

Very simple, but not strict, is to replace everything in y1 with "" from y2. This will not handle cases where the orders are different or y1 has something extra for y2, and not vice versa.

df %>% rowwise() %>% mutate(col3 = gsub(y1,"",y2))
+3
source

Source: https://habr.com/ru/post/1696260/


All Articles