Replace parts of strings in a data frame based on a condition in R

I have a data frame like this:

# Example genotype table: REF and Alt hold the two alleles of each variant,
# and every S* column holds one sample's genotype as "allele/allele"
# ("-/-" where no call is available).
# colClasses = "character" keeps every column as text; without it a column
# consisting only of "T" or "F" alleles would be converted to logical.
df <- read.table(text = "REF   Alt S00001  S00002  S00003  S00004  S00005
 TAAGAAG    TAAG    TAAGAAG/TAAGAAG TAAGAAG/TAAG    TAAG/TAAG   TAAGAAG/TAAGAAG TAAGAAG/TAAGAAG
 T  TG  T/T -/- TG/TG   T/T T/T
 CAAAA  CAAA    CAAAA/CAAAA CAAAA/CAAA  CAAAA/CAAAA -/- CAAAA/CAAAA
 TTGT   TTGTGT  TTGT/TTGT   TTGT/TTGT   TTGT/TTGT   TTGTGT/TTGTGT   TTGT/TTGTGT
 GTTT   GTTTTT  GTTT/GTTTTT GTTT/GTTT   GTTT/GTTT   GTTT/GTTT   GTTTTT/GTTTTT",
  header = TRUE, colClasses = "character", stringsAsFactors = FALSE)

I would like to replace each of the strings separated by "/" with "D" or "I", depending on the lengths of the strings in the "REF" and "Alt" columns of the same row. An element that matches the longer of the two is replaced by "I"; otherwise it is replaced by "D". A "-" is left unchanged. The expected result is:

REF Alt S00001  S00002  S00003  S00004  S00005
TAAGAAG TAAG    I/I I/D D/D I/I I/I
T   TG  D/D -/- I/I D/D D/D
CAAAA   CAAA    I/I I/D I/I -/- I/I
TTGT    TTGTGT  D/D D/D D/D I/I D/I
GTTT    GTTTTT  D/I D/D D/D D/D I/I
+4
source share
2 answers

Here is one approach. I used the stringi package because it handles vectors of patterns and vectors of strings well when searching.

First, determine for each row which of REF and Alt is the shorter string and which is the longer:

# Per-row shorter and longer allele of the REF/Alt pair. When both alleles
# have the same length, the Alt value is used for `short` and `long` alike.
short <- with(df, ifelse(nchar(REF) < nchar(Alt), REF, Alt))
long <- with(df, ifelse(nchar(Alt) < nchar(REF), REF, Alt))

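Then, in every sample column, replace the longer string with "I" first and the shorter string with "D" afterwards: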

library(stringi)

# Genotype columns are selected by name (everything except REF and Alt), so
# the recoding does not depend on REF/Alt being the first two columns.
sample_cols <- setdiff(names(df), c("REF", "Alt"))

if (length(sample_cols) > 0) {
  df[sample_cols] <- lapply(df[sample_cols], function(genotype) {
    # `long` and `short` are matched element-wise against the rows of the
    # column. The longer allele must be replaced first: the shorter allele
    # can be a substring of it (e.g. TAAG inside TAAGAAG), so replacing the
    # shorter one first would corrupt the longer matches.
    genotype <- stri_replace_all_fixed(genotype, long, "I")
    stri_replace_all_fixed(genotype, short, "D")
  })
}

#      REF    Alt S00001 S00002 S00003 S00004 S00005
#1 TAAGAAG   TAAG    I/I    I/D    D/D    I/I    I/I
#2       T     TG    D/D    -/-    I/I    D/D    D/D
#3   CAAAA   CAAA    I/I    I/D    I/I    -/-    I/I
#4    TTGT TTGTGT    D/D    D/D    D/D    I/I    D/I
#5    GTTT GTTTTT    D/I    D/D    D/D    D/D    I/I
+4

Alternatively, build a lookup table that maps the REF and Alt strings to "I" and "D":

# Label each allele by its length relative to its partner in the same row:
# the longer of REF/Alt is an insertion ("I"), the other a deletion ("D").
# On a tie REF is labelled "D" and Alt "I". The missing-call marker "-" maps
# to itself.
ref_is_longer <- nchar(df$REF) > nchar(df$Alt)
refalt <- data.frame(
  from = c(df$REF, df$Alt, "-"),
  to = c(
    ifelse(ref_is_longer, "I", "D"),
    ifelse(ref_is_longer, "D", "I"),
    "-"
  ),
  stringsAsFactors = FALSE
)

# Every "a/b" pairing of known alleles, and the matching pairing of labels.
# Both grids enumerate combinations in the same order, so entry k of `to`
# is the code for entry k of `from`.
from <- expand.grid(refalt$from, refalt$from, stringsAsFactors = FALSE)
to <- expand.grid(refalt$to, refalt$to, stringsAsFactors = FALSE)

# Named lookup vector: names are genotypes ("T/TG"), values are codes ("D/I").
# NOTE(review): the lookup is shared by all rows, so an allele string that is
# the longer allele in one row and the shorter one in another would produce
# duplicate names with conflicting codes; confirm alleles are unique per role.
map <- paste(to[, 1], to[, 2], sep = "/")
names(map) <- paste(from[, 1], from[, 2], sep = "/")

Then apply the lookup to each sample column:

# Recode every genotype column (all columns other than REF and Alt) through
# the `map` lookup instead of hard-coding the sample names.
for (name in setdiff(names(df), c("REF", "Alt"))) {
  # unname() drops the lookup keys that `[` attaches to its result, so the
  # column stays a plain character vector. Genotypes that are not present in
  # `map` become NA.
  df[[name]] <- unname(map[df[[name]]])
}
0

Source: https://habr.com/ru/post/1622977/


All Articles