Extract different words from character string in R

Question

Extract different words from character string in R

I saw several SO posts that seem to answer this question closely, but I can’t say if this is really done, please forgive me, this is a duplicate of the post. I have dozens of character strings (this is a column inside the data frame) that contain different numbers, usually written as words, but sometimes as integers. For instance:.

Three neonates with one adult

1 adult, ten neonates nearby

Two adults and six neonates

My ultimate goal is to be able to extract the number of newborns and adults from each row and get something like this:

data.frame(Adults=c(1,1,6), Neonates=c(3,10,6)

But the number and location of the number inside the line are different. All the examples that I saw with help gsub, strsplitetc., seem to work only when the pattern used for substitution, separation, highlighting, etc., is the same for the lines or remains in a constant position inside the line. Since I know what numbers should be c("one","two",...,"ten"), I could skip each character string, and then skip all possible numbers to see if it is present in the string, and then, if present, extract it and convert to a numeric one. But it seems very inefficient.

Any help would be greatly appreciated.

+4

string r character gsub strsplit

Jbauder Aug 16 '17 at 18:43

source share

7 answers

OdeToMyFiddle · Answer 1 · 2017-08-16T19:56:17+0000

str_split stringr -. , / .

library(stringr) #for str_split

customFun = function(
strObj="Three neonates with one adult",
rootOne = "adult",
rootTwo = "neonate"){

#split string
discreteStr = str_split(strObj,pattern = "\\s+",simplify = TRUE)



#find indices of root words
rootOneIndex = grep(rootOne,discreteStr)
rootTwoIndex = grep(rootTwo,discreteStr)

#mapping vectors
charVec = c("one","two","three","four","five","six","seven","eight","nine","ten")
numVec = as.character(1:10)
names(numVec) = charVec

#match index neighbourhood ,-1/+1  and select first match
rootOneMatches = tolower(discreteStr[c(rootOneIndex-1,rootOneIndex+1)])
rootOneMatches = rootOneMatches[!is.na(rootOneMatches)]
rootOneMatches = head(rootOneMatches,1)


rootTwoMatches = tolower(discreteStr[c(rootTwoIndex-1,rootTwoIndex+1)])
rootTwoMatches = rootTwoMatches[!is.na(rootTwoMatches)]
rootTwoMatches = head(rootTwoMatches,1)

#check presence in mapping vectors
rootOneNum = intersect(rootOneMatches,c(charVec,numVec))
rootTwoNum = intersect(rootTwoMatches,c(charVec,numVec))

#final matches and numeric conversion
rootOneFinal = ifelse(!is.na(as.numeric(rootOneNum)),as.numeric(rootOneNum),as.numeric(numVec[rootOneNum]))
rootTwoFinal = ifelse(!is.na(as.numeric(rootTwoNum)),as.numeric(rootTwoNum),as.numeric(numVec[rootTwoNum]))

outDF = data.frame(strObj = strObj,adults = rootOneFinal,neonates = rootTwoFinal,stringsAsFactors=FALSE) 
return(outDF)
}

:

inputVec = c("Three neonates with one adult","1 adult, ten neonates nearby","Two adults and six neonates")
outputAggDF = suppressWarnings(do.call(rbind,lapply(inputVec,customFun)))

outputAggDF
#                         strObj adults neonates
#1 Three neonates with one adult      1        3
#2  1 adult, ten neonates nearby      1       10
#3   Two adults and six neonates      2        6

user108363 · Answer 2 · 2017-08-16T20:00:43+0000

, , .

string1 <- c("Three neonates with one adult")
string2 <- c("1 adult, ten neonates nearby")
string3 <- c("Two adults and six neonates")
df <- rbind(string1, string2, string3)

#change all written words to numeric values
df <- tolower(df)
df <- ifelse(grepl("one", df), gsub("one", 1, df), df)
df <- ifelse(grepl("two", df), gsub("two", 2, df), df)
df <- ifelse(grepl("three", df), gsub("three", 3, df), df)
df <- ifelse(grepl("four", df), gsub("four", 4, df), df)
df <- ifelse(grepl("five", df), gsub("five", 5, df), df)
df <- ifelse(grepl("six", df), gsub("six", 6, df), df)
df <- ifelse(grepl("seven", df), gsub("seven", 7, df), df)
df <- ifelse(grepl("eight", df), gsub("eight", 8, df), df)
df <- ifelse(grepl("nine", df), gsub("nine", 9, df), df)
df <- ifelse(grepl("ten", df), gsub("ten", 10, df), df)


#extract number and the next two spaces (gets a or n for adult or neonates)
number_let <- gregexpr('[0-9]+..',df)
list_nl <- regmatches(df,number_let)

df <- as.data.frame(df)
new_df <- data.frame(matrix(unlist(list_nl), nrow=nrow(df), byrow=T))
> new_df
   X1   X2
1 3 n  1 a
2 1 a 10 n
3 2 a  6 n

new_df$X1 <- as.character(new_df$X1)
new_df$X2 <- as.character(new_df$X2)

#extract numeric values
FW <- data.frame(matrix(unlist(regmatches(new_df$X1,gregexpr('[0-9]+',new_df$X1))), nrow=nrow(df), byrow=T))
SW <- data.frame(matrix(unlist(regmatches(new_df$X2,gregexpr('[0-9]+',new_df$X2))), nrow=nrow(df), byrow=T))

new_df <- cbind(new_df, FW, SW)
colnames(new_df)[3:4] <- c("FW", "SW")

new_df$FW <- as.numeric(as.character(new_df$FW))
new_df$SW <- as.numeric(as.character(new_df$SW))

#get numeric value separated into neonates and adults
new_df$neonate_1 <- ifelse(grepl("n", new_df$X1), new_df$FW,0)
new_df$neonate_2 <- ifelse(grepl("n", new_df$X2), new_df$SW,0)
new_df$adult_1 <-ifelse(grepl("a", new_df$X1), new_df$FW,0)
new_df$adult_2 <- ifelse(grepl("a", new_df$X2), new_df$SW,0)

#total neonates and adults for each string
new_df$total_neo <- new_df$neonate_1 + new_df$neonate_2
new_df$total_adu <- new_df$adult_1 + new_df$adult_2

#extract the two final columns
Count <- new_df[,9:10]
colnames(Count) <- c("Neonates", "Adults")

> Count
  Neonates Adults
1        3      1
2       10      1
3        6      2

Eldioo · Answer 3 · 2017-08-16T20:02:32+0000

, , .

"one" "two" .. , .

strings <- c("Three neonates with one adult",
"1 adult, ten neonates nearby",
"Two adults and six neonates")

numbers <- c("one","two","three","four","five","six","seven","eight","nine","ten")

splitted <- unlist(strsplit(strings, split="[[:blank:] | [:punct:]]"))

ind_neon <- which((splitted == "neonates") | (splitted == "neonate"))
ind_adul <- which((splitted == "adults") | (splitted == "adult"))

neon <- tolower(splitted[ind_neon-1])
adul <- tolower(splitted[ind_adul-1])

neon2 <- as.numeric(neon)
neon2[is.na(neon2)] <- as.numeric(factor(neon[is.na(neon2)],
               levels=numbers,
               labels=(1:10)))

adul2 <- as.numeric(adul)
adul2[is.na(adul2)] <- as.numeric(factor(adul[is.na(adul2)],
                levels=numbers,
                labels=(1:10)))

adul2
# [1] 1 1 2
neon2
# [1]  3 10  6

Wyldsoul · Answer 4 · 2017-08-17T13:16:23+0000

, , , , .

library(stringr)
library(qdap)
library(tidyr)

 v <- tolower(c("Three neonates with one adult",
           "1 adult, ten neonates nearby",
           "Two adults and six neonates"))

words<- c("one","two","three","four","five","six","seven","eight","nine","ten")
nums <- seq(1, 10)
pattern <- c(words, nums)

w <- paste(unlist(str_extract_all( v, paste(pattern, collapse="|"))),
           unlist(str_extract_all( v, "neonate|adult")))

mutliple gsub qdap,

w <- mgsub(words, nums, w)
w <- do.call(rbind.data.frame, strsplit(w, " "))
names(w) <- c("numbers", "name")

rowid, .

w$row <- rep(1:(nrow(w)/2), each=2)
spread(w, name, numbers)[-c(1)]


#    adult neonate
#  1     1       3
#  2     1      10
#  3     2       6

Sagar · Answer 5 · 2017-08-17T13:46:17+0000

strapply gsubfn , . , , .

> library(gsubfn)
> df <- data.frame(Text = c("Three neonates with one adult","1 adult, ten neonates nearby","Two adults and six neonates"))
> df
                           Text
1 Three neonates with one adult
2  1 adult, ten neonates nearby
3   Two adults and six neonates

> for(i in 1:nrow(df)){
+     
+     df$Adults[i] <- strapply(as.character(df$Text[i]), "(\\w+) adult*")
+     df$Neonates[i] <- strapply(as.character(df$Text[i]), "(\\w+) neonate*")
+     
+ }

> df
                           Text Adults Neonates
1 Three neonates with one adult    one    Three
2  1 adult, ten neonates nearby      1      ten
3   Two adults and six neonates    Two      six

ztl · Answer 6 · 2017-08-17T14:09:29+0000

R - ; -)

1 10 /, X adult(s) Y neonate(s) (.. ), :

df = data.frame(strings = c("Three neonates with one adult",
                            "1 adult, ten neonates nearby",
                            "Two adults and six neonates"))

littnums = c('one', 'two', 'three', 'four', 'five', 
             'six', 'seven', 'eight', 'nine', 'ten')
nums = 1:10

getnums = function(mystring, mypattern) {
  # split your string at all spaces
  mysplitstring = unlist(strsplit(mystring, split=' '))
  # The number you are looking for is before the pattern
  numBeforePattern = mysplitstring[grep(mypattern, mysplitstring) - 1]
  # Then convert it to a integer or, if it fails, translate it 
  ifelse(is.na(suppressWarnings(as.integer(numBeforePattern))), 
         nums[grep(tolower(numBeforePattern), littnums)], 
         as.integer(numBeforePattern))
}

df$Neonates = sapply(as.vector(df$strings), FUN=getnums, 'neonate')
df$Adults = sapply(as.vector(df$strings), FUN=getnums, 'adult')
df
#                         strings Neonates Adults
# 1 Three neonates with one adult        3      1
# 2  1 adult, ten neonates nearby       10      1
# 3   Two adults and six neonates        6      2

CPak · Answer 7 · 2017-08-17T21:34:12+0000

S <- c("Three neonates with one adult", "1 adult, ten neonates nearby", "Two adults and six neonates")

dplyr stringr

library(stringr)
library(dplyr)

searchfor <- c("neonates", "adult")         
words <- str_extract_all(S, boundary("word"))   # keep only words

This next statement will capture the word before all words searchforand save asdata.frame

chrnum <- as.data.frame(Reduce(cbind, lapply(searchfor, function(y) lapply(words, function(x) x[which(x %in% y)-1]))))

This next statement will str_replace_alluse a named vector and convert to a numeric

replaced <- chrnum %>% 
              mutate_all(funs(as.numeric(str_replace_all(tolower(.), c("one" = "1", "two" = "2", "three" = "3", "four" = "4", "five" = "5", "six" = "6", "seven" = "7", "eight" = "8", "nine" = "9", "ten" = "10"))))) %>%
              setNames(searchfor)

NOTE. You will receive a NA enforcement warning.

Output

  neonates adult
1        3     1
2       10     1
3        6    NA

Extract different words from character string in R

dplyr stringr

Output

More articles: