This turned out clunky, so there is probably a better approach:
# Word co-occurrence counts: for every pair of distinct words, count the
# number of sentences that contain both words.
library(qdapTools)
library(tidyr)
# NOTE(review): bag_o_words() appears to be exported by qdap rather than
# qdapTools -- confirm that qdap is attached before running this.

# Example data: one sentence per row, words separated by spaces.
# strip.white = TRUE drops the blank that follows each comma.
dat <- read.csv(
  text = paste(
    "sentence_id, text",
    "1, a b c d e",
    "2, a b b e",
    "3, b c d",
    "4, a e",
    sep = "\n"
  ),
  header = TRUE,
  strip.white = TRUE
)

# Tabulate the words of each sentence, then reduce the counts to a logical
# presence flag (`> 0`) so a word repeated within one sentence counts once.
# After t() the words are the rows.
x <- t(mtabulate(with(dat, by(text, sentence_id, bag_o_words))) > 0)

# Presence matrix times its transpose: entry [i, j] is the number of
# sentences in which word i and word j both occur.
out <- x %*% t(x)

# Blank out the diagonal and upper triangle so each unordered pair of
# distinct words is reported exactly once.
out[upper.tri(out, diag = TRUE)] <- NA

# Reshape to long form (word1, word2, freq) and drop the blanked cells.
# gather() is superseded by pivot_longer() in current tidyr; it is kept here
# because it stacks column by column, which is the row order shown below.
out2 <- matrix2df(out, "word1") %>%
  gather(word2, freq, -word1) %>%
  na.omit()

rownames(out2) <- NULL
out2
##    word1 word2 freq
## 1      b     a    2
## 2      c     a    1
## 3      d     a    1
## 4      e     a    3
## 5      c     b    2
## 6      d     b    2
## 7      e     b    2
## 8      d     c    2
## 9      e     c    1
## 10     e     d    1
Base R-only solution:
# Word co-occurrence counts using base R only.
# Reads the data frame `dat` (columns `sentence_id` and `text`), which must
# already exist, and produces `out4`: one row per unordered pair of distinct
# words with the number of sentences that contain both.

# Split the text by sentence and tokenise on runs of whitespace. unlist()
# keeps the words of every row when a sentence_id spans several rows.
out <- lapply(
  split(as.character(dat[["text"]]), dat[["sentence_id"]]),
  function(sentence) {
    unlist(strsplit(gsub("^\\s+|\\s+$", "", sentence), "\\s+"))
  }
)

# Sorted vocabulary across all sentences.
nms <- sort(unique(unlist(out)))

# Word-by-sentence presence matrix: TRUE where the word occurs in the
# sentence. Built with matrix() so it stays two-dimensional even when the
# vocabulary holds a single word.
x <- matrix(
  unlist(lapply(out, function(words) nms %in% words), use.names = FALSE),
  nrow = length(nms),
  ncol = length(out),
  dimnames = list(nms, names(out))
)

# Presence matrix times its transpose: entry [i, j] is the number of
# sentences in which word i and word j both occur.
out3 <- x %*% t(x)

# Blank out the diagonal and upper triangle so each unordered pair of
# distinct words is reported exactly once.
out3[upper.tri(out3, diag = TRUE)] <- NA

# Unroll the matrix column by column into long form and drop the blanked
# cells. Pairs that never co-occur are kept with freq 0.
out4 <- na.omit(data.frame(
  word1 = rep(rownames(out3), times = ncol(out3)),
  word2 = rep(colnames(out3), each = nrow(out3)),
  freq = as.vector(out3),
  stringsAsFactors = FALSE
))
row.names(out4) <- NULL
out4
source share