How to aggregate data.frame for row and column names based on hierarchical structure of dictionary names?

Question

How to aggregate data.frame for row and column names based on hierarchical structure of dictionary names?

(apologies, I was not sure what the best title for this post would be, feel free to edit).

Suppose I have the following relational structure between words and their type (for example, a dictionary):

# Dictionary mapping each word (level2) to its sentiment type (level1).
dictionary <- data.frame(
  level1 = rep(c("Positive", "Negative"), each = 3),
  level2 = c("happy", "fantastic", "great", "sad", "rubbish", "awful")
)
#     level1    level2
# 1 Positive     happy
# 2 Positive fantastic
# 3 Positive     great
# 4 Negative       sad
# 5 Negative   rubbish
# 6 Negative     awful

and we calculated their occurrences in seven documents (i.e., the term-document matrix):

 # Term-document matrix: how often each word occurs in each of 7 documents.
 #
 # The counts printed below were generated with the sampling algorithm used
 # before R 3.6.0. Pin that algorithm so the example reproduces exactly on
 # current R as well (RNGversion() warns about the old sampler, hence the
 # suppressWarnings()). Note that this changes the session's RNG settings.
 suppressWarnings(RNGversion("3.5.0"))
 set.seed(42)
 range <- 0:3
 # Arguments are evaluated in order, so doc1 receives the first six draws,
 # doc2 the next six, and so on.
 df <- data.frame(
   row.names = c("happy", "fantastic", "great", "sad", "rubbish", "awful"),
   doc1 = sample(x = range, size = 6, replace = TRUE),
   doc2 = sample(x = range, size = 6, replace = TRUE),
   doc3 = sample(x = range, size = 6, replace = TRUE),
   doc4 = sample(x = range, size = 6, replace = TRUE),
   doc5 = sample(x = range, size = 6, replace = TRUE),
   doc6 = sample(x = range, size = 6, replace = TRUE),
   doc7 = sample(x = range, size = 6, replace = TRUE)
 )
 #           doc1 doc2 doc3 doc4 doc5 doc6 doc7
 # happy        3    2    3    1    0    2    0
 # fantastic    3    0    1    2    2    3    0
 # great        1    2    1    3    1    1    3
 # sad          3    2    3    0    3    2    2
 # rubbish      2    1    3    3    1    0    1
 # awful        2    2    0    3    3    3    1

Then I can easily calculate how often two words appear in the same document (i.e., a match or adjacency matrix):

 # Binary indicator: 1 if the word occurs in the document at all, else 0.
 # This overwrites df in place.
 df[df > 0] <- 1
 # Sum co-occurrences: entry [i, j] is the number of documents that contain
 # both word i and word j. tcrossprod(x) computes x %*% t(x) and keeps the
 # row names of x on both dimensions.
 m <- tcrossprod(as.matrix(df))
 #           happy fantastic great sad rubbish awful
 # happy         5         4     5   4       4     4
 # fantastic     4         5     5   4       4     4
 # great         5         5     7   6       6     6
 # sad           4         4     6   6       5     5
 # rubbish       4         4     6   5       6     5
 # awful         4         4     6   5       5     6

Question: How can I restructure my match matrix so that I consider the type of word (level1) in the dictionary, and not just the words themselves (level2)?

i.e. I would like to:

 # Desired result, written out by hand: each cell adds up the nine entries of
 # the corresponding 3 x 3 sub-block of the word-by-word match matrix.
 data.frame(
   row.names = c("Positive", "Negative"),
   Positive = c(
     sum(c(5, 4, 5,
           4, 5, 5,
           5, 5, 7)),
     sum(c(4, 4, 6,
           4, 4, 6,
           4, 4, 6))
   ),
   Negative = c(
     sum(c(4, 4, 4,
           4, 4, 4,
           6, 6, 6)),
     sum(c(6, 5, 5,
           5, 6, 5,
           5, 5, 6))
   )
 )
 #          Positive Negative
 # Positive       45       42
 # Negative       42       48

What I have tried so far: I had hoped I could work out the approach from this question: "Combine columns based on data.frame by type of name".

However, although I can collapse the rows:

 # Attempt so far: collapse the ROWS of the match matrix m by word type.
 # library() is used instead of require() so a missing package fails loudly.
 library(data.table)
 dt <- data.table(m)
 # Label each row with its type; the rows of m are in dictionary order
 # (three Positive words followed by three Negative words).
 dt[, level1 := rep(c("Positive", "Negative"), each = 3)]
 # Sum every word column within each type.
 dt[, lapply(.SD, sum), by = "level1"]
 #      level1 happy fantastic great sad rubbish awful
 # 1: Positive    14        14    17  14      14    14
 # 2: Negative    12        12    18  16      16    16

I cannot figure out how to collapse the columns in the same way.

+6

r data.table

Tony breyal Oct 18 '13 at 15:20

source share

4 answers

Continuing from the step `df[df > 0] <- 1`:

 library(reshape)
 library(reshape2)
 library(data.table)

 # incorporating @RicardoSaporta suggestion of using
 # data.table(keep.rownames = TRUE): the row names of the match matrix end up
 # in a column called "rn"
 dt <- data.table(as.matrix(df) %*% t(as.matrix(df)), keep.rownames = TRUE)

 # reducing matrix format to plain data format (one row per word pair),
 # look at dt to see the change
 dt <- melt(dt, id.vars = "rn")

 # getting positive/negative for word1 (rn) and word2 (variable)
 dt <- merge(dt, dictionary, all.x = TRUE, by.y = "level2", by.x = "rn")
 dt <- merge(
   dt, dictionary,
   all.x = TRUE, by.y = "level2", by.x = "variable",
   suffixes = c("_1", "_2")
 )

 # getting counts for each positive/negative - positive/negative combination
 # (long format; the result is only printed, dt itself is left unchanged)
 dt <- data.table(dt)
 dt[, list(value = sum(value)), by = c("level1_1", "level1_2")]

 # structuring: the same totals as a type-by-type table
 cast(dt, level1_1 ~ level1_2, fun.aggregate = sum)

Output

 > cast(dt,level1_1~level1_2, fun.aggregate=sum) level1_1 Negative Positive 1 Negative 48 42 2 Positive 42 45

+6

TheComeOnMan Oct 18 '13 at 15:30

source share

You can go back a step and perform the aggregation on the binary term-document matrix (`df`, called `adj2` below) before creating the co-occurrence matrix:

 # Keyed lookup table: dict[<words>] returns the dictionary rows for them.
 dict <- data.table(dictionary, key = "level2")
 # Term-document table with the words kept in a column named "rn".
 adj2 <- data.table(df, keep.rownames = TRUE)
 # Sum the document columns within each word type.
 # NOTE(review): this relies on data.table leaving `rn` out of .SD because it
 # is referenced in `by`, and on the grouping column being auto-named "dict"
 # (it is read back as adj1$dict below) -- confirm with your data.table
 # version.
 adj1 <- adj2[, lapply(.SD, sum), by = dict[rn]$level1]
 # one tedious step: drop the grouping column and turn it into row names
 adj1mat <- as.matrix(adj1[, -1, with = FALSE])
 rownames(adj1mat) <- as.character(adj1$dict)
 # Type-by-type co-occurrence totals.
 m1 <- adj1mat %*% t(adj1mat)
 #          Positive Negative
 # Positive       45       42
 # Negative       42       48

Of course, this requires your dictionary to be stored as a data.table keyed on `level2`.

+4

Frank Oct 18 '13 at 16:52

source share

We can aggregate your matrix `m` by calling `by` twice. We just need to map the `level2` words to their `level1` types first, using `match`. I am sure this can be done in a single call, but I can't figure out how. Two calls are not so bad.

 # Match Positive and Negative to words: relabel both dimensions of m with
 # the level1 type of each level2 word. This overwrites the dimnames of m.
 colnames(m) <- dictionary$level1[match(colnames(m), dictionary$level2)]
 rownames(m) <- dictionary$level1[match(rownames(m), dictionary$level2)]

 # Aggregate down to desired result.
 # First pass: split the rows of m by type and take column sums per group.
 # colnames(m) can serve as the row grouping because m is square with the
 # same word order on both dimensions.
 tmp <- do.call(cbind, by(m, INDICES = colnames(m), FUN = colSums))
 # Second pass: collapse the remaining word dimension the same way.
 do.call(cbind, by(tmp, INDICES = rownames(m), FUN = colSums))
 #          Negative Positive
 # Negative       48       42
 # Positive       42       45

+3

Simon O'Hanlon Oct 18 '13 at 16:09

source share

eddi · Accepted Answer · 2013-10-18T16:18:10+0000

Essentially the same solution as the other two, but a little more compact and probably a little faster:

 library(reshape2)
 library(data.table)

 # Long format of the match matrix: one row per (Var1, Var2, value) word
 # pair. reshape2::melt is called explicitly because data.table also exports
 # a melt() generic, and m is a plain matrix rather than a data.table.
 mdt <- data.table(reshape2::melt(m), key = "Var1")
 dic <- data.table(dictionary, key = "level2")

 # Join the dictionary twice: dic[mdt] attaches the type of Var1, then
 # re-keying on Var2 and joining again attaches the type of Var2. Finally sum
 # `value` for every type-by-type combination.
 # NOTE(review): the second join yields two columns called level1; this code
 # expects the duplicate to be named `level1.1`. Newer data.table releases
 # appear to name it `i.level1` instead -- confirm against the installed
 # version and adjust the formula if needed.
 dcast(
   dic[setkey(dic[mdt], Var2)],
   level1 ~ level1.1,
   fun.aggregate = sum
 )
 #     level1 Negative Positive
 # 1 Negative       48       42
 # 2 Positive       42       45

How to aggregate data.frame for row and column names based on hierarchical structure of dictionary names?

More articles: