, . 1: " ", , / /, . Jaccard , . 2: , apple/apples vs. apple/orange, , .
library(reshape2)
library(proxy)
attributes <- c("apple-water-orange", "apple-water", "apple-orange", "coffee",
"coffee-croissant", "green-red-yellow", "green-red-blue",
"green-red","black-white","black-white-purple")
dat <- data.frame(attr=attributes, row.names = paste("id", seq_along(attributes), sep=""))
attributesList <- strsplit(attributes, "-")
df <- data.frame(id=paste("id", rep(seq_along(attributesList), sapply(attributesList, length)), sep=""),
word=unlist(attributesList))
df.wide <- dcast(data=df, word ~ id, length)
rownames(df.wide) <- df.wide[, 1]
df.wide <- as.matrix(df.wide[, -1])
df.dist <- dist(t(df.wide), method="jaccard")
plot(hclust(df.dist))
abline(h=c(0.6, 0.8))
heatmap.2(df.wide, trace="none", col=rev(heat.colors(15)))
res <- merge(dat, data.frame(cat1=cutree(hclust(df.dist), h=0.8)), by="row.names")
res <- merge(res, data.frame(cat2=cutree(hclust(df.dist), h=0.6)), by.y="row.names", by.x="Row.names")
res
, , , .


, () "Smith-Waterman"
Biostrings Bioconductor. SW ( ) (). cutree , .
library(Biostrings)
strList <- lapply(attributes, BString)
swDist <- matrix(apply(expand.grid(seq_along(strList), seq_along(strList)), 1, function(x) {
pairwiseAlignment(strList[[x[1]]], strList[[x[2]]], type="local")@score
}), nrow = 10)
heatmap.2(swDist, trace="none", col = rev(heat.colors(15)),
labRow = paste("id", 1:10, sep=""), labCol = paste("id", 1:10, sep=""))
