For example, a TermDocumentMatrix of bigrams (2-word terms) built with RWeka:
library("tm")
library("RWeka")
# Tokenizer that splits a document into bigrams: both the minimum and the
# maximum n-gram size handed to RWeka's NGramTokenizer are 2.
BigramTokenizer <- function(x) {
  NGramTokenizer(x, Weka_control(min = 2, max = 2))
}

# Build the term-document matrix using the bigram tokenizer.
# NOTE(review): `corpus` is defined outside this snippet; recent tm versions
# reportedly ignore custom tokenizer functions for SimpleCorpus objects --
# confirm that `corpus` is a VCorpus.
tdm <- TermDocumentMatrix(corpus, control = list(tokenize = BigramTokenizer))

# Drop sparse terms, using a sparsity threshold of 0.99.
tdm <- removeSparseTerms(tdm, 0.99)

print("----")
print("tdm properties")
str(tdm)

# Number of matrix rows corresponding to the requested percentage.
# `topN_percentage_wanted` is defined outside this snippet; the result is not
# rounded, so it may be fractional.
tdm_top_N_percent <- nrow(tdm) / 100 * topN_percentage_wanted
Alternatively, with a configurable n-gram range:
# Lower and upper bounds on the n-gram size used by the tokenizer below.
wmin <- 1
wmax <- 5

# Tokenizer producing n-grams whose size ranges from wmin to wmax.
# wmin and wmax are looked up when the tokenizer is called, not when it is
# defined, so changing them later changes the tokenizer's output.
BigramTokenizer <- function(x) {
  ngram_range <- Weka_control(min = wmin, max = wmax)
  NGramTokenizer(x, ngram_range)
}
, , "" .