FUN-error after launching "tolower" when creating Twitter wordcloud

I am trying to create a wordcloud from Twitter data, but I get the following error:

Error in FUN(X[[72L]], ...) : 
  invalid input '            ❤             "@xxx:bla, bla, bla... http://t.co/56Fb78aTSC"' in 'utf8towcs' 

This error appears after running the line "mytwittersearch_corpus <- tm_map (mytwittersearch_corpus, tolower)" in the following code:

# Extract the text of every status object in the search result.
mytwittersearch_list <- sapply(mytwittersearch, function(x) x$getText())

# Build the corpus from the vector created on the previous line
# (mytwittersearch_list); no object named mytwittersearch_corpus_list exists.
mytwittersearch_corpus <- Corpus(VectorSource(mytwittersearch_list))
# This is the call that raises the 'utf8towcs' error quoted above.
mytwittersearch_corpus <- tm_map(mytwittersearch_corpus, tolower)
mytwittersearch_corpus <- tm_map(mytwittersearch_corpus, removePunctuation)
mytwittersearch_corpus <- tm_map(mytwittersearch_corpus, function(x) removeWords(x, stopwords()))

I read on other pages that this may be because R has difficulty handling symbols, emoticons and letters in non-English languages, but that does not seem to be what is wrong with the “error tweets” that R has problems with. I ran the following code:

# Attempt 1: apply the re-encoding function (enc2utf8() followed by iconv())
# to every document directly through tm_map().
mytwittersearch_corpus <- tm_map(mytwittersearch_corpus, function(x) iconv(enc2utf8(x), sub = "byte"))
# Attempt 2: the same re-encoding, this time wrapped in content_transformer().
# NOTE(review): this call passes sub = "bytes" while the call above passes
# sub = "byte"; iconv() documents "byte" as the special value that inserts
# <xx> hex codes, so "bytes" is presumably a typo - confirm.
mytwittersearch_corpus<- tm_map(mytwittersearch_corpus, content_transformer(function(x)    iconv(enc2utf8(x), sub = "bytes")))

This does not help. I also get an error that the function content_transformer cannot be found, even though the tm package is loaded and running.

I am running this on OS X 10.6.8 and using the latest version of RStudio.

+5
8

, :

# Convert each tweet's text from latin1 to ASCII; anything that cannot be
# represented in ASCII is replaced by the empty string (sub = "").
tweets$text <- sapply(
  tweets$text,
  function(txt) iconv(txt, from = "latin1", to = "ASCII", sub = "")
)
+10

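Here is an example of creating a wordcloud from Twitter data. In it, tolower is passed as a control option to TermDocumentMatrix, which is built from the Twitter text.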

library(twitteR)
library(tm)
library(wordcloud)
library(RColorBrewer)
library(ggplot2)


# Search Twitter for 50 English-language tweets matching "new year"
tweets <- searchTwitter("new year", n = 50, lang = "en")

# Pull the text out of every returned status object
tweetTxt <- sapply(tweets, function(status) status$getText())

# tm keeps documents in a Corpus, so wrap the texts in one
myCorpus <- Corpus(VectorSource(tweetTxt))

# Build the term-document matrix; all cleaning steps (punctuation, numbers,
# stop words, lower-casing) are requested through the control list
tdm <- TermDocumentMatrix(
  myCorpus,
  control = list(
    removePunctuation = TRUE,
    stopwords = c("new", "year", stopwords("english")),
    removeNumbers = TRUE,
    tolower = TRUE
  )
)

# Plain matrix with one row per term
m <- as.matrix(tdm)

# Total count per term, most frequent first
word_freqs <- sort(rowSums(m), decreasing = TRUE)

# Terms and their counts side by side
dm <- data.frame(word = names(word_freqs), freq = word_freqs)

# Draw the cloud, most frequent words placed first
wordcloud(
  dm$word,
  dm$freq,
  random.order = FALSE,
  colors = brewer.pal(8, "Dark2")
)

enter image description here

+2

Have you tried updating tm and using stri_trans_tolower from stringi?

library(twitteR)
library(tm)
library(stringi)
# Authenticate with the Twitter API (placeholder credentials).
setup_twitter_oauth("CONSUMER_KEY", "CONSUMER_SECRET")
# Fetch one status by its id.
# NOTE(review): the id is written as a numeric literal; ids of this size can
# exceed the exactly-representable range of doubles - consider passing it as
# a string and confirm against the twitteR documentation.
mytwittersearch <- showStatus(551365749550227456) 
# Extract the tweet text and wrap it in a corpus.
mytwittersearch_list <- mytwittersearch$getText()
mytwittersearch_corpus <- Corpus(VectorSource(mytwittersearch_list))

# Base tolower() wrapped in content_transformer(); the error printed just
# below is the result of this call.
mytwittersearch_corpus <- tm_map(mytwittersearch_corpus, content_transformer(tolower))
# Error in FUN(content(x), ...) : 
#   invalid input 'í ½í±…í ¼í¾¯â¤í ¼í¾§í ¼í½œ "@comScore: Nearly half of #Millennials do at least some of their video viewing from a smartphone or tablet: http://t.co/56Fb78aTSC"' in 'utf8towcs'

# Same lower-casing step, but using stringi's stri_trans_tolower() in place
# of base tolower(); the inspect() output below shows the lower-cased text.
mytwittersearch_corpus <- tm_map(mytwittersearch_corpus, content_transformer(stri_trans_tolower))
inspect(mytwittersearch_corpus)
# <<VCorpus (documents: 1, metadata (corpus/indexed): 0/0)>>
#   
# [[1]]
# <<PlainTextDocument (metadata: 7)>>
# <ed><U+00A0><U+00BD><ed><U+00B1><U+0085><ed><U+00A0><U+00BC><ed><U+00BE><U+00AF><U+2764><ed><U+00A0><U+00BC><ed><U+00BE><U+00A7><ed><U+00A0><U+00BC><ed><U+00BD><U+009C> "@comscore: nearly half of #millennials do at least some of their video viewing from a smartphone or tablet: http://t.co/56fb78atsc"
+2

, , , wordcloud tm.

, , , , , .

, wordcloud

 Error in FUN(content(x), ...) : in 'utf8towcs'

- :

words.corpus <- tm_map(words.corpus, tolower)

words.corpus <- tm_map(words.corpus, content_transformer(tolower))

, , :

# Draw a word cloud from a character vector of texts and return the recorded
# plot.
#
# Arguments:
#   words        character vector of texts (for example tweets).
#   max_words    maximum number of words to draw.
#   remove_words character vector of words to drop before plotting; empty
#                strings are ignored, so the default "" removes nothing.
#   n_colors     number of colours requested from the RColorBrewer palette.
#   palette      name of the RColorBrewer palette.
#
# Returns the value of recordPlot() taken right after the cloud is drawn.
plot_wordcloud <- function(words, max_words = 70, remove_words ="",
                           n_colors = 5, palette = "Set1")
{
    # library() stops with an error when a package is missing, whereas
    # require() only returns FALSE and lets the failure surface later.
    library(dplyr)        # for %>%
    library(wordcloud)
    library(RColorBrewer) # for brewer.pal()
    library(tm)           # for Corpus(), tm_map(), removeWords()

    # Treat the input as ASCII while converting to UTF-8: every byte that is
    # not valid ASCII is replaced by its hex code in the form <xx>
    # (sub="byte"), so tolower() below never sees invalid multibyte input.
    words <- iconv(words, "ASCII", "UTF-8", sub="byte")

    # Build the corpus from the cleaned text inside the function instead of
    # relying on a words.corpus object existing in the calling environment.
    words.corpus <- Corpus(VectorSource(words))
    words.corpus <- tm_map(words.corpus, content_transformer(tolower))

    # Honour remove_words; the text is already lower-cased, so the words to
    # remove are lower-cased as well. Empty strings are skipped.
    drop_words <- tolower(remove_words[nzchar(remove_words)])
    if (length(drop_words) > 0) {
        words.corpus <- tm_map(words.corpus, removeWords, drop_words)
    }

    wc <- wordcloud(words=words.corpus, max.words=max_words,
                    random.order=FALSE,
                    colors = brewer.pal(n_colors, palette),
                    random.color = FALSE,
                    scale=c(5.5,.5), rot.per=0.35) %>% recordPlot
    return(wc)
}

:

words.corpus <- Corpus(VectorSource(words))

:

UTF-8 :

# Re-encode every element; bytes that cannot be converted are replaced by
# their <xx> hex code (sub = "byte"). vapply() guarantees a character vector
# even when `words` is empty.
words <- vapply(words, function(x) iconv(enc2utf8(x), sub = "byte"),
                character(1))

# Declare the whole vector as UTF-8 in one call; `Encoding<-` is vectorised,
# so no index loop is needed (and 1:length(words) would misbehave on empty
# input).
Encoding(words) <- "UTF-8"

:

UTF-8 :

    # NOTE(review): the explanation that accompanied these lines is missing,
    # so it is unclear which calls are recommended and which are failed
    # attempts - confirm before copying any of them.

    # Remove the words listed in remove_words from every document.
    words.corpus <- tm_map(words.corpus, removeWords, remove_words)

    # Lower-case with stringi's stri_trans_tolower(), wrapped in
    # content_transformer().
    words.corpus <- tm_map(words.corpus, content_transformer(stringi::stri_trans_tolower))

    # Apply iconv(x, to='UTF-8') to every document (no content_transformer()).
    words.corpus <- tm_map(words.corpus, function(x) iconv(x, to='UTF-8'))

    # Apply enc2utf8() to every document (no content_transformer()).
    words.corpus <- tm_map(words.corpus, enc2utf8)

    # Apply base tolower() to every document (no content_transformer()).
    words.corpus <- tm_map(words.corpus, tolower)

, , , . . , , , . , :

    words <- iconv(words, "ASCII", "UTF-8", sub="byte")

: : http://www.textasdata.com/2015/02/encoding-headaches-emoticons-and-rs-handling-of-utf-816/

+2

RStudio . , , tolower/content_transformer. - , tm- tm_map, , , . , !

0

corp <- tm_map(corp, content_transformer(tolower), mc.cores=1)

corp <- tm_map(corp, tolower, mc.cores=1)
0

, , , , amazon aws, apps.io, , "", santé , csv . , .txt utf-8 , , csv, txt. R 3.2.1, Rstudio - 0.99.465

0

, ( Twitter). iconv(x, "latin1", "UTF-8"), Encoding() , UTF-8.

0

Source: https://habr.com/ru/post/1569824/


All Articles