本次文本挖掘以 tm 包为基础,数据集是奥巴马 2010–2016 年的国情咨文(State of the Union)演讲文本。数据链接:https://github.com/datameister66/data
# Text mining of the State of the Union speech files with the tm package.
#
# Pipeline: read every text file in a directory into a corpus, normalise the
# text, build a document-term matrix, then explore term frequencies, term
# associations and a word cloud.
#
# Lines starting with `#>` are console output recorded from the author's run;
# they are kept for reference and are not executed.

library(tm)
library(wordcloud)  # also attaches RColorBrewer, which provides brewer.pal()

# ---- Load the corpus ----

# Directory holding one plain-text file per speech.
name <- file.path("/Users/mac/rstudio-workplace/txtData")

dir(name)
#> [1] "sou2010.txt" "sou2011.txt" "sou2012.txt" "sou2013.txt" "sou2014.txt"
#> [6] "sou2015.txt" "sou2016.txt"

length(dir(name))
#> [1] 7

docs <- Corpus(DirSource(name))
inspect(docs[1])

# ---- Clean the text ----

# tolower() is a plain character function, so it is wrapped in
# content_transformer() to keep the corpus elements as text documents.
docs <- tm_map(docs, content_transformer(tolower))
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, removeWords, stopwords("english"))

# Punctuation was stripped before the stop-word pass, so contractions such as
# "don't" have already become "dont" and are not matched by the standard
# English stop-word list; they are removed explicitly here, together with
# transcript noise such as "applause".
docs <- tm_map(
  docs,
  removeWords,
  c(
    "applause", "can", "cant", "will", "that",
    "weve", "dont", "wont", "youll", "youre"
  )
)

# Collapse whitespace last so the gaps left by every removal step are cleaned.
docs <- tm_map(docs, stripWhitespace)

# ---- Build the document-term matrix ----

# R is case-sensitive: the constructor is DocumentTermMatrix().
dtm <- DocumentTermMatrix(docs)

dim(dtm)
#> [1]    7 4715

inspect(dtm)
#> <<DocumentTermMatrix (documents: 7, terms: 4715)>>
#> Non-/sparse entries: 10899/22106
#> Sparsity           : 67%
#> Maximal term length: 17
#> Weighting          : term frequency (tf)
#> Sample             :
#>              Terms
#> Docs          america american jobs make new now people thats work years
#>   sou2010.txt      18       18   23   14  20  30     32    26   21    20
#>   sou2011.txt      18       19   25   23  36  25     31    24   20    25
#>   sou2012.txt      30       34   34   15  27  26     21    24   16    18
#>   sou2013.txt      24       19   32   20  24  35     18    18   20    22
#>   sou2014.txt      28       21   23   22  29  11     24    19   27    21
#>   sou2015.txt      35       19   18   23  41  15     22    30   20    25
#>   sou2016.txt      21       16    8   17  16  15     21    29   20    17

inspect(dtm[1:3, 1:3])

# ---- Term frequencies ----

# Total count of every term across all documents.
freq <- colSums(as.matrix(dtm))
head(freq)
#>      abide    ability       able     abroad absolutely     abuses
#>          1          4         14         13          4          1

# Column indices ordered from the most to the least frequent term.
ord <- order(-freq)
head(ord)
#> [1]  913   60 1386  991  755  922

# Most frequent terms.
freq[head(ord)]
#>     new america   thats  people    jobs     now
#>     193     174     170     169     163     157

# Least frequent terms.
freq[tail(ord)]

# Frequency of frequencies: how many terms occur exactly k times.
head(table(freq))
#> freq
#>    1    2    3    4    5    6
#> 2226  788  382  234  142  137

tail(table(freq))
#> freq
#> 157 163 169 170 174 193
#>   1   1   1   1   1   1

# Terms that occur at least 125 times in total.
findFreqTerms(dtm, 125)
#>  [1] "america"   "american"  "americans" "jobs"      "make"      "new"
#>  [7] "now"       "people"    "thats"     "work"      "year"      "years"

# ---- Term associations ----

# Terms whose per-document counts correlate with "job" at 0.9 or above.
findAssocs(dtm, "job", corlimit = 0.9)
#> $job
#>     wrong pollution    forces  together   achieve  training
#>      0.97      0.96      0.93      0.93      0.93      0.91

# ---- Word cloud ----

# Plot terms that appear at least 70 times.
wordcloud(
  names(freq),
  freq,
  min.freq = 70,
  scale = c(3, 0.3),
  colors = brewer.pal(6, "Dark2")
)