# Analyzing Text Data with R (on Mac)
# For macOS: load the text-mining package and read the raw text.
library(tm)
# Only `stringsAsFactors` is set. The former `header` and `FileEncoding`
# entries were dropped: they are arguments of read.table()/read.csv(), not
# options that readLines() or iconv() consult, so setting them had no effect.
options(stringsAsFactors = FALSE)

# Read data: pick the input file interactively; one element per line.
text <- readLines(file.choose())
# Re-encode from the native encoding to the macOS UTF-8 variant.
# NOTE(review): "utf-8-mac" is a macOS iconv name and is presumably not
# available on other platforms; elements iconv() cannot convert become NA.
corpus <- iconv(text, to = "utf-8-mac")
# Wrap the character vector in a corpus (each element becomes a document).
corpus <- Corpus(VectorSource(corpus))
# Clean-up: normalise the text before terms are counted.
# tolower() is a plain character function, not a tm transformation, so it is
# wrapped in content_transformer() to keep the documents valid tm documents.
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
# Drop common English stop words; the result is kept under a new name.
cleanset <- tm_map(corpus, removeWords, stopwords("english"))
# Optional steps, disabled by default:
#removeURL <- function(x) gsub("http[[:alnum:]]*", "", x)
#cleanset <- tm_map(cleanset, content_transformer(removeURL))
#cleanset <- tm_map(cleanset, removeWords, c("two", "five"))
# Collapse the runs of blanks left behind by the removals above.
cleanset <- tm_map(cleanset, stripWhitespace)
# Term Document Matrix built from the cleaned corpus.
# `wordLengths` is the control that bounds term length; c(1, Inf) keeps terms
# of every length. The previous `minWordLength` name is not a recognised
# control, so tm's default minimum term length applied instead.
tdm <- TermDocumentMatrix(cleanset, control = list(wordLengths = c(1, Inf)))
# Inspect frequent words: terms occurring at least 20 times.
findFreqTerms(tdm, lowfreq = 20)
# Bar plot of the terms whose total count is at least 20.
# Summing each row of the dense matrix gives one total per term.
termFrequency <- rowSums(as.matrix(tdm))
# Keep only the frequent terms (names are preserved by the indexing).
termFrequency <- termFrequency[termFrequency >= 20]
# las = 2 draws the axis labels perpendicular to the axis.
barplot(termFrequency, las = 2, col = rainbow(20))
# Word cloud of the most frequent terms.
library(wordcloud)
m <- as.matrix(tdm)
# Total count per term, most frequent first.
wordFreq <- sort(rowSums(m), decreasing = TRUE)
set.seed(375) # to make it reproducible
# Only terms seen at least 10 times are drawn. random.order = FALSE plots
# words in decreasing frequency; FALSE is spelled out because `F` is an
# ordinary variable that can be reassigned.
wordcloud(words = names(wordFreq), freq = wordFreq, min.freq = 10,
          random.order = FALSE)