Bigram.R
#create a vector of all the packages you want to load
packs <- c("tm","tau","qdap","RWeka","e1071","caret")
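#require() only returns FALSE for packages that are not installed; a minimal sketch,
#assuming a CRAN mirror is configured, for installing any missing ones before loading:
missing_packs = packs[!packs %in% rownames(installed.packages())]
if (length(missing_packs) > 0) install.packages(missing_packs)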
lapply(packs, require, character.only = TRUE)
#load the dataset as a matrix
inputText = read.csv(file = "pathToFile/AirlineTweets.csv", header = TRUE, sep = ",")
input=as.matrix(inputText)
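#the script assumes column 1 of the csv holds the tweet text and column 2 the sentiment label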
# remove retweet entities
some_txt = gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", inputText[,1])
# remove at people e.g. @ArabWic
some_txt = gsub("@\\w+", "", some_txt)
# remove punctuation
some_txt = gsub("[[:punct:]]", "", some_txt)
# remove numbers
some_txt = gsub("[[:digit:]]", "", some_txt)
# remove html links
some_txt = gsub("http\\w+", "", some_txt)
# collapse runs of spaces and tabs into a single space
some_txt = gsub("[ \t]{2,}", " ", some_txt)
some_txt = gsub("^\\s+|\\s+$", "", some_txt)
#convert some_txt into a corpus and remove stopwords and punctuation
corp=Corpus(VectorSource(some_txt))
#make sure we have a VCorpus: custom tokenizers are ignored when tm builds a SimpleCorpus
corp=as.VCorpus(corp)
corp=tm_map(corp, removeWords, stopwords('english'))
inspect(corp)
corp <- tm_map(corp, removePunctuation)
inspect(corp)
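#note (not in the original pipeline): removing words leaves extra spaces behind; tm's
#stripWhitespace transformation can collapse them, e.g. corp <- tm_map(corp, stripWhitespace)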
#create the bigrams and Term document matrix
BigramTokenizer <- function(y) NGramTokenizer(y, Weka_control(min = 2, max = 2))
tdm <- TermDocumentMatrix(corp, control = list(tokenize = BigramTokenizer))
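#rows of tdm are bigrams ("word1 word2") and columns are tweets; for a larger corpus,
#rare bigrams can be dropped first to keep the dense matrix small, e.g. tdm <- removeSparseTerms(tdm, 0.999)
dim(tdm)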
#convert the term-document matrix into a regular matrix (inspect() only prints a preview in newer tm versions)
m = as.matrix(tdm)
library(wordcloud)
set.seed(1234)
v = sort(rowSums(m), decreasing = TRUE)
#you can change the frequency of the terms to a number of your choice.
wordcloud(names(v), v, min.freq = 10)
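#the most frequent bigrams can also be listed directly with findFreqTerms(tdm, lowfreq = 10)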
#create a dataframe to hold our data: one row per tweet, one column per bigram
tdf=as.data.frame(t(m))
nrow(tdf)
#append the sentiment label from column 2 of the original file as the class
tdf=cbind(tdf,input[,2])
len=ncol(tdf)
#use syntactically valid column names so the Class ~ . formula can expand them
header=c(paste0("V",1:(len-1)),"Class")
colnames(tdf)=header
#svm() needs a factor response to do classification
tdf$Class=as.factor(tdf$Class)
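#check the class distribution before modelling
table(tdf$Class)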
#create svm model using the train and test sets
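#note: the 1800/200 split assumes the file has at least 2000 rows; adjust the indices to your data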
trainset <- tdf[1:1800,]
testset <- tdf[1801:2000,]
svm.model <- svm(Class ~ ., data = trainset, cost = 100, gamma = 1)
svm.pred <- predict(svm.model, testset[,-len])
pred <- factor(svm.pred)
xtab <- table(pred, testset[,len])
confusionMatrix(xtab)
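#cost = 100 and gamma = 1 are ad hoc choices; e1071's tune.svm() can grid-search them, e.g.
#tune.svm(Class ~ ., data = trainset, gamma = 10^(-2:0), cost = 10^(0:2))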