-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.go
149 lines (137 loc) · 3.51 KB
/
main.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
package main
import (
"encoding/csv"
"fmt"
"math/rand"
"os"
"strconv"
"time"
)
// Numeric codes for the weather classes. setupData stores one of these under
// the "cuaca" key of every Condition it builds, chosen from the class label in
// the CSV row.
const (
// CUACA_HUJAN is the code used for rows labelled "Hujan" (rain).
CUACA_HUJAN = 1.0
// CUACA_CERAH is the code used for rows labelled "Cerah" (clear); setupData
// also uses it as the default when the label is not recognised.
CUACA_CERAH = 2.0
// CUACA_MENDUNG is the code used for rows labelled "Berawan" (cloudy).
CUACA_MENDUNG = 3.0
)
// Run configuration.
var (
// testPercentage is the probability with which setupData assigns a row to
// the test dataset instead of the training dataset.
testPercentage = 0.1
// datafile is the path of the CSV file loaded by main.
datafile = "data-cuaca.csv"
// threshold is passed to NewClassifier. For example, if threshold is 1.5
// the category with the highest probability needs to be 1.5 times higher
// than the second highest probability. If the top category fails the
// threshold the input is classified as "unknown".
threshold = 1.1
)
// document is one labelled dataset entry. setupData fills time from CSV
// column 0 and class from CSV column 1 (the weather label).
type document struct {
time string
class string
}
// Condition maps a feature name to its numeric value. setupData populates the
// keys "dmin", "dmax", "tmin", "tmax" and "cuaca".
type Condition map[string]float64
// The dataset is split into a training part and a test part by setupData.

// train holds the documents used to train the classifier.
var train []document
// test holds the documents held out for validation.
var test []document
// categories lists the class labels handed to NewClassifier.
var categories = []string{"Hujan", "Berawan", "Cerah"}
// main loads the dataset from datafile, trains the classifier on the training
// split and prints the classification accuracy (and the share of "unknown"
// answers) for the test split and for the first 100 training documents.
//
// It relies on nb.datatest[i] / nb.datatrain[i] holding the features of
// test[i] / train[i]; setupData adds a document and its features in the same
// step, so the indices are expected to line up.
func main() {
	nb := NewClassifier(categories, threshold)
	nb.setupData(datafile)
	fmt.Println("Data file used:", datafile)
	fmt.Println("no of docs in TRAIN dataset:", len(train))
	fmt.Println("no of docs in TEST dataset:", len(test))

	// percent expresses part as a percentage of total. An empty dataset
	// yields 0 instead of the NaN a plain float division would print.
	percent := func(part, total int) float64 {
		if total == 0 {
			return 0
		}
		return float64(part) * 100 / float64(total)
	}

	// Train on the training dataset.
	for _, doc := range train {
		nb.Train(doc.class, doc.time)
	}

	// Validate on the test dataset.
	count, accurates, unknowns := 0, 0, 0
	for i, doc := range test {
		count++
		features := nb.datatest[i]
		predicted := nb.Classify(doc.time, features["dmin"], features["dmax"],
			features["tmin"], features["tmax"])
		if predicted == doc.class {
			accurates++
		}
		if predicted == "unknown" {
			unknowns++
		}
	}
	fmt.Printf("Accuracy on TEST dataset is %2.1f%% with %2.1f%% unknowns",
		percent(accurates, count), percent(unknowns, count))

	// Validate on the first 100 docs of the training dataset. The sample is
	// clamped to the number of docs actually loaded: slicing train[0:100]
	// on a shorter slice would panic.
	sample := train
	if len(sample) > 100 {
		sample = sample[:100]
	}
	count, accurates, unknowns = 0, 0, 0
	for i, doc := range sample {
		count++
		features := nb.datatrain[i]
		predicted := nb.Classify(doc.time, features["dmin"], features["dmax"],
			features["tmin"], features["tmax"])
		if predicted == doc.class {
			accurates++
		}
		if predicted == "unknown" {
			unknowns++
		}
	}
	fmt.Printf("\nAccuracy on TRAIN dataset is %2.1f%% with %2.1f%% unknowns",
		percent(accurates, count), percent(unknowns, count))
}
// setupData reads the CSV file at path file and randomly splits its rows
// between the package-level train and test slices. For every accepted row the
// matching feature set is handed to the classifier through addDataTrain or
// addDataTest.
//
// Column layout read from each row: 0 = time, 1 = class label, 2 = dmin,
// 3 = dmax, 4 = tmin, 5 = tmax. Rows with fewer than six columns, or with a
// feature column that does not parse as a float, are skipped.
//
// Failures to open or read the file are reported on stderr and leave the
// datasets untouched; the method has no error return.
func (c *Classifier) setupData(file string) {
	// Seed from the clock so that the train/test split differs between runs.
	rand.Seed(time.Now().UTC().UnixNano())

	f, err := os.Open(file)
	if err != nil {
		fmt.Fprintf(os.Stderr, "setupData: cannot open %q: %v\n", file, err)
		return
	}
	defer f.Close()

	csvData, err := csv.NewReader(f).ReadAll()
	if err != nil {
		fmt.Fprintf(os.Stderr, "setupData: cannot read %q: %v\n", file, err)
		return
	}

	for _, row := range csvData {
		// Guard the fixed column indices used below.
		if len(row) < 6 {
			continue
		}
		dmin, err := strconv.ParseFloat(row[2], 64)
		if err != nil {
			continue
		}
		dmax, err := strconv.ParseFloat(row[3], 64)
		if err != nil {
			continue
		}
		tmin, err := strconv.ParseFloat(row[4], 64)
		if err != nil {
			continue
		}
		tmax, err := strconv.ParseFloat(row[5], 64)
		if err != nil {
			continue
		}

		waktu, class := row[0], row[1]

		// Unrecognised labels keep the CUACA_CERAH default.
		cuaca := CUACA_CERAH
		switch class {
		case "Berawan":
			cuaca = CUACA_MENDUNG
		case "Hujan":
			cuaca = CUACA_HUJAN
		}

		features := Condition{
			"dmin":  dmin,
			"dmax":  dmax,
			"tmin":  tmin,
			"tmax":  tmax,
			"cuaca": cuaca,
		}

		// Split the data: roughly testPercentage of the rows go to the test
		// dataset, the rest to the training dataset.
		if rand.Float64() > testPercentage {
			train = append(train, document{waktu, class})
			c.addDataTrain(features)
		} else {
			test = append(test, document{waktu, class})
			c.addDataTest(features)
		}
	}
}