-
Notifications
You must be signed in to change notification settings - Fork 0
/
classification.jl
75 lines (61 loc) · 3.19 KB
/
classification.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#=
classification:
- Julia version:
- Author: yusufrahadika
- Date: 2019-11-20
=#
using StringEncodings
include("preprocessing.jl")
include("weighting.jl")
"""
    Classification

Mutable state of the naive Bayes text classifier defined in this file.

# Fields
- `weighting_instance`: the `Weighting` object that `train` loads the preprocessed
  training texts into and whose `features`, `documents` and `tf` are read later.
- `file_classes`: class label of every training file, in the order given to `train`.
- `unique_file_classes`: distinct labels of `file_classes`; a label's position here is
  its column index in `naive_bayes_model`.
- `naive_bayes_model`: occurrence counts with one row per feature and one column per
  unique class.
"""
mutable struct Classification
    weighting_instance::Weighting
    file_classes::Vector{String}
    unique_file_classes::Vector{String}
    naive_bayes_model::Array{Int64, 2}
end
"""
    Classification()

Create an untrained classifier: a fresh `Weighting`, no class labels and an empty
0×0 count matrix.
"""
function Classification()
    no_labels = String[]
    no_unique_labels = String[]
    empty_model = zeros(Int64, 0, 0)
    return Classification(Weighting(), no_labels, no_unique_labels, empty_model)
end
"""
    train(classification, file_names, file_classes)

Fit `classification` on labelled training files.

Every file in `file_names` is read as ISO-8859-1 text, preprocessed and handed to the
weighting instance via `setText`. The count matrix `naive_bayes_model` is then rebuilt:
entry `[row, column]` is the number of times feature `row` occurs in the documents
labelled with unique class `column`.

# Arguments
- `classification`: classifier to update in place (`file_classes`,
  `unique_file_classes`, `naive_bayes_model` and the weighting instance are overwritten).
- `file_names`: paths of the training files.
- `file_classes`: class label of each file, in the same order as `file_names`.

# Returns
`nothing`.

# Throws
- `DimensionMismatch` if `file_names` and `file_classes` differ in length.
"""
function train(classification::Classification, file_names::Array{String, 1}, file_classes::Array{String, 1})
    # The zip below would silently drop the unmatched tail and train on misaligned
    # labels, so reject inconsistent input before any state is modified.
    if length(file_names) != length(file_classes)
        throw(DimensionMismatch(
            "got $(length(file_names)) file names but $(length(file_classes)) class labels"
        ))
    end

    weighting = classification.weighting_instance
    setText(weighting, [
        Preprocessing.preprocess(read(file_name, String, enc"ISO_8859-1"))
        for file_name in file_names
    ])

    classification.file_classes = file_classes
    classification.unique_file_classes = unique(file_classes)
    classes = classification.unique_file_classes

    # Start from an all-zero features × classes matrix; the loop below fills it in place.
    model = zeros(Int64, length(weighting.features), length(classes))
    classification.naive_bayes_model = model

    for (class, document) in zip(file_classes, weighting.documents)
        # Always found: `classes` was derived from `file_classes` just above.
        column_index = findfirst(isequal(class), classes)
        for (row_index, feature) in enumerate(weighting.features)
            model[row_index, column_index] += count(==(feature), document)
        end
    end
    return nothing
end
"""
    test(classification, file_names)

Predict a class label for every file in `file_names` with the trained model.

Each file is read as ISO-8859-1 text and preprocessed; its distinct tokens that are
known training features contribute one Laplace-smoothed likelihood term
`(count + 1) / (class_word_count + feature_count)` per class. Scores are accumulated as
sums of logarithms rather than products of probabilities: the product of a few hundred
small factors underflows to `0.0` for every class, which would make every prediction
fall back to the first class.

The per-file scores are emitted with `@debug` instead of being printed to standard
output.

# Arguments
- `classification`: a classifier previously fitted with `train`.
- `file_names`: iterable of paths of the files to classify.

# Returns
A `Vector{String}` with one predicted label per file, in input order. Ties go to the
class that appears first in `unique_file_classes`.

# Throws
- `ArgumentError` (from `argmax`) if the classifier has no classes and `file_names` is
  not empty.
"""
function test(classification::Classification, file_names)::Array{String, 1}
    weighting = classification.weighting_instance
    classes = classification.unique_file_classes
    labels = classification.file_classes

    # Log prior of each class: share of training files carrying that label.
    log_priors = [
        log(count(==(unique_class), labels) / length(labels))
        for unique_class in classes
    ]
    feature_count = length(weighting.features)
    # NOTE(review): these sums are indexed by class column below, which is only right if
    # `tf` has one column per unique class; `tf` is defined outside this file — confirm.
    classes_words_count = [sum(column_tf) for column_tf in eachcol(weighting.tf)]

    # Row lookup built once instead of a linear scan per token; `get!` keeps the first
    # index of a repeated feature, matching what `findfirst` would return.
    feature_rows = Dict{eltype(weighting.features), Int}()
    for (row_index, feature) in enumerate(weighting.features)
        get!(feature_rows, feature, row_index)
    end

    result = String[]
    for file_name in file_names
        log_scores = copy(log_priors)
        for test_feature in unique(Preprocessing.preprocess(read(file_name, String, enc"ISO_8859-1")))
            feature_index = get(feature_rows, test_feature, nothing)
            # Tokens never seen during training carry no information; skip them.
            isnothing(feature_index) && continue
            for column_index in eachindex(classes)
                log_scores[column_index] += log(
                    (classification.naive_bayes_model[feature_index, column_index] + 1) /
                    (classes_words_count[column_index] + feature_count)
                )
            end
        end
        @debug "Naive Bayes log-scores" file_name log_scores
        # `argmax` returns the first maximal index, so ties resolve to the earliest class.
        push!(result, classes[argmax(log_scores)])
    end
    return result
end
"""
    hitungAkurasi(actual_classes, predicted_classes)

Return the accuracy of a prediction: the number of positions at which the two label
vectors agree, divided by the length of the shorter vector.

Positions beyond the shorter vector are ignored. Two empty vectors give `NaN`
(zero divided by zero).
"""
function hitungAkurasi(actual_classes::Array{String, 1}, predicted_classes::Array{String, 1})::Float64
    compared = min(length(actual_classes), length(predicted_classes))
    matches = 0
    # zip stops at the shorter vector, so exactly `compared` pairs are visited.
    for (actual, predicted) in zip(actual_classes, predicted_classes)
        if actual == predicted
            matches += 1
        end
    end
    return matches / compared
end