-
Notifications
You must be signed in to change notification settings - Fork 92
/
Copy pathdentropytarget.go
134 lines (110 loc) · 3.39 KB
/
dentropytarget.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
package CloudForest
import (
"fmt"
"math"
)
/*
DEntropyTarget wraps a categorical feature for use in entropy driven classification
as in Ross Quinlan's ID3 (Iterative Dichotomizer 3) with a the entropy modified to use
"disutility entropy"
I = - k Sum ri * pi * log(pi)
*/
type DEntropyTarget struct {
CatFeature
Costs []float64
}
//NewDEntropyTarget creates a RefretTarget and initializes DEntropyTarget.Costs to the proper length.
func NewDEntropyTarget(f CatFeature) *DEntropyTarget {
return &DEntropyTarget{f, make([]float64, f.NCats())}
}
/*NewDEntropyTarget.SetCosts puts costs in a map[string]float64 by feature name into the proper
entries in NewDEntropyTarget.Costs.*/
func (target *DEntropyTarget) SetCosts(costmap map[string]float64) {
for i := 0; i < target.NCats(); i++ {
c := target.NumToCat(i)
target.Costs[i] = costmap[c]
}
}
/*
DEntropyTarget.SplitImpurity is a version of Split Impurity that calls DEntropyTarget.Impurity
*/
func (target *DEntropyTarget) SplitImpurity(l *[]int, r *[]int, m *[]int, allocs *BestSplitAllocs) (impurityDecrease float64) {
nl := float64(len(*l))
nr := float64(len(*r))
nm := 0.0
impurityDecrease = nl * target.Impurity(l, allocs.LCounter)
impurityDecrease += nr * target.Impurity(r, allocs.RCounter)
if m != nil && len(*m) > 0 {
nm = float64(len(*m))
impurityDecrease += nm * target.Impurity(m, allocs.Counter)
}
impurityDecrease /= nl + nr + nm
return
}
//UpdateSImpFromAllocs willl be called when splits are being built by moving cases from r to l as in learning from numerical variables.
//Here it just wraps SplitImpurity but it can be implemented to provide further optimization.
func (target *DEntropyTarget) UpdateSImpFromAllocs(l *[]int, r *[]int, m *[]int, allocs *BestSplitAllocs, movedRtoL *[]int) (impurityDecrease float64) {
target.MoveCountsRtoL(allocs, movedRtoL)
nl := float64(len(*l))
nr := float64(len(*r))
nm := 0.0
impurityDecrease = nl * target.ImpFromCounts(len(*l), allocs.LCounter)
impurityDecrease += nr * target.ImpFromCounts(len(*r), allocs.RCounter)
if m != nil && len(*m) > 0 {
nm = float64(len(*m))
impurityDecrease += nm * target.ImpFromCounts(len(*m), allocs.Counter)
}
impurityDecrease /= nl + nr + nm
return
}
func (target *DEntropyTarget) ImpFromCounts(total int, counts *[]int) (e float64) {
p := 0.0
for c, i := range *counts {
if i > 0 {
p = float64(i) / float64(total)
e -= target.Costs[c] * p * math.Log(p)
}
}
return
}
func (target *DEntropyTarget) FindPredicted(cases []int) (pred string) {
prob_true := 0.0
t := target.CatToNum("True")
weightedvoted := true
if weightedvoted {
count := 0.0
total := 0.0
for _, i := range cases {
ti := target.Geti(i)
cost := target.Costs[ti]
if ti == t {
count += cost
}
total += cost
}
prob_true = count / total
} else {
count := 0
for _, i := range cases {
if target.Geti(i) == t {
count++
}
}
prob_true = float64(count) / float64(len(cases))
}
return fmt.Sprintf("%v", prob_true)
}
//DEntropyTarget.Impurity implements categorical entropy as sum(pj*log2(pj)) where pj
//is the number of cases with the j'th category over the total number of cases.
func (target *DEntropyTarget) Impurity(cases *[]int, counts *[]int) (e float64) {
total := len(*cases)
target.CountPerCat(cases, counts)
p := 0.0
for _, i := range *counts {
if i > 0 {
p = float64(i) / float64(total)
e -= p * math.Log(p)
}
}
return
}