-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathkmeans.go
149 lines (129 loc) · 3.21 KB
/
kmeans.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
// kmeans is a package to calculate k-means clustering
// This package implements sequential functions
// so you can calculate k-means from stream of data
package kmeans
import(
"math"
"math/rand"
"errors"
)
// Node represents a feature vector
type Node []float64
// Cluster is info about one cluster
type Cluster struct {
Center Node
Count int
}
type Kmeans struct {
C []Cluster
k int
d Distance
a float64
}
// Distance Function: To compute the distanfe between observations
type Distance func(Node, Node) float64
// Sum sums two vectors
func (n Node) Add(o Node) Node {
r := make([]float64, len(n))
for i, j := range o {
r[i] = n[i]+j
}
return r
}
// Sub subtracts two vectors
func (n Node) Sub(o Node) Node {
r := make([]float64, len(n))
for i, j := range o {
r[i] = n[i]-j
}
return r
}
// Mul multiplicates a vector with a scalar
func (n Node) Mul(s float64) Node {
r := make([]float64, len(n))
for i := range n {
r[i] = n[i]*s
}
return r
}
func New(k int, d Distance, a float64) (Kmeans) {
return Kmeans{make([]Cluster, k), k, d, a}
}
// Add adds a vector to the internal clusters on the frequently
// This function uses the count of elements in cluster as a weight
// See http://www.cs.princeton.edu/courses/archive/fall08/cos436/Duda/C/sk_means.htm
func (k Kmeans) Add(o Node) {
add(o, k.C, k.d)
}
// Addf adds a vector to the internal clusters on the frequently
// This function uses constant pass during creation as a weight
// See http://www.cs.princeton.edu/courses/archive/fall08/cos436/Duda/C/sk_means.htm
func (k Kmeans) Addf(o Node) {
addf(o, k.C, k.d, k.a)
}
// Sequential calculates clusters in sequential manner. It estimates clusters
func (k Kmeans) Sequential(o []Node) error {
err := k.Seed(o)
if err != nil {
return err
}
for _, n := range o {
k.Add(n)
}
return nil
}
func (k Kmeans) Seed(o []Node) error {
if len(o) < k.k {
return errors.New("There is not enough data")
}
seed(o, k.C, k.d, k.k)
return nil
}
// Near finds the closest observation and returns the index and distance
func near(o Node, cs []Cluster, df Distance) (int, float64) {
d := math.MaxFloat64
i := -1
for j, c := range cs {
dd := df(o, c.Center)
if dd < d {
d = dd
i = j
}
}
return i, d
}
// Seed initializes clusters Instead of initializing randomly the seeds, make a sound decision of initializing
// Seed does not add 1 to cluster because we then proceed with Add() for each node
func seed(n []Node, cs []Cluster, df Distance, k int) {
l := len(n)
cs[0].Center = n[rand.Intn(l)]
for i := 1; i < k; i++ {
d2 := make([]float64, l)
var sum float64
for j, o := range n {
_, d := near(o, cs[:i], df)
d2[j] = d
sum += d2[j]
}
target := rand.Float64() * sum
j := 0
for sum = d2[0]; sum < target; sum += d2[j] {
j++
}
cs[i].Center = n[j]
}
}
func add(o Node, cs []Cluster, df Distance) {
// Get closest cluster
i, _ := near(o, cs, df)
// Update it's properties
cs[i].Count += 1
cs[i].Center = cs[i].Center.Add(o.Sub(cs[i].Center).Mul(1/float64(cs[i].Count)))
}
func addf(o Node, cs []Cluster, df Distance, a float64) {
// Get closest cluster
i, _ := near(o, cs, df)
// Update it's properties
cs[i].Count += 1
cs[i].Center = cs[i].Center.Add(o.Sub(cs[i].Center).Mul(a))
}