-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathanomalous_cluster.py
179 lines (170 loc) · 5.18 KB
/
anomalous_cluster.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
"""
Anomalous Cluster Algorithm
"""
import numpy as np
def anomalous(X, me, rang, D):
"""
anomalous, iterative extraction of anomalous clusters based on the algorithm referred to as Separate/Conquer (Mirkin, 1999, Machine Learning)
Input:
X - data matrix,
me - grand mean,
range - normalizing values
D - normalised data scatter
output:
ancl - an indexed dataset of anomalous clusters found:
ancl[ind, 1] list of indices in cluster,
ancl[ind, 2] standardised center,
ancl[ind, 3] contribution to the data scatter
"""
nn,mm = X.shape
remains = np.array(range(nn))
number = 0
anomalous_cluster = []
while len(remains) != 0:
distance = dist(X,remains,rang,me)
d, ind = max(distance), np.argmax(distance)
index = remains[ind]
centroid = X[index,:]
number += 1
# finding AP cluster
cluster, centroid = anpat(X, remains, rang, centroid, me)
ancl = [0,0,0]
# list of indices in the anomalous cluster
ancl[0] = cluster
# standardized center of the anomalous cluster
censtand = (centroid - me) / rang
# cluster contrib, per cent
dD = (np.dot(censtand, censtand.T) * len(cluster) *100) / D
#print('[DEBUG] dD: ',dD)
# standardised centroid
ancl[1] = censtand
# proportion of the data scatter
ancl[2] = dD
# update of the set of objects yet unclustered
remains = np.setdiff1d(remains, cluster)
anomalous_cluster.append(ancl)
return anomalous_cluster
def anpat(X, remains, rang, centroid, me):
"""
anpat, iterative soubroutine in anomalous
based on the algorithm 'Separate/Conquer' (Mirkin, 1999, Machine Learning)
Input:
X - full data matrix
remains - set of its row indices % %(objects) under consideration
range - normalizing values
centroid - initial center of the anomalous cluster
me - vector to shift to the 0 (origin)
Output:
cluster - set of row indices in the anomalous cluster
centrod - center of the cluster
"""
count = 0
key = 1
while key == 1:
# separc: separate cluster around "centroid" from that around "me"
cluster = separc(X,remains,rang,centroid,me)
if len(cluster) > 0:
# center: finding centre of cluster
newcenter = center(X,cluster)
else:
newcenter = centroid
if not np.array_equal(newcenter, centroid):
centroid = newcenter
else:
key = 0
return cluster, centroid
def center(X, cluster):
"Finding centroid to a cluster"
nn, mm = X.shape
ccc = np.zeros((mm))
for jj in range(mm):
zz = X[:,jj]
zc = zz[cluster]
ccc[jj] = zc.mean()
return ccc
def separc(X0, remains, rang, a, b):
"Separating a cluster around 'a' from that around 'b'"
dist_a = distm(X0,remains,rang,a)
dist_b = distm(X0,remains,rang,b)
clust = dist_a < dist_b
cluster = remains[clust]
return cluster
def distm(X,remains,rang,a):
"""
Finding normalized distances in 'remains' to point 'a'
Input:
X - the original data matrix
remains - set of X-row indices under consideration
range - normalizing vector
a - point the distances relate to
Output:
distan - column of distances from a to remains
"""
nn, mm = X.shape
rr = len(remains)
# Submatrix of the entities (remains)
z = X[remains,:]
# column vector with 'a' point
az = np.array([a]*rr)
rz = np.array([rang]*rr)
# the relevant data normalized
dz = (z - az) / rz
# the squares of the relevant entries
ddz = dz * dz
# di is a row of the squared Euclidean distances
if mm > 1:
di = sum(ddz.T)
else:
di = ddz.T
#rr times 1 column
distan = di.T
return distan
def dist(X, remains, stand, me):
"""
Finding normalized distances in 'remains' to point 'a'
Input:
X - data matrix,
remains - data still to cluster,
stand - values to standardize,
me - array point to calculate dist
Output:
distan - column of distances from a to remains.
"""
nn, mm = X.shape
rr = len(remains)
# distan is a matrix rr x 1
distan = np.zeros((rr))
for jj in range(mm):
# jjth feature
z = X[:,jj]
# the jjth feature of the data still in remains
zz = z[remains]
y = zz - me[jj]
y = y / stand[jj]
yy = y * y
distan = distan + yy
return distan
def post_process_ANCL(ancl, threshold=10):
"""
post-processing of anomalous clusters
removal of small anomalous clusters
Input:
ancl - anomalous clusters
threshold - the maximum cardinality of a small cluster
Output:
new_ancl - new anomalous clusters
"""
new_ancl = []
aK = len(ancl)
ll = np.zeros((aK))
for ik in range(aK):
ll[ik] = len(ancl[ik][0])
rl = np.argwhere(ll>threshold)[:,0]
# cent shape is (n_clusters x n_features)
cent = np.zeros((len(rl),len(ancl[0][1])))
if rl.size == 0:
print('Too great a threshold!!! ')
else:
for ik in range(len(rl)):
new_ancl.append(ancl[rl[ik]])
return new_ancl