# analysis.py
import glob
import numpy as np
import pandas as pd
import pickle
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import constants as consts
from itertools import tee
from sklearn.metrics.pairwise import euclidean_distances
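# NOTE (assumption): `constants` is this project's local module; it is assumed here to expose at
# least N_CLUSTER_STR (a string such as "4", used in the kmeans pickle filename) and
# N_CLUSTERS_INT (the corresponding integer number of clusters). Illustrative values only.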


def pairwise(iterable):
    a, b = tee(iterable)
    next(b, None)
    return zip(a, b)
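# Illustrative example: pairwise(["day1", "day2", "day3"]) yields ("day1", "day2") and
# ("day2", "day3"), i.e. every experiment is compared with the one that follows it.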


# load dataframe (with cluster labels) from path
def __load_day(path):
    df = pd.read_parquet("{0}/dataframe.parquet".format(path))
    # load the fitted k-means model and attach its labels as a new column
    with open("{0}/kmeans_models/kmeans_{1}.pkl".format(path, consts.N_CLUSTER_STR), "rb") as f:
        kmeans_model = pickle.load(f)
    df["Cluster Number"] = kmeans_model.labels_
    return df
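# Assumed on-disk layout (illustrative): <path>/dataframe.parquet plus
# <path>/kmeans_models/kmeans_<N_CLUSTER_STR>.pkl, where the pickled KMeans model was fitted on the
# same rows as the dataframe, so that `kmeans_model.labels_` aligns with `df` row by row.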


# this method returns a matrix representation of the day passed in `path`. If there are 4 clusters
# and the session length is 10, it contains 4 vectors of length 10. Each vector corresponds to the
# average session type of that cluster.
def __load_session_types_of_day(path):
    df = __load_day(path)
    # group by the cluster number and return the mean aggregation, sorted by cluster number
    result_mean = df.groupby(["Cluster Number"], as_index=False).agg("mean").sort_values(by="Cluster Number", ascending=True)
    # at this point, the vectors in `result_mean` are ordered: the 1st row corresponds to Cluster 0, the 2nd to Cluster 1, etc.
    # drop the cluster number column
    result_mean.drop("Cluster Number", axis=1, inplace=True)
    # convert to numpy
    result_mean = result_mean.to_numpy()
    return result_mean
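# Illustrative shape: with 4 clusters and 10 numeric columns per session, the returned
# array has shape (4, 10); row k is the average session of cluster k.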


def __matchings_of_two_experiments(distance_matrix):
    # number of rows, which is equivalent to the number of clusters
    n_rows = distance_matrix.shape[0]
    # `i` here is the cluster number of Group A. The aim is to find a match in Group B and put the
    # mapping in `_dict` for later use
    _dict = {}
    for i in range(n_rows):
        # get current row
        row_arr = distance_matrix[i]
        # get index of the minimum value, i.e. the closest cluster of Group B
        h_index = np.where(row_arr == np.amin(row_arr))[0][0]
        _dict[i] = h_index
    return _dict
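# Illustrative example (hypothetical distances): for distance_matrix [[0.9, 0.1], [0.2, 0.8]]
# the returned mapping is {0: 1, 1: 0}, i.e. each Group A cluster is paired with its closest
# Group B cluster. Note that the mapping is not guaranteed to be one-to-one.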


# this method loops through all experiments passed as input (`experiments`) and returns a dictionary
# with all the matchings among clusters. For example, for the first tuple of experiments (A, B), it
# returns a mapping from clusters of A to clusters of B
def __create_similarity_matching(experiments):
    # loop over all experiments pairwise
    _matching_dict = {}
    for exp_a, exp_b in pairwise(experiments):
        print("Matching: {0} - {1}".format(exp_a, exp_b))
        # load the matrices for the two experiments under analysis
        df_a = __load_session_types_of_day(exp_a)
        df_b = __load_session_types_of_day(exp_b)
        # calculate the euclidean distance between the two matrices (each matrix has n vectors, where n is n_clusters)
        dis = euclidean_distances(df_a, df_b)
        # add the matchings of exp_a and exp_b to the overall dict
        _matching_dict[(exp_a, exp_b)] = __matchings_of_two_experiments(dis)
    return _matching_dict
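# Illustrative example: for experiments [A, B, C, C] (last one duplicated in main()), the keys of the
# returned dict are (A, B), (B, C) and (C, C), each mapped to its cluster-to-cluster matching.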


def __generate_distributions_by_cluster(matching_dict):
    # create a dictionary with the individual cluster distributions of each experiment
    _distr_dict = {}
    for (exp_a, _), _ in matching_dict.items():
        # the distribution (as fractions) is computed only for exp_a; because the last day is
        # duplicated in main(), every day is eventually covered by this loop
        df = __load_day(exp_a)
        _local_dict = df["Cluster Number"].value_counts(normalize=True).to_dict()
        _distr_dict[exp_a] = _local_dict
    # array that will hold all distribution values: 1st row is Cluster 0, 2nd row is Cluster 1, etc.,
    # with one column per day
    distr_array = np.zeros(shape=(consts.N_CLUSTERS_INT, len(matching_dict)))
    for i in range(consts.N_CLUSTERS_INT):
        num_to_look_for = i
        j = 0
        for (exp_a, _), vals in matching_dict.items():
            distr_array[i, j] = _distr_dict[exp_a][num_to_look_for]
            # update num_to_look_for by following the "translation" to the next day's clusters
            num_to_look_for = vals[num_to_look_for]
            j += 1
    return distr_array
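# Interpretation (illustrative): distr_array[i, j] is the fraction of day j's sessions that fall in
# the cluster which, chained through the day-to-day matchings, corresponds to cluster i of the first day.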


def __distribution_stacked_histogram(distribution_dict, labels, base_path):
    # `distribution_dict` is the (n_clusters, n_days) array produced by __generate_distributions_by_cluster
    filename = "{0}/stacked_distribution_histogram.png".format(base_path)
    fig, ax = plt.subplots()
    # use percentages with 1 decimal place
    distribution_dict = np.round(distribution_dict * 100, 1)
    cmap = matplotlib.cm.get_cmap("tab20c")
    # create gaps (filled by gray lines) in between groups of bars
    y_pos = [0, 1.25, 2.25, 3.5, 4.5, 5.5, 6.5, 7.75, 8.75, 9.75, 10.75, 11.75, 12.75]
    # plot horizontal stacked bars, one segment per session type
    ax.barh(y_pos, distribution_dict[0], color=cmap(0.5), label="listener")
    left = distribution_dict[0]
    ax.barh(y_pos, distribution_dict[1], left=left, color=cmap(0.15), label="listen-then-skip")
    left += distribution_dict[1]
    ax.barh(y_pos, distribution_dict[2], left=left, color=cmap(0.05), label="skip-then-listen")
    left += distribution_dict[2]
    ax.barh(y_pos, distribution_dict[3], left=left, color=cmap(0.35), label="skipper")
    # gray separator lines in the gaps between bar groups
    ax.axhline(0.625, color="tab:gray")
    ax.axhline(2.875, color="tab:gray")
    ax.axhline(7.125, color="tab:gray")
    # add percentages inside each bar segment
    for c in ax.containers:
        ax.bar_label(c, label_type="center")
    # set the x-axis ticks to 0-25-50-75-100
    plt.xticks(np.arange(0, 101, 25))
    plt.grid(axis="x")
    # set y-axis labels
    plt.yticks(y_pos, labels)
    # Long/Medium/Short label on the opposite side of the y-axis
    plt.gcf().text(1, 0.6, "Long Sessions", rotation=90, ha="center", va="center")
    # add legend below the plot
    ax.legend(loc="lower center", bbox_to_anchor=(0.5, -0.3), ncol=2)
    ax.tick_params(axis="both", which="both")
    ax.invert_yaxis()
    # remove margin at the end of the x-axis (after 100)
    plt.margins(x=0)
    plt.margins(y=0.03)
    fig.tight_layout()
    fig.savefig(filename, dpi=fig.dpi, bbox_inches="tight")
    plt.close()
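# Illustrative call with hypothetical data: a (4, 13) array of fractions whose 13 columns match the
# 13 y_pos slots above, e.g.
#   fake = np.random.dirichlet(np.ones(4), size=13).T
#   __distribution_stacked_histogram(fake, labels, "results/")
# where `labels` is the 13-entry list defined in main().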


def main():
    # get the list of experiments to agglomerate together
    base_path = "results/"
    experiments = sorted(glob.glob("{0}/*".format(base_path)))
    # repeat the last element in the list of experiments: the current matching implementation works
    # pairwise, so without the duplicate the last day would be ignored
    experiments.append(experiments[-1])
    print("Creating matchings dictionary")
    _matching_dict = __create_similarity_matching(experiments)
    print("\nGenerating Distribution of Clusters")
    distr_dict = __generate_distributions_by_cluster(_matching_dict)
    # labels for the histogram (cleaner names than the experiment paths)
    labels = ["All", "Weekday", "Weekend", "Morning", "Afternoon", "Evening", "Night", "Editorial Playlist", "User Collection", "Catalog", "Radio", "Charts", "Personalized Playlist"]
    # Note: if we are creating multiple stacked histograms for later merging into one (e.g. long/medium/short
    # sessions), each graph requires manually rearranging the rows of `distr_dict` into the desired sequence
    # of types. This is necessary if we want to report the sequence "listener, listen-then-skip,
    # skip-then-listen, skipper". An example of this rearrangement is:
    # distr_dict = distr_dict[[2, 3, 0, 1], :]
    __distribution_stacked_histogram(distr_dict, labels, base_path)


if __name__ == "__main__":
    main()
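# Usage (illustrative): run `python analysis.py` from the repository root, assuming results/ contains
# one sub-directory per experiment/day with the files described above in __load_day.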