-
Notifications
You must be signed in to change notification settings - Fork 8
/
discrete.py
241 lines (213 loc) · 8.62 KB
/
discrete.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
from numbers import Number
from typing import List, Tuple, Union
import numpy as np
from bayesian_testing.experiments.base import BaseDataTest
from bayesian_testing.metrics import eval_numerical_dirichlet_agg
from bayesian_testing.utilities import get_logger
# Module-level logger; "bayesian_testing" is the name handed to the package's get_logger helper.
logger = get_logger("bayesian_testing")
class DiscreteDataTest(BaseDataTest):
    """
    Class for Bayesian A/B test for data with finite discrete states (i.e. categorical data
    with numerical categories). As real world examples we can think of dice rolls,
    1-5 star ratings, 1-10 ratings, etc.

    After class initialization, use add_variant methods to insert variant data.
    Then to get results of the test, use for instance `evaluate` method.

    Variant data is kept in `self.data` as
    `{name: {"concentration": [...], "prior": [...]}}`, with one entry per state in both lists.
    NOTE(review): `self.data` and `self.variant_names` are not defined in this class; they are
    presumably provided by `BaseDataTest` - confirm against the base class.
    """

    def __init__(self, states: List[Union[float, int]]) -> None:
        """
        Initialize DiscreteDataTest class.

        Parameters
        ----------
        states : List of all possible states for a given discrete variable.

        Raises
        ------
        ValueError : If any of the states is not a number.
        """
        super().__init__()
        if not self.check_if_numerical(states):
            raise ValueError("States in the test have to be numbers (int or float).")
        self.states = states

    @property
    def concentrations(self):
        """Observed per-state counts of every variant, in the iteration order of `self.data`."""
        return [self.data[k]["concentration"] for k in self.data]

    @property
    def prior_alphas(self):
        """Prior Dirichlet alphas of every variant, in the iteration order of `self.data`."""
        return [self.data[k]["prior"] for k in self.data]

    @staticmethod
    def check_if_numerical(values):
        """
        Check whether every item of `values` is a number.

        Parameters
        ----------
        values : Iterable of items to be checked.

        Returns
        -------
        res : True if all items are instances of `numbers.Number` (also True for empty input).
        """
        return all(isinstance(v, Number) for v in values)

    def eval_simulation(
        self,
        sim_count: int = 20000,
        seed: Union[int, None] = None,
        min_is_best: bool = False,
        interval_alpha: float = 0.95,
    ) -> Tuple[dict, dict, dict]:
        """
        Calculate probabilities of being best, expected loss and credible intervals for a current
        class state.

        Parameters
        ----------
        sim_count : Number of simulations to be used for probability estimation.
        seed : Random seed.
        min_is_best : Option to change "being best" to a minimum. Default is maximum.
        interval_alpha : Credible interval probability (value between 0 and 1).

        Returns
        -------
        res_pbbs : Dictionary with probabilities of being best for all variants in experiment.
        res_loss : Dictionary with expected loss for all variants in experiment.
        res_intervals : Dictionary with quantile-based credible intervals for all variants.
        """
        pbbs, loss, intervals = eval_numerical_dirichlet_agg(
            self.states,
            self.concentrations,
            self.prior_alphas,
            sim_count,
            seed,
            min_is_best,
            interval_alpha,
        )
        # Results are keyed by variant name; this relies on the metric returning its values
        # in the same order in which the variants were passed in.
        variant_names = self.variant_names
        res_pbbs = dict(zip(variant_names, pbbs))
        res_loss = dict(zip(variant_names, loss))
        res_intervals = dict(zip(variant_names, intervals))

        return res_pbbs, res_loss, res_intervals

    def evaluate(
        self,
        sim_count: int = 20000,
        seed: Union[int, None] = None,
        min_is_best: bool = False,
        interval_alpha: float = 0.95,
    ) -> List[dict]:
        """
        Evaluation of experiment.

        Parameters
        ----------
        sim_count : Number of simulations to be used for probability estimation.
        seed : Random seed.
        min_is_best : Option to change "being best" to a minimum. Default is maximum.
        interval_alpha : Credible interval probability (value between 0 and 1).

        Returns
        -------
        res : List of dictionaries with results per variant.
        """
        keys = [
            "variant",
            "concentration",
            "average_value",
            "posterior_mean",
            "credible_interval",
            "prob_being_best",
            "expected_loss",
        ]
        states = np.array(self.states)
        concentrations = self.concentrations

        # Conjugate update: posterior alphas = observed counts + prior alphas.
        posterior_alphas = [
            np.array(concentration) + np.array(prior)
            for concentration, prior in zip(concentrations, self.prior_alphas)
        ]
        # Posterior mean of the state value: states weighted by normalized posterior alphas.
        posterior_mean = [
            round(sum(states * (alphas / sum(alphas))), 5) for alphas in posterior_alphas
        ]

        eval_pbbs, eval_loss, eval_intervals = self.eval_simulation(
            sim_count, seed, min_is_best, interval_alpha
        )

        # Plain observed average of the state value; a variant whose counts sum to zero
        # gives 0 / 0 here.
        average_values = [
            np.sum(np.multiply(concentration, self.states)) / np.sum(concentration)
            for concentration in concentrations
        ]

        # One column per key (same order as `keys`), transposed below into one row per variant.
        data = [
            self.variant_names,
            [dict(zip(self.states, concentration)) for concentration in concentrations],
            average_values,
            posterior_mean,
            list(eval_intervals.values()),
            list(eval_pbbs.values()),
            list(eval_loss.values()),
        ]
        res = [dict(zip(keys, item)) for item in zip(*data)]

        return res

    def add_variant_data_agg(
        self,
        name: str,
        concentration: List[int],
        prior: Union[List[Union[float, int]], None] = None,
        replace: bool = True,
    ) -> None:
        """
        Add variant data to test class using aggregated discrete data.
        This can be convenient as aggregation can be done on database level.

        Default prior setup is Dirichlet(1,...,1) which is low information prior
        (we can interpret it as prior 1 observation of each state).

        Parameters
        ----------
        name : Variant name.
        concentration : Total number of experiment observations for each state
            (e.g. number of rolls for each side in a die roll data).
        prior : Prior alpha parameters of a Dirichlet distribution (conjugate prior).
            None or an empty list selects the default prior. Ignored when data is appended
            to an existing variant, which keeps its original prior.
        replace : Replace data if variant already exists.
            If set to False, data of existing variant will be appended to existing data.

        Raises
        ------
        ValueError : If name is not a string, if concentration does not have one numerical
            entry per state, or if a prior that is about to be stored does not have one
            numerical entry per state.
        """
        if not isinstance(name, str):
            raise ValueError("Variant name has to be a string.")
        if not len(self.states) == len(concentration):
            msg = (
                f"Concentration list has to have same size as number of states in a test "
                f"{len(concentration)} != {len(self.states)}."
            )
            raise ValueError(msg)
        if not self.check_if_numerical(concentration):
            raise ValueError("Concentration parameter has to be a list of integer values.")

        already_exists = name in self.variant_names

        if already_exists and not replace:
            msg = (
                f"Variant {name} already exists - new data is appended to variant, "
                "keeping its original prior setup. "
                "If you wish to replace data instead, use replace=True."
            )
            logger.info(msg)
            self.data[name]["concentration"] = [
                sum(x) for x in zip(self.data[name]["concentration"], concentration)
            ]
            return

        # `prior is None or len(prior) == 0` instead of `not prior`, so that array-like
        # priors do not fail on an ambiguous truth value.
        if prior is None or len(prior) == 0:
            prior = [1] * len(self.states)
        elif len(prior) != len(self.states):
            # A wrong-sized prior would otherwise be broadcast or fail later inside numpy.
            msg = (
                f"Prior list has to have same size as number of states in a test "
                f"{len(prior)} != {len(self.states)}."
            )
            raise ValueError(msg)
        elif not self.check_if_numerical(prior):
            raise ValueError("Prior parameter has to be a list of numerical values.")

        if already_exists:
            msg = (
                f"Variant {name} already exists - new data is replacing it. "
                "If you wish to append instead, use replace=False."
            )
            logger.info(msg)

        # Store copies so that later mutation of the caller's lists cannot change test data.
        self.data[name] = {"concentration": list(concentration), "prior": list(prior)}

    def add_variant_data(
        self,
        name: str,
        data: List[int],
        prior: Union[List[Union[float, int]], None] = None,
        replace: bool = True,
    ) -> None:
        """
        Add variant data to test class using raw discrete data.

        Default prior setup is Dirichlet(1,...,1) which is low information prior
        (we can interpret it as prior 1 observation of each state).

        Parameters
        ----------
        name : Variant name.
        data : List of numerical data observations from possible states.
        prior : Prior alpha parameters of a Dirichlet distribution (conjugate prior).
        replace : Replace data if variant already exists.
            If set to False, data of existing variant will be appended to existing data.

        Raises
        ------
        ValueError : If data is empty or contains a value which is not one of the states
            (plus anything raised by `add_variant_data_agg`).
        """
        if len(data) == 0:
            raise ValueError("Data of added variant needs to have some observations.")

        # Integer counters, one per state, filled in a single pass over the observations.
        counter_dict = dict.fromkeys(self.states, 0)
        for observation in data:
            try:
                counter_dict[observation] += 1
            except (KeyError, TypeError):
                # KeyError: value is not a known state; TypeError: value is unhashable and
                # therefore cannot be a state either.
                msg = (
                    "Input data needs to be a list of numbers from possible states: "
                    f"{self.states}."
                )
                raise ValueError(msg) from None
        concentration = [counter_dict[state] for state in self.states]

        self.add_variant_data_agg(name, concentration, prior, replace)