-
Notifications
You must be signed in to change notification settings - Fork 0
/
mapping.py
185 lines (149 loc) · 6.68 KB
/
mapping.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
# Unless explicitly stated otherwise all files in this repository are licensed
# under the Apache License 2.0.
# Copyright 2020 Datadog, Inc. for original work
# Copyright 2021 GraphMetrics for modifications
"""A mapping between values and integer indices that imposes relative accuracy
guarantees. Specifically, for any value `minIndexableValue() < value <
maxIndexableValue` implementations of `KeyMapping` must be such that
`value(key(v))` is close to `v` with a relative error that is less than
`relative_accuracy`.
In implementations of KeyMapping, there is generally a trade-off between the
cost of computing the key and the number of keys that are required to cover a
given range of values (memory optimality). The most memory-optimal mapping is
the LogarithmicMapping, but it requires the costly evaluation of the logarithm
when computing the index. Other mappings can approximate the logarithmic
mapping, while being less computationally costly.
"""
from abc import ABC, abstractmethod
import math
import numpy as np
from exception import IllegalArgumentException
class KeyMapping(ABC):
"""
Args:
relative_accuracy (float): the accuracy guarantee; referred to as alpha
in the paper. (0. < alpha < 1.)
offset (float): an offset that can be used to shift all bin keys
Attributes:
gamma (float): the base for the exponential buckets
gamma = (1 + alpha) / (1 - alpha)
min_possible: the smallest value the sketch can distinguish from 0
max_possible: the largest value the sketch can handle
_multiplier (float): used for calculating log_gamma(value)
initially, _multiplier = 1 / log(gamma)
"""
def __init__(self, relative_accuracy, offset=0.0):
if relative_accuracy <= 0 or relative_accuracy >= 1:
raise IllegalArgumentException("Relative accuracy must be between 0 and 1.")
self.relative_accuracy = relative_accuracy
self._offset = offset
gamma_mantissa = 2 * relative_accuracy / (1 - relative_accuracy)
self.gamma = 1 + gamma_mantissa
self._multiplier = 1 / math.log1p(gamma_mantissa)
self.min_possible = np.finfo(np.float64).tiny * self.gamma
self.max_possible = np.finfo(np.float64).max / self.gamma
@classmethod
def from_gamma_offset(cls, gamma, offset):
""" Constructor used by pb.proto """
relative_accuracy = (gamma - 1.0) / (gamma + 1.0)
return cls(relative_accuracy, offset=offset)
@abstractmethod
def _log_gamma(self, value):
""" Return (an approximation of) the logarithm of the value base gamma """
@abstractmethod
def _pow_gamma(self, value):
""" Return (an approximation of) gamma to the power value """
def key(self, value):
"""
Args:
value (float)
Returns:
int: the key specifying the bucket for value
"""
return int(math.ceil(self._log_gamma(value)) + self._offset)
def value(self, key):
"""
Args:
key (int)
Returns:
float: the value represented by the bucket specified by the key
"""
return self._pow_gamma(key - self._offset) * (2.0 / (1 + self.gamma))
class LogarithmicMapping(KeyMapping):
"""A memory-optimal KeyMapping, i.e., given a targeted relative accuracy, it
requires the least number of keys to cover a given range of values. This is
done by logarithmically mapping floating-point values to integers.
"""
def __init__(self, relative_accuracy, offset=0.0):
super().__init__(relative_accuracy, offset=offset)
self._multiplier *= math.log(2)
def _log_gamma(self, value):
return math.log2(value) * self._multiplier
def _pow_gamma(self, value):
return np.exp2(value / self._multiplier)
class LinearlyInterpolatedMapping(KeyMapping):
"""A fast KeyMapping that approximates the memory-optimal one
(LogarithmicMapping) by extracting the floor value of the logarithm to the
base 2 from the binary representations of floating-point values and
linearly interpolating the logarithm in-between."""
def _log2_approx(self, value):
"""approximates log2 by s + f
where v = (s+1) * 2 ** f for s in [0, 1)
frexp(v) returns m and e s.t.
v = m * 2 ** e ; (m in [0.5, 1) or 0.0)
so we adjust m and e accordingly
"""
mantissa, exponent = math.frexp(value)
significand = 2 * mantissa - 1
return significand + (exponent - 1)
def _exp2_approx(self, value):
""" inverse of _log2_approx """
exponent = math.floor(value) + 1
mantissa = (value - exponent + 2) / 2.0
return math.ldexp(mantissa, exponent)
def _log_gamma(self, value):
return self._log2_approx(value) * self._multiplier
def _pow_gamma(self, value):
return self._exp2_approx(value / self._multiplier)
class CubicallyInterpolatedMapping(KeyMapping):
"""A fast KeyMapping that approximates the memory-optimal LogarithmicMapping by
extracting the floor value of the logarithm to the base 2 from the binary
representations of floating-point values and cubically interpolating the
logarithm in-between.
More detailed documentation of this method can be found in:
<a href="https://github.com/DataDog/sketches-java/">sketches-java</a>
"""
A = 6 / 35
B = -3 / 5
C = 10 / 7
def __init__(self, relative_accuracy, offset=0.0):
super().__init__(relative_accuracy, offset=offset)
self._multiplier /= self.C
def _cubic_log2_approx(self, value):
""" approximates log2 using a cubic polynomial """
mantissa, exponent = math.frexp(value)
significand = 2 * mantissa - 1
return (
(self.A * significand + self.B) * significand + self.C
) * significand + (exponent - 1)
def _cubic_exp2_approx(self, value):
""" Derived from Cardano's formula """
exponent = math.floor(value)
delta_0 = self.B * self.B - 3 * self.A * self.C
delta_1 = (
2 * self.B * self.B * self.B
- 9 * self.A * self.B * self.C
- 27 * self.A * self.A * (value - exponent)
)
cardano = np.cbrt(
(delta_1 - np.sqrt(delta_1 * delta_1 - 4 * delta_0 * delta_0 * delta_0)) / 2
)
significand_plus_one = (
-(self.B + cardano + delta_0 / cardano) / (3 * self.A) + 1
)
mantissa = significand_plus_one / 2
return np.ldexp(mantissa, exponent + 1)
def _log_gamma(self, value):
return self._cubic_log2_approx(value) * self._multiplier
def _pow_gamma(self, value):
return self._cubic_exp2_approx(value / self._multiplier)