-
Notifications
You must be signed in to change notification settings - Fork 19
/
Copy pathentropy.py
82 lines (70 loc) · 2.41 KB
/
entropy.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import sys
import pathlib as pt
from math import log2
import numpy as np
def _cl_co():
    """Build a per-byte popcount table and return its indexer.

    The number of set bits of every value in ``range(256)`` is computed
    once and stored in a list; the returned callable is that list's bound
    ``__getitem__``, so calling it is a plain table lookup.
    """
    def popcount(value):
        # Sum the low eight bits of ``value`` one shift at a time.
        return sum((value >> shift) & 1 for shift in range(8))

    table = [popcount(byte) for byte in range(256)]
    # Hand back the bound indexer rather than a wrapper function so each
    # call is a direct list lookup.
    return table.__getitem__


# bitcount(byte) -> number of 1 bits in that byte value.
bitcount = _cl_co()
def main(argv=sys.argv):
    """Print byte-level Shannon entropy statistics for a file.

    The file named by ``argv[1]`` is read in binary mode and a histogram of
    its byte values is built.  The per-value probabilities, the raw counts,
    the entropy per byte, the entropy of the whole file, the file size and
    the theoretical compression headroom are printed to standard output.

    Args:
        argv: Command-line style argument list; ``argv[1]`` is the path of
            the file to analyse.  Defaults to ``sys.argv``.

    Returns:
        None.  A usage hint is printed when no path is given, and a short
        notice is printed when the file is empty (entropy is undefined for
        zero bytes, so no statistics are reported).

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # ``< 2`` also covers an empty argv, which would otherwise fail on
    # ``argv[1]`` below.
    if len(argv) < 2:
        print("Provide a file")
        return

    f = pt.Path(argv[1])
    chunk_size = 1 << 16  # read in 64 KiB pieces to keep memory bounded
    # int64 counters cannot realistically overflow, unlike 32-bit ones
    # which wrap after 2**32 - 1 occurrences of a single byte value.
    counts = np.zeros(256, dtype=np.int64)
    with f.open("rb") as fp:
        while (b := fp.read(chunk_size)):
            # Histogram the whole chunk in native code instead of
            # incrementing one counter per byte in Python.
            counts += np.bincount(np.frombuffer(b, dtype=np.uint8), minlength=256)

    tot = int(counts.sum())
    if tot == 0:
        # Dividing by a zero total would turn every statistic into NaN.
        print("File is empty; entropy is undefined")
        return

    probs = counts / tot
    # Substitute 1 for zero probabilities so log2 contributes exactly 0
    # for byte values that never occur.
    ent = -1 * (probs * np.log2(np.where(probs == 0, np.ones(1), probs))).sum()
    if ent == 0:
        # Normalise the -0.0 produced by negating an all-zero sum.
        ent = -1 * ent

    # A file made of one repeated byte has zero entropy; report an infinite
    # ratio explicitly rather than relying on a divide-by-zero warning.
    ratio = 8 / ent if ent > 0 else float("inf")

    print(probs)
    print(counts)
    print("Entropy per byte: ", ent, "bits or", ent / 8, "bytes")
    print("Entropy of file: ", ent * tot, "bits or", ent * tot / 8, "bytes")
    print("Size of file: ", tot, "bytes")
    print("Delta: ", tot - ent * tot / 8, "bytes compressable theoritically")
    print("Best Theoritical Coding ratio: ", ratio)
# Script entry point: analyse the file named on the command line.
if __name__ == "__main__":
    main(sys.argv)