From 022ef22ba7a7bacbb79394abf8de1f25e6f8820f Mon Sep 17 00:00:00 2001 From: Tyler Barrus Date: Sat, 10 Nov 2018 18:49:57 -0500 Subject: [PATCH] Expanding Bloom Filter (#43) * first pass at an expanding bloom filter! * document the expanding bloom filter --- docs/source/code.rst | 6 + docs/source/quickstart.rst | 11 ++ probables/__init__.py | 8 +- probables/blooms/__init__.py | 4 +- probables/blooms/basebloom.py | 4 +- probables/blooms/expandingbloom.py | 122 +++++++++++++++++++++ probables/countminsketch/countminsketch.py | 3 +- probables/cuckoo/countingcuckoo.py | 4 +- setup.cfg | 2 +- tests/expandingbloom_test.py | 56 ++++++++++ 10 files changed, 209 insertions(+), 11 deletions(-) create mode 100644 probables/blooms/expandingbloom.py create mode 100644 tests/expandingbloom_test.py diff --git a/docs/source/code.rst b/docs/source/code.rst index dabb7d9..4b5ba38 100644 --- a/docs/source/code.rst +++ b/docs/source/code.rst @@ -40,6 +40,12 @@ BloomFilterOnDisk For more information of all methods and properties, see `BloomFilter`_. +ExpandingBloomFilter ++++++++++++++++++++++++++++++++ + +.. autoclass:: probables.ExpandingBloomFilter + :members: + CountingBloomFilter +++++++++++++++++++++++++++++++ diff --git a/docs/source/quickstart.rst b/docs/source/quickstart.rst index d1baf5b..74ac548 100644 --- a/docs/source/quickstart.rst +++ b/docs/source/quickstart.rst @@ -82,6 +82,17 @@ Bloom Filter that is run directly off of disk instead of in memory. This can be useful for very large Bloom Filters or when needing to access many Blooms that are exported to file. +Expanding Bloom Filter +""""""""""""""""""""""""""""""""""""""""""""""" + +The **Expanding Bloom Filter** is a specialized version of the standard +Bloom Filter that automatically grows to ensure that the desired false positive +rate is not exceeded. This is ideal for situations in which the number of +elements that will be added can only be guessed. 
+ +At this time, it is not possible to import or export an **Expanding Bloom +Filter** but that is a planned feature. + Counting Bloom Filter """"""""""""""""""""""""""""""""""""""""""""""" diff --git a/probables/__init__.py b/probables/__init__.py index 4ce2e52..b68b95b 100644 --- a/probables/__init__.py +++ b/probables/__init__.py @@ -1,6 +1,7 @@ ''' pyprobables module ''' from __future__ import (unicode_literals, absolute_import, print_function) -from . blooms import (BloomFilter, BloomFilterOnDisk, CountingBloomFilter) +from . blooms import (BloomFilter, BloomFilterOnDisk, CountingBloomFilter, + ExpandingBloomFilter) from . countminsketch import (CountMinSketch, HeavyHitters, StreamThreshold, CountMeanSketch, CountMeanMinSketch) from . cuckoo import (CuckooFilter, CountingCuckooFilter) @@ -11,7 +12,7 @@ __maintainer__ = 'Tyler Barrus' __email__ = 'barrust@gmail.com' __license__ = 'MIT' -__version__ = '0.2.0' +__version__ = '0.2.5' __credits__ = [] __url__ = 'https://github.com/barrust/pyprobables' __bugtrack_url__ = 'https://github.com/barrust/pyprobables/issues' @@ -20,4 +21,5 @@ 'CountMinSketch', 'CountMeanSketch', 'CountMeanMinSketch', 'HeavyHitters', 'StreamThreshold', 'CuckooFilter', 'CountingCuckooFilter', 'InitializationError', 'NotSupportedError', - 'ProbablesBaseException', 'CuckooFilterFullError'] + 'ProbablesBaseException', 'CuckooFilterFullError', + 'ExpandingBloomFilter'] diff --git a/probables/blooms/__init__.py b/probables/blooms/__init__.py index fcc3bb0..38ae44d 100644 --- a/probables/blooms/__init__.py +++ b/probables/blooms/__init__.py @@ -3,5 +3,7 @@ from . bloom import (BloomFilter, BloomFilterOnDisk) from . countingbloom import (CountingBloomFilter) +from . 
expandingbloom import (ExpandingBloomFilter) -__all__ = ['BloomFilter', 'BloomFilterOnDisk', 'CountingBloomFilter'] +__all__ = ['BloomFilter', 'BloomFilterOnDisk', 'CountingBloomFilter', + 'ExpandingBloomFilter'] diff --git a/probables/blooms/basebloom.py b/probables/blooms/basebloom.py index 1302dc6..baa2d71 100644 --- a/probables/blooms/basebloom.py +++ b/probables/blooms/basebloom.py @@ -38,12 +38,12 @@ def __init__(self, blm_type, est_elements=None, false_positive_rate=None, self._els_added = 0 self._on_disk = False # not on disk self.__blm_type = blm_type - if self.__blm_type in ['regular', 'reg-ondisk']: + if self.__blm_type in ['regular', 'reg-ondisk', 'expanding']: self.__impt_type = 'B' else: self.__impt_type = 'I' - if blm_type in ['regular', 'reg-ondisk']: + if blm_type in ['regular', 'reg-ondisk', 'expanding']: msg = ('Insufecient parameters to set up the Bloom Filter') else: msg = ('Insufecient parameters to set up the Counting Bloom ' diff --git a/probables/blooms/expandingbloom.py b/probables/blooms/expandingbloom.py new file mode 100644 index 0000000..53db77e --- /dev/null +++ b/probables/blooms/expandingbloom.py @@ -0,0 +1,122 @@ +''' BloomFilter, python implementation + License: MIT + Author: Tyler Barrus (barrust@gmail.com) + URL: https://github.com/barrust/pyprobables +''' +from __future__ import (unicode_literals, absolute_import, print_function) + +from . bloom import (BloomFilter) + + +class ExpandingBloomFilter(object): + ''' Simple expanding Bloom Filter implementation for use in python; the + Bloom Filter will automatically expand, or grow, if the false + positive rate is about to become greater than the desired false + positive rate. 
+ + Args: + est_elements (int): The number of estimated elements to be added + false_positive_rate (float): The desired false positive rate + hash_function (function): Hashing strategy function to use \ + `hf(key, number)` + Returns: + ExpandingBloomFilter: An expanding Bloom Filter object + Note: + At this point, the expanding Bloom Filter does not support \ + `export` or `import` ''' + + def __init__(self, est_elements=None, false_positive_rate=None, + hash_function=None): + ''' initialize ''' + self._blooms = list() + self.__fpr = false_positive_rate + self.__est_elements = est_elements + self.__hash_func = hash_function + self.__added_elements = 0 # total added... + # add in the initial bloom filter! + self.__add_bloom_filter() + + def __contains__(self, key): + ''' setup the `in` functionality ''' + return self.check(key) + + @property + def expansions(self): + ''' int: The number of expansions ''' + return len(self._blooms) - 1 + + @property + def false_positive_rate(self): + ''' float: The desired false positive rate of the expanding Bloom \ + Filter ''' + return self.__fpr + + @property + def estimated_elements(self): + '''int: The original number of elements estimated to be in the Bloom \ + Filter ''' + return self.__est_elements + + @property + def elements_added(self): + ''' int: The total number of elements added ''' + return self.__added_elements + + def __add_bloom_filter(self): + ''' build a new bloom and add it on! 
''' + blm = BloomFilter(self.__est_elements, self.__fpr, self.__hash_func) + self._blooms.append(blm) + + def __check_for_growth(self): + ''' determine if the bloom filter should automatically grow ''' + if self._blooms[-1].elements_added >= self.__est_elements: + self.__add_bloom_filter() + + def check(self, key): + ''' Check to see if the key is in the Bloom Filter + + Args: + key (str): The key to check for in the Bloom Filter + Returns: + bool: `True` if the element is likely present; `False` if \ + definitely not present ''' + hashes = self._blooms[0].hashes(key) + return self.check_alt(hashes) + + def check_alt(self, hashes): + ''' Check to see if the hashes are in the Bloom Filter + + Args: + hashes (list): The hash representation to check for in the \ + Bloom Filter + Returns: + bool: `True` if the element is likely present; `False` if \ + definitely not present ''' + for blm in self._blooms: + if blm.check_alt(hashes): + return True + return False + + def add(self, key, force=False): + ''' Add the key to the Bloom Filter + + Args: + key (str): The element to be inserted + force (bool): `True` will force it to be inserted, even if it \ + likely has been inserted before \ + `False` will only insert if not found in the Bloom Filter ''' + hashes = self._blooms[0].hashes(key) + self.add_alt(hashes, force) + + def add_alt(self, hashes, force=False): + ''' Add the element represented by hashes into the Bloom Filter + + Args: + hashes (list): A list of integers representing the key to insert + force (bool): `True` will force it to be inserted, even if \ + it likely has been inserted before \ + `False` will only insert if not found in the Bloom Filter ''' + self.__added_elements += 1 + if force or not self.check_alt(hashes): + self.__check_for_growth() + self._blooms[-1].add_alt(hashes) diff --git a/probables/countminsketch/countminsketch.py b/probables/countminsketch/countminsketch.py index 9a5b0fb..588d609 100644 --- a/probables/countminsketch/countminsketch.py 
+++ b/probables/countminsketch/countminsketch.py @@ -451,8 +451,7 @@ class HeavyHitters(CountMinSketch): For width and depth, width may realistically be in the thousands \ while depth is in the single digit to teens ''' - __slots__ = CountMinSketch.__slots__ - __slots__.extend(['__top_x', '__top_x_size', '__num_hitters', '__smallest']) + __slots__ = ['__top_x', '__top_x_size', '__num_hitters', '__smallest'] def __init__(self, num_hitters=100, width=None, depth=None, confidence=None, error_rate=None, filepath=None, diff --git a/probables/cuckoo/countingcuckoo.py b/probables/cuckoo/countingcuckoo.py index a77056a..37781ce 100644 --- a/probables/cuckoo/countingcuckoo.py +++ b/probables/cuckoo/countingcuckoo.py @@ -28,8 +28,8 @@ class CountingCuckooFilter(CuckooFilter): Returns: CountingCuckooFilter: A Cuckoo Filter object ''' - __slots__ = CuckooFilter.__slots__ - __slots__.extend(['__unique_elements']) + __slots__ = ['__unique_elements', '_inserted_elements', '_bucket_size', + '__max_cuckoo_swaps', '_cuckoo_capacity', '_buckets'] def __init__(self, capacity=10000, bucket_size=4, max_swaps=500, expansion_rate=2, auto_expand=True, finger_size=4, diff --git a/setup.cfg b/setup.cfg index a793a6f..2b9797e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,7 +1,7 @@ [bdist_wheel] universal=1 -[pep8] +[pycodestyle] max-line-length=120 [flake8] diff --git a/tests/expandingbloom_test.py b/tests/expandingbloom_test.py new file mode 100644 index 0000000..7b721e8 --- /dev/null +++ b/tests/expandingbloom_test.py @@ -0,0 +1,56 @@ +# -*- coding: utf-8 -*- +''' Unittest class ''' +from __future__ import (unicode_literals, absolute_import, print_function) +import unittest +from probables import (ExpandingBloomFilter) + +class TestExpandingBloomFilter(unittest.TestCase): + + def test_ebf_init(self): + ''' test the initialization of an expanding bloom filter ''' + blm = ExpandingBloomFilter(est_elements=10, false_positive_rate=0.05) + self.assertEqual(blm.expansions, 0) + + def 
test_ebf_add_lots(self): + ''' test adding "lots" of elements to force the expansion ''' + blm = ExpandingBloomFilter(est_elements=10, false_positive_rate=0.05) + for i in range(100): + blm.add("{}".format(i), True) + self.assertEqual(blm.expansions, 9) + + def test_ebf_add_lots_without_force(self): + ''' testing adding "lots" but force them to be inserted multiple times''' + blm = ExpandingBloomFilter(est_elements=10, false_positive_rate=0.05) + # simulate false positives... notice it didn't grow a few... + for i in range(120): + blm.add("{}".format(i)) + self.assertEqual(blm.expansions, 9) + self.assertEqual(blm.elements_added, 120) + + def test_ebf_check(self): + ''' ensure that checking the expanding bloom filter works ''' + blm = ExpandingBloomFilter(est_elements=25, false_positive_rate=0.05) + # expand it out some first! + for i in range(100): + blm.add("{}".format(i)) + blm.add('this is a test') + blm.add('this is another test') + self.assertGreater(blm.expansions, 1) + self.assertEqual(blm.check('this is a test'), True) + self.assertEqual(blm.check('this is another test'), True) + self.assertEqual(blm.check('this is yet another test'), False) + self.assertEqual(blm.check('this is not another test'), False) + + def test_ebf_contains(self): + ''' ensure that "in" functionality for the expanding bloom filter works ''' + blm = ExpandingBloomFilter(est_elements=25, false_positive_rate=0.05) + # expand it out some first! + for i in range(100): + blm.add("{}".format(i)) + blm.add('this is a test') + blm.add('this is another test') + self.assertGreater(blm.expansions, 1) + self.assertEqual('this is a test' in blm, True) + self.assertEqual('this is another test' in blm, True) + self.assertEqual('this is yet another test' in blm, False) + self.assertEqual('this is not another test' in blm, False)