@njit(int32(int32))
def popcnt(x):
    """Population count: number of set bits in a 32-bit integer.

    Classic SWAR (SIMD-within-a-register) bit counting, kept identical to
    the JS bloom filter this module was translated from.
    """
    x -= (x >> 1) & 0x55555555
    x = (x & 0x33333333) + ((x >> 2) & 0x33333333)
    return ((x + (x >> 4) & 0xf0f0f0f) * 0x1010101) >> 24


@njit(int32(int32))
def fnv_multiply(h):
    """Multiply ``h`` by the 32-bit FNV prime 16777619 (mod 2**32).

    Written as shift-adds, exactly as in the JS source, so the result
    wraps the same way under 32-bit integer arithmetic.
    """
    return h + (h << 1) + (h << 4) + (h << 7) + (h << 8) + (h << 24)


@njit(int32(int32))
def fnv_mix(h):
    """Final avalanche mixing step for the FNV hash.

    See https://web.archive.org/web/20131019013225/http://home.comcast.net/~bretm/hash/6.html
    """
    h += h << 13
    h ^= h >> 7
    h += h << 3
    h ^= h >> 17
    h += h << 5
    return h & 0xffffffff
@njit
def fnv_1a(v, seed):
    """FNV-1a hash of the byte sequence ``v``.

    Nonstandard variation: ``seed`` is xor-ed into the FNV offset basis
    (2166136261); per http://www.isthe.com/chongo/tech/comp/fnv/index.html
    almost any non-zero offset basis serves.  Bytes are consumed 16 bits
    at a time to mirror the JS original's per-character handling.
    """
    a = 2166136261 ^ seed

    for i in range(len(v)):
        c = v[i]
        d = c & 0xff00
        if d:
            a = fnv_multiply(a ^ d >> 8)
        a = fnv_multiply(a ^ c & 0xff)

    return fnv_mix(a)


@njit
def bf_calculate_locations(r, m, key):
    """Fill ``r`` with ``len(r)`` bit indices (mod ``m``) for ``key``.

    Double hashing: location i is a + i*b (mod m), with a and b two
    independently seeded FNV-1a hashes.  int32 remainder can be negative,
    so negative values are wrapped back into [0, m).
    """
    a = fnv_1a(key, 0)
    b = fnv_1a(key, 1576284489)  # the seed value was chosen randomly
    x = a % m

    for i in range(len(r)):
        if x < 0:
            r[i] = x + m
        else:
            r[i] = x
        x = (x + b) % m

    return r


@njit
def bf_test(locations, buckets):
    """Return True iff every bit index in ``locations`` is set in ``buckets``."""
    for i in range(len(locations)):
        b = locations[i]

        # bucket b // 32 holds bit b % 32 of the filter
        if (buckets[b // 32] & (1 << (b % 32))) == 0:
            return False

    return True


@njit
def bf_add(locations, buckets, key):
    """Set every bit index in ``locations`` in ``buckets``.

    ``key`` is unused but kept for signature compatibility with existing
    callers (mirrors the JS original).
    """
    for i in range(len(locations)):
        b = locations[i]
        buckets[b // 32] |= 1 << (b % 32)


@njit
def buckets_union(b1, b2):
    """OR the bits of ``b2`` into ``b1`` in place and return ``b1``.

    Both bucket arrays must have the same length.
    """
    n1 = len(b1)
    n2 = len(b2)
    assert n1 == n2

    for i in range(n1):
        b1[i] = b1[i] | b2[i]

    return b1


def create_empty(capacity, error_rate=0.001):
    """Create an empty :class:`BloomFilter` sized for ``capacity`` keys.

    :param capacity: expected number of distinct keys (> 0)
    :param error_rate: target false-positive probability, 0 < p < 1
    :raises ValueError: if capacity or error_rate is out of range
    """
    if not (0 < error_rate < 1):
        raise ValueError("Error_Rate must be between 0 and 1.")
    if not capacity > 0:
        raise ValueError("Capacity must be > 0")

    # standard sizing: m = -n * ln(p) / ln(2)^2 bits, k = (m / n) * ln(2)
    num_bits = (-capacity * math.log(error_rate) / (math.log(2) * math.log(2)))
    num_hashes = max(1, round(num_bits / capacity * math.log(2)))

    n = math.ceil(num_bits / 32)  # number of 32-bit buckets
    buckets = np.zeros(n, dtype='int32')

    return BloomFilter(num_hashes, buckets)


class BloomFilter(object):
    """FNV-based bloom filter over an int32 numpy bucket array.

    A 1-to-1 translation of the JS bloom filter.  Not thread-safe: a
    single scratch ``_locations`` array is reused across calls.
    """

    def __init__(self, num_hashes, buckets):
        self.buckets = buckets        # int32 array, 32 filter bits per entry
        self.num_hashes = num_hashes  # number of hash probes (k) per key
        self.n = len(buckets)
        self.m = self.n * 32          # total number of filter bits
        # scratch buffer reused by every test()/add() call
        self._locations = np.zeros(self.num_hashes, dtype='uint32')

    def _calculate_locations(self, key):
        # key must already be a byte sequence (see _calculate_key)
        return bf_calculate_locations(self._locations, self.m, key)

    def _calculate_key(self, key):
        # numba iterates directly over the utf-8 bytes of the string
        bkey = key.encode()
        #bkey = np.frombuffer(bkey, dtype='uint8')
        return bkey

    def test(self, key):
        """Return True if ``key`` is possibly present (may be a false positive)."""
        key = self._calculate_key(key)
        l = self._calculate_locations(key)
        return bf_test(l, self.buckets)

    def __contains__(self, key):
        return self.test(key)

    def add(self, key):
        """Add string ``key`` to the filter."""
        key = self._calculate_key(key)
        l = self._calculate_locations(key)
        bf_add(l, self.buckets, key)

    def union(self, other):
        """Merge ``other`` into this filter IN PLACE and return self.

        Both filters must have been created with the same number of
        hashes (and therefore the same bucket count).
        """
        assert self.num_hashes == other.num_hashes
        self.buckets = buckets_union(self.buckets, other.buckets)
        return self

    def save(self, file):
        """Serialize this filter to ``file`` (see module-level :func:`save`)."""
        save(self.num_hashes, self.buckets, file)


def save(num_hashes, buckets, file):
    """Write a filter to ``file`` as JSON with base64-encoded int32 buckets."""
    buckets = np.array(buckets, dtype='int32')
    b64 = base64.b64encode(buckets.tobytes()).decode()

    d = dict(num_hashes=num_hashes, buckets=b64)

    with open(file, 'w') as f_out:
        json.dump(d, f_out)


def load(file):
    """Load a filter written by :func:`save`; the result is fully mutable.

    Bug fix: ``np.frombuffer`` over the decoded bytes yields a READ-ONLY
    array, which made ``add()``/``union()`` on a loaded filter raise
    ``ValueError: assignment destination is read-only``.  Copying the
    buckets produces a writable array.
    """
    with open(file, 'r') as f_in:
        d = json.load(f_in)

    raw = base64.b64decode(d['buckets'])
    buckets = np.frombuffer(raw, dtype='int32').copy()

    return BloomFilter(d['num_hashes'], buckets)
#print "Number of 0 bits:", zeroBits - print("Number of Filter Bits:", f.num_bits) - print("Number of slices:", f.num_slices) - print("Bits per slice:", f.bits_per_slice) - print("------") - print("Fraction of 1 bits at capacity: {:5.3f}".format( - oneBits / float(f.num_bits))) - # Look for false positives and measure the actual fp rate - trials = f.capacity - fp = 0 - start = time.time() - for i in range_fn(f.capacity, f.capacity + trials + 1): - if i in f: - fp += 1 - end = time.time() - print(("{:5.3f} seconds to check false positives, " - "{:10.2f} checks/second".format(end - start, trials / (end - start)))) - print("Requested FP rate: {:2.4f}".format(request_error_rate)) - print("Experimental false positive rate: {:2.4f}".format(fp / float(trials))) - # Compute theoretical fp max (Goel/Gupta) - k = f.num_slices - m = f.num_bits - n = f.capacity - fp_theory = math.pow((1 - math.exp(-k * (n + 0.5) / (m - 1))), k) - print("Projected FP rate (Goel/Gupta): {:2.6f}".format(fp_theory)) - -if __name__ == '__main__' : - status = main() - sys.exit(status) diff --git a/pybloom/pybloom.py b/pybloom/pybloom.py deleted file mode 100644 index beeefe4..0000000 --- a/pybloom/pybloom.py +++ /dev/null @@ -1,436 +0,0 @@ -# -*- encoding: utf-8 -*- -"""This module implements a bloom filter probabilistic data structure and -an a Scalable Bloom Filter that grows in size as your add more items to it -without increasing the false positive error_rate. - -Requires the bitarray library: http://pypi.python.org/pypi/bitarray/ - - >>> from pybloom import BloomFilter - >>> f = BloomFilter(capacity=10000, error_rate=0.001) - >>> for i in range_fn(0, f.capacity): - ... _ = f.add(i) - ... 
- >>> 0 in f - True - >>> f.capacity in f - False - >>> len(f) <= f.capacity - True - >>> (1.0 - (len(f) / float(f.capacity))) <= f.error_rate + 2e-18 - True - - >>> from pybloom import ScalableBloomFilter - >>> sbf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH) - >>> count = 10000 - >>> for i in range_fn(0, count): - ... _ = sbf.add(i) - ... - >>> sbf.capacity > count - True - >>> len(sbf) <= count - True - >>> (1.0 - (len(sbf) / float(count))) <= sbf.error_rate + 2e-18 - True - -""" -from __future__ import absolute_import -import math -import hashlib -from pybloom.utils import range_fn, is_string_io, running_python_3 -from struct import unpack, pack, calcsize - -try: - import bitarray -except ImportError: - raise ImportError('pybloom requires bitarray >= 0.3.4') - -__version__ = '2.0' -__author__ = "Jay Baird , Bob Ippolito ,\ - Marius Eriksen ,\ - Alex Brasetvik ,\ - Matt Bachmann ,\ - " - -def make_hashfuncs(num_slices, num_bits): - if num_bits >= (1 << 31): - fmt_code, chunk_size = 'Q', 8 - elif num_bits >= (1 << 15): - fmt_code, chunk_size = 'I', 4 - else: - fmt_code, chunk_size = 'H', 2 - total_hash_bits = 8 * num_slices * chunk_size - if total_hash_bits > 384: - hashfn = hashlib.sha512 - elif total_hash_bits > 256: - hashfn = hashlib.sha384 - elif total_hash_bits > 160: - hashfn = hashlib.sha256 - elif total_hash_bits > 128: - hashfn = hashlib.sha1 - else: - hashfn = hashlib.md5 - fmt = fmt_code * (hashfn().digest_size // chunk_size) - num_salts, extra = divmod(num_slices, len(fmt)) - if extra: - num_salts += 1 - salts = tuple(hashfn(hashfn(pack('I', i)).digest()) for i in range_fn(num_salts)) - def _make_hashfuncs(key): - if running_python_3: - if isinstance(key, str): - key = key.encode('utf-8') - else: - key = str(key).encode('utf-8') - else: - if isinstance(key, unicode): - key = key.encode('utf-8') - else: - key = str(key) - i = 0 - for salt in salts: - h = salt.copy() - h.update(key) - for uint in unpack(fmt, h.digest()): - yield 
uint % num_bits - i += 1 - if i >= num_slices: - return - - return _make_hashfuncs - - -class BloomFilter(object): - FILE_FMT = b'>> b = BloomFilter(capacity=100000, error_rate=0.001) - >>> b.add("test") - False - >>> "test" in b - True - - """ - if not (0 < error_rate < 1): - raise ValueError("Error_Rate must be between 0 and 1.") - if not capacity > 0: - raise ValueError("Capacity must be > 0") - # given M = num_bits, k = num_slices, P = error_rate, n = capacity - # k = log2(1/P) - # solving for m = bits_per_slice - # n ~= M * ((ln(2) ** 2) / abs(ln(P))) - # n ~= (k * m) * ((ln(2) ** 2) / abs(ln(P))) - # m ~= n * abs(ln(P)) / (k * (ln(2) ** 2)) - num_slices = int(math.ceil(math.log(1.0 / error_rate, 2))) - bits_per_slice = int(math.ceil( - (capacity * abs(math.log(error_rate))) / - (num_slices * (math.log(2) ** 2)))) - self._setup(error_rate, num_slices, bits_per_slice, capacity, 0) - self.bitarray = bitarray.bitarray(self.num_bits, endian='little') - self.bitarray.setall(False) - - def _setup(self, error_rate, num_slices, bits_per_slice, capacity, count): - self.error_rate = error_rate - self.num_slices = num_slices - self.bits_per_slice = bits_per_slice - self.capacity = capacity - self.num_bits = num_slices * bits_per_slice - self.count = count - self.make_hashes = make_hashfuncs(self.num_slices, self.bits_per_slice) - - def __contains__(self, key): - """Tests a key's membership in this bloom filter. - - >>> b = BloomFilter(capacity=100) - >>> b.add("hello") - False - >>> "hello" in b - True - - """ - bits_per_slice = self.bits_per_slice - bitarray = self.bitarray - hashes = self.make_hashes(key) - offset = 0 - for k in hashes: - if not bitarray[offset + k]: - return False - offset += bits_per_slice - return True - - def __len__(self): - """Return the number of keys stored by this bloom filter.""" - return self.count - - def add(self, key, skip_check=False): - """ Adds a key to this bloom filter. If the key already exists in this - filter it will return True. 
Otherwise False. - - >>> b = BloomFilter(capacity=100) - >>> b.add("hello") - False - >>> b.add("hello") - True - >>> b.count - 1 - - """ - bitarray = self.bitarray - bits_per_slice = self.bits_per_slice - hashes = self.make_hashes(key) - found_all_bits = True - if self.count > self.capacity: - raise IndexError("BloomFilter is at capacity") - offset = 0 - for k in hashes: - if not skip_check and found_all_bits and not bitarray[offset + k]: - found_all_bits = False - self.bitarray[offset + k] = True - offset += bits_per_slice - - if skip_check: - self.count += 1 - return False - elif not found_all_bits: - self.count += 1 - return False - else: - return True - - def copy(self): - """Return a copy of this bloom filter. - """ - new_filter = BloomFilter(self.capacity, self.error_rate) - new_filter.bitarray = self.bitarray.copy() - return new_filter - - def union(self, other): - """ Calculates the union of the two underlying bitarrays and returns - a new bloom filter object.""" - if self.capacity != other.capacity or \ - self.error_rate != other.error_rate: - raise ValueError("Unioning filters requires both filters to have \ -both the same capacity and error rate") - new_bloom = self.copy() - new_bloom.bitarray = new_bloom.bitarray | other.bitarray - return new_bloom - - def __or__(self, other): - return self.union(other) - - def intersection(self, other): - """ Calculates the intersection of the two underlying bitarrays and returns - a new bloom filter object.""" - if self.capacity != other.capacity or \ - self.error_rate != other.error_rate: - raise ValueError("Intersecting filters requires both filters to \ -have equal capacity and error rate") - new_bloom = self.copy() - new_bloom.bitarray = new_bloom.bitarray & other.bitarray - return new_bloom - - def __and__(self, other): - return self.intersection(other) - - def tofile(self, f): - """Write the bloom filter to file object `f'. Underlying bits - are written as machine values. 
This is much more space - efficient than pickling the object.""" - f.write(pack(self.FILE_FMT, self.error_rate, self.num_slices, - self.bits_per_slice, self.capacity, self.count)) - (f.write(self.bitarray.tobytes()) if is_string_io(f) - else self.bitarray.tofile(f)) - - @classmethod - def fromfile(cls, f, n=-1): - """Read a bloom filter from file-object `f' serialized with - ``BloomFilter.tofile''. If `n' > 0 read only so many bytes.""" - headerlen = calcsize(cls.FILE_FMT) - - if 0 < n < headerlen: - raise ValueError('n too small!') - - filter = cls(1) # Bogus instantiation, we will `_setup'. - filter._setup(*unpack(cls.FILE_FMT, f.read(headerlen))) - filter.bitarray = bitarray.bitarray(endian='little') - if n > 0: - (filter.bitarray.frombytes(f.read(n-headerlen)) if is_string_io(f) - else filter.bitarray.fromfile(f, n - headerlen)) - else: - (filter.bitarray.frombytes(f.read()) if is_string_io(f) - else filter.bitarray.fromfile(f)) - if filter.num_bits != filter.bitarray.length() and \ - (filter.num_bits + (8 - filter.num_bits % 8) - != filter.bitarray.length()): - raise ValueError('Bit length mismatch!') - - return filter - - def __getstate__(self): - d = self.__dict__.copy() - del d['make_hashes'] - return d - - def __setstate__(self, d): - self.__dict__.update(d) - self.make_hashes = make_hashfuncs(self.num_slices, self.bits_per_slice) - -class ScalableBloomFilter(object): - SMALL_SET_GROWTH = 2 # slower, but takes up less memory - LARGE_SET_GROWTH = 4 # faster, but takes up more memory faster - FILE_FMT = '>> b = ScalableBloomFilter(initial_capacity=512, error_rate=0.001, \ - mode=ScalableBloomFilter.SMALL_SET_GROWTH) - >>> b.add("test") - False - >>> "test" in b - True - >>> unicode_string = u'ยก' - >>> b.add(unicode_string) - False - >>> unicode_string in b - True - """ - if not error_rate or error_rate < 0: - raise ValueError("Error_Rate must be a decimal less than 0.") - self._setup(mode, 0.9, initial_capacity, error_rate) - self.filters = [] - - def 
_setup(self, mode, ratio, initial_capacity, error_rate): - self.scale = mode - self.ratio = ratio - self.initial_capacity = initial_capacity - self.error_rate = error_rate - - def __contains__(self, key): - """Tests a key's membership in this bloom filter. - - >>> b = ScalableBloomFilter(initial_capacity=100, error_rate=0.001, \ - mode=ScalableBloomFilter.SMALL_SET_GROWTH) - >>> b.add("hello") - False - >>> "hello" in b - True - - """ - for f in reversed(self.filters): - if key in f: - return True - return False - - def add(self, key): - """Adds a key to this bloom filter. - If the key already exists in this filter it will return True. - Otherwise False. - - >>> b = ScalableBloomFilter(initial_capacity=100, error_rate=0.001, \ - mode=ScalableBloomFilter.SMALL_SET_GROWTH) - >>> b.add("hello") - False - >>> b.add("hello") - True - - """ - if key in self: - return True - if not self.filters: - filter = BloomFilter( - capacity=self.initial_capacity, - error_rate=self.error_rate * (1.0 - self.ratio)) - self.filters.append(filter) - else: - filter = self.filters[-1] - if filter.count >= filter.capacity: - filter = BloomFilter( - capacity=filter.capacity * self.scale, - error_rate=filter.error_rate * self.ratio) - self.filters.append(filter) - filter.add(key, skip_check=True) - return False - - @property - def capacity(self): - """Returns the total capacity for all filters in this SBF""" - return sum(f.capacity for f in self.filters) - - @property - def count(self): - return len(self) - - def tofile(self, f): - """Serialize this ScalableBloomFilter into the file-object - `f'.""" - f.write(pack(self.FILE_FMT, self.scale, self.ratio, - self.initial_capacity, self.error_rate)) - - # Write #-of-filters - f.write(pack(b' 0: - # Then each filter directly, with a header describing - # their lengths. - headerpos = f.tell() - headerfmt = b'<' + b'Q'*(len(self.filters)) - f.write(b'.' 
* calcsize(headerfmt)) - filter_sizes = [] - for filter in self.filters: - begin = f.tell() - filter.tofile(f) - filter_sizes.append(f.tell() - begin) - - f.seek(headerpos) - f.write(pack(headerfmt, *filter_sizes)) - - @classmethod - def fromfile(cls, f): - """Deserialize the ScalableBloomFilter in file object `f'.""" - filter = cls() - filter._setup(*unpack(cls.FILE_FMT, f.read(calcsize(cls.FILE_FMT)))) - nfilters, = unpack(b' 0: - header_fmt = b'<' + b'Q'*nfilters - bytes = f.read(calcsize(header_fmt)) - filter_lengths = unpack(header_fmt, bytes) - for fl in filter_lengths: - filter.filters.append(BloomFilter.fromfile(f, fl)) - else: - filter.filters = [] - - return filter - - def __len__(self): - """Returns the total number of elements stored in this SBF""" - return sum(f.count for f in self.filters) - - -if __name__ == "__main__": - import doctest - doctest.testmod() diff --git a/pybloom/tests.py b/pybloom/tests.py deleted file mode 100644 index 13d9b7d..0000000 --- a/pybloom/tests.py +++ /dev/null @@ -1,112 +0,0 @@ -from __future__ import absolute_import -from pybloom.pybloom import BloomFilter, ScalableBloomFilter -from pybloom.utils import running_python_3, range_fn - -try: - from StringIO import StringIO - import cStringIO -except ImportError: - from io import BytesIO as StringIO -import os -import doctest -import unittest -import random -import tempfile -from unittest import TestSuite - -def additional_tests(): - proj_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) - readme_fn = os.path.join(proj_dir, 'README.txt') - suite = TestSuite([doctest.DocTestSuite('pybloom.pybloom')]) - if os.path.exists(readme_fn): - suite.addTest(doctest.DocFileSuite(readme_fn, module_relative=False)) - return suite - -class TestUnionIntersection(unittest.TestCase): - def test_union(self): - bloom_one = BloomFilter(100, 0.001) - bloom_two = BloomFilter(100, 0.001) - chars = [chr(i) for i in range_fn(97, 123)] - for char in chars[int(len(chars)/2):]: - 
bloom_one.add(char) - for char in chars[:int(len(chars)/2)]: - bloom_two.add(char) - new_bloom = bloom_one.union(bloom_two) - for char in chars: - self.assertTrue(char in new_bloom) - - def test_intersection(self): - bloom_one = BloomFilter(100, 0.001) - bloom_two = BloomFilter(100, 0.001) - chars = [chr(i) for i in range_fn(97, 123)] - for char in chars: - bloom_one.add(char) - for char in chars[:int(len(chars)/2)]: - bloom_two.add(char) - new_bloom = bloom_one.intersection(bloom_two) - for char in chars[:int(len(chars)/2)]: - self.assertTrue(char in new_bloom) - for char in chars[int(len(chars)/2):]: - self.assertTrue(char not in new_bloom) - - def test_intersection_capacity_fail(self): - bloom_one = BloomFilter(1000, 0.001) - bloom_two = BloomFilter(100, 0.001) - def _run(): - new_bloom = bloom_one.intersection(bloom_two) - self.assertRaises(ValueError, _run) - - def test_union_capacity_fail(self): - bloom_one = BloomFilter(1000, 0.001) - bloom_two = BloomFilter(100, 0.001) - def _run(): - new_bloom = bloom_one.union(bloom_two) - self.assertRaises(ValueError, _run) - - def test_intersection_k_fail(self): - bloom_one = BloomFilter(100, 0.001) - bloom_two = BloomFilter(100, 0.01) - def _run(): - new_bloom = bloom_one.intersection(bloom_two) - self.assertRaises(ValueError, _run) - - def test_union_k_fail(self): - bloom_one = BloomFilter(100, 0.01) - bloom_two = BloomFilter(100, 0.001) - def _run(): - new_bloom = bloom_one.union(bloom_two) - self.assertRaises(ValueError, _run) - -class Serialization(unittest.TestCase): - SIZE = 12345 - EXPECTED = set([random.randint(0, 10000100) for _ in range_fn(SIZE)]) - - def test_serialization(self): - for klass, args in [(BloomFilter, (self.SIZE,)), - (ScalableBloomFilter, ())]: - filter = klass(*args) - for item in self.EXPECTED: - filter.add(item) - - f = tempfile.TemporaryFile() - filter.tofile(f) - stringio = StringIO() - filter.tofile(stringio) - streams_to_test = [f, stringio] - if not running_python_3: - cstringio = 
cStringIO.StringIO() - filter.tofile(cstringio) - streams_to_test.append(cstringio) - - del filter - - for stream in streams_to_test: - stream.seek(0) - filter = klass.fromfile(stream) - for item in self.EXPECTED: - self.assertTrue(item in filter) - del(filter) - stream.close() - -if __name__ == '__main__': - unittest.main() diff --git a/pybloom/utils.py b/pybloom/utils.py deleted file mode 100644 index 535d77b..0000000 --- a/pybloom/utils.py +++ /dev/null @@ -1,24 +0,0 @@ -import sys -try: - import StringIO - import cStringIO -except ImportError: - from io import BytesIO - -running_python_3 = sys.version_info[0] == 3 - - -def range_fn(*args): - if running_python_3: - return range(*args) - else: - return xrange(*args) - - -def is_string_io(instance): - if running_python_3: - return isinstance(instance, BytesIO) - else: - return isinstance(instance, (StringIO.StringIO, - cStringIO.InputType, - cStringIO.OutputType)) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 7c8e633..ba5c2fc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,2 @@ -bitarray>=0.3.4 \ No newline at end of file +numpy>=1.14.0 +numba==0.35.0+10.g143f70e90 \ No newline at end of file diff --git a/setup.py b/setup.py index 2d07d1e..fa1b387 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ #!/usr/bin/env python from setuptools import setup -VERSION = '2.0.0' +VERSION = '0.0.1' DESCRIPTION = "PyBloom: A Probabilistic data structure" LONG_DESCRIPTION = """ pybloom is a Python implementation of the bloom filter probabilistic data @@ -21,8 +21,9 @@ Topic :: Software Development :: Libraries :: Python Modules """.splitlines())) + setup( - name="pybloom", + name="fvnbloom", version=VERSION, description=DESCRIPTION, long_description=LONG_DESCRIPTION, @@ -31,11 +32,11 @@ 'probabilistic', 'set'), author="Jay Baird", author_email="jay.baird@me.com", - url="http://github.com/jaybaird/python-bloomfilter/", + url="https://github.com/simplaex/python-bloomfilter/", 
import os
import tempfile
import uuid

import fvnbloom


def _random_hex_keys(count):
    """Return ``count`` random 32-character hex string keys."""
    return [uuid.uuid4().hex for _ in range(count)]


def test_add_test():
    """Keys that were added must always test positive (no false negatives)."""
    total = 1000
    bf = fvnbloom.create_empty(capacity=total, error_rate=0.01)

    uuids = _random_hex_keys(total)
    for did in uuids:
        bf.add(did)

    for did in uuids:
        assert did in bf


def test_union():
    """After union, every key from either source filter must test positive."""
    total = 1000

    bf1 = fvnbloom.create_empty(capacity=2 * total, error_rate=0.01)
    strings1 = _random_hex_keys(total)
    for did in strings1:
        bf1.add(did)

    bf2 = fvnbloom.create_empty(capacity=2 * total, error_rate=0.01)
    strings2 = _random_hex_keys(total)
    for did in strings2:
        bf2.add(did)

    bf = bf1.union(bf2)

    for did in strings1 + strings2:
        assert did in bf


def test_save_load():
    """A saved-then-loaded filter must still contain every added key.

    Fixes two defects of the original: the bare ``except: assert False``
    swallowed the real traceback, and the file was written into the CWD
    with a hardcoded name; we now use a tempfile and plain try/finally.
    """
    total = 1000
    bf = fvnbloom.create_empty(capacity=total, error_rate=0.01)

    uuids = _random_hex_keys(total)
    for did in uuids:
        bf.add(did)

    fd, path = tempfile.mkstemp(suffix='.json.bloom')
    os.close(fd)
    try:
        bf.save(path)
        bf_loaded = fvnbloom.load(path)

        for did in uuids:
            assert did in bf_loaded
    finally:
        os.remove(path)


def test_error_rate():
    """Observed false-positive rate should track the requested 1% closely."""
    total = 10000
    bf = fvnbloom.create_empty(capacity=total, error_rate=0.01)

    for did in _random_hex_keys(total):
        bf.add(did)

    fps = 0
    for _ in range(total):
        if uuid.uuid4().hex in bf:
            fps += 1

    fpr = fps / total

    assert abs(fpr - 0.01) < 0.005