Forum: d0p3 BBS

A perfect generic bloom filter

From Bonita Montero@3:633/10 to All on Wed May 20 21:30:22 2026

The cool thing about this class is that in contrast to the C++ contai-
ners which need a hasher it needs multiple hashers as any Bloom filter.
The hashers are passed as a C++20 span and are internally stored as a
vector.
The bitmap is stored as a vector of size_t's. If Atomic is true they're referred through an atomic_ref, i.e. the Bloom filter becomes lock-free
with that.

#include <random>
#include <span>
#include <utility>
#include <cassert>
#include <atomic>
#include "pct.hpp"
#include "defer.hpp"
#undef min
#undef max

#if defined(__clang__)
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wunqualified-std-cast-call"
#endif

template<typename Key, typename Hasher, bool Atomic = false>
struct bloom
{
static constexpr float DefaultFpr = (float)1.0_permill;
struct match_result { bool matched; size_t hash; };
bloom() = default;
bloom( size_t nKeys, std::span<Hasher> hashers, float fpr = DefaultFpr );
bloom( const bloom & ) = default;
bloom( bloom && ) noexcept;
bloom &operator =( const bloom & ) = default;
bloom &operator =( bloom && ) noexcept;
size_t add( const Key &key ) noexcept;
match_result match( const Key &key ) const noexcept;
void resize( size_t nKeys, std::span<Hasher> hashers, float fpr = DefaultFpr );
size_t n_bits() const noexcept;
size_t n_keys() const noexcept;
float fpr() const noexcept;
float usage() const noexcept;
static size_t n_bits( size_t nKeys, size_t nHashes, float fpr = DefaultFpr ) noexcept;
static float fpr( size_t nBits, size_t nKeys, size_t nHashes ) noexcept;
private:
static constexpr size_t
StSize = sizeof( size_t ),
StBits = 8 * StSize;
std::vector<size_t> m_bits;
size_t m_nKeys = 0;
std::vector<Hasher> m_hashers;
};

template<typename Key, typename Hasher, bool Atomic>
bloom<Key, Hasher, Atomic>::bloom::bloom( size_t nKeys,
std::span<Hasher> hashers, float fpr )
{
resize( nKeys, hashers, fpr );
}

template<typename Key, typename Hasher, bool Atomic>
bloom<Key, Hasher, Atomic>::bloom::bloom( bloom &&other ) noexcept
{
other.m_nKeys = 0;
}

template<typename Key, typename Hasher, bool Atomic>
bloom<Key, Hasher, Atomic> &bloom<Key, Hasher, Atomic>::operator =(
bloom &&other ) noexcept
{
using namespace std;
m_bits = move( other.m_bits );
m_nKeys = other.m_nKeys;
other.m_nKeys = 0;
m_hashers = ( other.m_hashers );
}

template<typename Key, typename Hasher, bool Atomic>
size_t bloom<Key, Hasher, Atomic>::add( const Key &key ) noexcept
{
using namespace std;
size_t hMask = n_bits() - 1, sumHash = 0;
for( const Hasher &hasher : m_hashers ) [[likely]]
{
size_t
hash = hasher( key ),
mask = (size_t)1 << (hash % StBits),
&bits = m_bits[(hash & hMask) / StBits];
if constexpr( Atomic )
atomic_ref( bits ).fetch_or( mask, memory_order_relaxed );
else
bits |= mask;
sumHash ^= hash;
}
return sumHash;
}

template<typename Key, typename Hasher, bool Atomic>
auto bloom<Key, Hasher, Atomic>::match( const Key &key ) const noexcept

match_result

{
using namespace std;
size_t hMask = n_bits() - 1, sumHash = 0;
for( const Hasher &hasher : m_hashers ) [[unlikely]]
{
size_t
hash = hasher( key ),
mask = (size_t)1 << (hash % StBits),
word;
const size_t &bits = m_bits[(hash & hMask) / StBits];
if constexpr( Atomic )
word = atomic_ref( bits ).load( memory_order_relaxed );
else
word = bits;
if( !(word & mask) ) [[likely]]
return match_result( false, 0 );
sumHash ^= hash;
}
return match_result( true, sumHash );
}

template<typename Key, typename Hasher, bool Atomic>
void bloom<Key, Hasher, Atomic>::resize( size_t nKeys, std::span<Hasher> hashers, float fpr )
{
using namespace std;
defer clear( [&]
{
m_bits = vector<size_t>();
m_hashers = vector<Hasher>();
} );
// resize bitmap, thereby erasing everything
m_nKeys = 0;
size_t nBits = n_bits( nKeys, hashers.size(), fpr );
if( m_bits.size() * StBits != nBits )
m_bits = vector<size_t>( nBits / StBits );
else
if constexpr( Atomic )
for( size_t a = nBits / StBits; a--; )
atomic_ref( m_bits[a] ).store( 0, memory_order_relaxed );
else
fill( m_bits.begin(), m_bits.end(), 0 );
m_hashers = vector<Hasher>( move_iterator( hashers.begin() ), move_iterator( hashers.end() ) );
clear.disable();
}

template<typename Key, typename Hasher, bool Atomic>
size_t bloom<Key, Hasher, Atomic>::n_bits() const noexcept
{
return StBits * m_bits.size();
}

template<typename Key, typename Hasher, bool Atomic>
inline size_t bloom<Key, Hasher, Atomic>::n_keys() const noexcept
{
return m_nKeys;
}

template<typename Key, typename Hasher, bool Atomic>
float bloom<Key, Hasher, Atomic>::fpr() const noexcept
{
return fpr( n_bits(), m_nKeys, m_hashers.size() );
}

template<typename Key, typename Hasher, bool Atomic>
float bloom<Key, Hasher, Atomic>::usage() const noexcept
{
using namespace std;
size_t nSet = 0;
for( const size_t &word : m_bits )
if constexpr( Atomic )
nSet += popcount( atomic_ref( word ).load( memory_order_relaxed ) );
else
nSet += popcount( word );
return (float)((double)nSet / (double)n_bits());
}

template<typename Key, typename Hasher, bool Atomic>
size_t bloom<Key, Hasher, Atomic>::n_bits( size_t nKeys, size_t nHashes,
float fpr ) noexcept
{
using namespace std;
constexpr float MinFpr = (float)1.0_permill;
constexpr double MaxBits = numeric_limits<ptrdiff_t>::min();
fpr = !isnan( fpr ) && !isinf( fpr ) ? fpr : DefaultFpr;
fpr = max( min( fpr, 1.0f ), MinFpr );
nHashes += (size_t)!nHashes;
double
dKeys = (double)nKeys,
dHashes = (double)nHashes,
dBits;
dBits = 1.0 - pow( fpr, 1.0 / dHashes );
dBits = 1.0 - pow( dBits, 1.0 / (dHashes * dKeys) );
dBits = 1.0 / dBits;
assert(dBits >= 0.0);
size_t nBits = (size_t)min( dBits, MaxBits );
return bit_ceil( max( nBits, StBits ) );
}

template<typename Key, typename Hasher, bool Atomic>
float bloom<Key, Hasher, Atomic>::fpr( size_t nBits, size_t nKeys,
size_t nHashes ) noexcept
{
using namespace std;
constexpr size_t MaxBits = numeric_limits<ptrdiff_t>::min();
nBits = nBits < MaxBits ? bit_ceil( nBits ) : MaxBits;
double
dBits = (double)nBits,
dKeys = (double)nKeys,
dHashes = (double)nHashes,
fpr;
fpr = 1.0 - 1.0 / dBits;
fpr = 1.0 - pow( fpr, dHashes * dKeys );
return (float)pow( fpr, dHashes );
}

#if defined(__clang__)
#pragma clang diagnostic pop
#endif

--- PyGate Linux v1.5.14
* Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)

From Bonita Montero@3:633/10 to All on Thu May 21 09:46:30 2026

Am 20.05.2026 um 21:30 schrieb Bonita Montero:

template<typename Key, typename Hasher, bool Atomic>
size_t bloom<Key, Hasher, Atomic>::n_bits( size_t nKeys, size_t nHashes, float fpr ) noexcept
{
��using namespace std;
��constexpr float MinFpr = (float)1.0_permill;
��constexpr double MaxBits = numeric_limits<ptrdiff_t>::min();

constexpr double MaxBits = (double)(size_t)numeric_limits<ptrdiff_t>::min();

--- PyGate Linux v1.5.14
* Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)

From boltar@3:633/10 to All on Thu May 21 08:45:37 2026

On Wed, 20 May 2026 21:30:22 +0200
Bonita Montero <Bonita.Montero@gmail.com> gabbled:

The cool thing about this class is that in contrast to the C++ contai-
ners which need a hasher it needs multiple hashers as any Bloom filter.

Had to google wtf a bloom filter is. Seems like a remarkably complicated
way to partially replace a hash lookup.

--- PyGate Linux v1.5.14
* Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)

From Bonita Montero@3:633/10 to All on Fri May 22 08:34:01 2026

Am 21.05.2026 um 10:45 schrieb boltar@caprica.universe:

Had to google wtf a bloom filter is. Seems like a remarkably complicated
way to partially replace a hash lookup.

But bloom filter occupy much less memory. And checking for a true
negative, which is the case that mostly applies when using Bloom
filters, is much more efficient since if the false positive rate
is chosen properly you check only a single bit for that in most
cases.
And I think a hash set is much more complicated.

--- PyGate Linux v1.5.14
* Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)

From boltar@3:633/10 to All on Fri May 22 09:15:06 2026

On Fri, 22 May 2026 08:34:01 +0200
Bonita Montero <Bonita.Montero@gmail.com> gabbled:

Am 21.05.2026 um 10:45 schrieb boltar@caprica.universe:

Had to google wtf a bloom filter is. Seems like a remarkably complicated
way to partially replace a hash lookup.

But bloom filter occupy much less memory. And checking for a true
negative, which is the case that mostly applies when using Bloom
filters, is much more efficient since if the false positive rate
is chosen properly you check only a single bit for that in most
cases.
And I think a hash set is much more complicated.

Really? I think most competant programmers could write a hash table in a few hours, albeit perhaps a not very efficient one. I doubt many - and I include myself - could write this filter without constantly refering to the spec.
Also I suspect the use cases of something that doesn't give a definitive
yes or no is very limited.

--- PyGate Linux v1.5.14
* Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)

From Bonita Montero@3:633/10 to All on Fri May 22 11:34:59 2026

Am 22.05.2026 um 11:15 schrieb boltar@caprica.universe:

Really? I think most competant programmers could write a hash table in a
few hours, ...

Yes, because it's well known. But if you learned the basic principles
of Bloom filters from Wikipedia implementing that is simpler than a
hash set. You don't need any buckes, no compare function but just a
bunch of hashers (usually the same with different seeds).
That's nearly a magnitude simpler than a hash set.

I doubt many - and I include myself - could write this filter
without constantly refering to the spec.

That's not much different than with a Bloom filter.

Also I suspect the use cases of something that doesn't give a
definitive yes or no is very limited.

Of course, but where it applies it's much more efficient and a lot
more memory-saving.

--- PyGate Linux v1.5.14
* Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)

From red floyd@3:633/10 to All on Fri May 22 16:11:50 2026

On 5/21/2026 1:45 AM, boltar@caprica.universe wrote:

On Wed, 20 May 2026 21:30:22 +0200
Bonita Montero <Bonita.Montero@gmail.com> gabbled:

The cool thing about this class is that in contrast to the C++ contai-
ners which need a hasher it needs multiple hashers as any Bloom filter.

Had to google wtf a bloom filter is. Seems like a remarkably complicated
way to partially replace a hash lookup.

According to Wikipedia, it's for data sets that are too big for a hash
lookup.

--- PyGate Linux v1.5.14
* Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)

From boltar@3:633/10 to All on Sat May 23 08:51:32 2026

On Fri, 22 May 2026 16:11:50 -0700
red floyd <no.spam.here@its.invalid> gabbled:

On 5/21/2026 1:45 AM, boltar@caprica.universe wrote:

On Wed, 20 May 2026 21:30:22 +0200
Bonita Montero <Bonita.Montero@gmail.com> gabbled:

The cool thing about this class is that in contrast to the C++ contai-
ners which need a hasher it needs multiple hashers as any Bloom filter.

Had to google wtf a bloom filter is. Seems like a remarkably complicated
way to partially replace a hash lookup.

According to Wikipedia, it's for data sets that are too big for a hash >lookup.

That makes little sense. If you don't have enough memory for the hash table then you're certainly not going to have enough for the data itself unless
each data is shorter than the hash value.

--- PyGate Linux v1.5.14
* Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)

From Chris M. Thomasson@3:633/10 to All on Sun May 24 03:15:10 2026

On 5/23/2026 1:51 AM, boltar@caprica.universe wrote:

On Fri, 22 May 2026 16:11:50 -0700
red floyd <no.spam.here@its.invalid> gabbled:

On 5/21/2026 1:45 AM, boltar@caprica.universe wrote:

On Wed, 20 May 2026 21:30:22 +0200
Bonita Montero <Bonita.Montero@gmail.com> gabbled:

The cool thing about this class is that in contrast to the C++ contai- >>>> ners which need a hasher it needs multiple hashers as any Bloom filter. >>>

Had to google wtf a bloom filter is. Seems like a remarkably complicated >>> way to partially replace a hash lookup.

According to Wikipedia, it's for data sets that are too big for a hash
lookup.

That makes little sense. If you don't have enough memory for the hash table then you're certainly not going to have enough for the data itself unless each data is shorter than the hash value.

Think of a finite hash table that handles collisions by linking into a
stack, queue, or something... Or try a rehash with a state in the node...

--- PyGate Linux v1.5.14
* Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)

From Bonita Montero@3:633/10 to All on Sun May 24 18:35:01 2026

Am 23.05.2026 um 10:51 schrieb boltar@caprica.universe:

That makes little sense. If you don't have enough memory for the hash

table

then you're certainly not going to have enough for the data itself unless each data is shorter than the hash value.

Bloom filters are used to accelerate lookups. They are employed in scenarios where there is a very high volume of true negatives, and a comparatively
lower volume of true or false positives. In the ast majority of cases,
the check of the very first bit will fail?provided, of course, that the false-positive rate has been set sufficiently low ? which is naturally
much faster than performing a full lookup in a hash table.
In the event of a true or false positive result, however, one must still perform the lookup in the hash table; consequently, this approach is
primarily beneficial in situations characterized by a predominance of
true negatives.
Virtually all database engines utilize Bloom filters in appropriate
contexts to minimize costly I/O operations.

--- PyGate Linux v1.5.14
* Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)

Who's Online
Recent Visitors
- RufusT
  Tue May 19 12:56:22 2026
  from Dallas, TX via RLogin
- Guest
  Tue May 26 22:56:58 2026
  from Melbourne via Telnet
- Guest
  Sun Jun 14 06:31:07 2026
  from Ny via Telnet
- Frostydev
  Mon Jul 13 05:56:46 2026
  from Maryland via Raw

System Info

Sysop:	Tetrazocine
Location:	Melbourne, VIC, Australia
Users:	12
Nodes:	8 (0 / 8)
Uptime:	03:08:20
Calls:	220
Files:	21,513
Messages:	82,368

A perfect generic bloom filter

Who's Online

Recent Visitors

System Info