The cool thing about this class is that in contrast to the C++ contai-
ners which need a hasher it needs multiple hashers as any Bloom filter.
The hashers are passed as a C++20 span and are internally stored as a
vector.
The bitmap is stored as a vector of size_t's. If Atomic is true they're referred through an atomic_ref, i.e. the Bloom filter becomes lock-free
with that.
#include <random>
#include <span>
#include <utility>
#include <cassert>
#include <atomic>
#include "pct.hpp"
#include "defer.hpp"
#undef min
#undef max
#if defined(__clang__)
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wunqualified-std-cast-call"
#endif
template<typename Key, typename Hasher, bool Atomic = false>
struct bloom
{
static constexpr float DefaultFpr = (float)1.0_permill;
struct match_result { bool matched; size_t hash; };
bloom() = default;
bloom( size_t nKeys, std::span<Hasher> hashers, float fpr = DefaultFpr );
bloom( const bloom & ) = default;
bloom( bloom && ) noexcept;
bloom &operator =( const bloom & ) = default;
bloom &operator =( bloom && ) noexcept;
size_t add( const Key &key ) noexcept;
match_result match( const Key &key ) const noexcept;
void resize( size_t nKeys, std::span<Hasher> hashers, float fpr = DefaultFpr );
size_t n_bits() const noexcept;
size_t n_keys() const noexcept;
float fpr() const noexcept;
float usage() const noexcept;
static size_t n_bits( size_t nKeys, size_t nHashes, float fpr = DefaultFpr ) noexcept;
static float fpr( size_t nBits, size_t nKeys, size_t nHashes ) noexcept;
private:
static constexpr size_t
StSize = sizeof( size_t ),
StBits = 8 * StSize;
std::vector<size_t> m_bits;
size_t m_nKeys = 0;
std::vector<Hasher> m_hashers;
};
template<typename Key, typename Hasher, bool Atomic>
bloom<Key, Hasher, Atomic>::bloom::bloom( size_t nKeys,
std::span<Hasher> hashers, float fpr )
{
resize( nKeys, hashers, fpr );
}
template<typename Key, typename Hasher, bool Atomic>
bloom<Key, Hasher, Atomic>::bloom::bloom( bloom &&other ) noexcept
{
other.m_nKeys = 0;
}
template<typename Key, typename Hasher, bool Atomic>
bloom<Key, Hasher, Atomic> &bloom<Key, Hasher, Atomic>::operator =(
bloom &&other ) noexcept
{
using namespace std;
m_bits = move( other.m_bits );
m_nKeys = other.m_nKeys;
other.m_nKeys = 0;
m_hashers = ( other.m_hashers );
}
template<typename Key, typename Hasher, bool Atomic>
size_t bloom<Key, Hasher, Atomic>::add( const Key &key ) noexcept
{
using namespace std;
size_t hMask = n_bits() - 1, sumHash = 0;
for( const Hasher &hasher : m_hashers ) [[likely]]
{
size_t
hash = hasher( key ),
mask = (size_t)1 << (hash % StBits),
&bits = m_bits[(hash & hMask) / StBits];
if constexpr( Atomic )
atomic_ref( bits ).fetch_or( mask, memory_order_relaxed );
else
bits |= mask;
sumHash ^= hash;
}
return sumHash;
}
template<typename Key, typename Hasher, bool Atomic>
auto bloom<Key, Hasher, Atomic>::match( const Key &key ) const noexcept
match_result
{
using namespace std;
size_t hMask = n_bits() - 1, sumHash = 0;
for( const Hasher &hasher : m_hashers ) [[unlikely]]
{
size_t
hash = hasher( key ),
mask = (size_t)1 << (hash % StBits),
word;
const size_t &bits = m_bits[(hash & hMask) / StBits];
if constexpr( Atomic )
word = atomic_ref( bits ).load( memory_order_relaxed );
else
word = bits;
if( !(word & mask) ) [[likely]]
return match_result( false, 0 );
sumHash ^= hash;
}
return match_result( true, sumHash );
}
template<typename Key, typename Hasher, bool Atomic>
void bloom<Key, Hasher, Atomic>::resize( size_t nKeys, std::span<Hasher> hashers, float fpr )
{
using namespace std;
defer clear( [&]
{
m_bits = vector<size_t>();
m_hashers = vector<Hasher>();
} );
// resize bitmap, thereby erasing everything
m_nKeys = 0;
size_t nBits = n_bits( nKeys, hashers.size(), fpr );
if( m_bits.size() * StBits != nBits )
m_bits = vector<size_t>( nBits / StBits );
else
if constexpr( Atomic )
for( size_t a = nBits / StBits; a--; )
atomic_ref( m_bits[a] ).store( 0, memory_order_relaxed );
else
fill( m_bits.begin(), m_bits.end(), 0 );
m_hashers = vector<Hasher>( move_iterator( hashers.begin() ), move_iterator( hashers.end() ) );
clear.disable();
}
template<typename Key, typename Hasher, bool Atomic>
size_t bloom<Key, Hasher, Atomic>::n_bits() const noexcept
{
return StBits * m_bits.size();
}
template<typename Key, typename Hasher, bool Atomic>
inline size_t bloom<Key, Hasher, Atomic>::n_keys() const noexcept
{
return m_nKeys;
}
template<typename Key, typename Hasher, bool Atomic>
float bloom<Key, Hasher, Atomic>::fpr() const noexcept
{
return fpr( n_bits(), m_nKeys, m_hashers.size() );
}
template<typename Key, typename Hasher, bool Atomic>
float bloom<Key, Hasher, Atomic>::usage() const noexcept
{
using namespace std;
size_t nSet = 0;
for( const size_t &word : m_bits )
if constexpr( Atomic )
nSet += popcount( atomic_ref( word ).load( memory_order_relaxed ) );
else
nSet += popcount( word );
return (float)((double)nSet / (double)n_bits());
}
template<typename Key, typename Hasher, bool Atomic>
size_t bloom<Key, Hasher, Atomic>::n_bits( size_t nKeys, size_t nHashes,
float fpr ) noexcept
{
using namespace std;
constexpr float MinFpr = (float)1.0_permill;
constexpr double MaxBits = numeric_limits<ptrdiff_t>::min();
fpr = !isnan( fpr ) && !isinf( fpr ) ? fpr : DefaultFpr;
fpr = max( min( fpr, 1.0f ), MinFpr );
nHashes += (size_t)!nHashes;
double
dKeys = (double)nKeys,
dHashes = (double)nHashes,
dBits;
dBits = 1.0 - pow( fpr, 1.0 / dHashes );
dBits = 1.0 - pow( dBits, 1.0 / (dHashes * dKeys) );
dBits = 1.0 / dBits;
assert(dBits >= 0.0);
size_t nBits = (size_t)min( dBits, MaxBits );
return bit_ceil( max( nBits, StBits ) );
}
template<typename Key, typename Hasher, bool Atomic>
float bloom<Key, Hasher, Atomic>::fpr( size_t nBits, size_t nKeys,
size_t nHashes ) noexcept
{
using namespace std;
constexpr size_t MaxBits = numeric_limits<ptrdiff_t>::min();
nBits = nBits < MaxBits ? bit_ceil( nBits ) : MaxBits;
double
dBits = (double)nBits,
dKeys = (double)nKeys,
dHashes = (double)nHashes,
fpr;
fpr = 1.0 - 1.0 / dBits;
fpr = 1.0 - pow( fpr, dHashes * dKeys );
return (float)pow( fpr, dHashes );
}
#if defined(__clang__)
#pragma clang diagnostic pop
#endif
--- PyGate Linux v1.5.14
* Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)