• A perfect generic bloom filter

    From Bonita Montero@3:633/10 to All on Wed May 20 21:30:22 2026
    The cool thing about this class is that in contrast to the C++ contai-
    ners which need a hasher it needs multiple hashers as any Bloom filter.
    The hashers are passed as a C++20 span and are internally stored as a
    vector.
    The bitmap is stored as a vector of size_t's. If Atomic is true they're referred through an atomic_ref, i.e. the Bloom filter becomes lock-free
    with that.

    #include <random>
    #include <span>
    #include <utility>
    #include <cassert>
    #include <atomic>
    #include "pct.hpp"
    #include "defer.hpp"
    #undef min
    #undef max

    #if defined(__clang__)
    #pragma clang diagnostic push
    #pragma clang diagnostic ignored "-Wunqualified-std-cast-call"
    #endif

    template<typename Key, typename Hasher, bool Atomic = false>
    struct bloom
    {
    static constexpr float DefaultFpr = (float)1.0_permill;
    struct match_result { bool matched; size_t hash; };
    bloom() = default;
    bloom( size_t nKeys, std::span<Hasher> hashers, float fpr = DefaultFpr );
    bloom( const bloom & ) = default;
    bloom( bloom && ) noexcept;
    bloom &operator =( const bloom & ) = default;
    bloom &operator =( bloom && ) noexcept;
    size_t add( const Key &key ) noexcept;
    match_result match( const Key &key ) const noexcept;
    void resize( size_t nKeys, std::span<Hasher> hashers, float fpr = DefaultFpr );
    size_t n_bits() const noexcept;
    size_t n_keys() const noexcept;
    float fpr() const noexcept;
    float usage() const noexcept;
    static size_t n_bits( size_t nKeys, size_t nHashes, float fpr = DefaultFpr ) noexcept;
    static float fpr( size_t nBits, size_t nKeys, size_t nHashes ) noexcept;
    private:
    static constexpr size_t
    StSize = sizeof( size_t ),
    StBits = 8 * StSize;
    std::vector<size_t> m_bits;
    size_t m_nKeys = 0;
    std::vector<Hasher> m_hashers;
    };

    template<typename Key, typename Hasher, bool Atomic>
    bloom<Key, Hasher, Atomic>::bloom::bloom( size_t nKeys,
    std::span<Hasher> hashers, float fpr )
    {
    resize( nKeys, hashers, fpr );
    }

    template<typename Key, typename Hasher, bool Atomic>
    bloom<Key, Hasher, Atomic>::bloom::bloom( bloom &&other ) noexcept
    {
    other.m_nKeys = 0;
    }

    template<typename Key, typename Hasher, bool Atomic>
    bloom<Key, Hasher, Atomic> &bloom<Key, Hasher, Atomic>::operator =(
    bloom &&other ) noexcept
    {
    using namespace std;
    m_bits = move( other.m_bits );
    m_nKeys = other.m_nKeys;
    other.m_nKeys = 0;
    m_hashers = ( other.m_hashers );
    }

    template<typename Key, typename Hasher, bool Atomic>
    size_t bloom<Key, Hasher, Atomic>::add( const Key &key ) noexcept
    {
    using namespace std;
    size_t hMask = n_bits() - 1, sumHash = 0;
    for( const Hasher &hasher : m_hashers ) [[likely]]
    {
    size_t
    hash = hasher( key ),
    mask = (size_t)1 << (hash % StBits),
    &bits = m_bits[(hash & hMask) / StBits];
    if constexpr( Atomic )
    atomic_ref( bits ).fetch_or( mask, memory_order_relaxed );
    else
    bits |= mask;
    sumHash ^= hash;
    }
    return sumHash;
    }

    template<typename Key, typename Hasher, bool Atomic>
    auto bloom<Key, Hasher, Atomic>::match( const Key &key ) const noexcept
    match_result
    {
    using namespace std;
    size_t hMask = n_bits() - 1, sumHash = 0;
    for( const Hasher &hasher : m_hashers ) [[unlikely]]
    {
    size_t
    hash = hasher( key ),
    mask = (size_t)1 << (hash % StBits),
    word;
    const size_t &bits = m_bits[(hash & hMask) / StBits];
    if constexpr( Atomic )
    word = atomic_ref( bits ).load( memory_order_relaxed );
    else
    word = bits;
    if( !(word & mask) ) [[likely]]
    return match_result( false, 0 );
    sumHash ^= hash;
    }
    return match_result( true, sumHash );
    }

    template<typename Key, typename Hasher, bool Atomic>
    void bloom<Key, Hasher, Atomic>::resize( size_t nKeys, std::span<Hasher> hashers, float fpr )
    {
    using namespace std;
    defer clear( [&]
    {
    m_bits = vector<size_t>();
    m_hashers = vector<Hasher>();
    } );
    // resize bitmap, thereby erasing everything
    m_nKeys = 0;
    size_t nBits = n_bits( nKeys, hashers.size(), fpr );
    if( m_bits.size() * StBits != nBits )
    m_bits = vector<size_t>( nBits / StBits );
    else
    if constexpr( Atomic )
    for( size_t a = nBits / StBits; a--; )
    atomic_ref( m_bits[a] ).store( 0, memory_order_relaxed );
    else
    fill( m_bits.begin(), m_bits.end(), 0 );
    m_hashers = vector<Hasher>( move_iterator( hashers.begin() ), move_iterator( hashers.end() ) );
    clear.disable();
    }

    template<typename Key, typename Hasher, bool Atomic>
    size_t bloom<Key, Hasher, Atomic>::n_bits() const noexcept
    {
    return StBits * m_bits.size();
    }

    template<typename Key, typename Hasher, bool Atomic>
    inline size_t bloom<Key, Hasher, Atomic>::n_keys() const noexcept
    {
    return m_nKeys;
    }

    template<typename Key, typename Hasher, bool Atomic>
    float bloom<Key, Hasher, Atomic>::fpr() const noexcept
    {
    return fpr( n_bits(), m_nKeys, m_hashers.size() );
    }

    template<typename Key, typename Hasher, bool Atomic>
    float bloom<Key, Hasher, Atomic>::usage() const noexcept
    {
    using namespace std;
    size_t nSet = 0;
    for( const size_t &word : m_bits )
    if constexpr( Atomic )
    nSet += popcount( atomic_ref( word ).load( memory_order_relaxed ) );
    else
    nSet += popcount( word );
    return (float)((double)nSet / (double)n_bits());
    }

    template<typename Key, typename Hasher, bool Atomic>
    size_t bloom<Key, Hasher, Atomic>::n_bits( size_t nKeys, size_t nHashes,
    float fpr ) noexcept
    {
    using namespace std;
    constexpr float MinFpr = (float)1.0_permill;
    constexpr double MaxBits = numeric_limits<ptrdiff_t>::min();
    fpr = !isnan( fpr ) && !isinf( fpr ) ? fpr : DefaultFpr;
    fpr = max( min( fpr, 1.0f ), MinFpr );
    nHashes += (size_t)!nHashes;
    double
    dKeys = (double)nKeys,
    dHashes = (double)nHashes,
    dBits;
    dBits = 1.0 - pow( fpr, 1.0 / dHashes );
    dBits = 1.0 - pow( dBits, 1.0 / (dHashes * dKeys) );
    dBits = 1.0 / dBits;
    assert(dBits >= 0.0);
    size_t nBits = (size_t)min( dBits, MaxBits );
    return bit_ceil( max( nBits, StBits ) );
    }

    template<typename Key, typename Hasher, bool Atomic>
    float bloom<Key, Hasher, Atomic>::fpr( size_t nBits, size_t nKeys,
    size_t nHashes ) noexcept
    {
    using namespace std;
    constexpr size_t MaxBits = numeric_limits<ptrdiff_t>::min();
    nBits = nBits < MaxBits ? bit_ceil( nBits ) : MaxBits;
    double
    dBits = (double)nBits,
    dKeys = (double)nKeys,
    dHashes = (double)nHashes,
    fpr;
    fpr = 1.0 - 1.0 / dBits;
    fpr = 1.0 - pow( fpr, dHashes * dKeys );
    return (float)pow( fpr, dHashes );
    }

    #if defined(__clang__)
    #pragma clang diagnostic pop
    #endif

    --- PyGate Linux v1.5.14
    * Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)
  • From Bonita Montero@3:633/10 to All on Thu May 21 09:46:30 2026
    Am 20.05.2026 um 21:30 schrieb Bonita Montero:

    template<typename Key, typename Hasher, bool Atomic>
    size_t bloom<Key, Hasher, Atomic>::n_bits( size_t nKeys, size_t nHashes, float fpr ) noexcept
    {
    ˙˙˙˙using namespace std;
    ˙˙˙˙constexpr float MinFpr = (float)1.0_permill;
    ˙˙˙˙constexpr double MaxBits = numeric_limits<ptrdiff_t>::min();

    constexpr double MaxBits = (double)(size_t)numeric_limits<ptrdiff_t>::min();

    --- PyGate Linux v1.5.14
    * Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)
  • From boltar@3:633/10 to All on Thu May 21 08:45:37 2026
    On Wed, 20 May 2026 21:30:22 +0200
    Bonita Montero <Bonita.Montero@gmail.com> gabbled:
    The cool thing about this class is that in contrast to the C++ contai-
    ners which need a hasher it needs multiple hashers as any Bloom filter.

    Had to google wtf a bloom filter is. Seems like a remarkably complicated
    way to partially replace a hash lookup.


    --- PyGate Linux v1.5.14
    * Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)
  • From Bonita Montero@3:633/10 to All on Fri May 22 08:34:01 2026
    Am 21.05.2026 um 10:45 schrieb boltar@caprica.universe:

    Had to google wtf a bloom filter is. Seems like a remarkably complicated
    way to partially replace a hash lookup.

    But bloom filter occupy much less memory. And checking for a true
    negative, which is the case that mostly applies when using Bloom
    filters, is much more efficient since if the false positive rate
    is chosen properly you check only a single bit for that in most
    cases.
    And I think a hash set is much more complicated.


    --- PyGate Linux v1.5.14
    * Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)
  • From boltar@3:633/10 to All on Fri May 22 09:15:06 2026
    On Fri, 22 May 2026 08:34:01 +0200
    Bonita Montero <Bonita.Montero@gmail.com> gabbled:
    Am 21.05.2026 um 10:45 schrieb boltar@caprica.universe:

    Had to google wtf a bloom filter is. Seems like a remarkably complicated
    way to partially replace a hash lookup.

    But bloom filter occupy much less memory. And checking for a true
    negative, which is the case that mostly applies when using Bloom
    filters, is much more efficient since if the false positive rate
    is chosen properly you check only a single bit for that in most
    cases.
    And I think a hash set is much more complicated.

    Really? I think most competant programmers could write a hash table in a few hours, albeit perhaps a not very efficient one. I doubt many - and I include myself - could write this filter without constantly refering to the spec.
    Also I suspect the use cases of something that doesn't give a definitive
    yes or no is very limited.


    --- PyGate Linux v1.5.14
    * Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)
  • From Bonita Montero@3:633/10 to All on Fri May 22 11:34:59 2026
    Am 22.05.2026 um 11:15 schrieb boltar@caprica.universe:

    Really? I think most competant programmers could write a hash table in a
    few hours, ...

    Yes, because it's well known. But if you learned the basic principles
    of Bloom filters from Wikipedia implementing that is simpler than a
    hash set. You don't need any buckes, no compare function but just a
    bunch of hashers (usually the same with different seeds).
    That's nearly a magnitude simpler than a hash set.

    I doubt many - and I include myself - could write this filter
    without constantly refering to the spec.

    That's not much different than with a Bloom filter.

    Also I suspect the use cases of something that doesn't give a
    definitive yes or no is very limited.

    Of course, but where it applies it's much more efficient and a lot
    more memory-saving.


    --- PyGate Linux v1.5.14
    * Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)
  • From red floyd@3:633/10 to All on Fri May 22 16:11:50 2026
    On 5/21/2026 1:45 AM, boltar@caprica.universe wrote:
    On Wed, 20 May 2026 21:30:22 +0200
    Bonita Montero <Bonita.Montero@gmail.com> gabbled:
    The cool thing about this class is that in contrast to the C++ contai-
    ners which need a hasher it needs multiple hashers as any Bloom filter.

    Had to google wtf a bloom filter is. Seems like a remarkably complicated
    way to partially replace a hash lookup.

    According to Wikipedia, it's for data sets that are too big for a hash
    lookup.

    --- PyGate Linux v1.5.14
    * Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)
  • From boltar@3:633/10 to All on Sat May 23 08:51:32 2026
    On Fri, 22 May 2026 16:11:50 -0700
    red floyd <no.spam.here@its.invalid> gabbled:
    On 5/21/2026 1:45 AM, boltar@caprica.universe wrote:
    On Wed, 20 May 2026 21:30:22 +0200
    Bonita Montero <Bonita.Montero@gmail.com> gabbled:
    The cool thing about this class is that in contrast to the C++ contai-
    ners which need a hasher it needs multiple hashers as any Bloom filter.

    Had to google wtf a bloom filter is. Seems like a remarkably complicated
    way to partially replace a hash lookup.

    According to Wikipedia, it's for data sets that are too big for a hash >lookup.

    That makes little sense. If you don't have enough memory for the hash table then you're certainly not going to have enough for the data itself unless
    each data is shorter than the hash value.


    --- PyGate Linux v1.5.14
    * Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)
  • From Chris M. Thomasson@3:633/10 to All on Sun May 24 03:15:10 2026
    On 5/23/2026 1:51 AM, boltar@caprica.universe wrote:
    On Fri, 22 May 2026 16:11:50 -0700
    red floyd <no.spam.here@its.invalid> gabbled:
    On 5/21/2026 1:45 AM, boltar@caprica.universe wrote:
    On Wed, 20 May 2026 21:30:22 +0200
    Bonita Montero <Bonita.Montero@gmail.com> gabbled:
    The cool thing about this class is that in contrast to the C++ contai- >>>> ners which need a hasher it needs multiple hashers as any Bloom filter. >>>
    Had to google wtf a bloom filter is. Seems like a remarkably complicated >>> way to partially replace a hash lookup.

    According to Wikipedia, it's for data sets that are too big for a hash
    lookup.

    That makes little sense. If you don't have enough memory for the hash table then you're certainly not going to have enough for the data itself unless each data is shorter than the hash value.


    Think of a finite hash table that handles collisions by linking into a
    stack, queue, or something... Or try a rehash with a state in the node...

    --- PyGate Linux v1.5.14
    * Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)
  • From Bonita Montero@3:633/10 to All on Sun May 24 18:35:01 2026
    Am 23.05.2026 um 10:51 schrieb boltar@caprica.universe:

    That makes little sense. If you don't have enough memory for the hash
    table
    then you're certainly not going to have enough for the data itself unless each data is shorter than the hash value.

    Bloom filters are used to accelerate lookups. They are employed in scenarios where there is a very high volume of true negatives, and a comparatively
    lower volume of true or false positives. In the ast majority of cases,
    the check of the very first bit will fail?provided, of course, that the false-positive rate has been set sufficiently low ? which is naturally
    much faster than performing a full lookup in a hash table.
    In the event of a true or false positive result, however, one must still perform the lookup in the hash table; consequently, this approach is
    primarily beneficial in situations characterized by a predominance of
    true negatives.
    Virtually all database engines utilize Bloom filters in appropriate
    contexts to minimize costly I/O operations.

    --- PyGate Linux v1.5.14
    * Origin: Dragon's Lair, PyGate NNTP<>Fido Gate (3:633/10)