• Futex Stack Test...

    From Chris M. Thomasson@3:633/280.2 to All on Tue Feb 18 10:17:46 2025
    This is a little C++20 test using a futex to allow one to wait on a
    lock-free stack. The main stack logic is in struct ct_stack. Well, can
    you get to compile and run? Thanks...

    I still need to check my algorithm out in Relacy. I think it might be
    able to model futex.


    My code:
    ______________________________________
    #include <iostream>
    #include <thread>
    #include <atomic>
    #include <algorithm>
    #include <cassert>


    // Number of worker threads to launch.
    #define CT_THREAD_N (42)
    // Number of work items pushed through the pipeline.
    #define CT_WORK_N (10000000)
    // Sentinel head value meaning "a consumer is parked on the futex".
    // reinterpret_cast of an unsigned long long literal avoids MSVC
    // warning C4312 ('unsigned int' cast to a larger pointer type).
    // NOTE(review): a real allocation could in principle land on this
    // address; a dedicated static sentinel node would be safer.
    #define CT_WAIT (reinterpret_cast<ct_node*>(0xDEADBEEFull))


    // Intrusive singly-linked list node; base type for everything that
    // goes onto a ct_stack.
    struct ct_node
    {
        ct_node* m_next = nullptr; // next node in the list, nullptr at the end

        ct_node()
        {
            // construction trace (disabled):
            //std::cout << "(" << this << ")::ct_node::ct_node()\n";
        }

        ~ct_node()
        {
            // destruction trace (disabled):
            //std::cout << "(" << this << ")::ct_node::~ct_node()\n";
        }
    };


    // Lock-free LIFO stack whose head pointer doubles as a C++20 futex
    // word. The sentinel CT_WAIT stored in m_head means a consumer is
    // blocked in flush_wait(); a producer that displaces CT_WAIT must
    // notify the futex.
    struct ct_stack
    {
        std::atomic<ct_node*> m_head = { nullptr };

        // Push one node (lock-free). If the stack was in the waiting
        // state, wake one parked consumer.
        void
        push(
            ct_node* node
        ) {
            ct_node* head = m_head.load(std::memory_order_relaxed);

            do
            {
                // Never link a node to the CT_WAIT sentinel: when we
                // replace it, this node becomes the entire stack.
                node->m_next = (head == CT_WAIT) ? nullptr : head;

            } while (! m_head.compare_exchange_weak(
                head,
                node,
                std::memory_order_release,
                std::memory_order_relaxed)  // explicit failure order
            );

            if (head == CT_WAIT)
            {
                // slow path: a consumer is (or was) parked on the futex.
                m_head.notify_one();
            }
        }

        // Atomically take the whole stack, blocking until at least one
        // node is present. Returns the head of a LIFO-ordered singly
        // linked list; never returns nullptr or CT_WAIT.
        //
        // Simplified from the original double-loop form: the outer
        // for(;;)/break was redundant, as the inner while already
        // re-tests the exit condition after every wake-up.
        ct_node*
        flush_wait()
        {
            ct_node* head = m_head.exchange(nullptr, std::memory_order_acquire);

            while (! head || head == CT_WAIT)
            {
                // slow path: advertise that we are waiting, then park on
                // the futex until a producer swaps CT_WAIT out.
                head = m_head.exchange(CT_WAIT, std::memory_order_acquire);

                if (! head || head == CT_WAIT)
                {
                    m_head.wait(CT_WAIT, std::memory_order_relaxed);
                }
            }

            assert(head && head != CT_WAIT);

            return head;
        }
    };


    // One unit of work: a stack node carrying a state word.
    // m_state == 666 marks a shutdown token (see ct_thread).
    struct ct_work : public ct_node
    {
        unsigned long m_state = 0;
    };


    struct ct_shared
    {
    ct_stack m_stack_in;
    ct_stack m_stack_out;

    ct_shared()
    {
    std::cout << "ct_shared::ct_shared()\n" << std::endl;
    }

    ~ct_shared()
    {
    assert(! m_stack_in.m_head || m_stack_in.m_head == CT_WAIT);
    assert(! m_stack_out.m_head || m_stack_out.m_head == CT_WAIT);

    std::cout << "ct_shared::~ct_shared()\n" << std::endl;
    }
    };


    // Worker thread body. Repeatedly flushes a batch from the input
    // stack and moves ordinary items to the output stack. An item with
    // m_state == 666 is a shutdown token: the worker records it and
    // pushes the token back onto the input stack so the remaining
    // tokens stay visible (main() reclaims them after joining). The
    // current batch is always fully processed before the loop exits.
    //
    // Fix per review: downcasts from ct_node* to ct_work* now use
    // static_cast instead of C-style casts.
    void
    ct_thread(
        ct_shared& shared
    ) {
        unsigned long state = 0;

        while (! state)
        {
            ct_work* head = static_cast<ct_work*>(shared.m_stack_in.flush_wait());

            while (head)
            {
                // Capture the link first: push() below re-links the node.
                ct_work* next = static_cast<ct_work*>(head->m_next);

                if (head->m_state == 666)
                {
                    // Shutdown detected...
                    state = 1;
                    shared.m_stack_in.push(head);
                }

                else
                {
                    shared.m_stack_out.push(head);
                }

                head = next;
            }
        }

        //std::cout << "shutdown..." << std::endl;
    }


    int
    main()
    {
    {
    std::cout << "Futex Stack Test\n";
    std::cout << "by: Chris M. Thomasson\n";
    std::cout << "version: (0.0.1)\n";
    std::cout << "_________________________________\n" << std::endl;
    }

    unsigned long g_ct_work_alloations = 0;
    unsigned long g_ct_work_dealloations = 0;

    {
    std::cout << "CT_THREAD_N = " << CT_THREAD_N << std::endl;
    std::cout << "CT_WORK_N = " << CT_WORK_N << std::endl;
    std::cout << "CT_WAIT = " << CT_WAIT << std::endl;
    }

    {
    ct_shared shared = { };

    std::thread threads[CT_THREAD_N];

    std::cout << "\nLaunching threads...\n" << std::endl;

    for (unsigned long i = 0; i < CT_THREAD_N; ++i)
    {
    threads[i] = std::thread(ct_thread, std::ref(shared));
    }

    //std::this_thread::sleep_for(std::chrono::seconds(2));

    std::cout << "\nGenerate work...\n" << std::endl;

    // Generate work...
    {
    for (unsigned long i = 0; i < CT_WORK_N; ++i)
    {
    shared.m_stack_in.push(new ct_work);
    ++g_ct_work_alloations;
    }
    }

    // Wait for work...
    {
    unsigned long wn = 0;

    while (wn < CT_WORK_N)
    {
    ct_work* head = (ct_work*)shared.m_stack_out.flush_wait();

    while (head)
    {
    ct_work* next = (ct_work*)head->m_next;
    delete head;
    ++g_ct_work_dealloations;
    ++wn;
    head = next;
    }
    }
    }

    std::cout << "\nCompleted all work!\n" << std::endl;
    std::cout << "Sending shutdown state...\n" << std::endl;

    // Send shutdown state...
    {
    for (unsigned long i = 0; i < CT_THREAD_N; ++i)
    {
    ct_work* w = new ct_work;
    ++g_ct_work_alloations;
    w->m_state = 666;
    shared.m_stack_in.push(w);
    }
    }

    // Join threads...
    {
    for (unsigned long i = 0; i < CT_THREAD_N; ++i)
    {
    threads[i].join();
    }
    }

    std::cout << "\nThreads joined...\n" << std::endl;

    // Dump shutdown state...
    std::cout << "Dump shutdown state...\n" << std::endl;

    {
    ct_work* head = (ct_work*)shared.m_stack_in.m_head.load(std::memory_order_relaxed);
    shared.m_stack_in.m_head = nullptr;

    while (head)
    {
    ct_work* next = (ct_work*)head->m_next;
    delete head;
    ++g_ct_work_dealloations;
    head = next;
    }
    }

    std::cout << "\nShutdown complete!\n" << std::endl;
    }

    // Sanity check...
    {
    std::cout << "g_ct_work_alloations = " << g_ct_work_alloations
    << "\n";
    std::cout << "g_ct_work_dealloations = " <<
    g_ct_work_dealloations << std::endl;

    if (g_ct_work_alloations != g_ct_work_dealloations)
    {
    std::cout << "Pardon my French, but shit!!!!!!!!!!!!!! NOT GOOD\n" << std::endl;
    std::cout << "DAMN IT!!!! Grrrrr\n" << std::endl;
    }
    }

    std::cout << "\nFin!\n" << std::endl;

    return 0;
    }
    ______________________________________




    My output:
    _______________________
    Futex Stack Test
    by: Chris M. Thomasson
    version: (0.0.1)
    _________________________________

    CT_THREAD_N = 42
    CT_WORK_N = 10000000
    CT_WAIT = 00000000DEADBEEF
    ct_shared::ct_shared()


    Launching threads...


    Generate work...


    Completed all work!

    Sending shutdown state...


    Threads joined...

    Dump shutdown state...


    Shutdown complete!

    ct_shared::~ct_shared()

    g_ct_work_alloations = 10000042
    g_ct_work_dealloations = 10000042

    Fin!
    _______________________


    Well, any luck?

    --- MBSE BBS v1.0.8.4 (Linux-x86_64)
    * Origin: A noiseless patient Spider (3:633/280.2@fidonet)
  • From Paavo Helde@3:633/280.2 to All on Tue Feb 18 18:01:55 2025
    On 18.02.2025 01:17, Chris M. Thomasson wrote:
    This is a little C++20 test using a futex to allow one to wait on a lock-free stack. The main stack logic is in struct ct_stack. Well, can
    you get to compile and run? Thanks...

    Seemed to work fine, but compilation produced some warnings about the
    CT_WAIT macro.


    Build started at 08:55...
    ------ Build started: Project: ConsoleTest2022, Configuration: Release
    x64 ------
    futex-stack-test.cpp 1>C:\Test\ConsoleTestVS2022\ConsoleTest2022\futex-stack-test.cpp(55,25): warning C4312: 'type cast': conversion from 'unsigned int' to 'ct_node
    *' of greater size 1>C:\Test\ConsoleTestVS2022\ConsoleTest2022\futex-stack-test.cpp(71,21): warning C4312: 'type cast': conversion from 'unsigned int' to 'ct_node
    *' of greater size 1>C:\Test\ConsoleTestVS2022\ConsoleTest2022\futex-stack-test.cpp(87,38): warning C4312: 'type cast': conversion from 'unsigned int' to 'ct_node
    *' of greater size 1>C:\Test\ConsoleTestVS2022\ConsoleTest2022\futex-stack-test.cpp(90,40): warning C4312: 'type cast': conversion from 'unsigned int' to 'ct_node
    *' of greater size 1>C:\Test\ConsoleTestVS2022\ConsoleTest2022\futex-stack-test.cpp(92,39): warning C4312: 'type cast': conversion from 'unsigned int' to 'ct_node
    *' of greater size 1>C:\Test\ConsoleTestVS2022\ConsoleTest2022\futex-stack-test.cpp(94,33): warning C4312: 'type cast': conversion from 'unsigned int' to 'ct_node
    *' of greater size 1>C:\Test\ConsoleTestVS2022\ConsoleTest2022\futex-stack-test.cpp(185,38): warning C4312: 'type cast': conversion from 'unsigned int' to 'ct_node
    *' of greater size
    Generating code
    77 of 78 functions (98.7%) were compiled, the rest were copied from
    previous compilation.
    66 functions were new in current compilation
    0 functions had inline decision re-evaluated but remain unchanged
    Finished generating code
    ConsoleTest2022.vcxproj -> C:\Test\ConsoleTestVS2022\ConsoleTest2022\x64\Release\ConsoleTest2022.exe 1>Done building project "ConsoleTest2022.vcxproj".
    ========== Build: 1 succeeded, 0 failed, 0 up-to-date, 0 skipped ==========


    Futex Stack Test
    by: Chris M. Thomasson
    version: (0.0.1)
    _________________________________

    CT_THREAD_N = 42
    CT_WORK_N = 10000000
    CT_WAIT = 00000000DEADBEEF
    ct_shared::ct_shared()


    Launching threads...


    Generate work...


    Completed all work!

    Sending shutdown state...


    Threads joined...

    Dump shutdown state...


    Shutdown complete!

    ct_shared::~ct_shared()

    g_ct_work_alloations = 10000042
    g_ct_work_dealloations = 10000042

    Fin!



    I still need to check my algorithm out in Relacy. I think it might be
    able to model futex.


    My code:
    ______________________________________
    #include <iostream>
    #include <thread>
    #include <atomic>
    #include <algorithm>
    #include <cassert>


    #define CT_THREAD_N (42)
    #define CT_WORK_N (10000000)
    #define CT_WAIT ((ct_node*)0xDEADBEEF)


    struct ct_node
    {
    ˙˙˙ ct_node* m_next = { nullptr };

    ˙˙˙ ct_node()
    ˙˙˙ {
    ˙˙˙˙˙˙˙ //std::cout << "(" << this << ")::ct_node::ct_node()\n";
    ˙˙˙ }

    ˙˙˙ ~ct_node()
    ˙˙˙ {
    ˙˙˙˙˙˙˙ //std::cout << "(" << this << ")::ct_node::~ct_node()\n";
    ˙˙˙ }
    };


    struct ct_stack
    {
    ˙˙˙ std::atomic<ct_node*> m_head = { nullptr };

    ˙˙˙ void
    ˙˙˙ push(
    ˙˙˙˙˙˙˙ ct_node* node
    ˙˙˙ ) {
    ˙˙˙˙˙˙˙ ct_node* head = m_head.load(std::memory_order_relaxed);

    ˙˙˙˙˙˙˙ do
    ˙˙˙˙˙˙˙ {
    ˙˙˙˙˙˙˙˙˙˙˙ if (head == CT_WAIT)
    ˙˙˙˙˙˙˙˙˙˙˙ {
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ node->m_next = nullptr;
    ˙˙˙˙˙˙˙˙˙˙˙ }

    ˙˙˙˙˙˙˙˙˙˙˙ else
    ˙˙˙˙˙˙˙˙˙˙˙ {
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ node->m_next = head;
    ˙˙˙˙˙˙˙˙˙˙˙ }

    ˙˙˙˙˙˙˙ } while (! m_head.compare_exchange_weak(
    ˙˙˙˙˙˙˙˙˙˙˙ head,
    ˙˙˙˙˙˙˙˙˙˙˙ node,
    ˙˙˙˙˙˙˙˙˙˙˙ std::memory_order_release)
    ˙˙˙˙˙˙˙˙˙ );

    ˙˙˙˙˙˙˙ if (head == CT_WAIT)
    ˙˙˙˙˙˙˙ {
    ˙˙˙˙˙˙˙˙˙˙˙ // slow path...
    ˙˙˙˙˙˙˙˙˙˙˙ m_head.notify_one();
    ˙˙˙˙˙˙˙ }
    ˙˙˙ }

    ˙˙˙ ct_node*
    ˙˙˙ flush_wait()
    ˙˙˙ {
    ˙˙˙˙˙˙˙ ct_node* head = nullptr;

    ˙˙˙˙˙˙˙ for (;;)
    ˙˙˙˙˙˙˙ {
    ˙˙˙˙˙˙˙˙˙˙˙ head = m_head.exchange(nullptr, std::memory_order_acquire);

    ˙˙˙˙˙˙˙˙˙˙˙ while (! head || head == CT_WAIT)
    ˙˙˙˙˙˙˙˙˙˙˙ {
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ // slow path...
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ head = m_head.exchange(CT_WAIT,
    std::memory_order_acquire);

    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ if (! head || head == CT_WAIT)
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ {
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ m_head.wait(CT_WAIT, std::memory_order_relaxed);
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ continue;
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ }
    ˙˙˙˙˙˙˙˙˙˙˙ }

    ˙˙˙˙˙˙˙˙˙˙˙ break;
    ˙˙˙˙˙˙˙ }

    ˙˙˙˙˙˙˙ assert(head && head != CT_WAIT);

    ˙˙˙˙˙˙˙ return head;
    ˙˙˙ }
    };


    struct ct_work : public ct_node
    {
    ˙˙˙ unsigned long m_state = { 0 };
    };


    struct ct_shared
    {
    ˙˙˙ ct_stack m_stack_in;
    ˙˙˙ ct_stack m_stack_out;

    ˙˙˙ ct_shared()
    ˙˙˙ {
    ˙˙˙˙˙˙˙ std::cout << "ct_shared::ct_shared()\n" << std::endl;
    ˙˙˙ }

    ˙˙˙ ~ct_shared()
    ˙˙˙ {
    ˙˙˙˙˙˙˙ assert(! m_stack_in.m_head || m_stack_in.m_head == CT_WAIT);
    ˙˙˙˙˙˙˙ assert(! m_stack_out.m_head || m_stack_out.m_head == CT_WAIT);

    ˙˙˙˙˙˙˙ std::cout << "ct_shared::~ct_shared()\n" << std::endl;
    ˙˙˙ }
    };


    void
    ct_thread(
    ˙˙˙ ct_shared& shared
    ) {
    ˙˙˙ unsigned long state = 0;

    ˙˙˙ while (! state)
    ˙˙˙ {
    ˙˙˙˙˙˙˙ ct_work* head = (ct_work*)shared.m_stack_in.flush_wait();

    ˙˙˙˙˙˙˙ while (head)
    ˙˙˙˙˙˙˙ {
    ˙˙˙˙˙˙˙˙˙˙˙ ct_work* next = (ct_work*)head->m_next;

    ˙˙˙˙˙˙˙˙˙˙˙ if (head->m_state == 666)
    ˙˙˙˙˙˙˙˙˙˙˙ {
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ // Shutdown detected...
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ state = 1;
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ shared.m_stack_in.push(head);
    ˙˙˙˙˙˙˙˙˙˙˙ }

    ˙˙˙˙˙˙˙˙˙˙˙ else
    ˙˙˙˙˙˙˙˙˙˙˙ {
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ shared.m_stack_out.push(head);
    ˙˙˙˙˙˙˙˙˙˙˙ }

    ˙˙˙˙˙˙˙˙˙˙˙ head = next;
    ˙˙˙˙˙˙˙ }
    ˙˙˙ }

    ˙˙˙ //std::cout << "shutdown..." << std::endl;
    }


    int
    main()
    {
    ˙˙˙ {
    ˙˙˙˙˙˙˙ std::cout << "Futex Stack Test\n";
    ˙˙˙˙˙˙˙ std::cout << "by: Chris M. Thomasson\n";
    ˙˙˙˙˙˙˙ std::cout << "version: (0.0.1)\n";
    ˙˙˙˙˙˙˙ std::cout << "_________________________________\n" << std::endl;
    ˙˙˙ }

    ˙˙˙ unsigned long g_ct_work_alloations = 0;
    ˙˙˙ unsigned long g_ct_work_dealloations = 0;

    ˙˙˙ {
    ˙˙˙˙˙˙˙ std::cout << "CT_THREAD_N = " << CT_THREAD_N << std::endl;
    ˙˙˙˙˙˙˙ std::cout << "CT_WORK_N = " << CT_WORK_N << std::endl;
    ˙˙˙˙˙˙˙ std::cout << "CT_WAIT = " << CT_WAIT << std::endl;
    ˙˙˙ }

    ˙˙˙ {
    ˙˙˙˙˙˙˙ ct_shared shared = { };

    ˙˙˙˙˙˙˙ std::thread threads[CT_THREAD_N];

    ˙˙˙˙˙˙˙ std::cout << "\nLaunching threads...\n" << std::endl;

    ˙˙˙˙˙˙˙ for (unsigned long i = 0; i < CT_THREAD_N; ++i)
    ˙˙˙˙˙˙˙ {
    ˙˙˙˙˙˙˙˙˙˙˙ threads[i] = std::thread(ct_thread, std::ref(shared));
    ˙˙˙˙˙˙˙ }

    ˙˙˙˙˙˙˙ //std::this_thread::sleep_for(std::chrono::seconds(2));

    ˙˙˙˙˙˙˙ std::cout << "\nGenerate work...\n" << std::endl;

    ˙˙˙˙˙˙˙ // Generate work...
    ˙˙˙˙˙˙˙ {
    ˙˙˙˙˙˙˙˙˙˙˙ for (unsigned long i = 0; i < CT_WORK_N; ++i)
    ˙˙˙˙˙˙˙˙˙˙˙ {
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ shared.m_stack_in.push(new ct_work);
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ ++g_ct_work_alloations;
    ˙˙˙˙˙˙˙˙˙˙˙ }
    ˙˙˙˙˙˙˙ }

    ˙˙˙˙˙˙˙ // Wait for work...
    ˙˙˙˙˙˙˙ {
    ˙˙˙˙˙˙˙˙˙˙˙ unsigned long wn = 0;

    ˙˙˙˙˙˙˙˙˙˙˙ while (wn < CT_WORK_N)
    ˙˙˙˙˙˙˙˙˙˙˙ {
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ ct_work* head = (ct_work*)shared.m_stack_out.flush_wait();

    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ while (head)
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ {
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ ct_work* next = (ct_work*)head->m_next;
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ delete head;
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ ++g_ct_work_dealloations;
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ ++wn;
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ head = next;
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ }
    ˙˙˙˙˙˙˙˙˙˙˙ }
    ˙˙˙˙˙˙˙ }

    ˙˙˙˙˙˙˙ std::cout << "\nCompleted all work!\n" << std::endl;
    ˙˙˙˙˙˙˙ std::cout << "Sending shutdown state...\n" << std::endl;

    ˙˙˙˙˙˙˙ // Send shutdown state...
    ˙˙˙˙˙˙˙ {
    ˙˙˙˙˙˙˙˙˙˙˙ for (unsigned long i = 0; i < CT_THREAD_N; ++i)
    ˙˙˙˙˙˙˙˙˙˙˙ {
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ ct_work* w = new ct_work;
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ ++g_ct_work_alloations;
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ w->m_state = 666;
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ shared.m_stack_in.push(w);
    ˙˙˙˙˙˙˙˙˙˙˙ }
    ˙˙˙˙˙˙˙ }

    ˙˙˙˙˙˙˙ // Join threads...
    ˙˙˙˙˙˙˙ {
    ˙˙˙˙˙˙˙˙˙˙˙ for (unsigned long i = 0; i < CT_THREAD_N; ++i)
    ˙˙˙˙˙˙˙˙˙˙˙ {
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ threads[i].join();
    ˙˙˙˙˙˙˙˙˙˙˙ }
    ˙˙˙˙˙˙˙ }

    ˙˙˙˙˙˙˙ std::cout << "\nThreads joined...\n" << std::endl;

    ˙˙˙˙˙˙˙ // Dump shutdown state...
    ˙˙˙˙˙˙˙ std::cout << "Dump shutdown state...\n" << std::endl;

    ˙˙˙˙˙˙˙ {
    ˙˙˙˙˙˙˙˙˙˙˙ ct_work* head = (ct_work*)shared.m_stack_in.m_head.load(std::memory_order_relaxed);
    ˙˙˙˙˙˙˙˙˙˙˙ shared.m_stack_in.m_head = nullptr;

    ˙˙˙˙˙˙˙˙˙˙˙ while (head)
    ˙˙˙˙˙˙˙˙˙˙˙ {
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ ct_work* next = (ct_work*)head->m_next;
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ delete head;
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ ++g_ct_work_dealloations;
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ head = next;
    ˙˙˙˙˙˙˙˙˙˙˙ }
    ˙˙˙˙˙˙˙ }

    ˙˙˙˙˙˙˙ std::cout << "\nShutdown complete!\n" << std::endl;
    ˙˙˙ }

    ˙˙˙ // Sanity check...
    ˙˙˙ {
    ˙˙˙˙˙˙˙ std::cout << "g_ct_work_alloations = " << g_ct_work_alloations
    << "\n";
    ˙˙˙˙˙˙˙ std::cout << "g_ct_work_dealloations = " <<
    g_ct_work_dealloations << std::endl;

    ˙˙˙˙˙˙˙ if (g_ct_work_alloations != g_ct_work_dealloations)
    ˙˙˙˙˙˙˙ {
    ˙˙˙˙˙˙˙˙˙˙˙ std::cout << "Pardon my French, but shit!!!!!!!!!!!!!! NOT GOOD\n" << std::endl;
    ˙˙˙˙˙˙˙˙˙˙˙ std::cout << "DAMN IT!!!! Grrrrr\n" << std::endl;
    ˙˙˙˙˙˙˙ }
    ˙˙˙ }

    ˙˙˙ std::cout << "\nFin!\n" << std::endl;

    ˙˙˙ return 0;
    }
    ______________________________________




    My output:
    _______________________
    Futex Stack Test
    by: Chris M. Thomasson
    version: (0.0.1)
    _________________________________

    CT_THREAD_N = 42
    CT_WORK_N = 10000000
    CT_WAIT = 00000000DEADBEEF
    ct_shared::ct_shared()


    Launching threads...


    Generate work...


    Completed all work!

    Sending shutdown state...


    Threads joined...

    Dump shutdown state...


    Shutdown complete!

    ct_shared::~ct_shared()

    g_ct_work_alloations = 10000042
    g_ct_work_dealloations = 10000042

    Fin!
    _______________________


    Well, any luck?


    --- MBSE BBS v1.0.8.4 (Linux-x86_64)
    * Origin: A noiseless patient Spider (3:633/280.2@fidonet)
  • From James Kuyper@3:633/280.2 to All on Wed Feb 19 06:13:53 2025
    On 2/18/25 02:01, Paavo Helde wrote:
    On 18.02.2025 01:17, Chris M. Thomasson wrote:
    ....
    Seemed to work fine, but compilation produced some warnings about the CT_WAIT macro.


    Build started at 08:55...
    ------ Build started: Project: ConsoleTest2022, Configuration: Release
    x64 ------
    futex-stack-test.cpp 1>C:\Test\ConsoleTestVS2022\ConsoleTest2022\futex-stack-test.cpp(55,25): warning C4312: 'type cast': conversion from 'unsigned int' to 'ct_node
    *' of greater size 1>C:\Test\ConsoleTestVS2022\ConsoleTest2022\futex-stack-test.cpp(71,21): warning C4312: 'type cast': conversion from 'unsigned int' to 'ct_node
    *' of greater size 1>C:\Test\ConsoleTestVS2022\ConsoleTest2022\futex-stack-test.cpp(87,38): warning C4312: 'type cast': conversion from 'unsigned int' to 'ct_node
    *' of greater size 1>C:\Test\ConsoleTestVS2022\ConsoleTest2022\futex-stack-test.cpp(90,40): warning C4312: 'type cast': conversion from 'unsigned int' to 'ct_node
    *' of greater size 1>C:\Test\ConsoleTestVS2022\ConsoleTest2022\futex-stack-test.cpp(92,39): warning C4312: 'type cast': conversion from 'unsigned int' to 'ct_node
    *' of greater size 1>C:\Test\ConsoleTestVS2022\ConsoleTest2022\futex-stack-test.cpp(94,33): warning C4312: 'type cast': conversion from 'unsigned int' to 'ct_node
    *' of greater size 1>C:\Test\ConsoleTestVS2022\ConsoleTest2022\futex-stack-test.cpp(185,38): warning C4312: 'type cast': conversion from 'unsigned int' to 'ct_node
    *' of greater size
    ....
    #define CT_WAIT ((ct_node*)0xDEADBEEF)

    That should be a reinterpret_cast<>. Correcting that gets rid of all of
    those messages.

    Also, the following expressions should all use static_cast<>:
    ˙˙˙˙˙˙˙ ct_work* head = (ct_work*)shared.m_stack_in.flush_wait();
    ....
    ˙˙˙˙˙˙˙˙˙˙˙ ct_work* next = (ct_work*)head->m_next;
    ....
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ ct_work* head = (ct_work*)shared.m_stack_out.flush_wait(); ....
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ ct_work* next = (ct_work*)head->m_next;
    ....
    ˙˙˙˙˙˙˙˙˙˙˙ ct_work* head =
    (ct_work*)shared.m_stack_in.m_head.load(std::memory_order_relaxed);
    ....
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ ct_work* next = (ct_work*)head->m_next


    --- MBSE BBS v1.0.8.4 (Linux-x86_64)
    * Origin: A noiseless patient Spider (3:633/280.2@fidonet)
  • From Chris M. Thomasson@3:633/280.2 to All on Thu Feb 20 06:57:47 2025
    On 2/17/2025 11:01 PM, Paavo Helde wrote:
    On 18.02.2025 01:17, Chris M. Thomasson wrote:
    This is a little C++20 test using a futex to allow one to wait on a
    lock-free stack. The main stack logic is in struct ct_stack. Well, can
    you get to compile and run? Thanks...

    Seamed to work fine, but compilation produced some warnings about the CT_WAIT macro.
    [...]

    Sorry about that. ;^o

    Too used to C style casts. Thanks for giving it a go, Paavo. :^)

    It's just an interesting way to use the std futex directly in the state
    of a lock-free stack. My code can be streamlined as well. For instance,
    the ct_stack::flush_wait function can be much better. Something like
    this, still not ideal, but a little better?

    just typed this into the newsreader, but it should work fine: _______________________
    ct_node*
    flush_wait()
    {
    ct_node* head = m_head.exchange(nullptr, std::memory_order_acquire);

    while (! head || head == CT_WAIT)
    {
    // slow path...
    head = m_head.exchange(CT_WAIT, std::memory_order_acquire);

    if (! head || head == CT_WAIT)
    {
    m_head.wait(CT_WAIT, std::memory_order_relaxed);
    }
    }

    assert(head && head != CT_WAIT);

    return head;
    }
    _______________________

    I think I could use a compare_exchange for the wait state in the
    slow-path, but trying to optimize a slow path is probably not even worth
    it all that much. Humm...

    --- MBSE BBS v1.0.8.4 (Linux-x86_64)
    * Origin: A noiseless patient Spider (3:633/280.2@fidonet)
  • From Chris M. Thomasson@3:633/280.2 to All on Thu Feb 20 07:06:38 2025
    On 2/18/2025 11:13 AM, James Kuyper wrote:
    On 2/18/25 02:01, Paavo Helde wrote:
    On 18.02.2025 01:17, Chris M. Thomasson wrote:
    ...
    Seamed to work fine, but compilation produced some warnings about the
    CT_WAIT macro.


    Build started at 08:55...
    ------ Build started: Project: ConsoleTest2022, Configuration: Release
    x64 ------
    futex-stack-test.cpp
    C:\Test\ConsoleTestVS2022\ConsoleTest2022\futex-stack-test.cpp(55,25):
    warning C4312: 'type cast': conversion from 'unsigned int' to 'ct_node
    [...]
    ...
    #define CT_WAIT ((ct_node*)0xDEADBEEF)



    That should be a reinterpret_cast<>. Correcting that gets rid of all of
    those messages.

    Also, the following expressions should all use static_cast<>:

    Ditto! Thanks James. I need to get off the C train when using C++!
    Grrr... Now, I have a little reservation about CT_WAIT. Let's say we
    allocate a work node that just happens to be equal to CT_WAIT! That
    would be bad. Humm... Should I allocate a special node, or put it on the
    stack or something, and denote it as CT_WAIT?

    ˙˙˙˙˙˙˙ ct_work* head = (ct_work*)shared.m_stack_in.flush_wait();
    ...
    ˙˙˙˙˙˙˙˙˙˙˙ ct_work* next = (ct_work*)head->m_next;
    ...
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ ct_work* head = (ct_work*)shared.m_stack_out.flush_wait();
    ...
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ ct_work* next = (ct_work*)head->m_next;
    ...
    ˙˙˙˙˙˙˙˙˙˙˙ ct_work* head =
    (ct_work*)shared.m_stack_in.m_head.load(std::memory_order_relaxed);
    ...
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ ct_work* next = (ct_work*)head->m_next



    --- MBSE BBS v1.0.8.4 (Linux-x86_64)
    * Origin: A noiseless patient Spider (3:633/280.2@fidonet)
  • From Chris M. Thomasson@3:633/280.2 to All on Fri Feb 28 19:41:15 2025
    On 2/17/2025 11:01 PM, Paavo Helde wrote:
    On 18.02.2025 01:17, Chris M. Thomasson wrote:
    This is a little C++20 test using a futex to allow one to wait on a
    lock-free stack. The main stack logic is in struct ct_stack. Well, can
    you get to compile and run? Thanks...

    Seamed to work fine, but compilation produced some warnings about the CT_WAIT macro.
    [...]
    Shutdown complete!

    ct_shared::~ct_shared()




    g_ct_work_alloations = 10000042
    g_ct_work_dealloations = 10000042
    ^^^^^^^^^^^^^^^^

    It would help if I spelled those damn words right! Allocation,
    Deallocation... Where are my c's? lol. Wow.

    Sorry about that non-sense.





    Fin!
    _______________________


    Well, any luck?



    --- MBSE BBS v1.0.8.4 (Linux-x86_64)
    * Origin: A noiseless patient Spider (3:633/280.2@fidonet)
  • From Wuns Haerst@3:633/280.2 to All on Fri May 2 21:01:49 2025
    It's not hard to implement a lock-free stack with DW-CAS,
    so no need for futexes. That's my dcas_atomic:

    #pragma once
    #include <cstdint>
    #include <utility>
    #include <atomic>
    #include <type_traits>
    #include <climits>
    #include <bit>
    #include <memory>

    #if defined(_MSC_VER)
    #pragma warning(push)
    #pragma warning(disable: 26495)
    #endif
    #if defined(__llvm__)
    #pragma clang diagnostic push
    #pragma clang diagnostic ignored "-Watomic-alignment"
    #endif

    // Memory orders re-expressed as plain int constants so an order can
    // be carried as a non-type template parameter (dcas_type<Type>).
    constexpr int
    DCAS_ACQ_REL = (int)std::memory_order::acq_rel,
    DCAS_ACQUIRE = (int)std::memory_order::acquire,
    DCAS_RELAXED = (int)std::memory_order::relaxed,
    DCAS_RELEASE = (int)std::memory_order::release,
    DCAS_SEQ_CST = (int)std::memory_order::seq_cst;
    // Tag types that select the memory order at compile time.
    template<int Type>
    using dcas_type = std::integral_constant<int, Type>;
    using dcas_acq_rel = dcas_type<DCAS_ACQ_REL>;
    using dcas_acquire = dcas_type<DCAS_ACQUIRE>;
    using dcas_relaxed = dcas_type<DCAS_RELAXED>;
    using dcas_release = dcas_type<DCAS_RELEASE>;
    using dcas_seq_cst = dcas_type<DCAS_SEQ_CST>;

    // Double-width atomic: a pair of size_t updated together via DW-CAS
    // (CMPXCHG16B / CMPXCHG8B on MSVC, __atomic_compare_exchange_n on a
    // double-word integer for GCC/Clang).
    struct dcas_atomic
    {
    static_assert(sizeof(size_t) == 8 || sizeof(size_t) == 4, "must be 64 or 32 bit architecture");
    // Plain (non-atomic) value pair used for expected/desired values.
    struct pair_t { size_t first, second; };
    dcas_atomic() = default;
    dcas_atomic( pair_t const &desired );
    // Word-wise atomic access to the individual halves.
    std::atomic<size_t> &first() noexcept;
    std::atomic<size_t> &second() noexcept;
    // NOTE: implemented as two separate relaxed loads — the pair itself
    // is not read atomically and may tear under concurrent stores.
    operator pair_t() noexcept;
    // CAS on the full pair; on failure, 'expected' receives the value
    // observed (same contract as std::atomic::compare_exchange).
    template<int Type>
    bool compare_exchange( pair_t &expected, pair_t const &desired, dcas_type<Type> = dcas_seq_cst() ) noexcept;
    template<int Type>
    pair_t load( dcas_type<Type> = dcas_seq_cst() ) const noexcept;
    template<int Type>
    void store( pair_t const &niu, dcas_type<Type> = dcas_seq_cst() ) noexcept;
    private:
    #if defined(__GNUC__) || defined(__clang__) && !defined(_MSC_VER)
    // NOTE(review): '&&' binds tighter than '||', so this parses as
    // __GNUC__ || (__clang__ && !_MSC_VER). Confirm the intent for
    // clang-cl, which defines both __clang__ and _MSC_VER.
    #if SIZE_WIDTH == 64
    using dword_t = unsigned __int128;
    #elif SIZE_WIDTH == 32
    using dword_t = unsigned long long;
    #else
    #error unknown architecture
    #endif
    #endif
    // Storage: two atomics overlaid with one double-width word so the
    // platform DW-CAS primitive can operate on both halves at once.
    union alignas(2 * sizeof(size_t)) U
    {
    U() {}
    static_assert(sizeof(std::atomic<size_t>) == sizeof(size_t), "sizeof(atomic<size_t>) must be == sizeof(size_t)");
    std::atomic<size_t> m_atomics[2];
    #if defined(_MSC_VER)
    #if defined(_M_X64) || defined(_M_ARM64) || defined(_M_ARM64EC)
    __int64 volatile m_firstAndSecond[2];
    #elif defined(_M_IX86)
    __int64 volatile m_firstAndSecond;
    #else
    #error unknown architecture
    #endif
    #elif defined(__GNUC__) || defined(__clang__)
    dword_t volatile m_firstAndSecond;
    #endif
    } u;
    };

    // Seed both halves. Relaxed stores suffice: construction cannot race
    // with other accesses to this object.
    inline dcas_atomic::dcas_atomic( pair_t const &desired )
    {
        first().store( desired.first, std::memory_order_relaxed );
        second().store( desired.second, std::memory_order_relaxed );
    }

    // Low half (index 0) as an independently addressable atomic word.
    inline std::atomic<size_t> &dcas_atomic::first() noexcept
    {
        return u.m_atomics[0];
    }

    // High half (index 1) as an independently addressable atomic word.
    inline std::atomic<size_t> &dcas_atomic::second() noexcept
    {
        return u.m_atomics[1];
    }

    // Convenience snapshot. Two separate relaxed loads: the pair is NOT
    // read atomically and may tear under concurrent stores.
    inline dcas_atomic::operator pair_t() noexcept
    {
        size_t lo = first().load( std::memory_order_relaxed );
        size_t hi = second().load( std::memory_order_relaxed );
        return pair_t( lo, hi );
    }

    // Double-width CAS. Returns true on success; on failure, writes the
    // observed value back into 'expected' (std::atomic-style contract).
    // The memory order is selected at compile time via dcas_type<Type>.
    template<int Type>
    inline bool dcas_atomic::compare_exchange( pair_t &expected, pair_t
    const &desired, dcas_type<Type> ) noexcept
    {
    using namespace std;
    static_assert(Type == DCAS_ACQ_REL || Type == DCAS_ACQUIRE || Type == DCAS_RELAXED || Type == DCAS_RELEASE || Type == DCAS_SEQ_CST);
    #if defined(_MSC_VER)
    #if defined(_M_X64) || defined(_M_ARM64) || defined(_M_ARM64EC)
    // MSVC 64-bit: _InterlockedCompareExchange128 takes the expected
    // pair as an in/out array and returns 1 on success.
    __int64 expectedA[2];
    expectedA[0] = (__int64)expected.first;
    expectedA[1] = (__int64)expected.second;
    char fRet;
    #if defined(_M_X64)
    // NOTE(review): the x64 branch ignores the requested order and
    // always uses the full-barrier intrinsic (x86 LOCK CMPXCHG16B is
    // sequentially consistent anyway) — confirm this is intended.
    fRet = _InterlockedCompareExchange128( u.m_firstAndSecond, desired.second, desired.first, expectedA );
    #else
    // ARM64: pick the intrinsic variant matching the requested order
    // (_acq / _nf / _rel), defaulting to the full-barrier form.
    if constexpr( Type == DCAS_ACQ_REL || Type == DCAS_SEQ_CST )
    fRet = _InterlockedCompareExchange128( u.m_firstAndSecond, desired.second, desired.first, expectedA );
    else if constexpr( Type == DCAS_ACQUIRE )
    fRet = _InterlockedCompareExchange128_acq( u.m_firstAndSecond, desired.second, desired.first, expectedA );
    else if constexpr( Type == DCAS_RELAXED )
    fRet = _InterlockedCompareExchange128_nf( u.m_firstAndSecond, desired.second, desired.first, expectedA );
    else
    fRet = _InterlockedCompareExchange128_rel( u.m_firstAndSecond, desired.second, desired.first, expectedA );
    #endif
    if( fRet )
    return true;
    // Failure: intrinsic stored the observed value into expectedA.
    expected.first = expectedA[0];
    expected.second = expectedA[1];
    return false;
    #elif defined(_M_IX86)
    // MSVC 32-bit: pack the pair into one 64-bit word and use CMPXCHG8B.
    auto compose = []( pair_t const &p ) -> uint64_t { return p.first | (uint64_t)p.second << 32; };
    uint64_t
    cDesired = compose( desired ),
    cExpected = compose( expected ),
    cResult = _InterlockedCompareExchange64( &u.m_firstAndSecond, cDesired, cExpected );
    if( cResult == cExpected ) [[likely]]
    return true;
    // Failure: unpack the observed value back into 'expected'.
    expected.first = (uint32_t)cResult;
    expected.second = (uint32_t)(cResult >> 32);
    return false;
    #else
    #error unspupported Windows-platform
    #endif
    #elif defined(__GNUC__) || defined(__clang__)
    // GCC/Clang: pack into a double word, honoring endianness via
    // pointers-to-member selecting which half is the low word.
    constexpr auto
    pair_t::*FIRST = std::endian::native == std::endian::little ? &pair_t::first : &pair_t::second,
    pair_t::*SECOND = std::endian::native == std::endian::little ? &pair_t::second : &pair_t::first;
    auto compose = []( pair_t const &p ) -> dword_t { return (dword_t)(p.*FIRST) | (dword_t)(p.*SECOND) << SIZE_WIDTH; };
    dword_t
    dwExpected = compose( expected ),
    dwDesired = compose( desired );
    bool ret;
    // Map the compile-time order onto __atomic_compare_exchange_n;
    // failure order is always relaxed.
    if constexpr( Type == DCAS_ACQ_REL )
    ret = __atomic_compare_exchange_n( &u.m_firstAndSecond, &dwExpected,
    dwDesired, true, __ATOMIC_ACQ_REL, __ATOMIC_RELAXED );
    else if constexpr( Type == DCAS_ACQUIRE )
    ret = __atomic_compare_exchange_n( &u.m_firstAndSecond, &dwExpected,
    dwDesired, true, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED );
    else if constexpr( Type == DCAS_RELAXED )
    ret = __atomic_compare_exchange_n( &u.m_firstAndSecond, &dwExpected,
    dwDesired, true, __ATOMIC_RELAXED, __ATOMIC_RELAXED );
    else if constexpr( Type == DCAS_RELEASE )
    ret = __atomic_compare_exchange_n( &u.m_firstAndSecond, &dwExpected,
    dwDesired, true, __ATOMIC_RELEASE, __ATOMIC_RELAXED );
    else
    ret = __atomic_compare_exchange_n( &u.m_firstAndSecond, &dwExpected,
    dwDesired, true, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED );
    if( ret ) [[likely]]
    return true;
    // Failure: unpack the observed double word back into 'expected'.
    expected.*FIRST = (size_t)dwExpected;
    expected.*SECOND = (size_t)(dwExpected >> SIZE_WIDTH);
    return false;
    #else
    #error unspupported platform
    #endif
    }

    // Atomic double-width load, implemented as a CAS of (0,0) against
    // itself: on mismatch the CAS writes the current value into 'cmp'.
    // NOTE: this is always an RMW (it may store (0,0) back when the
    // value really is (0,0)), so it is never a read-only access; the
    // const_cast is needed because compare_exchange is non-const.
    template<int Type>
    inline typename dcas_atomic::pair_t dcas_atomic::load( dcas_type<Type> )
    const noexcept
    {
    using namespace std;
    static_assert(Type == DCAS_ACQ_REL || Type == DCAS_ACQUIRE || Type == DCAS_RELAXED || Type == DCAS_RELEASE || Type == DCAS_SEQ_CST);
    pair_t cmp( 0, 0 );
    const_cast<dcas_atomic *>(this)->compare_exchange( cmp, cmp, dcas_type<Type>() );
    return cmp;
    }

    // Atomic double-width store via a CAS loop: snapshot the current
    // value (torn read is fine — the CAS refreshes 'cmp' on failure),
    // then retry the swap to 'niu' until it succeeds.
    template<int Type>
    inline void dcas_atomic::store( pair_t const &niu, dcas_type<Type> )
    noexcept
    {
    using namespace std;
    static_assert(Type == DCAS_ACQ_REL || Type == DCAS_ACQUIRE || Type == DCAS_RELAXED || Type == DCAS_RELEASE || Type == DCAS_SEQ_CST);
    pair_t cmp( *this );
    while( !this->compare_exchange( cmp, niu, dcas_type<Type>() ) );
    }

    #if defined(_MSC_VER)
    #pragma warning(pop)
    #endif
    #if defined(__llvm__)
    #pragma clang diagnostic pop
    #endif

    Am 18.02.2025 um 00:17 schrieb Chris M. Thomasson:
    This is a little C++20 test using a futex to allow one to wait on a lock-free stack. The main stack logic is in struct ct_stack. Well, can
    you get to compile and run? Thanks...

    I still need to check my algorithm out in Relacy. I think it might be
    able to model futex.


    My code:
    ______________________________________
    #include <iostream>
    #include <thread>
    #include <atomic>
    #include <algorithm>
    #include <cassert>


    #define CT_THREAD_N (42)
    #define CT_WORK_N (10000000)
    #define CT_WAIT ((ct_node*)0xDEADBEEF)


    // Intrusive singly-linked list node; base class for everything the
    // lock-free stacks hold. (Reconstructed: the posted listing's indentation
    // was mojibake-corrupted and did not compile.)
    struct ct_node
    {
        ct_node* m_next = { nullptr };

        ct_node()
        {
            //std::cout << "(" << this << ")::ct_node::ct_node()\n";
        }

        ~ct_node()
        {
            //std::cout << "(" << this << ")::ct_node::~ct_node()\n";
        }
    };


    struct ct_stack
    {
    ˙˙˙ std::atomic<ct_node*> m_head = { nullptr };

    ˙˙˙ void
    ˙˙˙ push(
    ˙˙˙˙˙˙˙ ct_node* node
    ˙˙˙ ) {
    ˙˙˙˙˙˙˙ ct_node* head = m_head.load(std::memory_order_relaxed);

    ˙˙˙˙˙˙˙ do
    ˙˙˙˙˙˙˙ {
    ˙˙˙˙˙˙˙˙˙˙˙ if (head == CT_WAIT)
    ˙˙˙˙˙˙˙˙˙˙˙ {
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ node->m_next = nullptr;
    ˙˙˙˙˙˙˙˙˙˙˙ }

    ˙˙˙˙˙˙˙˙˙˙˙ else
    ˙˙˙˙˙˙˙˙˙˙˙ {
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ node->m_next = head;
    ˙˙˙˙˙˙˙˙˙˙˙ }

    ˙˙˙˙˙˙˙ } while (! m_head.compare_exchange_weak(
    ˙˙˙˙˙˙˙˙˙˙˙ head,
    ˙˙˙˙˙˙˙˙˙˙˙ node,
    ˙˙˙˙˙˙˙˙˙˙˙ std::memory_order_release)
    ˙˙˙˙˙˙˙˙˙ );

    ˙˙˙˙˙˙˙ if (head == CT_WAIT)
    ˙˙˙˙˙˙˙ {
    ˙˙˙˙˙˙˙˙˙˙˙ // slow path...
    ˙˙˙˙˙˙˙˙˙˙˙ m_head.notify_one();
    ˙˙˙˙˙˙˙ }
    ˙˙˙ }

    ˙˙˙ ct_node*
    ˙˙˙ flush_wait()
    ˙˙˙ {
    ˙˙˙˙˙˙˙ ct_node* head = nullptr;

    ˙˙˙˙˙˙˙ for (;;)
    ˙˙˙˙˙˙˙ {
    ˙˙˙˙˙˙˙˙˙˙˙ head = m_head.exchange(nullptr, std::memory_order_acquire);

    ˙˙˙˙˙˙˙˙˙˙˙ while (! head || head == CT_WAIT)
    ˙˙˙˙˙˙˙˙˙˙˙ {
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ // slow path...
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ head = m_head.exchange(CT_WAIT,
    std::memory_order_acquire);

    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ if (! head || head == CT_WAIT)
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ {
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ m_head.wait(CT_WAIT, std::memory_order_relaxed);
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ continue;
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ }
    ˙˙˙˙˙˙˙˙˙˙˙ }

    ˙˙˙˙˙˙˙˙˙˙˙ break;
    ˙˙˙˙˙˙˙ }

    ˙˙˙˙˙˙˙ assert(head && head != CT_WAIT);

    ˙˙˙˙˙˙˙ return head;
    ˙˙˙ }
    };


    // A unit of work flowing through the stacks; m_state == 666 marks a
    // shutdown token. (Reconstructed from a mojibake-corrupted listing.)
    struct ct_work : public ct_node
    {
        unsigned long m_state = { 0 };
    };


    struct ct_shared
    {
    ˙˙˙ ct_stack m_stack_in;
    ˙˙˙ ct_stack m_stack_out;

    ˙˙˙ ct_shared()
    ˙˙˙ {
    ˙˙˙˙˙˙˙ std::cout << "ct_shared::ct_shared()\n" << std::endl;
    ˙˙˙ }

    ˙˙˙ ~ct_shared()
    ˙˙˙ {
    ˙˙˙˙˙˙˙ assert(! m_stack_in.m_head || m_stack_in.m_head == CT_WAIT);
    ˙˙˙˙˙˙˙ assert(! m_stack_out.m_head || m_stack_out.m_head == CT_WAIT);

    ˙˙˙˙˙˙˙ std::cout << "ct_shared::~ct_shared()\n" << std::endl;
    ˙˙˙ }
    };


    void
    ct_thread(
    ˙˙˙ ct_shared& shared
    ) {
    ˙˙˙ unsigned long state = 0;

    ˙˙˙ while (! state)
    ˙˙˙ {
    ˙˙˙˙˙˙˙ ct_work* head = (ct_work*)shared.m_stack_in.flush_wait();

    ˙˙˙˙˙˙˙ while (head)
    ˙˙˙˙˙˙˙ {
    ˙˙˙˙˙˙˙˙˙˙˙ ct_work* next = (ct_work*)head->m_next;

    ˙˙˙˙˙˙˙˙˙˙˙ if (head->m_state == 666)
    ˙˙˙˙˙˙˙˙˙˙˙ {
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ // Shutdown detected...
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ state = 1;
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ shared.m_stack_in.push(head);
    ˙˙˙˙˙˙˙˙˙˙˙ }

    ˙˙˙˙˙˙˙˙˙˙˙ else
    ˙˙˙˙˙˙˙˙˙˙˙ {
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ shared.m_stack_out.push(head);
    ˙˙˙˙˙˙˙˙˙˙˙ }

    ˙˙˙˙˙˙˙˙˙˙˙ head = next;
    ˙˙˙˙˙˙˙ }
    ˙˙˙ }

    ˙˙˙ //std::cout << "shutdown..." << std::endl;
    }


    int
    main()
    {
    ˙˙˙ {
    ˙˙˙˙˙˙˙ std::cout << "Futex Stack Test\n";
    ˙˙˙˙˙˙˙ std::cout << "by: Chris M. Thomasson\n";
    ˙˙˙˙˙˙˙ std::cout << "version: (0.0.1)\n";
    ˙˙˙˙˙˙˙ std::cout << "_________________________________\n" << std::endl;
    ˙˙˙ }

    ˙˙˙ unsigned long g_ct_work_alloations = 0;
    ˙˙˙ unsigned long g_ct_work_dealloations = 0;

    ˙˙˙ {
    ˙˙˙˙˙˙˙ std::cout << "CT_THREAD_N = " << CT_THREAD_N << std::endl;
    ˙˙˙˙˙˙˙ std::cout << "CT_WORK_N = " << CT_WORK_N << std::endl;
    ˙˙˙˙˙˙˙ std::cout << "CT_WAIT = " << CT_WAIT << std::endl;
    ˙˙˙ }

    ˙˙˙ {
    ˙˙˙˙˙˙˙ ct_shared shared = { };

    ˙˙˙˙˙˙˙ std::thread threads[CT_THREAD_N];

    ˙˙˙˙˙˙˙ std::cout << "\nLaunching threads...\n" << std::endl;

    ˙˙˙˙˙˙˙ for (unsigned long i = 0; i < CT_THREAD_N; ++i)
    ˙˙˙˙˙˙˙ {
    ˙˙˙˙˙˙˙˙˙˙˙ threads[i] = std::thread(ct_thread, std::ref(shared));
    ˙˙˙˙˙˙˙ }

    ˙˙˙˙˙˙˙ //std::this_thread::sleep_for(std::chrono::seconds(2));

    ˙˙˙˙˙˙˙ std::cout << "\nGenerate work...\n" << std::endl;

    ˙˙˙˙˙˙˙ // Generate work...
    ˙˙˙˙˙˙˙ {
    ˙˙˙˙˙˙˙˙˙˙˙ for (unsigned long i = 0; i < CT_WORK_N; ++i)
    ˙˙˙˙˙˙˙˙˙˙˙ {
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ shared.m_stack_in.push(new ct_work);
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ ++g_ct_work_alloations;
    ˙˙˙˙˙˙˙˙˙˙˙ }
    ˙˙˙˙˙˙˙ }

    ˙˙˙˙˙˙˙ // Wait for work...
    ˙˙˙˙˙˙˙ {
    ˙˙˙˙˙˙˙˙˙˙˙ unsigned long wn = 0;

    ˙˙˙˙˙˙˙˙˙˙˙ while (wn < CT_WORK_N)
    ˙˙˙˙˙˙˙˙˙˙˙ {
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ ct_work* head = (ct_work*)shared.m_stack_out.flush_wait();

    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ while (head)
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ {
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ ct_work* next = (ct_work*)head->m_next;
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ delete head;
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ ++g_ct_work_dealloations;
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ ++wn;
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ head = next;
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ }
    ˙˙˙˙˙˙˙˙˙˙˙ }
    ˙˙˙˙˙˙˙ }

    ˙˙˙˙˙˙˙ std::cout << "\nCompleted all work!\n" << std::endl;
    ˙˙˙˙˙˙˙ std::cout << "Sending shutdown state...\n" << std::endl;

    ˙˙˙˙˙˙˙ // Send shutdown state...
    ˙˙˙˙˙˙˙ {
    ˙˙˙˙˙˙˙˙˙˙˙ for (unsigned long i = 0; i < CT_THREAD_N; ++i)
    ˙˙˙˙˙˙˙˙˙˙˙ {
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ ct_work* w = new ct_work;
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ ++g_ct_work_alloations;
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ w->m_state = 666;
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ shared.m_stack_in.push(w);
    ˙˙˙˙˙˙˙˙˙˙˙ }
    ˙˙˙˙˙˙˙ }

    ˙˙˙˙˙˙˙ // Join threads...
    ˙˙˙˙˙˙˙ {
    ˙˙˙˙˙˙˙˙˙˙˙ for (unsigned long i = 0; i < CT_THREAD_N; ++i)
    ˙˙˙˙˙˙˙˙˙˙˙ {
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ threads[i].join();
    ˙˙˙˙˙˙˙˙˙˙˙ }
    ˙˙˙˙˙˙˙ }

    ˙˙˙˙˙˙˙ std::cout << "\nThreads joined...\n" << std::endl;

    ˙˙˙˙˙˙˙ // Dump shutdown state...
    ˙˙˙˙˙˙˙ std::cout << "Dump shutdown state...\n" << std::endl;

    ˙˙˙˙˙˙˙ {
    ˙˙˙˙˙˙˙˙˙˙˙ ct_work* head = (ct_work*)shared.m_stack_in.m_head.load(std::memory_order_relaxed);
    ˙˙˙˙˙˙˙˙˙˙˙ shared.m_stack_in.m_head = nullptr;

    ˙˙˙˙˙˙˙˙˙˙˙ while (head)
    ˙˙˙˙˙˙˙˙˙˙˙ {
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ ct_work* next = (ct_work*)head->m_next;
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ delete head;
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ ++g_ct_work_dealloations;
    ˙˙˙˙˙˙˙˙˙˙˙˙˙˙˙ head = next;
    ˙˙˙˙˙˙˙˙˙˙˙ }
    ˙˙˙˙˙˙˙ }

    ˙˙˙˙˙˙˙ std::cout << "\nShutdown complete!\n" << std::endl;
    ˙˙˙ }

    ˙˙˙ // Sanity check...
    ˙˙˙ {
    ˙˙˙˙˙˙˙ std::cout << "g_ct_work_alloations = " << g_ct_work_alloations
    << "\n";
    ˙˙˙˙˙˙˙ std::cout << "g_ct_work_dealloations = " <<
    g_ct_work_dealloations << std::endl;

    ˙˙˙˙˙˙˙ if (g_ct_work_alloations != g_ct_work_dealloations)
    ˙˙˙˙˙˙˙ {
    ˙˙˙˙˙˙˙˙˙˙˙ std::cout << "Pardon my French, but shit!!!!!!!!!!!!!! NOT GOOD\n" << std::endl;
    ˙˙˙˙˙˙˙˙˙˙˙ std::cout << "DAMN IT!!!! Grrrrr\n" << std::endl;
    ˙˙˙˙˙˙˙ }
    ˙˙˙ }

    ˙˙˙ std::cout << "\nFin!\n" << std::endl;

    ˙˙˙ return 0;
    }
    ______________________________________




    My output:
    _______________________
    Futex Stack Test
    by: Chris M. Thomasson
    version: (0.0.1)
    _________________________________

    CT_THREAD_N = 42
    CT_WORK_N = 10000000
    CT_WAIT = 00000000DEADBEEF
    ct_shared::ct_shared()


    Launching threads...


    Generate work...


    Completed all work!

    Sending shutdown state...


    Threads joined...

    Dump shutdown state...


    Shutdown complete!

    ct_shared::~ct_shared()

    g_ct_work_alloations = 10000042
    g_ct_work_dealloations = 10000042

    Fin!
    _______________________


    Well, any luck?


    --- MBSE BBS v1.1.1 (Linux-x86_64)
    * Origin: A noiseless patient Spider (3:633/280.2@fidonet)
  • From Chris M. Thomasson@3:633/280.2 to All on Sat May 3 04:30:56 2025
    On 5/2/2025 4:01 AM, Wuns Haerst wrote:
    It's not hard to implement a lock-free stack with DW-CAS,

    Oh, I know. :^)

    Fwiw, my stack uses exchange for pop and a normal cas push. Have another
    one that uses exchange for push and pop, pretty nice.


    so no need for futexes. That's my dcas_atomic:

    Keep in mind that the futex is only there to allow threads to wait on an
    empty condition...


    [snip code]

    I will take a look later on today.

    --- MBSE BBS v1.1.1 (Linux-x86_64)
    * Origin: A noiseless patient Spider (3:633/280.2@fidonet)
  • From Chris M. Thomasson@3:633/280.2 to All on Sat May 3 07:41:18 2025
    On 5/2/2025 11:30 AM, Chris M. Thomasson wrote:
    On 5/2/2025 4:01 AM, Wuns Haerst wrote:
    It's not hard to implement a lock-free stack with DW-CAS,

    Oh, I know. :^)

    Fwiw, my stack uses exchange for pop and a normal cas push. Have another
    one that uses exchange for push and pop, pretty nice.


    so no need for futexes. That's my dcas_atomic:

    Keep in mind that the futex is only there to allow threads to wait on an empty condition...


    [snip code]

    I will take a look later on today.

    Later on... Working now with GPU fun.

    https://youtu.be/xDROUq-q0yI

    ;^D

    Keep in mind that the futex is there to allow for waits.

    --- MBSE BBS v1.1.1 (Linux-x86_64)
    * Origin: A noiseless patient Spider (3:633/280.2@fidonet)
  • From Chris M. Thomasson@3:633/280.2 to All on Sat May 3 11:49:08 2025
    On 5/2/2025 4:01 AM, Wuns Haerst wrote:
    It's not hard to implement a lock-free stack with DW-CAS,
    so no need for futexes. That's my dcas_atomic:

    #pragma once
    #include <cstdint>
    #include <utility>
    #include <atomic>
    #include <type_traits>
    #include <climits>
    #include <bit>
    #include <memory>

    #if defined(_MSC_VER)
    ˙˙˙˙#pragma warning(push)
    ˙˙˙˙#pragma warning(disable: 26495)
    #endif
    #if defined(__llvm__)
    ˙˙˙ #pragma clang diagnostic push
    ˙˙˙ #pragma clang diagnostic ignored "-Watomic-alignment"
    #endif

    constexpr int
    ˙˙˙˙DCAS_ACQ_REL = (int)std::memory_order::acq_rel,
    ˙˙˙˙DCAS_ACQUIRE = (int)std::memory_order::acquire,
    ˙˙˙˙DCAS_RELAXED = (int)std::memory_order::relaxed,
    ˙˙˙˙DCAS_RELEASE = (int)std::memory_order::release,
    ˙˙˙˙DCAS_SEQ_CST = (int)std::memory_order::seq_cst;
    template<int Type>
    using dcas_type = std::integral_constant<int, Type>;
    using dcas_acq_rel = dcas_type<DCAS_ACQ_REL>;
    using dcas_acquire = dcas_type<DCAS_ACQUIRE>;
    using dcas_relaxed = dcas_type<DCAS_RELAXED>;
    using dcas_release = dcas_type<DCAS_RELEASE>;
    using dcas_seq_cst = dcas_type<DCAS_SEQ_CST>;

    struct dcas_atomic
    {
    ˙˙˙˙static_assert(sizeof(size_t) == 8 || sizeof(size_t) == 4, "must be
    64 or 32 bit architecture");
    ˙˙˙˙struct pair_t { size_t first, second; };
    ˙˙˙˙dcas_atomic() = default;
    ˙˙˙˙dcas_atomic( pair_t const &desired );
    ˙˙˙˙std::atomic<size_t> &first() noexcept;
    ˙˙˙˙std::atomic<size_t> &second() noexcept;
    ˙˙˙˙operator pair_t() noexcept;
    ˙˙˙˙template<int Type>
    ˙˙˙˙bool compare_exchange( pair_t &expected, pair_t const &desired, dcas_type<Type> = dcas_seq_cst() ) noexcept;
    ˙˙˙˙template<int Type>
    ˙˙˙˙pair_t load( dcas_type<Type> = dcas_seq_cst() ) const noexcept;
    ˙˙˙˙template<int Type>
    ˙˙˙˙void store( pair_t const &niu, dcas_type<Type> = dcas_seq_cst() ) noexcept;
    private:
    #if defined(__GNUC__) || defined(__clang__) && !defined(_MSC_VER)
    ˙˙˙˙#if SIZE_WIDTH == 64
    ˙˙˙˙using dword_t = unsigned __int128;
    ˙˙˙˙#elif SIZE_WIDTH == 32
    ˙˙˙˙using dword_t = unsigned long long;
    ˙˙˙˙#else
    ˙˙˙˙˙˙˙ #error unknown architecture
    ˙˙˙˙#endif
    #endif
    ˙˙˙˙union alignas(2 * sizeof(size_t)) U
    ˙˙˙˙{
    ˙˙˙˙˙˙˙ U() {}
    ˙˙˙˙˙˙˙ static_assert(sizeof(std::atomic<size_t>) == sizeof(size_t), "sizeof(atomic<size_t>) must be == sizeof(size_t)");
    ˙˙˙˙˙˙˙ std::atomic<size_t> m_atomics[2];
    #if defined(_MSC_VER)
    ˙˙˙˙#if defined(_M_X64) || defined(_M_ARM64) || defined(_M_ARM64EC)
    ˙˙˙˙˙˙˙ __int64 volatile m_firstAndSecond[2];
    ˙˙˙˙#elif defined(_M_IX86)
    ˙˙˙˙˙˙˙ __int64 volatile m_firstAndSecond;
    ˙˙˙˙#else
    ˙˙˙˙˙˙˙ #error unknown architecture
    ˙˙˙˙#endif
    #elif defined(__GNUC__) || defined(__clang__)
    ˙˙˙˙˙˙˙ dword_t volatile m_firstAndSecond;
    #endif
    ˙˙˙˙} u;
    };

    inline dcas_atomic::dcas_atomic( pair_t const &desired )
    {
    ˙˙˙˙u.m_atomics[0].store( desired.first, std::memory_order_relaxed );
    ˙˙˙˙u.m_atomics[1].store( desired.second, std::memory_order_relaxed );
    }

    inline std::atomic<size_t> &dcas_atomic::first() noexcept
    {
    ˙˙˙˙return u.m_atomics[0];
    }

    inline std::atomic<size_t> &dcas_atomic::second() noexcept
    {
    ˙˙˙˙return u.m_atomics[1];
    }

    inline dcas_atomic::operator pair_t() noexcept
    {
    ˙˙˙˙return pair_t( first().load( std::memory_order_relaxed ), second().load( std::memory_order_relaxed ) );
    }

    // Double-width CAS: atomically compare and replace {first, second} as one
    // unit. Returns true on success; on failure `expected` receives the value
    // actually found. `Type` selects the success memory order; the failure
    // order is always relaxed.
    // (Reconstructed: mojibake indentation and mid-statement line-wraps in
    // the posted listing; also fixes the "unspupported" typos in #error.)
    template<int Type>
    inline bool dcas_atomic::compare_exchange( pair_t &expected, pair_t const &desired, dcas_type<Type> ) noexcept
    {
        using namespace std;
        static_assert(Type == DCAS_ACQ_REL || Type == DCAS_ACQUIRE || Type == DCAS_RELAXED || Type == DCAS_RELEASE || Type == DCAS_SEQ_CST);
    #if defined(_MSC_VER)
        #if defined(_M_X64) || defined(_M_ARM64) || defined(_M_ARM64EC)
        __int64 expectedA[2];
        expectedA[0] = (__int64)expected.first;
        expectedA[1] = (__int64)expected.second;
        char fRet;
        #if defined(_M_X64)
        // x64 exposes only the full-barrier cmpxchg16b intrinsic.
        fRet = _InterlockedCompareExchange128( u.m_firstAndSecond, desired.second, desired.first, expectedA );
        #else
        // ARM64: pick the intrinsic variant matching the requested order.
        if constexpr( Type == DCAS_ACQ_REL || Type == DCAS_SEQ_CST )
            fRet = _InterlockedCompareExchange128( u.m_firstAndSecond, desired.second, desired.first, expectedA );
        else if constexpr( Type == DCAS_ACQUIRE )
            fRet = _InterlockedCompareExchange128_acq( u.m_firstAndSecond, desired.second, desired.first, expectedA );
        else if constexpr( Type == DCAS_RELAXED )
            fRet = _InterlockedCompareExchange128_nf( u.m_firstAndSecond, desired.second, desired.first, expectedA );
        else
            fRet = _InterlockedCompareExchange128_rel( u.m_firstAndSecond, desired.second, desired.first, expectedA );
        #endif
        if( fRet )
            return true;
        expected.first = expectedA[0];
        expected.second = expectedA[1];
        return false;
        #elif defined(_M_IX86)
        // 32-bit x86: pack the pair into one 64-bit word for cmpxchg8b.
        auto compose = []( pair_t const &p ) -> uint64_t { return p.first | (uint64_t)p.second << 32; };
        uint64_t
            cDesired = compose( desired ),
            cExpected = compose( expected ),
            cResult = _InterlockedCompareExchange64( &u.m_firstAndSecond, cDesired, cExpected );
        if( cResult == cExpected ) [[likely]]
            return true;
        expected.first = (uint32_t)cResult;
        expected.second = (uint32_t)(cResult >> 32);
        return false;
        #else
            // fix: message typo "unspupported" -> "unsupported"
            #error unsupported Windows-platform
        #endif
    #elif defined(__GNUC__) || defined(__clang__)
        // Map first/second onto the low/high halves of the double word,
        // respecting the target's endianness.
        constexpr auto
            pair_t::*FIRST = std::endian::native == std::endian::little ? &pair_t::first : &pair_t::second,
            pair_t::*SECOND = std::endian::native == std::endian::little ? &pair_t::second : &pair_t::first;
        auto compose = []( pair_t const &p ) -> dword_t { return (dword_t)(p.*FIRST) | (dword_t)(p.*SECOND) << SIZE_WIDTH; };
        dword_t
            dwExpected = compose( expected ),
            dwDesired = compose( desired );
        bool ret;
        if constexpr( Type == DCAS_ACQ_REL )
            ret = __atomic_compare_exchange_n( &u.m_firstAndSecond, &dwExpected, dwDesired, true, __ATOMIC_ACQ_REL, __ATOMIC_RELAXED );
        else if constexpr( Type == DCAS_ACQUIRE )
            ret = __atomic_compare_exchange_n( &u.m_firstAndSecond, &dwExpected, dwDesired, true, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED );
        else if constexpr( Type == DCAS_RELAXED )
            ret = __atomic_compare_exchange_n( &u.m_firstAndSecond, &dwExpected, dwDesired, true, __ATOMIC_RELAXED, __ATOMIC_RELAXED );
        else if constexpr( Type == DCAS_RELEASE )
            ret = __atomic_compare_exchange_n( &u.m_firstAndSecond, &dwExpected, dwDesired, true, __ATOMIC_RELEASE, __ATOMIC_RELAXED );
        else
            ret = __atomic_compare_exchange_n( &u.m_firstAndSecond, &dwExpected, dwDesired, true, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED );
        if( ret ) [[likely]]
            return true;
        expected.*FIRST = (size_t)dwExpected;
        expected.*SECOND = (size_t)(dwExpected >> SIZE_WIDTH);
        return false;
    #else
        // fix: message typo "unspupported" -> "unsupported"
        #error unsupported platform
    #endif
    }

    // Atomically read both words via a (0,0) compare-exchange: a failed CAS
    // writes the value actually stored back into `cmp`, so either way `cmp`
    // ends up holding an atomic snapshot.
    // NOTE(review): the const_cast makes this UB on an object originally
    // declared const, and the CAS writes the cache line even for pure reads.
    // (Reconstructed from a mojibake-corrupted listing.)
    template<int Type>
    inline typename dcas_atomic::pair_t dcas_atomic::load( dcas_type<Type> ) const noexcept
    {
        using namespace std;
        static_assert(Type == DCAS_ACQ_REL || Type == DCAS_ACQUIRE || Type == DCAS_RELAXED || Type == DCAS_RELEASE || Type == DCAS_SEQ_CST);
        pair_t cmp( 0, 0 );
        const_cast<dcas_atomic *>(this)->compare_exchange( cmp, cmp, dcas_type<Type>() );
        return cmp;
    }

    // Atomically replace both words with `niu`: spin on compare_exchange,
    // which refreshes `cmp` with the current value after each failure.
    // (Reconstructed from a mojibake-corrupted listing.)
    template<int Type>
    inline void dcas_atomic::store( pair_t const &niu, dcas_type<Type> ) noexcept
    {
        using namespace std;
        static_assert(Type == DCAS_ACQ_REL || Type == DCAS_ACQUIRE || Type == DCAS_RELAXED || Type == DCAS_RELEASE || Type == DCAS_SEQ_CST);
        // Seed the expected value via operator pair_t() (two relaxed loads).
        pair_t cmp( *this );
        while( !this->compare_exchange( cmp, niu, dcas_type<Type>() ) );
    }

    #if defined(_MSC_VER)
    ˙˙˙˙#pragma warning(pop)
    #endif
    #if defined(__llvm__)
    ˙˙˙ #pragma clang diagnostic pop
    #endif


    Hard to read that mess. Where is your push and pop, and where are you
    bumping the ABA counter?




    --- MBSE BBS v1.1.1 (Linux-x86_64)
    * Origin: A noiseless patient Spider (3:633/280.2@fidonet)
  • From Wuns Haerst@3:633/280.2 to All on Sat May 3 12:21:42 2025
    Am 03.05.2025 um 03:49 schrieb Chris M. Thomasson:
    On 5/2/2025 4:01 AM, Wuns Haerst wrote:
    It's not hard to implement a lock-free stack with DW-CAS,
    so no need for futexes. That's my dcas_atomic:

    #pragma once
    #include <cstdint>
    #include <utility>
    #include <atomic>
    #include <type_traits>
    #include <climits>
    #include <bit>
    #include <memory>

    #if defined(_MSC_VER)
    ˙˙˙˙˙#pragma warning(push)
    ˙˙˙˙˙#pragma warning(disable: 26495)
    #endif
    #if defined(__llvm__)
    ˙˙˙˙ #pragma clang diagnostic push
    ˙˙˙˙ #pragma clang diagnostic ignored "-Watomic-alignment"
    #endif

    constexpr int
    ˙˙˙˙˙DCAS_ACQ_REL = (int)std::memory_order::acq_rel,
    ˙˙˙˙˙DCAS_ACQUIRE = (int)std::memory_order::acquire,
    ˙˙˙˙˙DCAS_RELAXED = (int)std::memory_order::relaxed,
    ˙˙˙˙˙DCAS_RELEASE = (int)std::memory_order::release,
    ˙˙˙˙˙DCAS_SEQ_CST = (int)std::memory_order::seq_cst;
    template<int Type>
    using dcas_type = std::integral_constant<int, Type>;
    using dcas_acq_rel = dcas_type<DCAS_ACQ_REL>;
    using dcas_acquire = dcas_type<DCAS_ACQUIRE>;
    using dcas_relaxed = dcas_type<DCAS_RELAXED>;
    using dcas_release = dcas_type<DCAS_RELEASE>;
    using dcas_seq_cst = dcas_type<DCAS_SEQ_CST>;

    struct dcas_atomic
    {
    ˙˙˙˙˙static_assert(sizeof(size_t) == 8 || sizeof(size_t) == 4, "must
    be 64 or 32 bit architecture");
    ˙˙˙˙˙struct pair_t { size_t first, second; };
    ˙˙˙˙˙dcas_atomic() = default;
    ˙˙˙˙˙dcas_atomic( pair_t const &desired );
    ˙˙˙˙˙std::atomic<size_t> &first() noexcept;
    ˙˙˙˙˙std::atomic<size_t> &second() noexcept;
    ˙˙˙˙˙operator pair_t() noexcept;
    ˙˙˙˙˙template<int Type>
    ˙˙˙˙˙bool compare_exchange( pair_t &expected, pair_t const &desired,
    dcas_type<Type> = dcas_seq_cst() ) noexcept;
    ˙˙˙˙˙template<int Type>
    ˙˙˙˙˙pair_t load( dcas_type<Type> = dcas_seq_cst() ) const noexcept;
    ˙˙˙˙˙template<int Type>
    ˙˙˙˙˙void store( pair_t const &niu, dcas_type<Type> = dcas_seq_cst() )
    noexcept;
    private:
    #if defined(__GNUC__) || defined(__clang__) && !defined(_MSC_VER)
    ˙˙˙˙˙#if SIZE_WIDTH == 64
    ˙˙˙˙˙using dword_t = unsigned __int128;
    ˙˙˙˙˙#elif SIZE_WIDTH == 32
    ˙˙˙˙˙using dword_t = unsigned long long;
    ˙˙˙˙˙#else
    ˙˙˙˙˙˙˙˙ #error unknown architecture
    ˙˙˙˙˙#endif
    #endif
    ˙˙˙˙˙union alignas(2 * sizeof(size_t)) U
    ˙˙˙˙˙{
    ˙˙˙˙˙˙˙˙ U() {}
    ˙˙˙˙˙˙˙˙ static_assert(sizeof(std::atomic<size_t>) == sizeof(size_t),
    "sizeof(atomic<size_t>) must be == sizeof(size_t)");
    ˙˙˙˙˙˙˙˙ std::atomic<size_t> m_atomics[2];
    #if defined(_MSC_VER)
    ˙˙˙˙˙#if defined(_M_X64) || defined(_M_ARM64) || defined(_M_ARM64EC)
    ˙˙˙˙˙˙˙˙ __int64 volatile m_firstAndSecond[2];
    ˙˙˙˙˙#elif defined(_M_IX86)
    ˙˙˙˙˙˙˙˙ __int64 volatile m_firstAndSecond;
    ˙˙˙˙˙#else
    ˙˙˙˙˙˙˙˙ #error unknown architecture
    ˙˙˙˙˙#endif
    #elif defined(__GNUC__) || defined(__clang__)
    ˙˙˙˙˙˙˙˙ dword_t volatile m_firstAndSecond;
    #endif
    ˙˙˙˙˙} u;
    };

    inline dcas_atomic::dcas_atomic( pair_t const &desired )
    {
    ˙˙˙˙˙u.m_atomics[0].store( desired.first, std::memory_order_relaxed );
    ˙˙˙˙˙u.m_atomics[1].store( desired.second, std::memory_order_relaxed );
    }

    inline std::atomic<size_t> &dcas_atomic::first() noexcept
    {
    ˙˙˙˙˙return u.m_atomics[0];
    }

    inline std::atomic<size_t> &dcas_atomic::second() noexcept
    {
    ˙˙˙˙˙return u.m_atomics[1];
    }

    inline dcas_atomic::operator pair_t() noexcept
    {
    ˙˙˙˙˙return pair_t( first().load( std::memory_order_relaxed ),
    second().load( std::memory_order_relaxed ) );
    }

    template<int Type>
    inline bool dcas_atomic::compare_exchange( pair_t &expected, pair_t
    const &desired, dcas_type<Type> ) noexcept
    {
    ˙˙˙˙˙using namespace std;
    ˙˙˙˙˙static_assert(Type == DCAS_ACQ_REL || Type == DCAS_ACQUIRE ||
    Type == DCAS_RELAXED || Type == DCAS_RELEASE || Type == DCAS_SEQ_CST);
    #if defined(_MSC_VER)
    ˙˙˙˙˙#if defined(_M_X64) || defined(_M_ARM64) || defined(_M_ARM64EC)
    ˙˙˙˙˙__int64 expectedA[2];
    ˙˙˙˙˙expectedA[0] = (__int64)expected.first;
    ˙˙˙˙˙expectedA[1] = (__int64)expected.second;
    ˙˙˙˙˙char fRet;
    ˙˙˙˙˙#if defined(_M_X64)
    ˙˙˙˙˙fRet = _InterlockedCompareExchange128( u.m_firstAndSecond,
    desired.second, desired.first, expectedA );
    ˙˙˙˙˙#else
    ˙˙˙˙˙if constexpr( Type == DCAS_ACQ_REL || Type == DCAS_SEQ_CST )
    ˙˙˙˙˙˙˙˙ fRet = _InterlockedCompareExchange128( u.m_firstAndSecond,
    desired.second, desired.first, expectedA );
    ˙˙˙˙˙else if constexpr( Type == DCAS_ACQUIRE )
    ˙˙˙˙˙˙˙˙ fRet =
    _InterlockedCompareExchange128_acq( u.m_firstAndSecond,
    desired.second, desired.first, expectedA );
    ˙˙˙˙˙else if constexpr( Type == DCAS_RELAXED )
    ˙˙˙˙˙˙˙˙ fRet = _InterlockedCompareExchange128_nf( u.m_firstAndSecond,
    desired.second, desired.first, expectedA );
    ˙˙˙˙˙else
    ˙˙˙˙˙˙˙˙ fRet =
    _InterlockedCompareExchange128_rel( u.m_firstAndSecond,
    desired.second, desired.first, expectedA );
    ˙˙˙˙˙#endif
    ˙˙˙˙˙if( fRet )
    ˙˙˙˙˙˙˙˙ return true;
    ˙˙˙˙˙expected.first = expectedA[0];
    ˙˙˙˙˙expected.second = expectedA[1];
    ˙˙˙˙˙return false;
    ˙˙˙˙˙#elif defined(_M_IX86)
    ˙˙˙˙˙auto compose = []( pair_t const &p ) -> uint64_t { return p.first
    | (uint64_t)p.second << 32; };
    ˙˙˙˙˙uint64_t
    ˙˙˙˙˙˙˙˙ cDesired = compose( desired ),
    ˙˙˙˙˙˙˙˙ cExpected = compose( expected ),
    ˙˙˙˙˙˙˙˙ cResult = _InterlockedCompareExchange64( &u.m_firstAndSecond,
    cDesired, cExpected );
    ˙˙˙˙˙if( cResult == cExpected ) [[likely]]
    ˙˙˙˙˙˙˙˙ return true;
    ˙˙˙˙˙expected.first = (uint32_t)cResult;
    ˙˙˙˙˙expected.second = (uint32_t)(cResult >> 32);
    ˙˙˙˙˙return false;
    ˙˙˙˙˙#else
    ˙˙˙˙˙˙˙˙ #error unspupported Windows-platform
    ˙˙˙˙˙#endif
    #elif defined(__GNUC__) || defined(__clang__)
    ˙˙˙˙˙constexpr auto
    ˙˙˙˙˙˙˙˙ pair_t::*FIRST = std::endian::native == std::endian::little ?
    &pair_t::first : &pair_t::second,
    ˙˙˙˙˙˙˙˙ pair_t::*SECOND = std::endian::native ==
    std::endian::little ? &pair_t::second : &pair_t::first;
    ˙˙˙˙˙auto compose = []( pair_t const &p ) -> dword_t˙˙˙ { return
    (dword_t)(p.*FIRST) | (dword_t)(p.*SECOND) << SIZE_WIDTH; };
    ˙˙˙˙˙dword_t
    ˙˙˙˙˙˙˙˙ dwExpected = compose( expected ),
    ˙˙˙˙˙˙˙˙ dwDesired = compose( desired );
    ˙˙˙˙˙bool ret;
    ˙˙˙˙˙if constexpr( Type == DCAS_ACQ_REL )
    ˙˙˙˙˙˙˙˙ ret = __atomic_compare_exchange_n( &u.m_firstAndSecond,
    &dwExpected, dwDesired, true, __ATOMIC_ACQ_REL, __ATOMIC_RELAXED );
    ˙˙˙˙˙else if constexpr( Type == DCAS_ACQUIRE )
    ˙˙˙˙˙˙˙˙ ret = __atomic_compare_exchange_n( &u.m_firstAndSecond,
    &dwExpected, dwDesired, true, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED );
    ˙˙˙˙˙else if constexpr( Type == DCAS_RELAXED )
    ˙˙˙˙˙˙˙˙ ret = __atomic_compare_exchange_n( &u.m_firstAndSecond,
    &dwExpected, dwDesired, true, __ATOMIC_RELAXED, __ATOMIC_RELAXED );
    ˙˙˙˙˙else if constexpr( Type == DCAS_RELEASE )
    ˙˙˙˙˙˙˙˙ ret = __atomic_compare_exchange_n( &u.m_firstAndSecond,
    &dwExpected, dwDesired, true, __ATOMIC_RELEASE, __ATOMIC_RELAXED );
    ˙˙˙˙˙else
    ˙˙˙˙˙˙˙˙ ret = __atomic_compare_exchange_n( &u.m_firstAndSecond,
    &dwExpected, dwDesired, true, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED );
    ˙˙˙˙˙if( ret ) [[likely]]
    ˙˙˙˙˙˙˙˙ return true;
    ˙˙˙˙˙expected.*FIRST = (size_t)dwExpected;
    ˙˙˙˙˙expected.*SECOND = (size_t)(dwExpected >> SIZE_WIDTH);
    ˙˙˙˙˙return false;
    #else
    ˙˙˙˙˙#error unspupported platform
    #endif
    }

    template<int Type>
    inline typename dcas_atomic::pair_t
    dcas_atomic::load( dcas_type<Type> ) const noexcept
    {
    ˙˙˙˙˙using namespace std;
    ˙˙˙˙˙static_assert(Type == DCAS_ACQ_REL || Type == DCAS_ACQUIRE ||
    Type == DCAS_RELAXED || Type == DCAS_RELEASE || Type == DCAS_SEQ_CST);
    ˙˙˙˙˙pair_t cmp( 0, 0 );
    ˙˙˙˙˙const_cast<dcas_atomic *>(this)->compare_exchange( cmp, cmp,
    dcas_type<Type>() );
    ˙˙˙˙˙return cmp;
    }

    template<int Type>
    inline void dcas_atomic::store( pair_t const &niu, dcas_type<Type> )
    noexcept
    {
    ˙˙˙˙˙using namespace std;
    ˙˙˙˙˙static_assert(Type == DCAS_ACQ_REL || Type == DCAS_ACQUIRE ||
    Type == DCAS_RELAXED || Type == DCAS_RELEASE || Type == DCAS_SEQ_CST);
    ˙˙˙˙˙pair_t cmp( *this );
    ˙˙˙˙˙while( !this->compare_exchange( cmp, niu, dcas_type<Type>() ) );
    }

    #if defined(_MSC_VER)
    ˙˙˙˙˙#pragma warning(pop)
    #endif
    #if defined(__llvm__)
    ˙˙˙˙ #pragma clang diagnostic pop
    #endif


    Hard to read that mess. Where is your push and pop, and where are you bumping the ABA counter?




    That's only the DWCAS-atomic.


    --- MBSE BBS v1.1.1 (Linux-x86_64)
    * Origin: A noiseless patient Spider (3:633/280.2@fidonet)
  • From Bonita Montero@3:633/280.2 to All on Sat May 3 13:02:10 2025
    That's the atomic_stack:

    header:

    #pragma once
    #if defined(_MSC_VER)
    #include <Windows.h>
    #endif
    #include <atomic>
    #include "dcas_atomic.h"

    // Lock-free LIFO built on a double-width CAS: the first word of the
    // DWCAS pair is the top-of-stack pointer, the second is an ABA
    // generation counter bumped on every update.
    struct atomic_stack
    {
        struct link { link *next; };  // intrusive node header
        atomic_stack();
        atomic_stack( link *pFirstItem );
        void push( link *pItem );
        void push_chain( link *pFirstItem );
        void push_chain( link *pFirstItem, link *pLastItem );
        link *pop();
        link *pop_all();
        operator bool();
    private:
        using dcas_pair = typename dcas_atomic::pair_t;
        dcas_atomic m_top;  // {top pointer, ABA counter}
    };

    // Empty stack: delegate to the one-argument constructor with a null top.
    inline atomic_stack::atomic_stack() :
        atomic_stack( nullptr )
    {
    }

    // Seed the stack with an already-linked chain; the ABA counter starts
    // at zero.
    inline atomic_stack::atomic_stack( link *pFirstItem ) :
        m_top( dcas_pair( (size_t)pFirstItem, 0 ) )
    {
    }

    // Push a chain whose tail is unknown: walk to the last node first, then
    // hand off to the two-argument overload.
    inline void atomic_stack::push_chain( link *pFirstItem )
    {
        link *pLastItem = pFirstItem;
        while( pLastItem->next )
            pLastItem = pLastItem->next;
        push_chain( pFirstItem, pLastItem );
    }

    // True when the top-pointer word is non-zero (stack non-empty).
    inline atomic_stack::operator bool()
    {
        return m_top.first();
    }

    cpp:

    #include "atomic_stack.h"

    // Push one node: link it to the current top, bump the ABA counter, and
    // retry the DWCAS until it lands. Release order publishes the node's
    // contents to the thread that later pops it.
    void atomic_stack::push( link *pItem )
    {
        dcas_pair seen( m_top ), want;
        do
        {
            pItem->next = (link *)seen.first;
            want.first = (size_t)pItem;
            want.second = seen.second + 1;
        } while( !m_top.compare_exchange( seen, want, dcas_release() ) );
    }

    // Splice a pre-linked chain [pFirstItem .. pLastItem] onto the stack in
    // one DWCAS: the old top becomes the tail's successor.
    void atomic_stack::push_chain( link *pFirstItem, link *pLastItem )
    {
        dcas_pair seen( m_top ), want;
        do
        {
            pLastItem->next = (link *)seen.first;
            want.first = (size_t)pFirstItem;
            want.second = seen.second + 1;
        } while( !m_top.compare_exchange( seen, want, dcas_release() ) );
    }

    typename atomic_stack::link *atomic_stack::pop()
    {
    using namespace std;
    dcas_pair cmp, niu;
    link *pItem;
    cmp.first = m_top.first();
    if( !(pItem = (link *)cmp.first) ) [[unlikely]]
    return nullptr;
    cmp.second = m_top.second();
    for( ; ; )
    {
    niu.first = (size_t)pItem->next;
    niu.second = cmp.second + 1;
    if( m_top.compare_exchange( cmp, niu, dcas_acquire() ) ) [[likely]]
    return pItem;
    if( !(pItem = (link *)cmp.first) ) [[unlikely]]
    return nullptr;
    }
    }

    typename atomic_stack::link *atomic_stack::pop_all()
    {
    using namespace std;
    dcas_pair cmp, niu;
    cmp.first = m_top.first();
    if( !cmp.first ) [[unlikely]]
    return nullptr;
    cmp.second = m_top.second();
    niu.first = (size_t)nullptr;
    do
    niu.second = cmp.second + 1;
    while( !m_top.compare_exchange( cmp, niu, dcas_acquire() ) );
    return (link *)cmp.first;
    }

    --- MBSE BBS v1.1.1 (Linux-x86_64)
    * Origin: A noiseless patient Spider (3:633/280.2@fidonet)
  • From Chris M. Thomasson@3:633/280.2 to All on Sun May 4 04:02:16 2025
    On 5/2/2025 8:02 PM, Bonita Montero wrote:
    That's the atomic_stack:

    header:

    #pragma once
    #if defined(_MSC_VER)
    ˙˙˙˙#include <Windows.h>
    #endif
    #include <atomic>
    #include "dcas_atomic.h"

    struct atomic_stack
    {
    ˙˙˙˙struct link { link *next; };
    ˙˙˙˙atomic_stack();
    ˙˙˙˙atomic_stack( link *pFirstItem );
    ˙˙˙˙void push( link *pItem );
    ˙˙˙˙void push_chain( link *pFirstItem );
    ˙˙˙˙void push_chain( link *pFirstItem, link *pLastItem );
    ˙˙˙˙link *pop();
    ˙˙˙˙link *pop_all();
    ˙˙˙˙operator bool();
    private:
    ˙˙˙˙using dcas_pair = typename dcas_atomic::pair_t;
    ˙˙˙˙dcas_atomic m_top;
    };

    inline atomic_stack::atomic_stack() :
    ˙˙˙˙atomic_stack( nullptr )
    {
    }

    inline atomic_stack::atomic_stack( link *pFirstItem ) :
    ˙˙˙˙m_top( dcas_pair( (size_t)pFirstItem, 0 ) )
    {
    }

    inline void atomic_stack::push_chain( link *pFirstItem )
    {
    ˙˙˙˙link *pLastItem = pFirstItem;
    ˙˙˙˙for( ; pLastItem->next; pLastItem = pLastItem->next );
    ˙˙˙˙push_chain( pFirstItem, pLastItem );
    }

    inline atomic_stack::operator bool()
    {
    ˙˙˙˙return m_top.first();
    }

    cpp:

    #include "atomic_stack.h"

    void atomic_stack::push( link *pItem )
    {
    ˙˙˙˙dcas_pair cmp( m_top ), niu;
    ˙˙˙˙do
    ˙˙˙˙{
    ˙˙˙˙˙˙˙ pItem->next = (link *)cmp.first;
    ˙˙˙˙˙˙˙ niu.first = (size_t)pItem;
    ˙˙˙˙˙˙˙ niu.second = cmp.second + 1;
    ˙˙˙˙} while( !m_top.compare_exchange( cmp, niu, dcas_release() ) );
    }

    void atomic_stack::push_chain( link *pFirstItem, link *pLastItem )
    {
    ˙˙˙˙dcas_pair cmp( m_top ), niu;
    ˙˙˙˙do
    ˙˙˙˙{
    ˙˙˙˙˙˙˙ pLastItem->next = (link *)cmp.first;
    ˙˙˙˙˙˙˙ niu.first = (size_t)pFirstItem;
    ˙˙˙˙˙˙˙ niu.second = cmp.second + 1;
    ˙˙˙˙} while( !m_top.compare_exchange( cmp, niu, dcas_release() ) );
    }

    typename atomic_stack::link *atomic_stack::pop()
    {
    // Pop a single node using a DWCAS on the {pointer, version} pair; the
    // version bump defeats ABA on the pointer word.
    // NOTE(review): classic lock-free-stack hazard — pItem->next is read
    // below while another thread may already have popped and freed pItem.
    // Safe only if nodes are not freed while pops can be in flight (this is
    // the memory issue discussed elsewhere in this thread).
    ˙˙˙˙using namespace std;
    ˙˙˙˙dcas_pair cmp, niu;
    ˙˙˙˙link *pItem;
    // Read the pointer word first; an empty stack needs no counter read.
    ˙˙˙˙cmp.first = m_top.first();
    ˙˙˙˙if( !(pItem = (link *)cmp.first) ) [[unlikely]]
    ˙˙˙˙˙˙˙ return nullptr;
    // Counter read after the pointer: a torn {first, second} snapshot only
    // causes a failed compare_exchange and a retry, never a wrong pop.
    ˙˙˙˙cmp.second = m_top.second();
    ˙˙˙˙for( ; ; )
    ˙˙˙˙{
    // Proposed new top: the successor node, with the counter incremented.
    ˙˙˙˙˙˙˙ niu.first = (size_t)pItem->next;
    ˙˙˙˙˙˙˙ niu.second = cmp.second + 1;
    ˙˙˙˙˙˙˙ if( m_top.compare_exchange( cmp, niu, dcas_acquire() ) )
    [[likely]]
    ˙˙˙˙˙˙˙˙˙˙˙ return pItem;
    // On failure compare_exchange presumably reloads cmp with the current
    // pair (std::atomic-style) — the retry below relies on that. Bail out
    // if the stack drained while we were contending.
    ˙˙˙˙˙˙˙ if( !(pItem = (link *)cmp.first) ) [[unlikely]]
    ˙˙˙˙˙˙˙˙˙˙˙ return nullptr;
    ˙˙˙˙}
    }

    typename atomic_stack::link *atomic_stack::pop_all()
    {
    ˙˙˙˙using namespace std;
    ˙˙˙˙dcas_pair cmp, niu;
    ˙˙˙˙cmp.first = m_top.first();
    ˙˙˙˙if( !cmp.first ) [[unlikely]]
    ˙˙˙˙˙˙˙ return nullptr;
    ˙˙˙˙cmp.second = m_top.second();
    ˙˙˙˙niu.first = (size_t)nullptr;
    ˙˙˙˙do
    ˙˙˙˙˙˙˙ niu.second = cmp.second + 1;
    ˙˙˙˙while( !m_top.compare_exchange( cmp, niu, dcas_acquire() ) );
    ˙˙˙˙return (link *)cmp.first;
    }

    From a cursory read it seems fine, Bonita. Also, I had a suspicion that
    Wuns Haerst just "might" be you because the code style itself looked
    fairly similar to yours. Am I right? Also, I like the push chain.
    Actually, I was thinking about adding it to my futex stack test.

    Now, its fun to be able to get a lock-free stack with only single word exchange and cas, and have the ability to use it with a futex to allow
    one to wait on empty conditions.

    Wrt DWCAS, it's a god damn shame that C++ tends to make it not lock
    free: That sucks! Believe it or not, there is a way to make it work with single word exchange for the push and pop. I posted about it before on
    this group.

    Now, there is another issue with a single pop... If you delete a node,
    it can bite the dust. A proxy collector works well with it, but that is
    a bit overkill! Uggg ;^o However, iirc, SEH can be used to handle it.
    Iirc, that is what the SLIST uses on windows? Been a while...

    There is no memory issue if you flush, or pop_all. :^)

    --- MBSE BBS v1.1.1 (Linux-x86_64)
    * Origin: A noiseless patient Spider (3:633/280.2@fidonet)
  • From Bonita Montero@3:633/280.2 to All on Sun May 4 04:19:59 2025
    Am 03.05.2025 um 20:02 schrieb Chris M. Thomasson:

    Now, its fun to be able to get a lock-free stack with only single word exchange and cas, and have the ability to use it with a futex to allow
    one to wait on empty conditions.

    Is a lock-free stack with a single word CAS really possible without an ABA-problem ?

    Wrt DWCAS, it's a god damn shame that C++ tends to make it not lock
    free: That sucks! ...

    That's what I also was somewhat frustrated about, but it wasn't much
    work to use the proper compiler-intrinsics. I think you can't expect
    that this will change in the next 10 years since lock-free stacks are
    an exotic topic for 99% of all developers. I guess even the skilled compiler-writers are mostly not aware of this as such datastructures
    rarely make sense and often have the drawback that you have to poll since
    you never wait in the kernel as with a mutex.

    Now, there is another issue with a single pop... If you delete a node,
    it can bite the dust. ...

    You could catch an SEH-exception or with Linux a signal and jump out
    of the handler with siglongjmp if the memory physically has been de-
    allocated.
    But maybe this problem doesn't arise that often. With memory-allocators
    like mimalloc, jemalloc or TCMalloc you only have one thread who pops
    the items being freed by non pool-owning threads and deallocates them
    and not multiple contending threads.



    --- MBSE BBS v1.1.1 (Linux-x86_64)
    * Origin: A noiseless patient Spider (3:633/280.2@fidonet)
  • From Chris M. Thomasson@3:633/280.2 to All on Sun May 4 05:58:41 2025
    On 5/3/2025 11:19 AM, Bonita Montero wrote:
    Am 03.05.2025 um 20:02 schrieb Chris M. Thomasson:

    Now, its fun to be able to get a lock-free stack with only single word
    exchange and cas, and have the ability to use it with a futex to allow
    one to wait on empty conditions.

    Is a lock-free stack with a single word CAS really possible without an ABA-problem ?

    Will get back to you. Need to run a bunch of errands. If you omit single
    item pop, yup. It's also possible to get rid of the cas in push.

    [...]

    --- MBSE BBS v1.1.1 (Linux-x86_64)
    * Origin: A noiseless patient Spider (3:633/280.2@fidonet)
  • From Chris M. Thomasson@3:633/280.2 to All on Sun May 4 11:57:11 2025
    On 5/3/2025 12:58 PM, Chris M. Thomasson wrote:
    On 5/3/2025 11:19 AM, Bonita Montero wrote:
    Am 03.05.2025 um 20:02 schrieb Chris M. Thomasson:

    Now, its fun to be able to get a lock-free stack with only single
    word exchange and cas, and have the ability to use it with a futex to
    allow one to wait on empty conditions.

    Is a lock-free stack with a single word CAS really possible without an
    ABA-problem ?

    Will get back to you. Need to run a bunch of errands. If you omit single item pop, yup. It's also possible to get rid of the cas in push.

    The ABA problem _and_ memory issue is only there on the single node
    pop for a lock-free stack. If you just whack it with a nullptr aka flush
    _all_ nodes, then you are all right and ready to roll.

    Actually, IBM had/has a nice write up of it in an appendix for the
    principles of operations iirc. I think it might have been in appendix 40
    ish or something. God it's been a while since I read that Bonita. Then
    they have a rather nifty PLO, or perform locked operation, again iirc...

    The futex is fun to use because it allows one to wait on an empty
    condition...

    Actually, I remember Joe Seigh taking all the nodes in an exchange,
    reversing them, and BAM, its FIFO. ;^)

    [...]



    --- MBSE BBS v1.1.1 (Linux-x86_64)
    * Origin: A noiseless patient Spider (3:633/280.2@fidonet)
  • From Bonita Montero@3:633/280.2 to All on Thu May 8 19:11:48 2025
    Am 04.05.2025 um 03:57 schrieb Chris M. Thomasson:

    The ABA problem _and_ memory issue is only there in on the single node
    pop for a lock-free stack. If you just whack it with a nullptr aka flush _all_ nodes, then you are all right and ready to roll.

    Very interesting! I'm just going to believe this without having thought
    it through. This ultimately means that when memory allocators clean up
    foreign releases from a lock-free stack, they don't need DWCAS to do so.


    --- MBSE BBS v1.1.1 (Linux-x86_64)
    * Origin: A noiseless patient Spider (3:633/280.2@fidonet)
  • From Chris M. Thomasson@3:633/280.2 to All on Fri May 9 05:37:50 2025
    On 5/8/2025 2:11 AM, Bonita Montero wrote:
    Am 04.05.2025 um 03:57 schrieb Chris M. Thomasson:

    The ABA problem _and_ memory issue is only there in on the single node
    pop for a lock-free stack. If you just whack it with a nullptr aka
    flush _all_ nodes, then you are all right and ready to roll.

    Very interesting! I'm just going to believe this without having thought
    it through. This ultimately means that when memory allocators clean up foreign releases from a lock-free stack, they don't need DWCAS to do so.


    It only works if you remove all of the nodes, aka flush (pop_all). Not pop_one. If you intermix the two, then pop_one will still have the
    memory issue where a node can be prematurely deleted. This is besides
    the ABA problem, a completely different problem.

    SEH on the pop_one, or some other means, would still be needed if you
    delete nodes. If you only pop_all, then that eliminates ABA and the
    memory issue. Now, iirc way back, around 20 years ago. I think I
    remember reading about some interesting issues about using CAS on a word
    that is part of a DWCAS anchor. So, some pseudo-code:

    struct dwcas_anchor
    {
    word* head;
    word aba;
    };

    And using CAS on the dwcas_anchor::head. Keep in mind that ABA does not
    need to be updated if we remove all the nodes. Actually, pushing a node
    does not need to update ABA. Only the pop_one's. Iirc, this makes things tricky on the pop_one side. Iirc, aba needs to be read first. Ummm... Take
    a look at some of my older code. I had to program this in ASM back then,
    lol:

    https://web.archive.org/web/20060214112345/http://appcore.home.comcast.net/appcore/src/cpu/i686/ac_i686_gcc_asm.html

    https://web.archive.org/web/20060214112539/http://appcore.home.comcast.net/appcore/src/cpu/i686/ac_i686_masm_asm.html


    # Lock-free MPMC stack push using a single-word CAS (i686, AT&T syntax).
    # cdecl args: 4(%esp) = address of the stack anchor (head pointer),
    #             8(%esp) = node to push.
    # NOTE(review): the doubled leading dots ("..align", "..globl") look like
    # a message-transport artifact for ".align"/".globl" — confirm against
    # the archived original before assembling.
    ..align 16
    ..globl ac_i686_stack_mpmc_push_cas
    ac_i686_stack_mpmc_push_cas:
    movl 4(%esp), %edx          # edx = anchor address
    movl (%edx), %eax           # eax = current head (CAS comparand)
    movl 8(%esp), %ecx          # ecx = node being pushed

    ac_i686_stack_mpmc_push_cas_retry:
    movl %eax, (%ecx)           # node->next = current head
    lock cmpxchgl %ecx, (%edx)  # if *anchor == eax: *anchor = node; else eax = *anchor
    jne ac_i686_stack_mpmc_push_cas_retry
    ret



    # Lock-free MPMC stack pop using DWCAS (cmpxchg8b) on an 8-byte
    # {head pointer, ABA counter} anchor (i686, AT&T syntax).
    # cdecl arg: address of the anchor. Returns popped node in %eax, 0 if empty.
    # NOTE(review): "..align"/"..globl" look like transport-doubled dots for
    # ".align"/".globl" — confirm against the archived original.
    ..align 16
    ..globl np_ac_i686_stack_mpmc_pop_dwcas
    np_ac_i686_stack_mpmc_pop_dwcas:
    pushl %esi                  # preserve callee-saved registers used below
    pushl %ebx
    movl 12(%esp), %esi         # esi = anchor address (arg offset by 2 pushes)
    movl 4(%esi), %edx          # edx = ABA counter (high comparand)
    movl (%esi), %eax           # eax = head pointer (low comparand)

    np_ac_i686_stack_mpmc_pop_dwcas_retry:
    test %eax, %eax
    je np_ac_i686_stack_mpmc_pop_dwcas_fail   # empty stack: return 0
    movl (%eax), %ebx           # ebx = head->next (the hazardous read)
    leal 1(%edx), %ecx          # ecx = ABA counter + 1
    lock cmpxchg8b (%esi)       # if edx:eax == *anchor: store ecx:ebx; else reload edx:eax
    jne np_ac_i686_stack_mpmc_pop_dwcas_retry # retry with refreshed comparands

    np_ac_i686_stack_mpmc_pop_dwcas_fail:
    popl %ebx
    popl %esi
    ret                         # eax = popped node, or 0 when empty




    --- MBSE BBS v1.1.1 (Linux-x86_64)
    * Origin: A noiseless patient Spider (3:633/280.2@fidonet)
  • From jseigh@3:633/280.2 to All on Fri May 9 08:46:48 2025
    On 5/3/25 21:57, Chris M. Thomasson wrote:
    On 5/3/2025 12:58 PM, Chris M. Thomasson wrote:
    On 5/3/2025 11:19 AM, Bonita Montero wrote:
    Am 03.05.2025 um 20:02 schrieb Chris M. Thomasson:

    Now, its fun to be able to get a lock-free stack with only single
    word exchange and cas, and have the ability to use it with a futex
    to allow one to wait on empty conditions.

    Is a lock-free stack with a single word CAS really possible without an
    ABA-problem ?

    Will get back to you. Need to run a bunch of errands. If you omit
    single item pop, yup. It's also possible to get rid of the cas in push.

    The ABA problem _and_ memory issue is only there in on the single node
    pop for a lock-free stack. If you just whack it with a nullptr aka flush _all_ nodes, then you are all right and ready to roll.

    Actually, IBM had/has a nice write up of it in an appendix for the principles of operations iirc. I think it might of been in appendix 40
    ish or something. God it's been a while since I read that Bonita. Then
    they have a rather nifty PLO, or perform locked operation, again iirc...

    The futex is fun to use because it allows one to wait on an empty condition...

    Actually, I remember Joe Seigh taking all the nodes in an exchange, reversing them, and BAM, its FIFO. ;^)


    Probably for an MPSC queue. I don't think the FIFO semantics would hold up
    very well for MPMC queue.

    Joe Seigh

    --- MBSE BBS v1.1.1 (Linux-x86_64)
    * Origin: A noiseless patient Spider (3:633/280.2@fidonet)
  • From Chris M. Thomasson@3:633/280.2 to All on Sun May 11 06:48:10 2025
    On 5/8/2025 3:46 PM, jseigh wrote:
    On 5/3/25 21:57, Chris M. Thomasson wrote:
    On 5/3/2025 12:58 PM, Chris M. Thomasson wrote:
    On 5/3/2025 11:19 AM, Bonita Montero wrote:
    Am 03.05.2025 um 20:02 schrieb Chris M. Thomasson:

    Now, its fun to be able to get a lock-free stack with only single
    word exchange and cas, and have the ability to use it with a futex
    to allow one to wait on empty conditions.

    Is a lock-free stack with a single word CAS really possible without an >>>> ABA-problem ?

    Will get back to you. Need to run a bunch of errands. If you omit
    single item pop, yup. It's also possible to get rid of the cas in push.

    The ABA problem _and_ memory issue is only there in on the single node
    pop for a lock-free stack. If you just whack it with a nullptr aka
    flush _all_ nodes, then you are all right and ready to roll.

    Actually, IBM had/has a nice write up of it in an appendix for the
    principles of operations iirc. I think it might of been in appendix 40
    ish or something. God it's been a while since I read that Bonita. Then
    they have a rather nifty PLO, or perform locked operation, again iirc...

    The futex is fun to use because it allows one to wait on an empty
    condition...

    Actually, I remember Joe Seigh taking all the nodes in an exchange,
    reversing them, and BAM, its FIFO. ;^)


    Probably for a MPSC queue.˙ I don't the FIFO semantics would hold up
    very well for MPMC queue.

    :^)

    Well, even with a MPMP FIFO queue without the pop_all reverse results
    type approach, it can go:

    Thread A pushes say, 10 items:

    item[0]
    item[1]
    ....

    Thread B pops item[0], gets interrupted, thread C pops item[1] and
    processes it before thread B got a chance to process item[0]? That can
    happen even without the pop_all/flush "trick"... Or, am I missing
    something Joe?

    --- MBSE BBS v1.1.1 (Linux-x86_64)
    * Origin: A noiseless patient Spider (3:633/280.2@fidonet)
  • From Chris M. Thomasson@3:633/280.2 to All on Sun May 11 06:48:59 2025
    On 5/10/2025 1:48 PM, Chris M. Thomasson wrote:
    On 5/8/2025 3:46 PM, jseigh wrote:
    On 5/3/25 21:57, Chris M. Thomasson wrote:
    On 5/3/2025 12:58 PM, Chris M. Thomasson wrote:
    On 5/3/2025 11:19 AM, Bonita Montero wrote:
    Am 03.05.2025 um 20:02 schrieb Chris M. Thomasson:

    Now, its fun to be able to get a lock-free stack with only single >>>>>> word exchange and cas, and have the ability to use it with a futex >>>>>> to allow one to wait on empty conditions.

    Is a lock-free stack with a single word CAS really possible without an >>>>> ABA-problem ?

    Will get back to you. Need to run a bunch of errands. If you omit
    single item pop, yup. It's also possible to get rid of the cas in push. >>>
    The ABA problem _and_ memory issue is only there in on the single
    node pop for a lock-free stack. If you just whack it with a nullptr
    aka flush _all_ nodes, then you are all right and ready to roll.

    Actually, IBM had/has a nice write up of it in an appendix for the
    principles of operations iirc. I think it might of been in appendix
    40 ish or something. God it's been a while since I read that Bonita.
    Then they have a rather nifty PLO, or perform locked operation, again
    iirc...

    The futex is fun to use because it allows one to wait on an empty
    condition...

    Actually, I remember Joe Seigh taking all the nodes in an exchange,
    reversing them, and BAM, its FIFO. ;^)


    Probably for a MPSC queue.˙ I don't the FIFO semantics would hold up
    very well for MPMC queue.

    :^)

    Well, even with a MPMP FIFO queue without the pop_all reverse results

    God damn typos! I meant MPMC. Argh!!!!!! Sorry about that.




    type approach, it can go:

    Thread A pushes say, 10 items:

    item[0]
    item[1]
    ...

    Thread B pops item[0], gets interrupted, thread C pops item[1] and
    processes it before thread B got a chance to process item[0]? That can happen even without the pop_all/flush "trick"... Or, am I missing
    something Joe?


    --- MBSE BBS v1.1.1 (Linux-x86_64)
    * Origin: A noiseless patient Spider (3:633/280.2@fidonet)