I am studying std::atomic_flag
in C 20 on a Windows 10 64bits OS with gcc 11.2.0 using MINGW64.
Compiler setting is as follows:
g .exe -Wall -fexceptions -g -std=gnu 2a -O3 -m64
I studied the ping pong example from Two Atomic Flags.
I ran the example code, but suffered a deadlock problem.
The example code is as follows:
// pingPongAtomicFlags.cpp
#include <iostream>
#include <atomic>
#include <thread>
std::atomic_flag condAtomicFlag1{};
std::atomic_flag condAtomicFlag2{};
std::atomic<int> counter{};
constexpr int countlimit = 1'000'000;
void ping() {
while(counter < countlimit) {
condAtomicFlag1.wait(false); // (1)
condAtomicFlag1.clear(); // (2)
counter;
condAtomicFlag2.test_and_set(); // (4)
condAtomicFlag2.notify_one(); // (3)
}
}
void pong() {
while(counter < countlimit) {
condAtomicFlag2.wait(false);
condAtomicFlag2.clear();
condAtomicFlag1.test_and_set();
condAtomicFlag1.notify_one();
}
}
int main() {
auto start = std::chrono::system_clock::now();
condAtomicFlag1.test_and_set(); // (5)
std::thread t1(ping);
std::thread t2(pong);
t1.join();
t2.join();
std::chrono::duration<double> dur = std::chrono::system_clock::now() - start;
std::cout << "Duration: " << dur.count() << " seconds" << std::endl;
}
This deadlock problem should be blocked by condAtomicFlag1.wait ( false )
and condAtomicFlag2.wait ( false )
.
In order to understand the reason of deadlock, I put atom_notifyAck_flag1
and atom_notifyAck_flag2
to monitor the behavior after notify_one()
, and use two threads thr_notifyAck_monitor_flag1
and thr_notifyAck_monitor_flag2
to monitor atom_notifyAck_flag1
and atom_notifyAck_flag2
seperately.
atom_notifyAck_flag1
in void pong()
will be set to false
after condAtomicFlag1.notify_one()
. In void ping()
, after condAtomicFlag1.wait(false)
received the notification, condAtomicFlag1.wait(false)
will release the block and than set atom_notifyAck_flag1
to true
. atom_notifyAck_flag2
has the similar behavior as atom_notifyAck_flag1
.
On my computer, atom_notifyAck_flag
should be altered from false
to true
in 0.0001ms. If atom_notifyAck_flag
is monitored as false
after sleep_for(10ms)
again, I treat the notify_one()
is failed and trigger a new notify_one()
to release the block by wait()
.
The example with thrf_notifyAck_monitor()
is as follows:
// pingPongAtomicFlags.cpp
#include <iostream>
#include <atomic>
#include <thread>
std::atomic_flag condAtomicFlag1{ATOMIC_FLAG_INIT};
std::atomic_flag condAtomicFlag2{ATOMIC_FLAG_INIT};
std::atomic<bool> atom_notifyAck_flag1 {false};
std::atomic<bool> atom_notifyAck_flag2 {false};
std::atomic<int> counter{};
constexpr int countlimit = 1'000'000;
using namespace std::literals;
void ping()
{
while ( counter < countlimit )
//Thanks to Nate Eldredge for the reminder of edge condition of race.
{
condAtomicFlag1.wait ( false ); // (1)
condAtomicFlag1.clear(); // (2)
atom_notifyAck_flag1 = true;
counter;
condAtomicFlag2.test_and_set(); // (4)
condAtomicFlag2.notify_one(); // (3)
atom_notifyAck_flag2 = false;
}
}
void pong()
{
while ( counter < countlimit )
{
condAtomicFlag2.wait ( false );
condAtomicFlag2.clear();
atom_notifyAck_flag2 = true;
condAtomicFlag1.test_and_set();
condAtomicFlag1.notify_one();
atom_notifyAck_flag1 = false;
}
}
void thrf_notifyAck_monitor ( const std::atomic<bool>& last_notifyAck_check
, std::atomic<bool>& atom_notifyAck_Flag
, int flagId )
{
int atom_notifyAck_Flag_failCounter {};
while ( !last_notifyAck_check )
{
if ( !atom_notifyAck_Flag ) //atom_notifyAck_Flag is false
{
std::this_thread::sleep_for ( 10ms ); //wait 10ms and check again
if ( !atom_notifyAck_Flag )
//atom_notifyAck_Flag is false for 10ms
{
std::cout << "atom_notifyAck_Flag_"
<< flagId
<< " failed [" << atom_notifyAck_Flag_failCounter
<< "] times "
<< "at counter "
<< counter << "\n";
atom_notifyAck_Flag_failCounter;
condAtomicFlag1.notify_all();
}
}
}
}
int main()
{
std::cout << "start ping pong ...\n";
auto start = std::chrono::system_clock::now();
condAtomicFlag1.test_and_set(); // (5)
std::thread t1 ( ping );
std::thread t2 ( pong );
std::atomic<bool> last_notifyAck_check {false};
//Thanks to Ted Lyngmo for reminder
std::thread thr_notifyAck_monitor_flag1 ( thrf_notifyAck_monitor
, std::ref ( last_notifyAck_check )
, std::ref ( atom_notifyAck_flag1 ), 1 );
std::thread thr_notifyAck_monitor_flag2 ( thrf_notifyAck_monitor
, std::ref ( last_notifyAck_check )
, std::ref ( atom_notifyAck_flag2 ), 2 );
t1.join();
t2.join();
last_notifyAck_check = true;
thr_notifyAck_monitor_flag1.join();
thr_notifyAck_monitor_flag2.join();
std::cout << "\n";
std::cout << "counter " << counter << "\n";
std::chrono::duration<double> dur = std::chrono::system_clock::now() - start;
std::cout << "Duration: " << dur.count() << " seconds" << std::endl << "\n";
}
The monitor thread does work.
The result messages are as follows:
start ping pong ...
atom_notifyAck_Flag_2 failed [0] times at counter 284
atom_notifyAck_Flag_1 failed [0] times at counter 713
atom_notifyAck_Flag_2 failed [1] times at counter 1145
atom_notifyAck_Flag_1 failed [1] times at counter 4128
atom_notifyAck_Flag_1 failed [2] times at counter 5519
atom_notifyAck_Flag_1 failed [3] times at counter 28465
atom_notifyAck_Flag_1 failed [4] times at counter 28812
atom_notifyAck_Flag_2 failed [2] times at counter 35854
atom_notifyAck_Flag_2 failed [3] times at counter 54880
atom_notifyAck_Flag_1 failed [5] times at counter 55227
atom_notifyAck_Flag_2 failed [4] times at counter 65113
atom_notifyAck_Flag_1 failed [6] times at counter 65519
atom_notifyAck_Flag_1 failed [7] times at counter 78369
atom_notifyAck_Flag_1 failed [8] times at counter 89895
atom_notifyAck_Flag_2 failed [5] times at counter 90408
atom_notifyAck_Flag_1 failed [9] times at counter 90872
atom_notifyAck_Flag_1 failed [10] times at counter 115252
atom_notifyAck_Flag_1 failed [11] times at counter 121390
atom_notifyAck_Flag_2 failed [6] times at counter 149745
atom_notifyAck_Flag_2 failed [7] times at counter 275242
atom_notifyAck_Flag_2 failed [8] times at counter 276173
atom_notifyAck_Flag_1 failed [12] times at counter 304177
atom_notifyAck_Flag_2 failed [9] times at counter 417666
atom_notifyAck_Flag_2 failed [10] times at counter 421162
atom_notifyAck_Flag_2 failed [11] times at counter 455784
atom_notifyAck_Flag_2 failed [12] times at counter 466523
atom_notifyAck_Flag_2 failed [13] times at counter 470350
atom_notifyAck_Flag_1 failed [13] times at counter 485684
atom_notifyAck_Flag_2 failed [14] times at counter 486125
atom_notifyAck_Flag_1 failed [14] times at counter 486566
atom_notifyAck_Flag_2 failed [15] times at counter 492096
atom_notifyAck_Flag_2 failed [16] times at counter 515809
atom_notifyAck_Flag_1 failed [15] times at counter 516158
atom_notifyAck_Flag_1 failed [16] times at counter 516509
atom_notifyAck_Flag_1 failed [17] times at counter 516927
atom_notifyAck_Flag_2 failed [17] times at counter 517392
atom_notifyAck_Flag_2 failed [18] times at counter 517840
atom_notifyAck_Flag_2 failed [19] times at counter 518319
atom_notifyAck_Flag_2 failed [20] times at counter 518810
atom_notifyAck_Flag_2 failed [21] times at counter 519290
atom_notifyAck_Flag_2 failed [22] times at counter 541904
atom_notifyAck_Flag_1 failed [18] times at counter 542383
atom_notifyAck_Flag_2 failed [23] times at counter 543410
atom_notifyAck_Flag_1 failed [19] times at counter 544379
atom_notifyAck_Flag_2 failed [24] times at counter 544876
atom_notifyAck_Flag_2 failed [25] times at counter 545324
atom_notifyAck_Flag_2 failed [26] times at counter 566521
atom_notifyAck_Flag_2 failed [27] times at counter 588983
atom_notifyAck_Flag_2 failed [28] times at counter 589465
atom_notifyAck_Flag_1 failed [20] times at counter 592211
atom_notifyAck_Flag_1 failed [21] times at counter 611563
atom_notifyAck_Flag_2 failed [29] times at counter 614010
atom_notifyAck_Flag_1 failed [22] times at counter 619736
atom_notifyAck_Flag_2 failed [30] times at counter 642781
atom_notifyAck_Flag_1 failed [23] times at counter 651436
atom_notifyAck_Flag_1 failed [24] times at counter 772356
atom_notifyAck_Flag_1 failed [25] times at counter 772738
atom_notifyAck_Flag_2 failed [31] times at counter 773207
atom_notifyAck_Flag_1 failed [26] times at counter 773691
atom_notifyAck_Flag_1 failed [27] times at counter 774162
atom_notifyAck_Flag_1 failed [28] times at counter 774575
atom_notifyAck_Flag_2 failed [32] times at counter 864664
atom_notifyAck_Flag_2 failed [33] times at counter 986915
atom_notifyAck_Flag_1 failed [29] times at counter 1000000
counter 1000000
Duration: 1.09015 seconds
Process returned 0 (0x0) execution time : 1.126 s
Under this test, it shows that the wait()
didn't always receive the notify_one
correctly, so it kept waiting and was blocked. However, std::atomic_flag::notify_one()
provided atomic behavior, so it should not happen collision in the system.
Is notify_one()
easily fail a common sense to use this kind of non-blocking function? What can I do to avoid this kind of deadlock?
CodePudding user response:
There is a race at the end of the program that can cause a deadlock.
Suppose we get the following sequencing, starting at counter == 999999
:
ping() pong()
====== ======
condAtomicFlag1.wait(false); // waiting
condAtomicFlag1.test_and_set();
condAtomicFlag1.notify_one();
// wakes up
condAtomicFlag1.clear();
counter; // counter = 1000000
while(counter < countlimit) // false
// pong() returns
condAtomicFlag2.test_and_set();
condAtomicFlag2.notify_one(); // nobody listening
while(counter <= countlimit) { // true
condAtomicFlag1.wait(false); // never becomes true, deadlock
I haven't thought carefully about the correct fix, but perhaps you can.
In my tests, that's the only actual deadlock. All the other notifications from the monitor appear to be spurious, likely because the system scheduled out a thread for more than 10 ms. If I change the timeout to 1 second, all the notifications go away except for "at counter 1000000".
CodePudding user response:
Based on an assumption, the atomFlag_pong.notify_one()
might be dropped in my system because of some reason, so atomFlag_pong.wait(false)
can only wait forever.
In order to confirm the atomFlag_pong.notify_one()
arrived to atomFlag_pong.wait(false)
successfully, I check the atomFlag_pong.test()
as an acknowledgement in an additional while loop, and continuously send atomFlag_pong.notify_one()
till atomFlag_pong.test()==false
which means atomFlag_pong.wait(false)
is passed successfully. In the second while loop, && counter < countLimit
condition have to be checked again, because counter
can be altered by thread thr_ping
in the second while loop.
In another way, I confirm atomFlag_pong.notify_one()
arrived successfully by using the similar acknowledgement trick.
After testing, it does work without deadlock finally.
My example code is as follows:
// pingPongAtomicFlags.cpp
#include <iostream>
#include <atomic>
#include <thread>
std::atomic_flag atomFlag_ping {};
std::atomic_flag atomFlag_pong {};
std::atomic<int> counter{};
constexpr int countLimit = 1'000'000;
using namespace std::literals;
void thrf_ping()
{
while ( counter < countLimit )
{
atomFlag_ping.wait ( false );
atomFlag_ping.clear();
//set atomFlag_ping as false which means atomFlag_ping.wait(false) is passed
counter;
atomFlag_pong.test_and_set(); //set atomFlag_pong as true
// use && counter < countLimit condition again is necessary,
// because counter might be changed in the previous lines.
while ( atomFlag_pong.test() == true && counter < countLimit)
{
// Send atomFlag_pong.notify_one() continuously till [atomFlag_pong.test() == false]
// to avoid the atomFlag_pong.notify_one() is dropped by system
// [atomFlag_pong.test() == false] means that atomFlag_pong.wait(false) is passed,
// so progress of pong can move on
atomFlag_pong.notify_one();
}
}
}
void thrf_pong()
{
while ( counter < countLimit )
{
atomFlag_pong.wait ( false );
atomFlag_pong.clear();
//set atomFlag_pong as false which means atomFlag_pong.wait(false) is passed
atomFlag_ping.test_and_set();
// use && counter < countLimit condition again is necessary,
// because counter might be changed in the previous lines.
while ( atomFlag_ping.test() == true && counter < countLimit)
{
// Send atomFlag_ping.notify_one() continuously till [atomFlag_ping.test() == false]
// to avoid the atomFlag_ping.notify_one() is dropped by system
// [atomFlag_ping.test() == false] means that atomFlag_ping.wait(false) is passed,
// so progress of ping can move on
atomFlag_ping.notify_one();
}
}
}
int main()
{
std::cout << "start ping pong ...\n";
auto start = std::chrono::system_clock::now();
atomFlag_ping.test_and_set(); // drop the first ball, set atomFlag_ping as true
std::thread thr_ping ( thrf_ping );
std::thread thr_pong ( thrf_pong );
thr_ping.join();
thr_pong.join();
std::cout << "\n";
std::cout << "counter " << counter << "\n";
std::chrono::duration<double> dur = std::chrono::system_clock::now() - start;
std::cout << "Duration: " << dur.count() << " seconds" << std::endl << "\n";
}