To compare the performance of `std::atomic<int>` against a `std::mutex`-protected `int`, I have this test program:
#include <iostream>
#include <atomic>
#include <mutex>
#include <thread>
#include <chrono>
#include <limits>
// File-scope configuration and shared state for the atomic-vs-mutex benchmark.
using namespace std;
// Fallback definitions used only when no INT_MAX macro is already visible
// from the headers included above.
#ifndef INT_MAX
const int INT_MAX = numeric_limits<std::int32_t>::max();
const int INT_MIN = numeric_limits<std::int32_t>::min();
#endif
using std::chrono::steady_clock;
// Upper bound of the loop inside each worker function.
const size_t LOOP_COUNT = 12500000;
// Number of threads main() launches for each benchmark variant.
const size_t THREAD_COUNT = 8;
// Plain counters modified by mutex_tf() while holding the mutex `m`.
int intArray[2] = { 0, INT_MAX };
// Atomic counters modified by atomic_tf(); main() assigns their starting values.
atomic<int> atomicArray[2];
// Thread body for the atomic variant of the benchmark (the author measured
// about 3.19 s for the whole run).
//
// Performs LOOP_COUNT iterations; each iteration increments atomicArray[0] and
// decrements atomicArray[1]. Every one of those operations is an individual
// atomic read-modify-write, so the synchronization cost is paid per operation.
void atomic_tf() {//3.19s
    for (size_t i = 0; i < LOOP_COUNT; ++i) {
        atomicArray[0]++;
        atomicArray[1]--;
    }
}
// Mutex guarding intArray in mutex_tf().
mutex m;

// Thread body for the mutex variant of the benchmark (the author measured
// about 0.25 s for the whole run).
//
// Acquires `m` exactly once, then performs all LOOP_COUNT iterations while
// holding it: each iteration increments intArray[0] and decrements intArray[1].
// Because the lock is taken once per thread rather than once per operation,
// the loop body itself runs without any per-operation synchronization.
void mutex_tf() {//0.25s
    // RAII guard: released automatically when the function returns.
    std::lock_guard<std::mutex> guard(m);
    for (size_t i = 0; i < LOOP_COUNT; ++i) {
        intArray[0]++;
        intArray[1]--;
    }
}
int main() {
{
atomicArray[0] = 0;
atomicArray[1] = INT_MAX;
thread tp[THREAD_COUNT];
steady_clock::time_point t1 = steady_clock::now();
for (size_t t = 0; t < THREAD_COUNT; t) {
tp[t] = thread(atomic_tf);
}
for (size_t t = 0; t < THREAD_COUNT; t) {
tp[t].join();
}
steady_clock::time_point t2 = steady_clock::now();
cout << (float)((t2 - t1).count()) / 1000000000 << endl;
}
{
thread tp[THREAD_COUNT];
steady_clock::time_point t1 = steady_clock::now();
for (size_t t = 0; t < THREAD_COUNT; t) {
tp[t] = thread(mutex_tf);
}
for (size_t t = 0; t < THREAD_COUNT; t) {
tp[t].join();
}
steady_clock::time_point t2 = steady_clock::now();
cout << (float)((t2 - t1).count()) / 1000000000 << endl;
}
return 0;
}
I ran this program on Windows and Linux many times (compiled with clang 14 and g++ 12), with basically the same result.
`atomic_tf` takes about 3 seconds, while `mutex_tf` takes about 0.25 seconds.
That is roughly a 10x performance difference.
My question is: if my test program is valid, does it indicate that using an atomic variable is much more expensive than using a mutex with normal variables?
Where does this performance difference come from? Thanks!
CodePudding user response:
Your test does not really compare the performance of a mutex vs. an atomic:
Your mutex version locks the mutex once, then does 12500000 iterations without paying any additional cost for the thread synchronization mechanism.
In your atomic version you pay the cost of the atomic synchronization for every increment and every decrement of the atomic value (each happens 12500000 times).
In order to compare the two, you need to lock and unlock the mutex for every increment or decrement of the value.
Something like:
// Mutex variant that is comparable to the atomic version: the mutex `m` is
// locked and unlocked around every single increment and every single
// decrement, so the synchronization cost is paid once per operation
// (2 * LOOP_COUNT lock/unlock pairs per thread) instead of once per thread.
void mutex_tf()
{
    for (size_t i = 0; i < LOOP_COUNT; ++i)
    {
        m.lock();
        intArray[0]++;
        m.unlock();
        m.lock();
        intArray[1]--;
        m.unlock();
    }
}