I am basically writing code to count if a pair sum is even(among all pairs from 1 to 100000). I wrote a code using pthreads and without pthreads. But the code with pthreads is taking more time than the serial one. Here is my serial code
#include<bits/stdc .h>
using namespace std;
int main()
{
long long sum = 0, count = 0, n = 100000;
auto start = chrono::high_resolution_clock::now();
for(int i = 1; i <= n; i )
for(int j = i-1; j >= 0; j--)
{
sum = i j;
if(sum%2 == 0)
count ;
}
cout<<"count is "<<count<<endl;
auto end = chrono::high_resolution_clock::now();
double time_taken = chrono::duration_cast<chrono::nanoseconds>(end - start).count();
time_taken *= 1e-9;
cout << "Time taken by program is : " << fixed << time_taken << setprecision(9)<<" secs"<<endl;
return 0;
}
and here is my parallel code
#include<bits/stdc .h>
using namespace std;
#define MAX_THREAD 3
long long cnt[5] = {0};
long long n = 100000;
int work_per_thread;
int start[] = {1, 60001, 83001, 100001};
void *count_array(void* arg)
{
int t = *((int*)arg);
long long sum = 0;
for(int i = start[t]; i < start[t 1]; i )
for(int j = i-1; j >=0; j--)
{
sum = i j;
if(sum%2 == 0)
cnt[t] ;
}
cout<<"thread"<<t<<" finished work "<<cnt[t]<<endl;
return NULL;
}
int main()
{
pthread_t threads[MAX_THREAD];
int arr[] = {0,1,2};
long long total_count = 0;
work_per_thread = n/MAX_THREAD;
auto start = chrono::high_resolution_clock::now();
for(int i = 0; i < MAX_THREAD; i )
pthread_create(&threads[i], NULL, count_array, &arr[i]);
for(int i = 0; i < MAX_THREAD; i )
pthread_join(threads[i], NULL);
for(int i = 0; i < MAX_THREAD; i )
total_count = cnt[i];
cout << "count is " << total_count << endl;
auto end = chrono::high_resolution_clock::now();
double time_taken = chrono::duration_cast<chrono::nanoseconds>(end - start).count();
time_taken *= 1e-9;
cout << "Time taken by program is : " << fixed << time_taken << setprecision(9)<<" secs"<<endl;
return 0;
}
In the parallel code I am creating three threads and 1st thread will be doing its computation from 1 to 60000, 2nd thread from 60001 to 83000 and so on. I have chosen these numbers so that each thread gets to do approximately similar number of computations. The parallel execution takes 10.3 secs whereas serial one takes 7.7 secs. I have 6 cores and 2 threads per core. I also used htop command to check if the required number of threads are running or not and it seems to be working fine. I don't understand where the problem is.
CodePudding user response:
The all cores in the threaded version compete for cnt[].
Use a local counter inside the loop and copy the result into cnt[t] after the loop is ready.