I know that many people have already asked about this topic, but I really do not understand why my multithreaded program is not faster, even though I do not include standard input and output in the timing.
This is my single thread program.
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
/* Side length of the square table: main() allocates LEN rows of LEN ints. */
#define LEN 30000
/* Row-pointer table for the LEN x LEN grid; allocated and filled in main(). */
int **arr;
/*
 * Single-threaded baseline: allocates a LEN x LEN table of ints (one malloc
 * per row), stores 1 into every cell, and prints the clock() ticks spent in
 * the fill loop only (allocation happens outside the timed region).
 *
 * Returns 0 on success, 1 if any allocation fails.
 */
int main(void)
{
    arr = malloc(sizeof *arr * LEN);
    if (arr == NULL) {
        perror("malloc");
        return 1;
    }
    for (int i = 0; i < LEN; i++) {
        arr[i] = malloc(sizeof **arr * LEN);
        if (arr[i] == NULL) {
            perror("malloc");
            return 1;
        }
    }

    clock_t st = clock();
    for (int i = 0; i < LEN; i++)
        for (int j = 0; j < LEN; j++)
            arr[i][j] = 1;
    clock_t ed = clock();

    /* clock_t is not guaranteed to be long, so convert before using %ld. */
    printf("time : %ld\n", (long)(ed - st));
    return 0;
}
And this is my multi thread program
#include <stdio.h>
#include <pthread.h>
#include <stdlib.h>
#include <time.h>
/* Side length of the square table: main() allocates LEN rows of LEN ints. */
#define LEN 30000
/* Row-pointer table for the LEN x LEN grid; allocated in main(), written by the worker threads. */
int **arr;
/*
 * Thread entry point: stores 1 into every cell of a band of rows of the
 * global `arr`.
 *
 * thread_argv must point to at least two ints:
 *   [0] index of the first row to fill
 *   [1] number of consecutive rows to fill
 *
 * Always returns NULL.
 */
void *thread_excute(void *thread_argv)
{
    const int *args = thread_argv;
    int start = args[0];
    int height = args[1];

    for (int i = 0; i < height; i++)
        for (int j = 0; j < LEN; j++)
            arr[start + i][j] = 1;
    return NULL;
}
/*
 * Multi-threaded variant: allocates the LEN x LEN table, then splits the
 * rows into four equal bands and fills each band in its own thread via
 * thread_excute(). Prints the clock() ticks spent between thread setup and
 * the last join.
 *
 * NOTE: clock() reports processor time used by the program, not elapsed
 * time. On Linux that is CPU time accumulated across all threads, so the
 * printed value is not expected to drop as threads are added; use a
 * wall-clock source to compare elapsed time.
 *
 * Returns 0 on success, 1 if an allocation or thread creation fails.
 */
int main(void)
{
    enum { NUM_THREADS = 4 };

    arr = malloc(sizeof *arr * LEN);
    if (arr == NULL) {
        perror("malloc");
        return 1;
    }
    for (int i = 0; i < LEN; i++) {
        arr[i] = malloc(sizeof **arr * LEN);
        if (arr[i] == NULL) {
            perror("malloc");
            return 1;
        }
    }

    clock_t st = clock();

    /* Size by the element type: pthread_t is generally wider than int. */
    pthread_t *thread_num = malloc(sizeof *thread_num * NUM_THREADS);
    int **argv = malloc(sizeof *argv * NUM_THREADS);
    if (thread_num == NULL || argv == NULL) {
        perror("malloc");
        return 1;
    }

    for (int t = 0; t < NUM_THREADS; t++) {
        argv[t] = malloc(sizeof *argv[t] * 2);
        if (argv[t] == NULL) {
            perror("malloc");
            return 1;
        }
        argv[t][0] = t * (LEN / NUM_THREADS);   /* first row of this band */
        argv[t][1] = LEN / NUM_THREADS;         /* rows in this band */

        /* pthread_create returns an error number instead of setting errno. */
        int rc = pthread_create(&thread_num[t], NULL, thread_excute, argv[t]);
        if (rc != 0) {
            fprintf(stderr, "pthread_create failed: %d\n", rc);
            return 1;
        }
    }
    for (int t = 0; t < NUM_THREADS; t++)
        pthread_join(thread_num[t], NULL);

    clock_t ed = clock();

    /* clock_t is not guaranteed to be long, so convert before using %ld. */
    printf("time : %ld\n", (long)(ed - st));

    for (int t = 0; t < NUM_THREADS; t++)
        free(argv[t]);
    free(argv);
    free(thread_num);
    return 0;
}
This program fills a 30000 * 30000 int array with 1. I wrote a multithreaded version in which each thread fills a different set of rows of the array, so I expected the multithreaded program to be faster than the single-threaded one. But this is the output:
// single thread program
time : 3782958
// multi thread program
time : 3997991
I ran this program on Ubuntu 20.04, and I have 4 CPU cores.
I compiled the code with gcc file.c -o file -lpthread
I do not know why this happens.
CodePudding user response:
Here is the tweaked version of your multi-thread program:
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
/* Side length of the square table: LEN rows of LEN ints. */
#define LEN 30000
/* Row-pointer table for the LEN x LEN grid; set up in main(), written by the worker threads. */
int **arr;
/*
 * Thread entry point: stores 1 into every cell of rows
 * [start, start + height) of the global `arr`.
 *
 * thread_argv must point to three ints:
 *   [0] index of the first row to fill
 *   [1] number of consecutive rows to fill
 *   [2] thread number (used only by the optional per-thread timing below)
 *
 * Always returns NULL.
 */
void *thread_excute(void *thread_argv) {
    const int *args = thread_argv;
    // clock_t st, ed;
    // st = clock();
    int start = args[0];
    int height = args[1];
    int thread = args[2];
    (void)thread; /* referenced only by the commented-out diagnostic printf */

    for (int i = start; i < start + height; i++)
        for (int j = 0; j < LEN; j++)
            arr[i][j] = 1;
    // ed = clock();
    // printf("%d %ld %ld %ld\n", thread, st, ed, ed-st);
    return NULL;
}
/*
 * Fills a LEN x LEN int table with 1 using a configurable number of
 * threads and prints the clock() ticks spent on thread setup, the fill,
 * and the joins.
 *
 * Usage: prog [threads]   (threads defaults to 4; must be in 1..LEN)
 *
 * The table is one contiguous zeroed allocation; arr[i] points at row i
 * inside it. Rows are divided evenly between the threads, and the last
 * thread also takes the remainder so every row is filled even when LEN is
 * not a multiple of the thread count.
 *
 * Returns 0 on success, 1 on a bad argument, allocation failure, or
 * thread-creation failure.
 */
int main(int argc, char *argv[]) {
    unsigned threads = 4;
    if (argc == 2) {
        /* strtol instead of atoi: rejects junk, zero, and negative counts. */
        char *end;
        long requested = strtol(argv[1], &end, 10);
        if (end == argv[1] || *end != '\0' || requested < 1 || requested > LEN) {
            fprintf(stderr, "usage: %s [threads (1..%d)]\n", argv[0], LEN);
            return 1;
        }
        threads = (unsigned)requested;
    }

    arr = calloc(LEN, sizeof *arr);
    if (arr == NULL) {
        perror("calloc");
        return 1;
    }
    arr[0] = calloc((size_t)LEN * LEN, sizeof **arr);
    if (arr[0] == NULL) {
        perror("calloc");
        return 1;
    }
    for (int i = 1; i < LEN; i++)
        arr[i] = arr[0] + (size_t)i * LEN;

    /* Size by the element type: pthread_t is generally wider than int. */
    pthread_t *thread_num = malloc(sizeof *thread_num * threads);
    if (thread_num == NULL) {
        perror("malloc");
        return 1;
    }

    clock_t st = clock();

    int **argv2 = malloc(sizeof *argv2 * threads);
    if (argv2 == NULL) {
        perror("malloc");
        return 1;
    }
    const int rows_per_thread = LEN / (int)threads;
    for (unsigned t = 0; t < threads; t++) {
        argv2[t] = malloc(sizeof *argv2[t] * 3);
        if (argv2[t] == NULL) {
            perror("malloc");
            return 1;
        }
        int first_row = (int)t * rows_per_thread;
        argv2[t][0] = first_row;
        /* The last thread picks up the rows left over by the division. */
        argv2[t][1] = (t == threads - 1) ? LEN - first_row : rows_per_thread;
        argv2[t][2] = (int)t;

        /* pthread_create returns an error number instead of setting errno. */
        int rc = pthread_create(&thread_num[t], NULL, thread_excute, argv2[t]);
        if (rc != 0) {
            fprintf(stderr, "pthread_create failed: %d\n", rc);
            return 1;
        }
    }
    for (unsigned t = 0; t < threads; t++)
        pthread_join(thread_num[t], NULL);

    clock_t ed = clock();

    /* clock_t is not guaranteed to be long, so convert before using %ld. */
    printf("%ld\n", (long)(ed - st));

    for (unsigned t = 0; t < threads; t++)
        free(argv2[t]);
    free(argv2);
    free(thread_num);
    free(arr[0]);
    free(arr);
    return 0;
}
If you build this with -O3
and benchmark with (bash) time, I can no longer reliably get a lower multi-threaded run compared to the single-threaded version (4 cores, 8 hyper-threads):
threads | clock | time (real) | time (sys) |
---|---|---|---|
single | 352917 | 0m0.361s | 0m0.089s |
multi 1 | 365564 | 0m0.373s | 0m0.273s |
multi 2 | 524756 | 0m0.274s | 0m0.430s |
multi 4 | 774477 | 0m0.215s | 0m0.589s |
multi 8 | 1711224 | 0m0.331s | 0m1.136s |
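My conclusion is that clock()
is not a good metric to measure "slower": it reports the CPU time used by the whole process, summed over all threads, rather than elapsed time. Your program is getting faster up to 4 threads (see the real column). Another takeaway is that the operating system does a lot more work to accomplish the same thing as the number of threads goes up (see the sys column).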