Performance of multithreaded algorithm to find max number in array-CodePudding

I'm trying to learn about multithreaded algorithms so I've implemented a simple find max number function of an array. I've made a baseline program (findMax1.c) which loads from a file about 263 million int numbers into memory. Then I simply use a for loop to find the max number. Then I've made another program (findMax2.c) which uses 4 threads. I chose 4 threads because the CPU (intel i5 4460) I'm using has 4 cores and 1 thread per core. So my guess is that if I assign each core a chunk of the array to process it would be more efficient because that way I'll have fewer cache misses. Now, each thread finds the max number from each chunk, then I join all threads to finally find the max number from all those chunks. The baseline program findMax1.c takes about 660ms to complete the task, so my initial thought was that findMax2.c (which uses 4 threads) would take about 165ms (660ms / 4) to complete since now I have 4 threads running all in parallel to do the same task, but findMax2.c takes about 610ms. Only 50ms less than findMax1.c. What am I missing? is there something wrong with the implementation of the threaded program?

findMax1.c

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <time.h>

int main(void)
{
    int i, *array, max = 0, position;
    size_t array_size_in_bytes = 1024*1024*1024, elements_read, array_size;
    FILE *f;
    clock_t t;
    double time;

    array = (int*) malloc(array_size_in_bytes);
    assert(array != NULL); // assert if condition is falsa 

    printf("Loading array...");

    t = clock();
    f = fopen("numbers.bin", "rb");
    assert(f != NULL);

    elements_read = fread(array, array_size_in_bytes, 1, f);
    t = clock() - t;
    time = ((double) t) / CLOCKS_PER_SEC;
    assert(elements_read == 1);

    printf("done!\n");
    printf("File load time: %f [s]\n", time);

    fclose(f);

    array_size = array_size_in_bytes / sizeof(int);

    printf("Finding max...");

    t = clock();

    for(i = 0; i < array_size; i  )
        if(array[i] > max)
        {
            max = array[i];
            position = i;
        }

    t = clock() - t;
    time = ((double) t) / CLOCKS_PER_SEC;

    printf("done!\n");
    printf("----------- Program results -------------\nMax number: %d position %d\n", max, position);
    printf("Time %f [s]\n", time);

    return 0;
}

findMax2.c:

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <time.h>
#include <pthread.h>
#include <stdlib.h>
#include <unistd.h>
#include <sched.h>

#define NUM_THREADS 4

int max_chunk[NUM_THREADS], pos_chunk[NUM_THREADS];
int *array;
pthread_t tid[NUM_THREADS];

void *thread(void *arg)
{
    size_t array_size_in_bytes = 1024*1024*1024;
    int i, rc, offset, chunk_size, array_size, *core_id = (int*) arg, num_cores = sysconf(_SC_NPROCESSORS_ONLN);
    pthread_t id = pthread_self();
    cpu_set_t cpuset;

    if (*core_id < 0 || *core_id >= num_cores)
        return NULL;

    CPU_ZERO(&cpuset);
    CPU_SET(*core_id, &cpuset);

    rc = pthread_setaffinity_np(id, sizeof(cpu_set_t), &cpuset);
    if(rc != 0)
    {
        printf("pthread_setaffinity_np() failed! - rc %d\n", rc);
        return NULL;
    }

    printf("Thread running on CPU %d\n", sched_getcpu());
    
    array_size = (int) (array_size_in_bytes / sizeof(int));
    chunk_size = (int) (array_size / NUM_THREADS);
    offset = chunk_size * (*core_id);
    
    // Find max number in the array chunk
    for(i = offset; i < (offset   chunk_size); i  )
    {
        if(array[i] > max_chunk[*core_id])
        {
            max_chunk[*core_id] = array[i];
            pos_chunk[*core_id] = i;
        }
    }
    
    return NULL;        
}

void load_array(void)
{
    FILE *f;
    size_t array_size_in_bytes = 1024*1024*1024, elements_read;

    array = (int*) malloc(array_size_in_bytes);
    assert(array != NULL); // assert if condition is false

    printf("Loading array...");

    f = fopen("numbers.bin", "rb");
    assert(f != NULL);

    elements_read = fread(array, array_size_in_bytes, 1, f);
    assert(elements_read == 1);

    printf("done!\n");

    fclose(f);
}

int main(void)
{
    int i, max = 0, position, id[NUM_THREADS], rc;
    clock_t t;
    double time;

    load_array();

    printf("Finding max...");

    t = clock();

    // Create threads
    for(i = 0; i < NUM_THREADS; i  )
    {
        id[i] = i; // uso id para pasarle un puntero distinto a cada thread
        rc = pthread_create(&(tid[i]), NULL, &thread, (void*)(id   i));
        if (rc != 0)
            printf("Can't create thread! rc = %d\n", rc);
        else
            printf("Thread %lu created\n", tid[i]);
    }
    
    // Join threads
    for(i = 0; i < NUM_THREADS; i  )
        pthread_join(tid[i], NULL);

    // Find max number from all chunks
    for(i = 0; i < NUM_THREADS; i  )
        if(max_chunk[i] > max)
        {
            max = max_chunk[i];
            position = pos_chunk[i];
        }

    t = clock() - t;
    time = ((double) t) / CLOCKS_PER_SEC;

    printf("done!\n");
    free(array);

    printf("----------- Program results -------------\nMax number: %d position %d\n", max, position);
    printf("Time %f [s]\n", time);

    pthread_exit(NULL);

    return 0;
}

CodePudding user response：

First of all, you're measuring your time wrong. clock() measures process CPU time, i.e., time used by all threads. The real elapsed time will be fraction of that. clock_gettime(CLOCK_MONOTONIC,...) should yield better measurements.

Second, your core loops aren't at all comparable.

In the multithreaded program you're writing in each loop iteration to global variables that are very close to each other and that is horrible for cache contention. You could space that global memory apart (make each array item a cache-aligned struct (_Alignas(64))) and that'll help the time, but a better and fairer approach would be to use local variables (which should go into registers), copying the approach of the first loop, and then write out the chunk result to memory at the end of the loop:

int l_max_chunk=0, l_pos_chunk=0, *a;
for(i = 0,a=array offset; i < chunk_size; i  )
    if(a[i] > l_max_chunk) l_max_chunk=a[i], l_pos_chunk=i;
max_chunk[*core_id] = l_max_chunk;
pos_chunk[*core_id] = l_pos_chunk;

Here's your modified test program with expected speedups (I'm getting approx. a 2x speedup on my two-core processor). (I've also taken the liberty of replacing the file load with in-memory initialization, to make it simpler to test.)

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <time.h>
#include <pthread.h>
#include <stdlib.h>
#include <unistd.h>
#include <sched.h>

#include <stdint.h>
struct timespec ts0,ts1;
uint64_t sc_timespec_diff(struct timespec Ts1, struct timespec Ts0) { return (Ts1.tv_sec - Ts0.tv_sec)*1000000000 (Ts1.tv_nsec - Ts0.tv_nsec); }

#define NUM_THREADS 4

int max_chunk[NUM_THREADS], pos_chunk[NUM_THREADS];
int *array;
pthread_t tid[NUM_THREADS];

void *thread(void *arg)
{
    size_t array_size_in_bytes = 1024*1024*1024;
    int i, rc, offset, chunk_size, array_size, *core_id = (int*) arg, num_cores = sysconf(_SC_NPROCESSORS_ONLN);
    #if 1 //shouldn't make  much difference
    pthread_t id = pthread_self();
    cpu_set_t cpuset;

    if (*core_id < 0 || *core_id >= num_cores)
        return NULL;

    CPU_ZERO(&cpuset);
    CPU_SET(*core_id, &cpuset);

    rc = pthread_setaffinity_np(id, sizeof(cpu_set_t), &cpuset);
    if(rc != 0)
    {
        printf("pthread_setaffinity_np() failed! - rc %d\n", rc);
        return NULL;
    }

    printf("Thread running on CPU %d\n", sched_getcpu());
    #endif
    
    array_size = (int) (array_size_in_bytes / sizeof(int));
    chunk_size = (int) (array_size / NUM_THREADS);
    offset = chunk_size * (*core_id);
    
    // Find max number in the array chunk
    
    #if 0 //horrible for caches
    for(i = offset; i < (offset   chunk_size); i  )
    {
        if(array[i] > max_chunk[*core_id])
        {
            max_chunk[*core_id] = array[i];
            pos_chunk[*core_id] = i;
        }
    }
    #else
    int l_max_chunk=0, l_pos_chunk=0, *a;
    for(i = 0,a=array offset; i < chunk_size; i  )
        if(a[i] > l_max_chunk) l_max_chunk=a[i], l_pos_chunk=i;
    max_chunk[*core_id] = l_max_chunk;
    pos_chunk[*core_id] = l_pos_chunk;
    #endif
    
    return NULL;        
}

void load_array(void)
{
    FILE *f;
    size_t array_size_in_bytes = 1024*1024*1024, array_size=array_size_in_bytes/sizeof(int);

    array = (int*) malloc(array_size_in_bytes);
    if(array == NULL) abort(); // assert if condition is false
    for(size_t i=0; i<array_size; i  ) array[i]=i;

}


int main(void)
{
    int i, max = 0, position, id[NUM_THREADS], rc;
    clock_t t;
    double time;

    load_array();

    printf("Finding max...");

    t = clock();

    clock_gettime(CLOCK_MONOTONIC,&ts0);

    // Create threads
    for(i = 0; i < NUM_THREADS; i  )
    {
        id[i] = i; // uso id para pasarle un puntero distinto a cada thread
        rc = pthread_create(&(tid[i]), NULL, &thread, (void*)(id   i));
        if (rc != 0)
            printf("Can't create thread! rc = %d\n", rc);
        else
            printf("Thread %lu created\n", tid[i]);
    }
    
    // Join threads
    for(i = 0; i < NUM_THREADS; i  )
        pthread_join(tid[i], NULL);

    // Find max number from all chunks
    for(i = 0; i < NUM_THREADS; i  )
        if(max_chunk[i] > max)
        {
            max = max_chunk[i];
            position = pos_chunk[i];
        }

    clock_gettime(CLOCK_MONOTONIC,&ts1);
    printf("Time2 %.6LF\n", sc_timespec_diff(ts1,ts0)/1E9L);

    t = clock() - t;
    time = ((double) t) / CLOCKS_PER_SEC;

    printf("done!\n");
    free(array);

    printf("----------- Program results -------------\nMax number: %d position %d\n", max, position);
    printf("Time %f [s]\n", time);

    pthread_exit(NULL);

    return 0;
}

My timings:

0.188917 for the signle threaded version
2.511590 for the original multithreaded version (measured with clock_gettime(CLOCK_MONOTONIC,...)
0.099802 with the modified threaded version (measured with clock_gettime(CLOCK_MONOTONIC,...)

ran on a Linux machine with Intel(R) Core(TM) i7-2620M CPU @ 2.70GHz.