Home > Net >  Why my multi-thread program runs slower than single thread?
Why my multi-thread program runs slower than single thread?

Time:05-20

I'm new to threads,my teacher asked for a 9 nine threads program that makes 3 arrays A B C with 10^6 elements each-all equal to 1. So we must use 9 threads to fill (with the number 1) those arrays faster. I came with a plan to use 3 threads to fill each array. I cut the arrays into 3 parts ... i=0 to i=333332... i=333333 to i=666662... i=666666 to i=999999. But when I run the program with nine threads runs slower than the single threaded program ( which contains a for loop from 0 to 10^6 and fills the 3 arrays with the number 1).

this is my 9 threads code:

#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <unistd.h>
#include <pthread.h>
#include <sys/sysinfo.h>
#include <time.h>

double A[1000000];
double B[1000000];
double C[1000000];
double D[1000000];

  void * fillA1(void * tid){

  for (int i=0;i<333333;i  ){

    A[i]=1.0;

  }

    return NULL;
  }
void * fillA2(void * tid){

  for (int i=333333;i<666666;i  ){

    A[i]=1.0;

    
  }

    return NULL;
  }

void * fillA3(void * tid){

  for (int i=666666;i<1000000;i  ){

    A[i]=1.0;

    
  }

    return NULL;
  }


void * fillB1(void*tid){

  for (int i=0;i<333333;i  ){
    B[i]=1.0;
    
  }
return NULL;
  }

void * fillB2(void*tid){

  for (int i=333333;i<666666;i  ){
    B[i]=1.0;
    
  }
return NULL;
  }

void * fillB3 (void*tid){

  for (int i=666666;i<1000000;i  ){
    B[i]=1.0;
    
  }
return NULL;
  }


void * fillC1(void*tid){

  for (int i=0;i<333333;i  ){
    C[i]=1.0;
    
  }
  return NULL;
  
  }

void * fillC2(void*tid){

  for (int i=333333;i<666666;i  ){
    C[i]=1.0;
    
  }
  return NULL;
  
  }

void * fillC3(void*tid){

  for (int i=666666;i<1000000;i  ){
    C[i]=1.0;
    
  }
  return NULL;
  
  }


int main (void){
   double time_spent = 0.0;
 
    clock_t begin = clock();
 

// declare 9 thread type variables;
pthread_t tid0;// pthread_t is a data type used to uniquely identify a thread.
pthread_t tid1;
pthread_t tid2;
pthread_t tid3;
pthread_t tid4;
pthread_t tid5;
pthread_t tid6;
pthread_t tid7;
pthread_t tid8;

//create an array with 9 threads address
pthread_t * pthreads[] ={&tid0,&tid1,&tid2,&tid3,&tid4,&tid5,&tid6,&tid7,&tid8};



  // use 9 threads to fill the 3 arrays simultaneously
  pthread_create(pthreads[0],NULL,fillA1,NULL);
  pthread_create(pthreads[1],NULL,fillA2,NULL);
  pthread_create(pthreads[2],NULL,fillA3,NULL);
  pthread_create(pthreads[3],NULL,fillB1, NULL);
  pthread_create(pthreads[4],NULL,fillB2, NULL);
  pthread_create(pthreads[5],NULL,fillB3, NULL);
  pthread_create(pthreads[6],NULL,fillC1,NULL);
  pthread_create(pthreads[7],NULL,fillC2,NULL);
  pthread_create(pthreads[8],NULL,fillC3,NULL);

  
  for(int i=0;i<9;i  ){
    pthread_join(*pthreads[i],NULL);
  }
 

 

 
 
clock_t end = clock();
 
    // calculate elapsed time by finding difference (end - begin) and
    // dividing the difference by CLOCKS_PER_SEC to convert to seconds
    time_spent  = (double)(end - begin) / CLOCKS_PER_SEC;
 
    printf("The elapsed time is %f seconds", time_spent);

  return 0;
}

this is my single threaded code:

#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <unistd.h>
#include <pthread.h>
#include<time.h>

double A[1000000];
double B[1000000];
double C[1000000];
double D[1000000];

int main (void){
  //to store the execution time of code
    double time_spent = 0.0;
 
    clock_t begin = clock();
 

 
for (int i=0;i<1000000;i  ){
    A[i]=1;
    B[i]=1;
    C[i]=1;
 
 
  }
 
clock_t end = clock();
 
    // calculate elapsed time by finding difference (end - begin) and
    // dividing the difference by CLOCKS_PER_SEC to convert to seconds
    time_spent  = (double)(end - begin) / CLOCKS_PER_SEC;
 
    printf("The elapsed time is %f seconds\n", time_spent);
  return 0;
}



CodePudding user response:

Having a separate thread function for each segment in each array, makes the program less flexible. The usual way is to have a struct that describes the work for each thread (e.g. pointer to array and start/end indexes).

Also, clock_gettime is probably better than clock for timing. And, do not put printf in anything that is being timed.

Note that using 3 arrays (e.g. A/B/C) may not be any different than a single array that is 3 times the size (but I left that in).

Here's a refactored version that will let you experiment more:

#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <unistd.h>
#include <pthread.h>
#include <sys/sysinfo.h>
#include <time.h>

#define DATA_SIZE   1000000             // number of elements in array
#define NPER        3                   // number of threads per array
#define NARRAY      3                   // number of arrays
#define CHUNK_SIZE  (DATA_SIZE / NPER)  // amount of data for each thread

#define NTHREAD     (NARRAY * NPER)     // number of threads

double A[DATA_SIZE];
double B[DATA_SIZE];
double C[DATA_SIZE];
double D[DATA_SIZE];

struct work {
    pthread_t tid;
    double *data;
    int start;
    int count;
};

double
tscgetf(void)
{
    struct timespec ts;
    double sec;

    clock_gettime(CLOCK_MONOTONIC,&ts);
    sec = ts.tv_nsec;
    sec /= 1e9;
    sec  = ts.tv_sec;

    return sec;
}

void *
fill(void *tid)
{
    struct work *work = tid;
    double *data = &work->data[work->start];
    double *edata = &data[work->count];

    for (;  data < edata;    data)
        *data = 1.0;

    return NULL;
}

int
main(void)
{
    double time_spent = 0.0;

    struct work buflist[NARRAY] = {
        { .data = A },
        { .data = B },
        { .data = C },
    };
    struct work *buf;

    struct work threads[NTHREAD];
    struct work *work;

    int tidx = 0;
    for (int ibuf = 0;  ibuf < NARRAY;    ibuf) {
        buf = &buflist[ibuf];

        for (int ichunk = 0;  ichunk < NPER;    ichunk,   tidx) {
            work = &threads[tidx];
            double *ptr = buf->data;

            work->data = ptr;
            work->start = buf->start;

            if (ichunk == (NPER - 1))
                work->count = DATA_SIZE - buf->start;
            else
                work->count = CHUNK_SIZE;

            printf("%d: ibuf=%d start=%d count=%d (end=%d)\n",
                tidx,ibuf,work->start,work->count,work->start   work->count);

            buf->start  = work->count;
        }
    }

    double begin = tscgetf();

    for (tidx = 0;  tidx < NTHREAD;    tidx) {
        work = &threads[tidx];
        pthread_create(&work->tid,NULL,fill,work);
    }

    for (tidx = 0;  tidx < NTHREAD;    tidx) {
        work = &threads[tidx];
        pthread_join(work->tid,NULL);
    }

    double end = tscgetf();

    // calculate elapsed time
    time_spent = end - begin;

    printf("The elapsed time is %.9f seconds\n", time_spent);

    return 0;
}

Here is the output for my system:

0: ibuf=0 start=0 count=333333 (end=333333)
1: ibuf=0 start=333333 count=333333 (end=666666)
2: ibuf=0 start=666666 count=333334 (end=1000000)
3: ibuf=1 start=0 count=333333 (end=333333)
4: ibuf=1 start=333333 count=333333 (end=666666)
5: ibuf=1 start=666666 count=333334 (end=1000000)
6: ibuf=2 start=0 count=333333 (end=333333)
7: ibuf=2 start=333333 count=333333 (end=666666)
8: ibuf=2 start=666666 count=333334 (end=1000000)
The elapsed time is 0.004271893 seconds

UPDATE:

For what you really want to measure: the effect of number of threads on performance, I'd use a single array as the number of threads doesn't have to be a multiple of the number of data arrays.

Note: To account for system load, timeslicing, I would run each benckmark (e.g. at a given number of threads) a number of different times (e.g. repeat 10 times) and use the lowest elapsed time for each thread count. (You could automate this with a script)

Here's the code I would use:

#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <unistd.h>
#include <pthread.h>
#include <sys/sysinfo.h>
#include <time.h>

#ifndef DATA_SIZE
#define DATA_SIZE   4000000             // number of elements in array
#endif
#ifndef NTHREAD
#define NTHREAD     3                   // number of threads per array
#endif
#define CHUNK_SIZE  (DATA_SIZE / NTHREAD)   // amount of data for each thread

double A[DATA_SIZE];

struct work {
    pthread_t tid;
    double *data;
    int start;
    int count;
};

double
tscgetf(void)
{
    struct timespec ts;
    double sec;

    clock_gettime(CLOCK_MONOTONIC,&ts);
    sec = ts.tv_nsec;
    sec /= 1e9;
    sec  = ts.tv_sec;

    return sec;
}

void *
fill(void *tid)
{
    struct work *work = tid;
    double *data = &work->data[work->start];
    double *edata = &data[work->count];

    for (;  data < edata;    data)
        *data = 1.0;

    return NULL;
}

int
main(void)
{
    double time_spent = 0.0;

    struct work threads[NTHREAD];
    struct work *work;

    int tidx = 0;
    int start = 0;
    for (int tidx = 0;  tidx < NTHREAD;    tidx) {
        work = &threads[tidx];
        double *ptr = A;

        work->data = ptr;
        work->start = start;

        if (tidx == (NTHREAD - 1))
            work->count = DATA_SIZE - start;
        else
            work->count = CHUNK_SIZE;

        printf("%d: start=%d count=%d (end=%d)\n",
            tidx,work->start,work->count,work->start   work->count);

        start  = work->count;
    }

    double begin = tscgetf();

    for (tidx = 0;  tidx < NTHREAD;    tidx) {
        work = &threads[tidx];
        pthread_create(&work->tid,NULL,fill,work);
    }

    for (tidx = 0;  tidx < NTHREAD;    tidx) {
        work = &threads[tidx];
        pthread_join(work->tid,NULL);
    }

    double end = tscgetf();

    // calculate elapsed time
    time_spent = end - begin;

    printf("The elapsed time is %.9f seconds (RATE: %.3f/sec)\n",
        time_spent,(double) DATA_SIZE / time_spent);

    return 0;
}

And here is the output for various thread counts:

0: start=0 count=4000000 (end=4000000)
The elapsed time is 0.016544766 seconds (RATE: 241768306.067/sec)

0: start=0 count=2000000 (end=2000000)
1: start=2000000 count=2000000 (end=4000000)
The elapsed time is 0.008973760 seconds (RATE: 445744055.862/sec)

0: start=0 count=1333333 (end=1333333)
1: start=1333333 count=1333333 (end=2666666)
2: start=2666666 count=1333334 (end=4000000)
The elapsed time is 0.006181198 seconds (RATE: 647123775.983/sec)

0: start=0 count=1000000 (end=1000000)
1: start=1000000 count=1000000 (end=2000000)
2: start=2000000 count=1000000 (end=3000000)
3: start=3000000 count=1000000 (end=4000000)
The elapsed time is 0.006610514 seconds (RATE: 605096656.791/sec)

0: start=0 count=800000 (end=800000)
1: start=800000 count=800000 (end=1600000)
2: start=1600000 count=800000 (end=2400000)
3: start=2400000 count=800000 (end=3200000)
4: start=3200000 count=800000 (end=4000000)
The elapsed time is 0.005417253 seconds (RATE: 738381626.381/sec)

0: start=0 count=666666 (end=666666)
1: start=666666 count=666666 (end=1333332)
2: start=1333332 count=666666 (end=1999998)
3: start=1999998 count=666666 (end=2666664)
4: start=2666664 count=666666 (end=3333330)
5: start=3333330 count=666670 (end=4000000)
The elapsed time is 0.004728798 seconds (RATE: 845880951.049/sec)

0: start=0 count=571428 (end=571428)
1: start=571428 count=571428 (end=1142856)
2: start=1142856 count=571428 (end=1714284)
3: start=1714284 count=571428 (end=2285712)
4: start=2285712 count=571428 (end=2857140)
5: start=2857140 count=571428 (end=3428568)
6: start=3428568 count=571432 (end=4000000)
The elapsed time is 0.004353798 seconds (RATE: 918738077.652/sec)

0: start=0 count=500000 (end=500000)
1: start=500000 count=500000 (end=1000000)
2: start=1000000 count=500000 (end=1500000)
3: start=1500000 count=500000 (end=2000000)
4: start=2000000 count=500000 (end=2500000)
5: start=2500000 count=500000 (end=3000000)
6: start=3000000 count=500000 (end=3500000)
7: start=3500000 count=500000 (end=4000000)
The elapsed time is 0.006288737 seconds (RATE: 636057758.927/sec)

0: start=0 count=444444 (end=444444)
1: start=444444 count=444444 (end=888888)
2: start=888888 count=444444 (end=1333332)
3: start=1333332 count=444444 (end=1777776)
4: start=1777776 count=444444 (end=2222220)
5: start=2222220 count=444444 (end=2666664)
6: start=2666664 count=444444 (end=3111108)
7: start=3111108 count=444444 (end=3555552)
8: start=3555552 count=444448 (end=4000000)
The elapsed time is 0.005567823 seconds (RATE: 718413632.935/sec)

0: start=0 count=400000 (end=400000)
1: start=400000 count=400000 (end=800000)
2: start=800000 count=400000 (end=1200000)
3: start=1200000 count=400000 (end=1600000)
4: start=1600000 count=400000 (end=2000000)
5: start=2000000 count=400000 (end=2400000)
6: start=2400000 count=400000 (end=2800000)
7: start=2800000 count=400000 (end=3200000)
8: start=3200000 count=400000 (end=3600000)
9: start=3600000 count=400000 (end=4000000)
The elapsed time is 0.005202681 seconds (RATE: 768834332.840/sec)

0: start=0 count=363636 (end=363636)
1: start=363636 count=363636 (end=727272)
2: start=727272 count=363636 (end=1090908)
3: start=1090908 count=363636 (end=1454544)
4: start=1454544 count=363636 (end=1818180)
5: start=1818180 count=363636 (end=2181816)
6: start=2181816 count=363636 (end=2545452)
7: start=2545452 count=363636 (end=2909088)
8: start=2909088 count=363636 (end=3272724)
9: start=3272724 count=363636 (end=3636360)
10: start=3636360 count=363640 (end=4000000)
The elapsed time is 0.004945641 seconds (RATE: 808793049.562/sec)

0: start=0 count=333333 (end=333333)
1: start=333333 count=333333 (end=666666)
2: start=666666 count=333333 (end=999999)
3: start=999999 count=333333 (end=1333332)
4: start=1333332 count=333333 (end=1666665)
5: start=1666665 count=333333 (end=1999998)
6: start=1999998 count=333333 (end=2333331)
7: start=2333331 count=333333 (end=2666664)
8: start=2666664 count=333333 (end=2999997)
9: start=2999997 count=333333 (end=3333330)
10: start=3333330 count=333333 (end=3666663)
11: start=3666663 count=333337 (end=4000000)
The elapsed time is 0.005431988 seconds (RATE: 736378677.432/sec)

0: start=0 count=307692 (end=307692)
1: start=307692 count=307692 (end=615384)
2: start=615384 count=307692 (end=923076)
3: start=923076 count=307692 (end=1230768)
4: start=1230768 count=307692 (end=1538460)
5: start=1538460 count=307692 (end=1846152)
6: start=1846152 count=307692 (end=2153844)
7: start=2153844 count=307692 (end=2461536)
8: start=2461536 count=307692 (end=2769228)
9: start=2769228 count=307692 (end=3076920)
10: start=3076920 count=307692 (end=3384612)
11: start=3384612 count=307692 (end=3692304)
12: start=3692304 count=307696 (end=4000000)
The elapsed time is 0.005035344 seconds (RATE: 794384720.028/sec)

0: start=0 count=285714 (end=285714)
1: start=285714 count=285714 (end=571428)
2: start=571428 count=285714 (end=857142)
3: start=857142 count=285714 (end=1142856)
4: start=1142856 count=285714 (end=1428570)
5: start=1428570 count=285714 (end=1714284)
6: start=1714284 count=285714 (end=1999998)
7: start=1999998 count=285714 (end=2285712)
8: start=2285712 count=285714 (end=2571426)
9: start=2571426 count=285714 (end=2857140)
10: start=2857140 count=285714 (end=3142854)
11: start=3142854 count=285714 (end=3428568)
12: start=3428568 count=285714 (end=3714282)
13: start=3714282 count=285718 (end=4000000)
The elapsed time is 0.004831767 seconds (RATE: 827854458.801/sec)

0: start=0 count=266666 (end=266666)
1: start=266666 count=266666 (end=533332)
2: start=533332 count=266666 (end=799998)
3: start=799998 count=266666 (end=1066664)
4: start=1066664 count=266666 (end=1333330)
5: start=1333330 count=266666 (end=1599996)
6: start=1599996 count=266666 (end=1866662)
7: start=1866662 count=266666 (end=2133328)
8: start=2133328 count=266666 (end=2399994)
9: start=2399994 count=266666 (end=2666660)
10: start=2666660 count=266666 (end=2933326)
11: start=2933326 count=266666 (end=3199992)
12: start=3199992 count=266666 (end=3466658)
13: start=3466658 count=266666 (end=3733324)
14: start=3733324 count=266676 (end=4000000)
The elapsed time is 0.006333866 seconds (RATE: 631525817.104/sec)

0: start=0 count=250000 (end=250000)
1: start=250000 count=250000 (end=500000)
2: start=500000 count=250000 (end=750000)
3: start=750000 count=250000 (end=1000000)
4: start=1000000 count=250000 (end=1250000)
5: start=1250000 count=250000 (end=1500000)
6: start=1500000 count=250000 (end=1750000)
7: start=1750000 count=250000 (end=2000000)
8: start=2000000 count=250000 (end=2250000)
9: start=2250000 count=250000 (end=2500000)
10: start=2500000 count=250000 (end=2750000)
11: start=2750000 count=250000 (end=3000000)
12: start=3000000 count=250000 (end=3250000)
13: start=3250000 count=250000 (end=3500000)
14: start=3500000 count=250000 (end=3750000)
15: start=3750000 count=250000 (end=4000000)
The elapsed time is 0.005258777 seconds (RATE: 760633102.332/sec)
  • Related