Why column-major traversal is actually faster than row-major traversal when 2d array is small-CodePudding

I am experimenting the row- vs column-major traversal. The general idea is that row-major traversal should be faster due to caching/vectorization/etc. Here is my test code:

// gcc -o main.out main.c -O3
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

int main() {
  size_t dim[] = {10, 50, 100, 200, 500, 1000, 5000, 10000, 20000};
  struct timespec ts;
  uint32_t* arr_ptr;
  double delta, t0;
  printf(
    "Dim\tArraySize(KB)\tRow-Major Time\tRM Sample\tCol-Major Time\tCM Sample\n"
  );
  for (int i = 0; i < sizeof(dim) / sizeof(dim[0]);   i) {
    size_t d = dim[i];
    printf("%5lu,\tlu,\t", d, d * d * sizeof(uint32_t) / 1024);
    arr_ptr = (uint32_t*)malloc(d * d * sizeof(uint32_t));
    memset(arr_ptr, 0, d * d * sizeof(uint32_t));
    timespec_get(&ts, TIME_UTC);
    t0 = ts.tv_sec   ts.tv_nsec / 1000.0 / 1000.0 / 1000.0;
    for (int j = 0; j < d;   j) {
      for (int k = 0; k < d;   k) {
        *(arr_ptr   j * d   k)  = (j   k);
      }
    }
    timespec_get(&ts, TIME_UTC);
    delta = ts.tv_sec   ts.tv_nsec / 1000.0 / 1000.0 / 1000.0 - t0;
    printf("%0.9lf,\t%8u,\t", delta, *(arr_ptr   ts.tv_sec % d * d   ts.tv_nsec % d));
    free(arr_ptr);

    arr_ptr = (uint32_t*)malloc(d * d * sizeof(uint32_t));
    memset(arr_ptr, 0, d * d * sizeof(uint32_t));
    timespec_get(&ts, TIME_UTC);
    t0 = ts.tv_sec   ts.tv_nsec / 1000.0 / 1000.0 / 1000.0;
    for (int j = 0; j < d;   j) {
      for (int k = 0; k < d;   k) {
        *(arr_ptr   k * d   j)  = (j   k);
      }
    }
    timespec_get(&ts, TIME_UTC);
    delta = ts.tv_sec   ts.tv_nsec / 1000.0 / 1000.0 / 1000.0 - t0;
    printf("%0.9lf,\t%8u\n", delta, *(arr_ptr   ts.tv_sec % d * d   ts.tv_nsec % d));
    free(arr_ptr);
  }
  return 0;
}

The result is as follows:

Dim ArraySize(KB)   Row-Major Time  RM Sample   Col-Major Time  CM Sample
   10,            0,    0.000000238,          10,   0.000000238,          18
   20,            1,    0.000000238,          19,   0.000000477,          40
   50,            9,    0.000002384,          31,   0.000001431,         122
  100,           39,    0.000013113,          96,   0.000005245,         272
  200,          156,    0.000077486,         263,   0.000042200,         240
  500,          976,    0.000452757,         558,   0.000420809,         362
 1000,         3906,    0.001549959,        1095,   0.001713991,        1030
 2000,        15625,    0.005422354,        3281,   0.006698370,        3571
 5000,        97656,    0.036512375,        5439,   0.053869963,        4949
10000,       390625,    0.148355007,        7462,   1.049520969,        6046
20000,      1562500,    0.768568516,       22097,   7.388022661,       32356

The general theory holds only after the dimension reaches 500--before that, column-major traversal keeps outperforming row-major traversal. This seems not random--I ran the test multiple times with different gcc parameters, such as -O3 -ffast-math and -O3 -fno-tree-vectorize, the performance gap appears to be stable. But why?

CodePudding user response：

After investigating a bit and per suggestion from @Fe2O3, the code is revised to the following and it works:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>

int main() {
  size_t dim[] = {10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10000, 20000};
  struct timespec ts;
  uint32_t* arr_ptr;
  double delta, t0;
  printf(
    "Dim\tArraySize(KB)\tRow-Major Time\tRM Sample\tCol-Major Time\tCM Sample\n"
  );
  for (int i = 0; i < sizeof(dim) / sizeof(dim[0]);   i) {
    size_t d = dim[i];
    printf("%5lu,\tlu,\t", d, d * d * sizeof(uint32_t) / 1024);
    arr_ptr = (uint32_t*)malloc(d * d * sizeof(uint32_t));
    for (int j = 0; j < d;   j) {
      for (int k = 0; k < d;   k) {
        *(arr_ptr   j * d   k) = (j * k);
      }
    }
    timespec_get(&ts, TIME_UTC);
    t0 = ts.tv_sec   ts.tv_nsec / 1000.0 / 1000.0 / 1000.0;
    for (int j = 0; j < d;   j) {
      for (int k = 0; k < d;   k) {
        *(arr_ptr   j * d   k)  = (j   k);
      }
    }
    timespec_get(&ts, TIME_UTC);
    delta = ts.tv_sec   ts.tv_nsec / 1000.0 / 1000.0 / 1000.0 - t0;
    printf("%0.9lf,\t%8u,\t", delta, *(arr_ptr   ts.tv_sec % d * d   ts.tv_nsec % d));
    free(arr_ptr);

    arr_ptr = (uint32_t*)malloc(d * d * sizeof(uint32_t));
    for (int j = 0; j < d;   j) {
      for (int k = 0; k < d;   k) {
        *(arr_ptr   k * d   j) = (j * k);
      }
    }
    timespec_get(&ts, TIME_UTC);
    t0 = ts.tv_sec   ts.tv_nsec / 1000.0 / 1000.0 / 1000.0;
    for (int j = 0; j < d;   j) {
      for (int k = 0; k < d;   k) {
        *(arr_ptr   k * d   j)  = (j   k);
      }
    }
    timespec_get(&ts, TIME_UTC);
    delta = ts.tv_sec   ts.tv_nsec / 1000.0 / 1000.0 / 1000.0 - t0;
    printf("%0.9lf,\t%8u\n", delta, *(arr_ptr   ts.tv_sec % d * d   ts.tv_nsec % d));
    free(arr_ptr);
  }
  return 0;
}

Results:

Dim ArraySize(KB)   Row-Major Time  RM Sample   Col-Major Time  CM Sample
   10,            0,    0.000000238,          39,   0.000000238,           3
   20,            1,    0.000000238,          83,   0.000000477,          27
   50,            9,    0.000000715,         147,   0.000001431,          15
  100,           39,    0.000002623,        5129,   0.000005245,         485
  200,          156,    0.000009060,       15553,   0.000018835,       24023
  500,          976,    0.000072718,        9557,   0.000153065,        7235
 1000,         3906,    0.000302553,       91963,   0.001093388,      282539
 2000,        15625,    0.001767159,     1004401,   0.006136179,      133513
 5000,        97656,    0.010307550,     2686865,   0.045175791,     2730377
10000,       390625,    0.040507793,    44626185,   0.827179193,    53503515
20000,      1562500,    0.206705093,    26209730,   5.742965460,    173268143

The trick is we add one more "preparatory loop" before the timed loop. One possible reason behind it is that malloc() doesn't really provide all the necessary memory upon request. The memory blocks are actually provided immediately before first access--as a result, the first loop could be slower as malloc() is still busy letting us have the memory blocks it promised.

After using it for the first time, all things are set, so the 2nd loop behaves exactly as we expect.

CodePudding user response：

Here is a slight reworking of the code that allocates and 'touches' every element of the largest array size. This forces the memory management routines to actually go through any "page fault" speed optimisations, creating a level playing field.

Then, start & stop times are taken for updating all the elements of "arrays" of varying square sizes and statistics are reported.

I've 'touched' this code, but do not use gcc (can't really check syntax or my typing mistakes) nor do I have the nS timer routines. Be kind to my mistakes, please.

// gcc -o main.out main.c -O3
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

int main() {
    size_t dim[] = {
           10,
           20,
           50,
          100,
          200,
          500,
         1000,
         2000,
         5000,
        10000,
        20000,
    };
    const int nDim = sizeof dim/sizeof dim[0];
    size_t d, i, j, k;
    struct timespec ts, te; // start & end times

    // allocate, use and retain the largest array (vector) size.
    d = dim[ nDim - 1 ];
    uint32_t *arr_ptr = (uint32_t*)malloc( d * d * sizeof *arr_ptr );

    timespec_get( &ts, TIME_UTC ); // just for curiosity
    for( j = 0; j < d;   j )
        for( k = 0; k < d;   k )
            *(arr_ptr   (j * d)   k) = (j   k); // simple assignment
    timespec_get( &te, TIME_UTC );
    { 
        double tss = ts.tv_sec   ts.tv_nsec / 1000.0 / 1000.0 / 1000.0;
        double tse = te.tv_sec   te.tv_nsec / 1000.0 / 1000.0 / 1000.0;
        printf( "Initial time taken = %0.9lf\n", tse - tss );
    }

    printf(
        "Dim\tArraySize(KB)\t"
        "Row-Major Time\tRM Sample\t"
        "Col-Major Time\tCM Sample\t Difference\n" );

    for( i = 0; i < nDim;   i ) {
        double tss, tse;

        d = dim[i];
        printf( "%5lu,\tlu,\t", d, d * d * sizeof *arr_ptr / 1024 );

        timespec_get( &ts, TIME_UTC );
        for( j = 0; j < d;   j )
            for( k = 0; k < d;   k )
                *(arr_ptr   (j * d)   k)  = (j   k); // Row-Major thrashing
        timespec_get( &te, TIME_UTC );

        tss = ts.tv_sec   ts.tv_nsec / 1000.0 / 1000.0 / 1000.0;
        tse = te.tv_sec   te.tv_nsec / 1000.0 / 1000.0 / 1000.0;
        double deltaR = tse - tss;
        // the 'randomly selected' data presented as hex, not decimal
        printf( "%0.9lf,\tX,\t", deltaR, *(arr_ptr   ts.tv_sec % d * d   ts.tv_nsec % d) );

        timespec_get( &ts, TIME_UTC );
        for( j = 0; j < d;   j )
            for( k = 0; k < d;   k )
                *(arr_ptr   (k * d )  j)  = (j   k); // Col-Major thrashing
        timespec_get( &te, TIME_UTC );

        tss = ts.tv_sec   ts.tv_nsec / 1000.0 / 1000.0 / 1000.0;
        tse = te.tv_sec   te.tv_nsec / 1000.0 / 1000.0 / 1000.0;
        double deltaC = tse - tss;
        printf( "%0.9lf,\tX\t", deltaC, *(arr_ptr   ts.tv_sec % d * d   ts.tv_nsec % d) );

        printf( "%0.9lf\n", deltaR - deltaC ); // difference
    }

    free( arr_ptr ); // thank you for your service. you're free now/

    return 0;
}