Implement a function that rotates 90 degrees in a two-dimensional array that represents a color imag-CodePudding

I am trying to reduce the runtime of a particular function This is the naive implementation:

typedef struct {
    unsigned short red;
    unsigned short green;
    unsigned short blue;
} pixel;

#define RIDX(i,j,n)  ((i)*(n) (j))

void naive_rotate(int dim, pixel *src, pixel *dst) 
{
    int i, j;

    for (i = 0; i < dim; i  )
        for (j = 0; j < dim; j  )
            dst[RIDX(dim - 1 - j, i, dim)] = src[RIDX(i, j, dim)];
}

Does anyone have an idea how can I improve my runtime ?

CodePudding user response：

Looking at the assembly produced by gcc and clang, there is not much to improve besides choosing the appropriate optimisation options.

Here are a few ideas:

You could try and help the compiler avoid extra multiplications, but these two already optimize that:

void naive_rotate1(int dim, const pixel *src, pixel *dst) {
    pixel *dst0 = dst   (dim - 1) * dim;

    for (int i = 0; i < dim; i  ) {
        pixel *dst1 = dst0   i;
        for (int j = 0; j < dim; j  ) {
            *dst1 = *src  ;
            dst1 -= dim;
        }
    }
}

You could use size_t instead of int for the index types:

void naive_rotate2(size_t dim, const pixel *src, pixel *dst) {
    pixel *dst0 = dst   (dim - 1) * dim;

    for (size_t i = 0; i < dim; i  ) {
        pixel *dst1 = dst0   i;
        for (size_t j = 0; j < dim; j  ) {
            *dst1 = *src  ;
            dst1 -= dim;
        }
    }
}

You could change your image representation and use 4 components instead of 3 to improve aligned accesses:

typedef struct {
    unsigned short red;
    unsigned short green;
    unsigned short blue;
    unsigned short alpha;
} pixel4;

void naive_rotate4(size_t dim, const pixel4 *src, pixel4 *dst) {
    pixel4 *dst0 = dst   (dim - 1) * dim;

    for (size_t i = 0; i < dim; i  ) {
        pixel4 *dst1 = dst0   i;
        for (size_t j = 0; j < dim; j  ) {
            *dst1 = *src  ;
            dst1 -= dim;
        }
    }
}

You may improve cache locality by moving and rotating 8x8 subtiles instead of whole rows at a time.

typedef struct {
    unsigned short red;
    unsigned short green;
    unsigned short blue;
} pixel;

#define RIDX(i,j,n)  ((i)*(n) (j))

void naive_rotate3(int dim, pixel *src, pixel *dst) 
{
    int dim8 = dim - dim % 8;

    for (int i = 0; i < dim8; i  = 8) {
        for (int j = 0; j < dim8; j  = 8) {
            for (int ii = 0; ii < 8; ii  ) {
                for (int jj = 0; jj < 8; jj  ) {
                    dst[RIDX(dim - 1 - j - jj, i   ii, dim)] = src[RIDX(i   ii, j   jj, dim)];
                }
            }
         }
    }
    for (int i = dim8; i < dim; i  ) {
        for (int j = 0; j < dim; j  ) {
            dst[RIDX(dim - 1 - j, i, dim)] = src[RIDX(i, j, dim)];
        }
    }
    for (int i = 0; i < dim8; i  ) {
        for (int j = dim8; j < dim; j  ) {
            dst[RIDX(dim - 1 - j, i, dim)] = src[RIDX(i, j, dim)];
        }
    }
}

Combining both tiles and 4 components:


void naive_rotate5(int dim, pixel4 *src, pixel4 *dst) {
    int dim8 = dim - dim % 8;

    for (int i = 0; i < dim8; i  = 8) {
        for (int j = 0; j < dim8; j  = 8) {
            for (int ii = 0; ii < 8; ii  ) {
                for (int jj = 0; jj < 8; jj  ) {
                    dst[RIDX(dim - 1 - j - jj, i   ii, dim)] = src[RIDX(i   ii, j   jj, dim)];
                }
            }
        }
    }
    for (int i = dim8; i < dim; i  ) {
        for (int j = 0; j < dim; j  ) {
            dst[RIDX(dim - 1 - j, i, dim)] = src[RIDX(i, j, dim)];
        }
    }
    for (int i = 0; i < dim8; i  ) {
        for (int j = dim8; j < dim; j  ) {
            dst[RIDX(dim - 1 - j, i, dim)] = src[RIDX(i, j, dim)];
        }
    }
}

You should benchmark these alternatives to measure actual performance, which may vary from one compiler/system to another.

On my system, I get these timings for 2048x2048 images:

naive_rotate(2048): 54.129ms
naive_rotate1(2048): 53.904ms
naive_rotate2(2048): 57.569ms
naive_rotate3(2048): 15.788ms
naive_rotate4(2048): 57.403ms
naive_rotate5(2048): 11.811ms

moving 8x8 subtiles achieves a remarkable 4x to 5x improvement on large images. 4x4 or 16x16 still outperform naive_rotate, but only by a factor of 2 on my old macBook.

CodePudding user response：

May be by removing some computations when accessing the src image? This way:

pixel *p = src;
for (i = 0; i < dim; i  )
    for (j = 0; j < dim; j  )
        dst[RIDX(dim - 1 - j, i, dim)] = *p  ;

Removing on dst is little more tricky (modulos and additions), not sure it will worth the effort (except if dim is a power of 2?).

CodePudding user response：

You could eliminate the multiplications (apart from the "hidden" multiplications by sizeof(pixel) in the pointer arithmetic) by using increment/decrement:

void naive_rotate(int dim, pixel *src, pixel *dst) 
{
    int i, j;

    if (dim) {
        for (i = 0; i < dim; i  ) {
            pixel *savedst = dst;
            src  = dim;
            for (j = 0; j < dim - 1; j  ) {
                src--;
                *dst = *src;
                dst  = dim;
            }
            src--;
            *dst = *src;
            src  = dim;
            dst = savedst   1;
        }
    }
}