I am trying to reduce the runtime of a particular function This is the naive implementation:
typedef struct {
unsigned short red;
unsigned short green;
unsigned short blue;
} pixel;
#define RIDX(i,j,n) ((i)*(n) (j))
void naive_rotate(int dim, pixel *src, pixel *dst)
{
int i, j;
for (i = 0; i < dim; i )
for (j = 0; j < dim; j )
dst[RIDX(dim - 1 - j, i, dim)] = src[RIDX(i, j, dim)];
}
Does anyone have an idea how can I improve my runtime ?
CodePudding user response:
Looking at the assembly produced by gcc and clang, there is not much to improve besides choosing the appropriate optimisation options.
Here are a few ideas:
- You could try and help the compiler avoid extra multiplications, but these two already optimize that:
void naive_rotate1(int dim, const pixel *src, pixel *dst) {
pixel *dst0 = dst (dim - 1) * dim;
for (int i = 0; i < dim; i ) {
pixel *dst1 = dst0 i;
for (int j = 0; j < dim; j ) {
*dst1 = *src ;
dst1 -= dim;
}
}
}
- You could use
size_t
instead ofint
for the index types:
void naive_rotate2(size_t dim, const pixel *src, pixel *dst) {
pixel *dst0 = dst (dim - 1) * dim;
for (size_t i = 0; i < dim; i ) {
pixel *dst1 = dst0 i;
for (size_t j = 0; j < dim; j ) {
*dst1 = *src ;
dst1 -= dim;
}
}
}
- You could change your image representation and use 4 components instead of 3 to improve aligned accesses:
typedef struct {
unsigned short red;
unsigned short green;
unsigned short blue;
unsigned short alpha;
} pixel4;
void naive_rotate4(size_t dim, const pixel4 *src, pixel4 *dst) {
pixel4 *dst0 = dst (dim - 1) * dim;
for (size_t i = 0; i < dim; i ) {
pixel4 *dst1 = dst0 i;
for (size_t j = 0; j < dim; j ) {
*dst1 = *src ;
dst1 -= dim;
}
}
}
- You may improve cache locality by moving and rotating 8x8 subtiles instead of whole rows at a time.
typedef struct {
unsigned short red;
unsigned short green;
unsigned short blue;
} pixel;
#define RIDX(i,j,n) ((i)*(n) (j))
void naive_rotate3(int dim, pixel *src, pixel *dst)
{
int dim8 = dim - dim % 8;
for (int i = 0; i < dim8; i = 8) {
for (int j = 0; j < dim8; j = 8) {
for (int ii = 0; ii < 8; ii ) {
for (int jj = 0; jj < 8; jj ) {
dst[RIDX(dim - 1 - j - jj, i ii, dim)] = src[RIDX(i ii, j jj, dim)];
}
}
}
}
for (int i = dim8; i < dim; i ) {
for (int j = 0; j < dim; j ) {
dst[RIDX(dim - 1 - j, i, dim)] = src[RIDX(i, j, dim)];
}
}
for (int i = 0; i < dim8; i ) {
for (int j = dim8; j < dim; j ) {
dst[RIDX(dim - 1 - j, i, dim)] = src[RIDX(i, j, dim)];
}
}
}
- Combining both tiles and 4 components:
void naive_rotate5(int dim, pixel4 *src, pixel4 *dst) {
int dim8 = dim - dim % 8;
for (int i = 0; i < dim8; i = 8) {
for (int j = 0; j < dim8; j = 8) {
for (int ii = 0; ii < 8; ii ) {
for (int jj = 0; jj < 8; jj ) {
dst[RIDX(dim - 1 - j - jj, i ii, dim)] = src[RIDX(i ii, j jj, dim)];
}
}
}
}
for (int i = dim8; i < dim; i ) {
for (int j = 0; j < dim; j ) {
dst[RIDX(dim - 1 - j, i, dim)] = src[RIDX(i, j, dim)];
}
}
for (int i = 0; i < dim8; i ) {
for (int j = dim8; j < dim; j ) {
dst[RIDX(dim - 1 - j, i, dim)] = src[RIDX(i, j, dim)];
}
}
}
You should benchmark these alternatives to measure actual performance, which may vary from one compiler/system to another.
On my system, I get these timings for 2048x2048 images:
naive_rotate(2048): 54.129ms
naive_rotate1(2048): 53.904ms
naive_rotate2(2048): 57.569ms
naive_rotate3(2048): 15.788ms
naive_rotate4(2048): 57.403ms
naive_rotate5(2048): 11.811ms
moving 8x8 subtiles achieves a remarkable 4x to 5x improvement on large images. 4x4 or 16x16 still outperform naive_rotate
, but only by a factor of 2 on my old macBook.
CodePudding user response:
May be by removing some computations when accessing the src
image? This way:
pixel *p = src;
for (i = 0; i < dim; i )
for (j = 0; j < dim; j )
dst[RIDX(dim - 1 - j, i, dim)] = *p ;
Removing on dst
is little more tricky (modulos and additions), not sure it will worth the effort (except if dim
is a power of 2?).
CodePudding user response:
You could eliminate the multiplications (apart from the "hidden" multiplications by sizeof(pixel)
in the pointer arithmetic) by using increment/decrement:
void naive_rotate(int dim, pixel *src, pixel *dst)
{
int i, j;
if (dim) {
for (i = 0; i < dim; i ) {
pixel *savedst = dst;
src = dim;
for (j = 0; j < dim - 1; j ) {
src--;
*dst = *src;
dst = dim;
}
src--;
*dst = *src;
src = dim;
dst = savedst 1;
}
}
}