Looking for performance improvement of NEON code to match clipping area on the screen-CodePudding

Here is my test code to find 1st clipping area on the screen. Two subroutines and dummy loops in the code to compare the performance of them.

point_in_neon (NEON version) and point_in (Regular version) does the same thing: find out the first clipping area (contains given point) in given list and return -1 if there is no matching area.

I expected NEON version is faster than regular version. Unfortunately, it is slower than regular version. Is there another way to speed it up?

The compiler command is:

${CC} -O2 -ftree-vectorize -o vcomp vcomp.c

Thanks,

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <assert.h>
#include <math.h>
#include <sys/time.h>
#include <arm_neon.h>

#define WIDTH   (4096)
#define HEIGHT  (4096)
#define CLIPS   (32)

static inline uint64_t now(void) {
    struct timeval tv;

    gettimeofday(&tv,NULL);
    return tv.tv_sec*1000000 tv.tv_usec;
}

typedef struct _rect_t {
    int32_t x;
    int32_t y;
    uint32_t width;
    uint32_t height;
} rect_t;

typedef struct _point_t {
    int32_t x;
    int32_t y;
} point_t;

int32_t inline point_in_neon(const point_t *pt, const rect_t rs[4]) {
    const int32_t right[4]={
        rs[0].x rs[0].width-1,
        rs[1].x rs[1].width-1,
        rs[2].x rs[2].width-1,
        rs[3].x rs[3].width-1
    }, bottom[4]={
        rs[0].y rs[0].height-1,
        rs[1].y rs[1].height-1,
        rs[2].y rs[2].height-1,
        rs[3].y rs[3].height-1
    };
    int32x4_t p, r;
    uint32x4_t t;
    uint32_t res[4];

    //p = <Xp, Xp, Xp, Xp>
    p=vld1q_dup_s32(&pt->x);

    //r = <Left0, Left1, Left2, Left3>
    r=vld1q_lane_s32(&rs[0].x, r, 0);
    r=vld1q_lane_s32(&rs[1].x, r, 1);
    r=vld1q_lane_s32(&rs[2].x, r, 2);
    r=vld1q_lane_s32(&rs[3].x, r, 3);
    //t = (p >= r)
    t=vcgeq_s32(p, r);

    //r = <Right0, Right1, Right2, Right3>
    r=vld1q_s32(&right);
    //t = t & (r >= p)
    t=vandq_u32(t, vcgeq_s32(r, p));
    
    //p = <Yp, Yp, Yp, Yp>
    p=vld1q_dup_s32(&pt->y);
    
    //r = <Top0, Top1, Top2, Top3>
    r=vld1q_lane_s32(&rs[0].y, r, 0);
    r=vld1q_lane_s32(&rs[1].y, r, 1);
    r=vld1q_lane_s32(&rs[2].y, r, 2);
    r=vld1q_lane_s32(&rs[3].y, r, 3);
    //t = t & (p >= r)
    t=vandq_u32(t, vcgeq_s32(p, r));
   
    //r = <Bottom0, Bottom1, Bottom2, Bottom3>
    r=vld1q_s32(&bottom);
    //t = t & (r >= p)
    t=vandq_u32(t, vcgeq_s32(r, p));

    vst1q_u32(res, t);
    
    if(res[0])
        return 0;
    else if(res[1])
        return 1;
    else if(res[2])
        return 2;
    else if(res[3])
        return 3;

    return -1;
}

int32_t inline point_in(const point_t *pt, const rect_t *rs, uint32_t len) {
    int32_t i;
    
    for(i=0;i<len;i  ) {
        int32_t right=rs[i].x rs[i].width-1,
                bottom=rs[i].y rs[i].height-1;
                
        if(pt->x>=rs[i].x && pt->x<=right &&
           pt->y>=rs[i].y && pt->y<=bottom)
            return i;
    }

    return -1;
}

int32_t main(int32_t argc, char *argv[]) {
    rect_t rs[CLIPS];
    
    int32_t i, j;
    uint64_t ts0, ts1;
    int32_t res[2][CLIPS];
    
    srand((unsigned int)time(NULL));
    for(i=0;i<CLIPS;i  ) {
        rs[i].x=rand()%WIDTH;
        rs[i].y=rand()%HEIGHT;
        rs[i].width=rand()%WIDTH;
        rs[i].height=rand()%HEIGHT;
    }
    memset(res, 0, sizeof(res));

    ts0=now();
    for(i=0;i<HEIGHT;i  ) {
        for(j=0;j<WIDTH;j  ) {
            point_t p={i, j};
            int32_t idx=point_in(&p, rs, CLIPS);
            
            if(idx>=0)
                res[0][idx]=1;
        }
    }
    ts0=now()-ts0;

    ts1=now();
    for(i=0;i<HEIGHT;i  ) {
        for(j=0;j<WIDTH;j  ) {
            int32_t k, idx;
            point_t p={i, j};
            
            for(k=0, idx=-1;k<CLIPS/4;k  ) {
                idx=point_in_neon(&p, &rs[k*4]);
                if(idx>=0)
                    break;
            }
            
            if(idx>=0)
                res[1][k*4 idx]=1;
        }
    }
    ts1=now()-ts1;

    /*
    for(i=0;i<CLIPS;i  ) {
        if(res[0][i]!=res[1][i]) {
            printf("error.\n");
            return 1;
        }
    }
    */

    printf("regular = %lu\n", ts0);
    printf("neon    = %lu\n", ts1);
   
    return 0;
}

CodePudding user response：

According to Peter Cordes's suggestion, I replaced data loding parts of point_in_neon subroutine with vld4q_s32 intrinsic and subsequent right and bottom calculation can be vectorized. Now the code is shorter and faster than regular version.

int32_t inline point_in_neon(const point_t *pt, const rect_t rs[4]) {
    int32x4x4_t r;
    int32x4_t right, bottom, p;
    uint32x4_t t;
    uint32_t res[4];

    /*
      r.val[0] = <X0, X1, X2, X3>
      r.val[1] = <Y0, Y1, Y2, Y3>
      r.val[2] = <Width0, Width1, Width2, Width3>
      r.val[3] = <Height0, Height1, Height2, Height3>
    */
    r=vld4q_s32(rs);
    //right = <Right0, Right1, Right2, Right3>
    right=vsubq_s32(vaddq_s32(r.val[0], r.val[2]), vdupq_n_s32(1));
    //bottom = <Bottom0, Bottom1, Bottom2, Bottom3>
    bottom=vsubq_s32(vaddq_s32(r.val[1], r.val[3]), vdupq_n_s32(1));

    //p = <Xp, Xp, Xp, Xp>
    p=vld1q_dup_s32(&pt->x);
    //t = (p >= left)
    t=vcgeq_s32(p, r.val[0]);
    //t = t & (right >= p)
    t=vandq_u32(t, vcgeq_s32(right, p));
    //p = <Yp, Yp, Yp, Yp>
    p=vld1q_dup_s32(&pt->y);
    //t = t & (p >= top)
    t=vandq_u32(t, vcgeq_s32(p, r.val[1]));
    //t = t & (r >= bottom)
    t=vandq_u32(t, vcgeq_s32(bottom, p));
    
    vst1q_u32(res, t);
    if(res[0])
        return 0;
    else if(res[1])
        return 1;
    else if(res[2])
        return 2;
    else if(res[3])
        return 3;

    return -1;
}

CodePudding user response：

Starting with your original point_in method, we can clean up a little bit here by removing the -1's, and changing <= to <.

int32_t inline point_in(const point_t *pt, const rect_t *rs, uint32_t len) {
    int32_t i;
    
    for(i=0; i < len; i  ) 
    {
        // this is pointless - change your data structures so that
        // the rect stores minx/maxx, miny/maxy instead!
        int32_t right = rs[i].x   rs[i].width;
        int32_t bottom= rs[i].y   rs[i].height;

        bool cmp0 = pt->x >= rs[i].x;
        bool cmp1 = pt->y >= rs[i].y;
        bool cmp2 = pt->x < right;
        bool cmp3 = pt->y < bottom;
        if(cmp0 & cmp1 & cmp2 & cmp3)
            return i;
    }
    return -1;
}

Next obvious thing to point out:

// your screen size... 
#define WIDTH   (4096)
#define HEIGHT  (4096)

// yet your structures use uint32 as storage???
typedef struct _rect_t {
    int32_t x;
    int32_t y;
    uint32_t width;
    uint32_t height;
} rect_t;

typedef struct _point_t {
    int32_t x;
    int32_t y;
} point_t;

If you can get away with using 16bit integers, this will go at twice the speed (because you can fit 8x 16bit numbers in a SIMD register, v.s. 4x 32bit). Whilst we're at it, we might as well change the data layout to structure of array at the same time.

I'm also going to hoist the pointless p.x width out, and store it as xmax/ymax instead (removes duplicated computation in your loops).

typedef struct rect_x8_t {
    int16x8_t x;
    int16x8_t y;
    int16x8_t xmax; //< x   width
    int16x8_t ymax; //< y   height
} rect_x8_t;

typedef struct point_x8_t {
    int16x8_t x;
    int16x8_t y;
} point_x8_t;

On the assumption you don't have a number of clips that's divisible by 8, we'll need to pad the number slightly (not a big deal)

// assuming this has already been initialised
rect_t rs[CLIPS];

// how many batches of 8 do we need?
uint32_t CLIPS8 = (CLIPS / 8)   (CLIPS & 7 ? 1 : 0);

// allocate in batches of 8
rect_x8_t rs8[CLIPS8] = {};

// I'm going to do this rubbishly as an pre-process step. 
// I don't care too much about efficiency here... 
for(uint32_t i = 0; i < CLIPS;   i) {
    rs8[i / 8].x[i & 7] = rs[i].x;
    rs8[i / 8].y[i & 7] = rs[I].y;
    rs8[i / 8].xmax[i & 7] = rs[i].x   rs[i].width;
    rs8[i / 8].ymax[i & 7] = rs[i].y   rs[i].height;
}

I have a couple of concerns here:

    for(i=0;i<HEIGHT;i  ) {
        for(j=0;j<WIDTH;j  ) {
    
            // This seems wrong? Shouldn't it be p = {j, i} ?
            point_t p={i, j};
            int32_t idx=point_in(&p, rs, CLIPS);

            // I'm not quite sure what the result says about your 
            // image data and clip regions??? 
            //
            // This seems like a really silly way of asking 
            // a simple question about the clip regions. The pixels
            // don't have any effect here. 
            if(idx >= 0)
                res[0][idx] = 1;
        }
    }

Anyhow, now refactoring the point_in method to use int16x8_t, we get:

inline int32_t point_in_x8(const point_x8_t pt, 
                           const rect_x8_t* rs,
                           uint32_t len) {
    for(int32_t i = 0; i < len; i  ) {
        // perform comparisons on 8 rects at a time
        uint16x8_t cmp0 = vcgeq_s16(pt.x, rs[i].x);
        uint16x8_t cmp1 = vcgeq_s16(pt.y, rs[i].y);
        uint16x8_t cmp2 = vcltq_s16(pt.x, rs[i].xmax);
        uint16x8_t cmp3 = vcltq_s16(pt.y, rs[I].ymax);

        // combine to single comparison value
        uint16x8_t cmp01 = vandq_u16(cmp0, cmp1);
        uint16x8_t cmp23 = vandq_u16(cmp2, cmp3);
        uint16x8_t cmp0123 = vandq_u16(cmp01, cmp23);

        // use a horizontal max to see if any lanes are true
        if(vmaxvq_u16(cmp0123)) {
            for(int32_t j = 0; j < 8;   j) {
                if(cmp0123[j])
                    return 8*i   j;
            }
        }
    }
    return -1;
}

Any additional padded elements in the rect_x8_t structs should end up being ignored (since they should be 0/0, 0/0, which will always end up being false).

Then finally...

    for(i = 0; i < HEIGHT; i  ) {
        point_x8_t p;
        // splat the y value
        p.y = vld1q_dup_s16(i);
        for(j = 0; j < WIDTH; j  ) {
            // splat the x value
            p.x = vld1q_dup_s16(j);
    
            int32_t idx = point_in_x8(p, rs8, CLIPS8);

            if(idx >= 0)
                res[1][idx] = 1;
        }
    }

The vld4 instruction actually has a fairly high latency. Given that WIDTH * HEIGHT is actually a very big number, pre-swizzling here (as a pre-processing step) makes a lot more sense imho.

HOWEVER

This whole algorithm could be massively improved by simply ignoring the pixels, and working on CLIP regions directly.

A clip region will be false if it is entirely contained by the preceding clip regions

for(i = 0; i < CLIPS; i  ) {

   // if region is empty, ignore. 
   if(rs[i].width == 0 || rs[i].height == 0) {
       res[0][i] = 0;
       continue;
   }

   // first region will always be true (unless it's of zero size)
   if(i == 0) {
      res[0][1] = 1;
      continue;
   }

   uint32_t how_many_intersect = 0;
   bool entirely_contained = false;
   uint32_t intersection_indices[CLIPS] = {};

   // do a lazy test first.
   for(j = i - 1; j >= 0; --j) {
       // if the last region is entirely contained by preceding
       // ones, it will be false. exit loop. 
       if(region_is_entirely_contained(rs[i], rs[j])) {
          res[0][i] = 0;
          entirely_contained = true;
          j = -1; ///< break out of loop
       }
       else
       // do the regions intersect?
       if(region_intersects(rs[i], rs[j])) {
          intersection_indices[how_many_intersect] = j;
            how_many_intersect;
       }
   }

   // if one region entirely contains this clip region, skip it.
   if(entirely_contained) {
      continue; 
   }

   // if you only intersect one or no regions, the result is true.
   if(how_many_intersect <= 1) {
       res[0][i] = 1;
       continue; 
   }

   // If you get here, the result is *probably* true, however 
   // you will need to split this clip region against the previous
   // ones to be fully sure. If all regions are fully contained, 
   // the answer is false. 

   // I won't implement it, but something like this:

 * split rs[i] against each rs[intersection_indices[]].
 * Throw away the rectangles that are entirely contained. 
 * Each bit that remains should be tested against each rs[intersection_indices[]] 
 * If you find any split rectangle that isn't contained, 
   set to true and move on. 

 

}