cuda SIMD instruction for per-byte multiplication with unsigned saturation-CodePudding

CUDA has a nice set of SIMD instructions for integers that allow efficient SIMD computations. Among those, there are some that compute addition and subtraction per byte or per half-word (like __vadd2 and __vadd4), however, I couldn't find a similar function that computes per-byte multiplication for a 32bit register. I would appreciate it if someone can help me find a proper solution.

CodePudding user response：

however, I couldn't find a similar function that computes per-byte multiplication for a 32bit register.

There isn't one that returns the 4 individual products.

The closest is the __dp4a() intrinsic which return the sum of the 4 products, in a 32-bit integer.

You could write an 8-bit packed unsigned multiply with saturation like this:

$ cat t2048.cu
#include <cstdio>
#include <cstdint>

__host__ __device__ uchar4 u8mulsat(const uchar4 &a, const uchar4 &b){

  const unsigned sv = 255;
  uchar4 result;
  unsigned t;
  t = a.x*b.x;
  if (t > sv) t = sv;
  result.x = t;
  t = a.y*b.y;
  if (t > sv) t = sv;
  result.y = t;
  t = a.z*b.z;
  if (t > sv) t = sv;
  result.z = t;
  t = a.w*b.w;
  if (t > sv) t = sv;
  result.w = t;
  return result;
}

__global__ void k(uchar4 a, uchar4 b, uchar4 *c){

  *c = u8mulsat(a, b);
}

int main(){

  uchar4 a,b,c, *d_c;
  cudaMalloc(&d_c, sizeof(uchar4));
  a.x = 1;
  a.y = 2;
  a.z = 4;
  a.w = 8;
  b.x = 64;
  b.y = 64;
  b.z = 64;
  b.w = 1;
  k<<<1,1>>>(a, b, d_c);
  cudaMemcpy(&c, d_c, sizeof(uchar4), cudaMemcpyDeviceToHost);
  printf("c.x = %u\n", (unsigned)c.x);
  printf("c.y = %u\n", (unsigned)c.y);
  printf("c.z = %u\n", (unsigned)c.z);
  printf("c.w = %u\n", (unsigned)c.w);
}
$ nvcc -o t2048 t2048.cu
$ compute-sanitizer ./t2048
========= COMPUTE-SANITIZER
c.x = 64
c.y = 128
c.z = 255
c.w = 8
========= ERROR SUMMARY: 0 errors
$ cuobjdump -sass ./t2048

Fatbin elf code:
================
arch = sm_52
code version = [1,7]
producer = <unknown>
host = linux
compile_size = 64bit

        code for sm_52

Fatbin elf code:
================
arch = sm_52
code version = [1,7]
producer = <unknown>
host = linux
compile_size = 64bit

        code for sm_52
                Function : _Z1k6uchar4S_PS_
        .headerflags    @"EF_CUDA_SM52 EF_CUDA_PTX_SM(EF_CUDA_SM52)"
                                                                 /* 0x001c4400e22007f6 */
        /*0008*/                   MOV R1, c[0x0][0x20] ;        /* 0x4c98078000870001 */
        /*0010*/                   LDC.U8 R0, c[0x0][0x140] ;    /* 0xef9000001407ff00 */
        /*0018*/                   LDC.U8 R2, c[0x0][0x144] ;    /* 0xef9000001447ff02 */
                                                                 /* 0x001d4400e6200731 */
        /*0028*/                   LDC.U8 R3, c[0x0][0x141] ;    /* 0xef9000001417ff03 */
        /*0030*/                   LDC.U8 R4, c[0x0][0x145] ;    /* 0xef9000001457ff04 */
        /*0038*/                   LDC.U8 R5, c[0x0][0x142] ;    /* 0xef9000001427ff05 */
                                                                 /* 0x001dfc00ee200751 */
        /*0048*/                   LDC.U8 R6, c[0x0][0x146] ;    /* 0xef9000001467ff06 */
        /*0050*/                   LDC.U8 R7, c[0x0][0x143] ;    /* 0xef9000001437ff07 */
        /*0058*/                   LDC.U8 R8, c[0x0][0x147] ;    /* 0xef9000001477ff08 */
                                                                 /* 0x009fd002fe200fe1 */
        /*0068*/                   XMAD R0, R2, R0, RZ ;         /* 0x5b007f8000070200 */
        /*0070*/                   XMAD R2, R4, R3, RZ ;         /* 0x5b007f8000370402 */
        /*0078*/                   XMAD R3, R6, R5, RZ ;         /* 0x5b007f8000570603 */
                                                                 /* 0x001fc408fe2007f1 */
        /*0088*/                   IMNMX.U32 R0, R0, 0xff, PT ;  /* 0x382003800ff70000 */
        /*0090*/                   XMAD R4, R8, R7, RZ ;         /* 0x5b007f8000770804 */
        /*0098*/                   IMNMX.U32 R2, R2, 0xff, PT ;  /* 0x382003800ff70202 */
                                                                 /* 0x001fc400fe2007e4 */
        /*00a8*/                   IMNMX.U32 R3, R3, 0xff, PT ;  /* 0x382003800ff70303 */
        /*00b0*/                   IMNMX.U32 R4, R4, 0xff, PT ;  /* 0x382003800ff70404 */
        /*00b8*/                   BFI R0, R2, 0x808, R0 ;       /* 0x36f0000080870200 */
                                                                 /* 0x001fd400fe2007f5 */
        /*00c8*/                   MOV R2, c[0x0][0x148] ;       /* 0x4c98078005270002 */
        /*00d0*/                   BFI R5, R3, 0x810, R0 ;       /* 0x36f0000081070305 */
        /*00d8*/                   MOV R3, c[0x0][0x14c] ;       /* 0x4c98078005370003 */
                                                                 /* 0x001ffc00fe2007e2 */
        /*00e8*/                   BFI R4, R4, 0x818, R5 ;       /* 0x36f0028081870404 */
        /*00f0*/                   STG.E [R2], R4 ;              /* 0xeedc200000070204 */
        /*00f8*/                   EXIT ;                        /* 0xe30000000007000f */
                                                                 /* 0x001f8000fc0007ff */
        /*0108*/                   BRA 0x100 ;                   /* 0xe2400fffff07000f */
        /*0110*/                   NOP;                          /* 0x50b0000000070f00 */
        /*0118*/                   NOP;                          /* 0x50b0000000070f00 */
                                                                 /* 0x001f8000fc0007e0 */
        /*0128*/                   NOP;                          /* 0x50b0000000070f00 */
        /*0130*/                   NOP;                          /* 0x50b0000000070f00 */
        /*0138*/                   NOP;                          /* 0x50b0000000070f00 */
                ..........



Fatbin ptx code:
================
arch = sm_52
code version = [7,4]
producer = <unknown>
host = linux
compile_size = 64bit
compressed
$

The SASS code appears to be about as I would expect, roughly the same length as the C code, ignoring the LDC and STG instructions.

CodePudding user response：

There is no existing intrinsic __vmulus8() in CUDA. However, it can be emulated using existing intrinsics. Basically, we can pack the four 16-bit products of four 8-bit quantities using two 32-bit variable to hold them. Then clamp each product to 255, and extract the least-significant byte of each product into the final result with the help of the permute operation. The code generated by CUDA 11 for compute capabilities >= 7.0 looks reasonable. Whether the performance is sufficient will depend on the use case. If this operation occurs in the middle of a processing pipeline computing with packed bytes, that should be the case.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

/* byte-wise multiply with unsigned saturation */
__device__ unsigned int vmulus4 (unsigned int a, unsigned int b)
{
    unsigned int plo, phi, mlo, mhi, res;
    // compute products
    plo = ((a & 0x000000ff) * (b & 0x000000ff)   
           (a & 0x0000ff00) * (b & 0x0000ff00));
    phi = (__umulhi (a & 0x00ff0000, b & 0x00ff0000)   
           __umulhi (a & 0xff000000, b & 0xff000000));
    // clamp products to 255
    mlo = __vcmpgtu2 (plo, 0x00ff00ff);
    mhi = __vcmpgtu2 (phi, 0x00ff00ff);
    plo |= mlo;
    phi |= mhi;
    // extract least significant eight bits of each product
    res = __byte_perm (plo, phi, 0x6420);
    return res;
}

__global__ void kernel (unsigned int a, unsigned int b, unsigned int *res)
{
    *res = vmulus4 (a, b);
}

unsigned int vmulus4_ref (unsigned int a, unsigned int b)
{
    unsigned char a0, a1, a2, a3, b0, b1, b2, b3;
    unsigned int p0, p1, p2, p3;
    a0 = (a >>  0) & 0xff;
    a1 = (a >>  8) & 0xff;
    a2 = (a >> 16) & 0xff;
    a3 = (a >> 24) & 0xff;
    b0 = (b >>  0) & 0xff;
    b1 = (b >>  8) & 0xff;
    b2 = (b >> 16) & 0xff;
    b3 = (b >> 24) & 0xff;
    p0 = (unsigned int)a0 * (unsigned int)b0;
    p1 = (unsigned int)a1 * (unsigned int)b1;
    p2 = (unsigned int)a2 * (unsigned int)b2;
    p3 = (unsigned int)a3 * (unsigned int)b3;
    if (p0 > 255) p0 = 255;
    if (p1 > 255) p1 = 255;
    if (p2 > 255) p2 = 255;
    if (p3 > 255) p3 = 255;
    return (p0 << 0)   (p1 << 8)   (p2 << 16)   (p3 << 24);
}

// George Marsaglia's KISS PRNG, period 2**123. Newsgroup sci.math, 21 Jan 1999
// Bug fix: Greg Rose, "KISS: A Bit Too Simple" http://eprint.iacr.org/2011/007
static uint32_t kiss_z=362436069, kiss_w=521288629;
static uint32_t kiss_jsr=123456789, kiss_jcong=380116160;
#define znew (kiss_z=36969*(kiss_z&65535) (kiss_z>>16))
#define wnew (kiss_w=18000*(kiss_w&65535) (kiss_w>>16))
#define MWC  ((znew<<16) wnew )
#define SHR3 (kiss_jsr^=(kiss_jsr<<13),kiss_jsr^=(kiss_jsr>>17), \
              kiss_jsr^=(kiss_jsr<<5))
#define CONG (kiss_jcong=69069*kiss_jcong 1234567)
#define KISS ((MWC^CONG) SHR3)

int main (void)
{
    unsigned int *resD = 0;
    unsigned int a, b, res, ref;

    cudaMalloc ((void**)&resD, sizeof resD[0]);
    for (int i = 0; i < 1000000; i  ) {
        a = KISS;
        b = KISS;
        kernel<<<1,1>>>(a, b, resD);
        cudaMemcpy (&res, resD, sizeof res, cudaMemcpyDeviceToHost);
        ref = vmulus4_ref (a, b);
        if (res != ref) {
            printf ("error: a=x b=x res=x ref=x\n", a, b, res, ref);
            return EXIT_FAILURE;
        }
    }
    cudaFree (resD);
    return EXIT_SUCCESS;
}