Consider a string of 8 digit characters, e.g. "12345678".
It can be converted to a number in which every byte holds one digit value, like this:
const char* const str = "12345678";
const char* const base = "00000000";
const uint64_t unpacked = *reinterpret_cast<const uint64_t*>(str)
- *reinterpret_cast<const uint64_t*>(base);
Then unpacked
will be 0x0807060504030201
on a little-endian system.
What is the fastest way to convert the number into 12345678
, perhaps by multiplying it by some magic number or using SIMD up to AVX2?
UPDATE: 12345678
has to be a number stored in a 32-bit or 64-bit integer, not a string.
CodePudding user response:
If you change your input format to breadth-first element order like this:
Sample 9 numbers, interleaved
digit[]:
1 1 1 1 1 1 1 1 1 ... 2 2 2 2 2 2 2 2 2 ...
... 3 3 3 3 3 3 3 3 3 ....
// Pseudocode: parse num_parse values in batches of 9, accumulating one
// decimal place per pass over the interleaved digit[] stream.
for(int j=0; j<num_parse; j += 9)
{
    for(int i=0; i<9; i++)
    {
        value[i] +=
            (multiplier[i] *= 10)*
            (digit[i+j]-'0');
    }
    // value vector copied to output
    // clear value & multiplier vectors
}
And if you convert more than just 9 values, like 512 or 8192 with padding to any multiple of 32, compiler should vectorize it.
To prepare input, you can use 8 different channels, 1 per digit of every parsed value.
CodePudding user response:
size_t len = strlen(str);
uint64_t unpacked = 0;
// Pack one digit value per byte, first character ending up in the
// most significant byte.
for (size_t i = 0; i < len; i++) {
    uint8_t val = (uint8_t)(str[i] - '0');
    unpacked = (unpacked << 8) | val;
}
That will convert "12345678" into 0x0102030405060708
If you want the other endianness, replace the val =
statement in the loop above with this:
uint8_t val = (uint8_t)(str[len - i - 1] -'0');
You could also do the loop unrolling yourself and risk some potentially undefined behavior:
// NOTE(review): punning through a uint8_t* is permitted by the aliasing
// rules, but the byte order of the resulting integer depends on the host
// endianness (on little-endian, str[0] lands in the low-order byte).
uint8_t* ptr = (uint8_t*)(&unpacked);
ptr[0] = str[0] - '0';
ptr[1] = str[1] - '0';
ptr[2] = str[2] - '0';
ptr[3] = str[3] - '0';
ptr[4] = str[4] - '0';
ptr[5] = str[5] - '0';
ptr[6] = str[6] - '0';
ptr[7] = str[7] - '0';
CodePudding user response:
I've implemented a small program to test some ideas. AVX2 implementation is ~1.5 times faster than the naive, with the table implementation in the middle:
AVX2: 12345678 in 3.42759
Naive: 12345678 in 5.12581
Table: 12345678 in 4.49478
Source code:
#include <cstdint>
#include <cstdlib>
#include <ctime>
#include <immintrin.h>
#include <iostream>
using namespace std;
// Low-nibble mask — appears unused in the code below (likely left over
// from an earlier variant of parse_avx2).
const __m256i mask = _mm256_set1_epi32(0xf);
// Per-digit place values; lane 0 holds 10^7 for the most significant digit.
const __m256i mul = _mm256_setr_epi32(10000000, 1000000, 100000, 10000, 1000, 100, 10, 1);
// volatile keeps the benchmark loops from being optimized away.
const volatile char* str = "12345678";
volatile uint32_t h;  // result sink written by every parser
const int64_t nIter = 1000LL * 1000LL * 1000LL;  // iterations per benchmark
// AVX2 parser: widen the 8 digit bytes to 32-bit lanes, multiply each by
// its decimal place value, then horizontally sum the products into `h`.
inline void parse_avx2() {
    const char* const base = "00000000";
    // Subtracting "00000000" bytewise turns ASCII digits into values 0..9.
    const uint64_t unpacked = *reinterpret_cast<const volatile uint64_t*>(str)
                            - *reinterpret_cast<const uint64_t*>(base);
    const __m128i a = _mm_set1_epi64x(unpacked);
    const __m256i b = _mm256_cvtepu8_epi32(a);    // zero-extend 8 bytes -> 8 dwords
    const __m256i d = _mm256_mullo_epi32(b, mul); // digit * place value
    // Horizontal reduction: 256 -> 128 -> 64 -> 32 bits. Partial sums stay
    // well below 2^32, so the 64-bit add cannot carry across lane boundaries.
    const __m128i e = _mm_add_epi32(_mm256_extractf128_si256(d, 0), _mm256_extractf128_si256(d, 1));
    const uint64_t f0 = _mm_extract_epi64(e, 0);
    const uint64_t f1 = _mm_extract_epi64(e, 1);
    const uint64_t g = f0 + f1;
    h = (g >> 32) + (g & 0xffffffff);
}
// Scalar reference parser: multiply each digit byte by its decimal place
// value and accumulate the sum into `h`.
inline void parse_naive() {
    const char* const base = "00000000";
    const uint64_t unpacked = *reinterpret_cast<const volatile uint64_t*>(str)
                            - *reinterpret_cast<const uint64_t*>(base);
    // On a little-endian host a[0] holds the first (most significant) digit.
    const uint8_t* a = reinterpret_cast<const uint8_t*>(&unpacked);
    h = a[7] + a[6]*10 + a[5]*100 + a[4]*1000 + a[3]*10000
      + a[2]*100000 + a[1]*1000000 + a[0]*10000000;
}
// summands[p][d] == d * 10^p; filled in by main() before parse_table runs.
uint32_t summands[8][10];
// Table-lookup parser: each digit byte indexes a precomputed
// digit*place-value table and the eight lookups are summed into `h`.
inline void parse_table() {
    const char* const base = "00000000";
    const uint64_t unpacked = *reinterpret_cast<const volatile uint64_t*>(str)
                            - *reinterpret_cast<const uint64_t*>(base);
    const uint8_t* a = reinterpret_cast<const uint8_t*>(&unpacked);
    h = summands[7][a[0]] + summands[6][a[1]] + summands[5][a[2]] + summands[4][a[3]]
      + summands[3][a[4]] + summands[2][a[5]] + summands[1][a[6]] + summands[0][a[7]];
}
// Benchmark the three parsers; each run prints the parsed value (should
// be 12345678) followed by the elapsed time for nIter iterations.
int main() {
    clock_t start = clock();
    for (int64_t i = 0; i < nIter; i++) {
        parse_avx2();
    }
    clock_t end = clock();
    cout << "AVX2: " << h << " in " << double(end-start)/CLOCKS_PER_SEC << endl;

    start = clock();
    for (int64_t i = 0; i < nIter; i++) {
        parse_naive();
    }
    end = clock();
    cout << "Naive: " << h << " in " << double(end-start)/CLOCKS_PER_SEC << endl;

    // Fill summands[p][d] = d * 10^p. Must cover all ten digit values:
    // the original loop stopped at j<9, leaving summands[p][9] == 0, so
    // any input containing the digit 9 would parse incorrectly.
    uint32_t mul = 1;
    for (int i = 0; i < 8; i++, mul *= 10) {
        for (int j = 0; j < 10; j++) {
            summands[i][j] = j*mul;
        }
    }

    start = clock();
    for (int64_t i = 0; i < nIter; i++) {
        parse_table();
    }
    end = clock();
    cout << "Table: " << h << " in " << double(end-start)/CLOCKS_PER_SEC << endl;
    return 0;
}