#include <stdio.h>
#include <iostream>
#include <string>
#include <chrono>
using namespace std;
const int p[9] = {1, 10, 100,
1000, 10000, 100000,
1000000, 10000000, 100000000};
class MyTimer {
private:
std::chrono::time_point<std::chrono::steady_clock> starter;
std::chrono::time_point<std::chrono::steady_clock> ender;
public:
void startCounter() {
starter = std::chrono::steady_clock::now();
}
double getCounter() {
ender = std::chrono::steady_clock::now();
return double(std::chrono::duration_cast<std::chrono::nanoseconds>(ender - starter).count()) /
1000000; // millisecond output
}
};
int convert1(char *a) {
int res = 0;
for (int i=0; i<9; i ) res = res * 10 a[i] - 48;
return res;
}
int convert2(char *a) {
return (a[0] - 48) * p[8] (a[1] - 48) * p[7] (a[2] - 48) * p[6]
(a[3] - 48) * p[5] (a[4] - 48) * p[4] (a[5] - 48) * p[3]
(a[6] - 48) * p[2] (a[7] - 48) * p[1] (a[8] - 48) * p[0];
}
int convert3(char *a) {
return (a[0] - 48) * p[8] a[1] * p[7] a[2] * p[6] a[3] * p[5]
a[4] * p[4] a[5] * p[3] a[6] * p[2] a[7] * p[1] a[8]
- 533333328;
}
const unsigned pu[9] = {1, 10, 100, 1000, 10000, 100000, 1000000, 10000000,
100000000};
int convert4u(char *aa) {
const unsigned char *a = (const unsigned char*) aa;
return a[0] * pu[8] a[1] * pu[7] a[2] * pu[6] a[3] * pu[5] a[4] * pu[4]
a[5] * pu[3] a[6] * pu[2] a[7] * pu[1] a[8] - (unsigned) 5333333328u;
}
int convert5(char* a) {
int val = 0;
for(size_t k =0;k <9; k) {
val = (val << 3) (val << 1) (a[k]-'0');
}
return val;
}
const unsigned pu2[9] = {100000000, 10000000, 1000000, 100000, 10000, 1000, 100, 10, 1};
int convert6u(char *a) {
return a[0]*pu2[0] a[1]*pu2[1] a[2]*pu2[2] a[3] * pu2[3] a[4] * pu2[4] a[5] * pu2[5] a[6] * pu2[6] a[7] * pu2[7] a[8] - (unsigned) 5333333328u;
}
using ConvertFunc = int(char*);
volatile int result = 0; // do something with the result of function to prevent unexpected optimization
void benchmark(ConvertFunc converter, string name, int numTest=100000) {
MyTimer timer;
const int N = 100000;
char a[9*N 1];
double runtime = 0;
for (int i=0; i<9*N; i ) a[i] = rand();
for (int t=1; t<=numTest; t ) {
// change something to prevent unexpected optimization
for (int i=0; i<N; i ) a[i] = rand() % 10 '0';
timer.startCounter();
for (int i=0; i<N; i = 9) result = converter(a i);
runtime = timer.getCounter();
}
cout << name << ": " << runtime << "ms\n";
}
int main() {
benchmark(convert1, "slow");
benchmark(convert2, "normal");
benchmark(convert3, "fast");
benchmark(convert4u, "unsigned");
benchmark(convert5, "shifting");
benchmark(convert6u, "reverse");
return 0;
}
I want to find the fastest way turn char a[9]
into an int
. The full problem is convert char a[15]
with form HHMMSSxxxxxxxxx timestamp to nanosecond, where ~50 bytes after the x
are allocated and can be safely read (but not write). We only care about the last 9 digits in this question.
Version 1 is basic, version 2,3 try to save some computation. I compile with -O3 flag, and storing power of 10s in array is fine because it is optimized away (checked using Godbolt).
How can I make this faster? Yes I know this sounds like premature optimization, but let's assume I need that final 2-3% boost.
Version 3 is decidedly faster, while version 2 is slower due to code size. But can we do better than version 3? Invalid benchmark
Edit 2: the function can return unsigned int
instead of int
(i.e
unsigned convert1(char *a);
Edit 3: I noticed that the new code is an invalid benchmark, since convert(a) is only executed once. Using the original code, the difference is only ~1%.
Edit 4: New benchmark. using unsigned (convert4u, convert6u) is consistently 3-5% faster than using int. I will run a long (10 min) benchmark to see if there's a winner. I've edited the code to use a new benchmark. It generates a large amount of data, then run the converter functions.
Edit 5: results: 4.19, 4.51, 3.82, 3.59, 7.64, 3.72
seconds. The unsigned version is fastest. Is it possible to use SIMD on just 9 bytes? If not, then I guess this is the best solution. I still hope there's a crazier solution, though
CodePudding user response:
An alternative candidate
Use unsigned
math to avoid UB of int
overflow and allow for taking all the - 48
out and then into a constant.
const unsigned p[9] = {1, 10, 100, 1000, 10000, 100000, 1000000, 10000000,
100000000};
int convert4u(const char *aa) {
const unsigned char *a = (const unsigned char*) aa;
return a[0] * p[8] a[1] * p[7] a[2] * p[6] a[3] * p[5] a[4] * p[4]
a[5] * p[3] a[6] * p[2] a[7] * p[1] a[8] - (unsigned) 5333333328u;
}
Perhaps also try ordering p[9]
like a[]
. Perhaps easier to parallel calculate. I see no down-side to re-ordering. Maybe a benefit, maybe not.
const unsigned p[9] = {100000000, 10000000, ..., 1};
int convert4u(const char *aa) {
const unsigned char *a = (const unsigned char*) aa;
return a[0]*p[0] a[1]*p[1] ... a[1]*p[1] a[8] - (unsigned) 5333333328u;
}
CodePudding user response:
I think the fastest code can be done by replacing slow multiplication operations with fast shift operations
Multiplication with 10 is the same as mutliplying a value with 8 and add the value multiplied by 2.
- And multiplying with 8 is the same as shift left by 3
- multiplying with 2 is the same as shift left by 1
This would result in a code like the below:
int convert4(char* a) {
int val = 0;
for(size_t k =0;k <9; k) {
val = (val << 3) (val << 1) (a[k]-'0');
}
return val;
}
I did not test it, but my guess is that this will be faster then all "multiplication-solutions".
Getting rid of the subtratcion in the loop, may gain additional speed.
Result so far on my machine:
With new benchmark: