Home > Net >  Looking for the absolute FASTEST way to write integers as individual digits - chars to a file in C -
Looking for the absolute FASTEST way to write integers as individual digits - chars to a file in C -

Time:01-05

I'm working on a program in C where the main objective is absolute speed - it's a code performance competition. There are more ways to speed up the program, however, the largest speedup potential is in I/O operations, specifically, saving to text file. The file is structured as follows: 3 integers of arbitrary digit count per line, separated by whitespaces. The integers are known beforehand, they just need to be converted to a string and written to the output buffer.

The integers only range from -1 to INT_MAX.

The buffer size varies (I set it) based on the data being written but most of the time, the written file size is in orders of 100s of megabytes to something over a gigabyte and buffer is between 4 and 8 MB. The main write loop is this:

    int fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR);
    const size_t w_bufsize = get_bufsize(param);
    void *buf = NULL;
    posix_memalign(&buf, sysconf(_SC_PAGE_SIZE), w_bufsize)
    posix_fadvise(fd, 0, 0, POSIX_FADV_NOREUSE);
    size_t num_written = 0;
    size_t f_idx = 0;
    for (int i = 0; i < num_ints;   i) {
        myStruct *str = main_struct->structs   i;
        f_idx = fast_write_3_ints(buf, str->int1, str->int2, str->int3, f_idx);
        if (f_idx   BYTES_PER_ROW > w_bufsize) {
            write(fd, buf, f_idx) != f_idx
            if (num_written)
                posix_fadvise(fd, (num_written - 1) * w_bufsize, w_bufsize,
                              POSIX_FADV_DONTNEED);
            f_idx = 0;
              num_written;
        }

(Return value checking and frees/closes abbreviated for readability)

For converting the integers to text, I use this method: https://kenny-peng.com/2021/05/28/printing_integers_fast.html I further improved it by bypassing the temporary buffer and memcpy-ing the characters directly to the output buffer (10-15 % perf increase on my machine). Here is abbreviated (where possible) version of my code

size_t fast_write_3_ints(char *out_buf, int num1, int num2, int num3,
                         size_t idx)
{
    char *temp_ptr = NULL;
    int n_digits = 0;
    if (num1 < 0) {
        out_buf[idx  ] = '-';
        num1 = -num1;
    }
    if (num1 < 10) {
        out_buf[idx  ] = num1   '0';
    } else {
        idx  = count_digits(num1);
        temp_ptr = out_buf   idx;
        for (; num1 >= 1000; num1 /= 1000) {
            temp_ptr -= 3;
            lookup_digits(temp_ptr, num1 % 1000, 3);
        }
        if (num1) {
            num1 %= 1000;
            n_digits = count_digits(num1);
            lookup_digits(temp_ptr - n_digits, num1, n_digits);
        }
    }
    out_buf[idx  ] = ' ';
    // write int 2 and 3 - abbreviated
    out_buf[idx  ] = '\n';
    return idx;
}

static void lookup_digits(char *arr, int num, char write_size)
{
    static const char table[3000] __attribute__((aligned(64))) =
        "000001002003004005006007008009"
        "010011012013014015016017018019"
        "020021022023024025026027028029"
        "030031032033034035036037038039"
        "040041042043044045046047048049"
        "050051052053054055056057058059"
        "060061062063064065066067068069"
        "070071072073074075076077078079"
        "080081082083084085086087088089"
        "090091092093094095096097098099"
        "100101102103104105106107108109"
        "110111112113114115116117118119"
        "120121122123124125126127128129"
        "130131132133134135136137138139"
        "140141142143144145146147148149"
        "150151152153154155156157158159"
        "160161162163164165166167168169"
        "170171172173174175176177178179"
        "180181182183184185186187188189"
        "190191192193194195196197198199"
        "200201202203204205206207208209"
        "210211212213214215216217218219"
        "220221222223224225226227228229"
        "230231232233234235236237238239"
        "240241242243244245246247248249"
        "250251252253254255256257258259"
        "260261262263264265266267268269"
        "270271272273274275276277278279"
        "280281282283284285286287288289"
        "290291292293294295296297298299"
        "300301302303304305306307308309"
        "310311312313314315316317318319"
        "320321322323324325326327328329"
        "330331332333334335336337338339"
        "340341342343344345346347348349"
        "350351352353354355356357358359"
        "360361362363364365366367368369"
        "370371372373374375376377378379"
        "380381382383384385386387388389"
        "390391392393394395396397398399"
        "400401402403404405406407408409"
        "410411412413414415416417418419"
        "420421422423424425426427428429"
        "430431432433434435436437438439"
        "440441442443444445446447448449"
        "450451452453454455456457458459"
        "460461462463464465466467468469"
        "470471472473474475476477478479"
        "480481482483484485486487488489"
        "490491492493494495496497498499"
        "500501502503504505506507508509"
        "510511512513514515516517518519"
        "520521522523524525526527528529"
        "530531532533534535536537538539"
        "540541542543544545546547548549"
        "550551552553554555556557558559"
        "560561562563564565566567568569"
        "570571572573574575576577578579"
        "580581582583584585586587588589"
        "590591592593594595596597598599"
        "600601602603604605606607608609"
        "610611612613614615616617618619"
        "620621622623624625626627628629"
        "630631632633634635636637638639"
        "640641642643644645646647648649"
        "650651652653654655656657658659"
        "660661662663664665666667668669"
        "670671672673674675676677678679"
        "680681682683684685686687688689"
        "690691692693694695696697698699"
        "700701702703704705706707708709"
        "710711712713714715716717718719"
        "720721722723724725726727728729"
        "730731732733734735736737738739"
        "740741742743744745746747748749"
        "750751752753754755756757758759"
        "760761762763764765766767768769"
        "770771772773774775776777778779"
        "780781782783784785786787788789"
        "790791792793794795796797798799"
        "800801802803804805806807808809"
        "810811812813814815816817818819"
        "820821822823824825826827828829"
        "830831832833834835836837838839"
        "840841842843844845846847848849"
        "850851852853854855856857858859"
        "860861862863864865866867868869"
        "870871872873874875876877878879"
        "880881882883884885886887888889"
        "890891892893894895896897898899"
        "900901902903904905906907908909"
        "910911912913914915916917918919"
        "920921922923924925926927928929"
        "930931932933934935936937938939"
        "940941942943944945946947948949"
        "950951952953954955956957958959"
        "960961962963964965966967968969"
        "970971972973974975976977978979"
        "980981982983984985986987988989"
        "990991992993994995996997998999";
        
    memcpy(arr, table   3 * num   3 - write_size, write_size);
}

static int count_digits(int num)
{
    if (num < 100000)
        if (num < 1000)
            if (num < 100)
                if (num < 10)
                    return 1;
                else
                    return 2;
            else
                return 3;
        else if (num < 10000)
            return 4;
        else
            return 5;
    else if (num < 10000000)
        if (num < 1000000)
            return 6;
        else
            return 7;
    else if (num < 100000000)
        return 8;
    else if (num < 1000000000)
        return 9;
    else
        return 10;
}

This is the main production code right now. Below I describe what alternatives I tried and how it turned out.

I also have to note that my computer is a 14" Macbook Pro with the M1 Pro chip and very fast SSD, which makes IO operations totally negligible compared to the main computation. However, the evaluation server/machine is of very different specs (likely), and there, saving the file is by far the slowest bit. I also noted that some changes made it perform better on my machine but worse on the actual evaluator (likely cache size/memory speed dependent).

I also tried implementing lookup-free int-to-string processing as described here: https://johnnylee-sde.github.io/Fast-unsigned-integer-to-string/ this did not improve performance by more than run-to-run variance on my machine.

I also tried extending the table to the 4*10000 numbers, but it improved performance on my machine by only 3-5 % and actually made it a little worse in the evaluation system (likely a lot slower CPU/memory).

Is there anything else I can optimize for? I am running out of ideas. The historically fastest version of the code saves to the file 18 % faster than my implementation.

A thread solving the exact some problem but with different functions that are (in my eyes) slower and perform a lot more ops? The fastest way to save graph to file in C Or should I attempt to integrate the single large buffer routine into my algorithm and write in st_blksize sized buffers instead? Thanks so much for any help or suggestions

EDIT: Function that determines output buffer size (consider param to be the amount of lines to be written)

size_t get_bufsize(int param)
{
    size_t bufsize = 4096;
    if (param >= 1000 && param < 10000)
        bufsize <<= 4;
    else if (param >= 10000 && param < 100000)
        bufsize <<= 6;
    else if (param >= 100000 && param < 1000000)
        bufsize <<= 8;
    else if (param >= 1000000 && param <= 5000000)
        bufsize <<= 10;
    else if (param > 5000000)
        bufsize <<= 11;
    //  printf("Buffer size: %zu\n", bufsize);
    return bufsize;
}

EDIT 2: The integers only range from -1 to INT_MAX.

CodePudding user response:

Here are some directions to try and improve you code efficiency:

  • if running on a legacy system, you should specify O_BINARY to ensure the write system call does not perform some system specific conversion.

  • when flushing the buffer to disk, you should try and only write a whole number of pages and shift the remaining chunk to the beginning of the buffer. Allocating a decent number of 4K pages plus some slack and writing the 4K pages is a better approach to allocating a huge number of pages and issuing partial writes.

  • Your function fast_write_3_ints has a redundant statement num1 %= 1000; as well as the if (num1) test. It and can be further simplified to improve speed on small values:

size_t fast_write_3_ints(char *out_buf, int num1, int num2, int num3,
                         size_t idx)
{
    char *temp_ptr;
    int n_digits;

    if (num1 < 0) {
        out_buf[idx  ] = '-';
        num1 = -num1;
    }
    if (num1 < 1000) {
        if (num1 < 10) {
            out_buf[idx  ] = num1   '0';
        } else {
            n_digits = 2   (num1 >= 100);
            lookup_digits(out_buf   idx, num1, n_digits));
            idx  = n_digits;
        }
    } else {
        n_digits = count_digits(num1);
        idx  = n_digits;
        temp_ptr = out_buf   idx;
        while (n_digits > 3) {
            int digits = num1 % 1000;
            num1 /= 1000;  // group division and modulo
            temp_ptr -= 3;
            lookup_digits(temp_ptr, digits, 3);
            n_digits -= 3;
        }
        lookup_digits(temp_ptr - n_digits, num1, n_digits);
    }
    out_buf[idx  ] = ' ';
    // write int 2 and 3 - abbreviated
    out_buf[idx  ] = '\n';
    return idx;
}
  • using branchless code for count_digits might get you some speed gains:
static int count_digits(int num) {
    return 1   (num > 9)     (num > 99)         (num > 999)  
           (num > 9999)      (num > 99999)      (num > 999999)  
           (num > 9999999)   (num > 99999999)   (num > 999999999);
}

CodePudding user response:

int vs. int_fast32_t

Rather than int, consider int_fast32_t as potentially a 64-bit type may be faster.

Avoid interval tests with 2 values

A little improvement perhaps with a simplified if tree.

Also, favor testing large values first as more likely to match.

uint_fast32_t get_bufsize(int param) {
    #define  BLOCK ((uint_fast32_t) 4096)
    if (param >= 5000000) {
        return BLOCK << 11;
    }
    if (param >= 1000000) {
        return BLOCK << 10;
    }
    if (param >= 100000) {
        return BLOCK << 8;
    }
    if (param >= 10000) {
        return BLOCK) << 6;
    }
    if (param >= 1000) {
        return BLOCK << 4;
    }
    return BLOCK;
}

unsigned vs. int

I have never encounter using int faster than unsigned, yet using unsigned has some potential for faster code. Something to try. After if (num1 < 0) test, code could move to unsigned math and maybe see a marginal improvement.


I doubt any of these will dramatically improve, yet may nudge toward a faster code.

CodePudding user response:

If you're trying to optimise to avoid unnecessarily executing code AND the only negative value is -1, change:

if (num1 < 0) {
    out_buf[idx  ] = '-';
    num1 = -num1;
}
if (num1 < 10) {
    out_buf[idx  ] = num1   '0';
} else {

to

if (num1 < 10) {
    if (num1 < 0) num = 1, out_buf[idx  ] = '-';
    out_buf[idx  ] = num1   '0';
} else {

Further, it seems you try to handle the residual 1,2or3 digits in some special case. This is unnecessary.

The example code below "borrows" the branchless function from @chqrlie. It also computes double/triple digits instead of indexing into a LUT. Think about that LUT... Slice off the first 100 values into a second "two digit" function, trim the leading zeros, and stop performing arcane calculations on pointers and counts. (I'm not suggesting you use these functions. Too much arithmetic happening. You could use two distinct conversion functions... or not.) Finally, this example only deals with positive numbers and only translates one.

void lookup_2_digits( char *p, int n ) { // Use a LUT... I didn't for this example
    p[1] = (char)(n % 10   '0'); n /= 10;
    p[0] = (char)(n   '0');
}

void lookup_3_digits( char *p, int n ) { // Use a LUT... I didn't for this example
    p[2] = (char)(n % 10   '0'); n /= 10;
    p[1] = (char)(n % 10   '0'); n /= 10;
    p[0] = (char)(n   '0');
}

int count_digits(int n) {
    return 1  (n > 9)         (n > 99)         (n > 999)
              (n > 9999)      (n > 99999)      (n > 999999)
              (n > 9999999)   (n > 99999999)   (n > 999999999);
}

void doit( int num1 ) {
    char out_buf[512] = {0};
    int idx = 0;
    idx  = count_digits( num1 );
    char *temp_ptr = out_buf   idx;
    do {
        if( num1 <= 99 ) {
            if( num1 <= 9 )
                /* Can deal with -1 here */
                *--temp_ptr = num1   '0';
            else
                lookup_2_digits( temp_ptr-2, num1 );
            num1 = 0;
        } else {
            lookup_3_digits( temp_ptr -= 3, num1 % 1000 );
            num1 /= 1000;
        }
    } while( num1 > 0 );
    puts( out_buf );
}

int main( void ) {
    doit( 2165536 );
    return 0;
}
  • Related