Home > Software engineering >  Parallel CRC32 in C
Parallel CRC32 in C

Time:06-06

I am trying to implement CRC32 calculation of file by splitting it in parts. I used algorithms and ideas from CRC Calculation Of A Mostly Static Data Stream. Unfortunately, my program gives incorrect answer, although it returns the same value of CRC regardless of number of parts. Please, tell me where the mistake is and what I do wrong. Here is the code of program:

#include <iostream>
#include <fstream>
#include <stdint.h>
#include <string>
#include <sstream>
#include <stdlib.h>
#include <stdio.h>

#include <pthread.h>


using namespace std;

struct data {
    
    pthread_t id;
    uint8_t *buf;
    long int start, end;
    long int num_zeros;
    uint32_t crc;
        
    
};


//Straight function
uint32_t crc_32(ifstream& input) {
    
    input.seekg(0, input.end);
    size_t size = input.tellg();
    input.seekg(0, input.beg);
    
    
    uint32_t polynomial = 0xEDB88320;
    uint32_t table[256];
    
    
    for(uint32_t i=0; i<=0xff; i  ) {
        
        
        
        uint32_t c = i;
        for (size_t j = 0; j < 8; j  ) 
        {
            if (c & 1) {
                c = polynomial ^ (c >> 1);
            }
            else {
                c >>= 1;
            }
        }
        table[i] = c;
        
    }
    
    uint32_t CRC = 0xffffffff;
    
    uint8_t buf;
    
    
    for(size_t i=0; i<size; i  ) {
        
        
        
        input.read( (char *) &buf, sizeof(buf));
        CRC = (CRC>>8) ^ table[(CRC ^ buf) & 0xff ];
        
    }
    
    CRC ^= 0xffffffff;
    
    return CRC;
    
    
}

// generate crc 

uint32_t GenCrc(data *work, long int beg, long int end, uint32_t *crctbl) {
    uint32_t init = 0x00000000;
    
    
    for(long int i = beg; i<end; i  ) {
        
        init = (init<<8)^crctbl[ (init>>24) ^ work->buf[i] ];
        
    }
    
    
    return init;
    
}




// (a*b)%crc 
uint32_t MpyModCrc(uint32_t a, uint32_t b) {
    
    uint32_t pd = 0;
    uint32_t i;
    for(i = 0; i < 32; i  ){
        pd = (pd<<1)^((0-(pd>>31))&0x04c11db7);
        pd ^= (0-(b>>31))&a;
        b <<= 1;
    }
    return pd;
    
}


// pow(2,p)%crc 
uint32_t PowModCrc(uint32_t p) {
    uint32_t prd = 0x1u;                    
    uint32_t sqr = 0x2u;                    
    while(p) {
        if(p&1)
            prd = MpyModCrc(prd, sqr);
        
        sqr = MpyModCrc(sqr, sqr);
        p >>= 1;
    }
    return prd;
}




void do_work(data *work) {
    
    //Generate lookup table:
    uint32_t polynomial = 0x04c11db7;
    
    uint32_t crctbl[256];
    uint32_t crc;
    uint32_t c;
    uint32_t i;
    
    for(c=0; c <256; c  ) {
        
        crc = c<<24;
        /*
        for(i=0; i <8; i  ) {
            
            if( (crc & 0x80000000) !=0) {
                crc <<= 1;
                crc ^= polynomial;
            }
            else {
                crc <<=1;
            }
        }
        */
        
        for(i=0; i<8; i  ) {
            crc = (crc<<1)^(0-(crc>>31))&polynomial;
        }
        
        crctbl[c] = crc;
    }
    
    
    
        
    uint32_t pmc;
    uint32_t crf;
    
    
    crf = GenCrc(work, work->start, work->end, crctbl);
    
    if(work->num_zeros > 0) {
    
        pmc = PowModCrc((work->num_zeros)*8);
        crf = MpyModCrc(crf, pmc);
    }
    
    work->crc = crf;

}






void *do_stuff(void *d) {
    
    data *mydata = (data*)d;
    do_work(mydata);
    
    return 0;
    
}


int main(int argc, char** argv) {



ifstream input("8733718.zip", ios::binary);

if(!input) {
    cerr << "Can't open file." <<endl;
    
}


input.seekg(0, input.end);
long int len = input.tellg();
input.seekg(0, input.beg);



uint8_t *buf = new uint8_t[len];

input.read( (char *) buf, len);

int k;
cout << "Enter number of parts: ";

if(!(cin >> k) || k<1) {
    cout << "Error. We need at least one part!" <<endl;
    return -1;
}


data *work = new data[k 1];

for(int i=0; i < k; i  ) {
    
    work[i].start = len*i;
    work[i].start /=k;
    
    work[i].end = len * (i 1);
    work[i].end /= k;
    
    work[i].num_zeros = len - work[i].end;
    work[i].buf = buf;
    
}

for(int i=0; i < k; i  ) {
    
    void *tmp = (void*)&work[i];
    pthread_create(&work[i].id, 0, do_stuff, tmp);
    
}

for(int i=0; i<k; i  ) {
    pthread_join(work[i].id, 0);
}


uint32_t crc = work[0].crc;


for(int i=1; i<k; i  ) {
    crc ^= work[i].crc;
}

delete [] buf;
delete [] work;









cout << "Straigth CRC_32 = ";
uint32_t result;
result = crc_32(input);
cout << hex << result;
cout <<endl <<endl;


cout << "Parallel CRC_32 = ";
uint32_t result2;
result2 = crc;
cout << hex << crc <<endl <<endl;

cout <<endl <<endl;
cout <<"=========================="<<endl<<endl;

input.close();

    return 0;
}

"Straight" function gives the answer which coincides with the answer of, for example, website https://emn178.github.io/online-tools/crc32_checksum.html. But "parallel" procedure gives another answer.

CodePudding user response:

crc_32() is a right shifting CRC that initial CRC = 0xffffffff and final CRC ^= 0xffffffff, while do_work() gencrc(), ... are using a left shifting CRC.

Change do_work() and gencrc() to be a modified version of crc_32() with initial CRC = 0, and final CRC not changed (don't use CRC ^= 0xffffffff). Change MpyModCrc() as answered by Mark Adler to be right shifting CRC. For PowModCrc(), change the initial values: uint32_t prd = 0x80000000u; and uint32_t sqr = 0x40000000u;, the value p is an integer, not a finite field number so it doesn't need to be changed.

After these changes try this code at the end (I haven't tested this yet):

    uint32_t crc = work[0].crc;
    for(int i=1; i<k; i  ) {
        crc ^= work[i].crc;
    }
    uint32_t pmc = PowModCrc(len*8);         // add initial CRC of 0xffffffff
    crc ^= MpyModCrc(0xffffffff, pmc);
    crc ^= 0xffffffff;                       // final_xor = 0xffffffff

If running on a PC in 64 bit mode, you could use a PCLMULQDQ based CRC to speed things up (probably to the point that multi-threading won't help much). You can search github for examples of this. The assembly files are a bit over 500 lines. I have 6 sets of these for Visual Studio | MASM (ML64.EXE). Link to to code for 32 bit CRC reflected. If using my code, you need to change the if defines from 0 to 1 to use CRC32 instead of CRC32C.

https://github.com/jeffareid/crc/tree/master/crc32r

CodePudding user response:

As rcgldr noted, you are mixing up reflected and non-reflected CRC calculations. The original CRC is reflected so you need to stick with that. You need to always be shifting right. You always need to use the reflected polynomial, as in the original, 0xedb88320.

So, GenCRC() needs to shift right, and use the low eight bits of the CRC (which you're calling init) instead of the high eight bits to get the index.

MpyModCrc() needs to shift right, use the low bit instead of the high bit for the decisions, and use the correct polynomial.

PowModCrc() is starting off with the wrong initial polynomials. Since the polynomials are reflected, the initial values need to be as well. So 1 (x0) is 0x80000000, and x (x1) is 0x40000000.

In do_work(), you need to generate the table exactly as you did crc_32(). Of course, why you're generating the exact same table in every thread, I have no idea. Do that once.

Lastly, having done all that, your threads will be computing the CRC with a zero initial value and zero final exclusive or. That's ok, so long as you then exclusive-or with the CRC of len zero bytes with an initial value of 0xffffffff, and then exclusive-or the final result with 0xffffffff to get the same effect. That is:

   crc ^= MpyModCrc(0xffffffff, PowModCrc(len * 8)) ^ 0xffffffff;

Alternatively, you could start the first work unit, and only that one, with an initial CRC of 0xffffffff. And then exclusive-or the final result with 0xffffffff.

Another improvement is to not calculate the CRC of so many zeros (even though it is an O(log n) calculation), and to only calculate the power function once. You can combine the CRCs as you join each thread, only needing PowModCrc() of your chunk size, calculating that just once, and applying it each time. And once more for the final chunk which may be smaller.

You don't need to read in the entire file and then do the CRC calculation. You should be calculating in parallel with reading. Instead of deciding how many pieces, decide on a fixed chunk size. Then read chunks and fire off threads for CRC calculations as you read them. Combine the CRCs as the threads complete, joining them in order. Limit the number of threads to something like twice the number of cores. You want to keep the cores busy, but you don't want the overhead of too many threads and too much memory in use.

The final alternative would be to not do any of this, and simply use zlib which provides the CRC combination functions for you.

  • Related