-2

I was looking for questions related to my problem but only found questions regarding CRC32 reversing. My topic is a bit different.

I am a novice programmer and I have the following task. I have input data (3 strings of 4 bytes each). For this data, I know three checksums computed using a hash function similar to CRC32. However, it is not a standard CRC32, because the polynomial and the crcxor parameter differ from the default values and are unknown. So for each 4-byte input I calculate the CRC using every polynomial value from 0 to 0xFFFFFFFF and every crcxor value from 0 to 0xFFFF. I wrote this program in CUDA C because it runs faster on the GPU than on the CPU. This is my third CUDA C program, right after "Hello World" and "VectorAdd" :). Calculating all possible 0xFFFF x 0xFFFFFFFF variants takes about 5 hours on my NVIDIA GTX 1060 card. Is it possible to modify or optimize the following program code so that this task runs faster?

Ultimately, I would like to calculate 0xFFFFFFFF x 0xFFFFFFFF but I don't know yet if it can be done in a short time.

If anyone would like to have a look at my program code and provide valuable feedback, I would be extremely grateful.

 #include <stdio.h>
#include <cuda_runtime.h>
#include <cuda_profiler_api.h>


__device__ unsigned long calculate_crc(unsigned long data, unsigned long poly, unsigned long cxor)
// Bitwise 32-bit CRC of one 4-byte message, reduced to the fixed settings
// crcinit = 0, refin = 0, refout = 0, direct = 0.
//   data - the 4 message bytes, consumed from bit 31 down to bit 0
//   poly - generator polynomial
//   cxor - value XORed into the register after the last bit
// Returns the low 32 bits of the final register.
{
    const unsigned long top_bit = 0x80000000;
    unsigned long reg = 0;

    for (int pos = 31; pos >= 0; --pos)
    {
        // Feedback is the register's top bit XOR the incoming message bit.
        const bool reg_msb = (reg & top_bit) != 0;
        const bool msg_bit = ((data >> pos) & 1) != 0;

        reg <<= 1;
        if (reg_msb != msg_bit)
        {
            reg ^= poly;
        }
    }

    // Bits above 31 may have accumulated when unsigned long is wider than 32 bits.
    return (reg ^ cxor) & 0xFFFFFFFF;
}


// Brute-force search for the CRC parameters (polynomial, final XOR) that
// reproduce three known checksums of three known 4-byte messages.
//
// Launch layout: a 3D grid of 3D blocks.
//   x index -> upper 16 bits of the candidate polynomial
//   y index -> lower 16 bits of the candidate polynomial
//   z index -> candidate crcxor
// n bounds all three indices (each runs over 0 .. n-1). The polynomial is
// built as (x << 16) ^ y, so n is expected to be at most 0x10000.
// Every axis uses a grid-stride loop, so the whole n*n*n space is covered
// even when the launched grid is smaller than n along an axis; with a grid
// that already spans n, each thread executes exactly one iteration.
// Uses no shared memory. Matches are reported with device-side printf.
__global__ void calculate_crc_parameters(unsigned long n) 
{
    //Input data:
    const unsigned long data1 = 0x928F640C;
    const unsigned long data2 = 0x0121B30E;
    const unsigned long data3 = 0xCB652607;
    
    // calculated CRC for the above input data and for polynom 0xFF7A1DB7: crcinit: 0, crcxor: 0x00000000, refin: 0, refout: 0, direct: 0:
    // for these CRCs, the function should find the polynomial 0xFF7A1DB7 and crcxor = 0
    // finds it right away because crcxor = 0
    const unsigned long crc1 = 0x7076BCEB;  
    const unsigned long crc2 = 0x1F719D7A;  
    const unsigned long crc3 = 0x8369D986;
    
    // other example crc - for crcxor> 0
    // computed CRC for polynom 0xFF7A1DB7: crcinit: 0, crcxor: 0x000000FF, refin: 0, refout: 0, direct: 0:
    // for these CRCs, the function should find the polynomial 0xFF7A1DB7 and crcxor = 0x000000FF
    // Program find it after 1m 12sec.
    /*
    const unsigned long crc1 = 0x7076BC14;  
    const unsigned long crc2 = 0x1F719D85;  
    const unsigned long crc3 = 0x8369D979;
    */  
    
    // computed CRC for polynom 0xFF7A1DB7: crcinit: 0, crcxor: 0x0000FFFE, refin: 0, refout: 0, direct: 0:
    // for these CRCs, the function should find the polynomial 0xFF7A1DB7 and crcxor = 0x0000FFFE
    // searches for 5 hours
    /*
    const unsigned long crc1 = 0x70764315;  
    const unsigned long crc2 = 0x1F716284;  
    const unsigned long crc3 = 0x83692678;
    */
    
    // CRCs - polynom 0xFF7A1DB7: crcinit: 0, crcxor: 0xFF7A1DB7, refin: 0, refout: 0, direct: 0:
    // no implementation for 8-byte crcxor yet - and it would count for a long time
    /*
    const unsigned long crc1 = 0x8F0CA15C;  
    const unsigned long crc2 = 0xE00B80CD;  
    const unsigned long crc3 = 0x7C13C431;
    */

    // Index arithmetic is done in size_t so that blockIdx * blockDim and
    // index + stride cannot wrap around for large launch configurations.
    const size_t start_x  = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    const size_t stride_x = (size_t)blockDim.x * gridDim.x;
    const size_t start_y  = (size_t)blockIdx.y * blockDim.y + threadIdx.y;
    const size_t stride_y = (size_t)blockDim.y * gridDim.y;
    const size_t start_z  = (size_t)blockIdx.z * blockDim.z + threadIdx.z;
    const size_t stride_z = (size_t)blockDim.z * gridDim.z;

    for (size_t index_z = start_z; index_z < n; index_z += stride_z)
    {
        // crcxor will take the values of index_z that is from 0x0000 to 0xFFFF
        const unsigned int crcxor = (unsigned int)index_z;

        for (size_t index_y = start_y; index_y < n; index_y += stride_y)
        {
            for (size_t index_x = start_x; index_x < n; index_x += stride_x)
            {
                // "gluing" the polynomial:
                // to get polynom e.g. 0xFF7A1DB7 we have to "glue it" with index_x and index_y
                // if index_x == 0xFF7A then LSH by 16 places and we get 0xFF7A0000
                // then xor from index_y: 0xFF7A0000 xor 0x00001DB7 and is 0xFF7A1DB7
                const unsigned int polynom = ((unsigned int)index_x << 16) ^ (unsigned int)index_y;

                // compute three checksums and compare them; && short-circuits, so the
                // second and third CRC are only computed when the previous one matched.
                // polynom and crcxor are unsigned int, which is what %08X expects.
                if (calculate_crc(data1, polynom, crcxor) == crc1 &&
                    calculate_crc(data2, polynom, crcxor) == crc2 &&
                    calculate_crc(data3, polynom, crcxor) == crc3)
                {
                    // the calculated 3 crc sums agree with the known 3 crcs: display the parameters
                    printf("\nCRC parameters found ---> polynom: 0x%08X,  crcxor: 0x%08X\n", polynom, crcxor);
                }

                // Progress marker: one '@' for every 256th crcxor value, i.e. 256 marks
                // over the range 0x0000..0xFFFF. (A modulus of 0xFF would give 258.)
                if (((crcxor & 0xFF) == 0) && (polynom == 0xFFFFFFFF)) printf("@");
            }
        }
    }
}



// Host entry point: launches the parameter search over the full 16-bit x
// 16-bit polynomial space and reports any CUDA error.
// Returns 0 on success and 1 if the launch or the kernel execution failed.
int main(void)
{
    unsigned long N = 0x10000;          //  0xFFFF + 0x01 = 65536dec
    ////////////////////////////////////////////////
    // for computing only in X and Y axes - for crcxor = zero all the time
    // 16 * 16 = 256 threads per block: a multiple of the 32-thread warp size,
    // so no warp is launched partially filled (a 4 x 4 block uses half a warp).
    // 16 * 4096 = 0x10000, so x and y still cover 0 .. 65535 each.
    dim3 dimBlock(  16,    16,    1);
    dim3 dimGrid( 4096,  4096,    1);
    
    ////////////////////////////////////////////////
    // for computing on the X, Y and Z axes, i.e. for crcxor taking values from the Z axis from 0 to 65535
    //dim3 dimBlock(   4,     4,   64);             //  4 * 4 * 64 = 1024 --- maximum block size
    //dim3 dimGrid(16384, 16384, 1024);            //uncomment this 2 lines for crcxor > 0
    //               4      4    64
    //               *      *     *
    //           16384  16384  1024
    //               =      =     =
    //        0x10000 0x10000 0x10000
    //   x, y, and z will trigger 65,536 times each
    
    cudaProfilerStart();    
    calculate_crc_parameters<<<dimGrid, dimBlock>>>(N);

    // A kernel launch returns nothing: configuration errors are reported by
    // cudaGetLastError(), execution errors by the following synchronize.
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess)
    {
        status = cudaDeviceSynchronize();
    }

    // Stop the profiler while the device state still exists, then reset.
    cudaProfilerStop();
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
    }
    cudaDeviceReset();
    return (status == cudaSuccess) ? 0 : 1;
}

I compile it in cmd by: nvcc name.cu -o name

I work on win10 with Cuda Toolkit 11.5 Card is NVIDIA GTX 1060.

Could the use of pointers or memory allocations somehow speed up this program?

I compute the test CRC values here

rcgldr
  • 27,407
  • 3
  • 36
  • 61
th3r4t3l
  • 17
  • 5
  • You could start by using a sensible block size. By using 16 threads per block you are choosing to only engage 50% of the theoretical compute capacity of your GPU. The CUDA warp size is 32. Blocks sizes should always be a round multiple of the warp size – talonmies Nov 19 '21 at 02:18
  • So declaring dim variables like this: `dim3 dimBlock(32, 32,1); dim3 dimGrid(2048, 2048, 1);` is better than declaring `dim3 dimBlock( 4, 4, 1); dim3 dimGrid(16384, 16384, 1);` ??? In both cases it will be 65536 for x and y, right? – th3r4t3l Nov 21 '21 at 21:35
  • Yes, in the case where the threads per block is less than 32 – talonmies Nov 21 '21 at 23:41

1 Answers1

2

Optimization should begin with the algorithm, as opposed to optimizing a painfully pointless brute-force approach.

You can factor the search for a polynomial and a final exclusive-or, doing the polynomial first, and then (trivially) finding the exclusive-or value. All you need to do is take the exclusive-or of two of your data values, and then find the polynomial that produces the exclusive-or of the two CRCs of those values, assuming a zero final exclusive-or. You will need to try at least two pairs in order to narrow it down to one choice for the polynomial.

Once you have the polynomial, now compute the CRC on one of your data values, exclusive-or that with the desired CRC, and now you have your final exclusive-or value. No search needed for the second step.

The polynomial search is fast enough that you can just use your CPU. No GPU or CUDA or whatever is needed. It took 40 seconds on my three-year old laptop. You only need to try odd polynomials. Even polynomials are not valid.

Exclusive-oring the data and the CRCs also cancels the initial value. So you can find the polynomial this way for CRCs that have both a non-zero initial value and a non-zero final exclusive-or. However, in order to then solve for both the initial value and final exclusive-or, you will need examples with different length messages, i.e. other than all four-byte messages. There are $2^{32}$ possible combinations of initial value and final exclusive-or that will match any and all CRCs of four-byte messages.

As an aside, your CRC routine is needlessly complicated. See equivalent below. This prints poly = ff7a1db7, xor = 0000fffe:

#include <stdio.h>
#include <stdint.h>

/*
 * CRC of one 32-bit message with a zero initial value and no reflection.
 *
 * The message itself is used as the shift register: on each of the 32 steps
 * the register is shifted left by one and, if the bit shifted out was set,
 * XORed with poly. final_xor is XORed into the result at the end.
 *
 * The third parameter is deliberately not called "xor": that spelling is an
 * alternative operator token in C++, so the function would not compile when
 * pasted into a C++ or CUDA source file.
 */
uint32_t calculate_crc(uint32_t data, uint32_t poly, uint32_t final_xor) {
    for (int i = 0; i < 32; i++)
        data = data & 0x80000000 ? (data << 1) ^ poly : data << 1;
    return data ^ final_xor;
}

/*
 * Search every odd 32-bit polynomial for one that is consistent with three
 * (message, CRC) samples, and print each match together with the final
 * exclusive-or value derived from it.
 *
 * The samples are XORed pairwise before the comparison and the candidate
 * CRCs are computed with a zero final exclusive-or. For a matching
 * polynomial, the final exclusive-or is the CRC of the original second
 * message (computed with a zero final exclusive-or) XORed with its known CRC.
 */
void findp(uint32_t data1, uint32_t data2, uint32_t data3,
           uint32_t crc1, uint32_t crc2, uint32_t crc3) {
    /* Untouched second sample, needed later to derive the final XOR. */
    const uint32_t ref_data = data2;
    const uint32_t ref_crc = crc2;

    /* Pairwise differences. The third pair (3 ^ 2) has the same value as the
       second pair (2 ^ 3). */
    const uint32_t diff_data_a = data1 ^ data3, diff_crc_a = crc1 ^ crc3;
    const uint32_t diff_data_b = data2 ^ data3, diff_crc_b = crc2 ^ crc3;
    const uint32_t diff_data_c = data3 ^ ref_data, diff_crc_c = crc3 ^ ref_crc;

    /* Visit 1, 3, 5, ..., 0xFFFFFFFF exactly once each. */
    for (uint32_t candidate = 1; ; candidate += 2) {
        if (calculate_crc(diff_data_a, candidate, 0) == diff_crc_a &&
            calculate_crc(diff_data_b, candidate, 0) == diff_crc_b &&
            calculate_crc(diff_data_c, candidate, 0) == diff_crc_c) {
            const uint32_t final_value = calculate_crc(ref_data, candidate, 0) ^ ref_crc;
            printf("poly = %08x, xor = %08x\n", candidate, final_value);
        }
        if (candidate == 0xFFFFFFFF)
            break;
    }
}

/* Runs the polynomial search on three sample messages and their CRCs. */
int main(void) {
    const uint32_t messages[3] = { 0x928F640C, 0x0121B30E, 0xCB652607 };
    const uint32_t checksums[3] = { 0x70764315, 0x1F716284, 0x83692678 };

    findp(messages[0], messages[1], messages[2],
          checksums[0], checksums[1], checksums[2]);
    return 0;
}

There is an even faster, in fact massively faster, approach by solving a set of linear equations over GF(2). However it would take me longer than 40 seconds to write that code, so this is where I would stop. Unless I had many, many of these CRCs to find. Or unless I was trying to find, for example, a 64-bit CRC polynomial.

Mark Adler
  • 101,978
  • 13
  • 118
  • 158
  • @MarkAdler - with just one pair of samples xor'ed together, the issue is one equation, two variables (polynomial and the quotient), so more samples are needed. – rcgldr Nov 24 '21 at 05:08