I have CUDA code like this:
// Kernel that repeatedly runs an init / update / final sequence on the
// 3-byte literal "123". It takes no arguments and does not read
// threadIdx/blockIdx, so every launched thread performs the same work.
// The output buffer is local to each iteration and is never written
// anywhere visible outside the kernel.
// NOTE(review): `md_final` lacks the `md5_` prefix used by the other two
// helpers; the helpers are defined outside this snippet, so confirm the name.
__global__ void test()
{
    // The counter starts at 1, so the body executes MAX_LOOP - 1 times
    // (and not at all when MAX_LOOP <= 1).
    unsigned int iter = 1;
    while (iter < MAX_LOOP)
    {
        // Fresh context and output buffer for every iteration.
        MD5Ctx md5;
        unsigned char digest[32];

        md5_init(&md5);
        md5_update(&md5, "123", 3);
        md_final(&md5, digest);

        ++iter;
    }
}
Then I launch the kernel with only one thread like this:
// Launch the kernel on a single thread and collect its error status.
cudaError_t status;
test<<<1, 1>>>();

// A kernel launch returns nothing itself; errors detected at launch time
// (e.g. an invalid configuration) are reported by cudaGetLastError().
status = cudaGetLastError();
if (status == cudaSuccess)
{
    // Faults that occur while the kernel is executing only surface at the
    // next synchronizing call, so take the status from the sync itself.
    // cudaDeviceSynchronize() replaces the deprecated cudaThreadSynchronize().
    status = cudaDeviceSynchronize();
}
If I set MAX_LOOP to 3000, everything runs successfully. When I change MAX_LOOP to 5000 or larger, status comes back with the error cudaErrorLaunchFailure (4).