I'm supposed to write a fast GPU solution for 1-bit images (C++). In my opinion my code is correct, but for some reason when I submit my answer the system says
/box/is.cu:3:10: fatal error: cudacheck.h: No such file or directory
3 | #include "cudacheck.h"
| ^~~~~~~~~~~~~
compilation terminated.
The code is below. Can you see the possible mistake I'm making?
#include "is.h"
#include <cuda_runtime.h>
#include "cudacheck.h"
#include <vector>
static inline int jakolasku(int alku, int loppu) { return (alku + loppu - 1)/loppu; }
std::vector<float> summienlasku(int ny, int nx, int pny, int pnx, const float* data){
std::vector<float> sums(pnx*pny, 0.f);
for(int alk=0; alk<ny; ++alk){
for(int lop=0; lop<nx; ++lop){
sums[(lop+1) + pnx*(alk+1)] = data[3 * (lop+nx*alk)]
+ sums[(lop+1) + pnx*alk]
+ sums[lop + pnx*(alk+1)]
- sums[lop + pnx*alk];
}
}
return sums;
}
__global__ void nelio(int ny, int nx, int size, int pny, int pnx, const float* sums, float* mitat){
int leveys = threadIdx.x + blockIdx.x * blockDim.x;
int korkeus = threadIdx.y + blockIdx.y * blockDim.y;
if( !(0 < leveys && leveys <= nx) || !(0 < korkeus && korkeus <=ny) ) return;
int xsize = korkeus * leveys;
int ysize = size - xSize;
float xluku = 1.0f / (float) xSize;
float yluku = ySize == 0 ? 0.f : 1.0f / (float) ySize;
float lk = sums[pnx*pny-1];
float L = 0.f;
for(int y0=0; y0<=ny-korkeus; ++y0){
int y1 = y0 + korkeus;
for(int x0=0; x0<=nx-leveys; ++x0){
int x1 = x0 + leveys;
float s1 = sums[y1*pnx + x1];
float s2 = sums[y1*pnx + x0];
float s3 = sums[y0*pnx + x1];
float s4 = sums[y0*pnx + x0];
float xtoin = s1 - s2 - s3 + s4;
float ytoin = lk - xtoin;
float l = xtoin * xtoin * xluku + ytoin * ytoin * yluku;
if(l > L) L = l;
}
}
mitat[korkeus*pnx + leveys] = L;
}
struct Rectangle{ int width; int height; int size; };
Rectangle loydanelikulmio(int ny, int nx, int pnx, const float* rectdims){
float L = 0.f;
int width = 0, height = 0;
for(int l=1; l<=ny; ++l){
for(int v=1; v<=nx; ++v){
float lu = rectdims[l*pnx+v];
if (lu > L){
L = lu;
width = v;
height = l;
}
}
}
Rectangle rect = {width, height, width*height};
return rect;
}
struct SegmentResult{ int y0; int x0; int y1; int x1; float outer[3]; float inner[3]; };
SegmentResult loydaSegmentti(int ny, int nx, int pny, int pnx, Rectangle* rect, const float* sums){
int size = nx*ny;
float vluku = sums[pnx*pny-1];
int korkeus = rect->height;
int leveys = rect->width;
int xkoko = rect->size;
int ykoko = size - xkoko;
float xx = 1.0f / (float) xkoko;
float yy = ykoko == 0 ? 0.f : 1.0f / (float) ykoko;
float K = 0.f;
float nL = 0.f, bL = 0.f;
int xx0 = 0, xx1 = 0, yy0 = 0, yy1 = 0;
for(int y0=0; y0<=ny-korkeus; ++y0){
for(int x0=0; x0<=nx-leveys; ++x0){
int y1 = y0 + korkeus;
int x1 = x0 + leveys;
float s1 = sums[y1*pnx + x1];
float s2 = sums[y1*pnx + x0];
float s3 = sums[y0*pnx + x1];
float s4 = sums[y0*pnx + x0];
float vlukuu = s1 - s2 - s3 + s4;
float ylukuu = vluku - vlukuu;
float k = vlukuu * vlukuu * xx + ylukuu * ylukuu * yy;
if(k > K){
K = k;
nL = xlukuu;
bL = ylukuu;
xx0 = x0;
xx1 = x1;
yy0 = y0;
yy1 = y1;
}
}
}
nL *= xx;
bL *= yy;
SegmentResult tulos = { yy0, xx0, yy1, xx1, { bL, bL, bL }, {nL, nL, nL } };
return tulos;
}
Result segment(int ny, int nx, const float* data){
int laskux = nx+1, laskuy = ny+1;
std::vector<float> summa = summienlasku(ny, nx, laskuy, laskux, data);
float* smuuttuja = NULL;
CHECK(cudaMalloc((void**)&smuuttuja, laskux*laskuy*sizeof(float)));
float* hmuuttuja = NULL;
CHECK(cudaMalloc((void**)&hmuuttuja, laskux*laskuy*sizeof(float)));
CHECK(cudaMemcpy(smuuttuja, summa.data(), laskux*laskuy*sizeof(float), cudaMemcpyHostToDevice));
{
dim3 dimBlock(16, 16);
dim3 dimGrid(jakolasku(nx, dimBlock.x), jakolasku(ny, dimBlock.y));
nelio<<<dimGrid, dimBlock>>>(ny, nx, nx*ny, laskuy, laskux, smuuttuja, hmuuttuja);
CHECK(cudaGetLastError());
}
std::vector<float> rectdims(lasku*laskuy);
CHECK(cudaMemcpy(rectdims.data(), hmuuttuja, laskux*laskuy*sizeof(float), cudaMemcpyDeviceToHost));
Rectangle rect = loydaSegmentti(ny, nx, laskux, rectdims.data());
SegmentResult sr = loydaSegmentti(ny, nx, laskuy, laskux, &rect, summa.data());
Result result {
sr.y0,
sr.x0,
sr.y1,
sr.x1,
{ sr.outer[0], sr.outer[1], sr.outer[2] },
{ sr.inner[0], sr.inner[1], sr.inner[2] }
};
CHECK(cudaFree(smuuttuja)); CHECK(cudaFree(hmuuttuja));
return result;
}
I'm not sure if the mistake is about the "cuda" rows here in my code? Why does it give an error witht the #include "cudacheck.h" when I'm trying to submit my code?