I'm trying to implement a metric that works on square 8x8 tiles of a grayscale image and produces 3 outputs per tile (accumulated gradient, tile maximum and tile minimum); each output is therefore an image of size (IMG_WIDTH/8, IMG_HEIGHT/8). In the implementation below the 3 results are computed separately, but I'd like to compute them all together; moreover, I cannot find a good schedule for the GPU.
#define IMAGE_WIDTH (1280)
#define IMAGE_HEIGHT (1024)
#define TILE_SIZE (8)
Halide::Buffer<uint8_t> input_image(IMAGE_WIDTH, IMAGE_HEIGHT);
Halide::Var xo, yo, xi, yi;
Halide::Func tiled_input;
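// View the image as 8x8 tiles: (xo, yo) selects the tile, (xi, yi) the pixel inside it.
// The clamp keeps reads inside the image when the xi±1 / yi±1 gradient taps below fall past the border.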
tiled_input(xi, yi, xo, yo) = input_image(Halide::clamp(xo * TILE_SIZE + xi, 0, input_image.width()-1),
Halide::clamp(yo * TILE_SIZE + yi, 0, input_image.height()-1));
Halide::Expr gradientX, gradientY;
gradientX = Halide::cast<int16_t>(tiled_input(xi+1, yi, xo, yo)) - Halide::cast<int16_t>(tiled_input(xi-1, yi, xo, yo));
gradientY = -(Halide::cast<int16_t>(tiled_input(xi, yi+1, xo, yo)) - Halide::cast<int16_t>(tiled_input(xi, yi-1, xo, yo)));
Halide::Expr agx, agy, m;
agx = Halide::abs(gradientX);
agy = Halide::abs(gradientY);
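// Combine the two gradients as max(|gx|, |gy|) + min(|gx|, |gy|) / 2, a cheap approximation of the gradient magnitude.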
m = Halide::select(agx > agy, agx+(agy/2), agy+(agx/2));
Halide::RDom tile_domain(0, TILE_SIZE, 0, TILE_SIZE);
Halide::Func tiled_output_x;
tiled_output_x(xi, yi, xo, yo) = Halide::cast<int16_t>(m);
Halide::Func accx;
Halide::Func tiled_output_max, tiled_output_min;
accx(xo, yo) = Halide::sum(tiled_output_x(tile_domain.x, tile_domain.y, xo, yo));
tiled_output_max(xo, yo) = Halide::maximum(tiled_input(tile_domain.x, tile_domain.y, xo, yo));
tiled_output_min(xo, yo) = Halide::minimum(tiled_input(tile_domain.x, tile_domain.y, xo, yo));
Halide::Buffer<int16_t> output_buffer_accx(input_image.width()/TILE_SIZE, input_image.height()/TILE_SIZE);
Halide::Buffer<uint8_t> output_buffer_max(input_image.width()/TILE_SIZE, input_image.height()/TILE_SIZE);
Halide::Buffer<uint8_t> output_buffer_min(input_image.width()/TILE_SIZE, input_image.height()/TILE_SIZE);
accx.realize(output_buffer_accx, Halide::get_host_target());
tiled_output_max.realize(output_buffer_max, Halide::get_host_target());
tiled_output_min.realize(output_buffer_min, Halide::get_host_target());
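To make "compute them all together" concrete, what I had in mind is a single Func returning a Tuple, so that one realize call produces all three per-tile results (untested sketch; tile_metrics is just a name I made up):

// Hypothetical combined version: one Func whose Tuple holds the three per-tile results.
Halide::Func tile_metrics;
tile_metrics(xo, yo) = Halide::Tuple(
    Halide::sum(tiled_output_x(tile_domain.x, tile_domain.y, xo, yo)),
    Halide::maximum(tiled_input(tile_domain.x, tile_domain.y, xo, yo)),
    Halide::minimum(tiled_input(tile_domain.x, tile_domain.y, xo, yo)));
Halide::Realization result = tile_metrics.realize(
    {input_image.width() / TILE_SIZE, input_image.height() / TILE_SIZE});
Halide::Buffer<int16_t> out_accx = result[0];
Halide::Buffer<uint8_t> out_max = result[1];
Halide::Buffer<uint8_t> out_min = result[2];

I'm not sure whether this actually shares the per-tile work between the three inline reductions, or whether compute_with or a common intermediate would be the better route.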
For the separate version above, a good CPU schedule can be the following:
accx.vectorize(xo, TILE_SIZE).parallel(yo);
tiled_output_max.vectorize(xo, TILE_SIZE).parallel(yo);
tiled_output_min.vectorize(xo, TILE_SIZE).parallel(yo);
But I cannot find an appropriate one for the GPU. Any help?
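For the GPU, the only thing I could come up with is mapping each output pixel (i.e. each 8x8 input tile) to one GPU thread, roughly like this (untested, and the 16x16 block size is just a guess):

// Hypothetical GPU schedule attempt: one thread per output pixel, 16x16 threads per block.
Halide::Target gpu_target = Halide::get_host_target();
gpu_target.set_feature(Halide::Target::CUDA); // or OpenCL / Metal
Halide::Var xio, yio;
accx.gpu_tile(xo, yo, xio, yio, 16, 16);
tiled_output_max.gpu_tile(xo, yo, xio, yio, 16, 16);
tiled_output_min.gpu_tile(xo, yo, xio, yio, 16, 16);
accx.realize(output_buffer_accx, gpu_target);
tiled_output_max.realize(output_buffer_max, gpu_target);
tiled_output_min.realize(output_buffer_min, gpu_target);
// Results live on the device after realize; copy them back before reading on the CPU.
output_buffer_accx.copy_to_host();
output_buffer_max.copy_to_host();
output_buffer_min.copy_to_host();

Is this the right direction, or should the 8x8 per-tile reduction be mapped onto threads differently?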