I am a beginner with Halide. I would like to compute a result image from a stack of input images, and I want to accelerate it using the GPU (currently I am using CUDA as the target). For example, I want to calculate the atan2 of 4 images such as below:
Func result;
result(x,y) = Halide::atan2(images(x,y,3) - images(x,y,1), images(x,y,0) - images(x,y,2));
I am having problems with this. It produces either an empty result array or a result array with bogus values.
Note that it produces correct results if I modify the function to process only a single image and pass in a two-dimensional input buffer, but as soon as I make the input array three-dimensional and send in a stack of N images, something goes wrong.
Is there an error in my code? Is there something special I need to do when I pass in a 3D buffer (with N images) but only loop over x and y in the Halide function?
See full code below:
#include "Halide.h"
#include <stdio.h>
#include <math.h>
#include <string>
#include <fstream>
#include <sstream>
using namespace Halide;
void generatePhaseImage(Halide::Buffer<float> image, float phi)
{
for (int y=0; y<image.height(); y++)
{
for (int x=0; x<image.width(); x++)
{
image(x,y) = 128.0f + 128.0f * sin((2.0f*M_PI * static_cast<float>(x) / 64.0f) + phi);
}
}
}
void writeBinaryFile(const std::string& filename, const Buffer<float>& image)
{
std::ofstream f(filename.c_str(), std::ios::binary);
std::cout << "Writing image of height " << image.height() << " ";
std::cout << "and width " << image.width() << std::endl;
for (int i=0; i<image.height(); i++)
{
for (int j=0; j<image.width(); j++)
{
f.write(reinterpret_cast<const char*>(&image(i,j)), sizeof(float));
}
}
}
int main(int argc, char **argv)
{
Var x, y, c, i, ii, xo, yo, xi, yi;
int h = 100;
int w = 100;
Buffer<float> images(w, h, 4);
for (int i=0; i<4; i++)
{
float phi = i * (2*M_PI / 4.);
generatePhaseImage(images.sliced(2, i), phi);
}
Func phaseStepping;
phaseStepping(x,y) = Halide::atan2( images(x,y,3) - images(x,y,1), images(x,y,0) - images(x,y,2));
for (int i=0; i<4; i++)
{
std::stringstream ss;
ss << "image" << i <<".bin";
writeBinaryFile(ss.str(), images.sliced(2, i));
}
Target target = get_host_target();
target.set_feature(Target::CUDA);
target.set_feature(Target::Debug);
phaseStepping.gpu_tile(x, y, xo, yo, xi, yi, 8, 8);
phaseStepping.compile_jit(target);
Buffer<float> result(w,h);
phaseStepping.realize(result);
result.copy_to_host();
writeBinaryFile("result.bin", result);
}
The output of the code is shown below:
Writing image of height 100 and width 100
Writing image of height 100 and width 100
Writing image of height 100 and width 100
Writing image of height 100 and width 100
Entering Pipeline phaseStepping
Input Buffer images: buffer(0, 0x0, 0x564812eefd80, 0, float32, {0, 100, 1}, {0, 100, 100}, {0, 4, 10000})
Input (void *) __user_context: 0x7ffff380ac68
Output Buffer phaseStepping: buffer(0, 0x0, 0x56481325b200, 0, float32, {0, 100, 1}, {0, 100, 100})
CUDA: halide_cuda_initialize_kernels (user_context: 0x0, state_ptr: 0x7f76cbb2b000, ptx_src: 0x7f76cbb28340, size: 6919
load_libcuda (user_context: 0x0)
Loaded CUDA runtime library: libcuda.so
Got device 0
GeForce GTX 560
total memory: 959 MB
max threads per block: 1024
warp size: 32
max block size: 1024 1024 64
max grid size: 65535 65535 65535
max shared memory per block: 49152
max constant memory per block: 65536
compute capability 2.1
cuda cores: 7 x 48 = 48
cuCtxCreate 0 -> 0x5648134ab8d0(3020)
cuModuleLoadData 0x7f76cbb28340, 6919 -> 0x56481386f850
Time: 1.857735e+00 ms
halide_copy_to_device 0x564812e02a68, host: 0x56481325b200, dev: 0, host_dirty: 0, dev_dirty: 0
halide_device_malloc: 0x564812e02a68 interface 0x7f76cbb340f0 host: 0x56481325b200, dev: 0, host_dirty: 0, dev_dirty:0 buf current interface: 0x0
CUDA: halide_cuda_device_malloc (user_context: 0x7ffff380ac68, buf: 0x564812e02a68)
allocating buffer(0, 0x0, 0x56481325b200, 0, float32, {0, 100, 1}, {0, 100, 100})
cuMemAlloc 40000 -> 0x501700000
Time: 1.549260e-01 ms
halide_copy_to_device 0x564812dc6418, host: 0x564812eefd80, dev: 0, host_dirty: 0, dev_dirty: 0
halide_device_malloc: 0x564812dc6418 interface 0x7f76cbb340f0 host: 0x564812eefd80, dev: 0, host_dirty: 0, dev_dirty:0 buf current interface: 0x0
CUDA: halide_cuda_device_malloc (user_context: 0x7ffff380ac68, buf: 0x564812dc6418)
allocating buffer(0, 0x0, 0x564812eefd80, 0, float32, {0, 100, 1}, {0, 100, 100}, {0, 4, 10000})
cuMemAlloc 160000 -> 0x501800000
Time: 1.099330e-01 ms
CUDA: halide_cuda_run (user_context: 0x7ffff380ac68, entry: kernel_phaseStepping_s0_y_yo___block_id_y, blocks: 13x13x1, threads: 8x8x1, shmem: 0
Got context.
Got module 0x56481386f850
Got function 0x56481387b2b0
halide_cuda_run 0 4 [0x6400000064 ...] 0
halide_cuda_run 1 4 [0x64 ...] 0
halide_cuda_run 2 4 [0x0 ...] 0
halide_cuda_run 3 4 [0x6400000000 ...] 0
halide_cuda_run 4 4 [0x4000000064 ...] 0
halide_cuda_run 5 8 [0x501800000 ...] 1
halide_cuda_run 6 8 [0x501700000 ...] 1
halide_cuda_run translated arg5 [0x501800000 ...]
halide_cuda_run translated arg6 [0x501700000 ...]
Time: 4.394600e-02 ms
Exiting Pipeline phaseStepping
halide_copy_to_host 0x564812e02a68
copy_to_host_already_locked 0x564812e02a68 dev_dirty is true
CUDA: halide_cuda_copy_to_host (user_context: 0x0, buf: 0x564812e02a68)
c.extent[0] = 100
c.extent[1] = 100
cuMemcpyDtoH 0x501700000 -> 0x56481325b200, 40000 bytes
Time: 2.062520e-01 ms
Writing image of height 100 and width 100
halide_device_free: 0x564812e02a68 buf dev 21498953728 interface 0x7f76cbb340f0
CUDA: halide_cuda_device_free (user_context: 0x0, buf: 0x564812e02a68)
cuMemFree 0x501700000
Time: 7.846700e-02 ms
halide_device_free: 0x564812dc6418 buf dev 21500002304 interface 0x7f76cbb340f0
CUDA: halide_cuda_device_free (user_context: 0x0, buf: 0x564812dc6418)
cuMemFree 0x501800000
Time: 8.416100e-02 ms