You cannot do this with the current implementation of managed memory, because managed memory requires that the device have exclusive access to managed data while kernels are running. Host access to managed data while a kernel is running leads to undefined behavior, typically a seg fault.
This should be possible using zero-copy techniques (mapped, pinned host memory), however,
combined with the `volatile` recommendation from @Cicada.
Here's a worked example:
$ cat t736.cu
#include <stdio.h>
#include <unistd.h>
// Handshake kernel used to demonstrate host<->device signalling while the
// kernel is still running.
//   idata: flag read by the kernel; the kernel spins for as long as it reads 1.
//   odata: progress value written by the kernel (first the initial flag value,
//          then the post-release flag value plus 5).
// Both pointers are volatile so that every dereference below is a real memory
// access and cannot be hoisted out of the loop or kept in a register.
// Written for the <<<1,1>>> launch used in this file: every thread would do the
// same reads and writes, so there is no per-thread indexing.
__global__ void mykernel(volatile int *idata, volatile int *odata){
    // Report the flag value observed on entry.
    const int seenAtEntry = *idata;
    *odata = seenAtEntry;

    // Busy-wait until the flag is observed to be something other than 1.
    for (;;) {
        if (*idata != 1) break;
    }

    // Re-read the flag after release and report it, offset by 5.
    const int seenAfterRelease = *idata;
    *odata = seenAfterRelease + 5;
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `err` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t err, const char *what){
    if (err == cudaSuccess) return true;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
    return false;
}

// Demonstrates the host releasing a spin-wait "barrier" inside a running kernel
// through mapped, pinned host memory.
// Sequence: allocate two mapped ints, set the barrier flag, launch mykernel,
// observe its first write, clear the flag, observe its second write, then
// synchronize. Returns 0 on success and 1 if any CUDA call fails.
int main(){
    int *idata = NULL, *odata = NULL;
    int status = 1;

    // The host pointers are passed straight to the kernel below, which relies
    // on the mapped allocations being addressable from the device.
    if (cudaOk(cudaHostAlloc(&idata, sizeof(int), cudaHostAllocMapped), "cudaHostAlloc(idata)") &&
        cudaOk(cudaHostAlloc(&odata, sizeof(int), cudaHostAllocMapped), "cudaHostAlloc(odata)")) {
        *odata = 0;
        *idata = 1; // set barrier
        mykernel<<<1,1>>>(idata, odata);
        // A launch failure is only visible through cudaGetLastError(); without
        // this check the prints below would show stale values.
        if (cudaOk(cudaGetLastError(), "mykernel launch")) {
            sleep(1); // give the kernel time to start and perform its first write
            printf("odata = %d\n", *odata); // expect this to be 1
            *idata = 0; // release barrier
            sleep(1); // give the kernel time to see the release and write again
            printf("odata = %d\n", *odata); // expect this to be 5
            // if kernel is hung, we will hang here; otherwise this also
            // surfaces any error raised while the kernel was executing
            if (cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize")) {
                status = 0;
            }
        }
    }

    // Release the pinned allocations (only those that were actually obtained).
    if (odata != NULL) cudaFreeHost(odata);
    if (idata != NULL) cudaFreeHost(idata);
    return status;
}
$ nvcc -o t736 t736.cu
$ cuda-memcheck ./t736
========= CUDA-MEMCHECK
odata = 1
odata = 5
========= ERROR SUMMARY: 0 errors
$
The above assumes a 64-bit Linux environment (where unified virtual addressing lets the kernel use the mapped host pointers directly).