Trying to use this feature, I wrote a small example on Windows x64 to implement this. In this example, kernel "directly" accesses disk space. Actually, as @RobertCrovella mentioned previously, the operating system is doing the job, with probably some CPU work; but no supplemental coding.
__global__ void kernel(int4* ptr)
{
int4 val ; val.x = threadIdx.x ; val.y = blockDim.x ; val.z = blockIdx.x ; val.w = gridDim.x ;
ptr[threadIdx.x + blockDim.x * blockIdx.x] = val ;
ptr[160*1024*1024 + threadIdx.x + blockDim.x * blockIdx.x] = val ;
}
#include "Windows.h"
int main()
{
// 4GB - larger than installed GPU memory
size_t size = 256 * 1024 * 1024 * sizeof(int4) ;
HANDLE hFile = ::CreateFile ("GPU.dump", (GENERIC_READ | GENERIC_WRITE), 0, 0, CREATE_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL) ;
HANDLE hFileMapping = ::CreateFileMapping (hFile, 0, PAGE_READWRITE, (size >> 32), (int)size, 0) ;
void* ptr = ::MapViewOfFile (hFileMapping, FILE_MAP_ALL_ACCESS, 0, 0, size) ;
::cudaSetDeviceFlags (cudaDeviceMapHost) ;
cudaError_t er = ::cudaHostRegister (ptr, size, cudaHostRegisterMapped) ;
if (cudaSuccess != er)
{
printf ("could not register\n") ;
return 1 ;
}
void* d_ptr ;
er = ::cudaHostGetDevicePointer (&d_ptr, ptr, 0) ;
if (cudaSuccess != er)
{
printf ("could not get device pointer\n") ;
return 1 ;
}
kernel<<<256,256>>> ((int4*)d_ptr) ;
if (cudaSuccess != ::cudaDeviceSynchronize())
{
printf ("error in kernel\n") ;
return 1 ;
}
if (cudaSuccess != ::cudaHostUnregister (ptr))
{
printf ("could not unregister\n") ;
return 1 ;
}
::UnmapViewOfFile (ptr) ;
::CloseHandle (hFileMapping) ;
::CloseHandle (hFile) ;
::cudaDeviceReset() ;
printf ("DONE\n");
return 0 ;
}