#include <cuda_runtime.h>

#include <chrono>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <random>
#include <string>

using namespace std;
// Simple stopwatch built on std::chrono.
// Uses steady_clock (monotonic) instead of system_clock: system_clock can
// jump when the OS adjusts the wall clock (NTP, DST), which would corrupt
// or even negate measured intervals. steady_clock is the correct clock for
// measuring elapsed time.
class MyTimer {
std::chrono::time_point<std::chrono::steady_clock> start;
public:
// Record the current instant as the reference point for all getCounter* calls.
void startCounter() {
start = std::chrono::steady_clock::now();
}
// Nanoseconds elapsed since the last startCounter().
int64_t getCounterNs() {
return std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::steady_clock::now() - start).count();
}
// Whole milliseconds elapsed since the last startCounter().
int64_t getCounterMs() {
return std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() - start).count();
}
// Milliseconds elapsed since the last startCounter(), with sub-millisecond
// precision (nanosecond count divided down to ms).
double getCounterMsPrecise() {
return std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::steady_clock::now() - start).count()
/ 1000000.0;
}
};
// Trivial device-printf kernel; definition at the bottom of the file.
__global__ void HelloWorld();
// Fills a[0..N) with random non-negative floats on the host; definition below.
void GenData(int N, float* a);
// Elementwise product output[i] = x[i] * y[i]; definition below.
__global__ void Multiply(int N, float* __restrict__ output, const float* __restrict__ x, const float* __restrict__ y);
//----------
// volatile sink: assigning timing/summation results here keeps the compiler
// from optimizing the measured work away.
volatile double dummy = 0;
// Check every CUDA runtime call; an earlier sticky error would otherwise make
// all later calls fail silently and corrupt the benchmark numbers.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            std::abort();                                                     \
        }                                                                     \
    } while (0)

// Benchmark: times the Multiply kernel over nloop iterations on unified-memory
// buffers. When `summing` is true, d_res is migrated back to the CPU each
// iteration (via prefetch, or the commented-out host read), demonstrating that
// a host-side access invalidates the GPU's page mappings and slows the next
// kernel launch. Prints the average per-iteration kernel time in ms.
void Test(bool summing)
{
MyTimer timer;
int N = 10000000;
float *d_x, *d_y, *d_res;
CUDA_CHECK(cudaMallocManaged(&d_x, N * sizeof(float)));
CUDA_CHECK(cudaMallocManaged(&d_y, N * sizeof(float)));
CUDA_CHECK(cudaMallocManaged(&d_res, N * sizeof(float)));
// Hint: let the CPU access d_res directly instead of migrating its pages.
// (Only honored on systems with directManagedMemAccessFromHost support.)
CUDA_CHECK(cudaMemAdvise(d_res, N * sizeof(float), cudaMemAdviseSetAccessedBy, cudaCpuDeviceId));
GenData(N, d_x);
GenData(N, d_y);
GenData(N, d_res);
// Move all three buffers onto device 0 up front so the first timed iteration
// does not pay first-touch page-fault cost.
CUDA_CHECK(cudaMemPrefetchAsync(d_x, N * sizeof(float), 0, 0));
CUDA_CHECK(cudaMemPrefetchAsync(d_y, N * sizeof(float), 0, 0));
CUDA_CHECK(cudaMemPrefetchAsync(d_res, N * sizeof(float), 0, 0));
CUDA_CHECK(cudaDeviceSynchronize());
//-------------------
int nloop = 100;
double cost = 0;
for (int t = 1; t <= nloop; t++) {
timer.startCounter();
Multiply<<<256,256>>>(N, d_res, d_x, d_y);
CUDA_CHECK(cudaGetLastError());  // launch-config errors surface here, not from the launch itself
CUDA_CHECK(cudaDeviceSynchronize());
dummy = timer.getCounterMsPrecise();
cost += dummy;
// This only reads data, and doesn't write.
// Why does it still invalidate d_res memory pages on the GPU?
// Is there any way to read data from d_res without making the next kernel call slower?
float sum = rand() % 1000;
if (summing) {
// either line below will make the next kernel slower
CUDA_CHECK(cudaMemPrefetchAsync(d_res, N * sizeof(float), cudaCpuDeviceId, 0));
//for (int i = 0; i < N; i++) sum += d_res[i];
}
CUDA_CHECK(cudaDeviceSynchronize());
dummy = sum;
}
cout << "Summing = " << summing << " cost = " << cost / nloop << "\n";
CUDA_CHECK(cudaFree(d_x));
CUDA_CHECK(cudaFree(d_y));
CUDA_CHECK(cudaFree(d_res));
}
// Entry point: seeds rand(), launches a trivial kernel to verify the device is
// usable, then runs the benchmark twice in each mode (interleaved) so warm-up
// effects can be ruled out.
int main()
{
srand(time(NULL));
HelloWorld<<<1,1>>>();
// Kernel launches don't return a status; fetch launch errors explicitly so a
// bad configuration or missing device fails loudly here, not mysteriously later.
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
std::cerr << "HelloWorld launch failed: " << cudaGetErrorString(err) << "\n";
return 1;
}
cudaDeviceSynchronize();  // also flushes the device-side printf
Test(false);
Test(true);
Test(false);
Test(true);
return 0;
}
//-----------------------------
//-----------------------------
//-----------------------------
// Trivial kernel: prints a greeting from the device. Device printf output is
// buffered and only appears at the next synchronizing call on the host.
__global__
void HelloWorld()
{
printf("Hello world\n");
}
// Fills a[0..N) with random non-negative floats in roughly [0, 999] on the
// host (CPU first-touch: with unified memory this also places the pages on
// the CPU until they are prefetched or faulted to the device).
//
// Fix: the original computed `float(rand() % 1000) / (rand() % 1000 + 1)` in
// one expression; the two rand() calls are unsequenced relative to each other,
// so the generated sequence was compiler-dependent. Draw numerator and
// denominator as separate statements for a well-defined, reproducible order.
void GenData(int N, float* a)
{
for (int i = 0; i < N; i++) {
int num = rand() % 1000;      // numerator drawn first, deterministically
int den = rand() % 1000 + 1;  // +1 guarantees no division by zero
a[i] = float(num) / float(den);
}
}
// Elementwise product: output[i] = x[i] * y[i] for every i in [0, N).
// Grid-stride loop, so any <<<grid, block>>> configuration covers the full
// range. __restrict__ on all pointers lets the compiler assume no aliasing.
__global__
void Multiply(int N, float* __restrict__ output, const float* __restrict__ x, const float* __restrict__ y)
{
const int step = blockDim.x * gridDim.x;
int idx = blockIdx.x * blockDim.x + threadIdx.x;
while (idx < N) {
output[idx] = x[idx] * y[idx];
idx += step;
}
}
Compile command: `nvcc -o main main.cu -O3 -std=c++17`, running on an RTX 2080 Ti.
Output:
Hello world
Summing = 0 cost = 0.237617
Summing = 1 cost = 12.1865
Summing = 0 cost = 0.235626
Summing = 1 cost = 11.8909
I have a bunch of Unified Memory (UM) GPU arrays (similar to Matlab's gpuArray). 99.9% of the computation/write operations are performed in GPU device memory; occasionally the result is read on the CPU.
I notice that merely reading a UM array on the CPU side causes the next kernel call on that array to become much slower. I assume this is due to some kind of page fault, though I thought page faults were usually triggered by writes rather than reads.
Why does this happen? How can I change the code so that reading from UM does not make the next kernel slower? (Edit: ideally without using cudaMemPrefetchAsync.)
Edit: also, the device property directManagedMemAccessFromHost reports 0 on both the 2080 Ti and an Nvidia A30. Which systems actually support that feature?