
My question is actually about x, y, z dimensions.

The following CUDA program adds two N x N x N (N = 2^20) matrices:

#include <stdio.h>
#define N (1 << 20)
#define BLOCK_SIZE 16

__global__ void add(float* a, float* b, float* c, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    int k = blockIdx.z * blockDim.z + threadIdx.z;
    if (i < n && j < n && k < n) {
        int index = (i * n + j) * n + k;
        c[index] = a[index] + b[index];
    }
}

int main() {
    float *a, *b, *c;       // host pointers
    float *d_a, *d_b, *d_c; // device pointers
    size_t size = (size_t)N * N * N * sizeof(float);
    
    // Allocate memory on the device
    cudaMalloc(&d_a, size);
    cudaMalloc(&d_b, size);
    cudaMalloc(&d_c, size);
    
    // Allocate memory on the host and initialize matrices
    a = (float*)malloc(size);
    b = (float*)malloc(size);
    c = (float*)malloc(size);
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            for (int k = 0; k < N; k++) {
                int index = (i * N + j) * N + k;
                a[index] = i + j + k;
                b[index] = N - i - j - k;
            }
        }
    }
    
    // Copy matrices from host to device
    cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice);
    
    // Define grid and block sizes
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE, BLOCK_SIZE);
    dim3 dimGrid((N+dimBlock.x-1)/dimBlock.x, (N+dimBlock.y-1)/dimBlock.y, (N+dimBlock.z-1)/dimBlock.z);
    
    // Launch kernel with 3D grid and block sizes
    add<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N);
    
    // Copy result from device to host
    cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost);
    
    // Free memory on host and device
    free(a); free(b); free(c);
    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
    
    return 0;
}


What would the source code look like to add two N x N x N x N tensors?

If we have four dimensions in our problem, how can we write the add() function, given that CUDA grids and blocks have at most three dimensions?

user366312

  • N=2^20 implies a NxNxNxN `float` tensor of 2^82 bytes (for each of a,b,c). Probably not very practical. For a more practical problem size (say, N=256), you could just concatenate two of the dimensions in the first (`x`) dimension, and increase the size of your grid launch. Otherwise, you will need to use a loop, possibly [the grid-stride loop already pointed out to you](https://stackoverflow.com/questions/75828305/). – Robert Crovella Mar 24 '23 at 19:58
  • @RobertCrovella, My question is actually about x, y, z dimensions. If we have four dimensions in our problem, how can we write the `add()` function as we don't have more than three dimensions? – user366312 Mar 24 '23 at 20:11
  • Just like the 3rd dimension can be thought of as a set of layers of the first 2 dimensions, the 4th dimension can be thought of as a set of layers of the first 3 dimensions. If we have 2 dimensions, and we index beyond the end of the first layer, we enter the 2nd layer (i.e. we move in z, even though we are not explicitly indexing in z.) Likewise if we have a 3D volume, and we index beyond the end of that 3D volume, we have moved in the 4th dimension. So, you can concatenate 2 dimensions into one. Of course there are many ways to solve the problem; there is not just one answer to your question. – Robert Crovella Mar 24 '23 at 20:19
  • The division of thread and block ids into up to three dimensions is only a convenience. There is nothing inherently special in CUDA about the y and z dimensions. So we can collapse three dimensions back to one flat dimension and expand it again into 17 dimensions if we like. Your specific problem (adding tensors), unlike multiplication, acts on each element separately, so only the number of tensor elements matters, not their dimensionality. – Sebastian Mar 28 '23 at 13:02
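
As the comments above describe, the x/y/z split of the grid is only a convenience over one flat index space: any number of tensor dimensions can be flattened into a single linear index and recovered again. Here is a minimal host-side sketch of that arithmetic for an n x n x n x n tensor; the helper names (`flatten4`, `unflatten4`) are made up for illustration and are not part of the code in this question or the answer below.

#include <cassert>

// Flatten 4D coordinates (x fastest-varying) into one linear index.
// This matches the layout idx = ((w*n + z)*n + y)*n + x used in the answer below.
long long flatten4(int x, int y, int z, int w, int n) {
    return (((long long)w * n + z) * n + y) * n + x;
}

// Recover the four coordinates from a linear index by repeated div/mod.
void unflatten4(long long idx, int n, int &x, int &y, int &z, int &w) {
    x = (int)(idx % n); idx /= n;
    y = (int)(idx % n); idx /= n;
    z = (int)(idx % n); idx /= n;
    w = (int)idx;
}

int main() {
    const int n = 64;
    int x, y, z, w;
    long long idx = flatten4(3, 5, 7, 9, n); // arbitrary test coordinates
    unflatten4(idx, n, x, y, z, w);
    assert(x == 3 && y == 5 && z == 7 && w == 9); // the round trip is lossless
    return 0;
}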

1 Answer


What would the source code look like to add two N x N x N x N tensors? If we have four dimensions in our problem, how can we write the add() function, given that CUDA grids and blocks have at most three dimensions?

As is usual in computer programming, there are many ways to do it.

Here is a demonstration of 4 methods:

  • using a 1D grid (x), striding
  • using a 2D grid (x,y), with the grid "striding" in z, and w
  • using a 3D grid (x,y,z) with the grid "striding" in w
  • using a 3D grid (x,y,z) but "encoding" the w dimension in the x grid variable, therefore not requiring a loop in kernel code

example:

$ cat t2231.cu
#include <iostream>

using it = int;
const it NPOW = 6;         // 2^6 = 64
const it N = (1<<NPOW);    // the 4th power of this must fit in type it!
using dt = int;

//1D
__global__ void k1(dt *a, dt *b, dt *c, it n){

  for (it i = blockIdx.x*blockDim.x+threadIdx.x; i < n*n*n*n; i += gridDim.x*blockDim.x) // 1D grid-stride loop
    c[i] = a[i] + b[i];
}

//2D, "striding" in z, w, must launch threads to cover x,y
__global__ void k2(dt *a, dt *b, dt *c, it n){

  it x = blockIdx.x*blockDim.x+threadIdx.x;
  it y = blockIdx.y*blockDim.y+threadIdx.y;
  if (x < n && y < n)
    for (it w = 0; w < n; w++)
      for (it z = 0; z < n; z++){
        it idx = (((((w*n)+z)*n)+y)*n)+x;
        c[idx] = a[idx]+b[idx];}
}

//3D, "striding" in w, must launch threads to cover x,y,z
__global__ void k3(dt *a, dt *b, dt *c, it n){

  it x = blockIdx.x*blockDim.x+threadIdx.x;
  it y = blockIdx.y*blockDim.y+threadIdx.y;
  it z = blockIdx.z*blockDim.z+threadIdx.z;
  if (x < n && y < n && z < n)
    for (it w = 0; w < n; w++){
      it idx = (((((w*n)+z)*n)+y)*n)+x;
      c[idx] = a[idx]+b[idx];}
}

//4D, extracting w from x, must launch threads to cover w*x,y,z
__global__ void k4(dt *a, dt *b, dt *c, it n){

  it px = blockIdx.x*blockDim.x+threadIdx.x;
  it w = px>>NPOW;
  it x = px-(w<<NPOW);
  it y = blockIdx.y*blockDim.y+threadIdx.y;
  it z = blockIdx.z*blockDim.z+threadIdx.z;
  if (x < n && y < n && z < n && w < n){
    it idx = (((((w*n)+z)*n)+y)*n)+x;
    c[idx] = a[idx]+b[idx];}
}


int main(){
  it sz = N*N*N*N;
  dt *a = new dt[sz];
  dt *b = new dt[sz];
  dt *c = new dt[sz];
  dt *d_a, *d_b, *d_c;
  for (it i = 0; i < sz; i++) {a[i] = i+1; b[i] = i+1;}
  cudaMalloc(&d_a, sizeof(d_a[0])*sz);
  cudaMalloc(&d_b, sizeof(d_a[0])*sz);
  cudaMalloc(&d_c, sizeof(d_a[0])*sz);
  cudaMemcpy(d_a, a, sizeof(d_a[0])*sz, cudaMemcpyHostToDevice);
  cudaMemcpy(d_b, b, sizeof(d_b[0])*sz, cudaMemcpyHostToDevice);
  // choose "arbitrary" dimensions for 1D case
  k1<<<80, 1024>>>(d_a, d_b, d_c, N);
  cudaMemcpy(c, d_c, sizeof(d_c[0])*sz, cudaMemcpyDeviceToHost);
  for (it i = 0; i < sz; i++) if (c[i] != a[i]+b[i]) std::cout << "Error1" << std::endl;
  cudaMemset(d_c, 0, sizeof(d_c[0])*sz);
  // choose x,y to at least cover N,N for 2D case
  dim3 block2(8,8);
  dim3 grid2((N+block2.x-1)/block2.x, (N+block2.y-1)/block2.y);
  k2<<<grid2, block2>>>(d_a, d_b, d_c, N);
  cudaMemcpy(c, d_c, sizeof(d_c[0])*sz, cudaMemcpyDeviceToHost);
  for (it i = 0; i < sz; i++) if (c[i] != a[i]+b[i]) std::cout << "Error2" << std::endl;
  cudaMemset(d_c, 0, sizeof(d_c[0])*sz);
  // choose x,y,z to at least cover N,N,N for 3D case
  dim3 block3(8,8,8);
  dim3 grid3((N+block3.x-1)/block3.x, (N+block3.y-1)/block3.y, (N+block3.z-1)/block3.z);
  k3<<<grid3, block3>>>(d_a, d_b, d_c, N);
  cudaMemcpy(c, d_c, sizeof(d_c[0])*sz, cudaMemcpyDeviceToHost);
  for (it i = 0; i < sz; i++) if (c[i] != a[i]+b[i]) std::cout << "Error3" << std::endl;
  cudaMemset(d_c, 0, sizeof(d_c[0])*sz);
  // choose x,y,z to at least cover N*N,N,N for 4D case
  dim3 grid4((N*N+block3.x-1)/block3.x, (N+block3.y-1)/block3.y, (N+block3.z-1)/block3.z);
  k4<<<grid4, block3>>>(d_a, d_b, d_c, N);
  cudaMemcpy(c, d_c, sizeof(d_c[0])*sz, cudaMemcpyDeviceToHost);
  for (it i = 0; i < sz; i++) if (c[i] != a[i]+b[i]) std::cout << "Error4" << std::endl;
}

$ nvcc -o t2231 t2231.cu
$ compute-sanitizer ./t2231
========= COMPUTE-SANITIZER
========= ERROR SUMMARY: 0 errors
$
Robert Crovella
  • Could you kindly describe what you did in `k4`? I do not understand the calculation. – user366312 Mar 26 '23 at 20:44
  • This is the dimension folding that I described in the comments above under your question. We are using one grid dimension to represent two data dimensions, and we do that by having that grid dimension sized to handle the product of the two data dimensions. – Robert Crovella Mar 27 '23 at 00:50
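
For reference, here is a small host-side sketch of the index folding that `k4` performs, assuming (as in the answer) that N is a power of two so the shift/subtract arithmetic applies. The snippet is illustrative only and not part of the original answer.

#include <cassert>

const int NPOW = 6;
const int N = 1 << NPOW; // 64, as in the answer

int main() {
    // The grid's x dimension covers N*N threads, so each thread's px value
    // encodes both the data's w and x coordinates: px = w*N + x.
    for (int w = 0; w < N; w++)
        for (int x = 0; x < N; x++) {
            int px = w * N + x;          // combined index, as the 4D launch lays it out
            int w2 = px >> NPOW;         // k4: w = px >> NPOW   (divide by N)
            int x2 = px - (w2 << NPOW);  // k4: x = px - (w << NPOW)  (remainder)
            assert(w2 == w && x2 == x);  // round trip recovers the original coordinates
        }
    return 0;
}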