Recipe to copy 1D strided data with cudaMemcpy2D

Question

If one has two contiguous ranges of device memory, it is possible to copy memory from one to the other using cudaMemcpy.

   double* source = ...
   double* dest = ...
   // cudaMemcpy takes its size in bytes, so the element count N is scaled by the element size.
   cudaMemcpy(dest, source, N * sizeof(double), cudaMemcpyDeviceToDevice);

Now suppose that I want to copy source into dest, but reading every 2nd element of source and writing every 3rd element of dest. That is, dest[0] = source[0], dest[3] = source[2], dest[6] = source[4], .... Of course, a single plain cudaMemcpy cannot do this.

Intuitively, cudaMemcpy2D should be able to do the job, because "strided elements can be seen as a column in a larger array". But cudaMemcpy2D has many input parameters that are obscure to interpret in this context, such as pitch.

For example, I managed to use cudaMemcpy2D to reproduce the case where both strides are 1.

    // Contiguous case: both pitches and the width are 1 byte and the height is the
    // total byte count, i.e. the buffer is copied as n*sizeof(T) rows of one byte each.
    cudaMemcpy2D(dest, 1, source, 1, 1, n*sizeof(T), cudaMemcpyDeviceToHost);

But I cannot figure out the general case, with dest_stride and source_stride different from 1.

Is there a way to copy strided data to strided data with cudaMemcpy2D? In which order do I have to put the known information about the layout, namely the two strides and sizeof(T)?

    cudaMemcpy2D(dest, ??, source, ???, ????, ????, cudaMemcpyDeviceToHost);

score 3 · Answer 1 · answered Jun 14 '19 at 12:33

Yes, this can be done. It is easier to illustrate in code than words so:

#include <iostream>

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool checkCuda(cudaError_t status, const char* what)
{
    if (status == cudaSuccess) return true;
    std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
    return false;
}

// Demonstrates a strided 1D copy expressed as a 2D copy: one int is taken from
// every 2nd int of the device source and written to every 3rd int of the host
// destination, for three "rows". Returns 0 on success and 1 if any CUDA call fails.
int main()
{
    // Source viewed as 4 rows of 2 ints; the row pitch (in bytes) is the source stride.
    const size_t swidth = 2;
    const size_t sheight = 4;
    const size_t spitch = swidth * sizeof(int);
    int source[swidth * sheight] = { 0, 1, 2, 3, 4, 5, 6, 7 };

    // Destination viewed as 4 rows of 3 ints, pre-filled with -1 so that the
    // elements written by the copy are easy to spot in the output.
    const size_t dwidth = 3;
    const size_t dheight = 4;
    const size_t dpitch = dwidth * sizeof(int);
    int dest[dwidth * dheight] = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 };

    // Copy region: one int wide (width is given in bytes), three rows high.
    const size_t cwidth = 1 * sizeof(int);
    const size_t cheight = 3;

    int* source_d = nullptr;
    if (!checkCuda(cudaMalloc(&source_d, spitch * sheight), "cudaMalloc")) return 1;

    bool ok = checkCuda(cudaMemcpy(source_d, &source[0], spitch * sheight, cudaMemcpyHostToDevice),
                        "cudaMemcpy")
           && checkCuda(cudaMemcpy2D(&dest[0], dpitch, source_d, spitch, cwidth, cheight,
                                     cudaMemcpyDeviceToHost),
                        "cudaMemcpy2D");

    // Release the device buffer on both the success and the failure path.
    ok = checkCuda(cudaFree(source_d), "cudaFree") && ok;
    if (!ok) return 1;

    for (size_t i = 0; i < dwidth * dheight; i++) std::cout << i << " " << dest[i] << std::endl;

    return 0;
}

which does this:

$ nvcc -std=c++11 -arch=sm_52 -o strided_copy strided_copy.cu 
$ cuda-memcheck ./strided_copy
========= CUDA-MEMCHECK
0 0
1 -1
2 -1
3 2
4 -1
5 -1
6 4
7 -1
8 -1
9 -1
10 -1
11 -1
========= ERROR SUMMARY: 0 errors

In essence, you are copying a width of 4 bytes (an int) with a stride of 8 bytes (two ints) into a destination with a stride of 12 bytes (three ints). I only copied three rows so that it is obvious how the row argument works. Adjust the size of the copy element and strides, etc. to taste.

score 1 · Accepted Answer · answered Jun 14 '19 at 13:50

A generic function for such a strided copy could look roughly like this:

/*
 * Copies numElements elements of elementSize bytes each from src to dst, where
 * consecutive elements are srcStride elements apart in src and dstStride
 * elements apart in dst. The copy is issued as a cudaMemcpy2D call that is one
 * element wide and numElements rows high, with the strides as row pitches.
 *
 * dst, src      start of the destination / source ranges
 * dstStride     distance between written elements, in elements (>= 1)
 * srcStride     distance between read elements, in elements (>= 1)
 * numElements   number of elements to copy (>= 0; 0 is a no-op)
 * elementSize   size of one element in bytes (> 0)
 * kind          a cudaMemcpyKind value
 *
 * Returns cudaErrorInvalidValue for out-of-range arguments, otherwise the
 * status reported by cudaMemcpy2D.
 */
cudaError_t cudaMemcpyStrided(
        void *dst, int dstStride,
        const void *src, int srcStride,
        int numElements, int elementSize, int kind) {
    if (numElements < 0 || elementSize <= 0 || dstStride < 1 || srcStride < 1) {
        return cudaErrorInvalidValue;
    }
    if (numElements == 0) {
        return cudaSuccess;
    }

    /* Widen before multiplying so that large strides cannot overflow int. */
    const size_t widthInBytes = (size_t)elementSize;
    const size_t dstPitchInBytes = (size_t)dstStride * widthInBytes;
    const size_t srcPitchInBytes = (size_t)srcStride * widthInBytes;
    const size_t height = (size_t)numElements;

    /* The explicit cast keeps this valid as C++ (nvcc), where int does not
       convert implicitly to an enumeration type. */
    return cudaMemcpy2D(
        dst, dstPitchInBytes,
        src, srcPitchInBytes,
        widthInBytes, height,
        (enum cudaMemcpyKind)kind);
}

And for your example, it could be called as

// Copies 3 doubles: source[0], source[2], source[4] -> dest[0], dest[3], dest[6]
// (destination stride 3, source stride 2, both counted in elements).
cudaMemcpyStrided(dest, 3, source, 2, 3, sizeof(double), cudaMemcpyDeviceToDevice);

"Roughly", because I just translated it on the fly from the (Java/JCuda based) code that I tested it with:

import static jcuda.runtime.JCuda.cudaMemcpy2D;

import java.util.Arrays;
import java.util.Locale;

import jcuda.Pointer;
import jcuda.Sizeof;
import jcuda.runtime.cudaMemcpyKind;

public class JCudaStridedMemcopy {
    public static void main(String[] args) {

        int dstLength = 9;
        int srcLength = 6;
        int dstStride = 3;
        int srcStride = 2;
        int numElements = 3;
        runExample(dstLength, dstStride, srcLength, srcStride, numElements);

        dstLength = 9;
        srcLength = 12;
        dstStride = 3;
        srcStride = 4;
        numElements = 3;
        runExample(dstLength, dstStride, srcLength, srcStride, numElements);

        dstLength = 18;
        srcLength = 12;
        dstStride = 3;
        srcStride = 2;
        numElements = 6;
        runExample(dstLength, dstStride, srcLength, srcStride, numElements);

    }

    private static void runExample(int dstLength, int dstStride, int srcLength, int srcStride, int numElements) {
        double dst[] = new double[dstLength];
        double src[] = new double[srcLength];
        for (int i = 0; i < src.length; i++) {
            src[i] = i;
        }

        cudaMemcpyStrided(dst, dstStride, src, srcStride, numElements);

        System.out.println("Copy " + numElements + " elements");
        System.out.println("  to   array with length " + dstLength + ", with a stride of " + dstStride);
        System.out.println("  from array with length " + srcLength + ", with a stride of " + srcStride);

        System.out.println("");

        System.out.println("Destination:");
        System.out.println(toString2D(dst, dstStride));
        System.out.println("Flat: " + Arrays.toString(dst));

        System.out.println("");

        System.out.println("Source:");
        System.out.println(toString2D(src, srcStride));
        System.out.println("Flat: " + Arrays.toString(src));

        System.out.println("");
        System.out.println("Done");
        System.out.println("");

    }

    private static void cudaMemcpyStrided(double dst[], int dstStride, double src[], int srcStride, int numElements) {
        long srcPitchInBytes = srcStride * Sizeof.DOUBLE;
        long dstPitchInBytes = dstStride * Sizeof.DOUBLE;
        long width = 1 * Sizeof.DOUBLE;
        long height = numElements;
        cudaMemcpy2D(
                Pointer.to(dst), dstPitchInBytes, 
                Pointer.to(src), srcPitchInBytes, 
                width, height,
                cudaMemcpyKind.cudaMemcpyHostToHost);
    }

    public static String toString2D(double[] a, long columns) {
        String format = "%4.1f ";
        ;
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < a.length; i++) {
            if (i > 0 && i % columns == 0) {
                sb.append("\n");
            }
            sb.append(String.format(Locale.ENGLISH, format, a[i]));
        }
        return sb.toString();
    }
}

To give an idea of what the function does, based on the examples/test cases, here is the output:

Copy 3 elements
  to   array with length 9, with a stride of 3
  from array with length 6, with a stride of 2

Destination:
 0.0  0.0  0.0 
 2.0  0.0  0.0 
 4.0  0.0  0.0 
Flat: [0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 4.0, 0.0, 0.0]

Source:
 0.0  1.0 
 2.0  3.0 
 4.0  5.0 
Flat: [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]

Done

Copy 3 elements
  to   array with length 9, with a stride of 3
  from array with length 12, with a stride of 4

Destination:
 0.0  0.0  0.0 
 4.0  0.0  0.0 
 8.0  0.0  0.0 
Flat: [0.0, 0.0, 0.0, 4.0, 0.0, 0.0, 8.0, 0.0, 0.0]

Source:
 0.0  1.0  2.0  3.0 
 4.0  5.0  6.0  7.0 
 8.0  9.0 10.0 11.0 
Flat: [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0]

Done

Copy 6 elements
  to   array with length 18, with a stride of 3
  from array with length 12, with a stride of 2

Destination:
 0.0  0.0  0.0 
 2.0  0.0  0.0 
 4.0  0.0  0.0 
 6.0  0.0  0.0 
 8.0  0.0  0.0 
10.0  0.0  0.0 
Flat: [0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 4.0, 0.0, 0.0, 6.0, 0.0, 0.0, 8.0, 0.0, 0.0, 10.0, 0.0, 0.0]

Source:
 0.0  1.0 
 2.0  3.0 
 4.0  5.0 
 6.0  7.0 
 8.0  9.0 
10.0 11.0 
Flat: [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0]

Done

I guess the downvote is due to the fact that there's some Java code in there? Consider it as an add-on. The relevant function is in C, for that matter. — Marco13, Jun 15 '19 at 10:51
It is a pity that `cudaMemcpy2D` doesn't accept 0 (zero) as the `srcPitchInBytes` argument (it returns an error); that would allow for a more sane `memset` function. — alfC, Aug 15 '19 at 19:18

Recipe to copy 1D strided data with cudaMemcpy2D

2 Answers

Linked