
I have a large and a small 2D numpy boolean array. I want to know all positions where the smaller array fits on the larger array. A position is valid if, for that location, no element of the smaller array and the corresponding slice of the larger array is True at the same time. Think of it as trying to place an object onto an image without overlapping any of the items already on the image.

For convenience, the result array indicates the top-left coordinate of where the small array can be placed, and it has the same shape as the large array.
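
For reference, this is the (slow) straightforward NumPy version of what I mean; the function name is just for illustration:

import numpy as np

def valid_placements_naive(large_array, small_array):
    # True at (i, j) if the small array can be placed with its top-left corner at (i, j)
    # without any cell being True in both the small array and the covered slice
    N, M = large_array.shape
    K, L = small_array.shape
    result = np.zeros((N, M), dtype=bool)
    for i in range(N - K + 1):
        for j in range(M - L + 1):
            result[i, j] = not np.any(large_array[i:i + K, j:j + L] & small_array)
    return result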

I want to optimize this for speed, and I have tried quite a few things. The simplest approach was to use PyTorch:

import numpy as np
import torch
import torch.nn.functional as F

def get_placement_options(large_array, small_array):
    shape = large_array.shape

    # Convert numpy arrays to PyTorch tensors
    large_array = torch.from_numpy(large_array).to('cuda:0').float()
    small_array = torch.from_numpy(small_array).to('cuda:0').float()

    # Slide the small array over the large array: the convolution counts the number of
    # overlapping True cells, so a value below .5 means no overlap at that position
    possible_locations = (F.conv2d(large_array[None, None], small_array[None, None])[0, 0] < .5).cpu().numpy()

    result = np.zeros(shape, dtype='bool')
    result[:possible_locations.shape[0], :possible_locations.shape[1]] = possible_locations
    return result

But I wanted it faster, so I was thinking of bitpacking the booleans into an int64. For this new approach I used CuPy and wrote my own kernel. This approach costs more memory, but for my use case that is okay, since typically the large array is around (1000x1000) and the smaller array is around (100x100).

I used a technique called "bitpacking" to efficiently compare patches of 8x8 booleans (one int64) using the bitwise AND operator. The smaller array is bitpacked into int64 once; the larger array can also be bitpacked, but this has to be done for every one of the 8x8 shifts, hence the larger memory usage. On the GPU I can then very efficiently AND two int64 numbers, and if the result is not zero, immediately stop for that location.
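
To illustrate the trick on a single patch (a small self-contained sketch, not the actual kernel):

import numpy as np

patch_large = np.random.choice(a=[False, True], size=(8, 8))
patch_small = np.random.choice(a=[False, True], size=(8, 8))

packed_large = np.packbits(patch_large).view(np.uint64)[0]  # 64 booleans -> one uint64
packed_small = np.packbits(patch_small).view(np.uint64)[0]

# A single bitwise AND checks all 64 cells at once
assert ((packed_large & packed_small) != 0) == np.any(patch_large & patch_small)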

import cupy as cp
from cupy.lib.stride_tricks import as_strided
import numpy as np
import time

class Placer:
    def __init__(self):
        kernel_code = """
        extern "C" {
        __global__ void compute(long long int* large_array, long long int* small_array, int* result, int rn, int rm, int n, int m, int k, int l) {
            int i = blockIdx.x * blockDim.x + threadIdx.x;  // x position in large array we are calculating
            int j = blockIdx.y * blockDim.y + threadIdx.y;  // y position in large array we are calculating
            
            if (i < rn && j < rm) {
            
                int r_i = i % 8;  // Which x shift we are looking at
                int r_j = j % 8;  // Which y shift we are looking at
                int sub_array_index = r_i * 8 * n * m + r_j * n * m;

                for (int p = 0; p < k; ++p) {
                    for (int q = 0; q < l; ++q) {
                        if ((small_array[p * l + q] & large_array[sub_array_index + ((i / 8)+p) * m + (j / 8)+q]) != 0) {
                            result[i * rm + j] = 0;
                            return;
                        }
                    }
                }
            }

        }
        }
        """

        # Compile the kernel code
        self.compiled_kernel = cp.RawKernel(kernel_code, 'compute')

    def __call__(self, large_array: np.ndarray, small_array: np.ndarray):
        
        # Result placement coordinates will be left top
        result_np = np.zeros_like(large_array)

        # Make sure small array divisible by 8, add same extra padding to large array
        padding = ((0, (8 - small_array.shape[0] % 8) % 8), (0, (8 - small_array.shape[1] % 8) % 8))
        small_array = np.pad(small_array, padding, mode='constant', constant_values=False)
        K, L = small_array.shape
        large_array = np.pad(large_array, padding, mode='constant', constant_values=True) 

        # Make sure large array divisible by 8
        padding = ((0, (8 - large_array.shape[0] % 8) % 8), (0, (8 - large_array.shape[1] % 8) % 8))
        large_array = np.pad(large_array, padding, mode='constant', constant_values=True) 
        N, M = large_array.shape
        
        # Create all 64 (dx, dy) shifts of the large array and bitpack them into int64 (on the GPU)
        large_array_cp = cp.array(large_array)
        # Shape (8, 8, N, M): entry [dx, dy] is large_array[dx:, dy:], padded back to (N, M) with True
        large_array_cp = cp.pad(self.sliding_window_view_cp(large_array_cp, (8, 8)), ((0, 7), (0, 7), (0, 0), (0, 0)), 'constant', constant_values=True).transpose((2, 3, 0, 1)).copy()
        # Pack each 8x8 block of every shifted copy into one int64; final shape (8, 8, N // 8, M // 8)
        large_array_cp = cp.packbits(large_array_cp.transpose((0, 1, 3, 2))).reshape(8, 8, large_array.shape[1], large_array.shape[0] // 8)
        large_array_cp = large_array_cp.transpose((0, 1, 3, 2)).copy().view('int64')

        # Bitpack the small array into int64 as well (shape (K // 8, L // 8))
        small_array = cp.array(np.packbits(small_array.copy(), axis=0).view(np.int64))
        
        # Call the kernel function
        block = (32, 32, 1)
        grid = ((N-K+1 + block[0] - 1) // block[0], (M-L+1 + block[1] - 1) // block[1])
        result = cp.ones((N-K+1, M-L+1), dtype=cp.int32)
        self.compiled_kernel(grid=grid, block=block, args=(large_array_cp, small_array, result, N-K+1, M-L+1, N // 8, M // 8, K // 8, L // 8))
        
        # Ensure the GPU has finished processing
        cp.cuda.stream.get_current_stream().synchronize()

        result = result.astype(cp.bool_).get()
        result_np[:result.shape[0], :result.shape[1]] = result

        return result_np

    @staticmethod
    def sliding_window_view_cp(arr, window_shape):
        output_shape = arr.shape[:-len(window_shape)] + tuple(i - j + 1 for i, j in zip(arr.shape[-len(window_shape):], window_shape))
        strides = arr.strides + arr.strides[-len(window_shape):]
        return as_strided(arr, shape=output_shape + window_shape, strides=strides)

Although I think this approach should in theory be faster, it is approximately as fast as the first approach. It probably misses some CUDA optimizations.

To test, I used:

large_array = np.random.choice(a=[False, True], size=(1000, 1000))
small_array = np.zeros((100, 100), dtype=bool)
small_array[-4:, -4:] = True
large_array[-260:, -260:] = False

which gives the following valid locations:

[image: placement locations]

The first method took 0.06 seconds, the second method took 0.05 seconds.
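
For completeness, the timings can be reproduced with something like this (just a sketch):

import time

placer = Placer()

start = time.time()
result_torch = get_placement_options(large_array, small_array)
print('pytorch conv2d:', time.time() - start)

start = time.time()
result_cupy = placer(large_array, small_array)
print('cupy kernel:', time.time() - start)

print('results match:', np.array_equal(result_torch, result_cupy))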

I can't really believe that nobody has done this before. I think there is a speedup possible, but I can't find a library that does this. Does anyone have tips or ideas on how to make this faster?

F.Wessels
  • Maybe it's not big enough to benefit? Sending data from CPU->GPU has some latency, and the matrices you're working with are pretty small. Even your large matrix is only 125 KB. – Nick ODell Jun 09 '23 at 18:47
  • I agree with @Nick ODell. And without saturating the GPU, I don't think you will see much speed improvement. The benefits are mostly seen when you take advantage of the GPU's ability to perform SIMD operations, which happens when you have a large amount of data that the GPU can operate on simultaneously and will outweigh the overhead of moving the data to the GPU. – jared Jun 09 '23 at 19:11
  • Well, by timing I see that the kernel calculations do take the majority of the time. I already optimized the CPU -> GPU transfer by doing the 8x8 rolls on the GPU, which are now negligible time-wise. I tried increasing the block size to (32, 32, 1), but it does not really have an effect. I guess that with larger matrices the cupy approach will indeed benefit more, but I have to test that out. Maybe I can do something with the large_array in the kernel: since the block size is >= 8x8, each thread only needs access to a single one of the 8x8 shifted large arrays. – F.Wessels Jun 09 '23 at 21:32
  • On my machine the kernel itself takes little computational time. The main issue is the functions called before it. It is not easy to track which one, since cupy computations are done lazily... Time with the kernel: 32 ms. Time without the kernel: 23 ms. Tested on a Nvidia 1660S GPU. – Jérôme Richard Jun 09 '23 at 22:23
  • Weird, I am probably doing something wrong then. For me this part, which I time, takes up 98%: ``` start = time.time() self.compiled_kernel(grid=grid, block=block, args=(large_array_cp, small_array, result, N-K+1, M-L+1, N // 8, M // 8, K // 8, L // 8)) # Ensure the GPU has finished processing cp.cuda.stream.get_current_stream().synchronize() print(time.time() - start) ``` I played with the kernel and found out that accessing the large array is what costs a lot -> large_array[sub_array_index + p * m + q]; I was able to speed things up by first checking if small_array[p * l + q] != 0 – F.Wessels Jun 09 '23 at 22:41
  • Having a `return` in the middle of 2 for loops is really bad for performance if there is any *warp divergence*. You should really avoid this on GPUs. More generally, GPUs do not like conditionals, especially ones that are not all true/false on all threads of a block (divergence). In fact, conditionals are also slow on CPUs, so trying to avoid them is generally good for performance. Memory accesses are generally a bit slow, especially global memory ones on GPUs. GPUs are good for heavy computations (which is not the case here despite the bit manipulations). – Jérôme Richard Jun 10 '23 at 00:00
  • I expect this to be faster on CPU actually. You can try to write a SIMD-friendly loop to get an equivalent of the GPU version. CPUs have very fast and relatively large L1 caches that are interesting in this case. A few threads (e.g. 4) can certainly do all the computation in their L1 cache since the input is only 120 KB. For example, a Skylake-like CPU can read 2 256-bit SIMD registers from the L1 cache per cycle while often operating at >3 GHz. This is a lot; certainly enough to get a much faster code than your current one. Maybe even faster than the data transfer, GPU kernel start and allocations. – Jérôme Richard Jun 10 '23 at 00:09
  • I am totally not a GPU/CPU optimization expert; my question is a more general one: I can't believe nobody has optimized binary convolution before. Since I couldn't find anything quickly, I tried on my own and came across cupy. But it seems my question isn't appreciated here; I will try a bit simpler approach with cupy, including your tips, Jérôme. – F.Wessels Jun 10 '23 at 08:21

1 Answer


I spent some more time using the cupy library to try to write optimized code for CUDA / the GPU. I managed to get roughly a 4x reduction in runtime compared to the convolutional approach using the pytorch library.

Thoughts for anyone out there who wants to continue making it faster:

Let us say we have two matrices (a large one and a small one) with MxN and KxL dimensions. In total you then have to compare elements basically MxNxKxL times, which you can see as the outer product of the two matrices. The big problem in terms of speed is that each binary element is represented as an int16 or larger, partly because GPUs are not made for binary stuff. So you can try to optimize blocks, shared memory etc., but you still end up doing lots of int16 operations.

You can use bitpacking, the approach I tried, but then the conversion of your binary array into int16/32/64 becomes the bottleneck, partly because of the shifts you have to calculate to be able to compare efficiently on the GPU.
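
To put rough numbers on that for the 1000x1000 / 100x100 test case (just back-of-the-envelope arithmetic):

M, N = 1000, 1000  # large array
K, L = 100, 100    # small array

comparisons_naive = M * N * K * L             # ~1e10 elementwise comparisons
comparisons_packed = comparisons_naive // 64  # ~1.6e8 int64 AND operations after bitpacking
print(comparisons_naive, comparisons_packed)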

Here is how far I got in the time I was allowed to pursue this. Note that the cp.roll can be made more efficient; with cp.shift.. I managed to get another factor 1.5 reduction, but then the code did not look elegant anymore. On my PC, 1/4 of the time (3 ms) is spent in the kernel and 2/4 (6 ms) in doing the 64 rolls and the bitpacking (12 ms in total for the 1000x1000 / 100x100 case).

import cupy as cp
import numpy as np

class Placer:
    def __init__(self):
        kernel_code = """
        extern "C" {
        __global__ void compute(long long int* large_array, long long int* small_array, int* result, int rn, int rm, int n, int m, int k, int l) {
            int i = blockIdx.x * blockDim.x + threadIdx.x;  // x position in large array we are calculating
            int j = blockIdx.y * blockDim.y + threadIdx.y;  // y position in large array we are calculating
            long long int a = 0;
            long long int b = 0;
            int sub_array_index = 0;
            
            if (i < rn && j < rm) {
                sub_array_index = (i % 8) * 8 * n * m + (j % 8) * n * m + (i / 8) * m + (j / 8);

                for (int p = 0; p < k; ++p) {
                    for (int q = 0; q < l; ++q) {
                        a = small_array[p * l + q];
                        if (a != 0) {
                            b = large_array[sub_array_index + p * m + q];
                            if (b != 0) {
                                if ((a & b) != 0) {
                                    result[i * rm + j] = 0;
                                    break;
                                }
                            }
                        }
                    }
                }
            }
        }
        }
        """

        # Compile the kernel code
        self.compiled_kernel = cp.RawKernel(kernel_code, 'compute')

    def __call__(self, large_array: np.ndarray, small_array: np.ndarray):
        
        # Result placement coordinates will be left top
        result_np = np.zeros_like(large_array)

        # Make sure small array divisible by 8, add same extra padding to large array
        padding = ((0, (8 - small_array.shape[0] % 8) % 8), (0, (8 - small_array.shape[1] % 8) % 8))
        small_array = np.pad(small_array, padding, mode='constant', constant_values=False)
        K, L = small_array.shape
        large_array = np.pad(large_array, padding, mode='constant', constant_values=True) 

        # Make sure large array divisible by 8
        padding = ((0, (8 - large_array.shape[0] % 8) % 8), (0, (8 - large_array.shape[1] % 8) % 8))
        large_array = np.pad(large_array, padding, mode='constant', constant_values=True) 
        N, M = large_array.shape
        
        # Create all 64 (dx, dy) shifts of the large array and bitpack them into int64 (on the GPU)
        large_array_cp = cp.array(large_array)
        # One bitpacked copy per (dx, dy) shift of the large array
        large_array_cp = cp.array([[cp.packbits(cp.roll(large_array_cp, (-dx, -dy), (0, 1))) for dy in range(8)] for dx in range(8)])
        # Rearrange so every 8x8 block of booleans ends up in one int64; final shape (8, 8, N // 8, M // 8)
        large_array_cp = large_array_cp.reshape(8, 8, large_array.shape[0], large_array.shape[1] // 8).transpose((0, 1, 3, 2)).copy().view('int64').transpose((0, 1, 3, 2)).copy()

        # Bitpack the small array into int64 as well, using the same bit layout as the packed
        # large array above (bytes run along rows, bits along columns); shape (K // 8, L // 8)
        small_array = cp.array(np.packbits(small_array, axis=1).T.copy().view(np.int64).T.copy())
        
        # Call the kernel function
        block = (16, 16, 1)
        grid = ((N-K+1 + block[0] - 1) // block[0], (M-L+1 + block[1] - 1) // block[1])
        result = cp.ones((N-K+1, M-L+1), dtype=cp.int32)
        self.compiled_kernel(grid=grid, block=block, args=(large_array_cp, small_array, result, N-K+1, M-L+1, N // 8, M // 8, K // 8, L // 8))
        
        # Copy the result to CPU, it will wait until the GPU is finished
        result = result.astype(cp.bool_).get()
        result_np[:result.shape[0], :result.shape[1]] = result

        return result_np
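
Usage is the same as in the question, for example:

placer = Placer()

large_array = np.random.choice(a=[False, True], size=(1000, 1000))
small_array = np.zeros((100, 100), dtype=bool)
small_array[-4:, -4:] = True
large_array[-260:, -260:] = False

# Boolean array with True at valid top-left placement positions
valid_locations = placer(large_array, small_array)
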
F.Wessels