
I'm trying to understand how numpy can be so fast, based on my shocking comparison with optimized C/C++ code which is still far from reproducing numpy's speed.

Consider the following example: given a 2D array with shape=(N, N) and dtype=float32, representing a list of N vectors of N dimensions, I compute the pairwise differences between every pair of vectors. Using numpy broadcasting, this can simply be written as:

def pairwise_sub_numpy( X ):
    return X - X[:, None, :]

Using timeit I can measure the performance for N=512: it takes 88 ms per call on my laptop.

Now, in C/C++ a naive implementation looks like this:

#define X(i, j)     _X[(i)*N + (j)]
#define res(i, j, k)  _res[((i)*N + (j))*N + (k)]

float* pairwise_sub_naive( const float* _X, int N ) 
{
    float* _res = (float*) aligned_alloc( 32, N*N*N*sizeof(float));

    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            for (int k = 0; k < N; k++)
                res(i,j,k) = X(i,k) - X(j,k);
          }
    }
    return _res;
}

Compiling with gcc 7.3.0 and the -O3 flag, I get 195 ms per call for pairwise_sub_naive(X), which is not too bad given the simplicity of the code, but about 2 times slower than numpy.

Now I start getting serious and add some small optimizations, by indexing the row vectors directly:

float* pairwise_sub_better( const float* _X, int N ) 
{
    float* _res = (float*) aligned_alloc( 32, N*N*N*sizeof(float));
    
    for (int i = 0; i < N; i++) {
        const float* xi = & X(i,0);

        for (int j = 0; j < N; j++) {
            const float* xj = & X(j,0);
            
            float* r = &res(i,j,0);
            for (int k = 0; k < N; k++)
                 r[k] = xi[k] - xj[k];
        }
    }
    return _res;
}

The speed stays the same at 195 ms, which means the compiler had already figured that much out on its own. Let's now use SIMD vector instructions:

float* pairwise_sub_simd( const float* _X, int N ) 
{
    float* _res = (float*) aligned_alloc( 32, N*N*N*sizeof(float));

    // create caches for row vectors which are memory-aligned
    float* xi = (float*)aligned_alloc(32, N * sizeof(float));
    float* xj = (float*)aligned_alloc(32, N * sizeof(float));
    
    for (int i = 0; i < N; i++) {
        memcpy(xi, & X(i,0), N*sizeof(float));
        
        for (int j = 0; j < N; j++) {
            memcpy(xj, & X(j,0), N*sizeof(float));
            
            float* r = &res(i,j,0);
            for (int k = 0; k < N; k += 256/sizeof(float)) {
                const __m256 A = _mm256_load_ps(xi+k);
                const __m256 B = _mm256_load_ps(xj+k);
                _mm256_store_ps(r+k, _mm256_sub_ps( A, B ));
            }
        }
    }
    free(xi); 
    free(xj);
    return _res;
}

This only yields a small boost (178 ms instead of 194 ms per function call).

Then I wondered whether a "block-wise" approach, like the one used to optimize dot products, could be beneficial:

float* pairwise_sub_blocks( const float* _X, int N ) 
{
    float* _res = (float*) aligned_alloc( 32, N*N*N*sizeof(float));

    #define B 8
    float cache1[B*B], cache2[B*B];

    for (int bi = 0; bi < N; bi+=B)
      for (int bj = 0; bj < N; bj+=B)
        for (int bk = 0; bk < N; bk+=B) {
        
            // load first 8x8 block in the cache
            for (int i = 0; i < B; i++)
              for (int k = 0; k < B; k++)
                cache1[B*i + k] = X(bi+i, bk+k);

            // load second 8x8 block in the cache
            for (int j = 0; j < B; j++)
              for (int k = 0; k < B; k++)
                cache2[B*j + k] = X(bj+j, bk+k);

            // compute local operations on the caches
            for (int i = 0; i < B; i++)
             for (int j = 0; j < B; j++)
              for (int k = 0; k < B; k++)
                 res(bi+i,bj+j,bk+k) = cache1[B*i + k] - cache2[B*j + k];
         }
    return _res;
}

And surprisingly, this is the slowest method so far (258 ms per function call).

To summarize, despite my efforts at optimized C++ code, I can't come anywhere close to the 88 ms/call that numpy achieves effortlessly. Any idea why?

Note: I am disabling numpy multi-threading, and in any case this kind of operation is not multi-threaded.

Edit: Exact code to benchmark the numpy code:

import numpy as np

def pairwise_sub_numpy( X ):
    return X - X[:, None, :]

N = 512
X = np.random.rand(N,N).astype(np.float32)

import timeit
times = timeit.repeat('pairwise_sub_numpy( X )', globals=globals(), number=1, repeat=5)
print(f">> best of 5 = {1000*min(times):.3f} ms")

Full benchmark for C code:

#include <stdio.h>
#include <string.h>
#include <xmmintrin.h>   // compile with -mavx -msse4.1
#include <pmmintrin.h>
#include <immintrin.h>
#include <time.h>


#define X(i, j)     _x[(i)*N + (j)]
#define res(i, j, k)  _res[((i)*N + (j))*N + (k)]

float* pairwise_sub_naive( const float* _x, int N ) 
{
    float* _res = (float*) aligned_alloc( 32, N*N*N*sizeof(float));

    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            for (int k = 0; k < N; k++)
                res(i,j,k) = X(i,k) - X(j,k);
          }
    }
    return _res;
}

float* pairwise_sub_better( const float* _x, int N ) 
{
    float* _res = (float*) aligned_alloc( 32, N*N*N*sizeof(float));
    
    for (int i = 0; i < N; i++) {
        const float* xi = & X(i,0);

        for (int j = 0; j < N; j++) {
            const float* xj = & X(j,0);
            
            float* r = &res(i,j,0);
            for (int k = 0; k < N; k++)
                 r[k] = xi[k] - xj[k];
        }
    }
    return _res;
}

float* pairwise_sub_simd( const float* _x, int N ) 
{
    float* _res = (float*) aligned_alloc( 32, N*N*N*sizeof(float));

    // create caches for row vectors which are memory-aligned
    float* xi = (float*)aligned_alloc(32, N * sizeof(float));
    float* xj = (float*)aligned_alloc(32, N * sizeof(float));
    
    for (int i = 0; i < N; i++) {
        memcpy(xi, & X(i,0), N*sizeof(float));
        
        for (int j = 0; j < N; j++) {
            memcpy(xj, & X(j,0), N*sizeof(float));
            
            float* r = &res(i,j,0);
            for (int k = 0; k < N; k += 256/sizeof(float)) {
                const __m256 A = _mm256_load_ps(xi+k);
                const __m256 B = _mm256_load_ps(xj+k);
                _mm256_store_ps(r+k, _mm256_sub_ps( A, B ));
            }
        }
    }
    free(xi); 
    free(xj);
    return _res;
}


float* pairwise_sub_blocks( const float* _x, int N ) 
{
    float* _res = (float*) aligned_alloc( 32, N*N*N*sizeof(float));

    #define B 8
    float cache1[B*B], cache2[B*B];

    for (int bi = 0; bi < N; bi+=B)
      for (int bj = 0; bj < N; bj+=B)
        for (int bk = 0; bk < N; bk+=B) {
        
            // load first 8x8 block in the cache
            for (int i = 0; i < B; i++)
              for (int k = 0; k < B; k++)
                cache1[B*i + k] = X(bi+i, bk+k);

            // load second 8x8 block in the cache
            for (int j = 0; j < B; j++)
              for (int k = 0; k < B; k++)
                cache2[B*j + k] = X(bj+j, bk+k);

            // compute local operations on the caches
            for (int i = 0; i < B; i++)
             for (int j = 0; j < B; j++)
              for (int k = 0; k < B; k++)
                 res(bi+i,bj+j,bk+k) = cache1[B*i + k] - cache2[B*j + k];
         }
    return _res;
}

int main() 
{
    const int N = 512;
    float* _x = (float*) malloc( N * N * sizeof(float) );
    for( int i = 0; i < N; i++)
      for( int j = 0; j < N; j++)
        X(i,j) = ((i+j*j+17*i+101) % N) / (float)N;

    double best = 9e9;
    for( int i = 0; i < 5; i++)
    {
        struct timespec start, stop;
        clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
        
        //float* res = pairwise_sub_naive( _x, N );
        //float* res = pairwise_sub_better( _x, N );
        //float* res = pairwise_sub_simd( _x, N );
        float* res = pairwise_sub_blocks( _x, N );

        clock_gettime(CLOCK_THREAD_CPUTIME_ID, &stop);

        double t = (stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) / 1e3;    // in microseconds
        if (t < best) best = t;
        free( res );
    }
    printf("Best of 5 = %f ms\n", best / 1000);

    free( _x );
    return 0;
}

Compiled using gcc 7.3.0: gcc -Wall -O3 -mavx -msse4.1 -o test_simd test_simd.c

Summary of timings on my machine:

Implementation              Time
numpy                        88 ms
C++ naive                   194 ms
C++ better                  195 ms
C++ SIMD                    178 ms
C++ blocked                 258 ms
C++ blocked (gcc 8.3.1)     217 ms

    your c++ code allocates lots of memory. `pairwise_sub_numpy` doesn't do any of that. Measure allocations and actual computations separately – 463035818_is_not_an_ai Jan 25 '21 at 16:26
    Yes, `numpy` also allocates a N x N x N array that is returned. Also, I measured the allocation time and it is rather negligible (around 0.01 ms per call). – jere0111 Jan 25 '21 at 16:30
    questions about performance require many details. How did you measure? What is the complete numpy code (how do you call it? what exactly do you measure?) How do you compile the C++ code? Which compiler? What flags? – 463035818_is_not_an_ai Jan 25 '21 at 16:32
  • doesn't `pairwise_sub_numpy` return some kind of generator? In that case it wouldn't actually do anything like `pairwise_sub_naive`. – Marek R Jan 25 '21 at 16:37
  • no it returns a numpy array, not a generator. – jere0111 Jan 25 '21 at 16:38
  • `-O3` doesn't generate the fastest code. Its goal is to create a smaller executable. Not that I expect a huge difference between `-O2` and `-O3`. – sweenish Jan 25 '21 at 16:39
    BTW, a small comment: you should not name anything starting with underscore followed directly by an upper-case letter, as this is reserved for the implementation. See [here](https://timsong-cpp.github.io/cppwp/n4659/lex.name#3.1). – florestan Jan 25 '21 at 16:45
    I don't know if this causes the slowdown, but don't you underallocate the result? Your result is `N*N*N` bytes rather than `N*N*N*sizeof(float)` bytes – Paul Hankin Jan 25 '21 at 16:47
    @sweenish `-O3` adds more optimizations, often resulting in a _larger_ binary (though you're correct it's not necessarily a faster one and sometimes slower!); the (rarely-used) `-Os` is for smaller binaries https://gcc.gnu.org/onlinedocs/gcc/Optimize-Options.html https://stackoverflow.com/questions/19689014/gcc-difference-between-o3-and-os – ti7 Jan 25 '21 at 16:48
  • Thanks for the reference. I remembered incorrectly. – sweenish Jan 25 '21 at 16:53
    Post complete code of your C++ benchmark. – Maxim Egorushkin Jan 25 '21 at 16:59
  • @PaulHankin yes, you are right, but my original code was correct. I made the error while copying/editing on stackoverflow. I will post the full c++ benchmark code. I fixed it here. – jere0111 Jan 25 '21 at 17:07
    This sounds like it could be GCC failing to optimize the multiplication. What happens if you rewrite your code to avoid multiplication in the index computation? – user2357112 Jan 25 '21 at 17:38
    Are you sure you didn't mess up something with your measurements? Your numpy code takes 83ms here and your c++ code 92ms for the naive and 60ms for the blocks... – florestan Jan 25 '21 at 17:42
    Interestingly, it runs even faster if I remove the -mavx and leave only -msse4.1 – florestan Jan 25 '21 at 17:48
  • @florestan indeed, the benchmark is now faster than numpy with the blocked version!! But I rewrote the benchmark a bit in the process of posting on Stack Overflow, and I am sure my old timings were correct. I don't really understand where the slowdown happened and why it's not there anymore.... – jere0111 Jan 25 '21 at 18:03
  • @florestan So in fact there was a bug in my #define (missing parenthesis). With the bug fixed, I still can't get closer than about 2x slower than numpy. – jere0111 Jan 25 '21 at 19:57
    GCC 7.3 doesn't autovectorize. Newer GCCs do. Your SIMD code is terrible because it does unnecessary copies; use unaligned loads if you're not sure the input is aligned. – Sopel Jan 25 '21 at 20:16
    numpy is written in C anyway so it might be possible to investigate its source code – M.M Jan 25 '21 at 20:26
  • @Sopel I know, I tried getting rid of it and it doesn't matter. I've also compiled with gcc 8.3.1 and the perf stays the same with SIMD. – jere0111 Jan 25 '21 at 20:58
  • @M.M I looked but couldn't localize it. Numpy source code is really complex. – jere0111 Jan 25 '21 at 20:58
    `aligned_alloc( 32, N*N*N*sizeof(float));` is risky: `N*N*N` might overflow the range of `int`. You should write `sizeof(float)*N*N*N` which forces the full computation to use the potentially larger type `size_t`. – chqrlie Jan 25 '21 at 20:58
    For reference, the NumPy code for the inner loop is [here](https://github.com/numpy/numpy/blob/v1.19.5/numpy/core/src/umath/loops.c.src#L625-L645) and [here](https://github.com/numpy/numpy/blob/v1.19.5/numpy/core/src/umath/fast_loop_macros.h#L187), and once you expand all the code generation stuff, it boils down to a fairly naive loop with no explicit SIMD usage. (I believe there's some chunking going on in outer loops, but I can't point you to the relevant code.) – user2357112 Jan 25 '21 at 21:24
    Wait, no, there is SIMD! https://github.com/numpy/numpy/blob/v1.19.5/numpy/core/src/umath/simd.inc.src#L703 I've been overlooking that file for a while now. I don't know if it triggers here, though. – user2357112 Jan 25 '21 at 21:31
  • Make sure to avoid measuring memory allocations. Maybe your Python environment reuses already allocated memory. I get, for example, 230 ms for your Python example and 65 ms for `np.subtract(X, X[:, None, :], res)` where res is already allocated. I also tried a naive Numba solution, which takes about 43 ms without memory allocation and 211 ms with memory allocation. The same should be possible in C. – max9111 Jan 26 '21 at 13:10

3 Answers


As pointed out in some of the comments, numpy uses SIMD in its implementation and it does not allocate memory at the point of computation. If I eliminate the memory allocation from your implementation, pre-allocating all the buffers ahead of the computation, then I get a better time than numpy, even with the scalar version (that is, the one without any explicit optimizations).

Also, regarding SIMD, the reason your implementation does not perform much better than the scalar one is that your memory access patterns are not ideal for SIMD usage: you do a memcpy and you load into SIMD registers from locations that are far apart from each other, e.g. you fill vectors from line 0 and line 511, which might not play well with the cache or with the hardware prefetcher.

There is also a mistake in how you load the SIMD registers (if I understood correctly what you're trying to compute): a 256-bit SIMD register can hold 8 single-precision floating-point numbers (8 * 32 = 256 bits), but in your loop you advance k by 256/sizeof(float), which is 256/4 = 64. Since _x and _res are float pointers, and the SIMD intrinsics also take float pointers as arguments, instead of reading all the elements of those lines 8 floats at a time you read them every 64 floats.
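
For reference, here is a minimal sketch of the inner loop with the corrected stride (the helper name sub_row_avx is purely illustrative; xi, xj and r are the row pointers from the question's code):

#include <immintrin.h>

// One __m256 register holds 8 floats (256 bits / 32 bits per float),
// so k must advance by 8, not by 256/sizeof(float) == 64.
static void sub_row_avx(const float* xi, const float* xj, float* r, int N)
{
    for (int k = 0; k < N; k += 8) {
        const __m256 A = _mm256_load_ps(xi + k);      // 8 floats from row i
        const __m256 B = _mm256_load_ps(xj + k);      // 8 floats from row j
        _mm256_store_ps(r + k, _mm256_sub_ps(A, B));  // 8 differences at once
    }
}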

The computation can be optimized further by changing the access patterns, but also by observing that you repeat some computations: e.g. when iterating with line 0 as a base you compute line0 - line1, but at some future time, when iterating with line 1 as a base, you need to compute line1 - line0, which is just -(line0 - line1). That is, for each line after line 0, a lot of results could be reused from previous computations. Very often, SIMD usage or parallelization requires changing how data is accessed or reasoned about in order to provide meaningful improvements.
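
As a sketch of that reuse idea (the function pairwise_sub_antisym and its signature are mine, not from the post; whether the extra, scattered stores pay off in practice would need to be measured):

#include <cstddef>
#include <cstring>

// Exploit the antisymmetry output[i][j][k] == -output[j][i][k]:
// do each subtraction only once (for j > i) and write the result
// and its negation to the two symmetric locations.
float* pairwise_sub_antisym(const float* input, float* output, int n)
{
    for (int i = 0; i < n; i++) {
        const float* xi = input + (std::size_t)i * n;

        // diagonal block: line_i - line_i is all zeros
        std::memset(output + ((std::size_t)i * n + i) * n, 0, sizeof(float) * n);

        for (int j = i + 1; j < n; j++) {
            const float* xj  = input + (std::size_t)j * n;
            float*       rij = output + ((std::size_t)i * n + j) * n;  // line_i - line_j
            float*       rji = output + ((std::size_t)j * n + i) * n;  // line_j - line_i
            for (int k = 0; k < n; k++) {
                const float d = xi[k] - xj[k];
                rij[k] =  d;
                rji[k] = -d;   // reused from the same subtraction
            }
        }
    }
    return output;
}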

Here is what I have done as a first step, based on your initial implementation, and it is faster than numpy (don't mind the OpenMP stuff, as it's not how it's supposed to be done; I just wanted to see how the naive approach behaves).

C++
Time scalar version: 55 ms
Time SIMD version: 53 ms
Time SIMD 2 version: 33 ms
Time SIMD 3 version: 168 ms
Time OpenMP version: 59 ms

Python numpy
>> best of 5 = 88.794 ms


#include <cstdlib>
#include <xmmintrin.h>   // compile with -mavx -msse4.1
#include <pmmintrin.h>
#include <immintrin.h>

#include <numeric>
#include <algorithm>
#include <chrono>
#include <iostream>
#include <cstring>

using namespace std;

float* pairwise_sub_naive (const float* input, float* output, int n) 
{
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            for (int k = 0; k < n; k++)
                output[(i * n + j) * n + k] = input[i * n + k] - input[j * n + k];
          }
    }
    return output;
}

float* pairwise_sub_simd (const float* input, float* output, int n) 
{    
    for (int i = 0; i < n; i++) 
    {
        const int idxi = i * n;
        for (int j = 0; j < n; j++)
        {
            const int idxj = j * n;
            const int outidx = idxi + j;
            for (int k = 0; k < n; k += 8) 
            {
                __m256 A = _mm256_load_ps(input + idxi + k);
                __m256 B = _mm256_load_ps(input + idxj + k);
                _mm256_store_ps(output + outidx * n + k, _mm256_sub_ps( A, B ));
            }
        }
    }
    
    return output;
}

float* pairwise_sub_simd_2 (const float* input, float* output, int n) 
{
    float* line_buffer = (float*) aligned_alloc(32, n * sizeof(float));

    for (int i = 0; i < n; i++) 
    {
        const int idxi = i * n;
        for (int j = 0; j < n; j++)
        {
            const int idxj = j * n;
            const int outidx = idxi + j;
            for (int k = 0; k < n; k += 8) 
            {
                __m256 A = _mm256_load_ps(input + idxi + k);
                __m256 B = _mm256_load_ps(input + idxj + k);
                _mm256_store_ps(line_buffer + k, _mm256_sub_ps( A, B ));
            }
            memcpy(output + outidx * n, line_buffer, n * sizeof(float));  // copy the whole row (memcpy takes a byte count)
        }
    }
    
    return output;
}

float* pairwise_sub_simd_3 (const float* input, float* output, int n) 
{    
    for (int i = 0; i < n; i++) 
    {
        const int idxi = i * n;
        for (int k = 0; k < n; k += 8) 
        {
            __m256 A = _mm256_load_ps(input + idxi + k);
            for (int j = 0; j < n; j++)
            {
                const int idxj = j * n;
                const int outidx = (idxi + j) * n;
                __m256 B = _mm256_load_ps(input + idxj + k);
                _mm256_store_ps(output + outidx + k, _mm256_sub_ps( A, B     ));
             }
        }
    }

    return output;
}

float* pairwise_sub_openmp (const float* input, float* output, int n)
{
    int i, j;
    #pragma omp parallel for private(j)
    for (i = 0; i < n; i++) 
    {
        for (j = 0; j < n; j++)
        {
            const int idxi = i * n; 
            const int idxj = j * n;
            const int outidx = idxi + j;
            for (int k = 0; k < n; k += 8) 
            {
                __m256 A = _mm256_load_ps(input + idxi + k);
                __m256 B = _mm256_load_ps(input + idxj + k);
                _mm256_store_ps(output + outidx * n + k, _mm256_sub_ps( A, B ));
            }
        }
    }
    /*for (i = 0; i < n; i++) 
    {
        for (j = 0; j < n; j++) 
        {
            for (int k = 0; k < n; k++)
            {
                output[(i * n + j) * n + k] = input[i * n + k] - input[j * n + k];
            }
        }
    }*/
    
    return output;
}

int main ()
{
    constexpr size_t n = 512;
    constexpr size_t input_size = n * n;
    constexpr size_t output_size = n * n * n;

    float* input = (float*) aligned_alloc(32, input_size * sizeof(float));
    float* output = (float*) aligned_alloc(32, output_size * sizeof(float));

    float* input_simd = (float*) aligned_alloc(32, input_size * sizeof(float));
    float* output_simd = (float*) aligned_alloc(32, output_size * sizeof(float));

    float* input_par = (float*) aligned_alloc(32, input_size * sizeof(float));
    float* output_par = (float*) aligned_alloc(32, output_size * sizeof(float));

    iota(input, input + input_size, float(0.0));
    fill(output, output + output_size, float(0.0));

    iota(input_simd, input_simd + input_size, float(0.0));
    fill(output_simd, output_simd + output_size, float(0.0));
    
    iota(input_par, input_par + input_size, float(0.0));
    fill(output_par, output_par + output_size, float(0.0));

    std::chrono::milliseconds best_scaler{100000};
    for (int i = 0; i < 5; ++i)
    {
        auto start = chrono::high_resolution_clock::now();
        pairwise_sub_naive(input, output, n);
        auto stop = chrono::high_resolution_clock::now();

        auto duration = chrono::duration_cast<chrono::milliseconds>(stop - start);
        if (duration < best_scaler)
        {
            best_scaler = duration;
        }
    }
    cout << "Time scaler version: " << best_scaler.count() << " ms\n";

    std::chrono::milliseconds best_simd{100000};
    for (int i = 0; i < 5; ++i)
    {
        auto start = chrono::high_resolution_clock::now();
        pairwise_sub_simd(input_simd, output_simd, n);
        auto stop = chrono::high_resolution_clock::now();

        auto duration = chrono::duration_cast<chrono::milliseconds>(stop - start);
        if (duration < best_simd)
        {
            best_simd = duration;
        }
    }
    cout << "Time SIMD version: " << best_simd.count() << " ms\n";

    std::chrono::milliseconds best_simd_2{100000};
    for (int i = 0; i < 5; ++i)
    {
        auto start = chrono::high_resolution_clock::now();
        pairwise_sub_simd_2(input_simd, output_simd, n);
        auto stop = chrono::high_resolution_clock::now();

        auto duration = chrono::duration_cast<chrono::milliseconds>(stop - start);
        if (duration < best_simd_2)
        {
            best_simd_2 = duration;
        }
    }
    cout << "Time SIMD 2 version: " << best_simd_2.count() << " ms\n";

    std::chrono::milliseconds best_simd_3{100000};
    for (int i = 0; i < 5; ++i)
    {
        auto start = chrono::high_resolution_clock::now();
        pairwise_sub_simd_3(input_simd, output_simd, n);
        auto stop = chrono::high_resolution_clock::now();

        auto duration = chrono::duration_cast<chrono::milliseconds>(stop - start);
        if (duration < best_simd_3)
        {
            best_simd_3 = duration;
        }
    }
    cout << "Time SIMD 3 version: " << best_simd_3.count() << " ms\n";

    std::chrono::milliseconds best_par{100000};
    for (int i = 0; i < 5; ++i)
    {
        auto start = chrono::high_resolution_clock::now();
        pairwise_sub_openmp(input_par, output_par, n);
        auto stop = chrono::high_resolution_clock::now();

        auto duration = chrono::duration_cast<chrono::milliseconds>(stop - start);
         if (duration < best_par)
        {
            best_par = duration;
        }
    }
    cout << "Time OpenMP version: " << best_par.count() << " ms\n";

    cout << "Verification\n";
    if (equal(output, output + output_size, output_simd))
    {
        cout << "PASSED\n";
    }
    else
    {
        cout << "FAILED\n";
    }

    return 0;
}

Edit: small correction, as there was a wrong call related to the second version of the SIMD implementation.

As you can see now, the second implementation is the fastest, as it behaves best from the point of view of cache locality of reference. Examples 2 and 3 of the SIMD implementations are there to illustrate how changing memory access patterns influences the performance of your SIMD optimizations.

To summarize (knowing that I'm far from complete in my advice): be mindful of your memory access patterns and of the loads and stores to/from the SIMD unit. The SIMD unit is a separate hardware unit inside the processor's core, so there is a penalty in shuffling data back and forth; hence, when you load a register from memory, try to do as many operations as possible with that data and do not be too eager to store it back (of course, in your example that might be all you need to do with the data). Be mindful also that there is a limited number of SIMD registers available, and if you load too many they will "spill", that is, they will be stored back to temporary locations in memory behind the scenes, killing all your gains. SIMD optimization is a true balancing act!

There is some effort to put a cross-platform intrinsics wrapper into the standard (I developed a closed-source one myself in my glorious past), and even though it's far from complete, it's worth taking a look at (read the accompanying papers if you're truly interested in learning how SIMD works): https://github.com/VcDevel/std-simd

    Thank you so much for your answer. Indeed, removing the allocation from my naive code, I get the same perf as you do (40 ms per function call on my laptop). But I'm still puzzled: if I execute an empty function that only allocates the array and returns it, it's super fast (0.003 ms per call). How can that be? – jere0111 Jan 27 '21 at 10:06
    It's all in the optimizations the compiler is able to make. When you have more code around your allocations, the compiler might not be able to assume as much about the context as when you have just an allocation. If you want to go deeper into the rabbit hole, then take your code from both situations, put it into Compiler Explorer (https://godbolt.org/) and look at the assembly code both with and without optimizations, to see what the compiler does behind the scenes - you might be surprised by the result. – celavek Jan 27 '21 at 10:11
    Ok. Also, you said that numpy "does not allocate memory at the point of computation". What does that mean? Does that mean that numpy is pre-allocating memory? But it cannot know the resulting size in advance, before the operation is performed, can it? So it still should allocate the memory at the time the operation is performed, which should have appeared in the final measured time... (?) – jere0111 Jan 27 '21 at 10:23
  • "N = 512 X = np.random.rand(N,N).astype(np.float32)" That is where numpy(behind the scenes) allocates memory for you which is not at the point of computation, that is where you actually do your pairwise subtraction in the "pairwise_sub_numpy" function. – celavek Jan 27 '21 at 14:46

This is a complement to the answer posted by @celavek. I think I finally understand what exactly the issue was. The issue was not about allocating the memory in the main function that does the computation.

What was actually taking time is accessing new (fresh) memory. I believe that the malloc call returns pages of memory that are virtual, i.e. that do not correspond to actual physical memory until they are explicitly accessed. What actually takes time is the process of mapping in physical memory on the fly (which I think happens at the OS level) when the memory is first accessed in the function code.

Here is a proof. Consider the following two trivial functions:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

float* just_alloc( size_t N ) 
{
    return (float*) aligned_alloc( 32, sizeof(float)*N );
}

void just_fill( float* _arr, size_t N ) 
{
    for (size_t i = 0; i < N; i++)
        _arr[i] = 1;
}

#define Time( code_to_benchmark, cleanup_code ) \
    do { \
        double best = 9e9; \
        for( int i = 0; i < 5; i++) { \
            struct timespec start, stop; \
            clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start); \
            code_to_benchmark; \
            clock_gettime(CLOCK_THREAD_CPUTIME_ID, &stop); \
            double t = (stop.tv_sec - start.tv_sec) * 1e3 + (stop.tv_nsec - start.tv_nsec) / 1e6; \
            printf("Time[%d] = %f ms\n", i, t); \
            if (t < best) best = t; \
            cleanup_code; \
        } \
        printf("Best of 5 for '" #code_to_benchmark "' = %f ms\n\n", best); \
    } while(0)

int main() 
{
    const size_t N = 512;

    Time( float* arr = just_alloc(N*N*N), free(arr) );
    
    float* arr = just_alloc(N*N*N);
    Time( just_fill(arr, N*N*N), ; );
    free(arr);

    return 0;
}

I get the following timings, which I now detail for each of the calls:

Time[0] = 0.000931 ms
Time[1] = 0.000540 ms
Time[2] = 0.000523 ms
Time[3] = 0.000524 ms
Time[4] = 0.000521 ms
Best of 5 for 'float* arr = just_alloc(N*N*N)' = 0.000521 ms

Time[0] = 189.822237 ms
Time[1] = 45.041083 ms
Time[2] = 46.331428 ms
Time[3] = 44.729433 ms
Time[4] = 42.241279 ms
Best of 5 for 'just_fill(arr, N*N*N)' = 42.241279 ms

As you can see, allocating memory is blazingly fast, but the first time the memory is accessed it is roughly 4-5 times slower than on subsequent accesses. So, basically, the reason my code was slow was that I was reallocating fresh memory every time, and that memory had no physical address yet. (Correct me if I'm wrong, but I think that's the gist of it!)
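
For completeness, here is a minimal sketch of how one could keep that cost out of the timed region (the helper name alloc_prefaulted is purely illustrative, not from the post): touch every page once right after allocating, or simply reuse the same buffer across calls, so that the page faults happen before the benchmark starts.

#include <cstdlib>
#include <cstring>

// Allocate and touch every byte once so the OS maps physical pages up front;
// the timed code then no longer pays the page-fault cost on first access.
static float* alloc_prefaulted(size_t n_floats)
{
    float* p = (float*) aligned_alloc(32, sizeof(float) * n_floats);
    if (p)
        memset(p, 0, sizeof(float) * n_floats);  // forces the page faults here
    return p;
}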

    "So, basically the reason that my code was slow was because i was each time reallocating fresh memory that had no physical address yet. (Correct me if I'm wrong but I think that's the gist of it!)" So it was about memory allocation in the end :) – celavek Jan 27 '21 at 14:22
  • The process you just described is part of the memory allocation - malloc & friends do call an operating system primitive (OS-specific) in order to actually allocate the memory, as it's the OS which manages that. That also has a price, as you are "traversing" from user space to kernel space. Some people use custom allocators to "avoid" part of that – celavek Jan 27 '21 at 14:51

A bit late to the party, but I wanted to add a pairwise method with Eigen, which is supposed to give C++ high-level algebra manipulation capabilities and uses SIMD under the hood, just like numpy.

Here is the implementation

#include <iostream>
#include <vector>
#include <chrono>
#include <algorithm>
        
#include <Eigen/Dense>

auto pairwise_eigen(const Eigen::MatrixXf &input, std::vector<Eigen::MatrixXf> &output) {
        for (int k = 0; k < input.cols(); ++k)
                output[k] = input
                          // subtract matrix with repeated k-th column
                          - input.col(k) * Eigen::RowVectorXf::Ones(input.cols());

}

int main() {
        constexpr size_t n = 512;
        
        // allocate input and output 
        Eigen::MatrixXf input = Eigen::MatrixXf::Random(n, n);
        std::vector<Eigen::MatrixXf> output(n);

        std::chrono::milliseconds best_eigen{100000};
        for (int i = 0; i < 5; ++i) {
                auto start = std::chrono::high_resolution_clock::now();
                pairwise_eigen(input, output);
                auto end = std::chrono::high_resolution_clock::now();
         
                auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end-start);      
                if (duration < best_eigen)
                        best_eigen = duration;
        }

        std::cout << "Time Eigen version: " << best_eigen.count() << " ms\n";

        return 0;
}

The full benchmark suggested by @celavek gives, on my system:

Time scalar version: 57 ms
Time SIMD version: 58 ms
Time SIMD 2 version: 40 ms
Time SIMD 3 version: 58 ms
Time OpenMP version: 58 ms

Time Eigen version: 76 ms

Numpy >> best of 5 = 118.489 ms

With Eigen there is still a noticeable improvement with respect to numpy, but it is not so impressive compared to the "raw" implementations (there is certainly some overhead). An extra optimization is to allocate the output vector with copies of the input and then subtract directly from each vector entry, simply replacing the following lines:

// inside the pairwise method
        for (int k = 0; k < input.cols(); ++k)
                output[k] -= input.col(k) * Eigen::RowVectorXf::Ones(input.cols());


// at allocation time
        std::vector<Eigen::MatrixXf> output(n, input);

This pushes the best of 5 down to 60 ms.
