
I've got a Jacobi implementation on CUDA, but the problem is:

I assign threads this way:

#define imin(a,b) ((a) < (b) ? (a) : (b))
int  dimBlocks, dimThreads;
dimThreads = 256;
dimBlocks =  imin(32, (dimThreads + dim - 1)/dimThreads);

But if I use 32 threads it's faster than using 256 threads or more...

I've got these results:

Sequential times:
9900 5.882000 
9900 6.071000

Parallel times:
9900 1.341000 //using 32
9900 1.626000 //using 256

Where 9900 is the matrix width. And we can see the following:

5.882 / 1.34 = 4.39
6.07 / 1.62 = 3.74

So are 32 threads more efficient than 256?

Sorry, I don't know if I should upload the code (since it is a bit long); if you request it, I will.

EDIT:

//Based on doubletony algorithm
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "Jacobi.cuh"

#include "thrust\host_vector.h"
#include "thrust\device_vector.h"
#include "thrust\extrema.h"

#include <cstdio>
#include <cstdlib>
#include <cmath>
#include <ctime>

#define imin(a,b) ((a) < (b) ? (a) : (b))

// NAME OF FUNCTION: __copy_vector
// PURPOSE:
//    The function will copy a vector.
//
// PARAMETERS:
// name         type      value/reference   description
// ---------------------------------------------------------------------
// source       double*   value             vector to be copied
// dest         double*   reference         vector copied
// dim          int       value             vector dimension
// RETURN VALUE:
// name         type  description
// ---------------------------------------------------------------------
//
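// (functionally equivalent to cudaMemcpy with cudaMemcpyDeviceToDevice)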
__global__ void __copy_vector(double *source, double *dest, const int  dim)
{
    int  tIdx = blockDim.x * blockIdx.x + threadIdx.x;
    while(tIdx < dim){
        dest[tIdx] = source[tIdx];
        tIdx += gridDim.x * blockDim.x;
    }
}

// NAME OF FUNCTION: __Jacobi_sum
// PURPOSE:
//    The function will execute matrix vector multiplication
//
// PARAMETERS:
// name         type      value/reference   description
// ---------------------------------------------------------------------
// A            double*   value             A
// B            double*   value             B
// resul        double*   reference         A*B
// dim          int       value             vector dimension
// RETURN VALUE:
// name         type  description
// ---------------------------------------------------------------------
//
__global__ void __Jacobi_sum(const double *A, 
                             const double *B, 
                             double *resul, 
                             const int  dim)
{
    int  tIdx = blockIdx.x * blockDim.x + threadIdx.x;
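    // each thread accumulates one row's dot product; note that for a fixed i,
    // neighbouring threads read A[tIdx * dim + i] at addresses dim doubles
    // apart, so these global loads are not coalesced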
    while(tIdx < dim){
        resul[tIdx] = 0;
        for(int i = 0; i < dim; i++)
            if(tIdx != i)
                resul[tIdx] += A[tIdx * dim + i] * B[i];
        tIdx += gridDim.x * blockDim.x;
    }
    __syncthreads(); // (a barrier at the very end of a kernel is not actually needed)
}
// NAME OF FUNCTION: __substract
// PURPOSE:
//    The function will compute C = A - B
//
// PARAMETERS:
// name         type      value/reference   description
// ---------------------------------------------------------------------
// A            double*   value             A
// B            double*   value             B
// C            double*   reference         A-B
// dim          int       value             vector dimension
// RETURN VALUE:
// name         type  description
// ---------------------------------------------------------------------
//
__global__ void __substract(const double *A, 
                            const double *B, 
                            double *C, 
                            const int  dim)
{
    int  tIdx = blockIdx.x * blockDim.x + threadIdx.x;
    while(tIdx < dim){
        C[tIdx] = A[tIdx] - B[tIdx];
        tIdx += gridDim.x * blockDim.x;
    }
}
// NAME OF FUNCTION: __divide
// PURPOSE:
//    The function will execute the Jacobi division, that is,
//    (B-sum)/A[i,i]
//
// PARAMETERS:
// name         type      value/reference   description
// ---------------------------------------------------------------------
// A            double*   value             A
// B            double*   reference         (B-sum)/A[i,i]
// dim          int       value             vector dimension
// RETURN VALUE:
// name         type  description
// ---------------------------------------------------------------------
//
__global__ void __divide(const double *A, double *B, const int dim)
{
    int  tIdx = blockIdx.x * blockDim.x + threadIdx.x;
    while(tIdx < dim){
        //if(A[tIdx * dim + tIdx] != 0)
            B[tIdx] /= A[tIdx * dim + tIdx];
        tIdx += blockDim.x * gridDim.x;
    }
}
// NAME OF FUNCTION: __absolute
// PURPOSE:
//    The function will calculate the absolute value for each
//    number in an array
//
// PARAMETERS:
// name         type      value/reference   description
// ---------------------------------------------------------------------
// A            double*   reference         |A[i]|
// dim          int       value             vector dimension
// RETURN VALUE:
// name         type  description
// ---------------------------------------------------------------------
//
__global__ void __absolute(double *A, const int dim)
{
    int  tIdx = blockIdx.x * blockDim.x + threadIdx.x;
    while(tIdx < dim){
        if(A[tIdx] < 0)
            A[tIdx] = -A[tIdx];
        tIdx += blockDim.x * gridDim.x;
    }
}
// NAME OF FUNCTION: Jacobi_Cuda
// PURPOSE:
//    The function will calculate a X solution for a linear system
//    using Jacobi's Method.
//
// PARAMETERS:
// name         type      value/reference   description
// ---------------------------------------------------------------------
// Matrix_A     double*   value             Matrix A(coefficients)
// Vector_B     double*   value             Vector B
// Vector_X     double*   reference         Solution
// dim          int       value             Matrix Dimension
// e            double    value             Error allowed
// maxIter      int       value             Maximum iterations allowed
// RETURN VALUE:
// name         type  description
// ---------------------------------------------------------------------
//

void Jacobi_Cuda(const double *Matrix_A, 
                 const double *Vector_B,
                 double *Vector_X, 
                 const  int  dim, 
                 const double e,
                 const int  maxIter,
                 double *t)
{

    /** Host variables **/
    int iter = 0; // iter counter
    double err = 1; // error between X^k and X^k-1
    double *tmp; // temporary for thrust norm
    double *norm; // Vector norm

    tmp = (double *) malloc(sizeof(double) * dim);
    norm = (double *) malloc(sizeof(double));

    int  dimBlocks, dimThreads;
    dimThreads = 64;
    dimBlocks =  imin(32, (dim + dimThreads - 1)/dimThreads);
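    // the grid is capped at 32 blocks, so at most 32 * dimThreads threads run;
    // the grid-stride loops inside the kernels cover the remaining elements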
    /** ************** **/

    /** Device variables **/
    double *d_Matrix_A, *d_Vector_B, *d_Vector_X, *d_Vector_Y, *d_Vector_Resul;

    cudaMalloc((void**)&d_Matrix_A, sizeof(double) * dim * dim);
    cudaMalloc((void**)&d_Vector_B, sizeof(double) * dim);
    cudaMalloc((void**)&d_Vector_X, sizeof(double) * dim);
    cudaMalloc((void**)&d_Vector_Y, sizeof(double) * dim);
    cudaMalloc((void**)&d_Vector_Resul, sizeof(double) * dim);

    /** **************** **/

    /** Initialize **/
    cudaMemcpy(d_Matrix_A, Matrix_A, sizeof(double) * dim * dim, 
                cudaMemcpyHostToDevice);
    cudaMemcpy(d_Vector_B, Vector_B, sizeof(double) * dim, cudaMemcpyHostToDevice);
    cudaMemcpy(d_Vector_X, Vector_X, sizeof(double) * dim, cudaMemcpyHostToDevice);
    /** ********** **/


    clock_t start,finish;
    double totaltime;
    start = clock(); 

    /** Jacobi **/
    while(err > e && iter < maxIter){

        __copy_vector<<<dimBlocks, dimThreads>>>(d_Vector_X, d_Vector_Y, dim);

        __Jacobi_sum<<<dimBlocks, dimThreads>>>(d_Matrix_A, d_Vector_Y, 
                                                d_Vector_Resul, dim); 
        __substract<<<dimBlocks, dimThreads>>>(d_Vector_B, d_Vector_Resul, 
                                                d_Vector_X, dim);

        __divide<<<dimBlocks, dimThreads>>>(d_Matrix_A, d_Vector_X, dim);

        __substract<<<dimBlocks, dimThreads>>>(d_Vector_Y, d_Vector_X,
                                                d_Vector_Resul, dim);
        __absolute<<<dimBlocks, dimThreads>>>(d_Vector_Resul, dim);

        cudaMemcpy(tmp, d_Vector_Resul, sizeof(double) * dim, cudaMemcpyDeviceToHost);

        // infinity-norm of X^k - X^(k-1); thrust::max_element on raw host
        // pointers runs on the host (renamed from t to avoid shadowing the
        // timing parameter t)
        double *max_err = thrust::max_element(tmp, tmp + dim);

        err = *max_err;

        iter++;
    }

    finish = clock();

    totaltime=(double)(finish-start)/CLOCKS_PER_SEC;   

    *t = totaltime;

    cudaMemcpy(Vector_X, d_Vector_X, sizeof(double) * dim, 
                cudaMemcpyDeviceToHost);
    if(iter == maxIter)
        puts("Jacobi has reached maxIter!");
    /** ****** **/

    /** Free memory **/
    cudaFree(d_Matrix_A);
    cudaFree(d_Vector_B);
    cudaFree(d_Vector_X);
    cudaFree(d_Vector_Y);
    cudaFree(d_Vector_Resul);
    free(tmp);
    free(norm);
    /** *********** **/
}
FacundoGFlores
  • The only way to correctly answer this question is by looking at the code itself. But, from the looks of it, it may be that your code has a lot of synchronization points, and synchronizing 32 threads is quicker than synchronizing 256 – Toote Aug 17 '12 at 21:50
  • OK... Should I upload it by editing my post, or maybe upload the code to pastebin or something else? Because it's a bit long... – FacundoGFlores Aug 17 '12 at 22:08
  • You have to read more about context switching, which under certain circumstances makes good algorithms sooooo slooooow. – pinepain Aug 17 '12 at 22:17
  • @pinepain - the question is about CUDA threads not CPU threads. There is no GPU context switching in the question. While there is minimal overhead to create a warp (32 threads) there is no overhead involved in switching between warps. – Greg Smith Aug 18 '12 at 02:45
  • @GregSmith yeah, there is no _traditional_ context switching, but in two words GPU threading mechanism is described [in this answer](http://stackoverflow.com/a/6606033/1461984). – pinepain Aug 18 '12 at 07:38
  • @pinepain thank you for the comment. So, let me see: you're telling me that when I use a certain number of threads I set a certain number of warps, and if this number is enough, then we get the best performance? – FacundoGFlores Aug 18 '12 at 13:09

2 Answers


It depends on your algorithm. Some algorithms are by definition non-parallelizable (calculating the Fibonacci series, for example). But here's a parallelizable Jacobi algorithm courtesy of Brown. Note that systems of equations can be solved either in serial or in parallel; it's just a matter of writing the code.

In short, it's impossible to know whether more threads = more speed unless you show us (or at least explain) the algorithm. As far as thread synchronization goes, CUDA is very (very) good at normalizing synchronization costs, so (if your algorithm is sound) more threads should almost always yield more speed.
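As a quick sanity check, you can time a single launch under different block sizes with CUDA events. Here is a minimal sketch (it reuses imin and __copy_vector from the question; time_config is just an illustrative name):

// Times one launch of __copy_vector for a given block size, in milliseconds.
float time_config(int dimThreads, int dim, double *d_in, double *d_out)
{
    int dimBlocks = imin(32, (dim + dimThreads - 1) / dimThreads);

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);
    __copy_vector<<<dimBlocks, dimThreads>>>(d_in, d_out, dim);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop); // wait until the kernel has finished

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return ms;
}

Sweeping dimThreads over 32, 64, 128 and 256 for each kernel separately would show which of the six launches in the loop actually dominates the runtime.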

David Titarenco
  • Even if synchronization is very efficient, if you create more threads than you have CPU cores when number crunching, efficiency cannot be increased, because the scheduler has to timeslice the threads, so the other threads have to wait for CPU time. – ckruse Aug 17 '12 at 21:56
  • @ckruse - CUDA uses the video card for processing. The GTX 680, for example, has 1536 CUDA cores. It does not appear that he's breaching that limit :P You are otherwise correct. – David Titarenco Aug 17 '12 at 21:57
  • Does CUDA offer a scheduler?? – Kuba hasn't forgotten Monica Aug 17 '12 at 22:01
  • @KubaOber - yep. Not only that, but over-utilization is extremely rare. Under-utilization seems to be a much bigger issue. Source: http://www.cs.virginia.edu/kim/docs/pmea09.pdf – David Titarenco Aug 17 '12 at 22:03
  • @DavidTitarenco: The Fibonacci series is parallelizable via prefix sum – Jared Hoberock Aug 18 '12 at 05:15
  • @JaredHoberock - was not aware of that, +1! – David Titarenco Aug 18 '12 at 07:34
  • @DavidTitarenco can you explain how the algorithm relates to the thread configuration? Someone told me that 64 or 32 threads are more efficient than 256 because it depends on the matrix width; do you think the same? – FacundoGFlores Aug 18 '12 at 13:02
  • @KubaOber the paper you referenced is for a GTX 280 which is a 1.x device and did not support concurrent kernels or many of the enhancements in both the CUDA work distributor or SM warp scheduler. Utilization can be measured both at the GPU level (SM load balancing) and the SM level (active warps/cycle). Current NV GPUs can only support a single context at a time which means utilization is dependent on the concurrency in the program and the parallelism of each kernel. The base occupancy level for NV GPUs has increased with each architecture as the ratio of maxwarps:schedulers has decreased. – Greg Smith Aug 19 '12 at 21:52

Fewer threads might be more efficient if the workload is small enough that the overhead of managing many threads causes the performance degradation.

...but without seeing your code it's hard to say. Personally I'm more inclined to believe it's just a bug in your code.
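One cheap way to rule a bug out: none of the CUDA calls in the posted code check their return values, so a failed allocation or launch would go completely unnoticed. A minimal sketch of the usual checking pattern (the CHECK name is just illustrative):

#include <cstdio>
#include <cstdlib>
#include "cuda_runtime.h"

// Aborts on the first failing CUDA runtime call, printing where it happened.
#define CHECK(call)                                                   \
    do {                                                              \
        cudaError_t err_ = (call);                                    \
        if (err_ != cudaSuccess) {                                    \
            fprintf(stderr, "CUDA error %s at %s:%d\n",               \
                    cudaGetErrorString(err_), __FILE__, __LINE__);    \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while (0)

// Usage: wrap every runtime call, and after each kernel launch check both
// the launch itself and the kernel's execution:
//     CHECK(cudaMalloc((void**)&d_Vector_X, sizeof(double) * dim));
//     __divide<<<dimBlocks, dimThreads>>>(d_Matrix_A, d_Vector_X, dim);
//     CHECK(cudaGetLastError());        // launch-time errors
//     CHECK(cudaDeviceSynchronize());   // execution-time errors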

Dai
  • What is the "overhead of managing many threads" in a GPU? Remember that this runs on a GPU using CUDA, it's not using operating system threads... – Kuba hasn't forgotten Monica Aug 17 '12 at 22:00
  • Yes, but CUDA still needs to manage those GPU threads (even if there are hundreds of them running in parallel). – Dai Aug 17 '12 at 22:26
  • The cost to launch a block is very small and can be done in parallel. The cost to launch a warp (32 threads) is several cycles. The warp scheduler can switch between warps every cycle with no cost. – Greg Smith Aug 18 '12 at 02:49