While working on a project of mine, I noticed that CUDALink shows very poor performance when calling CUDA kernels multiple times in a loop construct.
To demonstrate this I put together this little example which calls a simple array addition kernel multiple times:
(* Load CUDALink and compile a simple element-wise array-addition kernel. *)
<< CUDALink`
src = "__global__ void addKernel(float *c, const float *a, const \
float *b)
{
int i = threadIdx.x + blockIdx.x * blockDim.x;
c[i] = a[i] + b[i];
}";
(* One thread per element; block size 256, single-precision arithmetic. *)
fun = CUDAFunctionLoad[src, "addKernel", {{"Float", _, "Output"}, {"Float", _, "Input"},
{"Float", _, "Input"}}, 256, TargetPrecision -> "Single"];

(* Host-side data: three constant vectors of 256*256 = 65536 elements. *)
a = ConstantArray[3., 256 * 256];
b = ConstantArray[2., 256 * 256];
c = ConstantArray[1., 256 * 256];

(* Register the arrays with CUDALink and push them to the device up front,
   so the timing loop below measures kernel-launch overhead only. *)
gpuA = CUDAMemoryLoad[a, TargetPrecision -> "Single"];
gpuB = CUDAMemoryLoad[b, TargetPrecision -> "Single"];
gpuC = CUDAMemoryLoad[c, TargetPrecision -> "Single"];
CUDAMemoryCopyToDevice[gpuA];
CUDAMemoryCopyToDevice[gpuB];
CUDAMemoryCopyToDevice[gpuC];

(* For each max from 1 to 1000, time max consecutive kernel launches and
   collect the elapsed wall-clock seconds in runtimes. *)
runtimes = {};
Do[
 AppendTo[runtimes,
  First @ AbsoluteTiming @ Do[fun[gpuC, gpuA, gpuB], {max}]],
 {max, 1, 1000}]
Then I called the same kernel code from a C++ program compiled with Visual Studio 2012, using the same array length and the same number of iterations (this is essentially the NVIDIA sample code that comes up when you start a new CUDA project):
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <time.h>

#include <fstream>
#include <iomanip>
#include <vector>
// NOTE(review): leftover forward declaration from the Visual Studio CUDA
// project template; addWithCuda is never defined or called in this file
// (and its int* signature does not match the float kernel below).
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size);
// Element-wise vector addition: c[idx] = a[idx] + b[idx].
// One thread per element; the launch configuration must supply exactly
// one thread for every array element (no bounds check is performed).
__global__ void addKernel(float *c, const float *a, const float *b)
{
    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
    c[idx] = a[idx] + b[idx];
}
// Benchmarks repeated kernel launches: for max = 1..1000 the inner loop
// launches addKernel max times back-to-back; the wall-clock time (seconds)
// of each batch is written to runtimes.txt, one line per value of max.
// Returns 0 on success, 1 if cudaDeviceReset fails.
int main()
{
    const int arraySize = 256*256;

    // Host buffers live on the heap: three 256 KB stack arrays (~768 KB
    // total, as in `float a[arraySize]`) can overflow MSVC's default 1 MB
    // stack, so use std::vector instead.
    std::vector<float> a(arraySize);
    std::vector<float> b(arraySize);
    std::vector<float> c(arraySize);
    for (int i = 0; i < arraySize; i++)
    {
        a[i] = 1.0f;
        b[i] = 2.0f;
        c[i] = 0.0f;
    }

    float *dev_a = 0;
    float *dev_b = 0;
    float *dev_c = 0;
    cudaError_t cudaStatus;

    std::fstream runtimes;
    runtimes.open("runtimes.txt", std::fstream::out);

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers for three vectors (two input, one output).
    cudaStatus = cudaMalloc((void**)&dev_c, arraySize * sizeof(float));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_a, arraySize * sizeof(float));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_b, arraySize * sizeof(float));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy input vectors from host memory to GPU buffers (done once, up
    // front, so the timing loop measures launch overhead only).
    cudaStatus = cudaMemcpy(dev_a, a.data(), arraySize * sizeof(float), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    cudaStatus = cudaMemcpy(dev_b, b.data(), arraySize * sizeof(float), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    clock_t timer;

    // Get runtimes
    //--------------------------------------------
    for (int max = 1; max <= 1000; max++)
    {
        // Drain any work still in flight so the timer starts clean.
        cudaDeviceSynchronize();
        timer = clock();
        for (int i = 1; i <= max; i++)
        {
            // Launch a kernel on the GPU with one thread for each element.
            addKernel<<<256, 256>>>(dev_c, dev_a, dev_b);

            // Kernel launches are asynchronous; launch-configuration errors
            // only surface via cudaGetLastError.
            cudaStatus = cudaGetLastError();
            if (cudaStatus != cudaSuccess) {
                fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
                goto Error;
            }
        }
        // cudaDeviceSynchronize waits for the kernels to finish, and returns
        // any errors encountered during execution.
        cudaStatus = cudaDeviceSynchronize();
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
            goto Error;
        }
        timer = clock() - timer;
        // NOTE: clock() resolution is coarse (CLOCKS_PER_SEC ticks); for
        // finer-grained GPU timing, cudaEvent_t would be preferable.
        runtimes << std::setprecision(9) << ((double)timer / (double)CLOCKS_PER_SEC) << std::endl;
    }
    //---------------------------------------------

    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c.data(), dev_c, arraySize * sizeof(float), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    // Shared cleanup path: cudaFree(0) is a harmless no-op, so partial
    // allocation failures are handled uniformly.
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);
    runtimes.close();

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }
    return 0;
}
The resulting runtimes (on my GT650M):

Can anybody explain this behavior, and tell me how to adjust my Mathematica code to avoid it?
Do[CUDALink`Private`cLaunchKernel["Double"][], {1000}]) compared to 0.9s for Do[fun[gpuC, gpuA, gpuB], {1000}]. The overhead is validity checks on the function arguments and options, calculating the grid size, and multiple calls to the CUDALink DLL to set the cubin binary, the kernel function, the function arguments and the block and grid sizes, then launch the kernel and finally deal with the output. – Simon Woods Jan 25 '14 at 22:38

[…] the CUDAFunction interface and not worry about the details. If you want to dig into the Mathematica code behind CUDAFunction you can use the "Spelunking" package here. However, it will be a lot of guesswork and experimentation to get anything working. It might end up being easier to write something in C and call that, or (if possible) change your algorithm to do a few large calculations rather than many small ones. – Simon Woods Jan 26 '14 at 19:58