While working on a project of mine, I noticed that CUDALink shows very poor performance when calling CUDA kernels multiple times in a loop construct.
To demonstrate this I put together this little example which calls a simple array addition kernel multiple times:
(* Load CUDALink and compile a simple element-wise array-addition kernel. *)
<< CUDALink`
src = "__global__ void addKernel(float *c, const float *a, const \
float *b)
{
int i = threadIdx.x + blockIdx.x * blockDim.x;
c[i] = a[i] + b[i];
}";
(* One thread per element; block size 256, single-precision arithmetic. *)
fun = CUDAFunctionLoad[src, "addKernel", {{"Float", _, "Output"}, {"Float", _, "Input"},
{"Float", _, "Input"}}, 256, TargetPrecision -> "Single"];

(* Host-side data: three constant vectors of 256*256 = 65536 elements. *)
a = ConstantArray[3., 256 * 256];
b = ConstantArray[2., 256 * 256];
c = ConstantArray[1., 256 * 256];

(* Register the arrays with CUDALink and push them to the device up front,
   so the timing loop below measures kernel-launch overhead only. *)
gpuA = CUDAMemoryLoad[a, TargetPrecision -> "Single"];
gpuB = CUDAMemoryLoad[b, TargetPrecision -> "Single"];
gpuC = CUDAMemoryLoad[c, TargetPrecision -> "Single"];
CUDAMemoryCopyToDevice[gpuA];
CUDAMemoryCopyToDevice[gpuB];
CUDAMemoryCopyToDevice[gpuC];

(* For each max from 1 to 1000, time max consecutive kernel launches and
   collect the elapsed wall-clock seconds in runtimes. *)
runtimes = {};
Do[
 AppendTo[runtimes,
  First @ AbsoluteTiming @ Do[fun[gpuC, gpuA, gpuB], {max}]],
 {max, 1, 1000}]
Then I called the same kernel code from a C++ program compiled with Visual Studio 2012, using the same array length and the same number of iterations (this is essentially the NVIDIA sample code that comes up when you start a new CUDA project):
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <time.h>

#include <fstream>
#include <iomanip>
#include <vector>
// NOTE(review): leftover forward declaration from the Visual Studio CUDA
// project template; addWithCuda is never defined or called in this file
// (and its int* signature does not match the float kernel below).
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size);
// Element-wise vector addition: c[idx] = a[idx] + b[idx].
// One thread per element; the launch configuration must supply exactly
// one thread for every array element (no bounds check is performed).
__global__ void addKernel(float *c, const float *a, const float *b)
{
    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
    c[idx] = a[idx] + b[idx];
}
// Benchmarks repeated kernel launches: for max = 1..1000 the inner loop
// launches addKernel max times back-to-back; the wall-clock time (seconds)
// of each batch is written to runtimes.txt, one line per value of max.
// Returns 0 on success, 1 if cudaDeviceReset fails.
int main()
{
    const int arraySize = 256*256;

    // Host buffers live on the heap: three 256 KB stack arrays (~768 KB
    // total, as in `float a[arraySize]`) can overflow MSVC's default 1 MB
    // stack, so use std::vector instead.
    std::vector<float> a(arraySize);
    std::vector<float> b(arraySize);
    std::vector<float> c(arraySize);
    for (int i = 0; i < arraySize; i++)
    {
        a[i] = 1.0f;
        b[i] = 2.0f;
        c[i] = 0.0f;
    }

    float *dev_a = 0;
    float *dev_b = 0;
    float *dev_c = 0;
    cudaError_t cudaStatus;

    std::fstream runtimes;
    runtimes.open("runtimes.txt", std::fstream::out);

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers for three vectors (two input, one output).
    cudaStatus = cudaMalloc((void**)&dev_c, arraySize * sizeof(float));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_a, arraySize * sizeof(float));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_b, arraySize * sizeof(float));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy input vectors from host memory to GPU buffers (done once, up
    // front, so the timing loop measures launch overhead only).
    cudaStatus = cudaMemcpy(dev_a, a.data(), arraySize * sizeof(float), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    cudaStatus = cudaMemcpy(dev_b, b.data(), arraySize * sizeof(float), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    clock_t timer;

    // Get runtimes
    //--------------------------------------------
    for (int max = 1; max <= 1000; max++)
    {
        // Drain any work still in flight so the timer starts clean.
        cudaDeviceSynchronize();
        timer = clock();
        for (int i = 1; i <= max; i++)
        {
            // Launch a kernel on the GPU with one thread for each element.
            addKernel<<<256, 256>>>(dev_c, dev_a, dev_b);

            // Kernel launches are asynchronous; launch-configuration errors
            // only surface via cudaGetLastError.
            cudaStatus = cudaGetLastError();
            if (cudaStatus != cudaSuccess) {
                fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
                goto Error;
            }
        }
        // cudaDeviceSynchronize waits for the kernels to finish, and returns
        // any errors encountered during execution.
        cudaStatus = cudaDeviceSynchronize();
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
            goto Error;
        }
        timer = clock() - timer;
        // NOTE: clock() resolution is coarse (CLOCKS_PER_SEC ticks); for
        // finer-grained GPU timing, cudaEvent_t would be preferable.
        runtimes << std::setprecision(9) << ((double)timer / (double)CLOCKS_PER_SEC) << std::endl;
    }
    //---------------------------------------------

    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c.data(), dev_c, arraySize * sizeof(float), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    // Shared cleanup path: cudaFree(0) is a harmless no-op, so partial
    // allocation failures are handled uniformly.
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);
    runtimes.close();

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }
    return 0;
}
The resulting runtimes (on my GT650M):

Can anybody explain this behavior, and tell me how to adjust my Mathematica code to avoid it?
Do[CUDALink`Private`cLaunchKernel["Double"][], {1000}]) compared to 0.9s for Do[fun[gpuC, gpuA, gpuB], {1000}]. The overhead is validity checks on the function arguments and options, calculating the grid size, and multiple calls to the CUDALink DLL to set the cubin binary, the kernel function, the function arguments and the block and grid sizes, then launch the kernel and finally deal with the output. – Simon Woods Jan 25 '14 at 22:38

[…] the CUDAFunction interface and not worry about the details. If you want to dig into the Mathematica code behind CUDAFunction you can use the "Spelunking" package here. However, it will be a lot of guesswork and experimentation to get anything working. It might end up being easier to write something in C and call that, or (if possible) change your algorithm to do a few large calculations rather than many small ones. – Simon Woods Jan 26 '14 at 19:58