Performance Difference Between Custom CUDNN Wrapper and PyTorch
I’ve implemented a wrapper function for calling the cuDNN convolution API, shown in the code snippet below. After running it and profiling with nsys, I noticed a significant performance gap compared to PyTorch’s implementation.
When profiling my implementation, I found that the core convolution computation takes more than 20 milliseconds:
cudnn_infer_ampere_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
Begins: 3.61082s
Ends: 3.6346s (+23.778 ms)
grid: <<<1024, 4, 2>>>
block: <<<256, 1, 1>>>
Launch Type: Regular
Static Shared Memory: 49,152 bytes
Dynamic Shared Memory: 0 bytes
Registers Per Thread: 126
Local Memory Per Thread: 0 bytes
Local Memory Total: 127,401,984 bytes
Shared Memory executed: 102,400 bytes
Shared Memory Bank Size: 4 B
Theoretical occupancy: 25 %
Launched from thread: 3428407
Latency: 109.155 ms
Correlation ID: 2594
Stream: Stream 22
However, when I wrote the same convolution operation in PyTorch and profiled it, the kernel takes only about 100 microseconds:
cudnn_ampere_scudnn_128x32_relu_small_nn_v1
Begins: 13.3173s
Ends: 13.3174s (+93.952 μs)
grid: <<<3136, 2, 1>>>
block: <<<64, 1, 1>>>
Launch Type: Regular
Static Shared Memory: 5,376 bytes
Dynamic Shared Memory: 0 bytes
Registers Per Thread: 128
Local Memory Per Thread: 0 bytes
Local Memory Total: 205,258,752 bytes
Shared Memory executed: 102,400 bytes
Shared Memory Bank Size: 4 B
Theoretical occupancy: 25 %
Launched from thread: 3417000
Latency: 124.023 ms
Correlation ID: 2300
Stream: Default stream 7
This performance difference is more than two orders of magnitude!
I suspect that PyTorch’s backend applies some compilation optimizations. However, judging from the kernel name in the profile, PyTorch still ultimately calls pre-built cuDNN kernels rather than generating them through any compilation technique.
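To see which algorithm my own wrapper ends up choosing, I can dump the candidate list returned by cudnnGetConvolutionForwardAlgorithm_v7 (see the call in the snippet below) and compare the top entry against the kernel name nsys reports. A minimal sketch of such a logging helper follows; it assumes only cudnn.h, and logAlgoCandidates is an illustrative name of mine, not a cuDNN API:
#include <cstdio>
#include <cudnn.h>
// Illustrative helper: print the candidates that
// cudnnGetConvolutionForwardAlgorithm_v7 filled into perfResults, so the
// selected algo can be correlated with the kernel name shown by nsys.
static void logAlgoCandidates(const cudnnConvolutionFwdAlgoPerf_t *perf,
                              int returnedAlgoCount) {
  for (int i = 0; i < returnedAlgoCount; ++i) {
    std::fprintf(stderr,
                 "candidate %d: algo=%d status=%s workspace=%zu bytes\n",
                 i, static_cast<int>(perf[i].algo),
                 cudnnGetErrorString(perf[i].status), perf[i].memory);
  }
}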
Question:
Is it possible for my wrapper-function approach to achieve execution efficiency similar to PyTorch’s? If so, what additional work do I need to do?
Here’s my wrapper function implementation:
extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuCudnnConv2dForward(
int n, int c, int h, int w_in, // Input dimensions
int k, int r, int s, // Kernel dimensions
int pad_h, int pad_w, // Padding
int stride_h, int stride_w, // Stride
int dilation_h, int dilation_w, // Dilation
void* x_data, void* w_data, void* bias_data, // Input, weight, and bias pointers
void* y_data, // Output pointer
CUstream stream // CUDA stream
) {
// Ensure global context
mgpuEnsureContext();
// Get cuDNN handle for this stream
cudnnHandle_t handle = mgpuCudnnGetHandle(stream);
// Create descriptors
cudnnTensorDescriptor_t xDesc, yDesc, biasDesc;
cudnnFilterDescriptor_t wDesc;
cudnnConvolutionDescriptor_t convDesc;
CUDNN_REPORT_IF_ERROR(cudnnCreateTensorDescriptor(&xDesc));
CUDNN_REPORT_IF_ERROR(cudnnCreateFilterDescriptor(&wDesc));
CUDNN_REPORT_IF_ERROR(cudnnCreateTensorDescriptor(&yDesc));
CUDNN_REPORT_IF_ERROR(cudnnCreateTensorDescriptor(&biasDesc));
CUDNN_REPORT_IF_ERROR(cudnnCreateConvolutionDescriptor(&convDesc));
// Set input descriptor
CUDNN_REPORT_IF_ERROR(cudnnSetTensor4dDescriptor(
xDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, c, h, w_in));
// Set weight descriptor
CUDNN_REPORT_IF_ERROR(cudnnSetFilter4dDescriptor(
wDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, k, c, r, s));
// Set convolution descriptor
CUDNN_REPORT_IF_ERROR(cudnnSetConvolution2dDescriptor(
convDesc, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT));
// Enable Tensor Cores
CUDNN_REPORT_IF_ERROR(cudnnSetConvolutionMathType(convDesc, CUDNN_TENSOR_OP_MATH));
// Get output dimensions
int out_n, out_c, out_h, out_w;
CUDNN_REPORT_IF_ERROR(cudnnGetConvolution2dForwardOutputDim(
convDesc, xDesc, wDesc, &out_n, &out_c, &out_h, &out_w));
fprintf(stderr, "Output dimensions: n=%d, c=%d, h=%d, w=%d\n",
out_n, out_c, out_h, out_w);
CUDNN_REPORT_IF_ERROR(cudnnSetTensor4dDescriptor(
yDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, out_n, out_c, out_h, out_w));
// Set bias descriptor (1xCx1x1)
CUDNN_REPORT_IF_ERROR(cudnnSetTensor4dDescriptor(
biasDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, k, 1, 1));
int requestedAlgoCount = 10;
int returnedAlgoCount;
cudnnConvolutionFwdAlgoPerf_t perfResults[10];
CUDNN_REPORT_IF_ERROR(cudnnGetConvolutionForwardAlgorithm_v7(
handle, xDesc, wDesc, convDesc, yDesc,
requestedAlgoCount, &returnedAlgoCount, perfResults));
cudnnConvolutionFwdAlgo_t algo = perfResults[0].algo;
// Get workspace size
size_t workspaceSize = 0;
CUDNN_REPORT_IF_ERROR(cudnnGetConvolutionForwardWorkspaceSize(
handle, xDesc, wDesc, convDesc, yDesc, algo, &workspaceSize));
// Allocate workspace
void* workspace = nullptr;
if (workspaceSize > 0) {
CUdeviceptr wsPtr = 0;
CUDA_REPORT_IF_ERROR(cuMemAlloc(&wsPtr, workspaceSize));
workspace = reinterpret_cast<void*>(wsPtr);
}
// Execute convolution
const float alpha = 1.0f;
const float beta = 0.0f;
cudnnStatus_t status = cudnnConvolutionForward(
handle, &alpha, xDesc, x_data, wDesc, w_data, convDesc, algo,
workspace, workspaceSize, &beta, yDesc, y_data);
// Report errors (if any)
CUDNN_REPORT_IF_ERROR(status);
// Add bias (if provided)
if (bias_data != nullptr) {
const float alpha_bias = 1.0f;
const float beta_bias = 1.0f;
CUDNN_REPORT_IF_ERROR(cudnnAddTensor(
handle, &alpha_bias, biasDesc, bias_data, &beta_bias, yDesc, y_data));
}
// Free workspace
if (workspace != nullptr) {
CUDA_REPORT_IF_ERROR(cuMemFree(reinterpret_cast<CUdeviceptr>(workspace)));
}
// Clean up descriptors
CUDNN_REPORT_IF_ERROR(cudnnDestroyTensorDescriptor(xDesc));
CUDNN_REPORT_IF_ERROR(cudnnDestroyFilterDescriptor(wDesc));
CUDNN_REPORT_IF_ERROR(cudnnDestroyTensorDescriptor(yDesc));
CUDNN_REPORT_IF_ERROR(cudnnDestroyTensorDescriptor(biasDesc));
CUDNN_REPORT_IF_ERROR(cudnnDestroyConvolutionDescriptor(convDesc));
}
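For completeness, this is roughly how I exercise the wrapper in isolation while profiling. The shapes and the runConvOnce harness here are illustrative only; in the real pipeline the call is emitted from MLIR-generated code, and a CUDA context and stream are assumed to be set up already:
#include <cuda.h>
extern "C" void mgpuCudnnConv2dForward(
    int n, int c, int h, int w_in, int k, int r, int s,
    int pad_h, int pad_w, int stride_h, int stride_w,
    int dilation_h, int dilation_w,
    void *x_data, void *w_data, void *bias_data, void *y_data,
    CUstream stream);
// Illustrative test harness: one 3x3 convolution on a 1x64x56x56 input.
void runConvOnce(CUstream stream) {
  const int n = 1, c = 64, h = 56, w = 56;  // input NCHW (illustrative)
  const int k = 64, r = 3, s = 3;           // filter KCRS (illustrative)
  CUdeviceptr x = 0, wgt = 0, y = 0;
  cuMemAlloc(&x, sizeof(float) * n * c * h * w);
  cuMemAlloc(&wgt, sizeof(float) * k * c * r * s);
  cuMemAlloc(&y, sizeof(float) * n * k * h * w);  // pad=1, stride=1 keeps H/W
  mgpuCudnnConv2dForward(n, c, h, w, k, r, s,
                         /*pad_h=*/1, /*pad_w=*/1,
                         /*stride_h=*/1, /*stride_w=*/1,
                         /*dilation_h=*/1, /*dilation_w=*/1,
                         reinterpret_cast<void *>(x),
                         reinterpret_cast<void *>(wgt),
                         /*bias_data=*/nullptr,
                         reinterpret_cast<void *>(y), stream);
  cuStreamSynchronize(stream);  // wait for the convolution before freeing
  cuMemFree(x);
  cuMemFree(wgt);
  cuMemFree(y);
}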
Any suggestions or insights would be greatly appreciated!