Performance Difference Between Custom cuDNN Wrapper and PyTorch

I’ve implemented a wrapper function for calling the cuDNN convolution API, shown in the code snippet below. After running it and profiling with nsys, I noticed a significant performance gap compared to PyTorch’s implementation.
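
For reference, the numbers below come from Nsight Systems; I capture the traces with something along these lines (the binary name is just a placeholder):

nsys profile --trace=cuda,cudnn --stats=true ./my_conv_app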

When profiling my implementation, I found that the core convolution kernel takes almost 24 milliseconds:

cudnn_infer_ampere_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
Begins: 3.61082s
Ends: 3.6346s (+23.778 ms)
grid: <<<1024, 4, 2>>>
block: <<<256, 1, 1>>>
Launch Type: Regular
Static Shared Memory: 49,152 bytes
Dynamic Shared Memory: 0 bytes
Registers Per Thread: 126
Local Memory Per Thread: 0 bytes
Local Memory Total: 127,401,984 bytes
Shared Memory executed: 102,400 bytes
Shared Memory Bank Size: 4 B
Theoretical occupancy: 25 %
Launched from thread: 3428407
Latency: 109.155 ms
Correlation ID: 2594
Stream: Stream 22

However, when I ran the same convolution operation in PyTorch and profiled it, the kernel took only about 94 microseconds:

cudnn_ampere_scudnn_128x32_relu_small_nn_v1
Begins: 13.3173s
Ends: 13.3174s (+93.952 μs)
grid: <<<3136, 2, 1>>>
block: <<<64, 1, 1>>>
Launch Type: Regular
Static Shared Memory: 5,376 bytes
Dynamic Shared Memory: 0 bytes
Registers Per Thread: 128
Local Memory Per Thread: 0 bytes
Local Memory Total: 205,258,752 bytes
Shared Memory executed: 102,400 bytes
Shared Memory Bank Size: 4 B
Theoretical occupancy: 25 %
Launched from thread: 3417000
Latency: 124.023 ms
Correlation ID: 2300
Stream: Default stream 7

This is a performance difference of more than two orders of magnitude (23.778 ms vs. 93.952 μs, roughly 250x) for what should be the same convolution!
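
For context, the PyTorch side of the comparison is just a plain conv2d call. Expressed in C++ (libtorch) terms it is roughly the sketch below; the shapes are hypothetical placeholders, not the actual sizes I used:

#include <torch/torch.h>

int main() {
  // Hypothetical shapes -- placeholders, not the actual sizes.
  auto x = torch::randn({1, 64, 112, 112}, torch::kCUDA); // NCHW input
  auto w = torch::randn({64, 64, 3, 3}, torch::kCUDA);    // KCRS filter
  auto b = torch::randn({64}, torch::kCUDA);              // per-channel bias

  // Same convolution the wrapper performs: stride 1, padding 1, dilation 1.
  auto y = torch::conv2d(x, w, b, /*stride=*/1, /*padding=*/1);

  // Make sure the kernel has actually finished before profiling ends.
  torch::cuda::synchronize();
  return 0;
}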

I suspect that PyTorch’s backend applies some compilation optimizations. However, looking at the kernel names, PyTorch is still ultimately calling pre-written cuDNN kernels rather than generating them through compilation techniques; the two runs have simply ended up on different kernels (a small 128x32 kernel for PyTorch versus the 128x128 Winograd kernel my wrapper selected).
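
One concrete difference I’m aware of: my wrapper calls cudnnGetConvolutionForwardAlgorithm_v7, which only ranks algorithms heuristically, whereas PyTorch with torch.backends.cudnn.benchmark = True actually times the candidates. A minimal sketch of doing the same in my wrapper (reusing the descriptor names from the code below) would be:

// Measure candidate algorithms instead of trusting the heuristic ranking.
// cudnnFindConvolutionForwardAlgorithm runs and times each algorithm, so
// perfResults comes back sorted by measured execution time.
int requestedAlgoCount = CUDNN_CONVOLUTION_FWD_ALGO_COUNT;
int returnedAlgoCount = 0;
cudnnConvolutionFwdAlgoPerf_t perfResults[CUDNN_CONVOLUTION_FWD_ALGO_COUNT];
CUDNN_REPORT_IF_ERROR(cudnnFindConvolutionForwardAlgorithm(
    handle, xDesc, wDesc, convDesc, yDesc,
    requestedAlgoCount, &returnedAlgoCount, perfResults));
cudnnConvolutionFwdAlgo_t algo = perfResults[0].algo; // fastest measured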

Question:
Is it possible for my wrapper-function approach to achieve execution efficiency similar to PyTorch’s? If so, what additional work do I need to do?

Here’s my wrapper function implementation:

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuCudnnConv2dForward(
    int n, int c, int h, int w_in,              // Input dimensions
    int k, int r, int s,                         // Kernel dimensions
    int pad_h, int pad_w,                        // Padding
    int stride_h, int stride_w,                  // Stride
    int dilation_h, int dilation_w,              // Dilation
    void* x_data, void* w_data, void* bias_data, // Input, weight, and bias pointers
    void* y_data,                               // Output pointer
    CUstream stream                             // CUDA stream
) {
    // Ensure global context
    mgpuEnsureContext();

    // Get cuDNN handle for this stream
    cudnnHandle_t handle = mgpuCudnnGetHandle(stream);
    
    // Create descriptors
    cudnnTensorDescriptor_t xDesc, yDesc, biasDesc;
    cudnnFilterDescriptor_t wDesc;
    cudnnConvolutionDescriptor_t convDesc;
    
    CUDNN_REPORT_IF_ERROR(cudnnCreateTensorDescriptor(&xDesc));
    CUDNN_REPORT_IF_ERROR(cudnnCreateFilterDescriptor(&wDesc));
    CUDNN_REPORT_IF_ERROR(cudnnCreateTensorDescriptor(&yDesc));
    CUDNN_REPORT_IF_ERROR(cudnnCreateTensorDescriptor(&biasDesc));
    CUDNN_REPORT_IF_ERROR(cudnnCreateConvolutionDescriptor(&convDesc));
    
    // Set input descriptor
    CUDNN_REPORT_IF_ERROR(cudnnSetTensor4dDescriptor(
        xDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, c, h, w_in));
    
    // Set weight descriptor
    CUDNN_REPORT_IF_ERROR(cudnnSetFilter4dDescriptor(
        wDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, k, c, r, s));
    
    // Set convolution descriptor
    CUDNN_REPORT_IF_ERROR(cudnnSetConvolution2dDescriptor(
        convDesc, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
        CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT));
    
    // Allow cuDNN to use Tensor Core math where supported
    CUDNN_REPORT_IF_ERROR(cudnnSetConvolutionMathType(convDesc, CUDNN_TENSOR_OP_MATH));

    // Get output dimensions
    int out_n, out_c, out_h, out_w;
    CUDNN_REPORT_IF_ERROR(cudnnGetConvolution2dForwardOutputDim(
        convDesc, xDesc, wDesc, &out_n, &out_c, &out_h, &out_w));

    fprintf(stderr, "Output dimensions: n=%d, c=%d, h=%d, w=%d\n", 
          out_n, out_c, out_h, out_w);
    
    CUDNN_REPORT_IF_ERROR(cudnnSetTensor4dDescriptor(
        yDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, out_n, out_c, out_h, out_w));
    
    // Set bias descriptor (1 x K x 1 x 1, one value per output channel)
    CUDNN_REPORT_IF_ERROR(cudnnSetTensor4dDescriptor(
        biasDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, k, 1, 1));
    
    // Query cuDNN's heuristics for candidate algorithms (ranked by
    // expected performance; nothing is actually timed here)
    int requestedAlgoCount = 10;
    int returnedAlgoCount = 0;
    cudnnConvolutionFwdAlgoPerf_t perfResults[10];
    CUDNN_REPORT_IF_ERROR(cudnnGetConvolutionForwardAlgorithm_v7(
        handle, xDesc, wDesc, convDesc, yDesc,
        requestedAlgoCount, &returnedAlgoCount, perfResults));

    // Use the heuristic's top-ranked algorithm
    cudnnConvolutionFwdAlgo_t algo = perfResults[0].algo;

    // Get workspace size
    size_t workspaceSize = 0;
    CUDNN_REPORT_IF_ERROR(cudnnGetConvolutionForwardWorkspaceSize(
        handle, xDesc, wDesc, convDesc, yDesc, algo, &workspaceSize));
    
    // Allocate workspace
    void* workspace = nullptr;
    if (workspaceSize > 0) {
      CUdeviceptr wsPtr = 0;
      CUDA_REPORT_IF_ERROR(cuMemAlloc(&wsPtr, workspaceSize));
      workspace = reinterpret_cast<void*>(wsPtr);
    }

    // Execute convolution: y = alpha * conv(x, w) + beta * y
    const float alpha = 1.0f;
    const float beta = 0.0f;
    cudnnStatus_t status = cudnnConvolutionForward(
        handle, &alpha, xDesc, x_data, wDesc, w_data, convDesc, algo,
        workspace, workspaceSize, &beta, yDesc, y_data);

    // Report errors (if any)
    CUDNN_REPORT_IF_ERROR(status);

    // Add bias (if provided)
    if (bias_data != nullptr) {
      const float alpha_bias = 1.0f;
      const float beta_bias = 1.0f;
      CUDNN_REPORT_IF_ERROR(cudnnAddTensor(
          handle, &alpha_bias, biasDesc, bias_data, &beta_bias, yDesc, y_data));
    }
    
    // Free workspace
    if (workspace != nullptr) {
      CUDA_REPORT_IF_ERROR(cuMemFree(reinterpret_cast<CUdeviceptr>(workspace)));
    }
    
    // Clean up descriptors
    CUDNN_REPORT_IF_ERROR(cudnnDestroyTensorDescriptor(xDesc));
    CUDNN_REPORT_IF_ERROR(cudnnDestroyFilterDescriptor(wDesc));
    CUDNN_REPORT_IF_ERROR(cudnnDestroyTensorDescriptor(yDesc));
    CUDNN_REPORT_IF_ERROR(cudnnDestroyTensorDescriptor(biasDesc));
    CUDNN_REPORT_IF_ERROR(cudnnDestroyConvolutionDescriptor(convDesc));
}
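
For completeness, here is roughly how the wrapper gets invoked. This is a simplified, hypothetical call site with placeholder sizes (the real one is generated by our MLIR lowering):

#include <cuda.h>

// Hypothetical 1x64x112x112 input, 64x64x3x3 filter, stride 1, pad 1
// (placeholder sizes). Context/handle setup happens inside the wrapper.
CUdeviceptr x, w, b, y;
CUDA_REPORT_IF_ERROR(cuMemAlloc(&x, 1UL * 64 * 112 * 112 * sizeof(float)));
CUDA_REPORT_IF_ERROR(cuMemAlloc(&w, 64UL * 64 * 3 * 3 * sizeof(float)));
CUDA_REPORT_IF_ERROR(cuMemAlloc(&b, 64UL * sizeof(float)));
CUDA_REPORT_IF_ERROR(cuMemAlloc(&y, 1UL * 64 * 112 * 112 * sizeof(float)));

mgpuCudnnConv2dForward(
    /*n=*/1, /*c=*/64, /*h=*/112, /*w_in=*/112,
    /*k=*/64, /*r=*/3, /*s=*/3,
    /*pad_h=*/1, /*pad_w=*/1,
    /*stride_h=*/1, /*stride_w=*/1,
    /*dilation_h=*/1, /*dilation_w=*/1,
    reinterpret_cast<void*>(x), reinterpret_cast<void*>(w),
    reinterpret_cast<void*>(b), reinterpret_cast<void*>(y),
    /*stream=*/nullptr);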

Any suggestions or insights would be greatly appreciated!