How to lower the combination of async GPU ops in the gpu dialect
Sure, I have tested many examples. Here is a relatively simple one (since the full IR is quite large, I have only attached the IR lowered to the GPU dialect level). As shown below, it computes two matmuls and returns the elementwise sum of the two matmul results.
```mlir
func.func @test_parallel(%arg0: memref<256x1024xf32>, %arg1: memref<1024x256xf32>) -> memref<256x256xf32> attributes {llvm.emit_c_interface} {
  %c8 = arith.constant 8 : index
  %c32 = arith.constant 32 : index
  %cst = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c1024 = arith.constant 1024 : index
  %memref = gpu.alloc host_shared () : memref<256x256xf32>
  %memref_0 = gpu.alloc host_shared () : memref<256x256xf32>
  %memref_1 = gpu.alloc host_shared () : memref<256x256xf32>
  %0 = gpu.wait async
  %2 = gpu.wait async
  %1 = gpu.launch_func async [%0] @test_parallel_kernel::@test_parallel_kernel blocks in (%c8, %c8, %c1) threads in (%c32, %c32, %c1) args(%c32 : index, %c0 : index, %c32 : index, %c1 : index, %c0 : index, %arg0 : memref<256x1024xf32>, %arg1 : memref<1024x256xf32>, %c1024 : index, %cst : f32, %memref_1 : memref<256x256xf32>)
  %3 = gpu.launch_func async [%2] @test_parallel_kernel_0::@test_parallel_kernel blocks in (%c8, %c8, %c1) threads in (%c32, %c32, %c1) args(%c32 : index, %c0 : index, %c32 : index, %c1 : index, %c0 : index, %arg0 : memref<256x1024xf32>, %arg1 : memref<1024x256xf32>, %c1024 : index, %cst : f32, %memref_0 : memref<256x256xf32>)
  %4 = gpu.wait async [%1, %3]
  %5 = gpu.launch_func async [%4] @test_parallel_kernel_1::@test_parallel_kernel blocks in (%c8, %c8, %c1) threads in (%c32, %c32, %c1) args(%c32 : index, %c0 : index, %c32 : index, %c1 : index, %c0 : index, %memref_1 : memref<256x256xf32>, %memref_0 : memref<256x256xf32>, %memref : memref<256x256xf32>)
  gpu.wait [%5]
  return %memref : memref<256x256xf32>
}
gpu.module @test_parallel_kernel {
  gpu.func @test_parallel_kernel(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: memref<256x1024xf32>, %arg6: memref<1024x256xf32>, %arg7: index, %arg8: f32, %arg9: memref<256x256xf32>) kernel {
    %block_id_x = gpu.block_id x
    %block_id_y = gpu.block_id y
    %thread_id_x = gpu.thread_id x
    %thread_id_y = gpu.thread_id y
    %0 = affine.apply #map()[%arg0, %arg1, %block_id_x]
    %1 = affine.apply #map()[%arg2, %arg1, %block_id_y]
    %2 = affine.apply #map()[%arg3, %arg4, %thread_id_x]
    %3 = affine.apply #map()[%arg3, %arg4, %thread_id_y]
    %4 = arith.addi %2, %0 : index
    %5 = arith.addi %3, %1 : index
    %6 = scf.for %arg10 = %arg1 to %arg7 step %arg3 iter_args(%arg11 = %arg8) -> (f32) {
      %7 = memref.load %arg5[%4, %arg10] : memref<256x1024xf32>
      %8 = memref.load %arg6[%arg10, %5] : memref<1024x256xf32>
      %9 = arith.mulf %7, %8 : f32
      %10 = arith.addf %arg11, %9 : f32
      scf.yield %10 : f32
    }
    memref.store %6, %arg9[%4, %5] : memref<256x256xf32>
    gpu.return
  }
}
gpu.module @test_parallel_kernel_0 {
  gpu.func @test_parallel_kernel(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: memref<256x1024xf32>, %arg6: memref<1024x256xf32>, %arg7: index, %arg8: f32, %arg9: memref<256x256xf32>) kernel {
    %block_id_x = gpu.block_id x
    %block_id_y = gpu.block_id y
    %thread_id_x = gpu.thread_id x
    %thread_id_y = gpu.thread_id y
    %0 = affine.apply #map()[%arg0, %arg1, %block_id_x]
    %1 = affine.apply #map()[%arg2, %arg1, %block_id_y]
    %2 = affine.apply #map()[%arg3, %arg4, %thread_id_x]
    %3 = affine.apply #map()[%arg3, %arg4, %thread_id_y]
    %4 = arith.addi %2, %0 : index
    %5 = arith.addi %3, %1 : index
    %6 = scf.for %arg10 = %arg1 to %arg7 step %arg3 iter_args(%arg11 = %arg8) -> (f32) {
      %7 = memref.load %arg5[%4, %arg10] : memref<256x1024xf32>
      %8 = memref.load %arg6[%arg10, %5] : memref<1024x256xf32>
      %9 = arith.mulf %7, %8 : f32
      %10 = arith.addf %arg11, %9 : f32
      scf.yield %10 : f32
    }
    memref.store %6, %arg9[%4, %5] : memref<256x256xf32>
    gpu.return
  }
}
gpu.module @test_parallel_kernel_1 {
  gpu.func @test_parallel_kernel(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: memref<256x256xf32>, %arg6: memref<256x256xf32>, %arg7: memref<256x256xf32>) kernel {
    %block_id_x = gpu.block_id x
    %block_id_y = gpu.block_id y
    %thread_id_x = gpu.thread_id x
    %thread_id_y = gpu.thread_id y
    %0 = affine.apply #map()[%arg0, %arg1, %block_id_x]
    %1 = affine.apply #map()[%arg2, %arg1, %block_id_y]
    %2 = affine.apply #map()[%arg3, %arg4, %thread_id_x]
    %3 = affine.apply #map()[%arg3, %arg4, %thread_id_y]
    %4 = arith.addi %2, %0 : index
    %5 = arith.addi %3, %1 : index
    %6 = memref.load %arg5[%4, %5] : memref<256x256xf32>
    %7 = memref.load %arg6[%4, %5] : memref<256x256xf32>
    %8 = arith.addf %6, %7 : f32
    memref.store %8, %arg7[%4, %5] : memref<256x256xf32>
    gpu.return
  }
}
```
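For reference, the three kernels together compute `C = A·B + A·B`: both matmul kernels read the same operands `%arg0` and `%arg1`, and the third kernel adds the two results elementwise. A NumPy sketch of this reference semantics (the array names and test values here are illustrative, not taken from the original program):

```python
import numpy as np

# Mirror the shapes in the IR: %arg0 is 256x1024, %arg1 is 1024x256.
A = (np.arange(256 * 1024, dtype=np.float32) % 7).reshape(256, 1024)
B = (np.arange(1024 * 256, dtype=np.float32) % 5).reshape(1024, 256)

# @test_parallel_kernel and @test_parallel_kernel_0 each compute A @ B
# into a separate buffer; @test_parallel_kernel_1 sums them elementwise.
C0 = A @ B
C1 = A @ B
C = C0 + C1

assert C.shape == (256, 256)
assert np.allclose(C, 2.0 * (A @ B))
```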
Later, I use `-gpu-lower-to-nvvm-pipeline="host-bare-ptr-calling-convention=1 kernel-bare-ptr-calling-convention=1 cubin-chip=sm_80 cubin-format=fatbin"` to lower it to LLVM, and then execute it with `mlir-cpu-runner`. Since I run it through `mlir-cpu-runner`, I also added a `main` function that calls `@test_parallel` during actual execution.
Also, regarding your point about “not serializing kernel execution”: does this need to be controlled manually? My understanding is that nsys is only an observation tool and should not affect how the kernels execute.
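To make concrete what ordering the IR above actually requires: the two matmul launches depend on independent tokens (`%0` and `%2`), so a runtime is free to overlap them (e.g. on different CUDA streams), while `gpu.wait async [%1, %3]` forces only the add kernel to run after both. A purely illustrative host-side analogy using Python futures as stand-ins for `!gpu.async.token` values (this is not how the actual GPU runtime is implemented):

```python
from concurrent.futures import ThreadPoolExecutor, wait

log = []

def kernel(name, deps):
    wait(deps)          # block until all dependency "tokens" complete,
    log.append(name)    # like the [%...] list on gpu.launch_func async

with ThreadPoolExecutor(max_workers=5) as pool:
    t0 = pool.submit(lambda: None)               # %0 = gpu.wait async
    t2 = pool.submit(lambda: None)               # %2 = gpu.wait async
    t1 = pool.submit(kernel, "matmul_a", [t0])   # first launch, depends on %0
    t3 = pool.submit(kernel, "matmul_b", [t2])   # second launch, depends on %2
    t5 = pool.submit(kernel, "add", [t1, t3])    # third launch joins both
    t5.result()                                  # gpu.wait [%5]

# The only ordering guarantee: "add" runs after both matmuls;
# the two matmuls themselves may interleave in either order.
assert set(log[:2]) == {"matmul_a", "matmul_b"}
assert log[2] == "add"
```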