How to Implement Asynchronous Concurrent Execution Between gpu.launch Operations?

I started by creating a simple model in ONNX (two convolutions whose results are then added together) and lowered it to the affine dialect.
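Conceptually the model is just the following (a hand-written sketch in the onnx dialect; the value names and the omitted Conv attributes are illustrative, not taken from my actual file):

%a = "onnx.Conv"(%input, %w0, %b0) : (tensor<1024x1x28x28xf32>, tensor<64x1x3x3xf32>, tensor<64xf32>) -> tensor<1024x64x28x28xf32>
%b = "onnx.Conv"(%input, %w1, %b1) : (tensor<1024x1x28x28xf32>, tensor<64x1x3x3xf32>, tensor<64xf32>) -> tensor<1024x64x28x28xf32>
%sum = "onnx.Add"(%a, %b) : (tensor<1024x64x28x28xf32>, tensor<1024x64x28x28xf32>) -> tensor<1024x64x28x28xf32>

The lowered affine IR looks like this: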

#map = affine_map<(d0, d1) -> (d0 * 64 + d1)>
#map1 = affine_map<(d0) -> (-d0 + 1)>
#map2 = affine_map<(d0, d1) -> (-d1 + 1)>
#map3 = affine_map<(d0) -> (-d0 + 1, 0)>
#map4 = affine_map<(d0) -> (-d0 + 29, 3)>
#map5 = affine_map<(d0)[s0] -> (d0 + s0)>
#map6 = affine_map<(d0, d1)[s0, s1] -> (d1 - s1)>
#map7 = affine_map<(d0, d1, d2)[s0, s1, s2] -> (d2 - s2)>
module {
  func.func @test_parallel(%arg0: memref<1024x1x28x28xf32>) -> memref<1024x64x28x28xf32> attributes {llvm.emit_c_interface} {
    %cst = arith.constant 0.000000e+00 : f32
    %0 = "krnl.global"() {name = "constant_0", shape = [64, 1, 3, 3], value = dense<1.000000e+00> : tensor<64x1x3x3xf32>} : () -> memref<64x1x3x3xf32>
    %1 = "krnl.global"() {name = "constant_1", shape = [64], value = dense<1.000000e-01> : tensor<64xf32>} : () -> memref<64xf32>
    %2 = "krnl.global"() {name = "constant_2", shape = [64, 1, 3, 3], value = dense<5.000000e-01> : tensor<64x1x3x3xf32>} : () -> memref<64x1x3x3xf32>
    %3 = "krnl.global"() {name = "constant_3", shape = [64], value = dense<2.000000e-01> : tensor<64xf32>} : () -> memref<64xf32>
    %alloc = memref.alloc() {alignment = 16 : i64} : memref<1024x64x28x28xf32>
    affine.for %arg1 = 0 to 1024 {
      affine.for %arg2 = 0 to 1 {
        affine.for %arg3 = 0 to 64 {
          %4 = affine.apply #map(%arg2, %arg3)
          affine.for %arg4 = 0 to 28 {
            affine.for %arg5 = 0 to 28 {
              %5 = affine.apply #map1(%arg4)
              %6 = affine.apply #map2(%arg4, %arg5)
              %7 = affine.for %arg6 = 0 to 1 iter_args(%arg7 = %cst) -> (f32) {
                %10 = affine.for %arg8 = max #map3(%arg4) to min #map4(%arg4) iter_args(%arg9 = %arg7) -> (f32) {
                  %11 = affine.for %arg10 = max #map3(%arg5) to min #map4(%arg5) iter_args(%arg11 = %arg9) -> (f32) {
                    %12 = affine.apply #map5(%arg6)[%arg2]
                    %13 = affine.apply #map6(%arg6, %arg8)[%arg2, %5]
                    %14 = affine.apply #map7(%arg6, %arg8, %arg10)[%arg2, %5, %6]
                    %15 = affine.load %arg0[%arg1, %12, %13, %14] : memref<1024x1x28x28xf32>
                    %16 = affine.load %0[%4, %arg6, %arg8, %arg10] : memref<64x1x3x3xf32>
                    %17 = arith.mulf %15, %16 : f32
                    %18 = arith.addf %arg11, %17 : f32
                    affine.yield %18 : f32
                  }
                  affine.yield %11 : f32
                }
                affine.yield %10 : f32
              }
              %8 = affine.load %1[%4] : memref<64xf32>
              %9 = arith.addf %7, %8 : f32
              affine.store %9, %alloc[%arg1, %4, %arg4, %arg5] : memref<1024x64x28x28xf32>
            }
          }
        }
      }
    }
    %alloc_0 = memref.alloc() {alignment = 16 : i64} : memref<1024x64x28x28xf32>
    affine.for %arg1 = 0 to 1024 {
      affine.for %arg2 = 0 to 1 {
        affine.for %arg3 = 0 to 64 {
          %4 = affine.apply #map(%arg2, %arg3)
          affine.for %arg4 = 0 to 28 {
            affine.for %arg5 = 0 to 28 {
              %5 = affine.apply #map1(%arg4)
              %6 = affine.apply #map2(%arg4, %arg5)
              %7 = affine.for %arg6 = 0 to 1 iter_args(%arg7 = %cst) -> (f32) {
                %10 = affine.for %arg8 = max #map3(%arg4) to min #map4(%arg4) iter_args(%arg9 = %arg7) -> (f32) {
                  %11 = affine.for %arg10 = max #map3(%arg5) to min #map4(%arg5) iter_args(%arg11 = %arg9) -> (f32) {
                    %12 = affine.apply #map5(%arg6)[%arg2]
                    %13 = affine.apply #map6(%arg6, %arg8)[%arg2, %5]
                    %14 = affine.apply #map7(%arg6, %arg8, %arg10)[%arg2, %5, %6]
                    %15 = affine.load %arg0[%arg1, %12, %13, %14] : memref<1024x1x28x28xf32>
                    %16 = affine.load %2[%4, %arg6, %arg8, %arg10] : memref<64x1x3x3xf32>
                    %17 = arith.mulf %15, %16 : f32
                    %18 = arith.addf %arg11, %17 : f32
                    affine.yield %18 : f32
                  }
                  affine.yield %11 : f32
                }
                affine.yield %10 : f32
              }
              %8 = affine.load %3[%4] : memref<64xf32>
              %9 = arith.addf %7, %8 : f32
              affine.store %9, %alloc_0[%arg1, %4, %arg4, %arg5] : memref<1024x64x28x28xf32>
            }
          }
        }
      }
    }
    %alloc_1 = memref.alloc() {alignment = 16 : i64} : memref<1024x64x28x28xf32>
    affine.for %arg1 = 0 to 1024 {
      affine.for %arg2 = 0 to 64 {
        affine.for %arg3 = 0 to 28 {
          affine.for %arg4 = 0 to 28 {
            %4 = affine.load %alloc[%arg1, %arg2, %arg3, %arg4] : memref<1024x64x28x28xf32>
            %5 = affine.load %alloc_0[%arg1, %arg2, %arg3, %arg4] : memref<1024x64x28x28xf32>
            %6 = arith.addf %4, %5 : f32
            affine.store %6, %alloc_1[%arg1, %arg2, %arg3, %arg4] : memref<1024x64x28x28xf32>
          }
        }
      }
    }
    return %alloc_1 : memref<1024x64x28x28xf32>
  }
}

Then I lowered it to the GPU dialect with the following mlir-opt pipeline:

mlir-opt --affine-parallelize --lower-affine --canonicalize --scf-parallel-loop-fusion --gpu-map-parallel-loops --convert-parallel-loops-to-gpu --gpu-kernel-outlining --allow-unregistered-dialect --canonicalize

This produced the following GPU dialect representation (the gpu.module sections are omitted for brevity):

  func.func @test_parallel(%arg0: memref<1024x1x28x28xf32>) -> memref<1024x64x28x28xf32> attributes {llvm.emit_c_interface} {
    %c64 = arith.constant 64 : index
    %c1024 = arith.constant 1024 : index
    %c3 = arith.constant 3 : index
    %c29 = arith.constant 29 : index
    %c28 = arith.constant 28 : index
    %c1 = arith.constant 1 : index
    %c0 = arith.constant 0 : index
    %cst = arith.constant 0.000000e+00 : f32
    %0 = "krnl.global"() {name = "constant_0", shape = [64, 1, 3, 3], value = dense<1.000000e+00> : tensor<64x1x3x3xf32>} : () -> memref<64x1x3x3xf32>
    %1 = "krnl.global"() {name = "constant_1", shape = [64], value = dense<1.000000e-01> : tensor<64xf32>} : () -> memref<64xf32>
    %2 = "krnl.global"() {name = "constant_2", shape = [64, 1, 3, 3], value = dense<5.000000e-01> : tensor<64x1x3x3xf32>} : () -> memref<64x1x3x3xf32>
    %3 = "krnl.global"() {name = "constant_3", shape = [64], value = dense<2.000000e-01> : tensor<64xf32>} : () -> memref<64xf32>
    %alloc = memref.alloc() {alignment = 16 : i64} : memref<1024x64x28x28xf32>
    gpu.launch_func  @test_parallel_kernel::@test_parallel_kernel blocks in (%c1024, %c64, %c28) threads in (%c1, %c1, %c1)  args(%c1 : index, %c0 : index, %c29 : index, %c3 : index, %arg0 : memref<1024x1x28x28xf32>, %0 : memref<64x1x3x3xf32>, %cst : f32, %1 : memref<64xf32>, %alloc : memref<1024x64x28x28xf32>, %c28 : index)
    %alloc_0 = memref.alloc() {alignment = 16 : i64} : memref<1024x64x28x28xf32>
    gpu.launch_func  @test_parallel_kernel_0::@test_parallel_kernel blocks in (%c1024, %c64, %c28) threads in (%c1, %c1, %c1)  args(%c1 : index, %c0 : index, %c29 : index, %c3 : index, %arg0 : memref<1024x1x28x28xf32>, %2 : memref<64x1x3x3xf32>, %cst : f32, %3 : memref<64xf32>, %alloc_0 : memref<1024x64x28x28xf32>, %c28 : index)
    %alloc_1 = memref.alloc() {alignment = 16 : i64} : memref<1024x64x28x28xf32>
    gpu.launch_func  @test_parallel_kernel_1::@test_parallel_kernel blocks in (%c1024, %c64, %c28) threads in (%c1, %c1, %c1)  args(%c1 : index, %c0 : index, %alloc : memref<1024x64x28x28xf32>, %alloc_0 : memref<1024x64x28x28xf32>, %alloc_1 : memref<1024x64x28x28xf32>, %c28 : index)
    return %alloc_1 : memref<1024x64x28x28xf32>
  }

I then ran --gpu-async-region to convert the GPU operations into asynchronous form:

  func.func @test_parallel(%arg0: memref<1024x1x28x28xf32>) -> memref<1024x64x28x28xf32> attributes {llvm.emit_c_interface} {
    %c64 = arith.constant 64 : index
    %c1024 = arith.constant 1024 : index
    %c3 = arith.constant 3 : index
    %c29 = arith.constant 29 : index
    %c28 = arith.constant 28 : index
    %c1 = arith.constant 1 : index
    %c0 = arith.constant 0 : index
    %cst = arith.constant 0.000000e+00 : f32
    %0 = "krnl.global"() {name = "constant_0", shape = [64, 1, 3, 3], value = dense<1.000000e+00> : tensor<64x1x3x3xf32>} : () -> memref<64x1x3x3xf32>
    %1 = "krnl.global"() {name = "constant_1", shape = [64], value = dense<1.000000e-01> : tensor<64xf32>} : () -> memref<64xf32>
    %2 = "krnl.global"() {name = "constant_2", shape = [64, 1, 3, 3], value = dense<5.000000e-01> : tensor<64x1x3x3xf32>} : () -> memref<64x1x3x3xf32>
    %3 = "krnl.global"() {name = "constant_3", shape = [64], value = dense<2.000000e-01> : tensor<64xf32>} : () -> memref<64xf32>
    %alloc = memref.alloc() {alignment = 16 : i64} : memref<1024x64x28x28xf32>
    %4 = gpu.wait async
    %5 = gpu.launch_func async [%4] @test_parallel_kernel::@test_parallel_kernel blocks in (%c1024, %c64, %c28) threads in (%c1, %c1, %c1)  args(%c1 : index, %c0 : index, %c29 : index, %c3 : index, %arg0 : memref<1024x1x28x28xf32>, %0 : memref<64x1x3x3xf32>, %cst : f32, %1 : memref<64xf32>, %alloc : memref<1024x64x28x28xf32>, %c28 : index)
    gpu.wait [%5]
    %alloc_0 = memref.alloc() {alignment = 16 : i64} : memref<1024x64x28x28xf32>
    %6 = gpu.wait async
    %7 = gpu.launch_func async [%6] @test_parallel_kernel_0::@test_parallel_kernel blocks in (%c1024, %c64, %c28) threads in (%c1, %c1, %c1)  args(%c1 : index, %c0 : index, %c29 : index, %c3 : index, %arg0 : memref<1024x1x28x28xf32>, %2 : memref<64x1x3x3xf32>, %cst : f32, %3 : memref<64xf32>, %alloc_0 : memref<1024x64x28x28xf32>, %c28 : index)
    gpu.wait [%7]
    %alloc_1 = memref.alloc() {alignment = 16 : i64} : memref<1024x64x28x28xf32>
    %8 = gpu.wait async
    %9 = gpu.launch_func async [%8] @test_parallel_kernel_1::@test_parallel_kernel blocks in (%c1024, %c64, %c28) threads in (%c1, %c1, %c1)  args(%c1 : index, %c0 : index, %alloc : memref<1024x64x28x28xf32>, %alloc_0 : memref<1024x64x28x28xf32>, %alloc_1 : memref<1024x64x28x28xf32>, %c28 : index)
    gpu.wait [%9]
    return %alloc_1 : memref<1024x64x28x28xf32>
  }
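Note that this form is still fully serial: --gpu-async-region inserts a blocking gpu.wait immediately after each async launch, so the host waits for every kernel to complete before launching the next one. Stripped down (arguments elided), the structure is:

%t0 = gpu.wait async
%t1 = gpu.launch_func async [%t0] @test_parallel_kernel::@test_parallel_kernel ...
gpu.wait [%t1]          // host blocks until the first conv finishes
%t2 = gpu.wait async    // fresh token with no dependency on %t1
%t3 = gpu.launch_func async [%t2] @test_parallel_kernel_0::@test_parallel_kernel ...
gpu.wait [%t3]          // so the second conv can only start after the first is done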

To let the first two gpu.launch_func operations execute concurrently, I manually modified the gpu.wait dependencies. The modified async version looks like this:

%4 = gpu.wait async
%5 = gpu.launch_func async [%4] @conv1::@conv1 blocks in (%c1024, %c64, %c28) threads in (%c1, %c1, %c1)  args(%c1 : index, %c0 : index, %c29 : index, %c3 : index, %arg0 : memref<1024x1x28x28xf32>, %memref_2 : memref<64x1x3x3xf32>, %cst : f32, %memref_3 : memref<64xf32>, %memref_1 : memref<1024x64x28x28xf32>, %c28 : index)
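// %6 is a fresh token that does not depend on %5, so conv2 can be launched without waiting for conv1.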
%6 = gpu.wait async
%7 = gpu.launch_func async [%6] @conv2::@conv2 blocks in (%c1024, %c64, %c28) threads in (%c1, %c1, %c1)  args(%c1 : index, %c0 : index, %c29 : index, %c3 : index, %arg1 : memref<1024x1x28x28xf32>, %memref_4 : memref<64x1x3x3xf32>, %cst : f32, %memref_5 : memref<64xf32>, %memref_0 : memref<1024x64x28x28xf32>, %c28 : index)
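// Join: gpu.wait async on both tokens yields a token that is ready only after conv1 and conv2 have both completed.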
%joined_token = gpu.wait async [%5, %7]
%9 = gpu.launch_func async [%joined_token] @add::@add blocks in (%c1024, %c64, %c28) threads in (%c1, %c1, %c1)  args(%c1 : index, %c0 : index, %memref_1 : memref<1024x64x28x28xf32>, %memref_0 : memref<1024x64x28x28xf32>, %memref : memref<1024x64x28x28xf32>, %c28 : index)
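// Block the host until the add kernel (and with it the whole graph) has finished.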
gpu.wait [%9]

(Of course, I also changed the memory allocation so everything executes properly on the GPU: I replaced memref.alloc with gpu.alloc host_shared () and copied the values of the krnl.global constants into the host-shared buffers.)
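For example, the buffer and constant setup ended up looking roughly like this (a sketch of my manual edit; the %memref_* names match the launch arguments above, but the exact copy op is illustrative):

%memref = gpu.alloc host_shared () : memref<1024x64x28x28xf32>    // final add result
%memref_0 = gpu.alloc host_shared () : memref<1024x64x28x28xf32>  // conv2 output
%memref_1 = gpu.alloc host_shared () : memref<1024x64x28x28xf32>  // conv1 output
%0 = "krnl.global"() {name = "constant_0", shape = [64, 1, 3, 3], value = dense<1.000000e+00> : tensor<64x1x3x3xf32>} : () -> memref<64x1x3x3xf32>
%memref_2 = gpu.alloc host_shared () : memref<64x1x3x3xf32>
memref.copy %0, %memref_2 : memref<64x1x3x3xf32> to memref<64x1x3x3xf32>

Because host_shared memory is visible to both host and device, a plain host-side copy is enough to stage the krnl.global constants.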

So, concretely: is hand-editing the gpu.wait dependencies like this the intended way to make independent gpu.launch_func operations run concurrently, or is there an existing pass that derives these dependencies automatically? I’m not sure if I described this clearly – thank you for your help!