How to Implement Asynchronous Concurrent Execution Between gpu.launch Operations?
I started by creating a simple model with ONNX (two convolutions whose results are added together) and lowering it to the affine dialect:
#map = affine_map<(d0, d1) -> (d0 * 64 + d1)>
#map1 = affine_map<(d0) -> (-d0 + 1)>
#map2 = affine_map<(d0, d1) -> (-d1 + 1)>
#map3 = affine_map<(d0) -> (-d0 + 1, 0)>
#map4 = affine_map<(d0) -> (-d0 + 29, 3)>
#map5 = affine_map<(d0)[s0] -> (d0 + s0)>
#map6 = affine_map<(d0, d1)[s0, s1] -> (d1 - s1)>
#map7 = affine_map<(d0, d1, d2)[s0, s1, s2] -> (d2 - s2)>
module {
  func.func @test_parallel(%arg0: memref<1024x1x28x28xf32>) -> memref<1024x64x28x28xf32> attributes {llvm.emit_c_interface} {
    %cst = arith.constant 0.000000e+00 : f32
    %0 = "krnl.global"() {name = "constant_0", shape = [64, 1, 3, 3], value = dense<1.000000e+00> : tensor<64x1x3x3xf32>} : () -> memref<64x1x3x3xf32>
    %1 = "krnl.global"() {name = "constant_1", shape = [64], value = dense<1.000000e-01> : tensor<64xf32>} : () -> memref<64xf32>
    %2 = "krnl.global"() {name = "constant_2", shape = [64, 1, 3, 3], value = dense<5.000000e-01> : tensor<64x1x3x3xf32>} : () -> memref<64x1x3x3xf32>
    %3 = "krnl.global"() {name = "constant_3", shape = [64], value = dense<2.000000e-01> : tensor<64xf32>} : () -> memref<64xf32>
    %alloc = memref.alloc() {alignment = 16 : i64} : memref<1024x64x28x28xf32>
    affine.for %arg1 = 0 to 1024 {
      affine.for %arg2 = 0 to 1 {
        affine.for %arg3 = 0 to 64 {
          %4 = affine.apply #map(%arg2, %arg3)
          affine.for %arg4 = 0 to 28 {
            affine.for %arg5 = 0 to 28 {
              %5 = affine.apply #map1(%arg4)
              %6 = affine.apply #map2(%arg4, %arg5)
              %7 = affine.for %arg6 = 0 to 1 iter_args(%arg7 = %cst) -> (f32) {
                %10 = affine.for %arg8 = max #map3(%arg4) to min #map4(%arg4) iter_args(%arg9 = %arg7) -> (f32) {
                  %11 = affine.for %arg10 = max #map3(%arg5) to min #map4(%arg5) iter_args(%arg11 = %arg9) -> (f32) {
                    %12 = affine.apply #map5(%arg6)[%arg2]
                    %13 = affine.apply #map6(%arg6, %arg8)[%arg2, %5]
                    %14 = affine.apply #map7(%arg6, %arg8, %arg10)[%arg2, %5, %6]
                    %15 = affine.load %arg0[%arg1, %12, %13, %14] : memref<1024x1x28x28xf32>
                    %16 = affine.load %0[%4, %arg6, %arg8, %arg10] : memref<64x1x3x3xf32>
                    %17 = arith.mulf %15, %16 : f32
                    %18 = arith.addf %arg11, %17 : f32
                    affine.yield %18 : f32
                  }
                  affine.yield %11 : f32
                }
                affine.yield %10 : f32
              }
              %8 = affine.load %1[%4] : memref<64xf32>
              %9 = arith.addf %7, %8 : f32
              affine.store %9, %alloc[%arg1, %4, %arg4, %arg5] : memref<1024x64x28x28xf32>
            }
          }
        }
      }
    }
    %alloc_0 = memref.alloc() {alignment = 16 : i64} : memref<1024x64x28x28xf32>
    affine.for %arg1 = 0 to 1024 {
      affine.for %arg2 = 0 to 1 {
        affine.for %arg3 = 0 to 64 {
          %4 = affine.apply #map(%arg2, %arg3)
          affine.for %arg4 = 0 to 28 {
            affine.for %arg5 = 0 to 28 {
              %5 = affine.apply #map1(%arg4)
              %6 = affine.apply #map2(%arg4, %arg5)
              %7 = affine.for %arg6 = 0 to 1 iter_args(%arg7 = %cst) -> (f32) {
                %10 = affine.for %arg8 = max #map3(%arg4) to min #map4(%arg4) iter_args(%arg9 = %arg7) -> (f32) {
                  %11 = affine.for %arg10 = max #map3(%arg5) to min #map4(%arg5) iter_args(%arg11 = %arg9) -> (f32) {
                    %12 = affine.apply #map5(%arg6)[%arg2]
                    %13 = affine.apply #map6(%arg6, %arg8)[%arg2, %5]
                    %14 = affine.apply #map7(%arg6, %arg8, %arg10)[%arg2, %5, %6]
                    %15 = affine.load %arg0[%arg1, %12, %13, %14] : memref<1024x1x28x28xf32>
                    %16 = affine.load %2[%4, %arg6, %arg8, %arg10] : memref<64x1x3x3xf32>
                    %17 = arith.mulf %15, %16 : f32
                    %18 = arith.addf %arg11, %17 : f32
                    affine.yield %18 : f32
                  }
                  affine.yield %11 : f32
                }
                affine.yield %10 : f32
              }
              %8 = affine.load %3[%4] : memref<64xf32>
              %9 = arith.addf %7, %8 : f32
              affine.store %9, %alloc_0[%arg1, %4, %arg4, %arg5] : memref<1024x64x28x28xf32>
            }
          }
        }
      }
    }
    %alloc_1 = memref.alloc() {alignment = 16 : i64} : memref<1024x64x28x28xf32>
    affine.for %arg1 = 0 to 1024 {
      affine.for %arg2 = 0 to 64 {
        affine.for %arg3 = 0 to 28 {
          affine.for %arg4 = 0 to 28 {
            %4 = affine.load %alloc[%arg1, %arg2, %arg3, %arg4] : memref<1024x64x28x28xf32>
            %5 = affine.load %alloc_0[%arg1, %arg2, %arg3, %arg4] : memref<1024x64x28x28xf32>
            %6 = arith.addf %4, %5 : f32
            affine.store %6, %alloc_1[%arg1, %arg2, %arg3, %arg4] : memref<1024x64x28x28xf32>
          }
        }
      }
    }
    return %alloc_1 : memref<1024x64x28x28xf32>
  }
}
Then I transformed it into GPU dialect using the following command:
mlir-opt --affine-parallelize --lower-affine --canonicalize \
  --scf-parallel-loop-fusion --gpu-map-parallel-loops \
  --convert-parallel-loops-to-gpu -gpu-kernel-outlining \
  -allow-unregistered-dialect --canonicalize
This produced the following GPU dialect representation (the gpu.module sections are omitted for brevity):
func.func @test_parallel(%arg0: memref<1024x1x28x28xf32>) -> memref<1024x64x28x28xf32> attributes {llvm.emit_c_interface} {
  %c64 = arith.constant 64 : index
  %c1024 = arith.constant 1024 : index
  %c3 = arith.constant 3 : index
  %c29 = arith.constant 29 : index
  %c28 = arith.constant 28 : index
  %c1 = arith.constant 1 : index
  %c0 = arith.constant 0 : index
  %cst = arith.constant 0.000000e+00 : f32
  %0 = "krnl.global"() {name = "constant_0", shape = [64, 1, 3, 3], value = dense<1.000000e+00> : tensor<64x1x3x3xf32>} : () -> memref<64x1x3x3xf32>
  %1 = "krnl.global"() {name = "constant_1", shape = [64], value = dense<1.000000e-01> : tensor<64xf32>} : () -> memref<64xf32>
  %2 = "krnl.global"() {name = "constant_2", shape = [64, 1, 3, 3], value = dense<5.000000e-01> : tensor<64x1x3x3xf32>} : () -> memref<64x1x3x3xf32>
  %3 = "krnl.global"() {name = "constant_3", shape = [64], value = dense<2.000000e-01> : tensor<64xf32>} : () -> memref<64xf32>
  %alloc = memref.alloc() {alignment = 16 : i64} : memref<1024x64x28x28xf32>
  gpu.launch_func @test_parallel_kernel::@test_parallel_kernel blocks in (%c1024, %c64, %c28) threads in (%c1, %c1, %c1) args(%c1 : index, %c0 : index, %c29 : index, %c3 : index, %arg0 : memref<1024x1x28x28xf32>, %0 : memref<64x1x3x3xf32>, %cst : f32, %1 : memref<64xf32>, %alloc : memref<1024x64x28x28xf32>, %c28 : index)
  %alloc_0 = memref.alloc() {alignment = 16 : i64} : memref<1024x64x28x28xf32>
  gpu.launch_func @test_parallel_kernel_0::@test_parallel_kernel blocks in (%c1024, %c64, %c28) threads in (%c1, %c1, %c1) args(%c1 : index, %c0 : index, %c29 : index, %c3 : index, %arg0 : memref<1024x1x28x28xf32>, %2 : memref<64x1x3x3xf32>, %cst : f32, %3 : memref<64xf32>, %alloc_0 : memref<1024x64x28x28xf32>, %c28 : index)
  %alloc_1 = memref.alloc() {alignment = 16 : i64} : memref<1024x64x28x28xf32>
  gpu.launch_func @test_parallel_kernel_1::@test_parallel_kernel blocks in (%c1024, %c64, %c28) threads in (%c1, %c1, %c1) args(%c1 : index, %c0 : index, %alloc : memref<1024x64x28x28xf32>, %alloc_0 : memref<1024x64x28x28xf32>, %alloc_1 : memref<1024x64x28x28xf32>, %c28 : index)
  return %alloc_1 : memref<1024x64x28x28xf32>
}
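For reference, each outlined kernel in the omitted gpu.module has roughly the following shape (a hand-written sketch of the structure, not the actual outlined code; the argument list and body are abbreviated):
gpu.module @test_parallel_kernel {
  gpu.func @test_parallel_kernel(%arg0: index, %arg1: index, %arg2: memref<1024x1x28x28xf32>) kernel {
    // The grid dimensions of the launch carry the former parallel loop IVs.
    %bx = gpu.block_id x
    %by = gpu.block_id y
    %bz = gpu.block_id z
    // ... convolution body, indexed by the block ids ...
    gpu.return
  }
}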
I then used --gpu-async-region to convert the GPU operations into asynchronous form:
func.func @test_parallel(%arg0: memref<1024x1x28x28xf32>) -> memref<1024x64x28x28xf32> attributes {llvm.emit_c_interface} {
  %c64 = arith.constant 64 : index
  %c1024 = arith.constant 1024 : index
  %c3 = arith.constant 3 : index
  %c29 = arith.constant 29 : index
  %c28 = arith.constant 28 : index
  %c1 = arith.constant 1 : index
  %c0 = arith.constant 0 : index
  %cst = arith.constant 0.000000e+00 : f32
  %0 = "krnl.global"() {name = "constant_0", shape = [64, 1, 3, 3], value = dense<1.000000e+00> : tensor<64x1x3x3xf32>} : () -> memref<64x1x3x3xf32>
  %1 = "krnl.global"() {name = "constant_1", shape = [64], value = dense<1.000000e-01> : tensor<64xf32>} : () -> memref<64xf32>
  %2 = "krnl.global"() {name = "constant_2", shape = [64, 1, 3, 3], value = dense<5.000000e-01> : tensor<64x1x3x3xf32>} : () -> memref<64x1x3x3xf32>
  %3 = "krnl.global"() {name = "constant_3", shape = [64], value = dense<2.000000e-01> : tensor<64xf32>} : () -> memref<64xf32>
  %alloc = memref.alloc() {alignment = 16 : i64} : memref<1024x64x28x28xf32>
  %4 = gpu.wait async
  %5 = gpu.launch_func async [%4] @test_parallel_kernel::@test_parallel_kernel blocks in (%c1024, %c64, %c28) threads in (%c1, %c1, %c1) args(%c1 : index, %c0 : index, %c29 : index, %c3 : index, %arg0 : memref<1024x1x28x28xf32>, %0 : memref<64x1x3x3xf32>, %cst : f32, %1 : memref<64xf32>, %alloc : memref<1024x64x28x28xf32>, %c28 : index)
  gpu.wait [%5]
  %alloc_0 = memref.alloc() {alignment = 16 : i64} : memref<1024x64x28x28xf32>
  %6 = gpu.wait async
  %7 = gpu.launch_func async [%6] @test_parallel_kernel_0::@test_parallel_kernel blocks in (%c1024, %c64, %c28) threads in (%c1, %c1, %c1) args(%c1 : index, %c0 : index, %c29 : index, %c3 : index, %arg0 : memref<1024x1x28x28xf32>, %2 : memref<64x1x3x3xf32>, %cst : f32, %3 : memref<64xf32>, %alloc_0 : memref<1024x64x28x28xf32>, %c28 : index)
  gpu.wait [%7]
  %alloc_1 = memref.alloc() {alignment = 16 : i64} : memref<1024x64x28x28xf32>
  %8 = gpu.wait async
  %9 = gpu.launch_func async [%8] @test_parallel_kernel_1::@test_parallel_kernel blocks in (%c1024, %c64, %c28) threads in (%c1, %c1, %c1) args(%c1 : index, %c0 : index, %alloc : memref<1024x64x28x28xf32>, %alloc_0 : memref<1024x64x28x28xf32>, %alloc_1 : memref<1024x64x28x28xf32>, %c28 : index)
  gpu.wait [%9]
  return %alloc_1 : memref<1024x64x28x28xf32>
}
In this form, each launch is immediately followed by a blocking gpu.wait on its own token, so the three kernels still execute one after another. To enable concurrent execution of the first two gpu.launch operations, I manually modified the gpu.wait dependencies. The modified async version looks like this:
%4 = gpu.wait async
%5 = gpu.launch_func async [%4] @conv1::@conv1 blocks in (%c1024, %c64, %c28) threads in (%c1, %c1, %c1) args(%c1 : index, %c0 : index, %c29 : index, %c3 : index, %arg0 : memref<1024x1x28x28xf32>, %memref_2 : memref<64x1x3x3xf32>, %cst : f32, %memref_3 : memref<64xf32>, %memref_1 : memref<1024x64x28x28xf32>, %c28 : index)
%6 = gpu.wait async
%7 = gpu.launch_func async [%6] @conv2::@conv2 blocks in (%c1024, %c64, %c28) threads in (%c1, %c1, %c1) args(%c1 : index, %c0 : index, %c29 : index, %c3 : index, %arg1 : memref<1024x1x28x28xf32>, %memref_4 : memref<64x1x3x3xf32>, %cst : f32, %memref_5 : memref<64xf32>, %memref_0 : memref<1024x64x28x28xf32>, %c28 : index)
%joined_token = gpu.wait async [%5, %7]
%9 = gpu.launch_func async [%joined_token] @add::@add blocks in (%c1024, %c64, %c28) threads in (%c1, %c1, %c1) args(%c1 : index, %c0 : index, %memref_1 : memref<1024x64x28x28xf32>, %memref_0 : memref<1024x64x28x28xf32>, %memref : memref<1024x64x28x28xf32>, %c28 : index)
gpu.wait [%9]
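With this change, %5 and %7 each chain off an independent gpu.wait async token, so nothing orders the two convolution launches relative to each other; as I understand it, the runtime is then free to execute @conv1 and @conv2 concurrently, while gpu.wait async [%5, %7] joins their tokens into the single dependency that the @add launch waits on.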
(Of course, I also modified the memory allocation to ensure proper execution on the GPU. Specifically, I replaced memref.alloc with gpu.alloc host_shared() and copied the values of the krnl.global constants into the memory allocated by gpu.alloc host_shared().)
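For illustration, the allocation rewrite looked roughly like this (a sketch with placeholder buffer names; %0 stands for one of the krnl.global weight constants above):
// Output buffer visible to both host and device, replacing memref.alloc.
%memref_1 = gpu.alloc host_shared () : memref<1024x64x28x28xf32>
// Weights: allocate host-shared memory and copy the constant values into it.
%memref_2 = gpu.alloc host_shared () : memref<64x1x3x3xf32>
memref.copy %0, %memref_2 : memref<64x1x3x3xf32> to memref<64x1x3x3xf32>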
My question: is manually rewriting the gpu.wait dependencies like this the intended way to get concurrent execution between gpu.launch operations, or is there a pass that produces this form automatically? I’m not sure if I described this clearly – thank you for your help!