How can operation-level parallelism be implemented more effectively?
I ran a simple test of the approach you mentioned: I wrote code that performs the same matrix multiplication twice and then adds the two results together. The specific IR is shown below:
// Computes the 4x5 * 5x3 matmul twice into two separate buffers
// (%alloc and %alloc_1), then sums them elementwise into %alloc_0.
// The two matmul loop nests touch disjoint output buffers and only
// read %arg0/%arg1, so they are independent of each other; only the
// final add nest depends on both results.
func.func @test_parallel(%arg0: memref<4x5xf32>, %arg1: memref<5x3xf32>) -> memref<4x3xf32> attributes {llvm.emit_c_interface} {
%cst = arith.constant 0.000000e+00 : f32
// %alloc holds the first matmul result; %alloc_0 is the final output.
%alloc = memref.alloc() {alignment = 16 : i64} : memref<4x3xf32>
%alloc_0 = memref.alloc() {alignment = 16 : i64} : memref<4x3xf32>
// First matmul: %alloc[i, j] = sum_k %arg0[i, k] * %arg1[k, j].
affine.parallel (%arg2) = (0) to (4) {
affine.parallel (%arg3) = (0) to (3) {
// Sequential reduction over k, accumulator carried via iter_args.
%0 = affine.for %arg4 = 0 to 5 iter_args(%arg5 = %cst) -> (f32) {
%1 = affine.load %arg0[%arg2, %arg4] : memref<4x5xf32>
%2 = affine.load %arg1[%arg4, %arg3] : memref<5x3xf32>
%3 = arith.mulf %1, %2 : f32
%4 = arith.addf %arg5, %3 : f32
affine.yield %4 : f32
}
affine.store %0, %alloc[%arg2, %arg3] : memref<4x3xf32>
}
}
// %alloc_1 holds the second matmul result.
%alloc_1 = memref.alloc() {alignment = 16 : i64} : memref<4x3xf32>
// Second matmul: identical computation, written to %alloc_1.
affine.parallel (%arg2) = (0) to (4) {
affine.parallel (%arg3) = (0) to (3) {
%0 = affine.for %arg4 = 0 to 5 iter_args(%arg5 = %cst) -> (f32) {
%1 = affine.load %arg0[%arg2, %arg4] : memref<4x5xf32>
%2 = affine.load %arg1[%arg4, %arg3] : memref<5x3xf32>
%3 = arith.mulf %1, %2 : f32
%4 = arith.addf %arg5, %3 : f32
affine.yield %4 : f32
}
affine.store %0, %alloc_1[%arg2, %arg3] : memref<4x3xf32>
}
}
// Elementwise add of the two matmul results:
// %alloc_0[i, j] = %alloc[i, j] + %alloc_1[i, j].
affine.parallel (%arg2) = (0) to (4) {
affine.parallel (%arg3) = (0) to (3) {
%0 = affine.load %alloc[%arg2, %arg3] : memref<4x3xf32>
%1 = affine.load %alloc_1[%arg2, %arg3] : memref<4x3xf32>
%2 = arith.addf %0, %1 : f32
affine.store %2, %alloc_0[%arg2, %arg3] : memref<4x3xf32>
}
}
return %alloc_0 : memref<4x3xf32>
}
I then converted it to the GPU dialect representation:
// GPU-dialect form: each affine nest became a separate kernel, each
// launched over a (4, 3, 1) grid with (1, 1, 1) threads per block.
// NOTE(review): none of the gpu.launch_func ops here carry async
// tokens, so no stream/dependency information is expressed at this
// level — presumably the reason the kernels are observed to run
// serially; confirm against the gpu dialect's launch semantics.
func.func @test_parallel(%arg0: memref<4x5xf32>, %arg1: memref<5x3xf32>) -> memref<4x3xf32> attributes {llvm.emit_c_interface} {
%c3 = arith.constant 3 : index
%c4 = arith.constant 4 : index
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c5 = arith.constant 5 : index
// host_shared buffers: %memref is the final result, %memref_0 and
// %memref_1 receive the two matmul kernels' outputs.
%memref = gpu.alloc host_shared () : memref<4x3xf32>
%memref_0 = gpu.alloc host_shared () : memref<4x3xf32>
%memref_1 = gpu.alloc host_shared () : memref<4x3xf32>
// First matmul kernel -> %memref_1.
gpu.launch_func @test::@test blocks in (%c4, %c3, %c1) threads in (%c1, %c1, %c1) args(%c1 : index, %c0 : index, %arg0 : memref<4x5xf32>, %arg1 : memref<5x3xf32>, %c5 : index, %cst : f32, %memref_1 : memref<4x3xf32>)
// Second matmul kernel -> %memref_0 (independent of the first).
gpu.launch_func @parallel::@parallel blocks in (%c4, %c3, %c1) threads in (%c1, %c1, %c1) args(%c1 : index, %c0 : index, %arg0 : memref<4x5xf32>, %arg1 : memref<5x3xf32>, %c5 : index, %cst : f32, %memref_0 : memref<4x3xf32>)
// Add kernel: reads both matmul results, writes %memref.
gpu.launch_func @test_parallel_kernel_1::@test_parallel_kernel blocks in (%c4, %c3, %c1) threads in (%c1, %c1, %c1) args(%c1 : index, %c0 : index, %memref_1 : memref<4x3xf32>, %memref_0 : memref<4x3xf32>, %memref : memref<4x3xf32>)
return %memref : memref<4x3xf32>
}
I executed it with mlir-cpu-runner and profiled the run with nsys. When I inspected the actual kernel execution in the nsys UI, I found that the kernels were running serially.
Do I need to use --gpu-async-region for additional processing?
There is something strange here, though. When I process the above IR with --gpu-async-region, it generates the following form, in which the three gpu.launch_func operations still end up executing sequentially.
// Output of --gpu-async-region. The first two launches (%1, %3) are
// chained to independent fresh tokens (%0, %2), so at the IR level
// they have no ordering dependency on each other. The host-side
// gpu.wait [%1, %3] then joins both before the third kernel is
// launched on yet another fresh token (%5), and the final
// gpu.wait [%4] blocks until it completes.
// NOTE(review): the join between launch 2 and launch 3 is a blocking
// host wait rather than a token passed into the third launch's async
// dependency list — verify whether this host synchronization is what
// forces the serialized execution seen in nsys.
func.func @test_parallel(%arg0: memref<4x5xf32>, %arg1: memref<5x3xf32>) -> memref<4x3xf32> attributes {llvm.emit_c_interface} {
%c3 = arith.constant 3 : index
%c4 = arith.constant 4 : index
%c5 = arith.constant 5 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%alloc = memref.alloc() {alignment = 16 : i64} : memref<4x3xf32>
%alloc_0 = memref.alloc() {alignment = 16 : i64} : memref<4x3xf32>
// First matmul kernel, dependent only on fresh token %0.
%0 = gpu.wait async
%1 = gpu.launch_func async [%0] @test::@test blocks in (%c4, %c3, %c1) threads in (%c1, %c1, %c1) args(%c1 : index, %c0 : index, %arg0 : memref<4x5xf32>, %arg1 : memref<5x3xf32>, %c5 : index, %cst : f32, %alloc : memref<4x3xf32>)
%alloc_1 = memref.alloc() {alignment = 16 : i64} : memref<4x3xf32>
// Second matmul kernel, dependent only on fresh token %2.
%2 = gpu.wait async
%3 = gpu.launch_func async [%2] @parallel::@parallel blocks in (%c4, %c3, %c1) threads in (%c1, %c1, %c1) args(%c1 : index, %c0 : index, %arg0 : memref<4x5xf32>, %arg1 : memref<5x3xf32>, %c5 : index, %cst : f32, %alloc_1 : memref<4x3xf32>)
// Host-side blocking join on both matmul kernels.
gpu.wait [%1, %3]
// Add kernel is started from a new token, not chained to %1/%3.
%5 = gpu.wait async
%4 = gpu.launch_func async [%5] @test_parallel_kernel_1::@test_parallel_kernel blocks in (%c4, %c3, %c1) threads in (%c1, %c1, %c1) args(%c1 : index, %c0 : index, %alloc : memref<4x3xf32>, %alloc_1 : memref<4x3xf32>, %alloc_0 : memref<4x3xf32>)
// Block until the add kernel finishes before returning.
gpu.wait [%4]
return %alloc_0 : memref<4x3xf32>
}