How to better implement operation-level parallelism?

I did a simple test of the approach you mentioned. I wrote code that performs a matrix multiplication twice and then adds the results of the two executions. The specific IR is shown below:

  func.func @test_parallel(%arg0: memref<4x5xf32>, %arg1: memref<5x3xf32>) -> memref<4x3xf32> attributes {llvm.emit_c_interface} {
    %cst = arith.constant 0.000000e+00 : f32
    %alloc = memref.alloc() {alignment = 16 : i64} : memref<4x3xf32>
    %alloc_0 = memref.alloc() {alignment = 16 : i64} : memref<4x3xf32>
    affine.parallel (%arg2) = (0) to (4) {
      affine.parallel (%arg3) = (0) to (3) {
        %0 = affine.for %arg4 = 0 to 5 iter_args(%arg5 = %cst) -> (f32) {
          %1 = affine.load %arg0[%arg2, %arg4] : memref<4x5xf32>
          %2 = affine.load %arg1[%arg4, %arg3] : memref<5x3xf32>
          %3 = arith.mulf %1, %2 : f32
          %4 = arith.addf %arg5, %3 : f32
          affine.yield %4 : f32
        }
        affine.store %0, %alloc[%arg2, %arg3] : memref<4x3xf32>
      }
    }
    %alloc_1 = memref.alloc() {alignment = 16 : i64} : memref<4x3xf32>
    affine.parallel (%arg2) = (0) to (4) {
      affine.parallel (%arg3) = (0) to (3) {
        %0 = affine.for %arg4 = 0 to 5 iter_args(%arg5 = %cst) -> (f32) {
          %1 = affine.load %arg0[%arg2, %arg4] : memref<4x5xf32>
          %2 = affine.load %arg1[%arg4, %arg3] : memref<5x3xf32>
          %3 = arith.mulf %1, %2 : f32
          %4 = arith.addf %arg5, %3 : f32
          affine.yield %4 : f32
        }
        affine.store %0, %alloc_1[%arg2, %arg3] : memref<4x3xf32>
      }
    }
    affine.parallel (%arg2) = (0) to (4) {
      affine.parallel (%arg3) = (0) to (3) {
        %0 = affine.load %alloc[%arg2, %arg3] : memref<4x3xf32>
        %1 = affine.load %alloc_1[%arg2, %arg3] : memref<4x3xf32>
        %2 = arith.addf %0, %1 : f32
        affine.store %2, %alloc_0[%arg2, %arg3] : memref<4x3xf32>
      }
    }
    return %alloc_0 : memref<4x3xf32>
  }

Then I converted it to the GPU dialect representation:

  func.func @test_parallel(%arg0: memref<4x5xf32>, %arg1: memref<5x3xf32>) -> memref<4x3xf32> attributes {llvm.emit_c_interface} {
    %c3 = arith.constant 3 : index
    %c4 = arith.constant 4 : index
    %cst = arith.constant 0.000000e+00 : f32
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %c5 = arith.constant 5 : index
    %memref = gpu.alloc  host_shared () : memref<4x3xf32>
    %memref_0 = gpu.alloc  host_shared () : memref<4x3xf32>
    %memref_1 = gpu.alloc  host_shared () : memref<4x3xf32>
    gpu.launch_func  @test::@test blocks in (%c4, %c3, %c1) threads in (%c1, %c1, %c1)  args(%c1 : index, %c0 : index, %arg0 : memref<4x5xf32>, %arg1 : memref<5x3xf32>, %c5 : index, %cst : f32, %memref_1 : memref<4x3xf32>)
    gpu.launch_func  @parallel::@parallel blocks in (%c4, %c3, %c1) threads in (%c1, %c1, %c1)  args(%c1 : index, %c0 : index, %arg0 : memref<4x5xf32>, %arg1 : memref<5x3xf32>, %c5 : index, %cst : f32, %memref_0 : memref<4x3xf32>)
    gpu.launch_func  @test_parallel_kernel_1::@test_parallel_kernel blocks in (%c4, %c3, %c1) threads in (%c1, %c1, %c1)  args(%c1 : index, %c0 : index, %memref_1 : memref<4x3xf32>, %memref_0 : memref<4x3xf32>, %memref : memref<4x3xf32>)
    return %memref : memref<4x3xf32>
  }

I tried to execute it using mlir-cpu-runner and profiled the run with nsys. When I inspected the actual kernel execution in the nsys UI, I found that the kernels were executing serially.

Do I need to use --gpu-async-region for additional processing?

But there’s a strange thing here. When I use --gpu-async-region to process the above IR, it generates the following form, in which the three gpu.launch_func operations still end up being executed sequentially.

  func.func @test_parallel(%arg0: memref<4x5xf32>, %arg1: memref<5x3xf32>) -> memref<4x3xf32> attributes {llvm.emit_c_interface} {
    %c3 = arith.constant 3 : index
    %c4 = arith.constant 4 : index
    %c5 = arith.constant 5 : index
    %c1 = arith.constant 1 : index
    %c0 = arith.constant 0 : index
    %cst = arith.constant 0.000000e+00 : f32
    %alloc = memref.alloc() {alignment = 16 : i64} : memref<4x3xf32>
    %alloc_0 = memref.alloc() {alignment = 16 : i64} : memref<4x3xf32>
    %0 = gpu.wait async
    %1 = gpu.launch_func async [%0] @test::@test blocks in (%c4, %c3, %c1) threads in (%c1, %c1, %c1)  args(%c1 : index, %c0 : index, %arg0 : memref<4x5xf32>, %arg1 : memref<5x3xf32>, %c5 : index, %cst : f32, %alloc : memref<4x3xf32>)
    %alloc_1 = memref.alloc() {alignment = 16 : i64} : memref<4x3xf32>
    %2 = gpu.wait async
    %3 = gpu.launch_func async [%2] @parallel::@parallel blocks in (%c4, %c3, %c1) threads in (%c1, %c1, %c1)  args(%c1 : index, %c0 : index, %arg0 : memref<4x5xf32>, %arg1 : memref<5x3xf32>, %c5 : index, %cst : f32, %alloc_1 : memref<4x3xf32>)
    gpu.wait [%1, %3]
    %5 = gpu.wait async
    %4 = gpu.launch_func async [%5] @test_parallel_kernel_1::@test_parallel_kernel blocks in (%c4, %c3, %c1) threads in (%c1, %c1, %c1)  args(%c1 : index, %c0 : index, %alloc : memref<4x3xf32>, %alloc_1 : memref<4x3xf32>, %alloc_0 : memref<4x3xf32>)
    gpu.wait [%4]
    return %alloc_0 : memref<4x3xf32>
  }