"An exception was thrown: Native API failed. Native API returns: 20 (UR_RESULT_ERROR_DEVICE_LOST)." (original) (raw)
Hi everyone,
I’m running a kernel generated with MLIR on an Intel GPU, but I’m encountering this error:
“An exception was thrown: Native API failed. Native API returns: 20 (UR_RESULT_ERROR_DEVICE_LOST).”
I placed a print statement at the very beginning of the kernel, but it never prints anything during execution, which makes me suspect that the error occurs even before the kernel starts executing.
Has anyone faced a similar issue, or does anyone have any insights on how to debug and resolve this? Any help would be greatly appreciated!
IR on the host side
; Host-side stub for Prediction_Function: zero-fills a 32-float buffer, stages
; two buffers through the mgpu* runtime calls, launches the kernel loaded from
; @Prediction_Function_kernel_bin_cst, copies the 32-float buffer back, and
; returns a descriptor struct built from %7..%11.
; The 32 parameters are groups of (ptr, ptr, i64...) — presumably expanded
; memref descriptors (allocated ptr, aligned ptr, offset, sizes, strides);
; TODO confirm against the MLIR source that produced this IR.
define { ptr, ptr, i64, [1 x i64], [1 x i64] } @Prediction_Function(ptr %0, ptr %1, i64 %2, i64 %3, i64 %4, i64 %5, i64 %6, ptr %7, ptr %8, i64 %9, i64 %10, i64 %11, ptr %12, ptr %13, i64 %14, i64 %15, i64 %16, ptr %17, ptr %18, i64 %19, i64 %20, i64 %21, ptr %22, ptr %23, i64 %24, i64 %25, i64 %26, ptr %27, ptr %28, i64 %29, i64 %30, i64 %31) {
; Assemble the return value field by field from parameters %7..%11.
%33 = insertvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } undef, ptr %7, 0
%34 = insertvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %33, ptr %8, 1
%35 = insertvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %34, i64 %9, 2
%36 = insertvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %35, i64 %10, 3, 0
%37 = insertvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %36, i64 %11, 4, 0
br label %38
; Loop header: %39 is the induction variable, counting 0..31.
38: ; preds = %41, %32
%39 = phi i64 [ %43, %41 ], [ 0, %32 ]
%40 = icmp slt i64 %39, 32
br i1 %40, label %41, label %44
; Loop body: store 0.0 into element %39 of the float buffer at %8.
41: ; preds = %38
%42 = getelementptr float, ptr %8, i64 %39
store float 0.000000e+00, ptr %42, align 4
%43 = add i64 %39, 1
br label %38
; Loop exit: stage data through the GPU runtime.
44: ; preds = %38
%45 = call ptr @mgpuStreamCreate()
; Byte sizes below use the GEP-from-null idiom: the byte offset of float
; element N from a null base, i.e. the size of N floats (N = 160, then 32).
%46 = call ptr @mgpuMemAlloc(i64 ptrtoint (ptr getelementptr (float, ptr null, i32 160) to i64), ptr %45, i8 0)
; Presumably mgpuMemcpy(dst, src, sizeBytes, stream), i.e. %1 -> %46 here;
; TODO confirm against the runtime wrapper's signature.
call void @mgpuMemcpy(ptr %46, ptr %1, i64 ptrtoint (ptr getelementptr (float, ptr null, i32 160) to i64), ptr %45)
%47 = call ptr @mgpuMemAlloc(i64 ptrtoint (ptr getelementptr (float, ptr null, i32 32) to i64), ptr %45, i8 0)
call void @mgpuMemcpy(ptr %47, ptr %8, i64 ptrtoint (ptr getelementptr (float, ptr null, i32 32) to i64), ptr %45)
call void @mgpuStreamSynchronize(ptr %45)
call void @mgpuStreamDestroy(ptr %45)
; Kernel argument packing: %48 is a stack struct of type %0 (defined outside
; this excerpt) holding the argument values; %49 is a stack array of 8
; pointers, where slot i receives the address of struct field i.
%48 = alloca %0, align 8
%49 = alloca ptr, i64 8, align 8
; Field 0: %46, the 160-float buffer returned by mgpuMemAlloc.
%50 = getelementptr inbounds %0, ptr %48, i32 0, i32 0
store ptr %46, ptr %50, align 8
%51 = getelementptr ptr, ptr %49, i32 0
store ptr %50, ptr %51, align 8
; Field 1: %18, an incoming function parameter, not an mgpuMemAlloc result.
; NOTE(review): fields 1, 3 and 4 pass %18, %23 and %13 through unchanged; if
; these are host-only pointers the kernel may be unable to dereference them —
; confirm whether they need mgpuMemAlloc + mgpuMemcpy like %1 and %8.
%52 = getelementptr inbounds %0, ptr %48, i32 0, i32 1
store ptr %18, ptr %52, align 8
%53 = getelementptr ptr, ptr %49, i32 1
store ptr %52, ptr %53, align 8
; Field 2: the i64 constant 0.
%54 = getelementptr inbounds %0, ptr %48, i32 0, i32 2
store i64 0, ptr %54, align 4
%55 = getelementptr ptr, ptr %49, i32 2
store ptr %54, ptr %55, align 8
; Field 3: %23, an incoming function parameter (see note on field 1).
%56 = getelementptr inbounds %0, ptr %48, i32 0, i32 3
store ptr %23, ptr %56, align 8
%57 = getelementptr ptr, ptr %49, i32 3
store ptr %56, ptr %57, align 8
; Field 4: %13, an incoming function parameter (see note on field 1).
%58 = getelementptr inbounds %0, ptr %48, i32 0, i32 4
store ptr %13, ptr %58, align 8
%59 = getelementptr ptr, ptr %49, i32 4
store ptr %58, ptr %59, align 8
; Field 5: the i1 constant false.
%60 = getelementptr inbounds %0, ptr %48, i32 0, i32 5
store i1 false, ptr %60, align 1
%61 = getelementptr ptr, ptr %49, i32 5
store ptr %60, ptr %61, align 8
; Field 6: the float constant 0.0.
%62 = getelementptr inbounds %0, ptr %48, i32 0, i32 6
store float 0.000000e+00, ptr %62, align 4
%63 = getelementptr ptr, ptr %49, i32 6
store ptr %62, ptr %63, align 8
; Field 7: %47, the 32-float buffer returned by mgpuMemAlloc.
%64 = getelementptr inbounds %0, ptr %48, i32 0, i32 7
store ptr %47, ptr %64, align 8
%65 = getelementptr ptr, ptr %49, i32 7
store ptr %64, ptr %65, align 8
; Load the embedded kernel binary (6532 is presumably its size in bytes) and
; look the kernel up by name.
%66 = call ptr @mgpuModuleLoad(ptr @Prediction_Function_kernel_bin_cst, i64 6532)
%67 = call ptr @mgpuModuleGetFunction(ptr %66, ptr @Prediction_Function_kernel_Prediction_Function_kernel_kernel_name)
%68 = call ptr @mgpuStreamCreate()
; Launch with the argument array %49. Presumably (8,1,1) is the grid size,
; (4,1,1) the block size, i32 0 the dynamic shared memory size and the final
; i64 8 the argument count — TODO confirm against the runtime wrapper.
call void @mgpuLaunchKernel(ptr %67, i64 8, i64 1, i64 1, i64 4, i64 1, i64 1, i32 0, ptr %68, ptr %49, ptr null, i64 8)
call void @mgpuStreamSynchronize(ptr %68)
call void @mgpuStreamDestroy(ptr %68)
call void @mgpuModuleUnload(ptr %66)
; Copy 32 floats between %47 and %8 on a fresh stream (presumably %47 -> %8,
; i.e. device result back to the caller's buffer).
%69 = call ptr @mgpuStreamCreate()
call void @mgpuMemcpy(ptr %8, ptr %47, i64 ptrtoint (ptr getelementptr (float, ptr null, i32 32) to i64), ptr %69)
call void @mgpuStreamSynchronize(ptr %69)
call void @mgpuStreamDestroy(ptr %69)
; NOTE(review): %46 and %47 are not passed to any deallocation call in this
; function — confirm whether they are released elsewhere.
ret { ptr, ptr, i64, [1 x i64], [1 x i64] } %37
}
Part of the driver traces:
UR —> Queue->executeCommandList(CommandList, false, true)
UR <— Queue->executeCommandList(CommandList, false, true)(UR_RESULT_SUCCESS)
(.hQueue = 0x6514dc904810, .hKernel = 0x6514ddba6470, .workDim = 3, .pGlobalWorkOffset = 0x6514db22ac70 (0), .pGlobalWorkSize = 0x6514db22ac40 (32), .pLocalWorkSize = 0x6514db22ac58 (4), .numEventsInWaitList = 0, .phEventWaitList = {}, .phEvent = 0x7ffcc4842908 (0x6514ddba0eb0)) → UR_RESULT_SUCCESS;
—> urEventRelease
UR —> urEventReleaseInternal(Event)
UR <— urEventReleaseInternal(Event)(UR_RESULT_SUCCESS)
(.hEvent = 0x6514ddba0eb0) → UR_RESULT_SUCCESS;
Hello mgpuStreamSynchronize!
—> urQueueFinish
UR —> Queue->executeAllOpenCommandLists()
UR <— Queue->executeAllOpenCommandLists()(UR_RESULT_SUCCESS)
(.hQueue = 0x6514dc904810) → UR_RESULT_ERROR_DEVICE_LOST;
An exception was thrown: Native API failed. Native API returns: 20 (UR_RESULT_ERROR_DEVICE_LOST)
As per the trace, it's basically hanging after the mgpuStreamSynchronize call.