parallel_for's throw_on_error results in terminate · Issue #1448 · NVIDIA/thrust (original) (raw)

We're using Thrust with Torch 1.7.1 and MinkowskiEngine 0.5.4 and experience a deterministic issue which makes the library unusable for long-running processes.

When we run parallel_for on large arrays, there are several memory allocation steps, and if one of them hits an out-of-memory (OOM) error, the process is terminated (std::terminate/abort) instead of raising an exception that could be handled. This kind of termination is usually the result of an exception being thrown inside a noexcept function.

Environment:

Ubuntu 18.04
CUDA 10.2
Driver Version: 460.73.01

Code to reproduce:

#include <algorithm>
#include <iostream>
#include <new>
#include <random>
#include <vector>

#include <cuda_runtime.h>
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/host_vector.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>
#include <thrust/tuple.h>

int main() { std::random_device rd; std::mt19937 engine; engine.seed(rd()); std::uniform_real_distribution u(0, 90.);

// Parameter for Tesla V100 16GB VRAM (Use 250M for 8GB VRAM): int N = 550000000;

std::vector v(N); std::generate(v.begin(), v.end(), & { return u(engine); }); thrust::host_vector hv(v.begin(), v.end()); thrust::device_vector dv = hv;

thrust::device_vector res(dv.begin(), dv.end());

thrust::device_vector index(N); thrust::sequence(thrust::device, index.begin(), index.end(), 0, 1);

while (1) { try { std::cout << "step" << std::endl; thrust::sort_by_key(thrust::device, // dv.begin(), // key begin dv.end(), // key end thrust::make_zip_iterator( // value begin thrust::make_tuple( // dv.begin(), // index.begin() // ))); } catch (std::bad_alloc) { std::cout << "bad_alloc" << std::endl; } catch (...) { std::cout << "other error" << std::endl; } // thrust exception cannot be caught }

cudaDeviceSynchronize(); return 0; }

Traceback:

#0  0x00007f07d8108fb7 in raise () from /lib/x86_64-linux-gnu/libc.so.6
#1  0x00007f07d810a921 in abort () from /lib/x86_64-linux-gnu/libc.so.6
#2  0x00007f07d8afd957 in ?? () from /usr/lib/x86_64-linux-gnu/libstdc++.so.6
#3  0x00007f07d8b03ae6 in std::rethrow_exception(std::__exception_ptr::exception_ptr) () from /usr/lib/x86_64-linux-gnu/libstdc++.so.6
#4  0x00007f07d8b02b49 in __cxa_throw_bad_array_new_length () from /usr/lib/x86_64-linux-gnu/libstdc++.so.6
#5  0x00007f07d8b034b8 in __gxx_personality_v0 () from /usr/lib/x86_64-linux-gnu/libstdc++.so.6
#6  0x00007f07d84cb573 in _Unwind_GetTextRelBase () from /lib/x86_64-linux-gnu/libgcc_s.so.1
#7  0x00007f07d84cbad1 in _Unwind_RaiseException () from /lib/x86_64-linux-gnu/libgcc_s.so.1
#8  0x00007f07d8b03d47 in __cxa_throw () from /usr/lib/x86_64-linux-gnu/libstdc++.so.6
#9  0x000056485a33fb4d in thrust::cuda_cub::throw_on_error(cudaError, char const*) ()
#10 0x000056485a35c134 in void thrust::cuda_cub::parallel_for<thrust::cuda_cub::par_t, thrust::cuda_cub::for_each_f<thrust::pointer<thrust::tuple<float, int, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>, thrust::cuda_cub::par_t, thrust::use_default, thrust::use_default>, thrust::detail::wrapped_function<thrust::detail::allocator_traits_detail::gozer, void> >, long>(thrust::cuda_cub::execution_policy<thrust::cuda_cub::par_t>&, thrust::cuda_cub::for_each_f<thrust::pointer<thrust::tuple<float, int, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>, thrust::cuda_cub::par_t, thrust::use_default, thrust::use_default>, thrust::detail::wrapped_function<thrust::detail::allocator_traits_detail::gozer, void> >, long) ()

I found that the issue is not present on CUDA 11.1, so we have started migrating our codebase to it. I am still filing this report because it might be useful for someone else who encounters the same issue.