PyTorch patch for building on JetPack >= 4.4

diff --git a/aten/src/ATen/cuda/CUDAContext.cpp b/aten/src/ATen/cuda/CUDAContext.cpp
index 1751128f1a..03e74f5ac2 100644
--- a/aten/src/ATen/cuda/CUDAContext.cpp
+++ b/aten/src/ATen/cuda/CUDAContext.cpp
@@ -24,6 +24,8 @@ void initCUDAContextVectors() {
 void initDeviceProperty(DeviceIndex device_index) {
   cudaDeviceProp device_prop;
   AT_CUDA_CHECK(cudaGetDeviceProperties(&device_prop, device_index));
   device_properties[device_index] = device_prop;
 }
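
The two '+' lines of this hunk did not survive the copy. In the JetPack/Jetson
patches that circulate for the "too many resources requested for launch" error
they halve the device's reported per-block thread limit, so the assumed content
(not verbatim from this capture) is roughly:

  // patch for "too many resources requested for launch"
  device_prop.maxThreadsPerBlock = device_prop.maxThreadsPerBlock / 2;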

diff --git a/aten/src/ATen/cuda/detail/KernelUtils.h b/aten/src/ATen/cuda/detail/KernelUtils.h
index 45056ab996..81a0246ceb 100644
--- a/aten/src/ATen/cuda/detail/KernelUtils.h
+++ b/aten/src/ATen/cuda/detail/KernelUtils.h
@@ -22,7 +22,10 @@ namespace at { namespace cuda { namespace detail {
 // Use 1024 threads per block, which requires cuda sm_2x or above
-constexpr int CUDA_NUM_THREADS = 1024;
+//constexpr int CUDA_NUM_THREADS = 1024;
+
+// patch for "too many resources requested for launch"
+constexpr int CUDA_NUM_THREADS = 512;
 // CUDA: number of blocks for threads.
 inline int GET_BLOCKS(const int64_t N) {
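
GET_BLOCKS turns an element count into a grid size, so lowering CUDA_NUM_THREADS
from 1024 to 512 simply doubles the number of blocks launched for the same N
while roughly halving each block's register demand, which is what avoids the
launch-resource error on the Jetson's integrated GPU. A minimal sketch of that
relationship (not the PyTorch implementation, which also checks for overflow):

  #include <cstdint>

  constexpr int CUDA_NUM_THREADS = 512;

  inline int GET_BLOCKS(const int64_t N) {
    // round-up division: one extra block when N is not a multiple of the block size
    return static_cast<int>((N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS);
  }

  // e.g. GET_BLOCKS(100000) == 196 with 512 threads/block (vs. 98 with 1024)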

diff --git a/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp b/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp
index 4e9c799986..12c1453073 100644
--- a/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp
+++ b/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp
@@ -24,7 +24,13 @@ using namespace vec256;
 // copysign faster for the half-precision types
 template<typename T>
 T copysign(T a, T b) {
+#if (!defined(__aarch64__)) || defined(__clang__) || (__GNUC__ > 8)
   return std::copysign(a, b);
+#else
+#endif
 }
 // Implement copysign for half precision floats using bit ops
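
The '#else' branch above is empty in this capture; whatever the original patch
put there has been lost. Its job is to give gcc 7/8 on aarch64 (the JetPack
toolchain) a copysign that does not hit the known std::copysign compiler bug.
A plausible stand-in with the same behaviour (an assumption, not the verbatim
patch body; copysign_fallback is an illustrative name):

  #include <cmath>

  template <typename T>
  T copysign_fallback(T a, T b) {
    // magnitude of a, sign of b; std::signbit also honours negative zero
    return std::signbit(b) ? -std::abs(a) : std::abs(a);
  }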

@@ -149,6 +155,18 @@ void div_trunc_kernel(TensorIterator& iter) {
   }
 }
+// this is a function because MSVC does not like us to use #if inside AT_DISPATCH
+template <typename scalar_t>
+static inline scalar_t signed_zero(scalar_t sign) {
+#if (!defined(__aarch64__)) || defined(__clang__) || (__GNUC__ > 8)
+#else
+#endif
+}
 // NOTE: [Floor Division in Python]
 // Python's __floordiv__ operator is more complicated than just floor(a / b).
 // It aims to maintain the property: a == (a // b) * b + remainder(a, b)
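
Both branches of signed_zero are empty in this capture. The helper exists so
that the compiler guard does not have to sit inside an AT_DISPATCH macro body
(which MSVC rejects); it returns a zero carrying the sign of its argument. A
sketch of what the two branches presumably compute (assumed, not the verbatim
patch body):

  #include <cmath>

  template <typename scalar_t>
  static inline scalar_t signed_zero(scalar_t sign) {
  #if (!defined(__aarch64__)) || defined(__clang__) || (__GNUC__ > 8)
    return std::copysign(scalar_t(0), sign);                    // normal path
  #else
    return std::signbit(sign) ? scalar_t(-0.0) : scalar_t(0.0); // gcc 7/8 aarch64 workaround
  #endif
  }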

@@ -201,7 +219,7 @@ void div_floor_kernel(TensorIterator& iter) {
             floordiv += scalar_t(1.0);
           }
         } else {
-          floordiv = copysign(scalar_t(0), a / b);
         }
         return floordiv;
       });
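
The '+' line that replaces the deleted copysign call is missing from this
capture. Given the signed_zero helper introduced above (and the unchanged line
count in the hunk header, -201,7 +219,7), it is presumably a single line along
the lines of "floordiv = signed_zero(scalar_t(a / b));".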

diff --git a/aten/src/ATen/native/cuda/BinaryMulDivKernel.cu b/aten/src/ATen/native/cuda/BinaryMulDivKernel.cu
index e3ac2665a4..77e866b7f3 100644
--- a/aten/src/ATen/native/cuda/BinaryMulDivKernel.cu
+++ b/aten/src/ATen/native/cuda/BinaryMulDivKernel.cu
@@ -1,10 +1,11 @@
 #include <ATen/AccumulateType.h>
 #include <ATen/Dispatch.h>
+#include <ATen/native/BinaryOps.h>
 #include <ATen/native/DispatchStub.h>
-#include <ATen/native/cuda/Loops.cuh>
 #include <ATen/native/TensorIterator.h>
-#include <ATen/native/BinaryOps.h>
 #include <c10/cuda/CUDAGuard.h>
+#include <c10/cuda/CUDAMathCompat.h>
+#include <ATen/native/cuda/Loops.cuh>
 // NOTE: CUDA on Windows requires that the enclosing function
 // of a __device__ lambda not have internal linkage.
@@ -139,7 +140,9 @@ void div_floor_kernel_cuda(TensorIterator& iter) {
             floordiv += scalar_t(1.0);
           }
         } else {
-          floordiv = std::copysign(scalar_t(0), a * inv_b);
         }
         return floordiv;
       });
@@ -160,7 +163,9 @@ void div_floor_kernel_cuda(TensorIterator& iter) {
             floordiv += scalar_t(1.0);
           }
         } else {
-          floordiv = std::copysign(scalar_t(0), a / b);
         }
         return floordiv;
       });
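
In both CUDA hunks the removed std::copysign line is shown but its replacement
is not. The newly added #include <c10/cuda/CUDAMathCompat.h> strongly suggests
the call is rerouted through the compat wrapper patched further below, i.e.
presumably "floordiv = c10::cuda::compat::copysign(scalar_t(0), a * inv_b);" in
the first hunk and "floordiv = c10::cuda::compat::copysign(scalar_t(0), a / b);"
in the second (assumed; the hunk headers leave room for an extra comment line or
two as well).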

diff --git a/aten/src/THCUNN/common.h b/aten/src/THCUNN/common.h
index 69b7f3a4d3..85b0b1305f 100644
--- a/aten/src/THCUNN/common.h
+++ b/aten/src/THCUNN/common.h
@@ -5,7 +5,10 @@
   "Some of weight/gradient/input tensors are located on different GPUs. Please move them to a single one.")
 // Use 1024 threads per block, which requires cuda sm_2x or above
-const int CUDA_NUM_THREADS = 1024;
+//constexpr int CUDA_NUM_THREADS = 1024;
+
+// patch for "too many resources requested for launch"
+constexpr int CUDA_NUM_THREADS = 512;
 // CUDA: number of blocks for threads.
 inline int GET_BLOCKS(const int64_t N)

diff --git a/c10/cuda/CUDAMathCompat.h b/c10/cuda/CUDAMathCompat.h
index 1fb0c3ec29..a4c6655859 100644
--- a/c10/cuda/CUDAMathCompat.h
+++ b/c10/cuda/CUDAMathCompat.h
@@ -42,11 +42,80 @@ __MATH_FUNCTIONS_DECL__ double ceil(double x) {
   return ::ceil(x);
 }
+__MATH_FUNCTIONS_DECL__ float fp32_from_bits(uint32_t w) {
+#if defined(__OPENCL_VERSION__)
+#elif defined(__CUDA_ARCH__)
+#elif defined(__INTEL_COMPILER)
+#else
+#endif
+}
+__MATH_FUNCTIONS_DECL__ uint32_t fp32_to_bits(float f) {
+#if defined(__OPENCL_VERSION__)
+#elif defined(__CUDA_ARCH__)
+#elif defined(__INTEL_COMPILER)
+#else
+#endif
+}
+__MATH_FUNCTIONS_DECL__ double fp64_from_bits(uint64_t w) {
+#if defined(__CUDA_ARCH__)
+#else
+#endif
+}
+__MATH_FUNCTIONS_DECL__ uint64_t fp64_to_bits(double f) {
+#if defined(__CUDA_ARCH__)
+#else
+#endif
+}
 __MATH_FUNCTIONS_DECL__ float copysign(float x, float y) {
-  return ::copysignf(x, y);
+#if (!defined(__aarch64__)) || defined(__clang__) || (__GNUC__ > 8)
+#else
+#endif
 }
 __MATH_FUNCTIONS_DECL__ double copysign(double x, double y) {
-  return ::copysign(x, y);
+#if (!defined(__aarch64__)) || defined(__clang__) || (__GNUC__ > 8)
+#else
+#endif
 }
 __MATH_FUNCTIONS_DECL__ float floor(float x) {
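
Every new function in this hunk has had its body stripped by the copy; only the
preprocessor skeletons remain. The shape is still recognisable: fp32_to_bits /
fp32_from_bits (and the fp64 pair) are the usual bit-cast helpers, selecting an
intrinsic per compiler (__uint_as_float / __float_as_uint under __CUDA_ARCH__, a
union or memcpy otherwise), and the guarded copysign presumably keeps calling
::copysignf / ::copysign on the first branch while the gcc 7/8 aarch64 branch
splices the sign bit of y onto the magnitude bits of x. A host-only sketch of
that idea (assumed, not the verbatim patch bodies):

  #include <cstdint>
  #include <cstring>

  inline float fp32_from_bits(uint32_t w) {
    float f;
    std::memcpy(&f, &w, sizeof(f));   // device code would use __uint_as_float(w)
    return f;
  }

  inline uint32_t fp32_to_bits(float f) {
    uint32_t w;
    std::memcpy(&w, &f, sizeof(w));   // device code would use __float_as_uint(f)
    return w;
  }

  inline float copysign_bits(float x, float y) {
    // magnitude bits of x combined with the sign bit of y
    return fp32_from_bits((fp32_to_bits(x) & 0x7fffffffu) |
                          (fp32_to_bits(y) & 0x80000000u));
  }

  // the double overloads are analogous, using uint64_t with the masks
  // 0x7fffffffffffffffull and 0x8000000000000000ull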