CUDA thrust max_element fails with large index

111 Views Asked by At

Using Thrust to find maximum values, I get error what(): extrema failed to synchronize: cudaErrorIllegalAddress: an illegal memory access was encountered when I put large array whose length is close to INT_MAX (2147483647).

I found a similar post in https://github.com/NVIDIA/thrust/issues/1271, and it seems most of the bugs are 'closed'.

Maybe max_element is the one remaining?

Would there be any workaround with Thurst? While I can write my own CUDA kernels to find maximum, I bet it will underperform Thrust.

Below is the test code, and when I keep first 3 digits of INT_MAX (i.e. 2140000000), it works. However, if I keep one additional digit as 2147000000, it fails with the error above.

#include <iostream>
#include <cstdlib>
#include "thrust/device_ptr.h"
#include "thrust/device_vector.h"
#include "thrust/fill.h"
#include "thrust/copy.h"
#include "thrust/extrema.h"

int main() {
  size_t N = std::numeric_limits<int>::max(); // 2147483647
  N = 2147000000;

  std::cout << "N  " << N << std::endl;

  thrust::device_vector<int> dArr(N);
  thrust::fill(dArr.begin(), dArr.end(), 1);

  dArr[N-2] = 2;

  thrust::device_vector<int>::iterator iter=thrust::max_element(dArr.begin(),dArr.end());

  size_t position = iter - dArr.begin();
  int    val = dArr[position];

  std::cout << "max value in dArr  " << val << std::endl;
  std::cout << " - max_index  " << position << std::endl;

  return 0;
}
1

There are 1 best solutions below

0
Abator Abetor On BEST ANSWER

Here are two possible work-arounds. 1. Use thrust::reduce with a custom comparator. 2. Use cub's version of max_element. Note that cub's public API currently only supports up to int_max elements. thrust::reduce should use the internal cub implementation with the correct index type depending on the size of the input range.

// nvcc --extended-lambda -arch=sm_80 -std=c++17

#include <iostream>
#include <cstdlib>
#include "thrust/device_ptr.h"
#include "thrust/device_vector.h"
#include "thrust/fill.h"
#include "thrust/copy.h"
#include "thrust/extrema.h"

#include <cub/cub.cuh>

int main() {
  size_t N = std::numeric_limits<int>::max(); // 2147483647
  N = 2147000000;

  std::cout << "N  " << N << std::endl;

  thrust::device_vector<int> dArr(N);
  thrust::fill(dArr.begin(), dArr.end(), 1);

  dArr[N-2] = 2;

  // thrust::device_vector<int>::iterator iter=thrust::max_element(dArr.begin(),dArr.end());

  // size_t position = iter - dArr.begin();
  // int    val = dArr[position];

  // std::cout << "max value in dArr  " << val << std::endl;
  // std::cout << " - max_index  " << position << std::endl;


  auto result1 = thrust::reduce(
    thrust::make_zip_iterator(
      dArr.begin(),
      thrust::make_counting_iterator<int>(0)
    ),
    thrust::make_zip_iterator(
      dArr.begin(),
      thrust::make_counting_iterator<int>(0)
    ) + N,
    thrust::make_tuple(std::numeric_limits<int>::min(), -1),
    [] __host__ __device__ (const thrust::tuple<int,int>& l, const thrust::tuple<int,int>& r){
      if(thrust::get<0>(l) > thrust::get<0>(r)) return l;
      else return r;
    }
  );

  std::cout << thrust::get<0>(result1) << " " << thrust::get<1>(result1) << "\n";

  using T = int;
  size_t temp_storage_bytes = 0;
  thrust::device_vector<cub::KeyValuePair<int, T>> dResult2(1);
  cub::DeviceReduce::ArgMax(
    nullptr,
    temp_storage_bytes,
    dArr.data().get(),
    dResult2.data().get(),
    N
  );

  thrust::device_vector<char> dTemp(temp_storage_bytes);

  cub::DeviceReduce::ArgMax(
    dTemp.data().get(),
    temp_storage_bytes,
    dArr.data().get(),
    dResult2.data().get(),
    N
  );
  cudaDeviceSynchronize();
  cub::KeyValuePair<int, T> result2 = dResult2[0];
  std::cout << result2.value << " " << result2.key << "\n";


  return 0;
}