CUDA kernel for finding the min and max index of values in a 1D array greater than particular threshold

104 Views Asked by At

I am trying to write a CUDA kernel that is used to find min and max index of values in 1D array greater than a particular threshold Below is the pseudo code in CPU for doing the same

int min_index = 0, max_index = length;
    for(int i = 0; i < length; i++)
    {
        if(arr[i] >= threshold)
        {
            min_index = i;
            break;
        }
    }

    for(int i = length - 1 ; i >= min_index; i--)
    {
        if(arr[i] >= threshold)
        {
            max_index = i;
            break;
        }
    }

One way of doing this is GPU is using atomic operations, but i would not prefer to use them. I was thinking of using reduction, but unable to figure out the logic with which i need to do reduction. For min reduction a logic like below can be followed

    for(int i = 8; i >= 1; i /= 2)
    {
        if(threadIdx.x < i)
            sh_mem[threadIdx.x] = min(sh_mem[threadIdx.x], sh_mem[threadIdx.x + i]);
        __syncthreads();
    }

But what logic can i use to do for my case, where i need to compare the values in shared memory with a threshold and compute the min index in shared memory satisfying this criteria?

Would greatly appreciate any thoughts on how to solve the above problem

EDIT: It is not possible for me to use thrust library due to some constraints

2

There are 2 best solutions below

6
paleonix On

The following is a sample of how to do this in Thrust with a single algorithm instead of two calls to thrust::find_if (one utilizing reverse_iterators). This implementation is using a single transform_reduce where the transform filters out the values above the threshold by replacing them with neutral elements and prepares the following minmax reduction working on tuples.

While it might be possible to get better performance using a custom kernel as discussed in the comments below the question, this should serve as a good reference solution (i.e. this isn't necessarily supposed to be the final/accepted answer).

#include <iostream>

#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/transform_reduce.h>
#include <thrust/zip_function.h>

int main() {
    thrust::host_vector<float> h_vals{0.0f, 100.0f, 0.0f, 100.0f, 42.0f, 0.0f, 0.0f};
    thrust::device_vector<float> vals(h_vals);
    float threshold = 42.0f;

    const auto enumerate_iter = thrust::make_zip_iterator(
        thrust::make_tuple(
            thrust::make_counting_iterator(0),
            vals.cbegin()));
    const auto filter_minmax_input = thrust::make_zip_function(
            [threshold] __host__ __device__ (int idx, float val) {
                if (val >= threshold) {
                    return thrust::make_tuple(idx, idx);
                }
                else {
                    return thrust::make_tuple(INT_MAX, INT_MIN);
                }
            });
    const auto minmax_f = 
        [] __host__ __device__ (thrust::tuple<int, int> left,
                                thrust::tuple<int, int> right) {
            const int left_min_idx = thrust::get<0>(left);
            const int left_max_idx = thrust::get<1>(left);
            const int right_min_idx = thrust::get<0>(right);
            const int right_max_idx = thrust::get<1>(right);
            return thrust::make_tuple(
                thrust::min(left_min_idx, right_min_idx),
                thrust::max(left_max_idx, right_max_idx));
        };
    const auto res = thrust::transform_reduce(
        enumerate_iter, enumerate_iter + vals.size(),
        filter_minmax_input,
        thrust::make_tuple(INT_MAX, INT_MIN),
        minmax_f);
    
    std::cout << thrust::get<0>(res) << ' ' << thrust::get<1>(res) << '\n';
    return 0;
}

Note that in case the minimum and maximum indices are far apart, the two find_ifs should actually perform better, assuming they do implement early exit and avoid reading the whole range from global memory. In the (edge-) case where these indices are very close however a single reduction might be favorable as it avoids overheads.

Because it is probably (did not benchmark) the better solution in general, I will include the (pretty simple) find_if solution here as well:

#include <iostream>

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/find.h>

int main() {
    thrust::host_vector<float> h_vals{0.0f, 100.0f, 0.0f, 100.0f, 42.0f, 0.0f, 0.0f};
    thrust::device_vector<float> vals(h_vals);
    float threshold = 42.0f;

    const auto filter = [threshold] __host__ __device__ (float val){
            return val >= threshold;
        };
    const auto min_idx = thrust::distance(
        vals.cbegin(),
        thrust::find_if(vals.cbegin(), vals.cend(), filter));
    const auto max_idx = thrust::distance(
        thrust::find_if(vals.crbegin(), vals.crend(), filter),
        vals.crend() - 1); // reverse iterator corresponding to cbegin
    
    std::cout << min_idx << ' ' << max_idx << '\n';
    return 0;
}
1
Johan On

It is inefficient to wholly avoid the use of atomics, but you can reduce their usage quite a bit, by doing reduction per warp. If you have Ampere or better, you'll have the __reduce_XXX_sync functions, which I'll do what you want. Note that in actual testing, a warpwide atomicOp on a shared mem is actually faster (12 cycles vs 26 cycles), but you did ask to not use atomic operations.

Testing a predicate can be done very efficiently using __ballot_sync. This tests 32 predicates in parallel. One more reduction and you can do 1024 tests in four instructions (see code below). With some bit twiddling we can derive the first thread to find a match and use that to derive the index of the match.

This code is obviously memory limited, you can make it faster by using async_memcpy (new in Ampere, preview feature in Volta) to prefetch memory ahead of time (out of scope for this answer though). My tests indicate that this will speed it up by about 30%.

Note that atomic operations on the GPU are very very fast, a few cycles, even for atomic operations on global memory. This is unlike atomics on CPUs that takes 1000's of cycles.

If you model the atomics as fire-and-forget operations (i.e.: do not use the return value), they return immediately.

static constexpr auto big = 0x0FFFFFFF;
static constexpr auto small = 0;

__device__ [[nodiscard]] int laneid() { return threadIdx.x % 32; }
__device__ [[nodiscard]] int warpid() { return threadIdx.x / 32; }

template <int blockcount, blocksize>
__device__ int findminmax(int* data, int length, int threshold) {
    constexpr auto warpcount = blocksize / 32;
    const auto warpid = (threadIdx.x / 32);
    __shared__ int minlocation[warpcount];
    __shared__ int anywheremask;
    
    const auto stride = (length + blocksize - 1) / blockcount;
    const auto start = stride * blockIdx.x;
    const auto end = start + stride; //allow processing past end.
    __syncthreads();
    //from start to middle
    for (auto i = start + threadIdx.x; i < end; i += blockDim.x) {
        //all threads will always be active inside this loop.
        const auto a = (i < length) ? data[i] : small; //do not read past end though
        const auto foundmask = __ballot_sync(-1u, a >= threshold);
        //contains a 1 if found inside the warp.
        minlocation[warpid] = foundmask;
        __syncthreads();
        if (warpid() == 0) { //only in warp0 to reduce demand on shared memory.
            anywheremask = __ballot_sync(-1u, minlocation[laneid()]); //note that -1u assumes a block has 1024 threads, should really use a constexpr to derive the activemask from blocksize.
        }
        __syncthreads();
        if (anywheremask) {
            const auto warpfind = __ffs(anywheremask) -1;
            assert(uint32_t(warpfind) < 32);
            //do not worry about 1024 concurrent reads from shared memory, because this only happens once.
            const auto lanefind = __ffs(minlocation[warpfind]) - 1;
            assert(uint32_t(lanefind) < 32);
            //you may use an atomicOr here to signal
            //that you've found something and test that
            //value elsewhere to exit early, so other blocks also have an early out.
            return i - threadIdx.x + warpfind * 32 + lanefind;
        }
    }
    constexpr auto not_found = -1;
    return not_found; 
}

template <int blockcount, int blocksize>
__global__ findminmax(int* data, int length, int threshold, int* gmin, int* gmax) {
    assert(blockcount == blockDim.x);
    gmin[blockIdx.x] = findmin<blockcount, blocksize>(data, length, threshold);
    //now reduce the gmin array, to find the minimal value.
    //code for gmax works the same way, but traverse the data in the opposite direction.
    //do not! run findmin and findmax at the same time, because findmax needs min position as a stop.    
}

int main() {
     //init data
     
     cudaDeviceSynchronize(); //or use collaborative groups
     findminmax<<<46,1024>>><46, 1024>(data, len, t, gmin, gmax);
     cudaDeviceSynchronize(); 
     
}

I have not tested the code above, nor have I tried to compile it, but I hope you'll get the idea.

Pre-Ampere you can easily program your own __reduce_XXX_sync like so:

__device__ [[nodiscard]] int reduce_min_sync(int activemask, int data) {
    assert(activemask == -1); //adjust code if not all threads are active 
    data = min(data, __shfl_down_sync(activemask, 1, data);
    data = min(data, __shfl_down_sync(activemask, 2, data);
    data = min(data, __shfl_down_sync(activemask, 4, data);
    data = min(data, __shfl_down_sync(activemask, 8, data);
    data = min(data, __shfl_down_sync(activemask, 16, data);
    return data;
}

However if will be much faster to do be reduction using atomics

__device__ [[nodiscard]] int reduce_min_sync(int activemask, int data) {
    const auto starttime = clock64(); //not clock32, that is slow!
    __shared__ int s_min;
    s_min = big;
    //__syncwarp()// no need for sync, everyone agrees on s_min
    atomicMin(&s_min, data);
    __syncwarp();
    const auto endtime = clock64();
    printf("time for atomic reduction = %i cycles\n", int(endtime - starttime));
    return s_min;
}