Fusing two reduction operations in cuda Thrust

Question

Fusing two reduction operations in cuda Thrust

109 Views Asked by Frimousse At 18 May 2023 at 11:41

Is there a way to do a reduce_by_key operation and a reduce (or ideally another reduce_by_key) operation in only one kernel call in Thrust? Besides gaining computational speed, let us say I want to do this because the number of output values from the first reduce_by_key operation is too large to be stored in memory.

I have been wondering if transform_output_iterator could help here but have not found a solution.

A simple demonstration, but not my real use case, could be to find the minimum of the maximums of each row in a matrix, where that matrix is flattened and stored in a device_vector.

Original Q&A

There are 1 best solutions below

**Abator Abetor** · Accepted Answer · 2023-05-18T13:00:47.413000

The following code computes the minimum of all row maximums with a fixed amount of temporary storage to store a limited number of minima. Afterwards, a min reduce is performed to find the global minimum

The idea is to directly update the minimum value via transform_output_iterator. This can be done via atomics (in case of raw pointers for temp minima) or via locks (in case of iterators for temp minima. not shown in this answer).

To avoid atomic contention, the number of temporary minima should not be too small.

For 1G segments of size 1,i.e. there will be an atomic operation for each input element, I observe the following timings on an A100 GPU.

time approach 1 (standard): 13.2674 ms.
time approach 2 (fused): 38.0479 ms. (minimaSlots = 1)
time approach 2 (fused): 23.9251 ms. (minimaSlots = 1024)
time approach 2 (fused): 10.1109 ms. (minimaSlots = 1024 * 1024)

#include <thrust/device_vector.h>
#include <thrust/reduce.h>
#include <thrust/iterator/transform_output_iterator.h>
#include <thrust/iterator/discard_iterator.h>

#include <iostream>
#include <vector>
#include <limits>

template<size_t size>
struct UpdateMinimumOp{
    int* minPtr;

    UpdateMinimumOp(int* ptr):minPtr(ptr){}
    __device__
    int operator()(int value){
     // select output slot for minimum based on thread id
        const size_t pos = size_t(threadIdx.x) + size_t(blockIdx.x) * size_t(blockDim.x);
        const size_t minPos = pos % size;

        atomicMin(minPtr + minPos, value);
        return value;
    }
};

int main(){
    cudaEvent_t a; cudaEventCreate(&a);
    cudaEvent_t b; cudaEventCreate(&b);
    float t; 

    size_t N =  1ull << 30;
    thrust::device_vector<int> keys(N);
    thrust::device_vector<int> values(N);
    thrust::sequence(keys.begin(), keys.end(), 0);
    thrust::sequence(values.begin(), values.end(), 1);

    //Approach 1 (for timing comparison). max Reduce_by_key. then min reduce
    thrust::device_vector<int> maxima(N);

    cudaEventRecord(a);

    thrust::reduce_by_key(
        keys.begin(),
        keys.end(),
        values.begin(),
        thrust::make_discard_iterator(),
        maxima.begin(),
        thrust::equal_to<int>{},
        thrust::maximum<int>{}
    );

    int minimumApproach1 = thrust::reduce(maxima.begin(), maxima.end(), std::numeric_limits<int>::max(), thrust::minimum<int>{});

    cudaEventRecord(b);
    cudaEventSynchronize(b);
    cudaEventElapsedTime(&t, a,b);
    std::cout << "time approach 1 (standard): " << t << " ms. minimum: " <<minimumApproach1 << "\n";


    //Approach 2. Fuse max Reduce_by_key with the computation of the minimaSlots smallest maxima. then min reduce the stored smallest maxima
    //constexpr size_t minimaSlots = 1; 
    //constexpr size_t minimaSlots = 1024; 
    constexpr size_t minimaSlots = 1024*1024;
   
    thrust::device_vector<int> minima_of_maxima(minimaSlots);
    thrust::fill(minima_of_maxima.begin(), minima_of_maxima.end(), std::numeric_limits<int>::max());

    auto minimaOfMaximaIterator = thrust::make_transform_output_iterator(
        thrust::make_discard_iterator(),
        UpdateMinimumOp<minimaSlots>{minima_of_maxima.data().get()}
    );

    cudaEventRecord(a);

    thrust::reduce_by_key(
        keys.begin(),
        keys.end(),
        values.begin(),
        thrust::make_discard_iterator(),
        minimaOfMaximaIterator,
        thrust::equal_to<int>{},
        thrust::maximum<int>{}
    );

    int minimumApproach2 = thrust::reduce(minima_of_maxima.begin(), minima_of_maxima.end(), std::numeric_limits<int>::max(), thrust::minimum<int>{});

    cudaEventRecord(b);
    cudaEventSynchronize(b);
    cudaEventElapsedTime(&t, a,b);
    std::cout << "time approach 2 (fused): " << t << " ms. minimum: " << minimumApproach2 << "\n";

    cudaEventDestroy(a);
    cudaEventDestroy(b);
}

Fusing two reduction operations in cuda Thrust

There are 1 best solutions below

Related Questions in C++

Related Questions in CUDA

Related Questions in REDUCE

Related Questions in THRUST

Trending Questions

Popular # Hahtags

Popular Questions