Why don’t I see a speed improvement when using std::execution with GCC?

178 Views Asked by At

I have the following code to test the speed improvement from the std::execution library on Windows 10:

#include <stddef.h>
#include <stdio.h>

#include <algorithm>
#include <chrono>
#include <execution>
#include <random>
#include <ratio>
#include <vector>

using std::milli;
using std::random_device;
using std::sort;
using std::vector;
using std::chrono::duration;
using std::chrono::duration_cast;
using std::chrono::high_resolution_clock;

const size_t testSize = 1'000'000;
const int iterationCount = 5;

/// Prints one benchmark line: the tag, the smallest and largest element of
/// the sorted data, and the elapsed time in milliseconds.
///
/// @tparam TimePoint  a std::chrono time_point type, deduced from the
///                    arguments, so callers can time with
///                    high_resolution_clock or steady_clock alike.
/// @param tag         label printed at the start of the line.
/// @param sorted      values that have already been sorted; front() and
///                    back() are reported as "Lowest" and "Highest".
/// @param startTime   time point taken just before the sort.
/// @param endTime     time point taken just after the sort.
template <typename TimePoint>
void print_results(const char* const tag,
                   const std::vector<double>& sorted,
                   const TimePoint startTime,
                   const TimePoint endTime)
{
    const double elapsedMs =
        std::chrono::duration_cast<std::chrono::duration<double, std::milli>>(endTime - startTime).count();

    // front()/back() on an empty vector is undefined behaviour, so report
    // only the timing in that case.
    if (sorted.empty())
    {
        printf("%s: (no elements) Time: %f ms\n", tag, elapsedMs);
        return;
    }

    printf("%s: Lowest: %g Highest: %g Time: %f ms\n", tag, sorted.front(), sorted.back(), elapsedMs);
}

/// Benchmark driver: fills a vector with random doubles, then times
/// iterationCount serial sorts followed by iterationCount sorts that request
/// the std::execution::par policy, printing one result line per run.
int main()
{
    random_device rd;

    // %zu is the printf conversion for size_t; %llu is only correct on
    // platforms where size_t happens to be unsigned long long.
    printf("Testing with %zu doubles...\n", testSize);
    vector<double> doubles(testSize);
    for (auto& d : doubles)
    {
        d = static_cast<double>(rd());
    }

    // iterationCount is an int, so the counter is an int as well; this
    // avoids a signed/unsigned comparison.
    for (int i = 0; i < iterationCount; ++i)
    {
        // Sort a fresh copy each time so every run sees the same unsorted input.
        vector<double> sorted(doubles);
        const auto startTime = high_resolution_clock::now();
        sort(sorted.begin(), sorted.end());
        const auto endTime = high_resolution_clock::now();
        print_results("Serial STL", sorted, startTime, endTime);
    }

    for (int i = 0; i < iterationCount; ++i)
    {
        vector<double> sorted(doubles);
        const auto startTime = high_resolution_clock::now();
        // Same sort, but with the parallel execution policy requested.
        std::sort(std::execution::par, sorted.begin(), sorted.end());
        const auto endTime = high_resolution_clock::now();
        print_results("Parallel STL", sorted, startTime, endTime);
    }
    return 0;
}

I compile this code using CMake, with either Ninja or MSVC (Visual Studio) as the generator.

Here is the CMakeLists.txt code:

cmake_minimum_required(VERSION 3.14.0)
project(EXEC VERSION 0.0.1)

# Language standards: C17 for C sources, C++20 (mandatory) for C++ sources.
set(CMAKE_C_STANDARD 17)
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# The single benchmark executable.
add_executable(executionTests targets/executionTests.cpp)

# Always build the benchmark with optimizations, whatever the build type.
if(CMAKE_CXX_COMPILER_ID MATCHES "GNU")
    target_compile_options(executionTests PRIVATE -O3)
elseif(CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
    # Strip every /RTC* switch from the Debug flags of both languages before
    # adding /O2.
    foreach(flagLang CXX C)
        string(REGEX REPLACE "/RTC(su|[1su])" "" CMAKE_${flagLang}_FLAGS_DEBUG "${CMAKE_${flagLang}_FLAGS_DEBUG}")
    endforeach()
    target_compile_options(executionTests PRIVATE /O2)
endif()

And config/build scripts:

# Configure step: enter build/, run CMake against the parent directory, then
# return. Exactly one of the two generator lines should be active.
# Ninja generator variant:
# Set-Location build ; cmake .. -DCMAKE_BUILD_TYPE=Debug -G Ninja ; Set-Location ..
# Visual Studio 2022 generator variant:
Set-Location build ; cmake .. -DCMAKE_BUILD_TYPE=Debug -G "Visual Studio 17 2022" ; Set-Location ..

# Build step: build only the executionTests target, with up to 8 parallel
# jobs (-j 8) and verbose command output (-v).
cmake --build build --target executionTests -j 8 -v

Running the executable built with the Ninja generator (GCC 13.1.0 compiler) gives this result:

Testing with 1000000 doubles...
Serial STL: Lowest: 9059 Highest: 4.29496e+09 Time: 75.064000 ms
Serial STL: Lowest: 9059 Highest: 4.29496e+09 Time: 78.308300 ms
Serial STL: Lowest: 9059 Highest: 4.29496e+09 Time: 77.079100 ms
Serial STL: Lowest: 9059 Highest: 4.29496e+09 Time: 77.511300 ms
Serial STL: Lowest: 9059 Highest: 4.29496e+09 Time: 76.836500 ms
Parallel STL: Lowest: 9059 Highest: 4.29496e+09 Time: 77.417900 ms
Parallel STL: Lowest: 9059 Highest: 4.29496e+09 Time: 77.452600 ms
Parallel STL: Lowest: 9059 Highest: 4.29496e+09 Time: 78.962000 ms
Parallel STL: Lowest: 9059 Highest: 4.29496e+09 Time: 80.188500 ms
Parallel STL: Lowest: 9059 Highest: 4.29496e+09 Time: 79.135000 ms

But the executable built with the "Visual Studio 17 2022" generator gives the following result:

Testing with 1000000 doubles...
Serial STL: Lowest: 5059 Highest: 4.29497e+09 Time: 256.872900 ms
Serial STL: Lowest: 5059 Highest: 4.29497e+09 Time: 264.764000 ms
Serial STL: Lowest: 5059 Highest: 4.29497e+09 Time: 262.767800 ms
Serial STL: Lowest: 5059 Highest: 4.29497e+09 Time: 264.283300 ms
Serial STL: Lowest: 5059 Highest: 4.29497e+09 Time: 259.603600 ms
Parallel STL: Lowest: 5059 Highest: 4.29497e+09 Time: 86.583400 ms
Parallel STL: Lowest: 5059 Highest: 4.29497e+09 Time: 81.407500 ms
Parallel STL: Lowest: 5059 Highest: 4.29497e+09 Time: 81.962600 ms
Parallel STL: Lowest: 5059 Highest: 4.29497e+09 Time: 88.384000 ms
Parallel STL: Lowest: 5059 Highest: 4.29497e+09 Time: 84.420800 ms

At this point I expected to see a difference in sorting speed between std::execution::par and the plain serial sort when compiling with GCC, but I only see a difference with the MSVC compiler. Why? By the way, if I change std::execution::par to std::execution::seq, nothing changes.

Here is the verbose compile and linking via Ninja build generator:

[1/2] L:\UCRT_GCC-13-1-0_x64\mingw64\bin\c++.exe   -g -O3 -std=gnu++20 -MD -MT CMakeFiles/executionTests.dir/targets/executionTests.cpp.obj -MF CMakeFiles\executionTests.dir\targets\executionTests.cpp.obj.d -o CMakeFiles/executionTests.dir/targets/executionTests.cpp.obj -c ${WorkspaceFolder}/targets/executionTests.cpp
[2/2] cmd.exe /C "cd . && L:\UCRT_GCC-13-1-0_x64\mingw64\bin\c++.exe -g  CMakeFiles/executionTests.dir/targets/executionTests.cpp.obj -o ..\${OutputDir}\executionTests.exe -Wl,--out-implib,..\${OutputDir}\libexecutionTests.dll.a -Wl,--major-image-version,0,--minor-image-version,0  -lkernel32 -luser32 -lgdi32 -lwinspool -lshell32 -lole32 -loleaut32 -luuid -lcomdlg32 -ladvapi32 && cd ."

Here is the verbose compile and linking via "Visual Studio 17 2022" build generator:

   ClCompile:
     C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\MSVC\14.36.32532\bin\HostX64\x64\CL.exe /c /Zi /nologo /W3 /WX- /diagnostics:column /O2 /Ob0 /D _MBCS /D WIN32 /D _WINDOWS /D "CMAKE_INTDIR=\"Debug\"" /Gm- /EHsc /MDd /GS /fp:precise /Zc:wchar_t /Zc:forScope /Zc:inli
     ne /GR /std:c++20 /Fo"executionTests.dir\Debug\\" /Fd"executionTests.dir\Debug\vc143.pdb" /external:W3 /Gd /TP /errorReport:queue ${WorkspaceFolder}\targets\executionTests.cpp
     executionTests.cpp
   Link:
     C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\MSVC\14.36.32532\bin\HostX64\x64\link.exe /ERRORREPORT:QUEUE /OUT:"${OutputDir}\Debug\executionTests.exe" /INCREMENTAL /ILK:"executionTests
     .dir\Debug\executionTests.ilk" /NOLOGO kernel32.lib user32.lib gdi32.lib winspool.lib shell32.lib ole32.lib oleaut32.lib uuid.lib comdlg32.lib advapi32.lib /MANIFEST /MANIFESTUAC:"level='asInvoker' uiAccess='false'" /manifest:embed /DEBUG /PDB:"${OutputDir}/Debug/executionTests.pdb" /SUBSYSTEM:CONSOLE /TLBID:1 /DYNAMICBASE /NXCOMPAT /IMPLIB:"${OutputDir}/Debug/executionTests.lib" /MACHINE:X64  /machine:x64 
      executionTests.dir\Debug\executionTests.obj
     executionTests.vcxproj -> ${OutputDir}\Debug\executionTests.exe

Don’t see what I am missing.

Could it be that the STL implementation in GCC 13.1.0 (and maybe earlier versions too) already gets these speed improvements from the -O3 flag, so std::execution is not needed?

Or maybe I just didn’t pass the flag needed for std::execution to improve performance even further, i.e. to get below the 75–80 ms I see now?

1

There are 1 best solutions below

1
Dmytro Kovryzhenko On

It seems that the issue is resolved.

Ted Lyngmo mentioned a very important fact:

... when you include <execution> it checks if it can find the tbb headers. If they available, it includes them and uses tbb as a backend. If it can't find a backend to use, it will fallback to std::execution::seq...

It was a surprise to me that, even though I do not explicitly use any TBB headers in my code, I still need to include and link TBB...

So I had to fix CMakeLists.txt accordingly, adding the TBB include directory and linking the TBB library.

cmake_minimum_required(VERSION 3.14.0)
project(EXEC VERSION 0.0.1)

set(CMAKE_C_STANDARD 17)
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# ! Set the TBB library and header locations.
# ? These are cache variables: change the defaults below, or pass
# ? -DTBB_ROOT_LIB=... / -DTBB_ROOT_INC=... at configure time, to match your
# ? own oneTBB location.
if(CMAKE_CXX_COMPILER_ID MATCHES "GNU")
    set(TBB_ROOT_LIB "L:/oneTBB/mingw64/mingw64/bin" CACHE PATH "Path to TBB")
    set(TBB_ROOT_INC "L:/oneTBB/mingw64/mingw64/include" CACHE PATH "Path to TBB")
elseif(CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
    set(TBB_ROOT_LIB "L:/oneapi-tbb-2021.9.0-win/oneapi-tbb-2021.9.0/lib/intel64/vc14" CACHE PATH "Path to TBB")
    set(TBB_ROOT_INC "L:/oneapi-tbb-2021.9.0-win/oneapi-tbb-2021.9.0/include" CACHE PATH "Path to TBB")
endif()

add_executable(
    executionTests
    targets/executionTests.cpp
)

# ! When you include `<execution>`, GCC checks whether it can find the TBB
# ! headers, so they must be on the include path.
# ? PRIVATE is enough: nothing links against an executable.
target_include_directories(
    executionTests
    PRIVATE "${TBB_ROOT_INC}"
)

# ! GCC also needs to link the TBB library to use its features, so specify
# ! where the library lives.
target_link_directories(
    executionTests
    PRIVATE "${TBB_ROOT_LIB}"
)

# ! Libraries for MSVC and GCC are different
if(CMAKE_CXX_COMPILER_ID MATCHES "GNU")
    # * I found only one version of the TBB DLL for mingw64,
    # * not sure if it's the Debug or Release version,
    # * but the results are very close to MSVC Release
    set(TBB_LIB "-llibtbb12")
elseif(CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
    # * Choose the library per configuration with a generator expression.
    # * Testing CMAKE_BUILD_TYPE here would be wrong: Visual Studio is a
    # * multi-config generator that ignores it, and an unquoted
    # * ${CMAKE_BUILD_TYPE} makes if() fail when the variable is empty.
    set(TBB_LIB "$<IF:$<CONFIG:Debug>,tbb_debug.lib,tbb.lib>")
endif()

# ! Finally link library
target_link_libraries(
    executionTests
    PRIVATE ${TBB_LIB}
)

if(CMAKE_CXX_COMPILER_ID MATCHES "GNU")
    target_compile_options(
        executionTests
        PRIVATE
        -O3
    )
elseif(CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
    # ? I never mentioned why I have these lines:
    # ? /O2 (improve performance by reducing execution time and optimizing code size)
    # ? and /RTC1 (perform runtime checks) flags can't be combined,
    # ? so this is a workaround to remove /RTC1 from the resulting command line
    string(REGEX REPLACE "/RTC(su|[1su])" "" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}")
    string(REGEX REPLACE "/RTC(su|[1su])" "" CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG}")
    target_compile_options(
        executionTests
        PRIVATE
        /O2
    )
endif()

Note that MSVC needs the configuration type in the build command, and not only in the configure command, so it has to be added there as well:

# The Visual Studio generator needs the configuration at build time, so
# --config is passed here in addition to CMAKE_BUILD_TYPE at configure time.
cmake --build build --config Debug --target executionTests -j 8 -v
# Release variant of the same build command:
# cmake --build build --config Release --target executionTests -j 8 -v

The following useful code changes were made:

C++ headers instead of C headers (<cstddef> and <cstdio> instead of stddef.h and stdio.h), and other useful changes:

#include <cstddef>
#include <cstdio>

#include <ranges> // ! add this for std::ranges::generate
#include <type_traits>

// * I came up with this template function to make std::mt19937 easy to use
// * and to generate numbers in a functor-like way.
//
/// Returns one pseudo-random value of type Type, uniformly distributed
/// between the compile-time bounds left and right.
///
/// Floating-point types use std::uniform_real_distribution, i.e. the range
/// [left, right). Integral types use std::uniform_int_distribution, i.e. the
/// range [left, right]. Floating-point template arguments require C++20.
///
/// The function takes no arguments so it can be passed directly as a
/// generator, e.g. to std::ranges::generate.
///
/// @tparam Type   arithmetic type of the result.
/// @tparam left   lower bound of the distribution.
/// @tparam right  upper bound of the distribution.
/// @return the next value drawn from the distribution.
template <typename Type, Type left, Type right>
Type generateRandomNumber()
{
    /*
    ! Your program looks portable as-is so it's just a matter of making
    ! the implementation use a backend for the execution
    ! policies. I would however not use `random_device` as a source since
    ! that will produce different results every time. Use
    ! a deterministic source so that you get the same content of the containers every time.
    ! `std::mt19937 rd;` would be better for that.
    */
    using Distribution = std::conditional_t<std::is_integral_v<Type>,
                                            std::uniform_int_distribution<Type>,
                                            std::uniform_real_distribution<Type>>;

    // The engine is created and seeded once per thread instead of on every
    // call: constructing random_device and mt19937 for each generated number
    // is slow. Replace the seed with a constant to get the reproducible data
    // recommended in the note above.
    static thread_local std::mt19937 gen{std::random_device{}()};

    Distribution dist(left, right);    //* uniform over the bounds left .. right
    return dist(gen);
}

/// Benchmark driver: fills a vector with random doubles, then times
/// iterationCount serial sorts followed by iterationCount sorts that request
/// the std::execution::par policy, printing one result line per run.
int main()
{
    /*
    ! Never use high_resolution_clock.
    ! The specification for that clock was messed up and
    ! even the author of that clock says to not use it.
    ! Use std::chrono::steady_clock instead.
    */
    // Declared locally so the timing code below does not rely on a
    // file-level using-declaration.
    using std::chrono::steady_clock;

    // ! %llu is the wrong printf conversion specifier for size_t. Use %zu.
    printf("Testing with %zu doubles...\n", testSize);
    vector<double> doubles(testSize);
    // * my idea how we can generate random doubles (between -100 and 100)
    std::ranges::generate(doubles, generateRandomNumber<double, -100.0, 100.0>);

    // time how long it takes to sort them:
    // (iterationCount is an int, so the counter is an int too; this avoids
    // a signed/unsigned comparison)
    for (int i = 0; i < iterationCount; ++i)
    {
        // Sort a fresh copy each time so every run sees the same unsorted input.
        vector<double> sorted(doubles);
        const auto startTime = steady_clock::now();
        sort(sorted.begin(), sorted.end());
        const auto endTime = steady_clock::now();
        print_results("Serial STL", sorted, startTime, endTime);
    }

    for (int i = 0; i < iterationCount; ++i)
    {
        vector<double> sorted(doubles);
        const auto startTime = steady_clock::now();
        // same sort call as above, but with the std::execution::par policy:
        std::sort(std::execution::par, sorted.begin(), sorted.end());
        const auto endTime = steady_clock::now();
        // in our output, note that these are the parallel results:
        print_results("Parallel STL", sorted, startTime, endTime);
    }
}

Now the results for GCC and MSVC are drastically different.

DEBUG:

GCC:

Testing with 1000000 doubles...
Serial STL: Lowest: -99.9999 Highest: 100 Time: 86.243000 ms
Serial STL: Lowest: -99.9999 Highest: 100 Time: 83.652300 ms
Serial STL: Lowest: -99.9999 Highest: 100 Time: 85.125400 ms
Serial STL: Lowest: -99.9999 Highest: 100 Time: 86.877800 ms
Serial STL: Lowest: -99.9999 Highest: 100 Time: 85.984300 ms
Parallel STL: Lowest: -99.9999 Highest: 100 Time: 35.638500 ms
Parallel STL: Lowest: -99.9999 Highest: 100 Time: 30.529100 ms
Parallel STL: Lowest: -99.9999 Highest: 100 Time: 35.590700 ms
Parallel STL: Lowest: -99.9999 Highest: 100 Time: 29.676400 ms
Parallel STL: Lowest: -99.9999 Highest: 100 Time: 33.012500 ms

MSVC:

Testing with 1000000 doubles...
Serial STL: Lowest: -99.9999 Highest: 99.9999 Time: 277.651300 ms
Serial STL: Lowest: -99.9999 Highest: 99.9999 Time: 281.134900 ms
Serial STL: Lowest: -99.9999 Highest: 99.9999 Time: 278.242000 ms
Serial STL: Lowest: -99.9999 Highest: 99.9999 Time: 280.372500 ms
Serial STL: Lowest: -99.9999 Highest: 99.9999 Time: 275.779000 ms
Parallel STL: Lowest: -99.9999 Highest: 99.9999 Time: 98.904400 ms
Parallel STL: Lowest: -99.9999 Highest: 99.9999 Time: 94.853300 ms
Parallel STL: Lowest: -99.9999 Highest: 99.9999 Time: 100.861400 ms
Parallel STL: Lowest: -99.9999 Highest: 99.9999 Time: 92.364000 ms
Parallel STL: Lowest: -99.9999 Highest: 99.9999 Time: 102.100400 ms

RELEASE:

GCC:

Testing with 1000000 doubles...
Serial STL: Lowest: -99.9998 Highest: 100 Time: 77.569600 ms
Serial STL: Lowest: -99.9998 Highest: 100 Time: 83.123500 ms
Serial STL: Lowest: -99.9998 Highest: 100 Time: 81.983300 ms
Serial STL: Lowest: -99.9998 Highest: 100 Time: 82.967000 ms
Serial STL: Lowest: -99.9998 Highest: 100 Time: 82.845600 ms
Parallel STL: Lowest: -99.9998 Highest: 100 Time: 34.475000 ms
Parallel STL: Lowest: -99.9998 Highest: 100 Time: 34.092200 ms
Parallel STL: Lowest: -99.9998 Highest: 100 Time: 30.292100 ms
Parallel STL: Lowest: -99.9998 Highest: 100 Time: 33.041200 ms
Parallel STL: Lowest: -99.9998 Highest: 100 Time: 30.095900 ms

MSVC:

Testing with 1000000 doubles...
Serial STL: Lowest: -99.9998 Highest: 100 Time: 95.452600 ms
Serial STL: Lowest: -99.9998 Highest: 100 Time: 98.047800 ms
Serial STL: Lowest: -99.9998 Highest: 100 Time: 97.359000 ms
Serial STL: Lowest: -99.9998 Highest: 100 Time: 96.975000 ms
Serial STL: Lowest: -99.9998 Highest: 100 Time: 98.612100 ms
Parallel STL: Lowest: -99.9998 Highest: 100 Time: 35.154200 ms
Parallel STL: Lowest: -99.9998 Highest: 100 Time: 35.384300 ms
Parallel STL: Lowest: -99.9998 Highest: 100 Time: 35.499900 ms
Parallel STL: Lowest: -99.9998 Highest: 100 Time: 34.143500 ms
Parallel STL: Lowest: -99.9998 Highest: 100 Time: 33.570500 ms

Something is wrong with GCC again :) Probably I need to link a different TBB library for the Debug version.

But overall the problem is now solved. Many thanks to Ted Lyngmo.