Why can indexing a std::vector in a loop be faster than a raw pointer array on MSVC?


I was benchmarking a Longest Common Subsequence (LCS) implementation in C++ using a std::vector and a plain new'd array, and noticed a huge difference in performance between the two versions. The new'd array was about 40% slower than std::vector in the compute part, which I measure separately from the function as a whole. The two for loops are identical, yet there is a large performance gap between them.

Here is the sample code:

#include <cstdio>
#include <cstdlib>
#include <cstring> // memset
#include <algorithm>
#include <vector>
#include <windows.h>

#ifdef max
#undef max
#endif

#define ARR2D(i, j, cols) ((i) * (cols) + (j))
typedef unsigned long long u64;
u64 timing = 0ull;

int LCS_Arr(const std::vector<int> &a, const std::vector<int> &b)
{
    const int m = static_cast<int>(a.size()) + 1;
    const int n = static_cast<int>(b.size()) + 1;
    int *dp = new int[m * n];

    memset(dp, 0, m * n * sizeof(*dp));
    u64 tc;
    QueryPerformanceCounter((LARGE_INTEGER *)&tc);
    for (int i = 1; i < m; i++)
        for (int j = 1; j < n; j++)
        {
            if (a[i - 1] == b[j - 1])
                dp[ARR2D(i, j, n)] = dp[ARR2D(i - 1, j - 1, n)] + 1;
            else
                dp[ARR2D(i, j, n)] = std::max(dp[ARR2D(i - 1, j, n)], dp[ARR2D(i, j - 1, n)]);
        }
    const int result = dp[n * (m - 1) + n - 1];
    QueryPerformanceCounter((LARGE_INTEGER *)&timing);
    timing -= tc;

    delete[] dp;
    return result;
}

int LCS_VecLin(const std::vector<int> &a, const std::vector<int> &b)
{
    const int m = static_cast<int>(a.size()) + 1;
    const int n = static_cast<int>(b.size()) + 1;
    std::vector<int> dp = std::vector<int>(m * n, 0);

    u64 tc;
    QueryPerformanceCounter((LARGE_INTEGER *)&tc);
    for (int i = 1; i < m; i++)
        for (int j = 1; j < n; j++)
        {
            if (a[i - 1] == b[j - 1])
                dp[ARR2D(i, j, n)] = dp[ARR2D(i - 1, j - 1, n)] + 1;
            else
                dp[ARR2D(i, j, n)] = std::max(dp[ARR2D(i - 1, j, n)], dp[ARR2D(i, j - 1, n)]);
        }
    const int result = dp[n * (m - 1) + n - 1];
    QueryPerformanceCounter((LARGE_INTEGER *)&timing);
    timing -= tc;

    return result;
}

int main()
{
    srand(576);
    constexpr int Length = 30000;
    std::vector<int> a(Length), b(Length);

    std::generate(a.begin(), a.end(), [] { return rand() % 256; });
    std::generate(b.begin(), b.end(), [] { return rand() % 256; });

    u64 t0, t1, f;
    u64 d0, d1, f0, f1;
    int res0, res1;


    QueryPerformanceFrequency((LARGE_INTEGER *)&f);
    QueryPerformanceCounter((LARGE_INTEGER *)&t0);
    res1 = LCS_VecLin(a, b);
    QueryPerformanceCounter((LARGE_INTEGER *)&t1);
    d1 = t1 - t0;
    f1 = timing;

    QueryPerformanceCounter((LARGE_INTEGER *)&t0);
    res0 = LCS_Arr(a, b);
    QueryPerformanceCounter((LARGE_INTEGER *)&t1);
    d0 = t1 - t0;
    f0 = timing;

    double fm = 1.0 / f;
    printf("Array:  %d - Time taken Arr: %.3f s\n", res0, fm * d0);
    printf("VecLin: %d - Time taken Vec: %.3f s\n", res1, fm * d1);
    printf("ArrCompute: %.3f s\n", f0 * fm);
    printf("VecCompute: %.3f s\n", f1 * fm);
}

This is the output I usually get:

Array:  3493 - Time taken Arr: 1.373 s
VecLin: 3493 - Time taken Vec: 0.877 s
ArrCompute: 1.085 s
VecCompute: 0.612 s

Does anyone have a reasonable explanation for why this happens? GCC and Clang exhibit no such issue and produce roughly the same performance for both versions. Curiously, extracting the computation loop into a separate function fixes it: with the loop in its own function (see the sketch below), both containers match the std::vector's performance.
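
For reference, this is roughly what the extracted version looks like; the FillDP name is just a placeholder, and both callers pass the same kind of raw pointer (the new'd array directly, dp.data() for the vector):

static void FillDP(const std::vector<int> &a, const std::vector<int> &b,
                   int *dp, int m, int n)
{
    // Same DP loop as above, just moved out of the timing functions.
    for (int i = 1; i < m; i++)
        for (int j = 1; j < n; j++)
        {
            if (a[i - 1] == b[j - 1])
                dp[ARR2D(i, j, n)] = dp[ARR2D(i - 1, j - 1, n)] + 1;
            else
                dp[ARR2D(i, j, n)] = std::max(dp[ARR2D(i - 1, j, n)], dp[ARR2D(i, j - 1, n)]);
        }
}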

I tried both implementations and expected them to perform about the same with optimizations enabled, but the std::vector implementation turned out to be much faster.
