Why is BLAS cblas_sgemm in C slower than np.dot?

79 Views Asked by At

I made a simple benchmark between Python NumPy and C OpenBLAS to multiply two 500x500 matrices. It seems that np.dot performs almost 9 times faster than cblas_sgemm. Is there anything I'm doing wrong?

Results:

  • Python: 0.001830 seconds
  • C: 0.016374 seconds
import numpy as np
import time

# Benchmark a single 500x500 matrix product with NumPy and print the elapsed
# wall-clock time in seconds.
np.random.seed(42)  # fixed seed so every run multiplies the same inputs

# NOTE: np.random.rand returns float64 arrays, so this is a double-precision
# product, whereas the C program in this file multiplies single-precision
# floats -- the two benchmarks are not doing identical work.
matrix_a = np.random.rand(500, 500)
matrix_b = np.random.rand(500, 500)

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# short intervals; time.time() can have coarse resolution and may jump if the
# system clock is adjusted.
start_time = time.perf_counter()
result = np.dot(matrix_a, matrix_b)
end_time = time.perf_counter()

elapsed_time = end_time - start_time
print(f"{elapsed_time:.6f} seconds")
#include <stdio.h>
#include <stdlib.h>
#include <cblas.h>
#include <time.h>

#define N 500

int main(void) {
    srand(42);

    float *matrix_a = (float *)malloc(N * N * sizeof(float));
    float *matrix_b = (float *)malloc(N * N * sizeof(float));
    float *result = (float *)malloc(N * N * sizeof(float));

    for (int i = 0; i < N * N; ++i) {
        matrix_a[i] = (float)rand() / RAND_MAX;
        matrix_b[i] = (float)rand() / RAND_MAX;
    }

    clock_t start_time = clock();

    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                N, N, N,
                1.0, matrix_a, N, matrix_b, N,
                0.0, result, N);

    clock_t end_time = clock();

    float elapsed_time = (float)(end_time - start_time) / CLOCKS_PER_SEC;
    printf("%f seconds\n", elapsed_time);

    free(matrix_a);
    free(matrix_b);
    free(result);

    return 0;
}

Compiling with

cc -lopenblas main.c

Compiling with gcc -O3 -march=native instead gives essentially the same result: 0.016420 seconds

I installed OpenBLAS with brew install openblas on macOS 10.15.7. Here is the output of np.show_config():

openblas64__info:
    libraries = ['openblas64_', 'openblas64_']
    library_dirs = ['/usr/local/lib']
    language = c
    define_macros = [('HAVE_CBLAS', None), ('BLAS_SYMBOL_SUFFIX', '64_'), ('HAVE_BLAS_ILP64', None)]
    runtime_library_dirs = ['/usr/local/lib']
blas_ilp64_opt_info:
    libraries = ['openblas64_', 'openblas64_']
    library_dirs = ['/usr/local/lib']
    language = c
    define_macros = [('HAVE_CBLAS', None), ('BLAS_SYMBOL_SUFFIX', '64_'), ('HAVE_BLAS_ILP64', None)]
    runtime_library_dirs = ['/usr/local/lib']
openblas64__lapack_info:
    libraries = ['openblas64_', 'openblas64_']
    library_dirs = ['/usr/local/lib']
    language = c
    define_macros = [('HAVE_CBLAS', None), ('BLAS_SYMBOL_SUFFIX', '64_'), ('HAVE_BLAS_ILP64', None), ('HAVE_LAPACKE', None)]
    runtime_library_dirs = ['/usr/local/lib']
lapack_ilp64_opt_info:
    libraries = ['openblas64_', 'openblas64_']
    library_dirs = ['/usr/local/lib']
    language = c
    define_macros = [('HAVE_CBLAS', None), ('BLAS_SYMBOL_SUFFIX', '64_'), ('HAVE_BLAS_ILP64', None), ('HAVE_LAPACKE', None)]
    runtime_library_dirs = ['/usr/local/lib']
Supported SIMD extensions in this NumPy install:
    baseline = SSE,SSE2,SSE3
    found = SSSE3,SSE41,POPCNT,SSE42,AVX,F16C,FMA3,AVX2
    not found = AVX512F,AVX512CD,AVX512_KNL,AVX512_SKX,AVX512_CLX,AVX512_CNL,AVX512_ICL
0

There are 0 best solutions below