I wrote a simple benchmark comparing Python NumPy with C OpenBLAS for multiplying two 500x500 matrices. It seems that np.dot runs almost 9 times faster than cblas_sgemm. Is there anything I'm doing wrong?
Results:
- Python: 0.001830 seconds
- C: 0.016374 seconds
import numpy as np
import time
# Benchmark a single 500x500 matrix product computed with np.dot.

# Fixed seed so every run multiplies the same pair of matrices.
np.random.seed(42)

# np.random.rand returns float64 arrays; cast to float32 so this measures a
# single-precision product, matching the cblas_sgemm call in the C program.
matrix_a = np.random.rand(500, 500).astype(np.float32)
matrix_b = np.random.rand(500, 500).astype(np.float32)

# perf_counter() is a monotonic, high-resolution wall clock intended for
# measuring short intervals; time.time() can jump if the system clock is
# adjusted and may have coarser resolution.
start_time = time.perf_counter()
result = np.dot(matrix_a, matrix_b)
end_time = time.perf_counter()

elapsed_time = end_time - start_time
print(f"{elapsed_time:.6f} seconds")
#include <stdio.h>
#include <stdlib.h>
#include <cblas.h>
#include <time.h>
#define N 500
int main(void) {
srand(42);
float *matrix_a = (float *)malloc(N * N * sizeof(float));
float *matrix_b = (float *)malloc(N * N * sizeof(float));
float *result = (float *)malloc(N * N * sizeof(float));
for (int i = 0; i < N * N; ++i) {
matrix_a[i] = (float)rand() / RAND_MAX;
matrix_b[i] = (float)rand() / RAND_MAX;
}
clock_t start_time = clock();
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
N, N, N,
1.0, matrix_a, N, matrix_b, N,
0.0, result, N);
clock_t end_time = clock();
float elapsed_time = (float)(end_time - start_time) / CLOCKS_PER_SEC;
printf("%f seconds\n", elapsed_time);
free(matrix_a);
free(matrix_b);
free(result);
return 0;
}
Compiling with
cc -lopenblas main.c
Compiling with gcc -O3 -march=native instead gives essentially the same result: 0.016420 seconds
I installed OpenBLAS with brew install openblas on macOS 10.15.7. Here is the output of np.show_config():
openblas64__info:
libraries = ['openblas64_', 'openblas64_']
library_dirs = ['/usr/local/lib']
language = c
define_macros = [('HAVE_CBLAS', None), ('BLAS_SYMBOL_SUFFIX', '64_'), ('HAVE_BLAS_ILP64', None)]
runtime_library_dirs = ['/usr/local/lib']
blas_ilp64_opt_info:
libraries = ['openblas64_', 'openblas64_']
library_dirs = ['/usr/local/lib']
language = c
define_macros = [('HAVE_CBLAS', None), ('BLAS_SYMBOL_SUFFIX', '64_'), ('HAVE_BLAS_ILP64', None)]
runtime_library_dirs = ['/usr/local/lib']
openblas64__lapack_info:
libraries = ['openblas64_', 'openblas64_']
library_dirs = ['/usr/local/lib']
language = c
define_macros = [('HAVE_CBLAS', None), ('BLAS_SYMBOL_SUFFIX', '64_'), ('HAVE_BLAS_ILP64', None), ('HAVE_LAPACKE', None)]
runtime_library_dirs = ['/usr/local/lib']
lapack_ilp64_opt_info:
libraries = ['openblas64_', 'openblas64_']
library_dirs = ['/usr/local/lib']
language = c
define_macros = [('HAVE_CBLAS', None), ('BLAS_SYMBOL_SUFFIX', '64_'), ('HAVE_BLAS_ILP64', None), ('HAVE_LAPACKE', None)]
runtime_library_dirs = ['/usr/local/lib']
Supported SIMD extensions in this NumPy install:
baseline = SSE,SSE2,SSE3
found = SSSE3,SSE41,POPCNT,SSE42,AVX,F16C,FMA3,AVX2
not found = AVX512F,AVX512CD,AVX512_KNL,AVX512_SKX,AVX512_CLX,AVX512_CNL,AVX512_ICL