I am currently working on a project where I need to parallelize Gaussian elimination using MPI in C++. I have already implemented the algorithm, but the parallel gaussianMPI function has performance problems: surprisingly, the sequential gaussian runs faster than the parallel version.
One approach I am considering is to call gaussianMPI from the main thread only, instead of having it run as a separate MPI process, but I am not sure how to do this or whether it is even possible with MPI. I want the functions themselves to be called in the main thread, but according to the documentation that seems impossible: in my case gaussianMPI is executed by every process.
Can anyone suggest how to optimize the parallel Gaussian elimination algorithm, or propose alternative approaches to improve its performance? I would like to confine the MPI parallelism to the inner for loop, but I don't know how to do that.
Here is my code:
#include <iostream>
#include <mpi.h>
#include <ctime>
#include <cstdlib>
using namespace std;
const int N = 3; // Dimension of the matrix
const int rowsPerProc = 1; // For each process, N/size rows
void gaussianMPI(double A[N][N + 1], int rank, int local_rows) {
    int i, j, k;
    double koef;
    double aa[1][N + 1];
    // Forward elimination
    for (k = 0; k < N - 1; k++) {
        MPI_Bcast(A, (N - 1) * (N + 1), MPI_DOUBLE, 0, MPI_COMM_WORLD);
        MPI_Scatter(A, local_rows * (N + 1), MPI_DOUBLE, aa, local_rows * (N + 1), MPI_DOUBLE, 0, MPI_COMM_WORLD);
        for (i = 0; i < local_rows; i++) {
            if (rank * local_rows + i > k) {
                koef = aa[i][k] / A[k][k];
                for (j = k; j < N + 1; j++) {
                    aa[i][j] -= koef * A[k][j];
                }
            }
        }
        MPI_Barrier(MPI_COMM_WORLD);
        MPI_Gather(aa, local_rows * (N + 1), MPI_DOUBLE, A, local_rows * (N + 1), MPI_DOUBLE, 0, MPI_COMM_WORLD);
    }
    // Back substitution
    for (k = N - 1; k >= 0; k--) {
        if (rank == 0) {
            A[k][N] /= A[k][k];
            A[k][k] = 1.0;
        }
        MPI_Bcast(A, N * (N + 1), MPI_DOUBLE, 0, MPI_COMM_WORLD);
        MPI_Scatter(A, local_rows * (N + 1), MPI_DOUBLE, aa, local_rows * (N + 1), MPI_DOUBLE, 0, MPI_COMM_WORLD);
        for (i = 0; i < local_rows; i++) {
            if (rank * local_rows + i < k) {
                aa[i][N] -= A[rank * local_rows + i][k] * A[k][N];
                aa[i][k] = 0.0;
            }
        }
        MPI_Barrier(MPI_COMM_WORLD);
        MPI_Gather(aa, local_rows * (N + 1), MPI_DOUBLE, A, local_rows * (N + 1), MPI_DOUBLE, 0, MPI_COMM_WORLD);
    }
    // Output results on rank 0 process
    if (rank == 0) {
        for (int i = 0; i < N; ++i) {
            cout << "x[" << i << "] = " << A[i][N] << endl;
        }
    }
}
void gaussian(double A[N][N + 1]) {
    int i, j, k;
    double koef;
    // Forward elimination
    for (k = 0; k < N - 1; k++) {
        for (i = k + 1; i < N; i++) {
            koef = A[i][k] / A[k][k];
            for (j = k; j < N + 1; j++) {
                A[i][j] -= koef * A[k][j];
            }
        }
    }
    // Back substitution
    for (k = N - 1; k >= 0; k--) {
        A[k][N] /= A[k][k];
        A[k][k] = 1.0;
        for (i = 0; i < k; i++) {
            A[i][N] -= A[i][k] * A[k][N];
            A[i][k] = 0.0;
        }
    }
    for (int i = 0; i < N; ++i) {
        cout << "x[" << i << "] = " << A[i][N] << endl;
    }
}
int main() {
    double A[N][N + 1] = { {1, 2, -1, 2}, {2, -3, 2, 2}, {3, 1, 1, 8} };
    double B[N][N + 1] = { {1, 2, -1, 2}, {2, -3, 2, 2}, {3, 1, 1, 8} };
    int rank, size;
    double start_time, end_time;
    MPI_Init(0, 0);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    int local_rows = N / size;
    if (rank == 0) start_time = clock() / (double)CLOCKS_PER_SEC;
    gaussianMPI(B, rank, local_rows);
    MPI_Barrier(MPI_COMM_WORLD);
    if (rank == 0) {
        end_time = clock() / (double)CLOCKS_PER_SEC;
        cout << "Total time taken by the program is " << end_time - start_time << " seconds" << endl;
    }
    MPI_Finalize();
    if (rank == 0) {
        start_time = clock() / (double)CLOCKS_PER_SEC;
        gaussian(A);
        end_time = clock() / (double)CLOCKS_PER_SEC;
        cout << "Total time taken by the program is " << end_time - start_time << " seconds" << endl;
    }
    return 0;
}
There are several reasons why your code will not give good performance.
With N = 3 the work is basically nothing, and your communication costs the equivalent of millions of arithmetic operations.
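To see this concretely, here is a small, self-contained sketch (my own illustration, not part of your program) that times only one Bcast/Scatter/Gather round of the same shape your elimination loop issues. It uses MPI_Wtime for wall-clock timing instead of clock, which only measures CPU time of a single process; the matrix contents are dummy values because only the communication cost matters here.

#include <iostream>
#include <mpi.h>

int main(int argc, char** argv) {
    MPI_Init(&argc, &argv);
    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    const int N = 3;                 // same tiny matrix size as in the question
    double A[N][N + 1] = {};         // dummy data, only timing matters here
    double aa[1][N + 1] = {};
    const int rounds = 10000;

    double t0 = MPI_Wtime();         // wall-clock time, unlike clock()
    for (int r = 0; r < rounds; ++r) {
        // One communication round of the same shape as in the elimination loop.
        MPI_Bcast(A, N * (N + 1), MPI_DOUBLE, 0, MPI_COMM_WORLD);
        MPI_Scatter(A, N + 1, MPI_DOUBLE, aa, N + 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
        MPI_Gather(aa, N + 1, MPI_DOUBLE, A, N + 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
    }
    double t1 = MPI_Wtime();

    if (rank == 0)
        std::cout << "average time per communication round: "
                  << (t1 - t0) / rounds * 1e6 << " microseconds" << std::endl;

    MPI_Finalize();
    return 0;
}

Build it with mpicxx and run with at most N ranks (for example mpirun -np 3). Compare the reported per-round time with the handful of floating-point operations each round of your elimination actually performs for N = 3, and it becomes clear why the sequential version wins at this problem size.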