I am new to MPI and am trying to approximately solve a PDE on a 1000 by 1000 array. Except for the first and last rows, every element is updated at each iteration to the average of its 8 neighbors (the column index wraps around, as in the code below).
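To make the update concrete, here is a minimal serial sketch of what I mean by one sweep (my assumptions: columns wrap around periodically, and the first and last rows are fixed boundary rows, as in my code below):

    #include <vector>

    // One sweep: every element in rows 1..n-2 becomes the average of its 8
    // neighbors from the previous iterate; columns are periodic, and rows 0
    // and n-1 are boundary rows that are carried through unchanged.
    void sweep(const std::vector<std::vector<double>> &a,
               std::vector<std::vector<double>> &b, int n)
    {
        for (int i = 1; i < n - 1; i++) {
            for (int j = 0; j < n; j++) {
                int jl = (j - 1 + n) % n;   // left neighbor, wrapping around
                int jr = (j + 1) % n;       // right neighbor, wrapping around
                b[i][j] = (a[i-1][jl] + a[i-1][j] + a[i-1][jr]
                         + a[i][jl]               + a[i][jr]
                         + a[i+1][jl] + a[i+1][j] + a[i+1][jr]) / 8.0;
            }
        }
        b[0]   = a[0];      // boundary rows stay fixed
        b[n-1] = a[n-1];
    }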
My code runs, but the results differ in the third decimal place depending on how many processors I use. I am guessing my communication is losing precision? I partition the large array by rows, since C/C++ stores arrays row by row.
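To spell out the split (this is the same arithmetic as in the code below): the n-2 interior rows are divided so that the first r ranks get m+1 rows each and the rest get m rows. As a small self-contained sketch (the helper name row_block is just for illustration):

    #include <utility>

    // Which global rows rank `id` (out of `p`) owns, when rows 1..n-2 are split
    // by rows and the first r ranks take one extra row each.
    // Returns {start_row, width}; the last owned row is start_row + width - 1.
    std::pair<int, int> row_block(int n, int p, int id)
    {
        int m = (n - 2) / p;          // base number of rows per rank
        int r = (n - 2) - m * p;      // leftover rows, given to the first r ranks
        int width = (id < r) ? m + 1 : m;
        int start_row = (id < r) ? 1 + id * (m + 1)
                                 : 1 + r * (m + 1) + (id - r) * m;
        return {start_row, width};
    }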
Here is the code.
#include <iostream>
#include <mpi.h>
#include <cmath>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, const char * argv[]) {

    // Initialize the MPI environment
    MPI_Init(NULL, NULL);
    int p;
    MPI_Comm_size(MPI_COMM_WORLD, &p);
    int id;
    MPI_Comm_rank(MPI_COMM_WORLD, &id);

    const double pi = 3.1415926;
    int n;
    n = atoi(argv[argc-1]);

    //calculate the starting and ending row indices
    int m = floor((n-2)/p);
    int r = n-2-m*p;
    //dividing row 1 to row n-2 by rows since in C/C++ arrays are stored row-wise
    //therefore n-2=(m+1)*r+m*(p-r)
    //the first r processors get m+1 rows, the rest get m rows

    //starting row, ending row in the original A, width of matrix
    int start_row, end_row, width;
    if (id <= r-1) {
        start_row = 1+id*(m+1);
        end_row = start_row+m;
        width = m+1;
    }
    else {
        start_row = 1+r*(m+1)+(id-r)*m;
        end_row = start_row+m-1;
        width = m;
    }
    //printf("mpi debug 1");
    printf("on processor %d, starting row is %d, ending row is %d \n", id, start_row, end_row);

    //id of the processor before and after
    //id_before is not significant for id==0
    //id_after is not significant for id==p-1
    int id_before, id_after;
    if (id==0) id_before = p-1;
    else id_before = id-1;
    if (id==p-1) id_after = 0;
    else id_after = id+1;
    //printf("debug000");

    //initialize the local matrix
    //**** better way to initialize?
    double a[width][n], b[width][n];
    for (int i=0; i<width; i++)
        for (int j=0; j<n; j++)
            a[i][j] = 0.5;

    //two 1d arrays to store the halo cells
    double halo_before[n], halo_after[n];
    if (id==0) {
        for (int j=0; j<n; j++) {
            halo_before[j] = 0.0;
        }
    }
    if (id==p-1) {
        for (int j=0; j<n; j++) {
            halo_after[j] = 5*sin(M_PI*((double)j/n)*((double)j/n));
        }
    }
    MPI_Barrier(MPI_COMM_WORLD);
    //std::cout << " the sin function is" << 5*sin(M_PI*((double)1/1)*((double)1/2)) << "\n" << std::endl;

    //set id=0 to be the root processor and call MPI_Wtime
    double start_time, end_time;
    if (id==0) start_time = MPI_Wtime();

    MPI_Status status_before, status_after;
    MPI_Request request_before, request_after;

    /////////////////////////////////////////////////////////
    //to debug, print out arrays
    //std::cout << " the array on processor " << id << " to start is \n" << std::endl;
    //for (int i=0; i<width; i++){
    //    for (int j=0; j<n; j++){
    //        std::cout << a[i][j] << " ";
    //        if (j==n-1)
    //            std::cout << "\n" << std::endl;
    //    }
    //}
    //to debug, print out halos
    //std::cout << "halo_before on processor " << id << " to start with is\n" << std::endl;
    //for (int i=0; i<n; i++){
    //    std::cout << halo_before[i] << " ";
    //    if (i==n-1)
    //        std::cout << "\n" << std::endl;
    //}
    //std::cout << "halo_after on processor " << id << " to start with is\n" << std::endl;
    //for (int i=0; i<n; i++){
    //    std::cout << halo_after[i] << " ";
    //    if (i==n-1)
    //        std::cout << "\n" << std::endl;
    //}
    //////////////////////////////////////////////////////

    //begin iteration
    for (int t=0; t<500; t++) {

        //non-blocking send
        //send the first row to id_before:
        //how should I use the tag?
        if (id>0) {
            MPI_Isend(&a[0][0], n, MPI_DOUBLE, id_before, t, MPI_COMM_WORLD, &request_before);
        }
        if (id<p-1) {
            //send the last row to id_after
            MPI_Isend(&a[width-1][0], n, MPI_DOUBLE, id_after, t, MPI_COMM_WORLD, &request_after);
        }
        //printf("debug0");

        //update the entries that do not depend on halos
        //local row=1 to row=width-2
        //add if (width>3)??
        int j_b, j_a;
        for (int i=1; i<width-1; i++) {
            for (int j=0; j<n; j++) {
                j_b = (n+j-1)%n;
                j_a = (j+1)%n;
                b[i][j] = (a[i-1][j_b]+a[i-1][j]+a[i-1][j_a]
                          +a[i][j_b]            +a[i][j_a]
                          +a[i+1][j_b]+a[i+1][j]+a[i+1][j_a])/8;
            }
        }
        //printf("debug1");

        //blocking receive
        //may consider non-blocking receive
        //receive from id_before and store in halo_before
        //not sure about status
        if (id>0) {
            MPI_Recv(&halo_before[0], n, MPI_DOUBLE, id_before, t, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        }
        //receive from id_after and store in halo_after
        if (id<p-1) {
            MPI_Recv(&halo_after[0], n, MPI_DOUBLE, id_after, t, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        }

        //to debug, print out halos
        //std::cout << "halo_before on processor " << id << " at iteration " << t << " is\n" << std::endl;
        //for (int i=0; i<n; i++){
        //    std::cout << halo_before[i] << " ";
        //    if (i==n-1)
        //        std::cout << "\n" << std::endl;
        //}
        //std::cout << "halo_after on processor " << id << " at iteration " << t << " is\n" << std::endl;
        //for (int i=0; i<n; i++){
        //    std::cout << halo_after[i] << " ";
        //    if (i==n-1)
        //        std::cout << "\n" << std::endl;
        //}

        //update entries that depend on halos
        //bugs here, what if width==1???
        if (width==1) {
            for (int j=0; j<n; j++) {
                j_a = (n+j-1)%n;
                j_b = (j+1)%n;
                b[0][j] = (halo_before[j_b]+halo_before[j]+halo_before[j_a]
                          +a[0][j_b]                      +a[0][j_a]
                          +halo_after[j_b]+halo_after[j]+halo_after[j_a])/8;
            }
        }
        else {
            for (int j=0; j<n; j++) {
                j_a = (n+j-1)%n;
                j_b = (j+1)%n;
                b[0][j] = (halo_before[j_b]+halo_before[j]+halo_before[j_a]
                          +a[0][j_b]                      +a[0][j_a]
                          +a[1][j_b]+a[1][j]+a[1][j_a])/8;
                b[width-1][j] = (a[width-2][j_b]+a[width-2][j]+a[width-2][j_b]
                                +a[width-1][j_b]              +a[width-1][j_a]
                                +halo_after[j_a]+halo_after[j]+halo_after[j_b])/8;
            }
        }

        //copy b back into a
        //but make sure the sends have completed first
        if (id>0) MPI_Wait(&request_before, MPI_STATUS_IGNORE);
        if (id<p-1) MPI_Wait(&request_after, MPI_STATUS_IGNORE);
        for (int i=0; i<width; i++)
            for (int j=0; j<n; j++)
                a[i][j] = b[i][j];

        //to debug, print out arrays
        //std::cout << " the array on processor " << id << " at iteration " << t << " is \n" << std::endl;
        //for (int i=0; i<width; i++){
        //    for (int j=0; j<n; j++){
        //        std::cout << a[i][j] << " ";
        //        if (j==n-1)
        //            std::cout << "\n" << std::endl;
        //    }
        //}
    }

    //calculate the sum
    double sum = 0.0;
    for (int i=0; i<width; i++)
        sum += a[i][i+start_row];
    double total_sum;
    //send to the root processor
    MPI_Reduce(&sum, &total_sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
    MPI_Barrier(MPI_COMM_WORLD);

    if (id==0) {
        end_time = MPI_Wtime();
        //double sum_receive[p];
        //double sum_calc;
        //for (int i=0; i<p; i++){
        //    MPI_Recv(&sum_receive[i], 1, MPI_DOUBLE, i, i, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        //    sum_calc += sum_receive[i];
        //}
        printf("time elapsed is %f \n", end_time-start_time);
        printf("at root processor %d, the calculated sum is %f, \n", id, total_sum+5*sin(M_PI*((double)(n-1)/n)*((double)(n-1)/n)));
    }

    MPI_Finalize();
    return 0;
}