
I am new to MPI and trying to approximately solve a PDE on a 1000 by 1000 array. Other than the first and last row, at every iteration each element is updated to the average of its 8 neighbors.
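Concretely, with the column index wrapping around periodically (as in the code below), the interior update is:

    int j_b = (n + j - 1) % n;   // column to the left, with wraparound
    int j_a = (j + 1) % n;       // column to the right, with wraparound
    b[i][j] = (a[i-1][j_b] + a[i-1][j] + a[i-1][j_a]
             + a[i  ][j_b]             + a[i  ][j_a]
             + a[i+1][j_b] + a[i+1][j] + a[i+1][j_a]) / 8;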

My code runs, but the results differ in the third decimal place when I use different numbers of processors. I guess my communication is losing precision? I partition the large array by rows, since C/C++ stores arrays row by row.

Here is the code.

#include <iostream>
#include <mpi.h>
#include <cmath>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, const char * argv[]) {
    // Initialize the MPI environment
    MPI_Init(NULL,NULL);
    int p;
    MPI_Comm_size(MPI_COMM_WORLD, &p);
    int id;
    MPI_Comm_rank(MPI_COMM_WORLD, &id);

    const double pi = 3.1415926;
    int n;
    n=atoi(argv[argc-1]);

    //calculate the starting and ending row indices
    int m=floor((n-2)/p);
    int r=n-2-m*p;
    //dividing row 1 to row n-2 by rows since in c/c++ arrays are stored row-wise
    //therefore n-2=(m+1)*r+m*(p-r)
    //the first r processors get m+1 rows, the rest get m rows
    //starting row, ending row in the original A, width of matrix
    int start_row, end_row, width;
    if (id<=r-1){
        start_row=1+id*(m+1);
        end_row=start_row+m;
        width=m+1;
    }
    else {
        start_row=1+r*(m+1)+(id-r)*m;
        end_row=start_row+m-1;
        width=m;
    }
    //printf("mpi debug 1");
    printf("on processor %d, starting row is %d, ending row is %d \n",id,start_row,end_row);

    //id of the processor before and after
    //id_before is not significant for id==0
    //id_after is not significant for id==p-1
    int id_before, id_after;
    if (id==0) id_before=p-1; else id_before=id-1;
    if (id==p-1) id_after=0; else id_after=id+1;
    //printf("debug000");

    //initialize the local matrix
    //**** better way to initialize?
    double a[width][n], b[width][n];
    for (int i=0; i<width; i++)
        for (int j=0; j<n; j++)
            a[i][j]=0.5;

    //two 1d arrays to store the halo cells
    double halo_before[n], halo_after[n];
    if (id==0){
        for (int j=0; j<n; j++){
            halo_before[j]=0.0;
        }
    }
    if (id==p-1){
        for (int j=0; j<n; j++) {
            halo_after[j]=5*sin(M_PI*((double)j/n)*((double)j/n));
        }
    }
    MPI_Barrier(MPI_COMM_WORLD);
    //std::cout << " the sin function is" << 5*sin(M_PI*((double)1/1)*((double)1/2)) << "\n" <<std::endl;

    //set id=0 to be the root processor and time the iteration
    double start_time, end_time;
    if (id==0) start_time=MPI_Wtime();

    MPI_Status status_before, status_after;
    MPI_Request request_before, request_after;

    //to debug, print out arrays
    //std::cout<< " the array on processor " << id <<" to start is \n" << std::endl;
    //for (int i=0; i<width; i++){
    //    for (int j=0; j<n; j++){
    //        std::cout << a[i][j] << " ";
    //        if (j==n-1)
    //            std::cout << "\n" <<std::endl;
    //    }
    //}
    //to debug, print out halos
    //std::cout << "halo_before on processor " << id << " to start with is\n" << std::endl;
    //for (int i=0;i<n;i++){
    //    std::cout << halo_before[i] << " ";
    //    if (i==n-1)
    //        std::cout <<"\n" <<std::endl;
    //}
    //std::cout << "halo_after on processor " << id << " to start with is\n" << std::endl;
    //for (int i=0;i<n;i++){
    //    std::cout << halo_after[i] << " ";
    //    if (i==n-1)
    //        std::cout <<"\n" <<std::endl;
    //}

    //begin iteration
    for (int t=0; t<500; t++){
        //non-blocking send
        //send first row to id_before:
        //how should I use tag?
        if (id>0){
            MPI_Isend(&a[0][0], n, MPI_DOUBLE, id_before, t, MPI_COMM_WORLD, &request_before);
        }
        if (id<p-1){
            //send the last row to id_after
            MPI_Isend(&a[width-1][0], n, MPI_DOUBLE, id_after, t, MPI_COMM_WORLD, &request_after);
        }
        //printf("debug0");

        //update the entries that do not depend on halos
        //local row=1 to row=width-2
        //add if (width>3)??
        int j_b, j_a;
        for (int i=1; i<width-1; i++){
            for (int j=0; j<n; j++){
                j_b=(n+j-1)%n;
                j_a=(j+1)%n;
                b[i][j]=(a[i-1][j_b]+a[i-1][j]+a[i-1][j_a]+a[i][j_b]+a[i][j_a]+a[i+1][j_b]+a[i+1][j]+a[i+1][j_a])/8;
            }
        }
        //printf("debug1");

        //blocking receive
        //may consider non-blocking receive
        //receive from id_before and store in halo_before
        //not sure about status
        if (id>0){
            MPI_Recv(&halo_before[0], n, MPI_DOUBLE, id_before, t, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        }
        //receive from id_after and store in halo_after
        if (id<p-1){
            MPI_Recv(&halo_after[0], n, MPI_DOUBLE, id_after, t, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        }

        //to debug, print out halos
        //std::cout << "halo_before on processor " << id << " at iteration " << t << " is\n" <<std::endl;
        //for (int i=0;i<n;i++){
        //    std::cout << halo_before[i] << " ";
        //    if (i==n-1)
        //        std::cout <<"\n" <<std::endl;
        //}
        //std::cout << "halo_after on processor " << id << " at iteration " << t << " is\n" <<std::endl;
        //for (int i=0;i<n;i++){
        //    std::cout << halo_after[i] << " ";
        //    if (i==n-1)
        //        std::cout <<"\n" <<std::endl;
        //}

        //update entries that depend on halos
        //bugs here, what if width==1???
        if (width==1){
            for (int j=0; j<n; j++){
                j_a=(n+j-1)%n;
                j_b=(j+1)%n;
                b[0][j]=(halo_before[j_b]+halo_before[j]+halo_before[j_a]+a[0][j_b]+a[0][j_a]+halo_after[j_b]+halo_after[j]+halo_after[j_a])/8;
            }
        }
        else{
            for (int j=0; j<n; j++){
                j_a=(n+j-1)%n;
                j_b=(j+1)%n;
                b[0][j]=(halo_before[j_b]+halo_before[j]+halo_before[j_a]+a[0][j_b]+a[0][j_a]+a[1][j_b]+a[1][j]+a[1][j_a])/8;
                b[width-1][j]=(a[width-2][j_b]+a[width-2][j]+a[width-2][j_b]+a[width-1][j_b]+a[width-1][j_a]+halo_after[j_a]+halo_after[j]+halo_after[j_b])/8;
            }
        }

        //copy b back to a
        //but make sure the sends have completed first
        if (id>0) MPI_Wait(&request_before,MPI_STATUS_IGNORE);
        if (id<p-1) MPI_Wait(&request_after,MPI_STATUS_IGNORE);
        for (int i=0; i<width; i++)
            for (int j=0; j<n; j++)
                a[i][j]=b[i][j];

        //to debug, print out arrays
        //std::cout<< " the array on processor " << id <<" at iteration " << t <<" is \n"<< std::endl;
        //for (int i=0; i<width; i++){
        //    for (int j=0; j<n; j++){
        //        std::cout << a[i][j] << " ";
        //        if (j==n-1)
        //            std::cout << "\n" <<std::endl;
        //    }
        //}
    }

    //calculate the sum
    double sum=0.0;
    for (int i=0; i<width; i++)
        sum += a[i][i+start_row];
    double total_sum;
    //send to root processor
    MPI_Reduce(&sum, &total_sum,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD);
    MPI_Barrier(MPI_COMM_WORLD);

    if (id==0){
        end_time=MPI_Wtime();
        //double sum_receive[p];
        //double sum_calc;
        //for (int i=0; i<p; i++){
        //    MPI_Recv(&sum_receive[i], 1, MPI_DOUBLE, i, i, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        //    sum_calc += sum_receive[i];
        //}
        printf("time elapse is %f \n", end_time-start_time);
        printf("at root processor %d, the calculated sum is %f, \n", id, total_sum+5*sin(M_PI*((double)(n-1)/n)*((double)(n-1)/n)));
    }

    MPI_Finalize();
    return 0;
}

1 Answer


There's a simple typo in one of your lines of code. This:

 b[width-1][j]=(a[width-2][j_b]+a[width-2][j]+a[width-2][j_b]+a[width-1][j_b]+a[width-1][j_a]+halo_after[j_a]+halo_after[j]+halo_after[j_b])/8; 

should be this (note the third term: j_a, rather than a second j_b):

 b[width-1][j]=(a[width-2][j_b]+a[width-2][j]+a[width-2][j_a]+a[width-1][j_b]+a[width-1][j_a]+halo_after[j_a]+halo_after[j]+halo_after[j_b])/8; 

Because this was happening in the last row of each rank's subdomain, the exact error it introduced depended on where the domain boundaries fell - that is, on how many processors you used: with p ranks, the last local row of each of the p subdomains gets the wrong stencil, so changing p changes which rows are affected.

Now, the reason such an error was basically inevitable in the code you've posted is that the same calculation - b from a - appears no fewer than three times, each time with small modifications. That is a ticking time bomb: eventually one copy gets updated, and the others either become inconsistent with it or pick up an error when they are updated to match.
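One way to see the cure is to factor the eight-point average into a single helper and call it everywhere. A minimal sketch (the name stencil_avg and its row-pointer signature are mine, not from your posted code):

    /* Eight-point average with periodic wraparound in the column index.
     * "above" and "below" can be a neighboring local row or a halo buffer. */
    double stencil_avg(int n, int j, const double *above,
                       const double *row, const double *below) {
        int j_b = (n + j - 1) % n;   /* left neighbor column */
        int j_a = (j + 1) % n;       /* right neighbor column */
        return (above[j_b] + above[j] + above[j_a]
              + row[j_b]              + row[j_a]
              + below[j_b] + below[j] + below[j_a]) / 8.0;
    }

With that, the interior rows, the width == 1 case, and both boundary rows all go through the same arithmetic, and a one-character slip can no longer hide in just one of the copies.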

There are a number of ways to reduce the amount of replication here, both clarifying the code and avoiding these sorts of errors. Best is to incorporate the halos into the a and b arrays themselves, adding an extra row before and after the local data to hold them - that way you don't have to worry about whether width == 1 or treat the end rows as special cases. Also, define a function that updates a row (or an element) of b based on a, and call that function rather than repeating the code, as sketched below.
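Concretely, with one halo row above and below (and wraparound columns on each side), the local arrays become (width+2) x (n+2), and every row - interior or boundary - is updated by the same loop. A sketch of that indexing, consistent with the full code below:

    /* rows 1..width and columns 1..n hold this rank's data;
     * row 0 / row width+1 are halo rows, columns 0 / n+1 are periodic copies */
    double a[width+2][n+2], b[width+2][n+2];

    for (int i = 1; i <= width; i++)
        for (int j = 1; j <= n; j++)
            b[i][j] = (a[i-1][j-1] + a[i-1][j] + a[i-1][j+1]
                     + a[i  ][j-1]             + a[i  ][j+1]
                     + a[i+1][j-1] + a[i+1][j] + a[i+1][j+1]) / 8;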

Below is an example of cleaned-up code that encapsulates the pieces into routines, treats n and width consistently with the halo zones included, etc.

#include <mpi.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

int min2i(int a, int b) {
    int result = a;
    if (b < a) result = b;
    return result;
}

void decomposition(const int n, const int nprocs, const int id,
                   int *start_row, int *width, int *id_before, int *id_after) {
    const int nrows = n;
    const int m = nrows/nprocs;
    const int r = nrows % nprocs;

    *width = m;
    if (id < r) (*width)++;
    *start_row = 1 + id*m + min2i(id,r);

    *id_before = (id > 0        ? id-1 : MPI_PROC_NULL);
    *id_after  = (id < nprocs-1 ? id+1 : MPI_PROC_NULL);
}

void startBC(const int n, const int width, double a[][n+2], double b[][n+2],
             const int id_before, const int id_after, const int t, MPI_Request *req) {
    MPI_Isend(&a[1][1],     n, MPI_DOUBLE, id_before, 2*t  , MPI_COMM_WORLD, &req[0]);
    MPI_Isend(&a[width][1], n, MPI_DOUBLE, id_after , 2*t+1, MPI_COMM_WORLD, &req[1]);
}

void finishBC(const int n, const int width, double a[][n+2], double b[][n+2],
              const int id_before, const int id_after, const int t, MPI_Request *req) {
    MPI_Status stats[2];

    MPI_Recv(&a[0][1],       n, MPI_DOUBLE, id_before, 2*t+1, MPI_COMM_WORLD, &stats[0]);
    MPI_Recv(&a[width+1][1], n, MPI_DOUBLE, id_after,  2*t  , MPI_COMM_WORLD, &stats[1]);

    for (int i=0; i<width+2; i++) {
        a[i][0]   = a[i][n];
        a[i][n+1] = a[i][1];
    }

    MPI_Waitall(2, req, stats);
}

void updateRow(const int n, const int width, double a[][n+2], double b[][n+2], const int row) {
    for (int j=1; j<=n; j++)
        b[row][j]=( a[row-1][j-1] + a[row-1][j] + a[row-1][j+1]
                   +a[ row ][j-1]               + a[ row ][j+1]
                   +a[row+1][j-1] + a[row+1][j] + a[row+1][j+1])/8;
}

int main(int argc, const char * argv[]) {
    int p, id;
    MPI_Init(NULL,NULL);
    MPI_Comm_size(MPI_COMM_WORLD, &p);
    MPI_Comm_rank(MPI_COMM_WORLD, &id);

    const double pi = 3.1415926;
    int n=atoi(argv[argc-1]);

    int width, start_row, id_before, id_after;
    decomposition(n, p, id, &start_row, &width, &id_before, &id_after);

    double a[width+2][n+2], b[width+2][n+2];

    for (int i=0; i<width+2; i++)
        for (int j=0; j<n+2; j++)
            a[i][j]=0.5;

    if (id==p-1)
        for (int j=0; j<n+2; j++)
            a[width+1][j]=5*sin(pi*((double)(j-1)/n)*((double)(j-1)/n));

    if (id==0)
        for (int j=0; j<n+2; j++)
            a[0][j]=0.;

    double start_time, end_time;
    if (id==0) start_time=MPI_Wtime();

    MPI_Request reqs[2];

    //begin iteration
    for (int t=0; t<2; t++){
        startBC(n, width, a, b, id_before, id_after, t, reqs);

        /* interior rows */
        for (int row=2; row<width; row++)
            updateRow(n, width, a, b, row);

        finishBC(n, width, a, b, id_before, id_after, t, reqs);

        /* boundary rows */
        updateRow(n, width, a, b, 1);
        updateRow(n, width, a, b, width);

        for (int i=1; i<width+1; i++)
            for (int j=1; j<n+1; j++)
                a[i][j]=b[i][j];
    }

    //calculate the sum
    double sum=0.0;
    for (int i=1; i<width+1; i++)
        sum += a[i][i+start_row-1];

    double total_sum;
    MPI_Reduce(&sum, &total_sum,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD);

    if (id==0){
        end_time=MPI_Wtime();
        printf("time elapse is %f \n", end_time-start_time);
        printf("at root processor %d, the calculated sum is %f, \n", id,
               total_sum+5*sin(pi*((double)(n-1)/n)*((double)(n-1)/n)));
    }

    MPI_Finalize();
    return 0;
}
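To build and run it (assuming the source is saved as, say, stencil.c; the file name is arbitrary), something along these lines should work:

    mpicc -std=c99 -O2 stencil.c -o stencil -lm
    mpirun -np 4 ./stencil 1000

The variable-length arrays require C99 or later, and -lm links the math library for sin().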