
I am new to MPI and trying to approximately solve a PDE on a 1000 by 1000 array. Other than the first and last row, at every iteration each element is updated to the average of its 8 neighbors.
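Concretely, with the column index wrapping around periodically (as in the code below), the interior update is:

    int j_b = (n + j - 1) % n;   // column to the left, with wraparound
    int j_a = (j + 1) % n;       // column to the right, with wraparound
    b[i][j] = (a[i-1][j_b] + a[i-1][j] + a[i-1][j_a]
             + a[i  ][j_b]             + a[i  ][j_a]
             + a[i+1][j_b] + a[i+1][j] + a[i+1][j_a]) / 8;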

My code runs, but the results differ in the third decimal place when I use different numbers of processors. I guess my communication is losing precision? I partition the large array by rows, since C/C++ stores arrays row by row.

Here is the code.

#include <iostream>
#include <mpi.h>
#include <cmath>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, const char * argv[]) {
    // Initialize the MPI environment
    MPI_Init(NULL,NULL);
    int p;
    MPI_Comm_size(MPI_COMM_WORLD, &p);
    int id;
    MPI_Comm_rank(MPI_COMM_WORLD, &id);

    const double pi = 3.1415926;
    int n;
    n=atoi(argv[argc-1]);

    //calculate the starting and ending row indices
    int m=floor((n-2)/p);
    int r=n-2-m*p;
    //dividing row 1 to row n-2 by rows since in c/c++ arrays are stored row-wise
    //therefore n-2=(m+1)*r+m*(p-r)
    //the first r processors get m+1 rows, the rest get m rows
    //starting row, ending row in the original A, width of matrix
    int start_row, end_row, width;
    if (id<=r-1){
        start_row=1+id*(m+1);
        end_row=start_row+m;
        width=m+1;
    }
    else {
        start_row=1+r*(m+1)+(id-r)*m;
        end_row=start_row+m-1;
        width=m;
    }
    //printf("mpi debug 1");
    printf("on processor %d, starting row is %d, ending row is %d \n",id,start_row,end_row);

    //id of the processor before and after
    //id_before is not significant for id==0
    //id_after is not significant for id==p-1
    int id_before, id_after;
    if (id==0) id_before=p-1; else id_before=id-1;
    if (id==p-1) id_after=0; else id_after=id+1;
    //printf("debug000");

    //initialize the local matrix
    //**** better way to initialize?
    double a[width][n], b[width][n];
    for (int i=0; i<width; i++)
        for (int j=0; j<n; j++)
            a[i][j]=0.5;

    //two 1d arrays to store the halo cells
    double halo_before[n], halo_after[n];
    if (id==0){
        for (int j=0; j<n; j++){
            halo_before[j]=0.0;
        }
    }
    if (id==p-1){
        for (int j=0; j<n; j++) {
            halo_after[j]=5*sin(M_PI*((double)j/n)*((double)j/n));
        }
    }
    MPI_Barrier(MPI_COMM_WORLD);
    //std::cout << " the sin function is" << 5*sin(M_PI*((double)1/1)*((double)1/2)) << "\n" <<std::endl;

    //set id=0 to be the root processor and time the iteration
    double start_time, end_time;
    if (id==0) start_time=MPI_Wtime();

    MPI_Status status_before, status_after;
    MPI_Request request_before, request_after;

    //to debug, print out arrays
    //std::cout<< " the array on processor " << id <<" to start is \n" << std::endl;
    //for (int i=0; i<width; i++){
    //    for (int j=0; j<n; j++){
    //        std::cout << a[i][j] << " ";
    //        if (j==n-1)
    //            std::cout << "\n" <<std::endl;
    //    }
    //}
    //to debug, print out halos
    //std::cout << "halo_before on processor " << id << " to start with is\n" << std::endl;
    //for (int i=0;i<n;i++){
    //    std::cout << halo_before[i] << " ";
    //    if (i==n-1)
    //        std::cout <<"\n" <<std::endl;
    //}
    //std::cout << "halo_after on processor " << id << " to start with is\n" << std::endl;
    //for (int i=0;i<n;i++){
    //    std::cout << halo_after[i] << " ";
    //    if (i==n-1)
    //        std::cout <<"\n" <<std::endl;
    //}

    //begin iteration
    for (int t=0; t<500; t++){
        //non-blocking send
        //send first row to id_before:
        //how should I use tag?
        if (id>0){
            MPI_Isend(&a[0][0], n, MPI_DOUBLE, id_before, t, MPI_COMM_WORLD, &request_before);
        }
        if (id<p-1){
            //send the last row to id_after
            MPI_Isend(&a[width-1][0], n, MPI_DOUBLE, id_after, t, MPI_COMM_WORLD, &request_after);
        }
        //printf("debug0");

        //update the entries that do not depend on halos
        //local row=1 to row=width-2
        //add if (width>3)??
        int j_b, j_a;
        for (int i=1; i<width-1; i++){
            for (int j=0; j<n; j++){
                j_b=(n+j-1)%n;
                j_a=(j+1)%n;
                b[i][j]=(a[i-1][j_b]+a[i-1][j]+a[i-1][j_a]+a[i][j_b]+a[i][j_a]+a[i+1][j_b]+a[i+1][j]+a[i+1][j_a])/8;
            }
        }
        //printf("debug1");

        //blocking receive
        //may consider non-blocking receive
        //receive from id_before and store in halo_before
        //not sure about status
        if (id>0){
            MPI_Recv(&halo_before[0], n, MPI_DOUBLE, id_before, t, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        }
        //receive from id_after and store in halo_after
        if (id<p-1){
            MPI_Recv(&halo_after[0], n, MPI_DOUBLE, id_after, t, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        }

        //to debug, print out halos
        //std::cout << "halo_before on processor " << id << " at iteration " << t << " is\n" <<std::endl;
        //for (int i=0;i<n;i++){
        //    std::cout << halo_before[i] << " ";
        //    if (i==n-1)
        //        std::cout <<"\n" <<std::endl;
        //}
        //std::cout << "halo_after on processor " << id << " at iteration " << t << " is\n" <<std::endl;
        //for (int i=0;i<n;i++){
        //    std::cout << halo_after[i] << " ";
        //    if (i==n-1)
        //        std::cout <<"\n" <<std::endl;
        //}

        //update entries that depend on halos
        //bugs here, what if width==1???
        if (width==1){
            for (int j=0; j<n; j++){
                j_a=(n+j-1)%n;
                j_b=(j+1)%n;
                b[0][j]=(halo_before[j_b]+halo_before[j]+halo_before[j_a]+a[0][j_b]+a[0][j_a]+halo_after[j_b]+halo_after[j]+halo_after[j_a])/8;
            }
        }
        else{
            for (int j=0; j<n; j++){
                j_a=(n+j-1)%n;
                j_b=(j+1)%n;
                b[0][j]=(halo_before[j_b]+halo_before[j]+halo_before[j_a]+a[0][j_b]+a[0][j_a]+a[1][j_b]+a[1][j]+a[1][j_a])/8;
                b[width-1][j]=(a[width-2][j_b]+a[width-2][j]+a[width-2][j_b]+a[width-1][j_b]+a[width-1][j_a]+halo_after[j_a]+halo_after[j]+halo_after[j_b])/8;
            }
        }

        //copy b back to a
        //but make sure the sends have completed first
        if (id>0) MPI_Wait(&request_before,MPI_STATUS_IGNORE);
        if (id<p-1) MPI_Wait(&request_after,MPI_STATUS_IGNORE);
        for (int i=0; i<width; i++)
            for (int j=0; j<n; j++)
                a[i][j]=b[i][j];

        //to debug, print out arrays
        //std::cout<< " the array on processor " << id <<" at iteration " << t <<" is \n"<< std::endl;
        //for (int i=0; i<width; i++){
        //    for (int j=0; j<n; j++){
        //        std::cout << a[i][j] << " ";
        //        if (j==n-1)
        //            std::cout << "\n" <<std::endl;
        //    }
        //}
    }

    //calculate the sum
    double sum=0.0;
    for (int i=0; i<width; i++)
        sum += a[i][i+start_row];
    double total_sum;
    //send to root processor
    MPI_Reduce(&sum, &total_sum,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD);
    MPI_Barrier(MPI_COMM_WORLD);

    if (id==0){
        end_time=MPI_Wtime();
        //double sum_receive[p];
        //double sum_calc;
        //for (int i=0; i<p; i++){
        //    MPI_Recv(&sum_receive[i], 1, MPI_DOUBLE, i, i, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        //    sum_calc += sum_receive[i];
        //}
        printf("time elapse is %f \n", end_time-start_time);
        printf("at root processor %d, the calculated sum is %f, \n", id, total_sum+5*sin(M_PI*((double)(n-1)/n)*((double)(n-1)/n)));
    }

    MPI_Finalize();
    return 0;
}

1 Answer


There's a simple typo in one of your lines of code. This:

 b[width-1][j]=(a[width-2][j_b]+a[width-2][j]+a[width-2][j_b]+a[width-1][j_b]+a[width-1][j_a]+halo_after[j_a]+halo_after[j]+halo_after[j_b])/8; 

should be this (note the third term: j_a, rather than a second j_b):

 b[width-1][j]=(a[width-2][j_b]+a[width-2][j]+a[width-2][j_a]+a[width-1][j_b]+a[width-1][j_a]+halo_after[j_a]+halo_after[j]+halo_after[j_b])/8; 

Because this was happening in the last row of each rank's subdomain, the exact error it introduced depended on where the domain boundaries fell - that is, on how many processors you used: with p ranks, the last local row of each of the p subdomains gets the wrong stencil, so changing p changes which rows are affected.

Now, the reason such an error was basically inevitable in the code you've posted is that the same calculation - b from a - appears no fewer than three times, each time with small modifications. That is a ticking time bomb: eventually one copy gets updated, and the others either become inconsistent with it or pick up an error when they are updated to match.
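One way to see the cure is to factor the eight-point average into a single helper and call it everywhere. A minimal sketch (the name stencil_avg and its row-pointer signature are mine, not from your posted code):

    /* Eight-point average with periodic wraparound in the column index.
     * "above" and "below" can be a neighboring local row or a halo buffer. */
    double stencil_avg(int n, int j, const double *above,
                       const double *row, const double *below) {
        int j_b = (n + j - 1) % n;   /* left neighbor column */
        int j_a = (j + 1) % n;       /* right neighbor column */
        return (above[j_b] + above[j] + above[j_a]
              + row[j_b]              + row[j_a]
              + below[j_b] + below[j] + below[j_a]) / 8.0;
    }

With that, the interior rows, the width == 1 case, and both boundary rows all go through the same arithmetic, and a one-character slip can no longer hide in just one of the copies.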

There are a number of ways to reduce the amount of replication here, both clarifying the code and avoiding these sorts of errors. Best is to incorporate the halos into the a and b arrays themselves, adding an extra row before and after the local data to hold them - that way you don't have to worry about whether width == 1 or treat the end rows as special cases. Also, define a function that updates a row (or an element) of b based on a, and call that function rather than repeating the code, as sketched below.
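Concretely, with one halo row above and below (and wraparound columns on each side), the local arrays become (width+2) x (n+2), and every row - interior or boundary - is updated by the same loop. A sketch of that indexing, consistent with the full code below:

    /* rows 1..width and columns 1..n hold this rank's data;
     * row 0 / row width+1 are halo rows, columns 0 / n+1 are periodic copies */
    double a[width+2][n+2], b[width+2][n+2];

    for (int i = 1; i <= width; i++)
        for (int j = 1; j <= n; j++)
            b[i][j] = (a[i-1][j-1] + a[i-1][j] + a[i-1][j+1]
                     + a[i  ][j-1]             + a[i  ][j+1]
                     + a[i+1][j-1] + a[i+1][j] + a[i+1][j+1]) / 8;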

Below is an example of cleaned-up code that encapsulates the pieces into routines, treats n and width consistently with the halo zones included, etc.

#include <mpi.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

int min2i(int a, int b) {
    int result = a;
    if (b < a) result = b;
    return result;
}

void decomposition(const int n, const int nprocs, const int id,
                   int *start_row, int *width, int *id_before, int *id_after) {
    const int nrows = n;
    const int m = nrows/nprocs;
    const int r = nrows % nprocs;

    *width = m;
    if (id < r) (*width)++;
    *start_row = 1 + id*m + min2i(id,r);

    *id_before = (id > 0        ? id-1 : MPI_PROC_NULL);
    *id_after  = (id < nprocs-1 ? id+1 : MPI_PROC_NULL);
}

void startBC(const int n, const int width, double a[][n+2], double b[][n+2],
             const int id_before, const int id_after, const int t, MPI_Request *req) {
    MPI_Isend(&a[1][1],     n, MPI_DOUBLE, id_before, 2*t  , MPI_COMM_WORLD, &req[0]);
    MPI_Isend(&a[width][1], n, MPI_DOUBLE, id_after , 2*t+1, MPI_COMM_WORLD, &req[1]);
}

void finishBC(const int n, const int width, double a[][n+2], double b[][n+2],
              const int id_before, const int id_after, const int t, MPI_Request *req) {
    MPI_Status stats[2];

    MPI_Recv(&a[0][1],       n, MPI_DOUBLE, id_before, 2*t+1, MPI_COMM_WORLD, &stats[0]);
    MPI_Recv(&a[width+1][1], n, MPI_DOUBLE, id_after,  2*t  , MPI_COMM_WORLD, &stats[1]);

    for (int i=0; i<width+2; i++) {
        a[i][0]   = a[i][n];
        a[i][n+1] = a[i][1];
    }

    MPI_Waitall(2, req, stats);
}

void updateRow(const int n, const int width, double a[][n+2], double b[][n+2], const int row) {
    for (int j=1; j<=n; j++)
        b[row][j]=( a[row-1][j-1] + a[row-1][j] + a[row-1][j+1]
                   +a[ row ][j-1]               + a[ row ][j+1]
                   +a[row+1][j-1] + a[row+1][j] + a[row+1][j+1])/8;
}

int main(int argc, const char * argv[]) {
    int p, id;
    MPI_Init(NULL,NULL);
    MPI_Comm_size(MPI_COMM_WORLD, &p);
    MPI_Comm_rank(MPI_COMM_WORLD, &id);

    const double pi = 3.1415926;
    int n=atoi(argv[argc-1]);

    int width, start_row, id_before, id_after;
    decomposition(n, p, id, &start_row, &width, &id_before, &id_after);

    double a[width+2][n+2], b[width+2][n+2];

    for (int i=0; i<width+2; i++)
        for (int j=0; j<n+2; j++)
            a[i][j]=0.5;

    if (id==p-1)
        for (int j=0; j<n+2; j++)
            a[width+1][j]=5*sin(pi*((double)(j-1)/n)*((double)(j-1)/n));

    if (id==0)
        for (int j=0; j<n+2; j++)
            a[0][j]=0.;

    double start_time, end_time;
    if (id==0) start_time=MPI_Wtime();

    MPI_Request reqs[2];

    //begin iteration
    for (int t=0; t<2; t++){
        startBC(n, width, a, b, id_before, id_after, t, reqs);

        /* interior rows */
        for (int row=2; row<width; row++)
            updateRow(n, width, a, b, row);

        finishBC(n, width, a, b, id_before, id_after, t, reqs);

        /* boundary rows */
        updateRow(n, width, a, b, 1);
        updateRow(n, width, a, b, width);

        for (int i=1; i<width+1; i++)
            for (int j=1; j<n+1; j++)
                a[i][j]=b[i][j];
    }

    //calculate the sum
    double sum=0.0;
    for (int i=1; i<width+1; i++)
        sum += a[i][i+start_row-1];

    double total_sum;
    MPI_Reduce(&sum, &total_sum,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD);

    if (id==0){
        end_time=MPI_Wtime();
        printf("time elapse is %f \n", end_time-start_time);
        printf("at root processor %d, the calculated sum is %f, \n", id,
               total_sum+5*sin(pi*((double)(n-1)/n)*((double)(n-1)/n)));
    }

    MPI_Finalize();
    return 0;
}
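To build and run it (assuming the source is saved as, say, stencil.c; the file name is arbitrary), something along these lines should work:

    mpicc -std=c99 -O2 stencil.c -o stencil -lm
    mpirun -np 4 ./stencil 1000

The variable-length arrays require C99 or later, and -lm links the math library for sin().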