[mpich-commits] [software/dev/stencil] Stencil code repository branch, master, created. 5c2816986cd0be1c0bb66652e08e213da1307700

Wed Sep 10 13:25:16 CDT 2014

This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "Stencil code repository".

The branch, master has been created
        at  5c2816986cd0be1c0bb66652e08e213da1307700 (commit)

- Log -----------------------------------------------------------------
http://git.mpich.org/software/dev/stencil.git/commitdiff/5c2816986cd0be1c0bb66652e08e213da1307700

commit 5c2816986cd0be1c0bb66652e08e213da1307700
Author: Pavan Balaji <balaji at anl.gov>
Date:   Sat Sep 6 21:11:06 2014 -0500

    Added header file.

diff --git a/stencil_par.h b/stencil_par.h
new file mode 100644
index 0000000..2dba3d2
--- /dev/null
+++ b/stencil_par.h
@@ -0,0 +1,24 @@
+/*
+ * stencil_par.h
+ *
+ *  Created on: Jan 4, 2012
+ *      Author: htor
+ */
+
+#ifndef STENCIL_PAR_H_
+#define STENCIL_PAR_H_
+
+#include "mpi.h"
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+
+// row-major order
+#define ind(i,j) (j)*(bx+2)+(i)
+
+void printarr_par(int iter, double* array, int size, int px, int py, int rx, int ry, int bx, int by, int offx, int offy, MPI_Comm comm);
+
+
+#endif /* STENCIL_PAR_H_ */

http://git.mpich.org/software/dev/stencil.git/commitdiff/ef82f607fb83051c58d894412293adcae65cb8f3

commit ef82f607fb83051c58d894412293adcae65cb8f3
Author: Pavan Balaji <balaji at anl.gov>
Date:   Sat Sep 6 19:14:05 2014 -0500

    White space cleanup.

diff --git a/stencil.cpp b/stencil.cpp
index b4d7f2e..b417e06 100644
--- a/stencil.cpp
+++ b/stencil.cpp
@@ -6,527 +6,515 @@
  */
 
 #include "stencil_par.h"
-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
 
-  int provide;
-  MPI_Init(&argc,&argv);  
-  int r,p;
-  MPI_Comm comm = MPI_COMM_WORLD;
-  MPI_Comm_rank(comm, &r);
-  MPI_Comm_size(comm, &p);
-  int n, energy, niters,nx,ny, lpx,lpy,px, py;
+    int provide;
+    MPI_Init(&argc, &argv);
+    int r, p;
+    MPI_Comm comm = MPI_COMM_WORLD;
+    MPI_Comm_rank(comm, &r);
+    MPI_Comm_size(comm, &p);
+    int n, energy, niters, nx, ny, lpx, lpy, px, py;
 
-  MPI_Comm shmcomm;
-  MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shmcomm);
-  int sr, sp; // rank and size in shmem comm
+    MPI_Comm shmcomm;
+    MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shmcomm);
+    int sr, sp;                 // rank and size in shmem comm
 
-  MPI_Comm_size(shmcomm, &sp);
-  MPI_Comm_rank(shmcomm, &sr);
-    printf("shmcomm size = %d\n",sp); 
-  int CORENUM = 12;
-  if (r==0) {
-	  // argument checking
-	  if(argc < 8) {
-		  if(!r) printf("usage: stencil_mpi <n> <energy> <niters> <nx> <ny> <lpx> <lpy>\n");
-		  MPI_Finalize();
-		  exit(1);       
-	  }
+    MPI_Comm_size(shmcomm, &sp);
+    MPI_Comm_rank(shmcomm, &sr);
+    printf("shmcomm size = %d\n", sp);
+    int CORENUM = 12;
+    if (r == 0) {
+        // argument checking
+        if (argc < 8) {
+            if (!r)
+                printf("usage: stencil_mpi <n> <energy> <niters> <nx> <ny> <lpx> <lpy>\n");
+            MPI_Finalize();
+            exit(1);
+        }
+
+        n = atoi(argv[1]);      // nxn grid
+        energy = atoi(argv[2]); // energy to be injected per iteration
+        niters = atoi(argv[3]); // number of iterations
 
-	  n = atoi(argv[1]); // nxn grid
-	  energy = atoi(argv[2]); // energy to be injected per iteration
-	  niters = atoi(argv[3]); // number of iterations
+        nx = atoi(argv[4]);     // 1st dim processes
+        ny = atoi(argv[5]);     // 2nd dim processes
+        lpx = atoi(argv[6]);    // 1st dim processes
+        lpy = atoi(argv[7]);    // 2nd dim processes
+        px = nx * lpx;
+        py = ny * lpy;
+        if (px * py != p)
+            MPI_Abort(comm, 1); // abort if px or py are wrong
+        if (n % py != 0)
+            MPI_Abort(comm, 2); // abort px needs to divide n
+        if (n % px != 0)
+            MPI_Abort(comm, 3); // abort py needs to divide n
+        if (nx * ny != p / CORENUM)
+            MPI_Abort(comm, 66);
+        if (lpx * lpy != CORENUM)
+            MPI_Abort(comm, 77);
+        // distribute arguments
+        int args[7] = { n, energy, niters, nx, ny, lpx, lpy };
+        MPI_Bcast(args, 7, MPI_INT, 0, comm);
+    }
+    else {
+        int args[7];
+        MPI_Bcast(args, 7, MPI_INT, 0, comm);
+        n = args[0];
+        energy = args[1];
+        niters = args[2];
 
-	  nx=atoi(argv[4]); // 1st dim processes
-	  ny=atoi(argv[5]); // 2nd dim processes
-	  lpx=atoi(argv[6]); // 1st dim processes
-      lpy=atoi(argv[7]); // 2nd dim processes
-      px = nx*lpx;
-	  py = ny*lpy;
-      if(px * py != p) MPI_Abort(comm, 1);// abort if px or py are wrong
-      if(n % py != 0) MPI_Abort(comm, 2); // abort px needs to divide n
-      if(n % px != 0) MPI_Abort(comm, 3); // abort py needs to divide n
-      if(nx*ny != p/CORENUM) MPI_Abort(comm,66);
-	  if(lpx*lpy != CORENUM) MPI_Abort(comm,77);
-      // distribute arguments
-      int args[7] = {n, energy, niters, nx,ny,lpx,lpy};
-      MPI_Bcast(args, 7, MPI_INT, 0, comm);
-  }
-  else {
-      int args[7];
-      MPI_Bcast(args, 7, MPI_INT, 0, comm);
-      n=args[0]; energy=args[1]; niters=args[2];
-	
-	  nx=args[3]; ny=args[4];
-	  lpx = args[5];
-	  lpy = args[6];
-	
-      px = nx*lpx;
-	  py = ny*lpy;
-  }
+        nx = args[3];
+        ny = args[4];
+        lpx = args[5];
+        lpy = args[6];
+
+        px = nx * lpx;
+        py = ny * lpy;
+    }
 
-  int nodenum = nx*ny;
-  int nodeidx, nodeidy;
-  int nodeid = r/CORENUM;
-  int CH = 2;
-	if(CH == 1)
-	{
-		nodeidx = nodeid % nx;
-		nodeidy = nodeid / nx;
-	}
-	else if(CH == 2)
-	{
-		nodeidx = nodeid / ny;
-		nodeidy = nodeid % ny;
-	}
-	else if(CH == 3)
-	{
-		nodeidx = nodeid / ny;
-                nodeidy = nodeid % ny;
-		if(nodeidx % 2 != 0)
-		{
-			nodeidy = ny-1 - nodeidy;
-		}
-	}
+    int nodenum = nx * ny;
+    int nodeidx, nodeidy;
+    int nodeid = r / CORENUM;
+    int CH = 2;
+    if (CH == 1) {
+        nodeidx = nodeid % nx;
+        nodeidy = nodeid / nx;
+    }
+    else if (CH == 2) {
+        nodeidx = nodeid / ny;
+        nodeidy = nodeid % ny;
+    }
+    else if (CH == 3) {
+        nodeidx = nodeid / ny;
+        nodeidy = nodeid % ny;
+        if (nodeidx % 2 != 0) {
+            nodeidy = ny - 1 - nodeidy;
+        }
+    }
 //nodeidx idy is the node coordinates(x,y)
 //
-    int localprocessindex = r%CORENUM;
-	int local_x, local_y;
-	local_x = localprocessindex%lpx;
-	local_y = localprocessindex/lpx;
-	// determine my coordinates (x,y) -- r=x*a+y in the 2d processor array
+    int localprocessindex = r % CORENUM;
+    int local_x, local_y;
+    local_x = localprocessindex % lpx;
+    local_y = localprocessindex / lpx;
+    // determine my coordinates (x,y) -- r=x*a+y in the 2d processor array
     // int rx = r % px;
     // int ry = r / px;
     int rx;
     int ry;
-	rx = lpx*nodeidx+local_x;
-	ry = lpy*nodeidy+local_y;
-	int noden, nodes, nodew, nodee;
-	if(nodeidy < ny-1)
-	{
-		if(CH ==1 )
-			nodes = nodeid+ nx;
-		if(CH == 2)
-			nodes = nodeid+1;
-		if(CH == 3)
-		{
-			if(nodeidx %2 ==0)
-				nodes=nodeid+1;
-			else 
-				nodes = nodeid-1;
-		}
-	}
-	else
-	{
-		nodes = -1;
-	}
-	if(nodeidy == 0)
-	{
-		noden = -1;
-	}
-	else
-	{
-		if(CH==1)
-			noden = nodeid - nx;
-		if(CH == 2)
-			noden = nodeid-1;
-		if(CH == 3)
-		{
-			if(nodeidx %2 == 0)
-				noden = nodeid-1;
-			else
-				noden = nodeid+1;
-		}
-	
-	}
-        if(nodeidx == 0)
-	{
-		nodew = -1;
-	}
-	else
-	{
-		if(CH ==1)
-			nodew = nodeid -1;
-		if(CH == 2)
-			nodew = nodeid -ny;
-		if(CH == 3)
-		{
-			
-			if(nodeidx%2 == 0)
-				nodew = nodeid - nodeidy -1-nodeidy;
-			else
-			{
-				nodew = nodeid +nodeidy+1-2*ny+nodeidy;
-			}
-		}
-	}
-	if(nodeidx == nx -1)
-	{
-		nodee = -1;
-	}
-	else
-	{
-		if(CH==1)
-		{
-			nodee = nodeid+1;
-		}
-		if(CH==2)
-		{
-			nodee = nodeid + ny;
-		}
-		if(CH==3)
-		{
-			if(nodeidx%2 == 0)
-			{
-				nodee = nodeid - nodeidy + ny + ny-1-nodeidy;
-			}
-			else
-			{
-				nodee = nodeid +nodeidy +1 + nodeidy;
-			}
-		}
-	}
-	int north1, south1, west1, east1;
-	if(local_y != 0)
-	{
-		//south
-		north1 = r - lpx;
-	}	
-	else
-	{
-		if(noden == -1)
-			north1 = -1;
-		else
-		{
-			north1 = r + (noden - nodeid)*CORENUM  - r%CORENUM +(lpy-1) * lpx + local_x;
-			//north1 = r + (noden - nodeid)*CORENUM  - r%CORENUM + local_x;
-		}	
-	}
-	if(local_y < lpy-1)
-	{
-		//north
-		south1 = r + lpx;
-	}
-	else
-	{
-		if(nodes == -1)
-			south1 = -1;
-		else
-		{
-			south1 = r + (nodes - nodeid)*CORENUM  - r%CORENUM + local_x;
-			//south1 = r + (noden - nodeid)*CORENUM  - r%CORENUM +(lpy-1) * lpx + local_x;
-		}
-	}
-	if(local_x != 0)
-	{
-		//west
-		west1 = r-1;
-	}
-	else
-	{
-		if(nodew == -1)
-			west1 = -1;
-		else
-		{
-			west1 = r + (nodew - nodeid)*CORENUM - r%CORENUM + local_y*lpx + lpx-1;
-		}
-	}
-	if(local_x == lpx-1)
-	{
-		//east
-		if(nodee == -1)
-		{
-			east1 = -1;
-		}
-		else
-		{
-			east1 = r + (nodee - nodeid) * CORENUM - r%CORENUM + local_y*lpx;
-		}
-	}
-	else
-	{
-		
-		east1 = r+1;
-	}
-	
-  // determine my four neighbors
-  for(int i=0; i<p; i++)
-  {
-    if(r==i)
-	  printf("id=%d, sr=%d, nodeid=%d, north1=%d, south1=%d, west1=%d, east1=%d\n", r,sr,nodeid,north1,south1,west1,east1);
-	MPI_Barrier(MPI_COMM_WORLD);
-  }
-  int north = north1; if(north1 < 0)   north = MPI_PROC_NULL;
-  int south = south1; if(south1 < 0)   south = MPI_PROC_NULL;
-  int west= west1;     if(west1< 0)    west = MPI_PROC_NULL;
-  int east = east1;    if(east1 < 0)   east = MPI_PROC_NULL;
-  //int temp = north; north = south; south = temp; 
-  int bx = n/px; // block size in x
-  int by = n/py; // block size in y
-  int offx = rx*bx; // offset in x
-  int offy = ry*by; // offset in y
-  double* volatile memO;
-  double* volatile memN;
-  
-  //printf("rank %d N S W E is: %d %d %d %d\n",r,north, south, west, east);
-  MPI_Win winO;
-  MPI_Win winN;
-  MPI_Win_allocate_shared((bx+2)*(by+2)*sizeof(double),1,MPI_INFO_NULL,shmcomm,(void*)&memO,&winO);
-  MPI_Win_allocate_shared((bx+2)*(by+2)*sizeof(double),1,MPI_INFO_NULL,shmcomm,(void*)&memN,&winN);
-  double* volatile westptrO;
-  double* volatile eastptrO;
-  double* volatile northptrO;
-  double* volatile southptrO;
-  double* volatile westptrN;
-  double* volatile eastptrN;
-  double* volatile northptrN;
-  double* volatile southptrN;
-  int needN = 0; int needS = 0; int needW = 0; int needE = 0;
-  if(north != -1 && north / CORENUM == r/CORENUM)
-  {
-	needN = 1;
-  }
-  if(south != -1 && south / CORENUM == r/CORENUM)
-  {
-	needS = 1;
-  }
-  if(west != -1 && west / CORENUM == r/CORENUM)
-  {
-	needW = 1;
-  }
-  if(east != -1 && east / CORENUM == r/CORENUM)
-  {
-	needE = 1;
-  }
-  //temp = needN; needN =needS; needS = temp;
-  MPI_Aint sz;
-  int dsp_unit;
-  if(needN)
-  {
-	MPI_Win_shared_query(winO,sr-lpx,&sz,&dsp_unit,(void*)&northptrO);
-	MPI_Win_shared_query(winN,sr-lpx,&sz,&dsp_unit,(void*)&northptrN);
-  }
-  if(needS)
-  {
-	MPI_Win_shared_query(winO,sr+lpx,&sz,&dsp_unit,(void*)&southptrO);
-	MPI_Win_shared_query(winN,sr+lpx,&sz,&dsp_unit,(void*)&southptrN);
-  }
-  if(needW)
-  {
-	MPI_Win_shared_query(winO,sr-1,&sz,&dsp_unit,(void*)&westptrO);
-	MPI_Win_shared_query(winN,sr-1,&sz,&dsp_unit,(void*)&westptrN);
-  }
-  if(needE)
-  {
-	MPI_Win_shared_query(winO,sr+1,&sz,&dsp_unit,(void*)&eastptrO);
-	MPI_Win_shared_query(winN,sr+1,&sz,&dsp_unit,(void*)&eastptrN);
-  }
-  
-  // printf("rank %d N S W E is: %d %d %d %d\n",r,north, south, west, east);
-  // decompose the domain
-  // printf("%i (%i,%i) - w: %i, e: %i, n: %i, s: %i\n", r, ry,rx,west,east,north,south);
-  // allocate two work arrays
-  double* volatile tmp; 
-  // initialize three heat sources
-  const int nsources=3;
-  int sources[nsources][2] = {{n/2,n/2}, {n/3,n/3}, {n*4/5,n*8/9}};
-  int locnsources=0; // number of sources in my area
-  int locsources[nsources][2]; // sources local to my rank
-  for (int i=0; i<nsources; ++i) { // determine which sources are in my patch
-    int locx = sources[i][0] - offx;
-    int locy = sources[i][1] - offy;
-    if(locx >= 0 && locx < bx && locy >= 0 && locy < by) {
-      locsources[locnsources][0] = locx+1; // offset by halo zone
-      locsources[locnsources][1] = locy+1; // offset by halo zone
-      locnsources++;
-    }
-  }
+    rx = lpx * nodeidx + local_x;
+    ry = lpy * nodeidy + local_y;
+    int noden, nodes, nodew, nodee;
+    if (nodeidy < ny - 1) {
+        if (CH == 1)
+            nodes = nodeid + nx;
+        if (CH == 2)
+            nodes = nodeid + 1;
+        if (CH == 3) {
+            if (nodeidx % 2 == 0)
+                nodes = nodeid + 1;
+            else
+                nodes = nodeid - 1;
+        }
+    }
+    else {
+        nodes = -1;
+    }
+    if (nodeidy == 0) {
+        noden = -1;
+    }
+    else {
+        if (CH == 1)
+            noden = nodeid - nx;
+        if (CH == 2)
+            noden = nodeid - 1;
+        if (CH == 3) {
+            if (nodeidx % 2 == 0)
+                noden = nodeid - 1;
+            else
+                noden = nodeid + 1;
+        }
+
+    }
+    if (nodeidx == 0) {
+        nodew = -1;
+    }
+    else {
+        if (CH == 1)
+            nodew = nodeid - 1;
+        if (CH == 2)
+            nodew = nodeid - ny;
+        if (CH == 3) {
+
+            if (nodeidx % 2 == 0)
+                nodew = nodeid - nodeidy - 1 - nodeidy;
+            else {
+                nodew = nodeid + nodeidy + 1 - 2 * ny + nodeidy;
+            }
+        }
+    }
+    if (nodeidx == nx - 1) {
+        nodee = -1;
+    }
+    else {
+        if (CH == 1) {
+            nodee = nodeid + 1;
+        }
+        if (CH == 2) {
+            nodee = nodeid + ny;
+        }
+        if (CH == 3) {
+            if (nodeidx % 2 == 0) {
+                nodee = nodeid - nodeidy + ny + ny - 1 - nodeidy;
+            }
+            else {
+                nodee = nodeid + nodeidy + 1 + nodeidy;
+            }
+        }
+    }
+    int north1, south1, west1, east1;
+    if (local_y != 0) {
+        //south
+        north1 = r - lpx;
+    }
+    else {
+        if (noden == -1)
+            north1 = -1;
+        else {
+            north1 = r + (noden - nodeid) * CORENUM - r % CORENUM + (lpy - 1) * lpx + local_x;
+            //north1 = r + (noden - nodeid)*CORENUM  - r%CORENUM + local_x;
+        }
+    }
+    if (local_y < lpy - 1) {
+        //north
+        south1 = r + lpx;
+    }
+    else {
+        if (nodes == -1)
+            south1 = -1;
+        else {
+            south1 = r + (nodes - nodeid) * CORENUM - r % CORENUM + local_x;
+            //south1 = r + (noden - nodeid)*CORENUM  - r%CORENUM +(lpy-1) * lpx + local_x;
+        }
+    }
+    if (local_x != 0) {
+        //west
+        west1 = r - 1;
+    }
+    else {
+        if (nodew == -1)
+            west1 = -1;
+        else {
+            west1 = r + (nodew - nodeid) * CORENUM - r % CORENUM + local_y * lpx + lpx - 1;
+        }
+    }
+    if (local_x == lpx - 1) {
+        //east
+        if (nodee == -1) {
+            east1 = -1;
+        }
+        else {
+            east1 = r + (nodee - nodeid) * CORENUM - r % CORENUM + local_y * lpx;
+        }
+    }
+    else {
 
-	double tp = 0.0;
-  // allocate communication buffers
-  double *sbufnorth = (double*)calloc(1,bx*sizeof(double)); // send buffers
-  double *sbufsouth = (double*)calloc(1,bx*sizeof(double));
-  double *sbufeast = (double*)calloc(1,by*sizeof(double));
-  double *sbufwest = (double*)calloc(1,by*sizeof(double));
-  double *rbufnorth = (double*)calloc(1,bx*sizeof(double)); // receive buffers
-  double *rbufsouth = (double*)calloc(1,bx*sizeof(double));
-  double *rbufeast = (double*)calloc(1,by*sizeof(double));
-  double *rbufwest = (double*)calloc(1,by*sizeof(double));
+        east1 = r + 1;
+    }
 
-  double thetime[6];
-        thetime[0] = thetime[1] = thetime[2] = thetime[3] =thetime [4] = thetime[5] =  0.0;
-  double t=-MPI_Wtime(); // take time
-  double heat; // total heat in system
-  //the following is the loop for niters steps.--zhuxm
-  //double* tempnew = (double*)malloc(bx*sizeof(double));
-  //int basesize = 2*(bx+by);
-  //int halfindex = basesize+4;
-  //MPI_Win_lock_all(0, winO);
-  //MPI_Win_lock_all(0, winN);
+    // determine my four neighbors
+    for (int i = 0; i < p; i++) {
+        if (r == i)
+            printf("id=%d, sr=%d, nodeid=%d, north1=%d, south1=%d, west1=%d, east1=%d\n", r, sr,
+                   nodeid, north1, south1, west1, east1);
+        MPI_Barrier(MPI_COMM_WORLD);
+    }
+    int north = north1;
+    if (north1 < 0)
+        north = MPI_PROC_NULL;
+    int south = south1;
+    if (south1 < 0)
+        south = MPI_PROC_NULL;
+    int west = west1;
+    if (west1 < 0)
+        west = MPI_PROC_NULL;
+    int east = east1;
+    if (east1 < 0)
+        east = MPI_PROC_NULL;
+    //int temp = north; north = south; south = temp;
+    int bx = n / px;            // block size in x
+    int by = n / py;            // block size in y
+    int offx = rx * bx;         // offset in x
+    int offy = ry * by;         // offset in y
+    double *volatile memO;
+    double *volatile memN;
 
-  MPI_Barrier(shmcomm);
-  for(int iter=0; iter<niters; ++iter) {
-    // refresh heat sources
-    //MPI_Barrier(comm);
-    //if(!r) printf("Step %d\n",iter);
-   double time1 = MPI_Wtime();
-	 for(int i=0; i<locnsources; ++i) {
-      memO[ind(locsources[i][0],locsources[i][1])] += energy; // heat source
-  }
-   double time2 = MPI_Wtime();
-    // exchange data with neighbors
-    MPI_Request reqs[8];
-   if(!needN)
-    for(int i=0; i<bx; ++i) sbufnorth[i] = memO[ind(i+1,1)]; // pack loop - last valid region
-   if(!needS)
-    for(int i=0; i<bx; ++i) sbufsouth[i] = memO[ind(i+1,by)]; // pack loop
-   if(!needE)
-    for(int i=0; i<by; ++i) sbufeast[i] = memO[ind(bx,i+1)]; // pack loop
-   if(!needW)
-    for(int i=0; i<by; ++i) sbufwest[i] = memO[ind(1,i+1)]; // pack loop
+    //printf("rank %d N S W E is: %d %d %d %d\n",r,north, south, west, east);
+    MPI_Win winO;
+    MPI_Win winN;
+    MPI_Win_allocate_shared((bx + 2) * (by + 2) * sizeof(double), 1, MPI_INFO_NULL, shmcomm,
+                            (void *) &memO, &winO);
+    MPI_Win_allocate_shared((bx + 2) * (by + 2) * sizeof(double), 1, MPI_INFO_NULL, shmcomm,
+                            (void *) &memN, &winN);
+    double *volatile westptrO;
+    double *volatile eastptrO;
+    double *volatile northptrO;
+    double *volatile southptrO;
+    double *volatile westptrN;
+    double *volatile eastptrN;
+    double *volatile northptrN;
+    double *volatile southptrN;
+    int needN = 0;
+    int needS = 0;
+    int needW = 0;
+    int needE = 0;
+    if (north != -1 && north / CORENUM == r / CORENUM) {
+        needN = 1;
+    }
+    if (south != -1 && south / CORENUM == r / CORENUM) {
+        needS = 1;
+    }
+    if (west != -1 && west / CORENUM == r / CORENUM) {
+        needW = 1;
+    }
+    if (east != -1 && east / CORENUM == r / CORENUM) {
+        needE = 1;
+    }
+    //temp = needN; needN =needS; needS = temp;
+    MPI_Aint sz;
+    int dsp_unit;
+    if (needN) {
+        MPI_Win_shared_query(winO, sr - lpx, &sz, &dsp_unit, (void *) &northptrO);
+        MPI_Win_shared_query(winN, sr - lpx, &sz, &dsp_unit, (void *) &northptrN);
+    }
+    if (needS) {
+        MPI_Win_shared_query(winO, sr + lpx, &sz, &dsp_unit, (void *) &southptrO);
+        MPI_Win_shared_query(winN, sr + lpx, &sz, &dsp_unit, (void *) &southptrN);
+    }
+    if (needW) {
+        MPI_Win_shared_query(winO, sr - 1, &sz, &dsp_unit, (void *) &westptrO);
+        MPI_Win_shared_query(winN, sr - 1, &sz, &dsp_unit, (void *) &westptrN);
+    }
+    if (needE) {
+        MPI_Win_shared_query(winO, sr + 1, &sz, &dsp_unit, (void *) &eastptrO);
+        MPI_Win_shared_query(winN, sr + 1, &sz, &dsp_unit, (void *) &eastptrN);
+    }
+
+    // printf("rank %d N S W E is: %d %d %d %d\n",r,north, south, west, east);
+    // decompose the domain
+    // printf("%i (%i,%i) - w: %i, e: %i, n: %i, s: %i\n", r, ry,rx,west,east,north,south);
+    // allocate two work arrays
+    double *volatile tmp;
+    // initialize three heat sources
+    const int nsources = 3;
+    int sources[nsources][2] = { {n / 2, n / 2}, {n / 3, n / 3}, {n * 4 / 5, n * 8 / 9} };
+    int locnsources = 0;        // number of sources in my area
+    int locsources[nsources][2];        // sources local to my rank
+    for (int i = 0; i < nsources; ++i) {        // determine which sources are in my patch
+        int locx = sources[i][0] - offx;
+        int locy = sources[i][1] - offy;
+        if (locx >= 0 && locx < bx && locy >= 0 && locy < by) {
+            locsources[locnsources][0] = locx + 1;      // offset by halo zone
+            locsources[locnsources][1] = locy + 1;      // offset by halo zone
+            locnsources++;
+        }
+    }
 
-	double time3 = MPI_Wtime();
-	
-	int index = 0;
-   if(!needN)
-   {
-        MPI_Isend(sbufnorth, bx, MPI_DOUBLE, north, 9, comm, &reqs[index++]);
-        MPI_Irecv(rbufnorth, bx, MPI_DOUBLE, north, 9, comm, &reqs[index++]);
-   }
-    if(!needE)
-    {
-        MPI_Isend(sbufeast, by, MPI_DOUBLE, east, 9, comm, &reqs[index++]);
-        MPI_Irecv(rbufeast, by, MPI_DOUBLE, east, 9, comm, &reqs[index++]);
-    }
-   if(!needW)
-    {
-        MPI_Isend(sbufwest, by, MPI_DOUBLE, west, 9, comm, &reqs[index++]);
-        MPI_Irecv(rbufwest, by, MPI_DOUBLE, west, 9, comm, &reqs[index++]);
-    }
-    if(!needS)
-    {
-        MPI_Isend(sbufsouth, bx, MPI_DOUBLE, south, 9, comm, &reqs[index++]);
-        MPI_Irecv(rbufsouth, bx, MPI_DOUBLE, south, 9, comm, &reqs[index++]);
-    } 
-	printf("id = %d, iter=%d, OK\n", r, iter);
-   int ReadN, ReadS, ReadW, ReadE;
-   ReadN = ReadS = ReadW = ReadE = 0;
-   if(!needN)
-   {
-	ReadN = 1;
-   }   
-   if(!needS)
-   {
-	ReadS = 1;
-   }   
-   if(!needW)
-   {
-	ReadW = 1;
-   }   
-   if(!needE)
-   {
-	ReadE = 1;
-   }
-   //order: West0. East1, North2, South3  
-   while(!(ReadW * ReadE * ReadN * ReadS))
-   {
-     if(!ReadW)
-     {
-		 for(int i=0; i<by; i++)
-         //   memO[(i+1)*(bx+2)]=westptrO[(i+1)*(bx+2)+bx];
-         ReadW = 1;
-     }
-     if(!ReadE)
-     {
-		 for(int i=0; i<by; i++)
-         //   memO[(i+1)*(bx+2)+bx+1]=eastptrO[(i+1)*(bx+2)+1];
-         ReadE = 1;
-     }
-     if(!ReadN)
-     {
-         //memcpy(rbufnorth,northptr+2*by+bx,bx*sizeof(double));
-         memcpy(memO+1,northptrO+(bx+2)*by+1,bx*sizeof(double));
-         //northptr[basesize+3] = -1;
-         ReadN = 1;
-     }
-     if(!ReadS)
-     {
-         //memcpy(rbufsouth,southptr+2*by, bx*sizeof(double));
-         memcpy(memO+(bx+2)*(by+1)+1,southptrO+bx+2+1, bx*sizeof(double));
-         //southptr[basesize+2] = -1;
-         ReadS = 1;
-     }
-   }
-    /*
-    MPI_Isend(sbufnorth, bx, MPI_DOUBLE, north, 9, comm, &reqs[0]);
-    MPI_Isend(sbufsouth, bx, MPI_DOUBLE, south, 9, comm, &reqs[1]);
-    MPI_Isend(sbufeast, by, MPI_DOUBLE, east, 9, comm, &reqs[2]);
-    MPI_Isend(sbufwest, by, MPI_DOUBLE, west, 9, comm, &reqs[3]);
-    MPI_Irecv(rbufnorth, bx, MPI_DOUBLE, north, 9, comm, &reqs[4]);
-    MPI_Irecv(rbufsouth, bx, MPI_DOUBLE, south, 9, comm, &reqs[5]);
-    MPI_Irecv(rbufeast, by, MPI_DOUBLE, east, 9, comm, &reqs[6]);
-    MPI_Irecv(rbufwest, by, MPI_DOUBLE, west, 9, comm, &reqs[7]);
-    */
-    MPI_Waitall(index, reqs, MPI_STATUS_IGNORE);
-	double time4 = MPI_Wtime();
-	if(!needN)
-      for(int i=0; i<bx; ++i) memO[ind(i+1,0)] = rbufnorth[i]; // unpack loop - into ghost cells
-	if(!needS)
-      for(int i=0; i<bx; ++i) memO[ind(i+1,by+1)] = rbufsouth[i]; // unpack loop
-	if(!needE)
-      for(int i=0; i<by; ++i) memO[ind(bx+1,i+1)] = rbufeast[i]; // unpack loop
-	if(!needW)
-      for(int i=0; i<by; ++i) memO[ind(0,i+1)] = rbufwest[i]; // unpack loop
-	double time5 = MPI_Wtime();
-    heat = 0.0;
-	 tp = -MPI_Wtime();	
-	//double* aold2 = aold;
-	//memcpy(aold2,aold,(bx+2)*(by+2)*sizeof(double));
-    //double* anewtemp = malloc(by*sizeof(double));
-    //for(int i=1; i<bx+1; ++i) {
-      //for(int j=1; j<by+1; ++j) {
-    for(int j=1; j<by+1; ++j) {
-      for(int i=1; i<bx+1; ++i) {
-        //double tt = anew[ind(i,j)]/2.0;
-	double tt  = memN[ind(i,j)]/2.0 + (memO[ind(i-1,j)] + memO[ind(i+1,j)] + memO[ind(i,j-1)] + memO[ind(i,j+1)])/4.0/2.0;
-	//double tt  = anew[ind(i,j)]/2.0 + ( aold[ind(i,j-1)] + aold[ind(i,j+1)])/4.0/2.0;
-        //tt +=  (aold[ind(i-1,j)] + aold[ind(i+1,j)] + aold[ind(i,j-1)] + aold[ind(i,j+1)])/4.0/2.0;
-        memN[ind(i,j)] = tt;
-	//double d  = anew[ind(i,j)]/2.0 + ((double)r+1.0+double(r+3.46)+double(iter))/4.0/2.0;
-       //anew[ind(i,j)] = anew[ind(i,j)]/2.0;
-        //tempnew[i-1] = tt;
-        heat += memN[ind(i,j)];
-      }
-    }
-    // swap arrays
-    double time6 = MPI_Wtime();
-    thetime[0] += (time2-time1);
-    thetime[1] += (time3-time2);
-    thetime[2] += (time4-time3);
-    thetime[3] += (time5-time4);
-    thetime[4] += (time6-time5);
-    thetime[5] += (time6-time1);
+    double tp = 0.0;
+    // allocate communication buffers
+    double *sbufnorth = (double *) calloc(1, bx * sizeof(double));      // send buffers
+    double *sbufsouth = (double *) calloc(1, bx * sizeof(double));
+    double *sbufeast = (double *) calloc(1, by * sizeof(double));
+    double *sbufwest = (double *) calloc(1, by * sizeof(double));
+    double *rbufnorth = (double *) calloc(1, bx * sizeof(double));      // receive buffers
+    double *rbufsouth = (double *) calloc(1, bx * sizeof(double));
+    double *rbufeast = (double *) calloc(1, by * sizeof(double));
+    double *rbufwest = (double *) calloc(1, by * sizeof(double));
+
+    double thetime[6];
+    thetime[0] = thetime[1] = thetime[2] = thetime[3] = thetime[4] = thetime[5] = 0.0;
+    double t = -MPI_Wtime();    // take time
+    double heat;                // total heat in system
+    //the following is the loop for niters steps.--zhuxm
+    //double* tempnew = (double*)malloc(bx*sizeof(double));
+    //int basesize = 2*(bx+by);
+    //int halfindex = basesize+4;
+    //MPI_Win_lock_all(0, winO);
+    //MPI_Win_lock_all(0, winN);
 
-	tp += MPI_Wtime();
-    tmp=memN; memN=memO; memO=tmp;
-    tmp=northptrN; northptrN=northptrO; northptrO=tmp;
-    tmp=southptrN; southptrN=southptrO; southptrO=tmp;
-    tmp=eastptrN; eastptrN=eastptrO; eastptrO=tmp;
-    tmp=westptrN; westptrN=northptrO; westptrO=tmp;
     MPI_Barrier(shmcomm);
-  }
-  t+=MPI_Wtime();
+    for (int iter = 0; iter < niters; ++iter) {
+        // refresh heat sources
+        //MPI_Barrier(comm);
+        //if (!r) printf("Step %d\n",iter);
+        double time1 = MPI_Wtime();
+        for (int i = 0; i < locnsources; ++i) {
+            memO[ind(locsources[i][0], locsources[i][1])] += energy;    // heat source
+        }
+        double time2 = MPI_Wtime();
+        // exchange data with neighbors
+        MPI_Request reqs[8];
+        if (!needN)
+            for (int i = 0; i < bx; ++i)
+                sbufnorth[i] = memO[ind(i + 1, 1)];     // pack loop - last valid region
+        if (!needS)
+            for (int i = 0; i < bx; ++i)
+                sbufsouth[i] = memO[ind(i + 1, by)];    // pack loop
+        if (!needE)
+            for (int i = 0; i < by; ++i)
+                sbufeast[i] = memO[ind(bx, i + 1)];     // pack loop
+        if (!needW)
+            for (int i = 0; i < by; ++i)
+                sbufwest[i] = memO[ind(1, i + 1)];      // pack loop
 
-  // get final heat in the system
-  double rheat;
-  MPI_Allreduce(&heat, &rheat, 1, MPI_DOUBLE, MPI_SUM, comm);
-  double totaltime[6];
-  MPI_Allreduce(thetime,totaltime,6,MPI_DOUBLE,MPI_SUM,comm);
-if(!r)
-        {
-         printf("[%i] last heat: %f time: %f\n", r, rheat, t);
-         printf("Times are: %lf, %lf, %lf, %lf, %lf, --> %lf\n",totaltime[0]/p,totaltime[1]/p,totaltime[2]/p,totaltime[3]/p,totaltime[4]/p,totaltime[5]/p);
+        double time3 = MPI_Wtime();
+
+        int index = 0;
+        if (!needN) {
+            MPI_Isend(sbufnorth, bx, MPI_DOUBLE, north, 9, comm, &reqs[index++]);
+            MPI_Irecv(rbufnorth, bx, MPI_DOUBLE, north, 9, comm, &reqs[index++]);
+        }
+        if (!needE) {
+            MPI_Isend(sbufeast, by, MPI_DOUBLE, east, 9, comm, &reqs[index++]);
+            MPI_Irecv(rbufeast, by, MPI_DOUBLE, east, 9, comm, &reqs[index++]);
+        }
+        if (!needW) {
+            MPI_Isend(sbufwest, by, MPI_DOUBLE, west, 9, comm, &reqs[index++]);
+            MPI_Irecv(rbufwest, by, MPI_DOUBLE, west, 9, comm, &reqs[index++]);
+        }
+        if (!needS) {
+            MPI_Isend(sbufsouth, bx, MPI_DOUBLE, south, 9, comm, &reqs[index++]);
+            MPI_Irecv(rbufsouth, bx, MPI_DOUBLE, south, 9, comm, &reqs[index++]);
+        }
+        printf("id = %d, iter=%d, OK\n", r, iter);
+        int ReadN, ReadS, ReadW, ReadE;
+        ReadN = ReadS = ReadW = ReadE = 0;
+        if (!needN) {
+            ReadN = 1;
+        }
+        if (!needS) {
+            ReadS = 1;
         }
-  MPI_Barrier(comm);	
+        if (!needW) {
+            ReadW = 1;
+        }
+        if (!needE) {
+            ReadE = 1;
+        }
+        //order: West0. East1, North2, South3
+        while (!(ReadW * ReadE * ReadN * ReadS)) {
+            if (!ReadW) {
+                for (int i = 0; i < by; i++)
+                    //   memO[(i+1)*(bx+2)]=westptrO[(i+1)*(bx+2)+bx];
+                    ReadW = 1;
+            }
+            if (!ReadE) {
+                for (int i = 0; i < by; i++)
+                    //   memO[(i+1)*(bx+2)+bx+1]=eastptrO[(i+1)*(bx+2)+1];
+                    ReadE = 1;
+            }
+            if (!ReadN) {
+                //memcpy(rbufnorth,northptr+2*by+bx,bx*sizeof(double));
+                memcpy(memO + 1, northptrO + (bx + 2) * by + 1, bx * sizeof(double));
+                //northptr[basesize+3] = -1;
+                ReadN = 1;
+            }
+            if (!ReadS) {
+                //memcpy(rbufsouth,southptr+2*by, bx*sizeof(double));
+                memcpy(memO + (bx + 2) * (by + 1) + 1, southptrO + bx + 2 + 1, bx * sizeof(double));
+                //southptr[basesize+2] = -1;
+                ReadS = 1;
+            }
+        }
+        /*
+         * MPI_Isend(sbufnorth, bx, MPI_DOUBLE, north, 9, comm, &reqs[0]);
+         * MPI_Isend(sbufsouth, bx, MPI_DOUBLE, south, 9, comm, &reqs[1]);
+         * MPI_Isend(sbufeast, by, MPI_DOUBLE, east, 9, comm, &reqs[2]);
+         * MPI_Isend(sbufwest, by, MPI_DOUBLE, west, 9, comm, &reqs[3]);
+         * MPI_Irecv(rbufnorth, bx, MPI_DOUBLE, north, 9, comm, &reqs[4]);
+         * MPI_Irecv(rbufsouth, bx, MPI_DOUBLE, south, 9, comm, &reqs[5]);
+         * MPI_Irecv(rbufeast, by, MPI_DOUBLE, east, 9, comm, &reqs[6]);
+         * MPI_Irecv(rbufwest, by, MPI_DOUBLE, west, 9, comm, &reqs[7]);
+         */
+        MPI_Waitall(index, reqs, MPI_STATUS_IGNORE);
+        double time4 = MPI_Wtime();
+        if (!needN)
+            for (int i = 0; i < bx; ++i)
+                memO[ind(i + 1, 0)] = rbufnorth[i];     // unpack loop - into ghost cells
+        if (!needS)
+            for (int i = 0; i < bx; ++i)
+                memO[ind(i + 1, by + 1)] = rbufsouth[i];        // unpack loop
+        if (!needE)
+            for (int i = 0; i < by; ++i)
+                memO[ind(bx + 1, i + 1)] = rbufeast[i]; // unpack loop
+        if (!needW)
+            for (int i = 0; i < by; ++i)
+                memO[ind(0, i + 1)] = rbufwest[i];      // unpack loop
+        double time5 = MPI_Wtime();
+        heat = 0.0;
+        tp = -MPI_Wtime();
+        //double* aold2 = aold;
+        //memcpy(aold2,aold,(bx+2)*(by+2)*sizeof(double));
+        //double* anewtemp = malloc(by*sizeof(double));
+        //for(int i=1; i<bx+1; ++i) {
+        //for(int j=1; j<by+1; ++j) {
+        for (int j = 1; j < by + 1; ++j) {
+            for (int i = 1; i < bx + 1; ++i) {
+                //double tt = anew[ind(i,j)]/2.0;
+                double tt =
+                    memN[ind(i, j)] / 2.0 + (memO[ind(i - 1, j)] + memO[ind(i + 1, j)] +
+                                             memO[ind(i, j - 1)] + memO[ind(i, j + 1)]) / 4.0 / 2.0;
+                //double tt  = anew[ind(i,j)]/2.0 + (aold[ind(i,j-1)] + aold[ind(i,j+1)])/4.0/2.0;
+                //tt +=  (aold[ind(i-1,j)] + aold[ind(i+1,j)] + aold[ind(i,j-1)] + aold[ind(i,j+1)])/4.0/2.0;
+                memN[ind(i, j)] = tt;
+                //double d  = anew[ind(i,j)]/2.0 + ((double)r+1.0+double(r+3.46)+double(iter))/4.0/2.0;
+                //anew[ind(i,j)] = anew[ind(i,j)]/2.0;
+                //tempnew[i-1] = tt;
+                heat += memN[ind(i, j)];
+            }
+        }
+        // swap arrays
+        double time6 = MPI_Wtime();
+        thetime[0] += (time2 - time1);
+        thetime[1] += (time3 - time2);
+        thetime[2] += (time4 - time3);
+        thetime[3] += (time5 - time4);
+        thetime[4] += (time6 - time5);
+        thetime[5] += (time6 - time1);
+
+        tp += MPI_Wtime();
+        tmp = memN;
+        memN = memO;
+        memO = tmp;
+        tmp = northptrN;
+        northptrN = northptrO;
+        northptrO = tmp;
+        tmp = southptrN;
+        southptrN = southptrO;
+        southptrO = tmp;
+        tmp = eastptrN;
+        eastptrN = eastptrO;
+        eastptrO = tmp;
+        tmp = westptrN;
+        westptrN = northptrO;
+        westptrO = tmp;
+        MPI_Barrier(shmcomm);
+    }
+    t += MPI_Wtime();
+
+    // get final heat in the system
+    double rheat;
+    MPI_Allreduce(&heat, &rheat, 1, MPI_DOUBLE, MPI_SUM, comm);
+    double totaltime[6];
+    MPI_Allreduce(thetime, totaltime, 6, MPI_DOUBLE, MPI_SUM, comm);
+    if (!r) {
+        printf("[%i] last heat: %f time: %f\n", r, rheat, t);
+        printf("Times are: %lf, %lf, %lf, %lf, %lf, --> %lf\n", totaltime[0] / p, totaltime[1] / p,
+               totaltime[2] / p, totaltime[3] / p, totaltime[4] / p, totaltime[5] / p);
+    }
+    MPI_Barrier(comm);
 //         printf("Rank %d times are: %lf, %lf, %lf, %lf, %lf, --> %lf\n",r, thetime[0],thetime[1],thetime[2],thetime[3],thetime[4],thetime[5]);
-  MPI_Finalize();
+    MPI_Finalize();
 }

http://git.mpich.org/software/dev/stencil.git/commitdiff/8ef7f7c268ec9f3866f90ad4c2f51c51495b7a15

commit 8ef7f7c268ec9f3866f90ad4c2f51c51495b7a15
Author: Pavan Balaji <balaji at anl.gov>
Date:   Sat Sep 6 19:13:39 2014 -0500

    Initial draft of the stencil code from Shigang.

diff --git a/stencil.cpp b/stencil.cpp
new file mode 100644
index 0000000..b4d7f2e
--- /dev/null
+++ b/stencil.cpp
@@ -0,0 +1,532 @@
+/*
+ * Copyright (c) 2012 Torsten Hoefler. All rights reserved.
+ *
+ * Author(s): Torsten Hoefler <htor at illinois.edu>
+ *
+ */
+
+#include "stencil_par.h"
+int main(int argc, char **argv) {
+
+  int provide;
+  MPI_Init(&argc,&argv);  
+  int r,p;
+  MPI_Comm comm = MPI_COMM_WORLD;
+  MPI_Comm_rank(comm, &r);
+  MPI_Comm_size(comm, &p);
+  int n, energy, niters,nx,ny, lpx,lpy,px, py;
+
+  MPI_Comm shmcomm;
+  MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shmcomm);
+  int sr, sp; // rank and size in shmem comm
+
+  MPI_Comm_size(shmcomm, &sp);
+  MPI_Comm_rank(shmcomm, &sr);
+    printf("shmcomm size = %d\n",sp); 
+  int CORENUM = 12;
+  if (r==0) {
+	  // argument checking
+	  if(argc < 8) {
+		  if(!r) printf("usage: stencil_mpi <n> <energy> <niters> <nx> <ny> <lpx> <lpy>\n");
+		  MPI_Finalize();
+		  exit(1);       
+	  }
+
+	  n = atoi(argv[1]); // nxn grid
+	  energy = atoi(argv[2]); // energy to be injected per iteration
+	  niters = atoi(argv[3]); // number of iterations
+
+	  nx=atoi(argv[4]); // 1st dim processes
+	  ny=atoi(argv[5]); // 2nd dim processes
+	  lpx=atoi(argv[6]); // 1st dim processes
+      lpy=atoi(argv[7]); // 2nd dim processes
+      px = nx*lpx;
+	  py = ny*lpy;
+      if(px * py != p) MPI_Abort(comm, 1);// abort if px or py are wrong
+      if(n % py != 0) MPI_Abort(comm, 2); // abort px needs to divide n
+      if(n % px != 0) MPI_Abort(comm, 3); // abort py needs to divide n
+      if(nx*ny != p/CORENUM) MPI_Abort(comm,66);
+	  if(lpx*lpy != CORENUM) MPI_Abort(comm,77);
+      // distribute arguments
+      int args[7] = {n, energy, niters, nx,ny,lpx,lpy};
+      MPI_Bcast(args, 7, MPI_INT, 0, comm);
+  }
+  else {
+      int args[7];
+      MPI_Bcast(args, 7, MPI_INT, 0, comm);
+      n=args[0]; energy=args[1]; niters=args[2];
+	
+	  nx=args[3]; ny=args[4];
+	  lpx = args[5];
+	  lpy = args[6];
+	
+      px = nx*lpx;
+	  py = ny*lpy;
+  }
+
+  int nodenum = nx*ny;
+  int nodeidx, nodeidy;
+  int nodeid = r/CORENUM;
+  int CH = 2;
+	if(CH == 1)
+	{
+		nodeidx = nodeid % nx;
+		nodeidy = nodeid / nx;
+	}
+	else if(CH == 2)
+	{
+		nodeidx = nodeid / ny;
+		nodeidy = nodeid % ny;
+	}
+	else if(CH == 3)
+	{
+		nodeidx = nodeid / ny;
+                nodeidy = nodeid % ny;
+		if(nodeidx % 2 != 0)
+		{
+			nodeidy = ny-1 - nodeidy;
+		}
+	}
+//nodeidx idy is the node coordinates(x,y)
+//
+    int localprocessindex = r%CORENUM;
+	int local_x, local_y;
+	local_x = localprocessindex%lpx;
+	local_y = localprocessindex/lpx;
+	// determine my coordinates (x,y) -- r=x*a+y in the 2d processor array
+    // int rx = r % px;
+    // int ry = r / px;
+    int rx;
+    int ry;
+	rx = lpx*nodeidx+local_x;
+	ry = lpy*nodeidy+local_y;
+	int noden, nodes, nodew, nodee;
+	if(nodeidy < ny-1)
+	{
+		if(CH ==1 )
+			nodes = nodeid+ nx;
+		if(CH == 2)
+			nodes = nodeid+1;
+		if(CH == 3)
+		{
+			if(nodeidx %2 ==0)
+				nodes=nodeid+1;
+			else 
+				nodes = nodeid-1;
+		}
+	}
+	else
+	{
+		nodes = -1;
+	}
+	if(nodeidy == 0)
+	{
+		noden = -1;
+	}
+	else
+	{
+		if(CH==1)
+			noden = nodeid - nx;
+		if(CH == 2)
+			noden = nodeid-1;
+		if(CH == 3)
+		{
+			if(nodeidx %2 == 0)
+				noden = nodeid-1;
+			else
+				noden = nodeid+1;
+		}
+	
+	}
+        if(nodeidx == 0)
+	{
+		nodew = -1;
+	}
+	else
+	{
+		if(CH ==1)
+			nodew = nodeid -1;
+		if(CH == 2)
+			nodew = nodeid -ny;
+		if(CH == 3)
+		{
+			
+			if(nodeidx%2 == 0)
+				nodew = nodeid - nodeidy -1-nodeidy;
+			else
+			{
+				nodew = nodeid +nodeidy+1-2*ny+nodeidy;
+			}
+		}
+	}
+	if(nodeidx == nx -1)
+	{
+		nodee = -1;
+	}
+	else
+	{
+		if(CH==1)
+		{
+			nodee = nodeid+1;
+		}
+		if(CH==2)
+		{
+			nodee = nodeid + ny;
+		}
+		if(CH==3)
+		{
+			if(nodeidx%2 == 0)
+			{
+				nodee = nodeid - nodeidy + ny + ny-1-nodeidy;
+			}
+			else
+			{
+				nodee = nodeid +nodeidy +1 + nodeidy;
+			}
+		}
+	}
+	int north1, south1, west1, east1;
+	if(local_y != 0)
+	{
+		//south
+		north1 = r - lpx;
+	}	
+	else
+	{
+		if(noden == -1)
+			north1 = -1;
+		else
+		{
+			north1 = r + (noden - nodeid)*CORENUM  - r%CORENUM +(lpy-1) * lpx + local_x;
+			//north1 = r + (noden - nodeid)*CORENUM  - r%CORENUM + local_x;
+		}	
+	}
+	if(local_y < lpy-1)
+	{
+		//north
+		south1 = r + lpx;
+	}
+	else
+	{
+		if(nodes == -1)
+			south1 = -1;
+		else
+		{
+			south1 = r + (nodes - nodeid)*CORENUM  - r%CORENUM + local_x;
+			//south1 = r + (noden - nodeid)*CORENUM  - r%CORENUM +(lpy-1) * lpx + local_x;
+		}
+	}
+	if(local_x != 0)
+	{
+		//west
+		west1 = r-1;
+	}
+	else
+	{
+		if(nodew == -1)
+			west1 = -1;
+		else
+		{
+			west1 = r + (nodew - nodeid)*CORENUM - r%CORENUM + local_y*lpx + lpx-1;
+		}
+	}
+	if(local_x == lpx-1)
+	{
+		//east
+		if(nodee == -1)
+		{
+			east1 = -1;
+		}
+		else
+		{
+			east1 = r + (nodee - nodeid) * CORENUM - r%CORENUM + local_y*lpx;
+		}
+	}
+	else
+	{
+		
+		east1 = r+1;
+	}
+	
+  // determine my four neighbors
+  for(int i=0; i<p; i++)
+  {
+    if(r==i)
+	  printf("id=%d, sr=%d, nodeid=%d, north1=%d, south1=%d, west1=%d, east1=%d\n", r,sr,nodeid,north1,south1,west1,east1);
+	MPI_Barrier(MPI_COMM_WORLD);
+  }
+  int north = north1; if(north1 < 0)   north = MPI_PROC_NULL;
+  int south = south1; if(south1 < 0)   south = MPI_PROC_NULL;
+  int west= west1;     if(west1< 0)    west = MPI_PROC_NULL;
+  int east = east1;    if(east1 < 0)   east = MPI_PROC_NULL;
+  //int temp = north; north = south; south = temp; 
+  int bx = n/px; // block size in x
+  int by = n/py; // block size in y
+  int offx = rx*bx; // offset in x
+  int offy = ry*by; // offset in y
+  double* volatile memO;
+  double* volatile memN;
+  
+  //printf("rank %d N S W E is: %d %d %d %d\n",r,north, south, west, east);
+  MPI_Win winO;
+  MPI_Win winN;
+  MPI_Win_allocate_shared((bx+2)*(by+2)*sizeof(double),1,MPI_INFO_NULL,shmcomm,(void*)&memO,&winO);
+  MPI_Win_allocate_shared((bx+2)*(by+2)*sizeof(double),1,MPI_INFO_NULL,shmcomm,(void*)&memN,&winN);
+  double* volatile westptrO;
+  double* volatile eastptrO;
+  double* volatile northptrO;
+  double* volatile southptrO;
+  double* volatile westptrN;
+  double* volatile eastptrN;
+  double* volatile northptrN;
+  double* volatile southptrN;
+  int needN = 0; int needS = 0; int needW = 0; int needE = 0;
+  if(north != -1 && north / CORENUM == r/CORENUM)
+  {
+	needN = 1;
+  }
+  if(south != -1 && south / CORENUM == r/CORENUM)
+  {
+	needS = 1;
+  }
+  if(west != -1 && west / CORENUM == r/CORENUM)
+  {
+	needW = 1;
+  }
+  if(east != -1 && east / CORENUM == r/CORENUM)
+  {
+	needE = 1;
+  }
+  //temp = needN; needN =needS; needS = temp;
+  MPI_Aint sz;
+  int dsp_unit;
+  if(needN)
+  {
+	MPI_Win_shared_query(winO,sr-lpx,&sz,&dsp_unit,(void*)&northptrO);
+	MPI_Win_shared_query(winN,sr-lpx,&sz,&dsp_unit,(void*)&northptrN);
+  }
+  if(needS)
+  {
+	MPI_Win_shared_query(winO,sr+lpx,&sz,&dsp_unit,(void*)&southptrO);
+	MPI_Win_shared_query(winN,sr+lpx,&sz,&dsp_unit,(void*)&southptrN);
+  }
+  if(needW)
+  {
+	MPI_Win_shared_query(winO,sr-1,&sz,&dsp_unit,(void*)&westptrO);
+	MPI_Win_shared_query(winN,sr-1,&sz,&dsp_unit,(void*)&westptrN);
+  }
+  if(needE)
+  {
+	MPI_Win_shared_query(winO,sr+1,&sz,&dsp_unit,(void*)&eastptrO);
+	MPI_Win_shared_query(winN,sr+1,&sz,&dsp_unit,(void*)&eastptrN);
+  }
+  
+  // printf("rank %d N S W E is: %d %d %d %d\n",r,north, south, west, east);
+  // decompose the domain
+  // printf("%i (%i,%i) - w: %i, e: %i, n: %i, s: %i\n", r, ry,rx,west,east,north,south);
+  // allocate two work arrays
+  double* volatile tmp; 
+  // initialize three heat sources
+  const int nsources=3;
+  int sources[nsources][2] = {{n/2,n/2}, {n/3,n/3}, {n*4/5,n*8/9}};
+  int locnsources=0; // number of sources in my area
+  int locsources[nsources][2]; // sources local to my rank
+  for (int i=0; i<nsources; ++i) { // determine which sources are in my patch
+    int locx = sources[i][0] - offx;
+    int locy = sources[i][1] - offy;
+    if(locx >= 0 && locx < bx && locy >= 0 && locy < by) {
+      locsources[locnsources][0] = locx+1; // offset by halo zone
+      locsources[locnsources][1] = locy+1; // offset by halo zone
+      locnsources++;
+    }
+  }
+
+	double tp = 0.0;
+  // allocate communication buffers
+  double *sbufnorth = (double*)calloc(1,bx*sizeof(double)); // send buffers
+  double *sbufsouth = (double*)calloc(1,bx*sizeof(double));
+  double *sbufeast = (double*)calloc(1,by*sizeof(double));
+  double *sbufwest = (double*)calloc(1,by*sizeof(double));
+  double *rbufnorth = (double*)calloc(1,bx*sizeof(double)); // receive buffers
+  double *rbufsouth = (double*)calloc(1,bx*sizeof(double));
+  double *rbufeast = (double*)calloc(1,by*sizeof(double));
+  double *rbufwest = (double*)calloc(1,by*sizeof(double));
+
+  double thetime[6];
+        thetime[0] = thetime[1] = thetime[2] = thetime[3] =thetime [4] = thetime[5] =  0.0;
+  double t=-MPI_Wtime(); // take time
+  double heat; // total heat in system
+  //the following is the loop for niters steps.--zhuxm
+  //double* tempnew = (double*)malloc(bx*sizeof(double));
+  //int basesize = 2*(bx+by);
+  //int halfindex = basesize+4;
+  //MPI_Win_lock_all(0, winO);
+  //MPI_Win_lock_all(0, winN);
+
+  MPI_Barrier(shmcomm);
+  for(int iter=0; iter<niters; ++iter) {
+    // refresh heat sources
+    //MPI_Barrier(comm);
+    //if(!r) printf("Step %d\n",iter);
+   double time1 = MPI_Wtime();
+	 for(int i=0; i<locnsources; ++i) {
+      memO[ind(locsources[i][0],locsources[i][1])] += energy; // heat source
+  }
+   double time2 = MPI_Wtime();
+    // exchange data with neighbors
+    MPI_Request reqs[8];
+   if(!needN)
+    for(int i=0; i<bx; ++i) sbufnorth[i] = memO[ind(i+1,1)]; // pack loop - last valid region
+   if(!needS)
+    for(int i=0; i<bx; ++i) sbufsouth[i] = memO[ind(i+1,by)]; // pack loop
+   if(!needE)
+    for(int i=0; i<by; ++i) sbufeast[i] = memO[ind(bx,i+1)]; // pack loop
+   if(!needW)
+    for(int i=0; i<by; ++i) sbufwest[i] = memO[ind(1,i+1)]; // pack loop
+
+	double time3 = MPI_Wtime();
+	
+	int index = 0;
+   if(!needN)
+   {
+        MPI_Isend(sbufnorth, bx, MPI_DOUBLE, north, 9, comm, &reqs[index++]);
+        MPI_Irecv(rbufnorth, bx, MPI_DOUBLE, north, 9, comm, &reqs[index++]);
+   }
+    if(!needE)
+    {
+        MPI_Isend(sbufeast, by, MPI_DOUBLE, east, 9, comm, &reqs[index++]);
+        MPI_Irecv(rbufeast, by, MPI_DOUBLE, east, 9, comm, &reqs[index++]);
+    }
+   if(!needW)
+    {
+        MPI_Isend(sbufwest, by, MPI_DOUBLE, west, 9, comm, &reqs[index++]);
+        MPI_Irecv(rbufwest, by, MPI_DOUBLE, west, 9, comm, &reqs[index++]);
+    }
+    if(!needS)
+    {
+        MPI_Isend(sbufsouth, bx, MPI_DOUBLE, south, 9, comm, &reqs[index++]);
+        MPI_Irecv(rbufsouth, bx, MPI_DOUBLE, south, 9, comm, &reqs[index++]);
+    } 
+	printf("id = %d, iter=%d, OK\n", r, iter);
+   int ReadN, ReadS, ReadW, ReadE;
+   ReadN = ReadS = ReadW = ReadE = 0;
+   if(!needN)
+   {
+	ReadN = 1;
+   }   
+   if(!needS)
+   {
+	ReadS = 1;
+   }   
+   if(!needW)
+   {
+	ReadW = 1;
+   }   
+   if(!needE)
+   {
+	ReadE = 1;
+   }
+   //order: West0. East1, North2, South3  
+   while(!(ReadW * ReadE * ReadN * ReadS))
+   {
+     if(!ReadW)
+     {
+		 for(int i=0; i<by; i++)
+         //   memO[(i+1)*(bx+2)]=westptrO[(i+1)*(bx+2)+bx];
+         ReadW = 1;
+     }
+     if(!ReadE)
+     {
+		 for(int i=0; i<by; i++)
+         //   memO[(i+1)*(bx+2)+bx+1]=eastptrO[(i+1)*(bx+2)+1];
+         ReadE = 1;
+     }
+     if(!ReadN)
+     {
+         //memcpy(rbufnorth,northptr+2*by+bx,bx*sizeof(double));
+         memcpy(memO+1,northptrO+(bx+2)*by+1,bx*sizeof(double));
+         //northptr[basesize+3] = -1;
+         ReadN = 1;
+     }
+     if(!ReadS)
+     {
+         //memcpy(rbufsouth,southptr+2*by, bx*sizeof(double));
+         memcpy(memO+(bx+2)*(by+1)+1,southptrO+bx+2+1, bx*sizeof(double));
+         //southptr[basesize+2] = -1;
+         ReadS = 1;
+     }
+   }
+    /*
+    MPI_Isend(sbufnorth, bx, MPI_DOUBLE, north, 9, comm, &reqs[0]);
+    MPI_Isend(sbufsouth, bx, MPI_DOUBLE, south, 9, comm, &reqs[1]);
+    MPI_Isend(sbufeast, by, MPI_DOUBLE, east, 9, comm, &reqs[2]);
+    MPI_Isend(sbufwest, by, MPI_DOUBLE, west, 9, comm, &reqs[3]);
+    MPI_Irecv(rbufnorth, bx, MPI_DOUBLE, north, 9, comm, &reqs[4]);
+    MPI_Irecv(rbufsouth, bx, MPI_DOUBLE, south, 9, comm, &reqs[5]);
+    MPI_Irecv(rbufeast, by, MPI_DOUBLE, east, 9, comm, &reqs[6]);
+    MPI_Irecv(rbufwest, by, MPI_DOUBLE, west, 9, comm, &reqs[7]);
+    */
+    MPI_Waitall(index, reqs, MPI_STATUS_IGNORE);
+	double time4 = MPI_Wtime();
+	if(!needN)
+      for(int i=0; i<bx; ++i) memO[ind(i+1,0)] = rbufnorth[i]; // unpack loop - into ghost cells
+	if(!needS)
+      for(int i=0; i<bx; ++i) memO[ind(i+1,by+1)] = rbufsouth[i]; // unpack loop
+	if(!needE)
+      for(int i=0; i<by; ++i) memO[ind(bx+1,i+1)] = rbufeast[i]; // unpack loop
+	if(!needW)
+      for(int i=0; i<by; ++i) memO[ind(0,i+1)] = rbufwest[i]; // unpack loop
+	double time5 = MPI_Wtime();
+    heat = 0.0;
+	 tp = -MPI_Wtime();	
+	//double* aold2 = aold;
+	//memcpy(aold2,aold,(bx+2)*(by+2)*sizeof(double));
+    //double* anewtemp = malloc(by*sizeof(double));
+    //for(int i=1; i<bx+1; ++i) {
+      //for(int j=1; j<by+1; ++j) {
+    for(int j=1; j<by+1; ++j) {
+      for(int i=1; i<bx+1; ++i) {
+        //double tt = anew[ind(i,j)]/2.0;
+	double tt  = memN[ind(i,j)]/2.0 + (memO[ind(i-1,j)] + memO[ind(i+1,j)] + memO[ind(i,j-1)] + memO[ind(i,j+1)])/4.0/2.0;
+	//double tt  = anew[ind(i,j)]/2.0 + ( aold[ind(i,j-1)] + aold[ind(i,j+1)])/4.0/2.0;
+        //tt +=  (aold[ind(i-1,j)] + aold[ind(i+1,j)] + aold[ind(i,j-1)] + aold[ind(i,j+1)])/4.0/2.0;
+        memN[ind(i,j)] = tt;
+	//double d  = anew[ind(i,j)]/2.0 + ((double)r+1.0+double(r+3.46)+double(iter))/4.0/2.0;
+       //anew[ind(i,j)] = anew[ind(i,j)]/2.0;
+        //tempnew[i-1] = tt;
+        heat += memN[ind(i,j)];
+      }
+    }
+    // swap arrays
+    double time6 = MPI_Wtime();
+    thetime[0] += (time2-time1);
+    thetime[1] += (time3-time2);
+    thetime[2] += (time4-time3);
+    thetime[3] += (time5-time4);
+    thetime[4] += (time6-time5);
+    thetime[5] += (time6-time1);
+
+	tp += MPI_Wtime();
+    tmp=memN; memN=memO; memO=tmp;
+    tmp=northptrN; northptrN=northptrO; northptrO=tmp;
+    tmp=southptrN; southptrN=southptrO; southptrO=tmp;
+    tmp=eastptrN; eastptrN=eastptrO; eastptrO=tmp;
+    tmp=westptrN; westptrN=northptrO; westptrO=tmp;
+    MPI_Barrier(shmcomm);
+  }
+  t+=MPI_Wtime();
+
+  // get final heat in the system
+  double rheat;
+  MPI_Allreduce(&heat, &rheat, 1, MPI_DOUBLE, MPI_SUM, comm);
+  double totaltime[6];
+  MPI_Allreduce(thetime,totaltime,6,MPI_DOUBLE,MPI_SUM,comm);
+if(!r)
+        {
+         printf("[%i] last heat: %f time: %f\n", r, rheat, t);
+         printf("Times are: %lf, %lf, %lf, %lf, %lf, --> %lf\n",totaltime[0]/p,totaltime[1]/p,totaltime[2]/p,totaltime[3]/p,totaltime[4]/p,totaltime[5]/p);
+        }
+  MPI_Barrier(comm);	
+//         printf("Rank %d times are: %lf, %lf, %lf, %lf, %lf, --> %lf\n",r, thetime[0],thetime[1],thetime[2],thetime[3],thetime[4],thetime[5]);
+  MPI_Finalize();
+}

-----------------------------------------------------------------------


hooks/post-receive
-- 
Stencil code repository