[mpich-discuss] bug in MPI_File_write_all?

Rob Latham robl at mcs.anl.gov
Mon May 19 11:13:40 CDT 2014



On 05/19/2014 04:49 AM, CANELA-XANDRI Oriol wrote:
> Yes, I attach a test program below. The error can be reproduced by running it with 9 MPI processes.
>

Thank you for the test case.
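
For anyone else trying to reproduce: building the attached program with 
the usual MPI C++ compiler wrapper and launching it on 9 ranks should do 
it, e.g. (the file name here is just a placeholder for wherever you saved 
the attachment; mpicxx/mpiexec are the MPICH wrappers):

   mpicxx darray_test.cpp -o darray_test
   mpiexec -n 9 ./darray_test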

I probably fixed this last week, which is why Rajeev no longer sees this 
on his laptop.  It is likely a bug in the way ROMIO flattens darray 
datatypes.
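
If anyone wants to narrow it down further, here is a quick sanity check 
(just a sketch I am adding for reference, using only standard MPI calls 
and the same darray parameters as the attached test case).  If 
MPI_Type_size does not report the expected local byte count on some 
rank, the darray type itself is already wrong before ROMIO ever touches 
it; if the sizes all look right, the flattening code is the next suspect.

#include <mpi.h>
#include <cstdio>

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);

    int rank, nprocs;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    // The 3x3 process grid below requires exactly 9 ranks.
    if (nprocs != 9) {
        if (rank == 0)
            std::fprintf(stderr, "run this check with 9 MPI processes\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

    // Same darray parameters as the test case: 5x4 global matrix,
    // 1x3 blocks, cyclic in both dimensions, 3x3 process grid.
    int dims[]     = {5, 4};
    int dargs[]    = {1, 3};
    int distribs[] = {MPI_DISTRIBUTE_CYCLIC, MPI_DISTRIBUTE_CYCLIC};
    int psizes[]   = {3, 3};

    MPI_Datatype dcarray;
    MPI_Type_create_darray(nprocs, rank, 2, dims, distribs, dargs,
                           psizes, MPI_ORDER_FORTRAN, MPI_DOUBLE, &dcarray);
    MPI_Type_commit(&dcarray);

    int type_size;
    MPI_Aint lb, extent;
    MPI_Type_size(dcarray, &type_size);
    MPI_Type_get_extent(dcarray, &lb, &extent);

    // type_size should equal local_rows * local_cols * sizeof(double).
    // For this case the local counts work out to 6,2,0,6,2,0,3,1,0
    // doubles on ranks 0..8, so three ranks legitimately own nothing.
    // I would expect the extent to span the whole 5x4 array (160 bytes).
    std::printf("rank %d: darray type size = %d bytes, extent = %ld bytes\n",
                rank, type_size, (long)extent);

    MPI_Type_free(&dcarray);
    MPI_Finalize();
    return 0;
}

Build and run it with 9 ranks just like the test case.  The fact that 
three of the nine ranks own zero elements is worth keeping in mind when 
staring at the flattening code.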

Open MPI from last week dies with a "signal 7" (SIGBUS) on this test case.

==rob

> #include <mpi.h>
>
> #include <iostream>
> #include <sstream>
> #include <string>
> #include <vector>
>
> /**
>   * get the number of local rows or columns owned by the calling
>   * process under a block-cyclic distribution (ScaLAPACK NUMROC style)
>   */
> int getNumroc(int globalSize, int myProc, int nProcs, int blockSize)
> {
>    int myDist = myProc % nProcs;
>    int nBlocks = globalSize / blockSize;
>    int numroc = nBlocks / nProcs;
>    numroc *= blockSize;
>    int extraBlocks = nBlocks % nProcs;
>    if(myDist < extraBlocks)
>    {
>      numroc += blockSize;
>    }
>    else if(myDist == extraBlocks)
>    {
>      numroc += globalSize % blockSize;
>    }
>    return numroc;
> }
>
> int main(int argc, char **argv)
> {
>    //MPI vars
>    bool mpiRoot;
>    int mpiRank;
>    int mpiNumTasks;
>    char hostName[MPI_MAX_PROCESSOR_NAME];
>    int lenHostName;
>    int myProcRow;
>    int myProcCol;
>
>    // Initiate MPI
>    int tmp;
>    tmp = MPI_Init(&argc, &argv);
>    if (tmp != MPI_SUCCESS) {
>      std::cerr << "Error: MPI could not be initialized. Terminating." << std::endl;
>      MPI_Abort(MPI_COMM_WORLD, 1);
>    }
>    MPI_Comm_rank(MPI_COMM_WORLD, &mpiRank);
>    MPI_Comm_size(MPI_COMM_WORLD, &mpiNumTasks);
>    MPI_Get_processor_name(hostName, &lenHostName);
>
>    if(mpiNumTasks != 9)
>    {
>      std::cerr << "Error: This test program is designed to run with 9 MPI processes." << std::endl;
>      MPI_Abort(MPI_COMM_WORLD, 1);
>    }
>
>    //Id of the process in a 2d grid
>    myProcRow = mpiRank / 3;
>    myProcCol = mpiRank % 3;
>
>    double *m;                                ///<A pointer to the distributed matrix
>    double *mRead;                            ///<A pointer to the buffer used to read the matrix back
>
>    int nGlobRows = 5;                        ///<Number of rows of the global matrix
>    int nGlobCols = 4;                        ///<Number of columns of the global matrix
>    int nRows;                                ///<Number of rows of the local matrix
>    int nCols;                                ///<Number of columns of the local matrix
>    int nBlockRows = 1;                       ///<Number of rows of the distributed matrix blocks
>    int nBlockCols = 3;                       ///<Number of columns of the distributed matrix blocks
>
>    nRows = getNumroc(nGlobRows, myProcRow, 3, nBlockRows);
>    nCols = getNumroc(nGlobCols, myProcCol, 3, nBlockCols);
>
>    m = new double[nRows*nCols];
>    mRead = new double[nRows*nCols];
>    for(int i = 0; i < nRows; i++)
>    {
>      for(int j = 0; j < nCols; j++)
>      {
>        m[i*nCols + j] = 1;
>      }
>    }
>
>    for(int repeat = 0; repeat < 10; repeat++)
>    {
>      int dims[] = {nGlobRows, nGlobCols};
>      int dargs[] = {nBlockRows, nBlockCols};
>      int distribs[] = {MPI_DISTRIBUTE_CYCLIC, MPI_DISTRIBUTE_CYCLIC};
>      int dim[] = {3, 3};
>      char nat[] = "native";
>      int rc;
>      MPI_Datatype dcarray;
>      MPI_File cFile;
>      MPI_Status status;
>
>      MPI_Type_create_darray(mpiNumTasks, mpiRank, 2, dims, distribs, dargs, dim, MPI_ORDER_FORTRAN, MPI_DOUBLE, &dcarray);
>      MPI_Type_commit(&dcarray);
>
>      std::stringstream ss;
>      ss << "test_" << repeat << ".bin";
>      std::string fname = ss.str(); //"test.bin";
>      std::vector<char> fn(fname.begin(), fname.end());
>      fn.push_back('\0');
>      MPI_File_delete (&fn[0], MPI_INFO_NULL);
>
>      //Write file
>      rc = MPI_File_open(MPI_COMM_WORLD, &fn[0], MPI_MODE_EXCL | MPI_MODE_CREATE | MPI_MODE_WRONLY, MPI_INFO_NULL, &cFile);
>      if(rc){
>        std::cerr << "Error: Failed to open file for writing." << std::endl;
>        MPI_Abort(MPI_COMM_WORLD, 1);
>      }
>      else
>      {
>        MPI_File_set_view(cFile, 0, MPI_DOUBLE, dcarray, nat, MPI_INFO_NULL);
>        MPI_File_write_all(cFile, m, nRows*nCols, MPI_DOUBLE, &status);
>      }
>      MPI_Barrier(MPI_COMM_WORLD);
>      MPI_File_close(&cFile);
>
>
>      //Initialize matrix before reading to 0;
>      for(int i = 0; i < nRows; i++)
>      {
>        for(int j = 0; j < nCols; j++)
>        {
>          mRead[i*nCols + j] = 0;
>        }
>      }
>      //Read file
>      rc = MPI_File_open(MPI_COMM_WORLD, &fn[0], MPI_MODE_RDONLY, MPI_INFO_NULL, &cFile);
>      if(rc){
>        std::cerr << "Error: Failed to open file for reading." << std::endl;
>        MPI_Abort(MPI_COMM_WORLD, 1);
>      }
>      else
>      {
>        MPI_File_set_view(cFile, 0, MPI_DOUBLE, dcarray, nat, MPI_INFO_NULL);
>        MPI_File_read_all(cFile, mRead, nRows*nCols, MPI_DOUBLE, &status);
>      }
>      MPI_Barrier(MPI_COMM_WORLD);
>      MPI_File_close(&cFile);
>      MPI_Type_free(&dcarray);
>
>      //Check data
>      for(int i = 0; i < nRows; i++)
>      {
>        for(int j = 0; j < nCols; j++)
>        {
>          if(mRead[i*nCols + j] != 1)
>          {
>            std::cerr << "Error in data. " << repeat << " iteration." << std::endl;
>            MPI_Abort(MPI_COMM_WORLD, 1);
>          }
>        }
>      }
>    }
>
>    delete [] m;
>    delete [] mRead;
>
>    MPI_Finalize();
>
>    return 0;
> }
>
> Oriol
>
>

-- 
Rob Latham
Mathematics and Computer Science Division
Argonne National Lab, IL USA


