[mpich-discuss] bug in MPI_File_write_all?
Rob Latham
robl at mcs.anl.gov
Mon May 19 11:13:40 CDT 2014
On 05/19/2014 04:49 AM, CANELA-XANDRI Oriol wrote:
> Yes, I attach a test program below. The error can be reproduced by running it with 9 MPI processes.
>
Thank you for the test case.
I probably fixed this last week, which would explain why Rajeev no longer
sees it on his laptop. It is likely a bug in the way ROMIO flattens darray
datatypes.
Open MPI from last week dies with a "signal 7" on this test case.
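
By "flattens" I mean that ROMIO internally turns the darray datatype into a
list of (offset, length) file regions before doing the collective I/O, and it
is that conversion that looks suspect here. If it helps narrow things down,
here is a small sanity-check sketch (not part of the test program quoted
below; the names dcarray, nRows, nCols, and mpiRank are taken from it) that
could be dropped in right after the MPI_Type_commit call. It only confirms
that the committed darray type describes exactly nRows*nCols doubles on each
rank; if that holds, the problem is in the flattening / collective I/O path
rather than in how the type is constructed:

    // Check that the committed darray type covers this rank's local elements.
    int typeSize;
    MPI_Type_size(dcarray, &typeSize);
    if (typeSize != (int)(nRows * nCols * sizeof(double))) {
        std::cerr << "rank " << mpiRank << ": darray type size " << typeSize
                  << " != expected " << nRows * nCols * sizeof(double)
                  << " bytes" << std::endl;
    }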
==rob
> #include <mpi.h>
>
> #include <iostream>
> #include <sstream>
> #include <string>
> #include <vector>
>
> /**
>  * Get the number of local rows or columns owned by one process along a
>  * dimension of the block-cyclic distribution (analogous to ScaLAPACK's NUMROC).
>  */
> int getNumroc(int globalSize, int myProc, int nProcs, int blockSize)
> {
>     int myDist = myProc % nProcs;
>     int nBlocks = globalSize / blockSize;
>     int numroc = nBlocks / nProcs;
>     numroc *= blockSize;
>     int extraBlocks = nBlocks % nProcs;
>     if(myDist < extraBlocks)
>     {
>         numroc += blockSize;
>     }
>     else if(myDist == extraBlocks)
>     {
>         numroc += globalSize % blockSize;
>     }
>     return numroc;
> }
>
> int main(int argc, char **argv)
> {
>     //MPI vars
>     bool mpiRoot;
>     int mpiRank;
>     int mpiNumTasks;
>     char hostName[MPI_MAX_PROCESSOR_NAME];
>     int lenHostName;
>     int myProcRow;
>     int myProcCol;
>
>     // Initiate MPI
>     int tmp;
>     tmp = MPI_Init(&argc, &argv);
>     if (tmp != MPI_SUCCESS) {
>         std::cerr << "Error: MPI could not be started. Terminating." << std::endl;
>         MPI_Abort(MPI_COMM_WORLD, 1);
>     }
>     MPI_Comm_rank(MPI_COMM_WORLD, &mpiRank);
>     MPI_Comm_size(MPI_COMM_WORLD, &mpiNumTasks);
>     MPI_Get_processor_name(hostName, &lenHostName);
>
>     if(mpiNumTasks != 9)
>     {
>         std::cerr << "Error: This test program is designed to run with 9 MPI processes." << std::endl;
>         MPI_Abort(MPI_COMM_WORLD, 1);
>     }
>
>     // Coordinates of this process in the 3x3 process grid
>     myProcRow = mpiRank / 3;
>     myProcCol = mpiRank % 3;
>
>     double *m;     ///<A pointer to the local part of the distributed matrix
>     double *mRead; ///<A pointer to the local part of the matrix read back from the file
>
>     int nGlobRows = 5;  ///<Number of rows of the global matrix
>     int nGlobCols = 4;  ///<Number of columns of the global matrix
>     int nRows;          ///<Number of rows of the local matrix
>     int nCols;          ///<Number of columns of the local matrix
>     int nBlockRows = 1; ///<Number of rows of the distributed matrix blocks
>     int nBlockCols = 3; ///<Number of columns of the distributed matrix blocks
>
>     nRows = getNumroc(nGlobRows, myProcRow, 3, nBlockRows);
>     nCols = getNumroc(nGlobCols, myProcCol, 3, nBlockCols);
>
>     m = new double[nRows*nCols];
>     mRead = new double[nRows*nCols];
>     for(int i = 0; i < nRows; i++)
>     {
>         for(int j = 0; j < nCols; j++)
>         {
>             m[i*nCols + j] = 1;
>         }
>     }
>
>     for(int repeat = 0; repeat < 10; repeat++)
>     {
>         int dims[] = {nGlobRows, nGlobCols};
>         int dargs[] = {nBlockRows, nBlockCols};
>         int distribs[] = {MPI_DISTRIBUTE_CYCLIC, MPI_DISTRIBUTE_CYCLIC};
>         int dim[] = {3, 3};
>         char nat[] = "native";
>         int rc;
>         MPI_Datatype dcarray;
>         MPI_File cFile;
>         MPI_Status status;
>
>         MPI_Type_create_darray(mpiNumTasks, mpiRank, 2, dims, distribs, dargs, dim, MPI_ORDER_FORTRAN, MPI_DOUBLE, &dcarray);
>         MPI_Type_commit(&dcarray);
>
>         std::stringstream ss;
>         ss << "test_" << repeat << ".bin";
>         std::string fname = ss.str(); //"test.bin";
>         std::vector<char> fn(fname.begin(), fname.end());
>         fn.push_back('\0');
>         MPI_File_delete(&fn[0], MPI_INFO_NULL);
>
>         //Write file
>         rc = MPI_File_open(MPI_COMM_WORLD, &fn[0], MPI_MODE_EXCL | MPI_MODE_CREATE | MPI_MODE_WRONLY, MPI_INFO_NULL, &cFile);
>         if(rc){
>             std::cerr << "Error: Failed to open file." << std::endl;
>             MPI_Abort(MPI_COMM_WORLD, 1);
>         }
>         else
>         {
>             MPI_File_set_view(cFile, 0, MPI_DOUBLE, dcarray, nat, MPI_INFO_NULL);
>             MPI_File_write_all(cFile, m, nRows*nCols, MPI_DOUBLE, &status);
>         }
>         MPI_Barrier(MPI_COMM_WORLD);
>         MPI_File_close(&cFile);
>
>
>         // Initialize the matrix to 0 before reading
>         for(int i = 0; i < nRows; i++)
>         {
>             for(int j = 0; j < nCols; j++)
>             {
>                 mRead[i*nCols + j] = 0;
>             }
>         }
>         //Read file
>         rc = MPI_File_open(MPI_COMM_WORLD, &fn[0], MPI_MODE_RDONLY, MPI_INFO_NULL, &cFile);
>         if(rc){
>             std::cerr << "Error: Failed to open file." << std::endl;
>             MPI_Abort(MPI_COMM_WORLD, 1);
>         }
>         else
>         {
>             MPI_File_set_view(cFile, 0, MPI_DOUBLE, dcarray, nat, MPI_INFO_NULL);
>             MPI_File_read_all(cFile, mRead, nRows*nCols, MPI_DOUBLE, &status);
>         }
>         MPI_Barrier(MPI_COMM_WORLD);
>         MPI_File_close(&cFile);
>         MPI_Type_free(&dcarray);
>
>         //Check data
>         for(int i = 0; i < nRows; i++)
>         {
>             for(int j = 0; j < nCols; j++)
>             {
>                 if(mRead[i*nCols + j] != 1)
>                 {
>                     std::cerr << "Error in data at iteration " << repeat << "." << std::endl;
>                     MPI_Abort(MPI_COMM_WORLD, 1);
>                 }
>             }
>         }
>     }
>
>     delete [] m;
>     delete [] mRead;
>
>     MPI_Finalize();
>     return 0;
> }
>
> Oriol
>
>
--
Rob Latham
Mathematics and Computer Science Division
Argonne National Lab, IL USA