// shmem_check.cpp // // This is a minimal benchmark to test the raw bandwidth of MPI communication // between two processes on the same node, using only host (CPU) memory. // It completely removes CUDA to isolate the performance of the MPI library's // on-node communication mechanism (e.g., shared memory vs. TCP loopback). // // Compile/run: // /opt/mpich/4.2.1-cpu/bin/mpicxx -std=c++17 -I/opt/mpich/4.2.1-cpu/include shmem_check.cpp -o shmem_check // /opt/mpich/4.2.1-cpu/bin/mpiexec -np 2 -genv FI_PROVIDER=shm -genv FI_LOG_LEVEL=debug ./shmem_check 2>&1 | tee debug_output.txt #include #include #include #include int main(int argc, char* argv[]) { MPI_Init(&argc, &argv); int rank, size; MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &size); if (size != 2) { if (rank == 0) { std::cerr << "Error: This program must be run with exactly 2 MPI processes." << std::endl; } MPI_Finalize(); return 1; } const int num_samples = 100; const long long packet_size = 1LL << 28; // 256 MB // Allocate standard host memory. 'new' is sufficient. char* buffer = new char[packet_size]; if (rank == 0) { std::cout << "--- Starting Host-to-Host MPI Bandwidth Test ---" << std::endl; std::cout << "Packet Size: " << (packet_size / (1024*1024)) << " MB" << std::endl; } std::vector timings; for (int i = 0; i < num_samples; ++i) { MPI_Barrier(MPI_COMM_WORLD); double start_time = MPI_Wtime(); if (rank == 0) { MPI_Send(buffer, packet_size, MPI_CHAR, 1, 0, MPI_COMM_WORLD); MPI_Recv(buffer, 1, MPI_CHAR, 1, 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE); // Wait for confirmation } else { // rank == 1 MPI_Recv(buffer, packet_size, MPI_CHAR, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); MPI_Send(buffer, 1, MPI_CHAR, 0, 1, MPI_COMM_WORLD); // Send confirmation } double end_time = MPI_Wtime(); if (i >= 10) { // Discard warmup runs timings.push_back(end_time - start_time); } } if (rank == 0) { double total_time = std::accumulate(timings.begin(), timings.end(), 0.0); double avg_time = total_time / timings.size(); double bandwidth = (static_cast(packet_size) / (1024.0 * 1024.0 * 1024.0)) / avg_time; std::cout << "------------------------------------------------" << std::endl; std::cout << "Average Host-to-Host Bandwidth: " << bandwidth << " GB/s" << std::endl; std::cout << "------------------------------------------------" << std::endl; } // Clean up host memory delete[] buffer; MPI_Finalize(); return 0; }