[mpich-discuss] Run mpich on localhost

Martin Ivanov marto1980 at gmail.com
Sun May 10 02:16:18 CDT 2020


Hello Tony,
Thank you very much for your reply. Here is the output of 'ps x', taken
while 'mpirun -n 5 mpich-3.0.4/examples/hellow' is frozen:

"
marto at dragonfly% ps x
   PID TT  STAT        TIME COMMAND
  1025 ??  I6s      0:00.14 /usr/local/bin/dbus-daemon --syslog-only --fork
--print-pid 5 --print-address 7 --session
  1028 ??  I4s      0:00.08 kdeinit5: Running... (kdeinit5)
  1029 ??  I2       0:00.26 /usr/local/lib/libexec/kf5/klauncher --fd=8
  1031 ??  I7       0:00.65 kded5
  1040 ??  I3       0:00.21 /usr/local/bin/kaccess
  1047 ??  I4       0:00.02 /usr/local/libexec/dconf-service
  1058 ??  I1       0:00.26 /usr/local/bin/ksmserver
  1061 ??  I6       0:00.38 /usr/local/bin/kglobalaccel5
  1067 ??  I7       0:00.10
/usr/local/lib/libexec/kf5/kscreen_backend_launcher
  1069 ??  I0       0:18.15 /usr/local/bin/kwin_x11 -session
100000000158893236700000019470008_1589011398_40052
  1071 ??  I3       0:07.36 /usr/local/bin/plasmashell
  1073 ??  I6       0:00.06 /usr/local/bin/xembedsniproxy
  1075 ??  I3       0:00.17
/usr/local/lib/libexec/polkit-kde-authentication-agent-1
  1080 ??  I6       0:00.05 /usr/local/bin/kwrited
  1084 ??  I5       0:00.07 /usr/local/bin/gmenudbusmenuproxy
  1093 ??  I2       0:00.24 /usr/local/lib/libexec/DiscoverNotifier
-session 1014ce0c7d3000158899951500000013340005_1589011398_16101
  1121 ??  I2       0:00.45 /usr/local/bin/korgac -session
100000000158893236700000019470009_1589011398_16368
  1128 ??  I2       0:00.00 /usr/local/libexec/at-spi-bus-launcher
  1130 ??  I3       0:00.24 /usr/local/lib/libexec/kactivitymanagerd
  1131 ??  S5       0:00.11 /usr/local/bin/dbus-daemon
--config-file=/usr/local/share/defaults/at-spi2/accessibility.conf --nofork
--print-address 3
  1134 ??  I2       0:00.07 /usr/local/libexec/at-spi2-registryd
--use-gnome-session
  1136 ??  I1       0:07.00 /usr/local/lib/thunderbird/thunderbird
--sm-client-id 100000000158893248800000019470014
  1140 ??  S5       0:01.73 /usr/local/bin/gkrellm --sm-client-id
100000000158893266700000019470017
  1142 ??  I6       0:00.54 /usr/local/bin/dolphin -session
100000000158893337700000019470020_1589011398_16282
  1144 ??  I7       0:00.35 /usr/local/bin/kmix -session
1014ce0c7d3000158899952000000013340007_1589011398_17041
  1146 ??  I0       0:01.01 /usr/local/bin/konsole -session
1014ce0c7d3000158899955500000013340009_1589011398_16527
  1152 ??  I1       0:00.24 /usr/local/lib/libexec/org_kde_powerdevil
  1214 ??  I4       0:00.00 kdeinit5: file.so file
local:/var/run/user/1001/klauncherPKUbmP.1.slave-socket
local:/var/run/user/1001/kio_desktopMgyNoK.1.slave-socket (kdeinit5)
  1216 ??  Z        0:00.00 (sh)
  1217 ??  Z        0:00.00 (sh)
  1218 ??  Z        0:00.00 (sh)
  1219 ??  Z        0:00.01 (sh)
  1220 ??  Z        0:00.00 (sh)
  1221 ??  Z        0:00.01 (sh)
  1222 ??  Z        0:00.00 (sh)
  1223 ??  Z        0:00.00 (sh)
  1224 ??  Z        0:00.01 (sh)
  1225 ??  Z        0:00.00 (sh)
  1226 ??  Z        0:00.00 (sh)
  1227 ??  Z        0:00.00 (sh)
  1228 ??  Z        0:00.00 (sh)
  1229 ??  Z        0:00.00 (sh)
  1230 ??  Z        0:00.00 (sh)
  1231 ??  Z        0:00.00 (sh)
  1232 ??  Z        0:00.01 (sh)
  1233 ??  Z        0:00.00 (sh)
  1234 ??  Z        0:00.01 (sh)
  1235 ??  Z        0:00.00 (sh)
  1277 ??  S4       0:00.00 kdeinit5: file.so file
local:/var/run/user/1001/klauncherPKUbmP.1.slave-socket
local:/var/run/user/1001/kded5wwlydC.1.slave-socket (kdeinit5)
  1278 ??  I7       0:05.48 chrome:  (chrome)
  1281 ??  S1       0:01.06 chrome: --type=utility
--field-trial-handle=9865741493051962933,5833454360386249874,131072
--lang=en-US --service-sandbox-type=network
--disable-webrtc-apm-in-audio-service --shared
  1282 ??  I0       0:02.61 chrome: --type=gpu-process
--field-trial-handle=9865741493051962933,5833454360386249874,131072
--gpu-preferences=MAAAAAAAAAAgAAAAAAAAAAAAAAAAAAAAAABgAAAAAAAQAAAAAAAAAAAAAAAAAAAACAAA
  1302 ??  I0       0:00.67 chrome: --type=renderer
--disable-webrtc-apm-in-audio-service
--field-trial-handle=9865741493051962933,5833454360386249874,131072
--disable-gpu-compositing --lang=en-US --enable-aut
  1315 ??  I6       0:14.93 chrome: --type=renderer
--disable-webrtc-apm-in-audio-service
--field-trial-handle=9865741493051962933,5833454360386249874,131072
--disable-gpu-compositing --lang=en-US --enable-aut
  1320 ??  I1       0:00.26 chrome: --type=renderer
--disable-webrtc-apm-in-audio-service
--field-trial-handle=9865741493051962933,5833454360386249874,131072
--disable-gpu-compositing --lang=en-US --enable-aut
  1321 ??  I4       0:00.20 chrome: --type=renderer
--disable-webrtc-apm-in-audio-service
--field-trial-handle=9865741493051962933,5833454360386249874,131072
--disable-gpu-compositing --lang=en-US --enable-aut
  1322 ??  I5       0:00.25 chrome: --type=utility
--field-trial-handle=9865741493051962933,5833454360386249874,131072
--lang=en-US --service-sandbox-type=audio
--disable-webrtc-apm-in-audio-service --shared-f
  1422 ??  I2s      0:00.00
/home/marto/WRF/Build_WRF/LIBRARIES/mpich/bin/hydra_pmi_proxy
--control-port dragonfly:2416 --rmk user --launcher ssh --demux poll --pgid
0 --retries 10 --usize -2 --proxy-id 0
  1423 ??  R4s      1:11.89 mpich-3.0.4/examples/hellow
  1424 ??  I2s      0:00.00 mpich-3.0.4/examples/hellow
  1425 ??  R3s      1:11.04 mpich-3.0.4/examples/hellow
  1426 ??  R2s      1:11.39 mpich-3.0.4/examples/hellow
  1427 ??  R6s      1:11.96 mpich-3.0.4/examples/hellow
   972 v0  I0       0:00.00 -tcsh (tcsh)
   977 v0  I0+      0:00.00 /bin/sh /usr/local/bin/startx
  1001 v0  I0+      0:00.00 xinit /home/marto/.xinitrc -- /usr/local/bin/X
:0 -auth /home/marto/.serverauth.977
  1004 v0  I0       0:00.00 ck-launch-session startplasma-x11
  1013 v0  I1       0:00.03 startplasma-x11
  1024 v0  I6       0:00.00 dbus-launch --autolaunch
7a25735143fe35ec86d2d9be5eac81cc --binary-syntax --close-stderr
  1043 v0  I2       0:00.13 /usr/local/bin/plasma_session
  1196  1  I2s      0:00.07 /bin/tcsh
  1421  1  I2+      0:00.00 mpirun -n 5 mpich-3.0.4/examples/hellow
(mpiexec.hydra)
  1199  2  I5s+     0:00.04 /bin/tcsh
  1194  3  S6s      0:00.04 /bin/tcsh
  1429  3  R6+      0:00.00 ps x
"

After that freeze, I killed mpirun with Ctrl + C:
"
marto at dragonfly% mpirun -n 5 mpich-3.0.4/examples/hellow
^C[mpiexec at dragonfly] Sending Ctrl-C to processes as requested
[mpiexec at dragonfly] Press Ctrl-C again to force abort
[proxy:0:0 at dragonfly] HYDT_dmxu_poll_wait_for_event
(./tools/demux/demux_poll.c:71): assert (!(pollfds[i].revents & ~POLLIN &
~POLLOUT & ~POLLHUP & ~POLLERR)) failed
[proxy:0:0 at dragonfly] main (./pm/pmiserv/pmip.c:206): demux engine error
waiting for event
[mpiexec at dragonfly] control_cb (./pm/pmiserv/pmiserv_cb.c:202): assert
(!closed) failed
[mpiexec at dragonfly] HYDT_dmxu_poll_wait_for_event
(./tools/demux/demux_poll.c:77): callback returned error status
[mpiexec at dragonfly] HYD_pmci_wait_for_completion
(./pm/pmiserv/pmiserv_pmci.c:197): error waiting for event
[mpiexec at dragonfly] main (./ui/mpich/mpiexec.c:331): process manager error
waiting for completion
"

The next relaunch of mpirun with 5 cores was successful. For completeness,
I am attaching the output of 'ps x' taken after mpirun with 2 cores freezes,
which it always does with 2 cores.

I hope this was helpful. I am looking forward to your reply.

Best regards,
Martin


On Sat, May 9, 2020 at 2:15 PM Tony Curtis <anthony.curtis at stonybrook.edu>
wrote:

>
>
> On May 9, 2020, at 2:24 AM, Martin Ivanov <marto1980 at gmail.com> wrote:
>
> Hello Tony,
> Thank you very much for your reply. I followed your advice and gave the
> hostname 'dragonfly' to my machine. Then, in /etc/hosts, I gave the
> alias 'dragonfly' to localhost as you suggested:
>
> "
> marto at dragonfly% cat /etc/hosts
> ::1                     localhost dragonfly
> 127.0.0.1               localhost dragonfly
> "
>
> I compiled both the icpi and hellow examples. Now mpiexec seems to work,
> although not as reliably as I might wish. Running any of the examples with
> one core like this:
>
> mpiexec -n 1 mpich-3.0.4/examples/hellow
>
> is always successful. I could never get the command to finish with 2, 3,
> or 4 cores: it simply hangs. E.g. with 2 cores after running the above
> command I get:
>
> "
> marto at dragonfly% ps x | grep hellow
> 166493 ??  I6s      0:00.00 mpich-3.0.4/examples/hellow
> 166494 ??  I6s      0:00.00 mpich-3.0.4/examples/hellow
> "
>
>
> Ok, good, that seemed like it was the obvious problem.  Can you show all
> the processes involved in the launch, not just the application (i.e. also
> the mpirun and anything that is spawned)?  This might provide a further
> clue about what is happening underneath.
>
> Tony
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.mpich.org/pipermail/discuss/attachments/20200510/5a55f6dd/attachment.html>
-------------- next part --------------
marto at dragonfly% ps x
   PID TT  STAT        TIME COMMAND
  1025 ??  I5s      0:00.14 /usr/local/bin/dbus-daemon --syslog-only --fork --print-pid 5 --print-address 7 --session
  1028 ??  I4s      0:00.08 kdeinit5: Running... (kdeinit5)
  1029 ??  I2       0:00.29 /usr/local/lib/libexec/kf5/klauncher --fd=8
  1031 ??  I7       0:00.71 kded5
  1040 ??  I3       0:00.25 /usr/local/bin/kaccess
  1047 ??  I4       0:00.02 /usr/local/libexec/dconf-service
  1058 ??  I1       0:00.30 /usr/local/bin/ksmserver
  1061 ??  I6       0:00.41 /usr/local/bin/kglobalaccel5
  1067 ??  I7       0:00.15 /usr/local/lib/libexec/kf5/kscreen_backend_launcher
  1069 ??  I3       0:36.35 /usr/local/bin/kwin_x11 -session 100000000158893236700000019470008_1589011398_40052
  1071 ??  I0       0:10.42 /usr/local/bin/plasmashell
  1073 ??  I6       0:00.10 /usr/local/bin/xembedsniproxy
  1075 ??  I3       0:00.19 /usr/local/lib/libexec/polkit-kde-authentication-agent-1
  1080 ??  I6       0:00.08 /usr/local/bin/kwrited
  1084 ??  I5       0:00.12 /usr/local/bin/gmenudbusmenuproxy
  1093 ??  I0       0:00.28 /usr/local/lib/libexec/DiscoverNotifier -session 1014ce0c7d3000158899951500000013340005_1589011398_16101
  1121 ??  I2       0:00.46 /usr/local/bin/korgac -session 100000000158893236700000019470009_1589011398_16368
  1128 ??  I2       0:00.00 /usr/local/libexec/at-spi-bus-launcher
  1130 ??  I3       0:00.27 /usr/local/lib/libexec/kactivitymanagerd
  1131 ??  S1       0:00.13 /usr/local/bin/dbus-daemon --config-file=/usr/local/share/defaults/at-spi2/accessibility.conf --nofork --print-address 3
  1134 ??  I2       0:00.12 /usr/local/libexec/at-spi2-registryd --use-gnome-session
  1136 ??  I6       0:07.17 /usr/local/lib/thunderbird/thunderbird --sm-client-id 100000000158893248800000019470014
  1140 ??  S4       0:04.79 /usr/local/bin/gkrellm --sm-client-id 100000000158893266700000019470017
  1142 ??  I6       0:01.00 /usr/local/bin/dolphin -session 100000000158893337700000019470020_1589011398_16282
  1144 ??  I7       0:00.36 /usr/local/bin/kmix -session 1014ce0c7d3000158899952000000013340007_1589011398_17041
  1146 ??  I6       0:02.00 /usr/local/bin/konsole -session 1014ce0c7d3000158899955500000013340009_1589011398_16527
  1152 ??  I1       0:00.29 /usr/local/lib/libexec/org_kde_powerdevil
  1216 ??  Z        0:00.00 (sh)
  1217 ??  Z        0:00.00 (sh)
  1218 ??  Z        0:00.00 (sh)
  1219 ??  Z        0:00.01 (sh)
  1220 ??  Z        0:00.00 (sh)
  1221 ??  Z        0:00.01 (sh)
  1222 ??  Z        0:00.00 (sh)
  1223 ??  Z        0:00.00 (sh)
  1224 ??  Z        0:00.01 (sh)
  1225 ??  Z        0:00.00 (sh)
  1226 ??  Z        0:00.00 (sh)
  1227 ??  Z        0:00.00 (sh)
  1228 ??  Z        0:00.00 (sh)
  1229 ??  Z        0:00.00 (sh)
  1230 ??  Z        0:00.00 (sh)
  1231 ??  Z        0:00.00 (sh)
  1232 ??  Z        0:00.01 (sh)
  1233 ??  Z        0:00.00 (sh)
  1234 ??  Z        0:00.01 (sh)
  1235 ??  Z        0:00.00 (sh)
  1277 ??  S4       0:00.00 kdeinit5: file.so file local:/var/run/user/1001/klauncherPKUbmP.1.slave-socket local:/var/run/user/1001/kded5wwlydC.1.slave-socket (kdeinit5)
  1278 ??  I1       0:07.66 chrome:  (chrome)
  1281 ??  S5       0:01.90 chrome: --type=utility --field-trial-handle=9865741493051962933,5833454360386249874,131072 --lang=en-US --service-sandbox-type=network --disable-webrtc-apm-in-audio-service --shared
  1282 ??  I0       0:04.88 chrome: --type=gpu-process --field-trial-handle=9865741493051962933,5833454360386249874,131072 --gpu-preferences=MAAAAAAAAAAgAAAAAAAAAAAAAAAAAAAAAABgAAAAAAAQAAAAAAAAAAAAAAAAAAAACAAA
  1302 ??  I1       0:00.67 chrome: --type=renderer --disable-webrtc-apm-in-audio-service --field-trial-handle=9865741493051962933,5833454360386249874,131072 --disable-gpu-compositing --lang=en-US --enable-aut
  1315 ??  I4       0:45.85 chrome: --type=renderer --disable-webrtc-apm-in-audio-service --field-trial-handle=9865741493051962933,5833454360386249874,131072 --disable-gpu-compositing --lang=en-US --enable-aut
  1320 ??  I5       0:00.27 chrome: --type=renderer --disable-webrtc-apm-in-audio-service --field-trial-handle=9865741493051962933,5833454360386249874,131072 --disable-gpu-compositing --lang=en-US --enable-aut
  1321 ??  I7       0:00.20 chrome: --type=renderer --disable-webrtc-apm-in-audio-service --field-trial-handle=9865741493051962933,5833454360386249874,131072 --disable-gpu-compositing --lang=en-US --enable-aut
  1322 ??  I4       0:00.25 chrome: --type=utility --field-trial-handle=9865741493051962933,5833454360386249874,131072 --lang=en-US --service-sandbox-type=audio --disable-webrtc-apm-in-audio-service --shared-f
  1430 ??  I0       0:00.00 kdeinit5: file.so file local:/var/run/user/1001/klauncherPKUbmP.1.slave-socket local:/var/run/user/1001/dolphinxqYMha.11.slave-socket (kdeinit5)
  1440 ??  I1       0:01.33 /usr/local/bin/gvim -f /home/marto/WRF/reply.txt (vim)
  1451 ??  S5s      0:00.01 /home/marto/WRF/Build_WRF/LIBRARIES/mpich/bin/hydra_pmi_proxy --control-port dragonfly:4616 --rmk user --launcher ssh --demux poll --pgid 0 --retries 10 --usize -2 --proxy-id 0
  1452 ??  R5s      0:08.50 mpich-3.0.4/examples/hellow
  1453 ??  S5s      0:00.00 mpich-3.0.4/examples/hellow
   972 v0  I0       0:00.00 -tcsh (tcsh)
   977 v0  I0+      0:00.00 /bin/sh /usr/local/bin/startx
  1001 v0  I0+      0:00.00 xinit /home/marto/.xinitrc -- /usr/local/bin/X :0 -auth /home/marto/.serverauth.977
  1004 v0  I0       0:00.00 ck-launch-session startplasma-x11
  1013 v0  I1       0:00.03 startplasma-x11
  1024 v0  I6       0:00.00 dbus-launch --autolaunch 7a25735143fe35ec86d2d9be5eac81cc --binary-syntax --close-stderr
  1043 v0  I2       0:00.13 /usr/local/bin/plasma_session
  1196  1  S1s      0:00.09 /bin/tcsh
  1450  1  S1+      0:00.00 mpirun -n 2 mpich-3.0.4/examples/hellow (mpiexec.hydra)
  1199  2  S3s      0:00.05 /bin/tcsh
  1454  2  R3+      0:00.00 ps x
  1194  3  I6s+     0:00.04 /bin/tcsh 


More information about the discuss mailing list