[mpich-discuss] Run mpich on localhost
Martin Ivanov
marto1980 at gmail.com
Sun May 10 02:16:18 CDT 2020
Hello Tony,
Thank you very much for your reply. Here is the output of 'ps x', taken
while 'mpirun -n 5 mpich-3.0.4/examples/hellow' was frozen:
"
marto at dragonfly% ps x
PID TT STAT TIME COMMAND
1025 ?? I6s 0:00.14 /usr/local/bin/dbus-daemon --syslog-only --fork
--print-pid 5 --print-address 7 --session
1028 ?? I4s 0:00.08 kdeinit5: Running... (kdeinit5)
1029 ?? I2 0:00.26 /usr/local/lib/libexec/kf5/klauncher --fd=8
1031 ?? I7 0:00.65 kded5
1040 ?? I3 0:00.21 /usr/local/bin/kaccess
1047 ?? I4 0:00.02 /usr/local/libexec/dconf-service
1058 ?? I1 0:00.26 /usr/local/bin/ksmserver
1061 ?? I6 0:00.38 /usr/local/bin/kglobalaccel5
1067 ?? I7 0:00.10
/usr/local/lib/libexec/kf5/kscreen_backend_launcher
1069 ?? I0 0:18.15 /usr/local/bin/kwin_x11 -session
100000000158893236700000019470008_1589011398_40052
1071 ?? I3 0:07.36 /usr/local/bin/plasmashell
1073 ?? I6 0:00.06 /usr/local/bin/xembedsniproxy
1075 ?? I3 0:00.17
/usr/local/lib/libexec/polkit-kde-authentication-agent-1
1080 ?? I6 0:00.05 /usr/local/bin/kwrited
1084 ?? I5 0:00.07 /usr/local/bin/gmenudbusmenuproxy
1093 ?? I2 0:00.24 /usr/local/lib/libexec/DiscoverNotifier
-session 1014ce0c7d3000158899951500000013340005_1589011398_16101
1121 ?? I2 0:00.45 /usr/local/bin/korgac -session
100000000158893236700000019470009_1589011398_16368
1128 ?? I2 0:00.00 /usr/local/libexec/at-spi-bus-launcher
1130 ?? I3 0:00.24 /usr/local/lib/libexec/kactivitymanagerd
1131 ?? S5 0:00.11 /usr/local/bin/dbus-daemon
--config-file=/usr/local/share/defaults/at-spi2/accessibility.conf --nofork
--print-address 3
1134 ?? I2 0:00.07 /usr/local/libexec/at-spi2-registryd
--use-gnome-session
1136 ?? I1 0:07.00 /usr/local/lib/thunderbird/thunderbird
--sm-client-id 100000000158893248800000019470014
1140 ?? S5 0:01.73 /usr/local/bin/gkrellm --sm-client-id
100000000158893266700000019470017
1142 ?? I6 0:00.54 /usr/local/bin/dolphin -session
100000000158893337700000019470020_1589011398_16282
1144 ?? I7 0:00.35 /usr/local/bin/kmix -session
1014ce0c7d3000158899952000000013340007_1589011398_17041
1146 ?? I0 0:01.01 /usr/local/bin/konsole -session
1014ce0c7d3000158899955500000013340009_1589011398_16527
1152 ?? I1 0:00.24 /usr/local/lib/libexec/org_kde_powerdevil
1214 ?? I4 0:00.00 kdeinit5: file.so file
local:/var/run/user/1001/klauncherPKUbmP.1.slave-socket
local:/var/run/user/1001/kio_desktopMgyNoK.1.slave-socket (kdeinit5)
1216 ?? Z 0:00.00 (sh)
1217 ?? Z 0:00.00 (sh)
1218 ?? Z 0:00.00 (sh)
1219 ?? Z 0:00.01 (sh)
1220 ?? Z 0:00.00 (sh)
1221 ?? Z 0:00.01 (sh)
1222 ?? Z 0:00.00 (sh)
1223 ?? Z 0:00.00 (sh)
1224 ?? Z 0:00.01 (sh)
1225 ?? Z 0:00.00 (sh)
1226 ?? Z 0:00.00 (sh)
1227 ?? Z 0:00.00 (sh)
1228 ?? Z 0:00.00 (sh)
1229 ?? Z 0:00.00 (sh)
1230 ?? Z 0:00.00 (sh)
1231 ?? Z 0:00.00 (sh)
1232 ?? Z 0:00.01 (sh)
1233 ?? Z 0:00.00 (sh)
1234 ?? Z 0:00.01 (sh)
1235 ?? Z 0:00.00 (sh)
1277 ?? S4 0:00.00 kdeinit5: file.so file
local:/var/run/user/1001/klauncherPKUbmP.1.slave-socket
local:/var/run/user/1001/kded5wwlydC.1.slave-socket (kdeinit5)
1278 ?? I7 0:05.48 chrome: (chrome)
1281 ?? S1 0:01.06 chrome: --type=utility
--field-trial-handle=9865741493051962933,5833454360386249874,131072
--lang=en-US --service-sandbox-type=network
--disable-webrtc-apm-in-audio-service --shared
1282 ?? I0 0:02.61 chrome: --type=gpu-process
--field-trial-handle=9865741493051962933,5833454360386249874,131072
--gpu-preferences=MAAAAAAAAAAgAAAAAAAAAAAAAAAAAAAAAABgAAAAAAAQAAAAAAAAAAAAAAAAAAAACAAA
1302 ?? I0 0:00.67 chrome: --type=renderer
--disable-webrtc-apm-in-audio-service
--field-trial-handle=9865741493051962933,5833454360386249874,131072
--disable-gpu-compositing --lang=en-US --enable-aut
1315 ?? I6 0:14.93 chrome: --type=renderer
--disable-webrtc-apm-in-audio-service
--field-trial-handle=9865741493051962933,5833454360386249874,131072
--disable-gpu-compositing --lang=en-US --enable-aut
1320 ?? I1 0:00.26 chrome: --type=renderer
--disable-webrtc-apm-in-audio-service
--field-trial-handle=9865741493051962933,5833454360386249874,131072
--disable-gpu-compositing --lang=en-US --enable-aut
1321 ?? I4 0:00.20 chrome: --type=renderer
--disable-webrtc-apm-in-audio-service
--field-trial-handle=9865741493051962933,5833454360386249874,131072
--disable-gpu-compositing --lang=en-US --enable-aut
1322 ?? I5 0:00.25 chrome: --type=utility
--field-trial-handle=9865741493051962933,5833454360386249874,131072
--lang=en-US --service-sandbox-type=audio
--disable-webrtc-apm-in-audio-service --shared-f
1422 ?? I2s 0:00.00
/home/marto/WRF/Build_WRF/LIBRARIES/mpich/bin/hydra_pmi_proxy
--control-port dragonfly:2416 --rmk user --launcher ssh --demux poll --pgid
0 --retries 10 --usize -2 --proxy-id 0
1423 ?? R4s 1:11.89 mpich-3.0.4/examples/hellow
1424 ?? I2s 0:00.00 mpich-3.0.4/examples/hellow
1425 ?? R3s 1:11.04 mpich-3.0.4/examples/hellow
1426 ?? R2s 1:11.39 mpich-3.0.4/examples/hellow
1427 ?? R6s 1:11.96 mpich-3.0.4/examples/hellow
972 v0 I0 0:00.00 -tcsh (tcsh)
977 v0 I0+ 0:00.00 /bin/sh /usr/local/bin/startx
1001 v0 I0+ 0:00.00 xinit /home/marto/.xinitrc -- /usr/local/bin/X
:0 -auth /home/marto/.serverauth.977
1004 v0 I0 0:00.00 ck-launch-session startplasma-x11
1013 v0 I1 0:00.03 startplasma-x11
1024 v0 I6 0:00.00 dbus-launch --autolaunch
7a25735143fe35ec86d2d9be5eac81cc --binary-syntax --close-stderr
1043 v0 I2 0:00.13 /usr/local/bin/plasma_session
1196 1 I2s 0:00.07 /bin/tcsh
1421 1 I2+ 0:00.00 mpirun -n 5 mpich-3.0.4/examples/hellow
(mpiexec.hydra)
1199 2 I5s+ 0:00.04 /bin/tcsh
1194 3 S6s 0:00.04 /bin/tcsh
1429 3 R6+ 0:00.00 ps x
"
After that freeze, I killed mpirun with Ctrl + C:
"
marto at dragonfly% mpirun -n 5 mpich-3.0.4/examples/hellow
^C[mpiexec at dragonfly] Sending Ctrl-C to processes as requested
[mpiexec at dragonfly] Press Ctrl-C again to force abort
[proxy:0:0 at dragonfly] HYDT_dmxu_poll_wait_for_event
(./tools/demux/demux_poll.c:71): assert (!(pollfds[i].revents & ~POLLIN &
~POLLOUT & ~POLLHUP & ~POLLERR)) failed
[proxy:0:0 at dragonfly] main (./pm/pmiserv/pmip.c:206): demux engine error
waiting for event
[mpiexec at dragonfly] control_cb (./pm/pmiserv/pmiserv_cb.c:202): assert
(!closed) failed
[mpiexec at dragonfly] HYDT_dmxu_poll_wait_for_event
(./tools/demux/demux_poll.c:77): callback returned error status
[mpiexec at dragonfly] HYD_pmci_wait_for_completion
(./pm/pmiserv/pmiserv_pmci.c:197): error waiting for event
[mpiexec at dragonfly] main (./ui/mpich/mpiexec.c:331): process manager error
waiting for completion
"
The next relaunch of mpirun with 5 cores was successful. For completeness,
I am attaching the output of 'ps x' taken after mpirun with 2 cores freezes,
which it always does with 2 cores.
I hope this was helpful. I am looking forward to your reply.
Best regards,
Martin
On Sat, May 9, 2020 at 2:15 PM Tony Curtis <anthony.curtis at stonybrook.edu>
wrote:
>
>
> On May 9, 2020, at 2:24 AM, Martin Ivanov <marto1980 at gmail.com> wrote:
>
> Hello Tony,
> Thank you very much for your reply. I followed your advice and gave the
> hostname 'dragonfly' to my machine. Then, in /etc/hosts, I gave the
> alias 'dragonfly' to localhost as you suggested:
>
> "
> marto at dragonfly% cat /etc/hosts
> ::1 localhost dragonfly
> 127.0.0.1 localhost dragonfly
> "
>
> I compiled both the icpi and hellow examples. Now mpiexec seems to work,
> although not as reliably as I might wish. Running any of the examples with
> one core like this:
>
> mpiexec -n 1 mpich-3.0.4/examples/hellow
>
> is always successful. I could never get the command to finish with 2, 3,
> or 4 cores: it simply hangs. E.g. with 2 cores after running the above
> command I get:
>
> "
> marto at dragonfly% ps x | grep hellow
> 166493 ?? I6s 0:00.00 mpich-3.0.4/examples/hellow
> 166494 ?? I6s 0:00.00 mpich-3.0.4/examples/hellow
> "
>
>
> Ok, good, that seemed like it was the obvious problem. Can you show all
> the processes involved in the launch, not just the application (i.e. also
> the mpirun and anything that is spawned)? This might provide a further
> clue about what is happening underneath.
>
> Tony
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.mpich.org/pipermail/discuss/attachments/20200510/5a55f6dd/attachment.html>
-------------- next part --------------
marto at dragonfly% ps x
PID TT STAT TIME COMMAND
1025 ?? I5s 0:00.14 /usr/local/bin/dbus-daemon --syslog-only --fork --print-pid 5 --print-address 7 --session
1028 ?? I4s 0:00.08 kdeinit5: Running... (kdeinit5)
1029 ?? I2 0:00.29 /usr/local/lib/libexec/kf5/klauncher --fd=8
1031 ?? I7 0:00.71 kded5
1040 ?? I3 0:00.25 /usr/local/bin/kaccess
1047 ?? I4 0:00.02 /usr/local/libexec/dconf-service
1058 ?? I1 0:00.30 /usr/local/bin/ksmserver
1061 ?? I6 0:00.41 /usr/local/bin/kglobalaccel5
1067 ?? I7 0:00.15 /usr/local/lib/libexec/kf5/kscreen_backend_launcher
1069 ?? I3 0:36.35 /usr/local/bin/kwin_x11 -session 100000000158893236700000019470008_1589011398_40052
1071 ?? I0 0:10.42 /usr/local/bin/plasmashell
1073 ?? I6 0:00.10 /usr/local/bin/xembedsniproxy
1075 ?? I3 0:00.19 /usr/local/lib/libexec/polkit-kde-authentication-agent-1
1080 ?? I6 0:00.08 /usr/local/bin/kwrited
1084 ?? I5 0:00.12 /usr/local/bin/gmenudbusmenuproxy
1093 ?? I0 0:00.28 /usr/local/lib/libexec/DiscoverNotifier -session 1014ce0c7d3000158899951500000013340005_1589011398_16101
1121 ?? I2 0:00.46 /usr/local/bin/korgac -session 100000000158893236700000019470009_1589011398_16368
1128 ?? I2 0:00.00 /usr/local/libexec/at-spi-bus-launcher
1130 ?? I3 0:00.27 /usr/local/lib/libexec/kactivitymanagerd
1131 ?? S1 0:00.13 /usr/local/bin/dbus-daemon --config-file=/usr/local/share/defaults/at-spi2/accessibility.conf --nofork --print-address 3
1134 ?? I2 0:00.12 /usr/local/libexec/at-spi2-registryd --use-gnome-session
1136 ?? I6 0:07.17 /usr/local/lib/thunderbird/thunderbird --sm-client-id 100000000158893248800000019470014
1140 ?? S4 0:04.79 /usr/local/bin/gkrellm --sm-client-id 100000000158893266700000019470017
1142 ?? I6 0:01.00 /usr/local/bin/dolphin -session 100000000158893337700000019470020_1589011398_16282
1144 ?? I7 0:00.36 /usr/local/bin/kmix -session 1014ce0c7d3000158899952000000013340007_1589011398_17041
1146 ?? I6 0:02.00 /usr/local/bin/konsole -session 1014ce0c7d3000158899955500000013340009_1589011398_16527
1152 ?? I1 0:00.29 /usr/local/lib/libexec/org_kde_powerdevil
1216 ?? Z 0:00.00 (sh)
1217 ?? Z 0:00.00 (sh)
1218 ?? Z 0:00.00 (sh)
1219 ?? Z 0:00.01 (sh)
1220 ?? Z 0:00.00 (sh)
1221 ?? Z 0:00.01 (sh)
1222 ?? Z 0:00.00 (sh)
1223 ?? Z 0:00.00 (sh)
1224 ?? Z 0:00.01 (sh)
1225 ?? Z 0:00.00 (sh)
1226 ?? Z 0:00.00 (sh)
1227 ?? Z 0:00.00 (sh)
1228 ?? Z 0:00.00 (sh)
1229 ?? Z 0:00.00 (sh)
1230 ?? Z 0:00.00 (sh)
1231 ?? Z 0:00.00 (sh)
1232 ?? Z 0:00.01 (sh)
1233 ?? Z 0:00.00 (sh)
1234 ?? Z 0:00.01 (sh)
1235 ?? Z 0:00.00 (sh)
1277 ?? S4 0:00.00 kdeinit5: file.so file local:/var/run/user/1001/klauncherPKUbmP.1.slave-socket local:/var/run/user/1001/kded5wwlydC.1.slave-socket (kdeinit5)
1278 ?? I1 0:07.66 chrome: (chrome)
1281 ?? S5 0:01.90 chrome: --type=utility --field-trial-handle=9865741493051962933,5833454360386249874,131072 --lang=en-US --service-sandbox-type=network --disable-webrtc-apm-in-audio-service --shared
1282 ?? I0 0:04.88 chrome: --type=gpu-process --field-trial-handle=9865741493051962933,5833454360386249874,131072 --gpu-preferences=MAAAAAAAAAAgAAAAAAAAAAAAAAAAAAAAAABgAAAAAAAQAAAAAAAAAAAAAAAAAAAACAAA
1302 ?? I1 0:00.67 chrome: --type=renderer --disable-webrtc-apm-in-audio-service --field-trial-handle=9865741493051962933,5833454360386249874,131072 --disable-gpu-compositing --lang=en-US --enable-aut
1315 ?? I4 0:45.85 chrome: --type=renderer --disable-webrtc-apm-in-audio-service --field-trial-handle=9865741493051962933,5833454360386249874,131072 --disable-gpu-compositing --lang=en-US --enable-aut
1320 ?? I5 0:00.27 chrome: --type=renderer --disable-webrtc-apm-in-audio-service --field-trial-handle=9865741493051962933,5833454360386249874,131072 --disable-gpu-compositing --lang=en-US --enable-aut
1321 ?? I7 0:00.20 chrome: --type=renderer --disable-webrtc-apm-in-audio-service --field-trial-handle=9865741493051962933,5833454360386249874,131072 --disable-gpu-compositing --lang=en-US --enable-aut
1322 ?? I4 0:00.25 chrome: --type=utility --field-trial-handle=9865741493051962933,5833454360386249874,131072 --lang=en-US --service-sandbox-type=audio --disable-webrtc-apm-in-audio-service --shared-f
1430 ?? I0 0:00.00 kdeinit5: file.so file local:/var/run/user/1001/klauncherPKUbmP.1.slave-socket local:/var/run/user/1001/dolphinxqYMha.11.slave-socket (kdeinit5)
1440 ?? I1 0:01.33 /usr/local/bin/gvim -f /home/marto/WRF/reply.txt (vim)
1451 ?? S5s 0:00.01 /home/marto/WRF/Build_WRF/LIBRARIES/mpich/bin/hydra_pmi_proxy --control-port dragonfly:4616 --rmk user --launcher ssh --demux poll --pgid 0 --retries 10 --usize -2 --proxy-id 0
1452 ?? R5s 0:08.50 mpich-3.0.4/examples/hellow
1453 ?? S5s 0:00.00 mpich-3.0.4/examples/hellow
972 v0 I0 0:00.00 -tcsh (tcsh)
977 v0 I0+ 0:00.00 /bin/sh /usr/local/bin/startx
1001 v0 I0+ 0:00.00 xinit /home/marto/.xinitrc -- /usr/local/bin/X :0 -auth /home/marto/.serverauth.977
1004 v0 I0 0:00.00 ck-launch-session startplasma-x11
1013 v0 I1 0:00.03 startplasma-x11
1024 v0 I6 0:00.00 dbus-launch --autolaunch 7a25735143fe35ec86d2d9be5eac81cc --binary-syntax --close-stderr
1043 v0 I2 0:00.13 /usr/local/bin/plasma_session
1196 1 S1s 0:00.09 /bin/tcsh
1450 1 S1+ 0:00.00 mpirun -n 2 mpich-3.0.4/examples/hellow (mpiexec.hydra)
1199 2 S3s 0:00.05 /bin/tcsh
1454 2 R3+ 0:00.00 ps x
1194 3 I6s+ 0:00.04 /bin/tcsh
More information about the discuss
mailing list