[Wien] Problems with kpoint-parallelization / SUSE 9.2

Ulrich Keßler ukessler at uni-bonn.de
Wed Aug 3 14:55:44 CEST 2005


Dear Wien users,

I have a problem that seems similar to the one discussed by Stefaan, 
Alex and Peter at the beginning of June and the one reported by Andrea 
on 27.04.05.

I have two (rather old) Athlons (dobby and vienna), SUSE 9.2, Wien 5.5 
compiled with ifort 8.1 and mkl 7.2.1, my standard shell is bash.
When running in single mode both machines work fine.

To speed things up I tried the k-point parallelization.
Passwordless ssh is enabled via the private/public key method (RSA and
DSA) and, as far as I can judge, works without problems for "normal" ssh logins.
The working directory (physically on dobby) is correctly mounted, 
reading and writing is possible from both machines under the same path.

When using parallel mode (started on dobby) the following steps work:
a) lapw0 (run only on dobby)
b) reading and processing of the .machines file in lapw1para_lapw 
(.machines is: 2: vienna, 3: dobby, granularity:1)
c) splitting of the klist, yields case.klist_1 and case.klist_2
d) the ssh command itself starts lapw1 on both computers (within the kloop)

Then the two lapw1 processes crash with segmentation faults; the time output
reports about 1 s spent in lapw1.
The lapw1para_lapw script continues to run and stops after the wait
statement with the messages:
all processes done (or so)
**LAPW1 crashed

I don't think it's a tcsh problem as in Stefaan's case, because I have
installed tcsh V. 6.14.00 as Alex suggested. Alex's test script
shows no problem. The wait statement is executed without problems (or the
problem cannot arise because lapw1 has already crashed by then).

Moreover, trying to start the lapw1 jobs from the bash by entering
ssh vienna cd /home/wien/lapw/GGA;lapw1 lapw1_1.def
leads to the same result (crash of lapw1).

So I conclude that there is an ssh problem. Andrea (he had the crash in
lapw2) solved it by doing the ssh from a non-SUSE machine. I didn't try
that because I have none.
I think (and hope) it is some small, stupid fault connected with the
ssh settings that prevents lapw1 from reading/writing some files or memory.
Does anyone who runs in parallel on SUSE have any ideas or suggestions about this?

Thank you
Uli

P.S.: The STDOUT (quite long because the x flag is set in the first line of
lapw1para_lapw) and the dayfile are attached below.


Dr. Ulrich Keßler
Institut für Anorganische Chemie
Universität Bonn
Gerhard-Domagk-Str. 1
D-53121 Bonn
Germany
Tel.: (+49) (0)228/73-5334
Fax: (+49) (0)228/73-5660


Dayfile:


Calculating GGA in /home/wien/lapw/GGA
on dobby with PID 5399

    start     (Mi Aug  3 13:51:24 CEST 2005) with lapw0 (20/20 to go)

    cycle 1     (Mi Aug  3 13:51:24 CEST 2005)     (20/20 to go)

 >   lapw0 -p    (13:51:24) starting parallel lapw0 at Mi Aug  3 
13:51:24 CEST 2005
--------
running lapw0 in single mode
19.007u 0.335s 0:20.68 93.4%    0+0k 0+0io 0pf+0w
 >   lapw1  -p     (13:51:44) name /opt/WIEN2k/lapw1para
bin /opt/WIEN2k
starting parallel lapw1 at Mi Aug  3 13:51:44 CEST 2005
->  starting parallel LAPW1 jobs at Mi Aug  3 13:51:45 CEST 2005
running LAPW1 in parallel mode (using .machines)
Number of parallel jobs:  2
2 number_of_parallel_jobs

lapw1_1.def erzeugt
init:vienna
init:dobby
1 : vienna :  2 : 1 : 1
.lock_vienna1

lapw1_2.def erzeugt
init:vienna
init:dobby
1 : vienna :  2 : 1 : 1
2 : dobby :  3 : 1 : 2
.lock_dobby2
try again
**  LAPW1 crashed!
0.226u 0.235s 0:04.42 10.1%    0+0k 0+0io 0pf+0w

 >   stop error


STDOUT:

 LAPW0 END
set tmp = .tmp_lapw1para.5508
set tmp2 = .tmp_testpara_new.5508_2
onintr exit
set name = /opt/WIEN2k/lapw1para
echo name /opt/WIEN2k/lapw1para
set bin = /opt/WIEN2k
echo bin /opt/WIEN2k
if ! ( -d /opt/WIEN2k ) set bin = .
unalias rm
alias testinput if (! -e !:1 || -z !:1) goto !:2
alias testerror if (! -z !:1.error) goto error
set updn
set dnup = dn
set sc
set cmplx
set t = time
set remote = ssh
set log = :parallel
set granular = granularity:
set res = residue:
set ef = extrafine:
set kpl
unset resok
unset extrafine
set granularity = 3
set useremote = 1
set delay = 1
set sleepy = 1
set debug = 0
if ( -e /opt/WIEN2k/parallel_options ) then
source /opt/WIEN2k/parallel_options
setenv USE_REMOTE 1
setenv WIEN_GRANULARITY 3
endif
if ( 1 ) then
set useremote = 1
endif
if ( 1 ) then
set granularity = 3
endif
if ( 0 ) then
if ( 0 ) then
set mpirun=mpirun -np _NP_ _EXEC_
endif
if ( 1 < 1 ) then
setenv PWD `pwd|sed "s/tmp_mnt\///"`
pwd
sed s/tmp_mnt\///
if ( -e .processes ) rm .processes
touch .lock_
foreach i ( .lock_* )
rm .lock_
end
while ( 1 )
switch ( lapw1.def )
set def = lapw1
shift
breaksw
end
while ( 0 )
set exe = /opt/WIEN2k/lapw1
set exe = lapw1
echo **  Error in Parallel LAPW1
if ( ! -e .machines || -z .machines ) goto single
echo starting parallel lapw1 at `date`
date
echo starting parallel lapw1 at `date`
date
echo RUNNING
if ( -e lapw1.error ) rm *lapw1*.error
rm lapw1_1.error lapw1_2.error lapw1.error
if ( -e uplapw1.error ) rm *lapw1*.error
if ( -e dnlapw1.error ) rm *lapw1*.error
if ( -e lapw1_1.error ) rm *lapw1_*.error
if ( -e uplapw1_1.error ) rm *lapw1_*.error
if ( -e dnlapw1_1.error ) rm *lapw1_*.error
if ( -e .time1_1 ) rm .time1_*
rm .time1_1 .time1_2
set case = `pwd`
pwd
set case = GGA
if ( GGA ==  ) then
set caseklist=`grep klist $def.def | cut -f 2 -d,| cut -f 2 -d\' `
grep klist lapw1.def
cut -f 2 -d,
cut -f 2 -d'
set test = `grep "K-VECTORS" $case.in1${cmplx}|cut -c21`
grep K-VECTORS GGA.in1
cut -c21
if ( 4 != 4 ) then
echo ->  starting parallel LAPW1 jobs at `date`
date
if ( 0 > 0 ) echo `date` -> Setting up case GGA for parallel execution
if ( 0 > 0 ) echo `date` -> of LAPW1
if ( 0 > 0 ) echo `date` ->
if ( == up || == dn ) then
if ( 0 > 1 ) echo `date` -> non sp
cp .machines .tmp_lapw1para.5508
echo running LAPW1 in parallel mode (using .machines)
endif
grep : .tmp_lapw1para.5508
grep -v #
grep -v lapw0
grep -v granular
grep -v residue
grep -v extrafine
awk -F: {print "init:" $2}
set weigh = `cut -f1 -d: $tmp2 | xargs`
xargs
cut -f1 -d: .tmp_testpara_new.5508_2
set machine = `cut -f2 -d: $tmp2 | xargs`
xargs
cut -f2 -d: .tmp_testpara_new.5508_2
set lockfile = `cut -f2 -d: $tmp2 | awk '{print $1 NR}'|xargs`
awk {print $1 NR}
xargs
cut -f2 -d: .tmp_testpara_new.5508_2
set wweigh = ( 2 3 )
set mist = `wc $tmp2 `
wc .tmp_testpara_new.5508_2
set proc = 2
unset mist
set mist = `grep $granular $tmp | grep -v '#'| cut -f2 -d:`
grep -v #
cut -f2 -d:
grep granularity: .tmp_lapw1para.5508
if ( 1 !=  ) then
if ( 0 > 0 ) echo Granularity set to 1
set granularity = 1
endif
set mist = `grep $ef $tmp |grep -v '#'| cut -f2 -d:`
grep extrafine: .tmp_lapw1para.5508
grep -v #
cut -f2 -d:
if ( == 1 ) then
if ( 0 ) then
if ( 0 > 0 ) echo Extrafine unset
unset extrafine
endif
set mist = `grep $res $tmp |grep -v '#'| cut -f2 -d:`
grep -v #
cut -f2 -d:
grep residue: .tmp_lapw1para.5508
if ( !=  ) then
unset residue
endif
unset mist
set i = 0
set sumw = `awk -F: '{sumw += $1};END {print sumw}' $tmp`
awk -F: {sumw += $1};END {print sumw} .tmp_lapw1para.5508
sed /END/q
set mist = `wc $caseklist.tmp`
wc GGA.klist.tmp
set klist = 6
@ klist --
set i = 1
set sumn = 0
while ( 1 < = 2 )
@ weigh[1] *= 5
@ weigh[1] /= 5
@ weigh[1] /= 1
if ( 2 == 0 ) then
@ sumn += 2
@ i ++
end
while ( 2 < = 2 )
@ weigh[2] *= 5
@ weigh[2] /= 5
@ weigh[2] /= 1
if ( 3 == 0 ) then
@ sumn += 3
@ i ++
end
while ( 3 < = 2 )
if ( 0 ) then
if ( 0 > 0 ) then
if ( 0 ) then
if ( 0 > 0 ) echo `date` -> Splitting GGA.klist.tmp into junks
head -1 GGA.klist.tmp
cut -c36-
cat
@ multi = 2 * 1
set TMP=.machinetmp
if ( -f .machinetmp ) rm .machinetmp
if ( 0 ) then
grep ^[0-9][0-9]*:.* .machines
sed -e s/^[0-9]*:// -e s/^ *// -e s/ *$//
set weight_per_job=`grep '^[0-9][0-9]*:.*' .machines | sed -e 
's/^\([0-9]*\):.*/\1/' | bc`
sed -e s/^\([0-9]*\):.*/\1/
grep ^[0-9][0-9]*:.* .machines
bc
set number_of_p_jobs=`wc -l $TMP | sed -e 's/^ *//' | cut -d ' ' -f 1`
wc -l .machinetmp
sed -e s/^ *//
cut -d   -f 1
echo Number of parallel jobs:  2
sed -e s/\-/\./g .machinetmp
if ( 0 > 0 ) more .machinetmp222
set TMP=.machinetmp222
if ( 0 > 0 ) echo .machinetmp222
set number_per_job=`sed -e 's/[     ]*$//' -e 's/^[     ]*//' -e 
's/[a-zA-Z0-9\.]*:\([0-9]*\)/\1/g' -e 's/[ ^]*[a-zA-Z][a-zA-Z0-9\.]*/ 
1/g' $TMP | sed -e 's/^ //' -e 's/  */+/g' |bc`
sed -e s/[     ]*$// -e s/^[     ]*// -e 
s/[a-zA-Z0-9\.]*:\([0-9]*\)/\1/g -e s/[ ^]*[a-zA-Z][a-zA-Z0-9\.]*/ 1/g 
.machinetmp222
sed -e s/^ // -e s/  */+/g
bc
set TMP=.machinetmp
set i = 1
echo 2 number_of_parallel_jobs
while ( 1 < = 2 )
sed -e 1p -e d .machinetmp
sed -e /\(^[a-zA-Z0-9]*$\)/p -e /\(^[a-zA-Z0-9]*:[0-9]*$\)/p -e s/  */\
/gp -e d
@ i ++
end
while ( 2 < = 2 )
sed -e 2p -e d .machinetmp
sed -e /\(^[a-zA-Z0-9]*$\)/p -e /\(^[a-zA-Z0-9]*:[0-9]*$\)/p -e s/  */\
/gp -e d
@ i ++
end
while ( 3 < = 2 )
rm .machinetmp
set i=1
while ( 1 < = 2 )
set x=`cut -d: -f2 -s .machine$i`
cut -d: -f2 -s .machine1
if (  !=  ) then
@ i ++
end
while ( 2 < = 2 )
set x=`cut -d: -f2 -s .machine$i`
cut -d: -f2 -s .machine2
if (  !=  ) then
@ i ++
end
while ( 3 < = 2 )
set loop = 0
set kbegin = 1
set endloop = 0
kloop:
set p = 1
if ( 0 && 0 ) set p = 2
while ( 1 < = 2 )
if ! ( -e .lock_vienna1 ) then
if ( 1 > 5 ) goto endkloop
@ loop ++
if ( 0 > 0 ) echo prepare 1 on vienna
if ( 0 > 0 ) echo `date` -> Creating klist 1
set kold = 1
if ( 1 > 2 && 0 ) then
@ head = 1 + 2 - 1
set tail = 2
@ kbegin = 1 + 2
endif
if ( 2 > = 5 ) then
set kpl = ( 2 )
if ( 0 > 1 ) echo 1 : 2k (vienna, 2)
head -2 GGA.klist.tmp
tail -2
echo END
cut -c-35 GGA.klist_1
sed 1r head.diff
sed -f script .tmp_lapw1para.5508
if ( 0 > 1 ) echo `date` -> 
if ( 0 > 1 ) echo `date` -> creating lapw1_1.def: 
cp lapw1.def .tmp_lapw1para.5508
cat
sed -f .script .tmp_lapw1para.5508
if ( 1 != 1 ) then
echo
rm .mist .tmp_lapw1para.5508
echo lapw1_1.def erzeugt
echo -n vienna(2)
echo 1 : vienna :  2 : 1 : 1
cat .processes
echo .lock_vienna1
echo
if ( 1 == 1 ) then
if ( 1 == 1 ) then
ssh vienna cd /home/wien/lapw/GGA;time lapw1 lapw1_1.def;rm -f .lock_vienna1
else
else
hostname
jobs -l
endif
@ p ++
if ( 0 ) then
sleep 1
end
while ( 2 < = 2 )
if ! ( -e .lock_dobby2 ) then
if ( 3 > 5 ) goto endkloop
@ loop ++
if ( 0 > 0 ) echo prepare 2 on dobby
if ( 0 > 0 ) echo `date` -> Creating klist 2
set kold = 3
if ( 2 > 2 && 0 ) then
@ head = 3 + 3 - 1
set tail = 3
@ kbegin = 3 + 3
endif
if ( 5 > = 5 ) then
set head = 5
@ tail = 5 - 3 - 1
endif
set kpl = ( 2 3 )
if ( 0 > 1 ) echo 2 : 3k (dobby, 3)
head -5 GGA.klist.tmp
tail -3
echo END
cut -c-35 GGA.klist_2
sed -f script .tmp_lapw1para.5508
if ( 0 > 1 ) echo `date` -> 
if ( 0 > 1 ) echo `date` -> creating lapw1_2.def: 
cp lapw1.def .tmp_lapw1para.5508
cat
sed -f .script .tmp_lapw1para.5508
if ( 2 != 1 ) then
cat
ed lapw1_2.def
endif
echo
rm .mist .tmp_lapw1para.5508
echo lapw1_2.def erzeugt
echo -n dobby(3)
echo 2 : dobby :  3 : 1 : 2
cat .processes
echo .lock_dobby2
echo
if ( 1 == 1 ) then
if ( 1 == 1 ) then
else
else
ssh dobby cd /home/wien/lapw/GGA;time lapw1 lapw1_2.def;rm -f .lock_dobby2
hostname
jobs -l
endif
@ p ++
if ( 0 ) then
sleep 1
bash: line 1: 21107 Segmentation fault      lapw1 lapw1_1.def

real    0m1.482s
user    0m1.109s
sys    0m0.213s
end
while ( 3 < = 2 )
sleep 1
bash: line 1:  5636 Segmentation fault      lapw1 lapw1_2.def

real    0m1.017s
user    0m0.615s
sys    0m0.114s
echo try again
goto kloop
set p = 1
if ( 0 && 0 ) set p = 2
while ( 1 < = 2 )
if ! ( -e .lock_vienna1 ) then
if ( 6 > 5 ) goto endkloop
goto endkloop
if ( 0 > 0 ) echo waiting for all processes to complete
wait
if ( 0 > 0 ) echo `date` -> all processes done.
sleep 1
set counter = 0
while ( 0 < 2 )
@ counter ++
if ( 0 > 1 ) echo testerror lapw1_1
if ( ! -z lapw1_1.error ) goto error
goto error
echo **  LAPW1 crashed!
echo **  LAPW1 STOPPED at `date`
date
echo **  check ERROR FILES!
echo -----------------------------------------------------------------
echo **  Error in Parallel LAPW1
echo **  LAPW1 STOPPED at `date`
date
echo **  check ERROR FILES!
cat lapw1_1.error lapw1_2.error
echo ERROR
rm .tmp_lapw1para.5508 .tmp_testpara_new.5508_2
hostname
rm .lapw1para.5508.dobby
exit 1



More information about the Wien mailing list