[Wien] Compilation in parallel

Peter Blaha pblaha at theochem.tuwien.ac.at
Thu Nov 27 14:16:42 CET 2008


Sorry, the previously attached file had an error.

Peter Blaha schrieb:
> 
>> The second question. The names of nodes in our cluster are node-1, 
>> node-2 ....The script lapw1para
>> produces names node 1, node 2... , instead of node-1, node-2... . Is 
>> it possible to change the script? 
> 
> Try the attached lapw1para_lapw script. It should work for names 
> including "-"
> 
> Please let me know if it works (I don't have machines with these names...)
> 
> 
> ------------------------------------------------------------------------
> 
> #!/bin/csh -f
> #
> # run lapw1 in parallel mode
> #
> # usage: $0 [-h] [-up|-dn] [-sc] [-c] [-so] deffile
> # (options as handled by the argument loop further down)
> #
> # (C)1997 by Joachim Luitz
> #
> # $Author: jluitz $
> # $State: Exp $
> #
> # scratch files; $$ (the PID of this shell) keeps the names unique per run
> set tmp = .tmp_lapw1para.$$
> set tmp2 = .tmp_testpara_new.$$_2
> # on interrupt jump to the "exit:" label at the end, which removes them
> onintr exit
> set name	= $0
> set bin		= $name:h	#default directory for WIEN-executables
> # fall back to the current directory if that is not a directory
> if !(-d $bin) set bin = .
> 
> unalias rm
> # testinput FILE LABEL: goto LABEL if FILE does not exist or has zero size
> alias   testinput       'if (! -e \!:1 || -z \!:1) goto \!:2'
> # testerror NAME: goto error unless NAME.error is a zero-size file
> alias   testerror       'if (! -z \!:1.error) goto error'
> # sortoutput SUFFIX: convert .stdoutSUFFIX with bashtime2csh.pl_lapw into
> # .tempSUFFIX, append its lines containing % to .timeSUFFIX and copy all
> # other lines to stderr
> # NOTE(review): the -f test seems to guard only the first command - confirm
> alias   sortoutput      'if (-f .stdout\!:1) bashtime2csh.pl_lapw .stdout\!:1 > .temp\!:1; grep \% .temp\!:1 >> .time\!:1; grep -v \% .temp\!:1 | perl -e "print stderr <STDIN>"'
> 
> ############################################################################
> # First we set up some variables
> #
> 
> set updn			# spinpolarization switch
> set dnup	= 'dn'		# spinpolarization switch
> set sc				# semicore-switch
> # suffix appended to the lapw1 executable name (set to c by option -c)
> set cmplx
> # command put in front of every lapw1 call to time it
> set t 		= time
> # remote shell used to start jobs on other machines
> set remote = ssh
> # log file that receives the summary of this run
> set log         = :parallel
> # keywords searched for in the .machines file
> set granular    = granularity:
> set res         = residue:
> set ef          = extrafine:
> # list with the number of k-points handed to each started job
> set kpl
> # resok is set once the loop has passed the residue machine
> unset resok
> unset extrafine
> set granularity = 3            # higher values split k-list into more
>                                # chunks, however, each vector will produce
>                                # its own clmval file!!!
>                                # Granularity 3 yields approx. 3 files per
>                                # processor. To override default granularity
> 			       # of 3 insert a line in .machines with the
>                                # following format:
>                                # granularity:new_value
> 
> ############################################################################
> #In this section use 0 to turn off an option, 1 to turn it on, 
> #respectively choose a value
> 
> set useremote   = 1             # using remote shell to launch processes
> set delay       = 1             # delay launching of processes by n seconds
> set sleepy      = 1             # additional sleep before checking
> set debug       = 0             # verbosity of debugging output
> 
> ############################################################################
> 
> ############################################################################
> # and now we look if we should override the defaults
> # (first by sourcing $bin/parallel_options, then from environment variables)
> if (-e $bin/parallel_options) then
> 	source $bin/parallel_options
> endif
> if ( $?USE_REMOTE ) then
>         set useremote = $USE_REMOTE
> endif
> if ( $?WIEN_GRANULARITY ) then
>         set granularity = $WIEN_GRANULARITY
> endif
> if ( $?WIEN_EXTRAFINE ) then
>         set extrafine
> endif
> 
> # mpirun command template; the placeholders _NP_, _EXEC_ and _HOSTS_ are
> # substituted per job inside the k-point loop
> if ( $?WIEN_MPIRUN ) then
>   set mpirun = "$WIEN_MPIRUN"
> else
>   set mpirun='mpirun -np _NP_ _EXEC_'
> endif
> 
> ############################################################################
> 
> 
> #which def-file are we using?
> 
> # at least one argument (the def file) is required
> if ($#argv < 1) then
> 	echo usage: $0 deffile
> 	exit 
> endif
> 
> #correct PWD variable if using the amd-daemon
> # we assume that any /tmp_mnt/xxx directory will be mounted
> # as /xxx!!!!!
> 
> # NOTE(review): the second setenv overrides the sed-corrected value with $cwd
> setenv PWD `pwd|sed "s/tmp_mnt\///"`
> setenv PWD $cwd
> # echo $PWD
> # remove the process list and all lock files left in this directory
> if (-e .processes) rm .processes
> # presumably .lock_ is created so that the glob below always has a match
> touch .lock_
> foreach i (.lock_*)
>     rm $i
> end
> 
> # parse the command line:
> #   -h / -H  set the variable help
> #   -up/-dn  select the spin direction (updn) and its opposite (dnup)
> #   -sc      semicore, sets sc to s
> #   -c       complex version, sets cmplx to c
> #   -so      sets so to so
> #   other    taken as the def file; its extension is stripped ($1:r)
> while ($#argv)
>   switch ($1)
>   case -h:
>   case -H: 
>     set help
>     shift; breaksw
>   case -up:
>     set updn = up
>     set dnup = dn
>     shift; breaksw
>   case -dn:
>     set updn = 'dn'
>     set dnup = 'up'
>     shift; breaksw
>   case -sc:
>     set sc = 's'
>     shift; breaksw
>   case -c:
>     set cmplx = c
>     shift; breaksw
>   case -so:
>     set so = 'so'
>     shift; breaksw
>   default:
>     set def = $1:r
>     shift; breaksw
>   endsw
> end
> 
> # NOTE(review): the second assignment overrides the first, so the executable
> # is called by its bare name and not with the $bin prefix
> set exe = $bin/lapw1$cmplx
> set exe = lapw1$cmplx
> # pre-fill the error file; it is emptied again only when the run succeeds
> echo "** " Error in Parallel LAPW1 >$def.error
> 
> #are we running parallel?
> # (a missing or empty .machines file sends us to the "single:" label)
> testinput .machines single
> echo "starting parallel lapw1 at `date`"
> echo "starting parallel lapw1 at `date`" >>$log
> 
> # status marker file: RUNNING now, DONE or ERROR at the end
> echo "RUNNING" >.lapw1para
> 
> #before we start, we wipe away all parallel  files
> if ( -e lapw1.error ) rm *lapw1*.error
> if ( -e uplapw1.error ) rm *lapw1*.error
> if ( -e dnlapw1.error ) rm *lapw1*.error
> 
> if ( -e lapw1_1.error ) rm *lapw1_*.error
> if ( -e uplapw1_1.error ) rm *lapw1_*.error
> if ( -e dnlapw1_1.error ) rm *lapw1_*.error
> if ( -e .time1_1) rm .time1_*
> 
> #get name of case
> # (the last component of the current working directory)
> set case    = `pwd`
> set case    = $case:t 
> if ($case == "") then
>   echo "ERROR: no case.inst-file -> exit"
>   exit 1
> endif
> #set klist file
> 
> # to prevent possible NFS-bug
> # (the files are touched first, then they and older per-job files are removed)
> touch $case.klist_1 $case.nsh${updn} $case.energy${updn} $case.scf1${updn}_1
> rm $case.klist_[1-9]* $case.nsh${updn} $case.energy${updn}* $case.scf1${updn}*
> 
> # name of the klist file: taken from the def file line containing .klist,
> # second comma-separated field, text between the single quotes
> set caseklist=`grep '\.klist' $def.def | cut -f 2 -d,| cut -f 2 -d\' `
> 
> #check if klist is read from unit 4
> #else quit!
> # (the unit is read from character column 21 of the K-VECTORS line)
> set test = `grep "K-VECTORS" $case.in1${cmplx}|cut -c21`
> if ($test != 4) then
>   echo "   K-Vectors must be read from unit 4"
>   echo "   for parallel execution! "
>   echo "   Change setting in $case.in1${cmplx}! "
>   exit 1
> endif
> 
> echo "->  starting parallel LAPW1 jobs at `date`"
> 
> if ($debug > 0) echo `date`" ->" Setting up case $case for parallel execution
> if ($debug > 0) echo `date`" ->" of LAPW1
> if ($debug > 0) echo `date`" ->" ""
> #
> #get machine names and weights, set up 
> #proper chunks for parallel execution
> #in case of a spin polarized calculation
> #ensure both up and down are done with the
> #same number of k-points; first calc. generates
> #file .machines.help
> #
> # The chosen file is copied to $tmp: for -dn an existing .machines.help is
> # reused, in every other case .machines is used (and, for a spin polarized
> # run, saved as .machines.help).
> 
> if ($updn == 'up' || $updn == 'dn') then
>   if ($debug > 1) echo `date`" ->" sp
>   if (-e .machines.help && $updn == 'dn') then
>     cp .machines.help $tmp
>     echo "running LAPW1 in parallel mode (using .machines.help)"
>   else
>     cp .machines $tmp
>     cp $tmp .machines.help
>     echo "running LAPW1 in parallel mode (using .machines)"
>   endif
> else
>   if ($debug > 1) echo `date`" ->" non sp
>   cp .machines $tmp
>   echo "running LAPW1 in parallel mode (using .machines)"
> endif
> 
> 
> 
> # keep only the weight:host lines: lines with a colon that contain neither
> # a # nor one of the keywords lapw0, granular, residue, extrafine,
> # lapw2_vector
> grep : $tmp |grep -v '#' |grep -v lapw0| grep -v granular | grep -v residue |grep -v extrafine |grep -v lapw2_vector >$tmp2
> 
> 
> # .processes starts with one init:host line per job line
> awk -F: '{print "init:" $2}' < $tmp2 >.processes
> # field 1 of each line is the weight, field 2 the machine
> set weigh    = `cut -f1 -d: $tmp2 | xargs`
> set machine  = `cut -f2 -d: $tmp2 | xargs`
> # lock file suffix: first word of the machine field followed by the line number
> set lockfile = `cut -f2 -d: $tmp2 | awk '{print $1 NR}'|xargs`
> # keep the unmodified weights for the debug output
> set wweigh   = ($weigh)
> # number of job lines (first number printed by wc)
> set mist     = `wc $tmp2 `
> set proc     = $mist[1]
> unset mist
> # optional granularity:value line
> set mist     = `grep $granular $tmp | grep -v '#'| cut -f2 -d:`
> if ($mist != "") then 
>     if ($debug > 0) echo "Granularity set to $mist"
>     set granularity = $mist
> endif
> # optional extrafine:1 line; extrafine may already be set from WIEN_EXTRAFINE
> set mist     = `grep $ef $tmp |grep -v '#'| cut -f2 -d:`
> if ($mist == 1) then 
>     if ($debug >0) echo "Extrafine set"
>     set extrafine
> else if( $?extrafine ) then
>     if ($debug >0) echo "Extrafine set"
> else
>     if ($debug >0) echo "Extrafine unset"
>     unset extrafine
> endif
> # optional residue:machine line; it is also recorded in .processes
> set mist     = `grep $res $tmp |grep -v '#'| cut -f2 -d:`
> if ($mist != "") then 
>     if ($debug > 0) echo "Residue set to $mist"
>     set residue = $mist
>     echo "residue:$residue" >>.processes
> else
>     unset residue
> endif
> unset mist
>     
> set i          = 0
> # sum of the first colon-separated field over all lines of $tmp
> set sumw       = `awk -F: '{sumw += $1};END {print sumw}' $tmp`
> # copy the klist up to and including the first line containing END
> sed "/END/q" <$caseklist >$caseklist.tmp
> # number of k-points = number of copied lines minus one
> set mist = `wc $caseklist.tmp`
> set klist = $mist[1]
> @ klist -- 
> # modify weights
> # (each weight becomes weight * klist / sumw / granularity in integer
> # arithmetic, but at least 1; sumn is the sum of the new weights)
> set i = 1
> set sumn = 0
> while ($i <= $#weigh)
>     @ weigh[$i] *= $klist
>     @ weigh[$i] /= $sumw
>     @ weigh[$i] /= $granularity
>     if ($weigh[$i] == 0 ) then
> 	@  weigh[$i] ++  # oops, we divided by too big a number
>     endif
>     @ sumn += $weigh[$i]
>     @ i ++
> end
> # check for residue
> # (resk = klist modulo sumn; without a remainder the residue machine is dropped)
> if ($?residue) then
>     @ resk = $klist % $sumn
>     if ($resk == 0) unset residue
> endif
> if ($debug > 0) then
>     if ($?residue) then
>     echo "    klist:       $klist + $resk"
>     else
>     echo `date`" -> klist:       $klist"
>     endif
>     echo `date`" -> machines:    $machine"
>     echo `date`" -> procs:       $proc"
>     if ($?residue) then
>     echo "    residue:     $residue ($resk k)"
>     endif
>     echo `date`" -> weigh(old):  $wweigh"
>     echo `date`" -> sumw:        $sumw"
>     echo `date`" -> granularity: $granularity"
>     echo `date`" -> weigh(new):  $weigh"
> endif
> 
> # now we add our residue as first machine
> # (with weight resk and lock file suffix <residue>0)
> if ($?residue) then
>     set machine  = `echo "$residue $machine" |xargs`
>     set weigh    = `echo "$resk $weigh" | xargs`
>     set lockfile = (${residue}0 $lockfile)
>     @ proc ++
> endif
> if ($debug > 0) echo `date`" -> Splitting $caseklist.tmp into chunks"
> 
> # wc of the second klist line; element 3 is its character count and selects
> # between two line layouts: the tail of the first line (from column 36 or
> # from column 56) is saved in head.diff and re-attached to every split klist
> set newklist=`head -2 $caseklist.tmp | tail -1 | wc`
> if ( $newklist[3] < = 55 ) then
>   head -1 $caseklist.tmp|cut -c36->head.diff
> else
>   head -1 $caseklist.tmp|cut -c56->head.diff
> endif
> # sed script that joins the first two lines of its input into one
> cat <<$EOF >script
> 1N
> s/\n//
> $EOF
> 
> # number of jobs after which extrafine mode hands out single k-points
> @ multi = $proc * $granularity
> 
> # a valid .machines file could look like
> #
> #  # This is a valid .machines file 
> #  #
> #  granularity:1
> #  1:alpha
> #  1:beta 
> #  3:gamma:2 delta 
> #  3:delta:1 epsilon:4 
> #  residue:delta:2 
> #  lapw0:gamma:2 delta:2 epsilon:4 
> #
> set TMP=.machinetmp
> if ( -f .machinetmp ) rm .machinetmp
> #
> # get only those lines of the .machines file which contribute 
> # to a parallel execution, and strip them of the weight parameter
> #grep '^[0-9]*:.* [a-zA-Z][a-zA-Z]*' .machines | sed -e 's/^[0-9]*://' >$TMP
> # the residue line (if used) comes first, matching the order of $machine
> if($?residue) then
>   grep '^residue:' $tmp |sed -e 's/^residue://' -e 's/ *$//' >$TMP
> endif
> # then all lines starting with a numeric weight, without the weight and
> # without leading or trailing blanks
> grep '^[0-9][0-9]*:.*' $tmp | sed -e 's/^[0-9]*://' -e 's/^ *//' \
>   -e 's/ *$//' >>$TMP
> # get the weight parameters and write them into the array $weight_per_job
> set weight_per_job=`grep '^[0-9][0-9]*:.*' $tmp | sed -e 's/^\([0-9]*\):.*/\1/' | bc`
> #
> #
> # alternatively, for lapw0 use only the line starting with 'lapw0'
> #grep '^lapw0:' .machines | sed -e 's/^lapw0://' -e 's/ *$//' >$TMP
> #set weight_per_job=1
> #cat $TMP
> #
> #
> # count the number of parallel jobs, i.e. the number of lines of the
> # (stripped) .machines file
> set number_of_p_jobs=`wc -l $TMP | sed -e 's/^ *//' | cut -d ' ' -f 1`
> # put the number of processors per parallel job into the array $number_per_job
> 
> # added by Kevin Jorissen  January 2003
> # fix by PB for names including -: the "-" to "." replacement is done inline
> # by the first expression of the sed command below.  The intermediate copy
> # .machinetmp222 is therefore no longer written, and TMP must keep pointing
> # to the stripped machine list (.machinetmp); switching it to the unwritten
> # .machinetmp222 would make sed read a missing or stale file and leave
> # number_per_job empty or wrong.
> if ($debug > 0) more $TMP
> if ($debug > 0) echo $TMP
> # end of addition by Kevin Jorissen
> 
> # For every line of $TMP: replace - by ., trim blanks and tabs, reduce each
> # host:n entry to n and each bare host name to 1, join the numbers with +
> # and let bc add them up (one sum per job line).
> set number_per_job=`sed -e 's/\-/./g' -e 's/[ 	]*$//' -e 's/^[ 	]*//' -e 's/[a-zA-Z0-9\.]*:\([0-9]*\)/\1/g' -e 's/[ ^]*[a-zA-Z][a-zA-Z0-9\.]*/ 1/g' $TMP | sed -e 's/^ //' -e 's/  */+/g' |bc`
> #
> # create a series of .machine[$i] files
> set TMP=.machinetmp
> set i = 1
> echo $number_of_p_jobs number_of_parallel_jobs
> while ($i <= $number_of_p_jobs)
> # convert the single lines with multiple entries to
> # a file with one entry per line
> #
> # The first sed selects line $i of $TMP.  The second sed prints a line that
> # holds exactly one host or one host:n entry unchanged, and otherwise
> # splits the line at blanks into one entry per line.
> # Host names may contain letters, digits, ".", "_" and "-"; without these
> # extra characters a line with a single host such as node-1 or node-1:2
> # matched no pattern and left .machine$i empty.
> #echo $TMP
> #cat $TMP
> sed -e "${i}p" -e 'd' $TMP | sed -e '/\(^[a-zA-Z0-9._-]*$\)/p' -e '/\(^[a-zA-Z0-9._-]*:[0-9]*$\)/p' -e "s/  */\\
> /gp" -e 'd' >.machine$i
> #echo -------- .machine$i : $number_per_job[$i] processors
> #cat .machine$i
> @ i ++
> end
> #echo --------
> rm $TMP
> # 
> # NOW WE HAVE:
> #   .machine$i files hold the machinefiles used by MPI
> #   $number_of_p_jobs is the number of k-point parallel jobs
> #                            (sequential plus MPI jobs)
> #   $number_per_job[$i] is the number of processors per k-point parallel job
> #   $weight_per_job[$i] is the k-point weight per job
> #
> 
> # convert each line 'host:x' to x lines 'host'
> # (files in which no line contains a colon are left untouched)
> set i=1
> while ($i <= $number_of_p_jobs)
> #  echo -------- .machine$i : $number_per_job[$i] processors : weight $weight_per_job[$i]
> #  cat .machine$i
>   # x = the counts after the colon, taken only from lines that contain one
>   set x=`cut -d: -f2 -s .machine$i`
> #  echo $x
>   if ("$x" != '') then
>     set machine_i=`cat .machine$i`
>     rm .machine$i
> #    echo $machine_i
>     # ii indexes the entries of machine_i, iii counts the repetitions
>     set ii=1
>     foreach s ($x)
>       set iii=1
> #      echo s=$s
>       while ($iii <= $s)
> 	echo $machine_i[$ii] |cut -d: -f1 >>.machine$i
> 	@ iii ++
>       end
>       @ ii ++
>     end
> #    cat .machine$i
>   endif
>   @ i ++  
> end
> # NOW WE HAVE 
> # machinefiles which do not distinguish 
> # between shared and distributed memory. 
> # This means, that clusters of shared memory computers will communicate
> # only via sockets, also between the processors of one computer.
>  
> # Main scheduling loop.  loop counts the started jobs, kbegin is the first
> # k-point not yet handed out.  Each pass walks over all processors; a
> # processor without a lock file gets the next chunk of k-points: a klist and
> # a def file with suffix _$loop are written and lapw1 is started in the
> # background.  The started job removes its lock file when it has finished.
> # The loop is left through endkloop once kbegin has passed the last k-point.
> set loop    = 0
> set kbegin  = 1
> set endloop = 0
> 
> kloop:
>     set p = 1
>     # the residue machine (index 1) is skipped once resok has been set
>     if ($?residue && $?resok) set p = 2
>     while ($p <= $proc)
> 
>     if !(-e .lock_$lockfile[$p]) then
> 	if ($kbegin > $klist) goto endkloop
>         @ loop ++
> 	
> 	if ($debug > 0) echo prepare $loop on $machine[$p]
> 	
> 	if ($debug > 0) echo `date`" -> Creating klist $loop "
> 	# head = last k-point of this chunk, tail = number of k-points in it;
> 	# in extrafine mode every job after the first $multi gets one k-point
> 	set kold = $kbegin
> 	if ($loop > $multi && $?extrafine) then
> 	    @ head = $kbegin
> 	    set tail = 1
> 	    @ kbegin = $kbegin + 1
> 	else 
>     	    @ head = $kbegin + $weigh[$p] - 1
> 	    set tail = $weigh[$p]
> 	    @ kbegin = $kbegin + $weigh[$p]
> 	endif
> 
> 
> 	# clip the last chunk at the end of the k-list
> 	# NOTE(review): csh is reported to evaluate a - b - c from right to
> 	# left, which would give klist - kold + 1 here - confirm
> 	if ($head >= $klist) then
> 	    set head    = $klist
> 	    @ tail = $klist - $kold - 1
> 	endif
> 	set kpl = ($kpl $tail)
> 	if ($debug > 1) echo  "$loop : ${kpl[$loop]}k ($machine[$p], $weigh[$p])"
> 	
> 	# cut the chunk out of the k-list, terminate it with END, re-attach the
> 	# saved tail of the header line after line 1 and join the first two
> 	# lines again (sed script "script")
> 	head -$head $caseklist.tmp | tail -$tail > ${caseklist}_$loop
> 	echo "END" >>${caseklist}_$loop
>         if ( $newklist[3] < = 55 ) then
> 	   cut -c-35 ${caseklist}_$loop |sed "1r head.diff" >$tmp
>         else
> 	   cut -c-55 ${caseklist}_$loop |sed "1r head.diff" >$tmp
>         endif
> 	sed -f script $tmp >${caseklist}_$loop
> 
> 	#creating def files
> 	if ($debug > 1) echo `date`" ->" " "
> 	if ($debug > 1) echo `date`" ->"  "creating "$def"_$loop.def:  "  
> 
> 
> 	cp $def.def $tmp
> 	#substituting in files:  
> 	# (the klist, output1, vector, scf1 and energy names get the suffix _$loop)
> 	cat <<$EOF >.script
> s/$caseklist/&_$loop/w .mist
> s/output1$updn/&_$loop/w .mist
> s/vector$updn/&_$loop/w .mist
> s/scf1$updn/&_$loop/w .mist
> s/energy$updn/&_$loop/w .mist
> $EOF
> 	sed -f .script $tmp > "$def"_$loop.def
> 	# for every job but the first, ed deletes the first line matching "71,"
> 	# from the new def file
>         if($loop != 1 ) then
> 	cat <<$EOF >.script
> /71,
> d
> w
> q
> $EOF
>         ed "$def"_$loop.def <.script >& .mist
>         endif
> 	rm .mist $tmp
> 
> 	    # record the job, create its lock file and start it: a plain lapw1
> 	    # for one processor, otherwise the mpirun command built from the
> 	    # template; either through $remote or locally
> 	    echo "$loop : $machine[$p] :  $tail : $number_per_job[$p] : $p" >>.processes
> 	    echo "" > .lock_$lockfile[$p]
> 	    if($number_per_job[$p] == 1 ) then
> 	    echo -n "$machine[$p](${kpl[$loop]}) " >.time1_$loop
> 	      if ($useremote == 1) then
> 		  ($remote $machine[$p] "cd $PWD;$t $exe ${def}_$loop.def ;fixerror_lapw ${def}_$loop";rm -f .lock_$lockfile[$p]) >& .stdout1_$loop; sortoutput 1_$loop &
> 	      else
> 		  (cd $PWD;$t $exe ${def}_$loop.def;rm -f .lock_$lockfile[$p]) >>.time1_$loop &
> 	      endif
> 	    else
> 	    set helpout=`cat .machine$p` 
> 	    echo -n "$helpout(${kpl[$loop]}) " >.time1_$loop
>               set ttt=(`echo $mpirun | sed -e "s^_NP_^$number_per_job[$p]^" -e "s^_EXEC_^$WIENROOT/${exe}_mpi ${def}_$loop.def^" -e "s^_HOSTS_^.machine$p^"`)
> 	      if ($useremote == 1) then
>                  set remotemachine = `head -1 .machine$p`
> 		 ($remote $remotemachine "cd $PWD;$t $ttt;rm -f .lock_$lockfile[$p]") >>.time1_$loop &
>               else
> 	         (cd $PWD;$t $ttt;rm -f .lock_$lockfile[$p]) >>.time1_$loop &
>               endif
> 	    endif
>             # keep a list of the background jobs of this shell
>             jobs -l >.lapw1${cmplx}para.$$.`hostname`
>     endif
>     @ p ++
>     if ($?residue) then
>        # now a job on residue machine has started
>        set resok
>     endif
>     sleep $delay
>     end
>     sleep $sleepy
>     #echo try again
> goto kloop
> 
> # all k-points have been handed out: wait for the background jobs
> endkloop:
> if ($debug > 0) echo waiting for all processes to complete
> wait
> 
> 
> 
> if ($debug > 0) echo `date`" ->" "all processes done."
> sleep $sleepy
> 
> #cpu summary:
> # (the .time1_N file of every job is printed and appended to the log)
> if ($debug > 0) echo `date`" ->" CPU TIME summary:
> if ($debug > 0) echo `date`" ->" ================
> set counter = 0
> while ($counter < $loop)
>     @ counter ++
>     echo -n "     "
>     cat .time1_$counter
>     echo -n "     " >>$log
>     cat .time1_$counter >>$log
> end
> 
> # jump to the "error:" label unless the error file of a job is a zero-size file
> set counter = 0
> while ($counter < $loop)
>     @ counter ++
>     if ($debug > 1) echo testerror ${def}_$counter
>     testerror ${def}_$counter
> end
> 
> # correct number of k-points in scf1-file
> # (on lines containing NUMBER OF K-POINTS: everything from the first digit
> # to the end of the line is replaced by $klist)
> 
> sed "/NUMBER OF K-POINTS:/s/[0-9].*/$klist/" < $case.scf1${updn}_1 >$case.scf1$updn
> #cp $case.scf1${updn}_1 $case.scf1$updn
> 
> # postanalysis
> # (per machine: sum fields 2, 3 and 60*field 5 + field 6 of its .time1_*
> # entries after the characters "(", ")", ":" and "u " have been replaced by
> # blanks, and print them as k, user and wallclock)
> echo "   Summary of lapw1para:"> $tmp
> set p = 1
> while ($p <= $proc) 
>     set m = $machine[$p]
>     cat .time1_*| grep -w $m |tr "(" " " |tr ")" " " |tr ":" " " |sed "s/u / /" | \
> 	    awk '{k += $2;u += $3; cl += 60*$5+$6} \
> 		END {print "   '$m'\t k=" k "\t user=" u "\t wallclock=" cl}' >> $tmp
>     @ p ++
> end
> # remove multiple entries of multiproc. machines
> uniq < $tmp | tee -a $log
> 
> 
> echo "<- " done at `date`>>$log
> echo "-----------------------------------------------------------------">>$log
> # success: leave an empty error file and mark the run as DONE
> if ( -e $def.error )rm $def.error
> touch $def.error
> echo "DONE" >.lapw1para
> rm $tmp $tmp2 >&/dev/null
> rm .lapw1${cmplx}para.$$.`hostname` >&/dev/null
> exit 0
> 
> # reached through the testerror alias: report the crash in the log, collect
> # the error files of all jobs in $def.error, mark the run as ERROR, remove
> # the scratch files and exit with status 1
> error:
> echo "** " LAPW1 crashed!
> echo "** " LAPW1 STOPPED at `date`>>$log
> echo "** " check ERROR FILES! >>$log
> echo "-----------------------------------------------------------------">>$log
> echo "** " Error in Parallel LAPW1 >$def.error
> echo "** " LAPW1 STOPPED at `date`>>$def.error
> echo "** " check ERROR FILES! >>$def.error
> cat ${def}_*.error >>$def.error
> echo "ERROR" >.lapw1para
> rm $tmp $tmp2 >&/dev/null
> rm .lapw1${cmplx}para.$$.`hostname` >&/dev/null
> exit 1
> 
> 
> # reached when .machines is missing or empty: run $exe directly with the
> # unsplit def file
> single:
> echo running $exe in single mode
> #ensure lapw2 also runs in single mode!
> # (.processes is created and removed again, so none is left behind)
> echo > .processes
> rm .processes
> $exe $def.def
> exit 0
> 
> # interrupt handler (target of onintr): remove the scratch files and the
> # job list of this run
> exit:
>    rm $tmp $tmp2 >&/dev/null
>    rm .lapw1${cmplx}para.$$.`hostname` >&/dev/null
> 
> 
> 
> ------------------------------------------------------------------------
> 
> _______________________________________________
> Wien mailing list
> Wien at zeus.theochem.tuwien.ac.at
> http://zeus.theochem.tuwien.ac.at/mailman/listinfo/wien

-- 

                                       P.Blaha
--------------------------------------------------------------------------
Peter BLAHA, Inst.f. Materials Chemistry, TU Vienna, A-1060 Vienna
Phone: +43-1-58801-15671             FAX: +43-1-58801-15698
Email: blaha at theochem.tuwien.ac.at    WWW: http://info.tuwien.ac.at/theochem/
--------------------------------------------------------------------------
-------------- next part --------------
An embedded and charset-unspecified text was scrubbed...
Name: lapw1para_lapw
Url: http://zeus.theochem.tuwien.ac.at/pipermail/wien/attachments/20081127/2c2238e8/lapw1para_lapw.ksh


More information about the Wien mailing list