260 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
			
		
		
	
	
			260 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
2017-09-06T08:21:51Z Job state change UNDEFINED -> ACCEPTED   Reason: (Re)Accepting new job
 | 
						|
2017-09-06T08:21:51Z Job state change ACCEPTED -> PREPARING   Reason: Starting job processing
 | 
						|
2017-09-06T08:22:24Z Job state change PREPARING -> SUBMIT   Reason: Pre-staging finished, passing job to LRMS
 | 
						|
----- starting submit_slurm_job -----
 | 
						|
SLURM jobname: accetuation_nn
 | 
						|
SLURM job script built
 | 
						|
SLURM script follows:
 | 
						|
-------------------------------------------------------------------
 | 
						|
#!/bin/bash -l
 | 
						|
# SLURM batch job script built by grid-manager
 | 
						|
#SBATCH --no-requeue
 | 
						|
#SBATCH -e /net/hold/data1/arc/session//e1PMDmzlk9qnOeFSGmVnjcgoABFKDmABFKDmZmMKDmABFKDmnXN6sm.comment
 | 
						|
#SBATCH -o /net/hold/data1/arc/session//e1PMDmzlk9qnOeFSGmVnjcgoABFKDmABFKDmZmMKDmABFKDmnXN6sm.comment
 | 
						|
 | 
						|
#SBATCH -p gridlong
 | 
						|
#SBATCH --nice=50
 | 
						|
#SBATCH -J 'accetuation_nn'
 | 
						|
#SBATCH --get-user-env=10L
 | 
						|
#SBATCH -n 1
 | 
						|
#SBATCH   --constraint=gpu --gres=gpu:1
 | 
						|
#SBATCH -t 750:0
 | 
						|
#SBATCH -t 750:0
 | 
						|
#SBATCH --mem-per-cpu=12000
 | 
						|
# run singularity image if RTE with singularity is required
 | 
						|
if [ -z $SINGULARITY_CONTAINER ]; then
 | 
						|
exec /bin/singularity exec -B /var/spool/slurm,/cvmfs,/net/hold/data1,/data1,/data1/slurm,/home,/usr/lib64/nvidia /net/hold/data1/singularity-images/theano-gpu-2.img $0
 | 
						|
fi
 | 
						|
 | 
						|
# Overide umask of execution node (sometime values are really strange)
 | 
						|
umask 077
 | 
						|
 
 | 
						|
# source with arguments for DASH shells
 | 
						|
sourcewithargs() {
 | 
						|
script=$1
 | 
						|
shift
 | 
						|
. $script
 | 
						|
}
 | 
						|
# Setting environment variables as specified by user
 | 
						|
export 'GRID_GLOBAL_JOBID=gsiftp://nsc.ijs.si:2811/jobs/e1PMDmzlk9qnOeFSGmVnjcgoABFKDmABFKDmZmMKDmABFKDmnXN6sm'
 | 
						|
 | 
						|
RUNTIME_JOB_DIR=/net/hold/data1/arc/session//e1PMDmzlk9qnOeFSGmVnjcgoABFKDmABFKDmZmMKDmABFKDmnXN6sm
 | 
						|
RUNTIME_JOB_STDIN=/dev/null
 | 
						|
RUNTIME_JOB_STDOUT=/net/hold/data1/arc/session//e1PMDmzlk9qnOeFSGmVnjcgoABFKDmABFKDmZmMKDmABFKDmnXN6sm/out.txt
 | 
						|
RUNTIME_JOB_STDERR=/net/hold/data1/arc/session//e1PMDmzlk9qnOeFSGmVnjcgoABFKDmABFKDmZmMKDmABFKDmnXN6sm/err.txt
 | 
						|
RUNTIME_JOB_DIAG=/net/hold/data1/arc/session//e1PMDmzlk9qnOeFSGmVnjcgoABFKDmABFKDmZmMKDmABFKDmnXN6sm.diag
 | 
						|
if [ ! -z "$RUNTIME_GRIDAREA_DIR" ] ; then
 | 
						|
  RUNTIME_JOB_DIR=$RUNTIME_GRIDAREA_DIR/`basename $RUNTIME_JOB_DIR`
 | 
						|
  RUNTIME_JOB_STDIN=`echo "$RUNTIME_JOB_STDIN" | sed "s#^$RUNTIME_JOB_DIR#$RUNTIME_GRIDAREA_DIR#"`
 | 
						|
  RUNTIME_JOB_STDOUT=`echo "$RUNTIME_JOB_STDOUT" | sed "s#^$RUNTIME_JOB_DIR#$RUNTIME_GRIDAREA_DIR#"`
 | 
						|
  RUNTIME_JOB_STDERR=`echo "$RUNTIME_JOB_STDERR" | sed "s#^$RUNTIME_JOB_DIR#$RUNTIME_GRIDAREA_DIR#"`
 | 
						|
  RUNTIME_JOB_DIAG=`echo "$RUNTIME_JOB_DIAG" | sed "s#^$RUNTIME_JOB_DIR#$RUNTIME_GRIDAREA_DIR#"`
 | 
						|
  RUNTIME_CONTROL_DIR=`echo "$RUNTIME_CONTROL_DIR" | sed "s#^$RUNTIME_JOB_DIR#$RUNTIME_GRIDAREA_DIR#"`
 | 
						|
fi
 | 
						|
RUNTIME_LOCAL_SCRATCH_DIR=${RUNTIME_LOCAL_SCRATCH_DIR:-$WORKDIR}
 | 
						|
RUNTIME_FRONTEND_SEES_NODE=${RUNTIME_FRONTEND_SEES_NODE:-}
 | 
						|
RUNTIME_NODE_SEES_FRONTEND=${RUNTIME_NODE_SEES_FRONTEND:-yes}
 | 
						|
  if [ ! -z "$RUNTIME_LOCAL_SCRATCH_DIR" ] && [ ! -z "$RUNTIME_NODE_SEES_FRONTEND" ]; then
 | 
						|
    RUNTIME_NODE_JOB_DIR="$RUNTIME_LOCAL_SCRATCH_DIR"/`basename "$RUNTIME_JOB_DIR"`
 | 
						|
    rm -rf "$RUNTIME_NODE_JOB_DIR"
 | 
						|
    mkdir -p "$RUNTIME_NODE_JOB_DIR"
 | 
						|
    # move directory contents
 | 
						|
    for f in "$RUNTIME_JOB_DIR"/.* "$RUNTIME_JOB_DIR"/*; do 
 | 
						|
      [ "$f" = "$RUNTIME_JOB_DIR/*" ] && continue # glob failed, no files
 | 
						|
      [ "$f" = "$RUNTIME_JOB_DIR/." ] && continue
 | 
						|
      [ "$f" = "$RUNTIME_JOB_DIR/.." ] && continue
 | 
						|
      [ "$f" = "$RUNTIME_JOB_DIR/.diag" ] && continue
 | 
						|
      [ "$f" = "$RUNTIME_JOB_DIR/.comment" ] && continue
 | 
						|
      if ! mv "$f" "$RUNTIME_NODE_JOB_DIR"; then
 | 
						|
        echo "Failed to move '$f' to '$RUNTIME_NODE_JOB_DIR'" 1>&2
 | 
						|
        exit 1
 | 
						|
      fi
 | 
						|
    done
 | 
						|
    if [ ! -z "$RUNTIME_FRONTEND_SEES_NODE" ] ; then
 | 
						|
      # creating link for whole directory
 | 
						|
       ln -s "$RUNTIME_FRONTEND_SEES_NODE"/`basename "$RUNTIME_JOB_DIR"` "$RUNTIME_JOB_DIR"
 | 
						|
    else
 | 
						|
      # keep stdout, stderr and control directory on frontend
 | 
						|
      # recreate job directory
 | 
						|
      mkdir -p "$RUNTIME_JOB_DIR"
 | 
						|
      # make those files
 | 
						|
      mkdir -p `dirname "$RUNTIME_JOB_STDOUT"`
 | 
						|
      mkdir -p `dirname "$RUNTIME_JOB_STDERR"`
 | 
						|
      touch "$RUNTIME_JOB_STDOUT"
 | 
						|
      touch "$RUNTIME_JOB_STDERR"
 | 
						|
      RUNTIME_JOB_STDOUT__=`echo "$RUNTIME_JOB_STDOUT" | sed "s#^${RUNTIME_JOB_DIR}#${RUNTIME_NODE_JOB_DIR}#"`
 | 
						|
      RUNTIME_JOB_STDERR__=`echo "$RUNTIME_JOB_STDERR" | sed "s#^${RUNTIME_JOB_DIR}#${RUNTIME_NODE_JOB_DIR}#"`
 | 
						|
      rm "$RUNTIME_JOB_STDOUT__" 2>/dev/null
 | 
						|
      rm "$RUNTIME_JOB_STDERR__" 2>/dev/null
 | 
						|
      if [ ! -z "$RUNTIME_JOB_STDOUT__" ] && [ "$RUNTIME_JOB_STDOUT" != "$RUNTIME_JOB_STDOUT__" ]; then
 | 
						|
        ln -s "$RUNTIME_JOB_STDOUT" "$RUNTIME_JOB_STDOUT__"
 | 
						|
      fi
 | 
						|
      if [ "$RUNTIME_JOB_STDOUT__" != "$RUNTIME_JOB_STDERR__" ] ; then
 | 
						|
        if [ ! -z "$RUNTIME_JOB_STDERR__" ] && [ "$RUNTIME_JOB_STDERR" != "$RUNTIME_JOB_STDERR__" ]; then
 | 
						|
          ln -s "$RUNTIME_JOB_STDERR" "$RUNTIME_JOB_STDERR__"
 | 
						|
        fi
 | 
						|
      fi
 | 
						|
      if [ ! -z "$RUNTIME_CONTROL_DIR" ] ; then
 | 
						|
        # move control directory back to frontend
 | 
						|
        RUNTIME_CONTROL_DIR__=`echo "$RUNTIME_CONTROL_DIR" | sed "s#^${RUNTIME_JOB_DIR}#${RUNTIME_NODE_JOB_DIR}#"`
 | 
						|
        mv "$RUNTIME_CONTROL_DIR__" "$RUNTIME_CONTROL_DIR"
 | 
						|
      fi
 | 
						|
    fi
 | 
						|
    # adjust stdin,stdout & stderr pointers
 | 
						|
    RUNTIME_JOB_STDIN=`echo "$RUNTIME_JOB_STDIN" | sed "s#^${RUNTIME_JOB_DIR}#${RUNTIME_NODE_JOB_DIR}#"`
 | 
						|
    RUNTIME_JOB_STDOUT=`echo "$RUNTIME_JOB_STDOUT" | sed "s#^${RUNTIME_JOB_DIR}#${RUNTIME_NODE_JOB_DIR}#"`
 | 
						|
    RUNTIME_JOB_STDERR=`echo "$RUNTIME_JOB_STDERR" | sed "s#^${RUNTIME_JOB_DIR}#${RUNTIME_NODE_JOB_DIR}#"`
 | 
						|
    RUNTIME_FRONTEND_JOB_DIR="$RUNTIME_JOB_DIR"
 | 
						|
    RUNTIME_JOB_DIR="$RUNTIME_NODE_JOB_DIR"
 | 
						|
  fi
 | 
						|
  if [ -z "$RUNTIME_NODE_SEES_FRONTEND" ] ; then
 | 
						|
    mkdir -p "$RUNTIME_JOB_DIR"
 | 
						|
  fi
 | 
						|
 | 
						|
RESULT=0
 | 
						|
 | 
						|
if [ "$RESULT" = '0' ] ; then
 | 
						|
# Running runtime scripts
 | 
						|
export RUNTIME_CONFIG_DIR=${RUNTIME_CONFIG_DIR:-/net/hold/data1/arc/runtime/}
 | 
						|
runtimeenvironments=
 | 
						|
if [ ! -z "$RUNTIME_CONFIG_DIR" ] ; then
 | 
						|
  if [ -r "${RUNTIME_CONFIG_DIR}/APPS/BASE/THEANO-GPU-0.9" ] ; then
 | 
						|
    runtimeenvironments="${runtimeenvironments}APPS/BASE/THEANO-GPU-0.9;"
 | 
						|
    cmdl=${RUNTIME_CONFIG_DIR}/APPS/BASE/THEANO-GPU-0.9 
 | 
						|
    sourcewithargs $cmdl 1  
 | 
						|
    if [ $? -ne '0' ] ; then 
 | 
						|
      echo "Runtime APPS/BASE/THEANO-GPU-0.9 script failed " 1>&2 
 | 
						|
      echo "Runtime APPS/BASE/THEANO-GPU-0.9 script failed " 1>"$RUNTIME_JOB_DIAG" 
 | 
						|
      exit 1
 | 
						|
    fi 
 | 
						|
  fi
 | 
						|
fi
 | 
						|
 | 
						|
echo "runtimeenvironments=$runtimeenvironments" >> "$RUNTIME_JOB_DIAG"
 | 
						|
if [ ! "X$SLURM_NODEFILE" = 'X' ] ; then
 | 
						|
  if [ -r "$SLURM_NODEFILE" ] ; then
 | 
						|
    cat "$SLURM_NODEFILE" | sed 's/\(.*\)/nodename=\1/' >> "$RUNTIME_JOB_DIAG"
 | 
						|
    NODENAME_WRITTEN="1"
 | 
						|
  else
 | 
						|
    SLURM_NODEFILE=
 | 
						|
  fi
 | 
						|
fi
 | 
						|
if [ "$RESULT" = '0' ] ; then
 | 
						|
  # Changing to session directory
 | 
						|
  HOME=$RUNTIME_JOB_DIR
 | 
						|
  export HOME
 | 
						|
  if ! cd "$RUNTIME_JOB_DIR"; then
 | 
						|
    echo "Failed to switch to '$RUNTIME_JOB_DIR'" 1>&2
 | 
						|
    RESULT=1
 | 
						|
  fi
 | 
						|
  if [ ! -z "$RESULT" ] && [ "$RESULT" != 0 ]; then
 | 
						|
    exit $RESULT
 | 
						|
  fi
 | 
						|
nodename=`/bin/hostname -f`
 | 
						|
echo "nodename=$nodename" >> "$RUNTIME_JOB_DIAG"
 | 
						|
echo "Processors=1" >> "$RUNTIME_JOB_DIAG"
 | 
						|
executable='./workbench.sh'
 | 
						|
# Check if executable exists
 | 
						|
if [ ! -f "$executable" ]; 
 | 
						|
then 
 | 
						|
  echo "Path \"$executable\" does not seem to exist" 1>$RUNTIME_JOB_STDOUT 2>$RUNTIME_JOB_STDERR 1>&2
 | 
						|
  exit 1
 | 
						|
fi
 | 
						|
# See if executable is a script, and extract the name of the interpreter
 | 
						|
line1=`dd if="$executable" count=1 2>/dev/null | head -n 1`
 | 
						|
command=`echo $line1 | sed -n 's/^#! *//p'`
 | 
						|
interpreter=`echo $command | awk '{print $1}'`
 | 
						|
if [ "$interpreter" = /usr/bin/env ]; then interpreter=`echo $command | awk '{print $2}'`; fi
 | 
						|
# If it's a script and the interpreter is not found ...
 | 
						|
[ "x$interpreter" = x ] || type "$interpreter" > /dev/null 2>&1 || {
 | 
						|
 | 
						|
  echo "Cannot run $executable: $interpreter: not found" 1>$RUNTIME_JOB_STDOUT 2>$RUNTIME_JOB_STDERR 1>&2
 | 
						|
  exit 1; }
 | 
						|
GNU_TIME='/usr/bin/time'
 | 
						|
if [ ! -z "$GNU_TIME" ] && ! "$GNU_TIME" --version >/dev/null 2>&1; then
 | 
						|
  echo "WARNING: GNU time not found at: $GNU_TIME" 2>&1;
 | 
						|
  GNU_TIME=
 | 
						|
fi 
 | 
						|
 | 
						|
if [ -z "$GNU_TIME" ] ; then
 | 
						|
   "./workbench.sh" <$RUNTIME_JOB_STDIN 1>$RUNTIME_JOB_STDOUT 2>$RUNTIME_JOB_STDERR
 | 
						|
else
 | 
						|
  $GNU_TIME -o "$RUNTIME_JOB_DIAG" -a -f 'WallTime=%es\nKernelTime=%Ss\nUserTime=%Us\nCPUUsage=%P\nMaxResidentMemory=%MkB\nAverageResidentMemory=%tkB\nAverageTotalMemory=%KkB\nAverageUnsharedMemory=%DkB\nAverageUnsharedStack=%pkB\nAverageSharedMemory=%XkB\nPageSize=%ZB\nMajorPageFaults=%F\nMinorPageFaults=%R\nSwaps=%W\nForcedSwitches=%c\nWaitSwitches=%w\nInputs=%I\nOutputs=%O\nSocketReceived=%r\nSocketSent=%s\nSignals=%k\n'  "./workbench.sh" <$RUNTIME_JOB_STDIN 1>$RUNTIME_JOB_STDOUT 2>$RUNTIME_JOB_STDERR
 | 
						|
 | 
						|
fi
 | 
						|
RESULT=$?
 | 
						|
 | 
						|
fi
 | 
						|
fi
 | 
						|
if [ ! -z "$RUNTIME_CONFIG_DIR" ] ; then
 | 
						|
  if [ -r "${RUNTIME_CONFIG_DIR}/APPS/BASE/THEANO-GPU-0.9" ] ; then
 | 
						|
    cmdl=${RUNTIME_CONFIG_DIR}/APPS/BASE/THEANO-GPU-0.9
 | 
						|
    sourcewithargs $cmdl 2  
 | 
						|
  fi
 | 
						|
fi
 | 
						|
 | 
						|
if [ ! -z  "$RUNTIME_LOCAL_SCRATCH_DIR" ] ; then
 | 
						|
  find ./ -type l -exec rm -f "{}" ";"
 | 
						|
  find ./ -type f -exec chmod u+w "{}" ";"
 | 
						|
  chmod -R u-w "$RUNTIME_JOB_DIR"/'40_epoch.h5' 2>/dev/null
 | 
						|
  chmod -R u-w "$RUNTIME_JOB_DIR"/'workbench.py' 2>/dev/null
 | 
						|
  chmod -R u-w "$RUNTIME_JOB_DIR"/'workbench.sh' 2>/dev/null
 | 
						|
  chmod -R u-w "$RUNTIME_JOB_DIR"/'40_epoch_history.pkl' 2>/dev/null
 | 
						|
  chmod -R u-w "$RUNTIME_JOB_DIR"/'notes' 2>/dev/null
 | 
						|
  chmod -R u-w "$RUNTIME_JOB_DIR"/'out.txt' 2>/dev/null
 | 
						|
  chmod -R u-w "$RUNTIME_JOB_DIR"/'err.txt' 2>/dev/null
 | 
						|
  chmod -R u-w "$RUNTIME_JOB_DIR"/'gmlog' 2>/dev/null
 | 
						|
  find ./ -type f -perm /200 -exec rm -f "{}" ";"
 | 
						|
  find ./ -type f -exec chmod u+w "{}" ";"
 | 
						|
fi
 | 
						|
 | 
						|
  if [ ! -z "$RUNTIME_LOCAL_SCRATCH_DIR" ] && [ ! -z "$RUNTIME_NODE_SEES_FRONTEND" ]; then 
 | 
						|
    if [ ! -z "$RUNTIME_FRONTEND_SEES_NODE" ] ; then
 | 
						|
      # just move it
 | 
						|
      rm -rf "$RUNTIME_FRONTEND_JOB_DIR"
 | 
						|
      destdir=`dirname "$RUNTIME_FRONTEND_JOB_DIR"`
 | 
						|
      if ! mv "$RUNTIME_NODE_JOB_DIR" "$destdir"; then
 | 
						|
        echo "Failed to move '$RUNTIME_NODE_JOB_DIR' to '$destdir'" 1>&2
 | 
						|
        RESULT=1
 | 
						|
      fi
 | 
						|
    else
 | 
						|
      # remove links
 | 
						|
      rm -f "$RUNTIME_JOB_STDOUT" 2>/dev/null
 | 
						|
      rm -f "$RUNTIME_JOB_STDERR" 2>/dev/null
 | 
						|
      # move directory contents
 | 
						|
      for f in "$RUNTIME_NODE_JOB_DIR"/.* "$RUNTIME_NODE_JOB_DIR"/*; do 
 | 
						|
        [ "$f" = "$RUNTIME_NODE_JOB_DIR/*" ] && continue # glob failed, no files
 | 
						|
        [ "$f" = "$RUNTIME_NODE_JOB_DIR/." ] && continue
 | 
						|
        [ "$f" = "$RUNTIME_NODE_JOB_DIR/.." ] && continue
 | 
						|
        [ "$f" = "$RUNTIME_NODE_JOB_DIR/.diag" ] && continue
 | 
						|
        [ "$f" = "$RUNTIME_NODE_JOB_DIR/.comment" ] && continue
 | 
						|
        if ! mv "$f" "$RUNTIME_FRONTEND_JOB_DIR"; then
 | 
						|
          echo "Failed to move '$f' to '$RUNTIME_FRONTEND_JOB_DIR'" 1>&2
 | 
						|
          RESULT=1
 | 
						|
        fi
 | 
						|
      done
 | 
						|
      rm -rf "$RUNTIME_NODE_JOB_DIR"
 | 
						|
    fi
 | 
						|
  fi
 | 
						|
  echo "exitcode=$RESULT" >> "$RUNTIME_JOB_DIAG"
 | 
						|
  exit $RESULT
 | 
						|
-------------------------------------------------------------------
 | 
						|
 | 
						|
job submitted successfully!
 | 
						|
local job id: 734035
 | 
						|
----- exiting submit_slurm_job -----
 | 
						|
 | 
						|
2017-09-06T08:22:24Z Job state change SUBMIT -> INLRMS   Reason: Job is passed to LRMS
 | 
						|
------- Contents of output stream forwarded by the LRMS ---------
 | 
						|
WARNING: GNU time not found at: /usr/bin/time
 | 
						|
slurmstepd: error: *** JOB 734035 ON nsc-fp006 CANCELLED AT 2017-09-06T22:52:34 DUE TO TIME LIMIT ***
 | 
						|
------------------------- End of output -------------------------
 | 
						|
2017-09-06T20:55:12Z Job state change INLRMS -> FINISHING   Reason: Job finished executing in LRMS
 | 
						|
2017-09-06T20:56:12Z Job state change FINISHING -> FINISHED   Reason: Stage-out finished.
 | 
						|
2017-09-06T21:49:02Z Job state change UNDEFINED -> FINISHED   Reason: (Re)Accepting new job
 | 
						|
2017-09-06T23:49:20Z Job state change UNDEFINED -> FINISHED   Reason: (Re)Accepting new job
 | 
						|
2017-09-07T01:49:53Z Job state change UNDEFINED -> FINISHED   Reason: (Re)Accepting new job
 | 
						|
2017-09-07T03:50:36Z Job state change UNDEFINED -> FINISHED   Reason: (Re)Accepting new job
 | 
						|
2017-09-07T05:50:39Z Job state change UNDEFINED -> FINISHED   Reason: (Re)Accepting new job
 | 
						|
2017-09-07T07:50:54Z Job state change UNDEFINED -> FINISHED   Reason: (Re)Accepting new job
 |