260 lines
12 KiB
Plaintext
260 lines
12 KiB
Plaintext
2017-09-06T08:21:51Z Job state change UNDEFINED -> ACCEPTED Reason: (Re)Accepting new job
|
|
2017-09-06T08:21:51Z Job state change ACCEPTED -> PREPARING Reason: Starting job processing
|
|
2017-09-06T08:22:24Z Job state change PREPARING -> SUBMIT Reason: Pre-staging finished, passing job to LRMS
|
|
----- starting submit_slurm_job -----
|
|
SLURM jobname: accetuation_nn
|
|
SLURM job script built
|
|
SLURM script follows:
|
|
-------------------------------------------------------------------
|
|
#!/bin/bash -l
|
|
# SLURM batch job script built by grid-manager
|
|
#SBATCH --no-requeue
|
|
#SBATCH -e /net/hold/data1/arc/session//e1PMDmzlk9qnOeFSGmVnjcgoABFKDmABFKDmZmMKDmABFKDmnXN6sm.comment
|
|
#SBATCH -o /net/hold/data1/arc/session//e1PMDmzlk9qnOeFSGmVnjcgoABFKDmABFKDmZmMKDmABFKDmnXN6sm.comment
|
|
|
|
#SBATCH -p gridlong
|
|
#SBATCH --nice=50
|
|
#SBATCH -J 'accetuation_nn'
|
|
#SBATCH --get-user-env=10L
|
|
#SBATCH -n 1
|
|
#SBATCH --constraint=gpu --gres=gpu:1
|
|
#SBATCH -t 750:0
|
|
#SBATCH -t 750:0
|
|
#SBATCH --mem-per-cpu=12000
|
|
# run singularity image if RTE with singularity is required
|
|
if [ -z $SINGULARITY_CONTAINER ]; then
|
|
exec /bin/singularity exec -B /var/spool/slurm,/cvmfs,/net/hold/data1,/data1,/data1/slurm,/home,/usr/lib64/nvidia /net/hold/data1/singularity-images/theano-gpu-2.img $0
|
|
fi
|
|
|
|
# Overide umask of execution node (sometime values are really strange)
|
|
umask 077
|
|
|
|
# source with arguments for DASH shells
|
|
sourcewithargs() {
|
|
script=$1
|
|
shift
|
|
. $script
|
|
}
|
|
# Setting environment variables as specified by user
|
|
export 'GRID_GLOBAL_JOBID=gsiftp://nsc.ijs.si:2811/jobs/e1PMDmzlk9qnOeFSGmVnjcgoABFKDmABFKDmZmMKDmABFKDmnXN6sm'
|
|
|
|
RUNTIME_JOB_DIR=/net/hold/data1/arc/session//e1PMDmzlk9qnOeFSGmVnjcgoABFKDmABFKDmZmMKDmABFKDmnXN6sm
|
|
RUNTIME_JOB_STDIN=/dev/null
|
|
RUNTIME_JOB_STDOUT=/net/hold/data1/arc/session//e1PMDmzlk9qnOeFSGmVnjcgoABFKDmABFKDmZmMKDmABFKDmnXN6sm/out.txt
|
|
RUNTIME_JOB_STDERR=/net/hold/data1/arc/session//e1PMDmzlk9qnOeFSGmVnjcgoABFKDmABFKDmZmMKDmABFKDmnXN6sm/err.txt
|
|
RUNTIME_JOB_DIAG=/net/hold/data1/arc/session//e1PMDmzlk9qnOeFSGmVnjcgoABFKDmABFKDmZmMKDmABFKDmnXN6sm.diag
|
|
if [ ! -z "$RUNTIME_GRIDAREA_DIR" ] ; then
|
|
RUNTIME_JOB_DIR=$RUNTIME_GRIDAREA_DIR/`basename $RUNTIME_JOB_DIR`
|
|
RUNTIME_JOB_STDIN=`echo "$RUNTIME_JOB_STDIN" | sed "s#^$RUNTIME_JOB_DIR#$RUNTIME_GRIDAREA_DIR#"`
|
|
RUNTIME_JOB_STDOUT=`echo "$RUNTIME_JOB_STDOUT" | sed "s#^$RUNTIME_JOB_DIR#$RUNTIME_GRIDAREA_DIR#"`
|
|
RUNTIME_JOB_STDERR=`echo "$RUNTIME_JOB_STDERR" | sed "s#^$RUNTIME_JOB_DIR#$RUNTIME_GRIDAREA_DIR#"`
|
|
RUNTIME_JOB_DIAG=`echo "$RUNTIME_JOB_DIAG" | sed "s#^$RUNTIME_JOB_DIR#$RUNTIME_GRIDAREA_DIR#"`
|
|
RUNTIME_CONTROL_DIR=`echo "$RUNTIME_CONTROL_DIR" | sed "s#^$RUNTIME_JOB_DIR#$RUNTIME_GRIDAREA_DIR#"`
|
|
fi
|
|
RUNTIME_LOCAL_SCRATCH_DIR=${RUNTIME_LOCAL_SCRATCH_DIR:-$WORKDIR}
|
|
RUNTIME_FRONTEND_SEES_NODE=${RUNTIME_FRONTEND_SEES_NODE:-}
|
|
RUNTIME_NODE_SEES_FRONTEND=${RUNTIME_NODE_SEES_FRONTEND:-yes}
|
|
if [ ! -z "$RUNTIME_LOCAL_SCRATCH_DIR" ] && [ ! -z "$RUNTIME_NODE_SEES_FRONTEND" ]; then
|
|
RUNTIME_NODE_JOB_DIR="$RUNTIME_LOCAL_SCRATCH_DIR"/`basename "$RUNTIME_JOB_DIR"`
|
|
rm -rf "$RUNTIME_NODE_JOB_DIR"
|
|
mkdir -p "$RUNTIME_NODE_JOB_DIR"
|
|
# move directory contents
|
|
for f in "$RUNTIME_JOB_DIR"/.* "$RUNTIME_JOB_DIR"/*; do
|
|
[ "$f" = "$RUNTIME_JOB_DIR/*" ] && continue # glob failed, no files
|
|
[ "$f" = "$RUNTIME_JOB_DIR/." ] && continue
|
|
[ "$f" = "$RUNTIME_JOB_DIR/.." ] && continue
|
|
[ "$f" = "$RUNTIME_JOB_DIR/.diag" ] && continue
|
|
[ "$f" = "$RUNTIME_JOB_DIR/.comment" ] && continue
|
|
if ! mv "$f" "$RUNTIME_NODE_JOB_DIR"; then
|
|
echo "Failed to move '$f' to '$RUNTIME_NODE_JOB_DIR'" 1>&2
|
|
exit 1
|
|
fi
|
|
done
|
|
if [ ! -z "$RUNTIME_FRONTEND_SEES_NODE" ] ; then
|
|
# creating link for whole directory
|
|
ln -s "$RUNTIME_FRONTEND_SEES_NODE"/`basename "$RUNTIME_JOB_DIR"` "$RUNTIME_JOB_DIR"
|
|
else
|
|
# keep stdout, stderr and control directory on frontend
|
|
# recreate job directory
|
|
mkdir -p "$RUNTIME_JOB_DIR"
|
|
# make those files
|
|
mkdir -p `dirname "$RUNTIME_JOB_STDOUT"`
|
|
mkdir -p `dirname "$RUNTIME_JOB_STDERR"`
|
|
touch "$RUNTIME_JOB_STDOUT"
|
|
touch "$RUNTIME_JOB_STDERR"
|
|
RUNTIME_JOB_STDOUT__=`echo "$RUNTIME_JOB_STDOUT" | sed "s#^${RUNTIME_JOB_DIR}#${RUNTIME_NODE_JOB_DIR}#"`
|
|
RUNTIME_JOB_STDERR__=`echo "$RUNTIME_JOB_STDERR" | sed "s#^${RUNTIME_JOB_DIR}#${RUNTIME_NODE_JOB_DIR}#"`
|
|
rm "$RUNTIME_JOB_STDOUT__" 2>/dev/null
|
|
rm "$RUNTIME_JOB_STDERR__" 2>/dev/null
|
|
if [ ! -z "$RUNTIME_JOB_STDOUT__" ] && [ "$RUNTIME_JOB_STDOUT" != "$RUNTIME_JOB_STDOUT__" ]; then
|
|
ln -s "$RUNTIME_JOB_STDOUT" "$RUNTIME_JOB_STDOUT__"
|
|
fi
|
|
if [ "$RUNTIME_JOB_STDOUT__" != "$RUNTIME_JOB_STDERR__" ] ; then
|
|
if [ ! -z "$RUNTIME_JOB_STDERR__" ] && [ "$RUNTIME_JOB_STDERR" != "$RUNTIME_JOB_STDERR__" ]; then
|
|
ln -s "$RUNTIME_JOB_STDERR" "$RUNTIME_JOB_STDERR__"
|
|
fi
|
|
fi
|
|
if [ ! -z "$RUNTIME_CONTROL_DIR" ] ; then
|
|
# move control directory back to frontend
|
|
RUNTIME_CONTROL_DIR__=`echo "$RUNTIME_CONTROL_DIR" | sed "s#^${RUNTIME_JOB_DIR}#${RUNTIME_NODE_JOB_DIR}#"`
|
|
mv "$RUNTIME_CONTROL_DIR__" "$RUNTIME_CONTROL_DIR"
|
|
fi
|
|
fi
|
|
# adjust stdin,stdout & stderr pointers
|
|
RUNTIME_JOB_STDIN=`echo "$RUNTIME_JOB_STDIN" | sed "s#^${RUNTIME_JOB_DIR}#${RUNTIME_NODE_JOB_DIR}#"`
|
|
RUNTIME_JOB_STDOUT=`echo "$RUNTIME_JOB_STDOUT" | sed "s#^${RUNTIME_JOB_DIR}#${RUNTIME_NODE_JOB_DIR}#"`
|
|
RUNTIME_JOB_STDERR=`echo "$RUNTIME_JOB_STDERR" | sed "s#^${RUNTIME_JOB_DIR}#${RUNTIME_NODE_JOB_DIR}#"`
|
|
RUNTIME_FRONTEND_JOB_DIR="$RUNTIME_JOB_DIR"
|
|
RUNTIME_JOB_DIR="$RUNTIME_NODE_JOB_DIR"
|
|
fi
|
|
if [ -z "$RUNTIME_NODE_SEES_FRONTEND" ] ; then
|
|
mkdir -p "$RUNTIME_JOB_DIR"
|
|
fi
|
|
|
|
RESULT=0
|
|
|
|
if [ "$RESULT" = '0' ] ; then
|
|
# Running runtime scripts
|
|
export RUNTIME_CONFIG_DIR=${RUNTIME_CONFIG_DIR:-/net/hold/data1/arc/runtime/}
|
|
runtimeenvironments=
|
|
if [ ! -z "$RUNTIME_CONFIG_DIR" ] ; then
|
|
if [ -r "${RUNTIME_CONFIG_DIR}/APPS/BASE/THEANO-GPU-0.9" ] ; then
|
|
runtimeenvironments="${runtimeenvironments}APPS/BASE/THEANO-GPU-0.9;"
|
|
cmdl=${RUNTIME_CONFIG_DIR}/APPS/BASE/THEANO-GPU-0.9
|
|
sourcewithargs $cmdl 1
|
|
if [ $? -ne '0' ] ; then
|
|
echo "Runtime APPS/BASE/THEANO-GPU-0.9 script failed " 1>&2
|
|
echo "Runtime APPS/BASE/THEANO-GPU-0.9 script failed " 1>"$RUNTIME_JOB_DIAG"
|
|
exit 1
|
|
fi
|
|
fi
|
|
fi
|
|
|
|
echo "runtimeenvironments=$runtimeenvironments" >> "$RUNTIME_JOB_DIAG"
|
|
if [ ! "X$SLURM_NODEFILE" = 'X' ] ; then
|
|
if [ -r "$SLURM_NODEFILE" ] ; then
|
|
cat "$SLURM_NODEFILE" | sed 's/\(.*\)/nodename=\1/' >> "$RUNTIME_JOB_DIAG"
|
|
NODENAME_WRITTEN="1"
|
|
else
|
|
SLURM_NODEFILE=
|
|
fi
|
|
fi
|
|
if [ "$RESULT" = '0' ] ; then
|
|
# Changing to session directory
|
|
HOME=$RUNTIME_JOB_DIR
|
|
export HOME
|
|
if ! cd "$RUNTIME_JOB_DIR"; then
|
|
echo "Failed to switch to '$RUNTIME_JOB_DIR'" 1>&2
|
|
RESULT=1
|
|
fi
|
|
if [ ! -z "$RESULT" ] && [ "$RESULT" != 0 ]; then
|
|
exit $RESULT
|
|
fi
|
|
nodename=`/bin/hostname -f`
|
|
echo "nodename=$nodename" >> "$RUNTIME_JOB_DIAG"
|
|
echo "Processors=1" >> "$RUNTIME_JOB_DIAG"
|
|
executable='./workbench.sh'
|
|
# Check if executable exists
|
|
if [ ! -f "$executable" ];
|
|
then
|
|
echo "Path \"$executable\" does not seem to exist" 1>$RUNTIME_JOB_STDOUT 2>$RUNTIME_JOB_STDERR 1>&2
|
|
exit 1
|
|
fi
|
|
# See if executable is a script, and extract the name of the interpreter
|
|
line1=`dd if="$executable" count=1 2>/dev/null | head -n 1`
|
|
command=`echo $line1 | sed -n 's/^#! *//p'`
|
|
interpreter=`echo $command | awk '{print $1}'`
|
|
if [ "$interpreter" = /usr/bin/env ]; then interpreter=`echo $command | awk '{print $2}'`; fi
|
|
# If it's a script and the interpreter is not found ...
|
|
[ "x$interpreter" = x ] || type "$interpreter" > /dev/null 2>&1 || {
|
|
|
|
echo "Cannot run $executable: $interpreter: not found" 1>$RUNTIME_JOB_STDOUT 2>$RUNTIME_JOB_STDERR 1>&2
|
|
exit 1; }
|
|
GNU_TIME='/usr/bin/time'
|
|
if [ ! -z "$GNU_TIME" ] && ! "$GNU_TIME" --version >/dev/null 2>&1; then
|
|
echo "WARNING: GNU time not found at: $GNU_TIME" 2>&1;
|
|
GNU_TIME=
|
|
fi
|
|
|
|
if [ -z "$GNU_TIME" ] ; then
|
|
"./workbench.sh" <$RUNTIME_JOB_STDIN 1>$RUNTIME_JOB_STDOUT 2>$RUNTIME_JOB_STDERR
|
|
else
|
|
$GNU_TIME -o "$RUNTIME_JOB_DIAG" -a -f 'WallTime=%es\nKernelTime=%Ss\nUserTime=%Us\nCPUUsage=%P\nMaxResidentMemory=%MkB\nAverageResidentMemory=%tkB\nAverageTotalMemory=%KkB\nAverageUnsharedMemory=%DkB\nAverageUnsharedStack=%pkB\nAverageSharedMemory=%XkB\nPageSize=%ZB\nMajorPageFaults=%F\nMinorPageFaults=%R\nSwaps=%W\nForcedSwitches=%c\nWaitSwitches=%w\nInputs=%I\nOutputs=%O\nSocketReceived=%r\nSocketSent=%s\nSignals=%k\n' "./workbench.sh" <$RUNTIME_JOB_STDIN 1>$RUNTIME_JOB_STDOUT 2>$RUNTIME_JOB_STDERR
|
|
|
|
fi
|
|
RESULT=$?
|
|
|
|
fi
|
|
fi
|
|
if [ ! -z "$RUNTIME_CONFIG_DIR" ] ; then
|
|
if [ -r "${RUNTIME_CONFIG_DIR}/APPS/BASE/THEANO-GPU-0.9" ] ; then
|
|
cmdl=${RUNTIME_CONFIG_DIR}/APPS/BASE/THEANO-GPU-0.9
|
|
sourcewithargs $cmdl 2
|
|
fi
|
|
fi
|
|
|
|
if [ ! -z "$RUNTIME_LOCAL_SCRATCH_DIR" ] ; then
|
|
find ./ -type l -exec rm -f "{}" ";"
|
|
find ./ -type f -exec chmod u+w "{}" ";"
|
|
chmod -R u-w "$RUNTIME_JOB_DIR"/'40_epoch.h5' 2>/dev/null
|
|
chmod -R u-w "$RUNTIME_JOB_DIR"/'workbench.py' 2>/dev/null
|
|
chmod -R u-w "$RUNTIME_JOB_DIR"/'workbench.sh' 2>/dev/null
|
|
chmod -R u-w "$RUNTIME_JOB_DIR"/'40_epoch_history.pkl' 2>/dev/null
|
|
chmod -R u-w "$RUNTIME_JOB_DIR"/'notes' 2>/dev/null
|
|
chmod -R u-w "$RUNTIME_JOB_DIR"/'out.txt' 2>/dev/null
|
|
chmod -R u-w "$RUNTIME_JOB_DIR"/'err.txt' 2>/dev/null
|
|
chmod -R u-w "$RUNTIME_JOB_DIR"/'gmlog' 2>/dev/null
|
|
find ./ -type f -perm /200 -exec rm -f "{}" ";"
|
|
find ./ -type f -exec chmod u+w "{}" ";"
|
|
fi
|
|
|
|
if [ ! -z "$RUNTIME_LOCAL_SCRATCH_DIR" ] && [ ! -z "$RUNTIME_NODE_SEES_FRONTEND" ]; then
|
|
if [ ! -z "$RUNTIME_FRONTEND_SEES_NODE" ] ; then
|
|
# just move it
|
|
rm -rf "$RUNTIME_FRONTEND_JOB_DIR"
|
|
destdir=`dirname "$RUNTIME_FRONTEND_JOB_DIR"`
|
|
if ! mv "$RUNTIME_NODE_JOB_DIR" "$destdir"; then
|
|
echo "Failed to move '$RUNTIME_NODE_JOB_DIR' to '$destdir'" 1>&2
|
|
RESULT=1
|
|
fi
|
|
else
|
|
# remove links
|
|
rm -f "$RUNTIME_JOB_STDOUT" 2>/dev/null
|
|
rm -f "$RUNTIME_JOB_STDERR" 2>/dev/null
|
|
# move directory contents
|
|
for f in "$RUNTIME_NODE_JOB_DIR"/.* "$RUNTIME_NODE_JOB_DIR"/*; do
|
|
[ "$f" = "$RUNTIME_NODE_JOB_DIR/*" ] && continue # glob failed, no files
|
|
[ "$f" = "$RUNTIME_NODE_JOB_DIR/." ] && continue
|
|
[ "$f" = "$RUNTIME_NODE_JOB_DIR/.." ] && continue
|
|
[ "$f" = "$RUNTIME_NODE_JOB_DIR/.diag" ] && continue
|
|
[ "$f" = "$RUNTIME_NODE_JOB_DIR/.comment" ] && continue
|
|
if ! mv "$f" "$RUNTIME_FRONTEND_JOB_DIR"; then
|
|
echo "Failed to move '$f' to '$RUNTIME_FRONTEND_JOB_DIR'" 1>&2
|
|
RESULT=1
|
|
fi
|
|
done
|
|
rm -rf "$RUNTIME_NODE_JOB_DIR"
|
|
fi
|
|
fi
|
|
echo "exitcode=$RESULT" >> "$RUNTIME_JOB_DIAG"
|
|
exit $RESULT
|
|
-------------------------------------------------------------------
|
|
|
|
job submitted successfully!
|
|
local job id: 734035
|
|
----- exiting submit_slurm_job -----
|
|
|
|
2017-09-06T08:22:24Z Job state change SUBMIT -> INLRMS Reason: Job is passed to LRMS
|
|
------- Contents of output stream forwarded by the LRMS ---------
|
|
WARNING: GNU time not found at: /usr/bin/time
|
|
slurmstepd: error: *** JOB 734035 ON nsc-fp006 CANCELLED AT 2017-09-06T22:52:34 DUE TO TIME LIMIT ***
|
|
------------------------- End of output -------------------------
|
|
2017-09-06T20:55:12Z Job state change INLRMS -> FINISHING Reason: Job finished executing in LRMS
|
|
2017-09-06T20:56:12Z Job state change FINISHING -> FINISHED Reason: Stage-out finished.
|
|
2017-09-06T21:49:02Z Job state change UNDEFINED -> FINISHED Reason: (Re)Accepting new job
|
|
2017-09-06T23:49:20Z Job state change UNDEFINED -> FINISHED Reason: (Re)Accepting new job
|
|
2017-09-07T01:49:53Z Job state change UNDEFINED -> FINISHED Reason: (Re)Accepting new job
|
|
2017-09-07T03:50:36Z Job state change UNDEFINED -> FINISHED Reason: (Re)Accepting new job
|
|
2017-09-07T05:50:39Z Job state change UNDEFINED -> FINISHED Reason: (Re)Accepting new job
|
|
2017-09-07T07:50:54Z Job state change UNDEFINED -> FINISHED Reason: (Re)Accepting new job
|