You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

260 lines
12 KiB

2017-09-06T08:21:51Z Job state change UNDEFINED -> ACCEPTED Reason: (Re)Accepting new job
2017-09-06T08:21:51Z Job state change ACCEPTED -> PREPARING Reason: Starting job processing
2017-09-06T08:22:24Z Job state change PREPARING -> SUBMIT Reason: Pre-staging finished, passing job to LRMS
----- starting submit_slurm_job -----
SLURM jobname: accetuation_nn
SLURM job script built
SLURM script follows:
-------------------------------------------------------------------
#!/bin/bash -l
# SLURM batch job script built by grid-manager
#SBATCH --no-requeue
#SBATCH -e /net/hold/data1/arc/session//e1PMDmzlk9qnOeFSGmVnjcgoABFKDmABFKDmZmMKDmABFKDmnXN6sm.comment
#SBATCH -o /net/hold/data1/arc/session//e1PMDmzlk9qnOeFSGmVnjcgoABFKDmABFKDmZmMKDmABFKDmnXN6sm.comment
#SBATCH -p gridlong
#SBATCH --nice=50
#SBATCH -J 'accetuation_nn'
#SBATCH --get-user-env=10L
#SBATCH -n 1
#SBATCH --constraint=gpu --gres=gpu:1
#SBATCH -t 750:0
#SBATCH -t 750:0
#SBATCH --mem-per-cpu=12000
# Run the rest of this script inside the Singularity image required by the
# RTE: while SINGULARITY_CONTAINER is empty the script re-executes itself
# ($0) through `singularity exec`; once the variable is non-empty the block
# is skipped, which prevents an endless exec loop.
# NOTE(review): presumably Singularity sets SINGULARITY_CONTAINER inside the
# container -- confirm against the installed Singularity version.
# Both expansions are quoted so that an unusual value (whitespace, glob
# characters) cannot break the test or split the script path.
if [ -z "$SINGULARITY_CONTAINER" ]; then
  exec /bin/singularity exec -B /var/spool/slurm,/cvmfs,/net/hold/data1,/data1,/data1/slurm,/home,/usr/lib64/nvidia /net/hold/data1/singularity-images/theano-gpu-2.img "$0"
fi
# Override the umask of the execution node (its value is sometimes really
# strange); 077 removes all group/other permissions from new files.
umask 077
# Source a script while handing it positional arguments, for DASH-like
# shells where arguments cannot be given to the `.` builtin directly.
# Arguments: $1    - path of the script to source
#            $2... - become $1, $2, ... as seen by the sourced script
# Globals:   script (written) - path of the script being sourced
# Returns:   exit status of the sourced script
sourcewithargs() {
  script=$1
  shift
  # Quoted so that a path containing whitespace or glob characters is
  # sourced as a single file instead of being word-split.
  . "$script"
}
# Job identity exported to the payload, as specified by the user
export 'GRID_GLOBAL_JOBID=gsiftp://nsc.ijs.si:2811/jobs/e1PMDmzlk9qnOeFSGmVnjcgoABFKDmABFKDmZmMKDmABFKDmnXN6sm'
# Session directory, plus the standard streams and diagnostics file that
# are derived from it
RUNTIME_JOB_DIR=/net/hold/data1/arc/session//e1PMDmzlk9qnOeFSGmVnjcgoABFKDmABFKDmZmMKDmABFKDmnXN6sm
RUNTIME_JOB_STDIN=/dev/null
RUNTIME_JOB_STDOUT=${RUNTIME_JOB_DIR}/out.txt
RUNTIME_JOB_STDERR=${RUNTIME_JOB_DIR}/err.txt
RUNTIME_JOB_DIAG=${RUNTIME_JOB_DIR}.diag
# With a grid area directory configured, RUNTIME_JOB_DIR becomes
# <grid area>/<session dir name>; every path that starts with that new
# directory then has the prefix replaced by the grid area directory.
if [ -n "$RUNTIME_GRIDAREA_DIR" ]; then
  RUNTIME_JOB_DIR=$RUNTIME_GRIDAREA_DIR/$(basename "$RUNTIME_JOB_DIR")
  RUNTIME_JOB_STDIN=$(echo "$RUNTIME_JOB_STDIN" | sed "s#^$RUNTIME_JOB_DIR#$RUNTIME_GRIDAREA_DIR#")
  RUNTIME_JOB_STDOUT=$(echo "$RUNTIME_JOB_STDOUT" | sed "s#^$RUNTIME_JOB_DIR#$RUNTIME_GRIDAREA_DIR#")
  RUNTIME_JOB_STDERR=$(echo "$RUNTIME_JOB_STDERR" | sed "s#^$RUNTIME_JOB_DIR#$RUNTIME_GRIDAREA_DIR#")
  RUNTIME_JOB_DIAG=$(echo "$RUNTIME_JOB_DIAG" | sed "s#^$RUNTIME_JOB_DIR#$RUNTIME_GRIDAREA_DIR#")
  RUNTIME_CONTROL_DIR=$(echo "$RUNTIME_CONTROL_DIR" | sed "s#^$RUNTIME_JOB_DIR#$RUNTIME_GRIDAREA_DIR#")
fi
# Stage-in to node-local scratch.
# When a local scratch directory is configured and the node can see the
# frontend, the session directory contents are moved into
# <scratch>/<session dir name> and the RUNTIME_JOB_* variables are
# repointed at that copy; the original location is remembered in
# RUNTIME_FRONTEND_JOB_DIR.
# Defaults: scratch falls back to $WORKDIR, FRONTEND_SEES_NODE to empty,
# NODE_SEES_FRONTEND to "yes".
RUNTIME_LOCAL_SCRATCH_DIR=${RUNTIME_LOCAL_SCRATCH_DIR:-$WORKDIR}
RUNTIME_FRONTEND_SEES_NODE=${RUNTIME_FRONTEND_SEES_NODE:-}
RUNTIME_NODE_SEES_FRONTEND=${RUNTIME_NODE_SEES_FRONTEND:-yes}
if [ ! -z "$RUNTIME_LOCAL_SCRATCH_DIR" ] && [ ! -z "$RUNTIME_NODE_SEES_FRONTEND" ]; then
RUNTIME_NODE_JOB_DIR="$RUNTIME_LOCAL_SCRATCH_DIR"/`basename "$RUNTIME_JOB_DIR"`
# start from an empty node-side directory (anything already there is deleted)
rm -rf "$RUNTIME_NODE_JOB_DIR"
mkdir -p "$RUNTIME_NODE_JOB_DIR"
# move directory contents
# (hidden entries included; ".", "..", ".diag" and ".comment" are left behind)
for f in "$RUNTIME_JOB_DIR"/.* "$RUNTIME_JOB_DIR"/*; do
[ "$f" = "$RUNTIME_JOB_DIR/*" ] && continue # glob failed, no files
[ "$f" = "$RUNTIME_JOB_DIR/." ] && continue
[ "$f" = "$RUNTIME_JOB_DIR/.." ] && continue
[ "$f" = "$RUNTIME_JOB_DIR/.diag" ] && continue
[ "$f" = "$RUNTIME_JOB_DIR/.comment" ] && continue
# a failed move aborts the whole job with exit status 1
if ! mv "$f" "$RUNTIME_NODE_JOB_DIR"; then
echo "Failed to move '$f' to '$RUNTIME_NODE_JOB_DIR'" 1>&2
exit 1
fi
done
if [ ! -z "$RUNTIME_FRONTEND_SEES_NODE" ] ; then
# creating link for whole directory
# NOTE(review): RUNTIME_JOB_DIR is not removed before this point, so if it
# still exists as a directory ln places the link inside it -- confirm intent.
ln -s "$RUNTIME_FRONTEND_SEES_NODE"/`basename "$RUNTIME_JOB_DIR"` "$RUNTIME_JOB_DIR"
else
# keep stdout, stderr and control directory on frontend
# recreate job directory
mkdir -p "$RUNTIME_JOB_DIR"
# make those files
mkdir -p `dirname "$RUNTIME_JOB_STDOUT"`
mkdir -p `dirname "$RUNTIME_JOB_STDERR"`
touch "$RUNTIME_JOB_STDOUT"
touch "$RUNTIME_JOB_STDERR"
# node-side names of stdout/stderr: same paths with the session directory
# prefix replaced by the node job directory
RUNTIME_JOB_STDOUT__=`echo "$RUNTIME_JOB_STDOUT" | sed "s#^${RUNTIME_JOB_DIR}#${RUNTIME_NODE_JOB_DIR}#"`
RUNTIME_JOB_STDERR__=`echo "$RUNTIME_JOB_STDERR" | sed "s#^${RUNTIME_JOB_DIR}#${RUNTIME_NODE_JOB_DIR}#"`
# drop anything already at the node-side names (errors ignored), then make
# them symlinks to the frontend files so output lands on the frontend
rm "$RUNTIME_JOB_STDOUT__" 2>/dev/null
rm "$RUNTIME_JOB_STDERR__" 2>/dev/null
if [ ! -z "$RUNTIME_JOB_STDOUT__" ] && [ "$RUNTIME_JOB_STDOUT" != "$RUNTIME_JOB_STDOUT__" ]; then
ln -s "$RUNTIME_JOB_STDOUT" "$RUNTIME_JOB_STDOUT__"
fi
# the stderr link is skipped when stdout and stderr share one node-side path
if [ "$RUNTIME_JOB_STDOUT__" != "$RUNTIME_JOB_STDERR__" ] ; then
if [ ! -z "$RUNTIME_JOB_STDERR__" ] && [ "$RUNTIME_JOB_STDERR" != "$RUNTIME_JOB_STDERR__" ]; then
ln -s "$RUNTIME_JOB_STDERR" "$RUNTIME_JOB_STDERR__"
fi
fi
if [ ! -z "$RUNTIME_CONTROL_DIR" ] ; then
# move control directory back to frontend
RUNTIME_CONTROL_DIR__=`echo "$RUNTIME_CONTROL_DIR" | sed "s#^${RUNTIME_JOB_DIR}#${RUNTIME_NODE_JOB_DIR}#"`
mv "$RUNTIME_CONTROL_DIR__" "$RUNTIME_CONTROL_DIR"
fi
fi
# adjust stdin,stdout & stderr pointers
RUNTIME_JOB_STDIN=`echo "$RUNTIME_JOB_STDIN" | sed "s#^${RUNTIME_JOB_DIR}#${RUNTIME_NODE_JOB_DIR}#"`
RUNTIME_JOB_STDOUT=`echo "$RUNTIME_JOB_STDOUT" | sed "s#^${RUNTIME_JOB_DIR}#${RUNTIME_NODE_JOB_DIR}#"`
RUNTIME_JOB_STDERR=`echo "$RUNTIME_JOB_STDERR" | sed "s#^${RUNTIME_JOB_DIR}#${RUNTIME_NODE_JOB_DIR}#"`
# from here on RUNTIME_JOB_DIR means the node-side directory; the frontend
# location is kept for the later move back
RUNTIME_FRONTEND_JOB_DIR="$RUNTIME_JOB_DIR"
RUNTIME_JOB_DIR="$RUNTIME_NODE_JOB_DIR"
fi
# when the node cannot see the frontend, just make sure the job directory exists
if [ -z "$RUNTIME_NODE_SEES_FRONTEND" ] ; then
mkdir -p "$RUNTIME_JOB_DIR"
fi
# Main execution phase.
#   1. Source the runtime-environment (RTE) script with argument 1; a
#      failure is reported on stderr and in the .diag file and aborts the job.
#   2. Append bookkeeping (runtimeenvironments, nodename, Processors) to the
#      .diag file.
#   3. Change into the session directory, check that the executable exists
#      and that its interpreter can be found, then run it (under GNU time
#      when available) with stdin/stdout/stderr taken from RUNTIME_JOB_STD*.
# The executable's exit status is left in RESULT.
# All expansions used as redirection targets or command arguments are
# quoted so that paths with whitespace do not cause "ambiguous redirect"
# errors or get word-split.
RESULT=0
if [ "$RESULT" = '0' ] ; then
  # Running runtime scripts
  export RUNTIME_CONFIG_DIR=${RUNTIME_CONFIG_DIR:-/net/hold/data1/arc/runtime/}
  runtimeenvironments=
  if [ ! -z "$RUNTIME_CONFIG_DIR" ] ; then
    if [ -r "${RUNTIME_CONFIG_DIR}/APPS/BASE/THEANO-GPU-0.9" ] ; then
      runtimeenvironments="${runtimeenvironments}APPS/BASE/THEANO-GPU-0.9;"
      cmdl=${RUNTIME_CONFIG_DIR}/APPS/BASE/THEANO-GPU-0.9
      # argument 1 = the pass that runs before the job's executable
      sourcewithargs "$cmdl" 1
      if [ $? -ne '0' ] ; then
        echo "Runtime APPS/BASE/THEANO-GPU-0.9 script failed " 1>&2
        # note: this overwrites (not appends to) the .diag file
        echo "Runtime APPS/BASE/THEANO-GPU-0.9 script failed " 1>"$RUNTIME_JOB_DIAG"
        exit 1
      fi
    fi
  fi
  echo "runtimeenvironments=$runtimeenvironments" >> "$RUNTIME_JOB_DIAG"
  # Record every node listed in SLURM_NODEFILE, if that file is readable
  if [ ! "X$SLURM_NODEFILE" = 'X' ] ; then
    if [ -r "$SLURM_NODEFILE" ] ; then
      cat "$SLURM_NODEFILE" | sed 's/\(.*\)/nodename=\1/' >> "$RUNTIME_JOB_DIAG"
      # NOTE(review): NODENAME_WRITTEN is set here but is not read anywhere
      # in this script; the hostname below is recorded unconditionally.
      NODENAME_WRITTEN="1"
    else
      SLURM_NODEFILE=
    fi
  fi
  if [ "$RESULT" = '0' ] ; then
    # Changing to session directory
    HOME=$RUNTIME_JOB_DIR
    export HOME
    if ! cd "$RUNTIME_JOB_DIR"; then
      echo "Failed to switch to '$RUNTIME_JOB_DIR'" 1>&2
      RESULT=1
    fi
    if [ ! -z "$RESULT" ] && [ "$RESULT" != 0 ]; then
      exit $RESULT
    fi
    nodename=`/bin/hostname -f`
    echo "nodename=$nodename" >> "$RUNTIME_JOB_DIAG"
    echo "Processors=1" >> "$RUNTIME_JOB_DIAG"
    executable='./workbench.sh'
    # Check if executable exists
    # (the redirections create both job stream files; the message itself
    # ends up in the job's stderr file because of the final 1>&2)
    if [ ! -f "$executable" ]; then
      echo "Path \"$executable\" does not seem to exist" 1>"$RUNTIME_JOB_STDOUT" 2>"$RUNTIME_JOB_STDERR" 1>&2
      exit 1
    fi
    # See if executable is a script, and extract the name of the interpreter
    line1=`dd if="$executable" count=1 2>/dev/null | head -n 1`
    command=`printf '%s\n' "$line1" | sed -n 's/^#! *//p'`
    interpreter=`printf '%s\n' "$command" | awk '{print $1}'`
    # for "#!/usr/bin/env NAME" the real interpreter is the second word
    if [ "$interpreter" = /usr/bin/env ]; then
      interpreter=`printf '%s\n' "$command" | awk '{print $2}'`
    fi
    # If it's a script and the interpreter is not found ...
    [ "x$interpreter" = x ] || type "$interpreter" > /dev/null 2>&1 || {
      echo "Cannot run $executable: $interpreter: not found" 1>"$RUNTIME_JOB_STDOUT" 2>"$RUNTIME_JOB_STDERR" 1>&2
      exit 1; }
    GNU_TIME='/usr/bin/time'
    if [ ! -z "$GNU_TIME" ] && ! "$GNU_TIME" --version >/dev/null 2>&1; then
      # a warning belongs on stderr (the original "2>&1" left it on stdout)
      echo "WARNING: GNU time not found at: $GNU_TIME" 1>&2
      GNU_TIME=
    fi
    # Run the job; with GNU time available, resource usage is appended (-a)
    # to the .diag file in key=value form.
    if [ -z "$GNU_TIME" ] ; then
      "./workbench.sh" <"$RUNTIME_JOB_STDIN" 1>"$RUNTIME_JOB_STDOUT" 2>"$RUNTIME_JOB_STDERR"
    else
      "$GNU_TIME" -o "$RUNTIME_JOB_DIAG" -a -f 'WallTime=%es\nKernelTime=%Ss\nUserTime=%Us\nCPUUsage=%P\nMaxResidentMemory=%MkB\nAverageResidentMemory=%tkB\nAverageTotalMemory=%KkB\nAverageUnsharedMemory=%DkB\nAverageUnsharedStack=%pkB\nAverageSharedMemory=%XkB\nPageSize=%ZB\nMajorPageFaults=%F\nMinorPageFaults=%R\nSwaps=%W\nForcedSwitches=%c\nWaitSwitches=%w\nInputs=%I\nOutputs=%O\nSocketReceived=%r\nSocketSent=%s\nSignals=%k\n' "./workbench.sh" <"$RUNTIME_JOB_STDIN" 1>"$RUNTIME_JOB_STDOUT" 2>"$RUNTIME_JOB_STDERR"
    fi
    # exit status of whichever branch ran the executable
    RESULT=$?
  fi
fi
# Source the runtime-environment script again, this time with argument 2
# (the pass that runs after the job's executable). Its exit status is not
# checked. The path is quoted so a RUNTIME_CONFIG_DIR containing whitespace
# is passed to sourcewithargs as a single argument.
if [ ! -z "$RUNTIME_CONFIG_DIR" ] ; then
  if [ -r "${RUNTIME_CONFIG_DIR}/APPS/BASE/THEANO-GPU-0.9" ] ; then
    cmdl=${RUNTIME_CONFIG_DIR}/APPS/BASE/THEANO-GPU-0.9
    sourcewithargs "$cmdl" 2
  fi
fi
# Scratch cleanup: delete everything under the current directory except the
# entries listed below, using the owner-write bit as a temporary "delete me"
# marker. Runs only when a local scratch directory is configured.
# NOTE(review): find works on "./" while chmod works on "$RUNTIME_JOB_DIR";
# this relies on the current directory being the job directory -- confirm.
if [ ! -z "$RUNTIME_LOCAL_SCRATCH_DIR" ] ; then
# 1. remove every symbolic link
find ./ -type l -exec rm -f "{}" ";"
# 2. mark every regular file owner-writable
find ./ -type f -exec chmod u+w "{}" ";"
# 3. clear the owner-write bit (recursively) on the entries to keep;
#    missing entries are ignored (errors discarded)
chmod -R u-w "$RUNTIME_JOB_DIR"/'40_epoch.h5' 2>/dev/null
chmod -R u-w "$RUNTIME_JOB_DIR"/'workbench.py' 2>/dev/null
chmod -R u-w "$RUNTIME_JOB_DIR"/'workbench.sh' 2>/dev/null
chmod -R u-w "$RUNTIME_JOB_DIR"/'40_epoch_history.pkl' 2>/dev/null
chmod -R u-w "$RUNTIME_JOB_DIR"/'notes' 2>/dev/null
chmod -R u-w "$RUNTIME_JOB_DIR"/'out.txt' 2>/dev/null
chmod -R u-w "$RUNTIME_JOB_DIR"/'err.txt' 2>/dev/null
chmod -R u-w "$RUNTIME_JOB_DIR"/'gmlog' 2>/dev/null
# 4. delete every regular file that still has the owner-write bit set,
#    i.e. everything that was not protected in step 3
find ./ -type f -perm /200 -exec rm -f "{}" ";"
# 5. give the surviving files their owner-write bit back
find ./ -type f -exec chmod u+w "{}" ";"
fi
# Stage-out from node-local scratch back to the frontend session directory,
# then record the exit code and terminate. Runs under the same condition as
# the stage-in (scratch configured and node sees frontend).
if [ ! -z "$RUNTIME_LOCAL_SCRATCH_DIR" ] && [ ! -z "$RUNTIME_NODE_SEES_FRONTEND" ]; then
if [ ! -z "$RUNTIME_FRONTEND_SEES_NODE" ] ; then
# just move it
# (the frontend path is deleted first, then the whole node directory is
# moved into the frontend path's parent directory)
rm -rf "$RUNTIME_FRONTEND_JOB_DIR"
destdir=`dirname "$RUNTIME_FRONTEND_JOB_DIR"`
if ! mv "$RUNTIME_NODE_JOB_DIR" "$destdir"; then
echo "Failed to move '$RUNTIME_NODE_JOB_DIR' to '$destdir'" 1>&2
RESULT=1
fi
else
# remove links
# (RUNTIME_JOB_STDOUT/STDERR were repointed to the node-side paths during
# stage-in, so these are the node-side entries; errors are ignored)
rm -f "$RUNTIME_JOB_STDOUT" 2>/dev/null
rm -f "$RUNTIME_JOB_STDERR" 2>/dev/null
# move directory contents
# (hidden entries included; ".", "..", ".diag" and ".comment" are skipped)
for f in "$RUNTIME_NODE_JOB_DIR"/.* "$RUNTIME_NODE_JOB_DIR"/*; do
[ "$f" = "$RUNTIME_NODE_JOB_DIR/*" ] && continue # glob failed, no files
[ "$f" = "$RUNTIME_NODE_JOB_DIR/." ] && continue
[ "$f" = "$RUNTIME_NODE_JOB_DIR/.." ] && continue
[ "$f" = "$RUNTIME_NODE_JOB_DIR/.diag" ] && continue
[ "$f" = "$RUNTIME_NODE_JOB_DIR/.comment" ] && continue
# a failed move marks the job as failed but the loop keeps going
if ! mv "$f" "$RUNTIME_FRONTEND_JOB_DIR"; then
echo "Failed to move '$f' to '$RUNTIME_FRONTEND_JOB_DIR'" 1>&2
RESULT=1
fi
done
# NOTE(review): the node directory is removed even when a move above
# failed, so files that could not be moved are deleted -- confirm intent.
rm -rf "$RUNTIME_NODE_JOB_DIR"
fi
fi
# append the final status to the .diag file and exit with it
echo "exitcode=$RESULT" >> "$RUNTIME_JOB_DIAG"
exit $RESULT
-------------------------------------------------------------------
job submitted successfully!
local job id: 734035
----- exiting submit_slurm_job -----
2017-09-06T08:22:24Z Job state change SUBMIT -> INLRMS Reason: Job is passed to LRMS
------- Contents of output stream forwarded by the LRMS ---------
WARNING: GNU time not found at: /usr/bin/time
slurmstepd: error: *** JOB 734035 ON nsc-fp006 CANCELLED AT 2017-09-06T22:52:34 DUE TO TIME LIMIT ***
------------------------- End of output -------------------------
2017-09-06T20:55:12Z Job state change INLRMS -> FINISHING Reason: Job finished executing in LRMS
2017-09-06T20:56:12Z Job state change FINISHING -> FINISHED Reason: Stage-out finished.
2017-09-06T21:49:02Z Job state change UNDEFINED -> FINISHED Reason: (Re)Accepting new job
2017-09-06T23:49:20Z Job state change UNDEFINED -> FINISHED Reason: (Re)Accepting new job
2017-09-07T01:49:53Z Job state change UNDEFINED -> FINISHED Reason: (Re)Accepting new job
2017-09-07T03:50:36Z Job state change UNDEFINED -> FINISHED Reason: (Re)Accepting new job
2017-09-07T05:50:39Z Job state change UNDEFINED -> FINISHED Reason: (Re)Accepting new job
2017-09-07T07:50:54Z Job state change UNDEFINED -> FINISHED Reason: (Re)Accepting new job