* sinfo-report-node-stats.sh: Simple tool to report status of compute nodes based on SLURM's "sinfo" output.
master
Wirawan Purwanto 3 years ago
parent 18d79dd34b
commit 19c833c3ff
  1. 68
      slurm/sinfo-report-node-stats.sh

@ -0,0 +1,68 @@
#!/bin/bash
#
# Query the status of active nodes from sinfo
#
# Outputs (all written under OUTDIR, with TIMESTAMP appended to the stem):
#   sinfo*.txt, sinfo-s*.txt, sinfo-N*.txt  - raw sinfo output (only when
#                                             SINFO_NODES is not supplied)
#   nodes-active*.txt        - unique names of nodes not in a fail/down state
#   nodes-active-types*.txt  - count of active nodes per node type
#   nodes-all-types*.txt     - count of all nodes per node type
DOC="Get information about active nodes from SLURM perspective.
Environment variables that affect this script:
* OUTDIR
* TIMESTAMP
* SINFO_NODES
"
: "${OUTDIR:=.}"

#######################################
# Print the sorted, unique node names found in a `sinfo -N` listing.
# `sinfo -N` gives one line per node:partition combination, so a node
# that belongs to several partitions shows up more than once.
# Arguments: $1 - path to the listing; its first line (header) is skipped
#            $2 - "active" to drop nodes whose 4th column (state) matches
#                 fail|down; anything else (or omitted) keeps every node
# Outputs:   node names on stdout, one per line
#######################################
list_node_names() {
  local listing=$1
  local mode=${2:-all}
  tail -n +2 "${listing}" \
    | awk -v mode="${mode}" 'mode != "active" || $4 !~ /fail|down/ {print $1}' \
    | sort \
    | uniq
}

#######################################
# Turn node names read from stdin into per-type counts.
# The node type is the node name with the trailing host number (-NNN)
# stripped.
# Outputs:   `uniq -c` style lines (count, then type) on stdout
#######################################
count_node_types() {
  # Stripping the suffix can break adjacency of equal types
  # (a-5, a-5-1, a-6 -> a, a-5, a), and uniq only merges adjacent lines,
  # so the stripped names must be sorted again before counting.
  sed -e 's/-[0-9][0-9]*$//' \
    | sort \
    | uniq -c
}

#######################################
# Collect (or reuse) the `sinfo -N` listing and write the node reports.
# Globals:   OUTDIR (read), TIMESTAMP (read/written),
#            SINFO_NODES (read/written)
# Returns:   0 on success, 1 if the node listing cannot be obtained
#######################################
main() {
  if [ ! -d "${OUTDIR}" ]; then
    echo "Error: output directory not found: ${OUTDIR}" >&2
    return 1
  fi

  # If the output of "sinfo -N" is not specified, we will fetch
  # the output from SLURM and include additional information.
  if [ -z "${SINFO_NODES:-}" ]; then
    if ! command -v sinfo > /dev/null 2>&1; then
      echo "Error: sinfo command not found; set SINFO_NODES to reuse a saved 'sinfo -N' output" >&2
      return 1
    fi

    if [ -n "${TIMESTAMP:-}" ]; then
      # An empty TIMESTAMP means "generate one", so these sentinel values
      # are how a caller asks for file names without any timestamp.
      case "$TIMESTAMP" in
        - | "(none)" | none)
          TIMESTAMP= # BLANK
          ;;
      esac
    else
      TIMESTAMP=$(date +"_%Y-%m-%dT%H.%M.%S")
    fi

    # FIXME: Yeah I know this can run into race condition, oh well.
    # sinfo -N should be considered the authoritative output.
    sinfo > "${OUTDIR}/sinfo${TIMESTAMP}.txt"
    sinfo -s > "${OUTDIR}/sinfo-s${TIMESTAMP}.txt"
    # Every report below is derived from this listing, so a failure here
    # is fatal instead of silently producing empty reports.
    if ! sinfo -N > "${OUTDIR}/sinfo-N${TIMESTAMP}.txt"; then
      echo "Error: 'sinfo -N' failed" >&2
      return 1
    fi
    SINFO_NODES="${OUTDIR}/sinfo-N${TIMESTAMP}.txt"
  else
    echo "Reusing node info from ${SINFO_NODES}"
    echo "Assigned TIMESTAMP=${TIMESTAMP:-(none)}"
  fi

  if [ ! -r "${SINFO_NODES}" ]; then
    echo "Error: cannot read node info file: ${SINFO_NODES}" >&2
    return 1
  fi

  local active_nodes="${OUTDIR}/nodes-active${TIMESTAMP}.txt"

  # Node names excluding fail and down states, as a sorted unique list.
  list_node_names "${SINFO_NODES}" active > "${active_nodes}"

  # Per-type count of the active nodes; the list written above is exactly
  # the input needed, so it is reused instead of being recomputed.
  count_node_types < "${active_nodes}" \
    > "${OUTDIR}/nodes-active-types${TIMESTAMP}.txt"

  # Per-type count of all nodes, whatever their state.
  list_node_names "${SINFO_NODES}" all \
    | count_node_types > "${OUTDIR}/nodes-all-types${TIMESTAMP}.txt"
}

main "$@"
Loading…
Cancel
Save