#!/bin/bash
#
# Query the status of active nodes from sinfo
#
# Environment variables that affect this script:
#
#   OUTDIR       Directory that receives every output file (default: ".").
#   TIMESTAMP    Suffix inserted into output file names just before ".txt".
#                The values "-", "none" and "(none)" request an empty suffix.
#                When unset/empty and SINFO_NODES is not given, a suffix of
#                the form _YYYY-mm-ddTHH.MM.SS is generated from the clock.
#   SINFO_NODES  Path to a previously saved "sinfo -N" listing.  When set,
#                SLURM is not queried and this file is analysed instead.
#
# Files written to OUTDIR (<ts> stands for the TIMESTAMP suffix):
#
#   sinfo<ts>.txt, sinfo-s<ts>.txt, sinfo-N<ts>.txt
#       Raw sinfo output (only when SINFO_NODES is not given).
#   nodes-active<ts>.txt        Unique node names whose state is not fail/down.
#   nodes-active-types<ts>.txt  Active nodes counted per host-name prefix.
#   nodes-all-types<ts>.txt     All nodes counted per host-name prefix.

# Short description of the script; not referenced anywhere else in this file.
# shellcheck disable=SC2034
DOC="Get information about active nodes from SLURM perspective.

Environment variables that affect this script:

* OUTDIR
* TIMESTAMP
* SINFO_NODES
"

#######################################
# Tell whether a TIMESTAMP value is one of the "no suffix" sentinels.
# Arguments: $1 - candidate TIMESTAMP value
# Returns:   0 for "-", "none" or "(none)"; 1 for anything else
#######################################
wants_blank_timestamp() {
  case "$1" in
    -|none|"(none)") return 0 ;;
  esac
  return 1
}

#######################################
# List the unique node names that are not in a fail/down state.
# `sinfo -N` prints one line per node:partition combination, so a node may
# appear several times; the header line is skipped and duplicates removed.
# Arguments: $1 - file holding "sinfo -N" output
# Outputs:   sorted unique node names, one per line
#######################################
active_node_names() {
  tail -n +2 "$1" \
    | awk '$4 !~ /fail|down/ {print $1}' \
    | sort -u
}

#######################################
# List the unique node names regardless of their state.
# Arguments: $1 - file holding "sinfo -N" output
# Outputs:   sorted unique node names, one per line
#######################################
all_node_names() {
  tail -n +2 "$1" \
    | awk '{print $1}' \
    | sort -u
}

#######################################
# Count nodes per type, the type being the name without its trailing
# host number (-NNN).
# Stripping the suffix can leave equal prefixes non-adjacent (for example
# a-1, a-1-5, a-2 becomes a, a-1, a), and uniq only merges adjacent lines,
# so the stripped names are sorted again before counting.
# Inputs:    node names on stdin, one per line
# Outputs:   "uniq -c" style lines: count followed by the type
#######################################
count_node_types() {
  sed -e 's/-[0-9][0-9]*$//' \
    | sort \
    | uniq -c
}

#######################################
# Collect (or reuse) the sinfo listings and derive the node reports.
# Globals:   OUTDIR (read, defaulted), TIMESTAMP (read, may be rewritten),
#            SINFO_NODES (read, set when SLURM is queried)
# Returns:   1 when a user-supplied SINFO_NODES cannot be read; otherwise the
#            status of the last report pipeline
#######################################
main() {
  : "${OUTDIR:=.}"

  # If the output of "sinfo -N" is not specified, we will fetch
  # the output from SLURM and include additional information.
  if [[ -z "${SINFO_NODES:-}" ]]; then
    if [[ -z "${TIMESTAMP:-}" ]]; then
      TIMESTAMP=$(date +"_%Y-%m-%dT%H.%M.%S")
    elif wants_blank_timestamp "${TIMESTAMP}"; then
      TIMESTAMP=
    fi

    # The three sinfo calls run one after another, so they may observe
    # slightly different cluster states.  The `sinfo -N` listing is the
    # authoritative one; the other two are saved for reference only.
    sinfo > "${OUTDIR}/sinfo${TIMESTAMP}.txt"
    sinfo -s > "${OUTDIR}/sinfo-s${TIMESTAMP}.txt"
    SINFO_NODES="${OUTDIR}/sinfo-N${TIMESTAMP}.txt"
    sinfo -N > "${SINFO_NODES}" \
      || printf "Warning: 'sinfo -N' failed; %s may be empty or incomplete\n" \
           "${SINFO_NODES}" >&2
  else
    # Honour the blank-suffix sentinels here as well, so that the report
    # names match the "(none)" shown in the message below.
    if wants_blank_timestamp "${TIMESTAMP:-}"; then
      TIMESTAMP=
    fi
    # A missing listing would otherwise produce three empty reports.
    if [[ ! -r "${SINFO_NODES}" ]]; then
      printf "Error: cannot read node list '%s'\n" "${SINFO_NODES}" >&2
      return 1
    fi
    echo "Reusing node info from ${SINFO_NODES}"
    echo "Assigned TIMESTAMP=${TIMESTAMP:-(none)}"
  fi

  local active_list="${OUTDIR}/nodes-active${TIMESTAMP}.txt"

  # Active nodes: names only, fail/down excluded.
  active_node_names "${SINFO_NODES}" > "${active_list}"

  # Active nodes per type, derived from the list written just above.
  count_node_types < "${active_list}" \
    > "${OUTDIR}/nodes-active-types${TIMESTAMP}.txt"

  # All nodes per type, whatever their state.
  all_node_names "${SINFO_NODES}" \
    | count_node_types > "${OUTDIR}/nodes-all-types${TIMESTAMP}.txt"
}

main "$@"