diff --git a/slurm/check-gpu-utilization.sh b/slurm/check-gpu-utilization.sh
new file mode 100755
index 0000000..fbc9cf7
--- /dev/null
+++ b/slurm/check-gpu-utilization.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+#
+# check-gpu-utilization.sh
+# Report utilization of the in-use (mix/alloc) GPU nodes and GPU jobs on a SLURM cluster
+#
+# Created: 2023-04-06
+
+set -eu
+
+# Must be a valid regex
+GPU_PARTITIONS='^gpu$'
+
+# list GPU nodes being utilized (partially/fully)
+LIST_GPU_NODES=( $(sinfo -N | awk '($3 ~ /'"$GPU_PARTITIONS"'/) && ($4 ~ /^(mix|alloc)$/) { print $1 }') )
+
+echo "$0"
+date
+
+# list all the GPU jobs:
+
+echo "=== LISTING OF ALL GPU JOBS ==="
+LIST_GPU_JOBS=$(squeue | awk '$2 ~ /'"$GPU_PARTITIONS"'/ { print }')
+echo "$LIST_GPU_JOBS"
+echo
+
+echo "=== LISTING OF GPU UTILIZATIONS PER NODE ==="
+for Node in "${LIST_GPU_NODES[@]}"; do
+    echo " :: node: $Node"
+    ssh "$Node" nvidia-smi
+    echo
+done
+echo
+
+echo "=== LISTING OF GPU JOB SPECIFICATIONS ==="
+for Job in $(echo "${LIST_GPU_JOBS}" | awk '{ if ($1 != "JOBID") { print($1) } }'); do
+    #echo " :: Job: $Job"
+    scontrol show job "$Job"
+    #echo
+done
+
+#squeue | awk '$2 ~ /'"$GPU_PARTITIONS"'/ { print }'