#!/bin/bash
#
# check-gpu-utilization.sh
# Given a SLURM cluster, enumerate the GPU nodes that are currently in use
# (state "mix" or "alloc") and report their GPU utilization and running jobs
#
# Created: 2023-04-06

set -eu

# Must be a valid awk regex matching the GPU partition name(s)
GPU_PARTITIONS='^gpu$'

# List GPU nodes being utilized (partially/fully).
# With the default "sinfo -N" output, $1=NODELIST, $3=PARTITION, $4=STATE.
LIST_GPU_NODES=( $(sinfo -N | awk '($3 ~ /'"$GPU_PARTITIONS"'/) && ($4 ~ /^(mix|alloc)$/) { print $1 }') )

echo "$0"
date

# List all the jobs in the GPU partition(s).
# With the default "squeue" output, $2=PARTITION.
echo "=== LISTING OF ALL GPU JOBS ==="
LIST_GPU_JOBS=$(squeue | awk '$2 ~ /'"$GPU_PARTITIONS"'/ { print }')
echo "$LIST_GPU_JOBS"
echo

# Run nvidia-smi on each busy GPU node to show per-GPU utilization.
echo "=== LISTING OF GPU UTILIZATIONS PER NODE ==="
for Node in "${LIST_GPU_NODES[@]}"; do
    echo " :: node: $Node"
    ssh "$Node" nvidia-smi
    echo
done
echo

# Show the full job specification for every GPU job found above.
echo "=== LISTING OF GPU JOB SPECIFICATIONS ==="
for Job in $(echo "${LIST_GPU_JOBS}" | awk '{ if ($1 != "JOBID") { print($1) } }'); do
    #echo " :: Job: $Job"
    scontrol show job "$Job"
    #echo
done

#squeue | awk '$2 ~ /'"$GPU_PARTITIONS"'/ { print }'
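
# Usage sketch (the report filename and extra partition names below are
# illustrative, not part of the script). It assumes the SLURM client tools
# (sinfo/squeue/scontrol) are on PATH, passwordless SSH from this host to the
# GPU nodes, and the default sinfo -N / squeue output formats:
#
#   ./check-gpu-utilization.sh > gpu-report.txt
#
# If the cluster has several GPU partitions, GPU_PARTITIONS can be widened to
# match them all, e.g. GPU_PARTITIONS='^(gpu|gpu2)$' (hypothetical names).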