Tested on Wahab cluster.
master
parent
3aa1688f8e
commit
dfb9db6a60
1 changed files with 41 additions and 0 deletions
@ -0,0 +1,41 @@ |
|||||||
|
#!/bin/bash
#
# check-gpu-utilization.sh
#
# Report GPU usage on a SLURM cluster:
#   1. list every job that squeue reports in the GPU partition(s);
#   2. run nvidia-smi (over ssh) on each GPU node that is currently in use,
#      i.e. whose sinfo state is exactly "mix" or "alloc" -- nodes in any
#      other state (idle, down, drained, ...) are skipped;
#   3. print the `scontrol show job` record of each job from step 1.
#
# Requires: sinfo, squeue, scontrol, and non-interactive ssh access to the
# compute nodes.
#
# Created: 2023-04-06

set -eu
set -o pipefail   # a failing sinfo/squeue must not be masked by awk's success

# Must be a valid (awk) regex; matched against the PARTITION column.
GPU_PARTITIONS='^gpu$'

# The regex is handed to awk through the environment instead of being spliced
# into the program text, so characters such as "/" cannot break the awk
# script, and backslashes are not escape-processed as they would be with -v.
export GPU_PARTITION_REGEX="$GPU_PARTITIONS"

# List GPU nodes being utilized (partially/fully).
# `sinfo -N` default columns: NODELIST NODES PARTITION STATE.
# A node that belongs to several matching partitions is reported only once.
gpu_nodes_text=$(
  sinfo -N \
    | awk '($3 ~ ENVIRON["GPU_PARTITION_REGEX"]) && ($4 ~ /^(mix|alloc)$/) && !seen[$1]++ { print $1 }'
)
LIST_GPU_NODES=()
if [[ -n "$gpu_nodes_text" ]]; then
  mapfile -t LIST_GPU_NODES <<<"$gpu_nodes_text"
fi

echo "$0"
date

# List all the jobs.
# `squeue` default columns start with: JOBID PARTITION ...
echo "=== LISTING OF ALL GPU JOBS ==="
LIST_GPU_JOBS=$(squeue | awk '$2 ~ ENVIRON["GPU_PARTITION_REGEX"] { print }')
echo "$LIST_GPU_JOBS"
echo

echo "=== LISTING OF GPU UTILIZATIONS PER NODE ==="
# The length check keeps `set -u` from rejecting "${array[@]}" on an empty
# array (bash releases older than 4.4 treat that as an unbound variable).
if (( ${#LIST_GPU_NODES[@]} > 0 )); then
  for Node in "${LIST_GPU_NODES[@]}"; do
    echo " :: node: $Node"
    # One unreachable node must not abort the whole report under `set -e`.
    ssh "$Node" nvidia-smi \
      || printf 'WARNING: nvidia-smi failed on node %s\n' "$Node" >&2
    echo
  done
fi
echo

echo "=== LISTING OF GPU JOB SPECIFICATIONS ==="
# Job IDs are collected into an array rather than expanded unquoted in a
# `for ... in $(...)`: pending array jobs are shown as e.g. "123_[0-7]",
# which the shell would otherwise treat as a glob pattern.
gpu_job_ids=()
if [[ -n "$LIST_GPU_JOBS" ]]; then
  mapfile -t gpu_job_ids \
    < <(awk 'NF && $1 != "JOBID" { print $1 }' <<<"$LIST_GPU_JOBS")
fi
if (( ${#gpu_job_ids[@]} > 0 )); then
  for Job in "${gpu_job_ids[@]}"; do
    # A job may finish between the squeue snapshot and this call; report the
    # failure and carry on with the remaining jobs.
    scontrol show job "$Job" \
      || printf 'WARNING: scontrol show job %s failed\n' "$Job" >&2
  done
fi
Loading…
Reference in new issue