Custom HPC software & tools from Wirawan. Primarily tailored toward ODU HPC systems.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

185 lines
4.4 KiB

#!/bin/bash
# 20151028
#
# Note: original extraction command on turing:
#
# qstat -f | grep -ve '^[-# ]' -e '^queuename' | less
#
# Option defaults. A value already present in the environment is kept;
# only an unset variable receives the default of 0.
#   optShowDisabledNodes  passed to the gawk programs below to decide whether
#                         hosts with "d" in their states column are included
#   optPrintRaw           passed to the per-machine-type gawk program to echo
#                         the host records it processes
: "${optShowDisabledNodes=0}" "${optPrintRaw=0}"
# Runs `qstat -f` and pipes its output through
# _Process_node_slot_stats_raw, which reads it on stdin.
# Arguments: none.
# Outputs:   whatever the processing function writes to stdout.
node_slot_stats_raw() {
  qstat -f \
    | _Process_node_slot_stats_raw
}
# File-based variant of node_slot_stats_raw: instead of running qstat,
# hands its first argument to _Process_node_slot_stats_raw.
# Arguments: $1 - forwarded as the single argument of the processing
#                 function (which gives it to gawk as an input file).
node_slot_stats_raw_f() {
  local qstat_dump=$1
  _Process_node_slot_stats_raw "$qstat_dump"
}
# Filters `qstat -f` output down to the column header and the host status
# records, printing each kept line unchanged:
# - job lines and separator lines are dropped (they do not start with a
#   letter or do not have 5 or 6 fields)
# - hosts whose states column contains "d" are dropped unless
#   optShowDisabledNodes is something other than 0
#
# Arguments: optional input file(s) for gawk; with none, stdin is read.
# Globals:   optShowDisabledNodes (read)
_Process_node_slot_stats_raw() {
  gawk -v optShowDisabledNodes="$optShowDisabledNodes" '
    # Column header line: pass it through untouched.
    FNR == 1 && $1 == "queuename" {
      print
      next
    }

    # Host status record: starts with a letter and has 5 fields,
    # or 6 when a states column is present.
    /^[A-Za-z]/ && (NF == 5 || NF == 6) {
      if (optShowDisabledNodes != 0 || $6 !~ /d/)
        print
    }
  ' "$@"
}
# Prints the status of slot availability per machine type, where a machine
# type is the set of hosts sharing the same base hostname (e.g. "c6-" or
# "c8-"). Originally implemented based on the naming of hosts on the Turing
# cluster.
#
# Runs `qstat -f` and pipes it into
# _Process_node_slot_stats_per_machine_type, which produces the table.
#
# Example output (changes depending on what is disabled and on the load of
# the cluster):
#
#   MACHTYPE         NODE CORES  used  free  resv
#   c6                 15   240    77   163     0
#   c8                 40   768   569   199     0
#   cr                 74  1480   988   492     0
#   crhimem             3    96     0    96     0
#   crphi              10   200    48   152     0
#   d430               49  1568  1292   276     0
#   d730               10   280    10   270     0
#
# FIXME: A machine covered by more than one queue is listed once per queue
# in the qstat output; the counts are overestimated unless the processing
# routine registers each machine it has encountered and does not account
# for it again.
node_slot_stats_per_machine_type() {
  qstat -f \
    | _Process_node_slot_stats_per_machine_type
}
# Aggregates the host status records of `qstat -f` output into one summary
# row per machine type and prints the table on stdout.
#
# The first field of a host record must look like QUEUE@KIND-NUM; KIND (the
# text between "@" and the next "-") is the machine type, KIND-NUM the host
# name. The third field is split on "/" into reserved/used/total slots.
#
# A host that appears under several queues is counted only once: the first
# record wins and every later record for the same host name is reported on
# stderr and skipped, so its slots are not added to the totals again.
#
# Arguments: optional input file(s) for gawk; with none, stdin is read.
# Globals (read):
#   optShowDisabledNodes  0, empty or unset: skip hosts whose states column
#                         contains "d"; any other value: include them
#   optPrintRaw           0, empty or unset: table only; any other value:
#                         also echo each host record that passed the
#                         disabled-host filter, before the table
# Outputs:
#   stdout - header line plus one row per machine type, sorted by type name
#   stderr - "Invalid queue/host combo: ..." and "Host already seen: ..."
_Process_node_slot_stats_per_machine_type() {
  # An empty string does not compare equal to 0 inside awk, so fall back to
  # 0 explicitly when an option variable is empty or unset.
  gawk \
    -v optShowDisabledNodes="${optShowDisabledNodes:-0}" \
    -v optPrintRaw="${optPrintRaw:-0}" \
    '
    BEGIN {
      STDERR = "/dev/stderr"
    }

    # Column header line printed at the top of the input.
    FNR == 1 && $1 == "queuename" { next }

    # Host status record: starts with a letter and has 5 fields,
    # or 6 when a states column is present.
    /^[A-Za-z]/ && (NF == 5 || NF == 6) {
      queue_node = $1
      core_usage_combo = $3
      states = $6   # empty when the record has no states column

      # Skip disabled hosts unless asked to include them.
      if (states ~ /d/ && optShowDisabledNodes == 0) next

      if (optPrintRaw != 0) print

      # Three-argument match() is a gawk extension.
      if (!match(queue_node, /^([^@]+)@([^-]+)-(.*)$/, parts)) {
        print("Invalid queue/host combo: " queue_node) > STDERR
        next
      }
      hostkind = parts[2]
      hostname = hostkind "-" parts[3]

      # Avoid double counting: a host served by several queues has one
      # record per queue, and only the first one may contribute.
      if (hostname in hostname_seen) {
        print("Host already seen: " hostname) > STDERR
        next
      }
      hostname_seen[hostname] = 1

      # Field 3 is resv/used/tot.
      split(core_usage_combo, slots, "/")
      mach_node_count[hostkind] += 1
      mach_slots_resv[hostkind] += slots[1]
      mach_slots_used[hostkind] += slots[2]
      mach_slots_tot[hostkind] += slots[3]
    }

    # Prints the summary table, one row per machine type in sorted order.
    function report_node_stats(    kinds, n, k, mach) {
      # asorti() (gawk extension) stores the sorted machine types in kinds.
      n = asorti(mach_node_count, kinds)
      printf("%-16s %4s %5s %5s %5s %5s\n", "MACHTYPE", "NODE", "CORES", "used", "free", "resv")
      for (k = 1; k <= n; ++k) {
        mach = kinds[k]
        printf("%-16s %4d %5d %5d %5d %5d\n",
               mach, mach_node_count[mach], mach_slots_tot[mach],
               mach_slots_used[mach],
               mach_slots_tot[mach] - mach_slots_used[mach] - mach_slots_resv[mach],
               mach_slots_resv[mach])
      }
    }

    END {
      report_node_stats()
    }
    ' \
    "$@"
}
# File-based variant of node_slot_stats_per_machine_type: instead of
# running qstat, hands its first argument to
# _Process_node_slot_stats_per_machine_type.
# Arguments: $1 - forwarded as the single argument of the processing
#                 function (which gives it to gawk as an input file).
node_slot_stats_per_machine_type_f() {
  local qstat_dump=$1
  _Process_node_slot_stats_per_machine_type "$qstat_dump"
}
# Command-line dispatch.
#   $1  action: "raw", "stats" (also chosen when $1 is empty) or
#       "stats-with-disabled"; each is accepted with a leading "--" too
#   $2  optional; when non-empty it is handed to the file-based (_f)
#       variant of the action, otherwise the variant that runs qstat is used
# Any other action is reported on stderr and the script exits with status 2.
case "$1" in
  --raw | raw)
    if [ -z "$2" ]; then
      node_slot_stats_raw
    else
      node_slot_stats_raw_f "$2"
    fi
    ;;
  --stats | stats | "")
    if [ -z "$2" ]; then
      node_slot_stats_per_machine_type
    else
      node_slot_stats_per_machine_type_f "$2"
    fi
    ;;
  --stats-with-disabled | stats-with-disabled)
    # The assignment prefix sets optShowDisabledNodes for this call only.
    if [ -z "$2" ]; then
      optShowDisabledNodes=1 node_slot_stats_per_machine_type
    else
      optShowDisabledNodes=1 node_slot_stats_per_machine_type_f "$2"
    fi
    ;;
  *)
    echo "Unknown action: $1" >&2
    exit 2
    ;;
esac