* Added "hoststats" subcommand for summarizing host occupancy statistics irrespective of queue.
Wirawan Purwanto 8 years ago
parent ebdc93e80f
commit b6d22cf68b
  1. 84

@ -50,6 +50,52 @@ def node_slot_stats_raw(qstat_f, show_disabled_nodes=False):
def node_slot_stats(qstat_f, show_disabled_nodes=False):
"""Prints status of slot availability per machine type (defined as
host with the same base hostname, e.g. "c6-" or "c8-").
Originally implemented based on the naming of hosts on Turing cluster.
In SGE terminology, "slot" means a CPU core.
Example output:
MACHTYPE NODE CORES used free resv
c6 15 240 77 163 0
c8 40 768 569 199 0
cr 74 1480 988 492 0
crhimem 3 96 0 96 0
crphi 10 200 48 152 0
d430 49 1568 1292 276 0
d730 10 280 10 270 0
(changes depending on what's disabled and the load of the cluster)
FIXME: If a machine is covered by more than one queue, this will
cause the counts to be overestimated. Must register if a machine has
been encountered and not re-account that machine.
However this may not be the best approach as queues are overlapping
on machines. Since on Turing, the practice is not to further split a
machine to multiple queues (i.e. a 32-core node has all the 32
cores assignable to both main and timed-main queues, rather than
dedicating 16 for main and 16 for timed-main), we use a particular
way to avoid the double-counting:
- slots_resv: total number of reserved slots in a node (for whatever
the sysadmin designates) -- sum them up
- slots_used: total number of slots currently used (i.e.,
occupied by jobs) -- sum them up
- slots_tot: total number of slots in a node -- take the maximum
value encountered.
Had the nodes split-dedicated to a particular queue, we have to
take the sum of the values instead.
from pprint import pprint
host_stats = collect_host_stats(qstat_f, show_disabled_nodes)
def node_slot_stats_per_machine_type(qstat_f, show_disabled_nodes=False):
"""Prints status of slot availability per machine type (defined as
host with the same base hostname, e.g. "c6-" or "c8-").
@ -174,6 +220,35 @@ def collect_host_stats(qstat_f, show_disabled_nodes=None):
return host_stats
def node_load_ratio(node_load, slots_used):
    """Ratio of node load vs slots claimed to be used.

    This should be close to one if the jobs use the CPUs efficiently, or
    near zero if most jobs are interactive (i.e. lots of idling).

    Arguments:
    node_load  -- numeric load value of the node.
    slots_used -- number of slots currently claimed as used on the node.

    Returns:
    - node_load / slots_used (as a float) when slots_used is nonzero;
    - 0.0 when no slot is used and node_load is below 0.75;
    - NaN when no slot is used but node_load is 0.75 or higher, since
      no ratio can be formed in that case.
    """
    if slots_used != 0:
        # Coerce to float so that two integer inputs are not subject to
        # truncating integer division on Python 2.
        return float(node_load) / slots_used
    if node_load < 0.75:
        return 0.0
    return float('nan')
def print_host_stats(host_stats):
"""Prints the per-host statistics gathered by `collect_host_stats`.
hostnames = sorted(host_stats.keys())
print("%-16s %5s %5s %5s %5s %7s %9s" \
% ("HOST", "CORES", "used", "free", "resv", "load", "load/used"))
for h in hostnames:
hs = host_stats[h]
print("%-16s %5d %5d %5d %5d %7.2f %9.3f" \
% (h,
hs['slots_tot'] - hs['slots_used'] - hs['slots_resv'],
node_load_ratio(hs['node_load'], hs['slots_used']),
def summarize_hosttype_stats(host_stats):
"""Further summarize the host stats by the host type (denoted by the
prefix of the hostname before the dash character, i.e. "c8" for
@ -226,6 +301,7 @@ def print_hosttype_stats(hosttype_stats):
def help():
msg = """\
%(CMD)s - Shows node status from SGE information
@ -263,7 +339,9 @@ def main_default(argv):
elif argv[1] in ('--raw', 'raw'):
cmd = "raw"
elif argv[1] in ('--stats', 'stats', 'stat'):
cmd = "stats"
cmd = "stats" # old stats, a.k.a. hosttype_stats
elif re.search(r'^(--)?host-?stat', argv[1]):
cmd = "hoststats"
elif argv[1] in ('--help', 'help', '-h'):
return 0
@ -311,6 +389,10 @@ def main_default(argv):
elif cmd == "hoststats":
elif cmd == "stats":