From b6d22cf68b283f5027a8703216f783b4288ce6e3 Mon Sep 17 00:00:00 2001
From: Wirawan Purwanto
Date: Tue, 20 Sep 2016 17:47:24 -0400
Subject: [PATCH] * Added "hoststats" subcommand for summarizing host occupancy statistics irrespective of queue.

---
NOTE(review): the line breaks and indentation of this patch were
reconstructed from a whitespace-collapsed copy, so the whitespace of the
context lines (and inside string literals) is a best guess; try
`git apply --ignore-whitespace` if the hunks do not apply cleanly.
The added lines of the first two hunks were revised in review (line
counts unchanged); the post-image blob id on the "index" line predates
those revisions.

 sge/show-node-status.py | 84 ++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 83 insertions(+), 1 deletion(-)

diff --git a/sge/show-node-status.py b/sge/show-node-status.py
index 18da2a7..ddaa4dc 100755
--- a/sge/show-node-status.py
+++ b/sge/show-node-status.py
@@ -50,6 +50,52 @@ def node_slot_stats_raw(qstat_f, show_disabled_nodes=False):
     print(L)
 
 
+def node_slot_stats(qstat_f, show_disabled_nodes=False):
+  """Prints slot availability for every host, irrespective of queue.
+
+  One row is printed per host; for the summary per machine type (hosts
+  with the same base hostname, e.g. "c6-" or "c8-"), use
+  `node_slot_stats_per_machine_type` instead.
+  In SGE terminology, "slot" means a CPU core.
+
+  Args:
+    qstat_f: handed unchanged to `collect_host_stats` (presumably the
+      `qstat -f` information; confirm against that function).
+    show_disabled_nodes: handed unchanged to `collect_host_stats`.
+
+  Columns printed (see `print_host_stats`): HOST, CORES, used, free,
+  resv, load, load/used.
+  (The rows change depending on what's disabled and the load of the
+  cluster.)
+
+  FIXME: If a machine is covered by more than one queue, this will
+  cause the counts to be overestimated. Must register if a machine has
+  been encountered and not re-account that machine.
+  However this may not be the best approach as queues are overlapping
+  on machines. Since on Turing, the practice is not to further split a
+  machine to multiple queues (i.e. a 32-core node has all the 32
+  cores assignable to both main and timed-main queues, rather than
+  dedicating 16 for main and 16 for timed-main), we use a particular
+  way to avoid the double-counting:
+
+  - slots_resv: total number of reserved slots in a node (for whatever
+    the sysadmin designates) -- sum them up
+
+  - slots_used: total number of slots currently used (i.e.,
+    occupied by jobs) -- sum them up
+
+  - slots_tot: total number of slots in a node -- take the maximum
+    value encountered.
+    Had the nodes been split-dedicated to particular queues, the sum
+    of the values would have to be taken instead.
+
+  NOTE(review): this accounting is described as in the per-machine-type
+  summary; it is expected to be done by `collect_host_stats` -- confirm.
+  """
+  host_stats = collect_host_stats(qstat_f, show_disabled_nodes)
+  print_host_stats(host_stats)
+
+
 def node_slot_stats_per_machine_type(qstat_f, show_disabled_nodes=False):
   """Prints status of slot availability per machine type (defined as
   host with the same base hostname (e.g. "c6-", or "c8-").
@@ -174,6 +220,35 @@ def collect_host_stats(qstat_f, show_disabled_nodes=None):
   return host_stats
 
 
+def node_load_ratio(node_load, slots_used):
+  """Ratio of node load vs slots claimed to be used.
+
+  This should be close to one if the jobs use the CPUs efficiently, or
+  near zero if most jobs are interactive (i.e. lots of idling).
+  With no slots in use, returns 0.0 for load < 0.75 and NaN otherwise.
+  """
+  if slots_used != 0:
+    return node_load / slots_used
+  return 0.0 if node_load < 0.75 else float('nan')
+
+
+def print_host_stats(host_stats):
+  """Prints the per-host statistics gathered by `collect_host_stats`.
+
+  host_stats maps hostname -> dict with keys 'slots_tot', 'slots_used',
+  'slots_resv' and 'node_load'; rows are printed in hostname order.
+  """
+  print("%-16s %5s %5s %5s %5s %7s %9s"
+        % ("HOST", "CORES", "used", "free", "resv", "load", "load/used"))
+  for h in sorted(host_stats):
+    hs = host_stats[h]
+    slots_free = hs['slots_tot'] - hs['slots_used'] - hs['slots_resv']
+    print("%-16s %5d %5d %5d %5d %7.2f %9.3f"
+          % (h, hs['slots_tot'], hs['slots_used'], slots_free,
+             hs['slots_resv'], hs['node_load'],
+             node_load_ratio(hs['node_load'], hs['slots_used'])))
+
+
 def summarize_hosttype_stats(host_stats):
   """Further summarize the host stats by the host type (denoted by the
   prefix of the hostname before the dash character, i.e. "c8" for
@@ -226,6 +301,7 @@ def print_hosttype_stats(hosttype_stats):
 
 
 
+
 def help():
   msg = """\
 %(CMD)s - Shows node status from SGE information
@@ -263,7 +339,9 @@ def main_default(argv):
   elif argv[1] in ('--raw', 'raw'):
     cmd = "raw"
   elif argv[1] in ('--stats', 'stats', 'stat'):
-    cmd = "stats"
+    cmd = "stats"  # old stats, a.k.a. hosttype_stats
+  elif re.search(r'^(--)?host-?stat', argv[1]):
+    cmd = "hoststats"
   elif argv[1] in ('--help', 'help', '-h'):
     help()
     return 0
@@ -311,6 +389,10 @@ def main_default(argv):
     node_slot_stats_raw(qstat_f_current,
                         show_disabled_nodes=show_disabled_nodes,
                         )
+  elif cmd == "hoststats":
+    node_slot_stats(qstat_f_current,
+                    show_disabled_nodes=show_disabled_nodes,
+                    )
   elif cmd == "stats":
     node_slot_stats_per_machine_type(qstat_f_current,
                                      show_disabled_nodes=show_disabled_nodes,