From f06803ba6c47f7151722a0328528d3242210a31e Mon Sep 17 00:00:00 2001
From: Wirawan Purwanto
Date: Wed, 14 Sep 2016 10:16:35 -0400
Subject: [PATCH] * show-node-status.py: A toolbox to analyze node status returned by SGE.

---
Reviewer notes (ignored when the patch is applied):

 * The line breaks and indentation of this patch were restored by hand.
   Blank context lines and the indentation of unchanged context lines are
   a best-effort reconstruction; check them against the target file before
   applying.  The blob id on the "index" line is carried over unchanged
   from the original posting.
 * collect_host_stats(): the ParseError message used the invalid
   conversion "%D", so building the message raised ValueError instead of
   ParseError; it is now "%d".  An empty first input line no longer
   raises IndexError in the "queuename" header check.
 * summarize_hosttype_stats(): 'os_arch' was copied from the first host's
   'slots_used'; it is now copied from 'os_arch'.
 * Docstrings: fixed wording, noted that the example table omits the
   "load" columns, and added one-line docstrings to print_hosttype_stats(),
   parse_int_or_bool() and parse_bool() (hence 231 insertions, not 229).
 * The code remains Python 2 only ("raise X, msg", basestring), matching
   the unchanged parts of the file.

 sge/show-node-status.py | 238 ++++++++++++++++++++++++++++++++++++++--
 1 file changed, 231 insertions(+), 7 deletions(-)

diff --git a/sge/show-node-status.py b/sge/show-node-status.py
index d8bddd1..775d616 100755
--- a/sge/show-node-status.py
+++ b/sge/show-node-status.py
@@ -20,10 +20,16 @@ import re
 import subprocess
 import sys
 
+class ParseError(RuntimeError):
+    pass
+
+class ProgramError(RuntimeError):
+    pass
+
 #----------------------- UNDER CONSTRUCTION -----------------------
 #Nothing was done yet
 
-def node_slot_stats_raw(qstat_f, show_disabled_nodes=True):
+def node_slot_stats_raw(qstat_f, show_disabled_nodes=False):
     """Prints the node stats from `qstat -f' in raw format:
     - not printing disabled nodes
     - not showing the computational jobs that are running on these nodes
@@ -44,6 +50,180 @@ def node_slot_stats_raw(qstat_f, show_disabled_nodes=True):
         print(L)
 
 
+def node_slot_stats_per_machine_type(qstat_f, show_disabled_nodes=False):
+    """Prints status of slot availability per machine type (defined as
+    hosts with the same base hostname, e.g. "c6-" or "c8-").
+    Originally implemented based on the naming of hosts on Turing cluster.
+    In SGE terminology, "slot" means a CPU core.
+
+    Example output (trailing "load" and "load/used" columns omitted here):
+
+        MACHTYPE         NODES CORES  used  free  resv
+        c6                  15   240    77   163     0
+        c8                  40   768   569   199     0
+        cr                  74  1480   988   492     0
+        crhimem              3    96     0    96     0
+        crphi               10   200    48   152     0
+        d430                49  1568  1292   276     0
+        d730                10   280    10   270     0
+
+    (changes depending on what's disabled and the load of the cluster)
+
+    FIXME: If a machine is covered by more than one queue, this will
+    cause the counts to be overestimated. Must register if a machine has
+    been encountered and not re-account that machine.
+    However this may not be the best approach as queues are overlapping
+    on machines.  Since on Turing, the practice is not to further split a
+    machine to multiple queues (i.e. a 32-core node has all the 32
+    cores assignable to both main and timed-main queues, rather than
+    dedicating 16 for main and 16 for timed-main), we use a particular
+    way to avoid the double-counting:
+
+    - slots_resv: total number of reserved slots in a node (for whatever
+      the sysadmin designates) -- sum them up
+
+    - slots_used: total number of slots currently used (i.e.,
+      occupied by jobs) -- sum them up
+
+    - slots_tot: total number of slots in a node -- take the maximum
+      value encountered.
+      Had the nodes been split-dedicated to a particular queue, we would
+      have to take the sum of the values instead.
+
+    """
+    from pprint import pprint
+    host_stats = collect_host_stats(qstat_f, show_disabled_nodes)
+    #pprint(host_stats)
+    hosttype_stats = summarize_hosttype_stats(host_stats)
+    #pprint(hosttype_stats)
+    print_hosttype_stats(hosttype_stats)
+
+
+def collect_host_stats(qstat_f, show_disabled_nodes=None):
+    """Internal routine to collect node stats from the lines of `qstat -f`
+    by combining the statuses printed for each `queue@hostname` combination.
+    Hosts flagged "d" (disabled) are skipped unless show_disabled_nodes is set.
+    The result is a dict with hostname as the key."""
+
+    host_stats = {}
+    #host_list_by_kinds = {}
+
+    def host_get_stats_rec(hostname):
+        if hostname not in host_stats:
+            s = {
+                'slots_resv': 0,
+                'slots_used': 0,
+                'slots_tot': 0,
+                'queues': [],
+            }
+            host_stats[hostname] = s
+        return host_stats[hostname]
+
+    FNR = 0
+    for L in qstat_f:
+        FNR += 1
+        FLDS = L.split()
+        status_flags = FLDS[5] if (len(FLDS) > 5) else ""
+
+        if FNR == 1 and FLDS[:1] == ["queuename"]:
+            continue
+
+        # Valid host status field
+        if re.search(r'^[A-Za-z]', L) and len(FLDS) in (5,6):
+            # This line has a format like this:
+            #     main@c8-014.cm.cluster BIP 0/10/16 9.98 linux-x64 d
+            #     ^ queue & node name
+            queue_node, queue_type, core_usage_combo, node_load, os_arch \
+                = tuple(FLDS[0:5])
+            try:
+                node_load = float(node_load)
+            except ValueError:
+                node_load = 0
+            # status flags, see above
+
+            # skip disabled hosts
+            if ("d" in status_flags) and not show_disabled_nodes:
+                continue
+
+            #if (optPrintRaw != 0) print($0)
+
+            # Extract more useful info
+
+            m = re.search(r'^([^@]+)@([^-]+)-(.*)$', queue_node)
+            if not m:
+                raise ParseError, \
+                    "Invalid queue/host combo on line %d: %s" % (FNR, queue_node)
+            queue, hostkind, hostnum = m.groups()
+            hostname = hostkind + "-" + hostnum
+
+            slots_resv, slots_used, slots_tot = map(int, core_usage_combo.split("/"))
+
+            hoststat = host_get_stats_rec(hostname)
+            hoststat['slots_resv'] += slots_resv
+            hoststat['slots_used'] += slots_used
+            # FIXME assume same across queues; fix if not correct:
+            hoststat['slots_tot'] = max(hoststat['slots_tot'], slots_tot)
+            hoststat['os_arch'] = os_arch
+            # FIXME we assume all of same queue type; fix if not correct:
+            hoststat['queue_type'] = queue_type
+            hoststat['queues'].append(queue)
+            # FIXME we assume all have same load; fix if not correct:
+            hoststat['node_load'] = node_load
+
+    #return host_list_by_kinds, host_stats
+    return host_stats
+
+
+def summarize_hosttype_stats(host_stats):
+    """Further summarize the host stats by the host type (denoted by the
+    prefix of the hostname before the dash character, i.e. "c8" for
+    "c8-003").
+    """
+    hosttype_stats = {}
+
+    def hosttype_get_stats_rec(hosttype):
+        if hosttype not in hosttype_stats:
+            s = {
+                'hosts': [],
+            }
+            hosttype_stats[hosttype] = s
+        return hosttype_stats[hosttype]
+
+    for (hosttype, hostname) in [ (h.split('-')[0], h) for h in host_stats.keys() ]:
+        #print(hosttype, hostname)
+        hts = hosttype_get_stats_rec(hosttype)
+        hts['hosts'].append(hostname)
+
+    for hts in hosttype_stats.values():
+        #print hts
+        hts['host_count'] = len(hts['hosts'])
+        hts['slots_resv'] = sum(host_stats[h]['slots_resv'] for h in hts['hosts'])
+        hts['slots_tot'] = sum(host_stats[h]['slots_tot'] for h in hts['hosts'])
+        hts['slots_used'] = sum(host_stats[h]['slots_used'] for h in hts['hosts'])
+        hts['node_load'] = sum(host_stats[h]['node_load'] for h in hts['hosts'])
+        hts['os_arch'] = host_stats[hts['hosts'][0]]['os_arch']
+
+    return hosttype_stats
+
+
+def print_hosttype_stats(hosttype_stats):
+    """Print a header, then one summary row per host type, sorted by name."""
+    print("%-16s %5s %5s %5s %5s %5s %7s %9s" \
+          % ("MACHTYPE", "NODES", "CORES", "used", "free", "resv", "load", "load/used"))
+    for ht in sorted(hosttype_stats.keys()):
+        hts = hosttype_stats[ht]
+        print("%-16s %5d %5d %5d %5d %5d %7.2f %9.3f" \
+              % (ht, hts['host_count'],
+                 hts['slots_tot'],
+                 hts['slots_used'],
+                 hts['slots_tot'] - hts['slots_used'] - hts['slots_resv'],
+                 hts['slots_resv'],
+                 hts['node_load'],
+                 hts['node_load'] / hts['slots_used'] if hts['slots_used'] != 0
+                     else 0.0 if hts['node_load'] < 0.75 else float('nan')
+                )
+             )
+
 
 
 def help():
@@ -65,13 +245,14 @@ stats
 """
 
 
-def main_default(argv, save_qstat=True):
+def main_default(argv, save_qstat=None):
     """Main default function:
     - By default we invoke qstat -f and prints the analysis.
     - If argv[1] is given, then we read in the file and use that for the
       analysis.
     """
     from time import localtime, strftime
+    from getopt import getopt
 
     dtime = localtime()
     dtimestr = strftime("%Y%m%d-%H%M", dtime)
@@ -87,10 +268,23 @@ def main_default(argv, save_qstat=True):
         raise ValueError, "Unknown action: "+argv[1]
 
     # Skip program name and first command:
-    cmdargs = argv[2:]
+    cmdargs_in = argv[2:]
+    cmdopts, cmdargs = getopt(cmdargs_in,
+                              "ds",
+                              ["show-disabled-nodes=",
+                               "include-disabled-nodes=",
+                               "save",
+                              ])
 
     # Default options
     show_disabled_nodes = False
+    for o,a in cmdopts:
+        if o in ('-d',):
+            show_disabled_nodes = True
+        elif o in ('--show-disabled-nodes', '--include-disabled-nodes'):
+            show_disabled_nodes = parse_int_or_bool(a)
+        elif o in ('-s', '--save'):
+            save_qstat = True
 
     if len(cmdargs) > 0:
         qstat_f_current = open(cmdargs[0], "r").read().splitlines()
@@ -106,11 +300,11 @@ def main_default(argv, save_qstat=True):
                             show_disabled_nodes=show_disabled_nodes,
                            )
     elif cmd == "stats":
-        node_slots_stats_per_node_type(qstat_f_current,
-                                       show_disabled_nodes=show_disabled_nodes,
-                                      )
+        node_slot_stats_per_machine_type(qstat_f_current,
+                                         show_disabled_nodes=show_disabled_nodes,
+                                        )
     else:
-        raise "Missing support for command: "+cmd
+        raise ProgramError, "Missing support for command: "+cmd
 
 
 
@@ -157,6 +351,36 @@ def str_fmt_heading(fmt):
     return _str_fmt_heading_rx.sub(r'\1s', fmt)
 
 
+def parse_int_or_bool(S):
+    """Parse string S as an int or a boolean word; pass non-strings through."""
+    if isinstance(S, basestring):
+        S = S.strip().lower()
+        try:
+            return int(S)
+        except ValueError:
+            if S in ('true', 't', 'yes', 'y', 'on'):
+                return True
+            elif S in ('false', 'f', 'no', 'n', 'off', '-', ''):
+                return False
+            else:
+                raise ValueError, "Don't understand '%s' for boolean value" % S
+    else:
+        return S
+
+
+def parse_bool(S):
+    """Parse string S as a boolean word; pass non-strings through."""
+    if isinstance(S, basestring):
+        S = S.strip().lower()
+        if S in ('true', 't', 'yes', 'y', 'on', '1'):
+            return True
+        elif S in ('false', 'f', 'no', 'n', 'off', '0', '-', ''):
+            return False
+        else:
+            raise ValueError, "Don't understand '%s' for boolean value" % S
+    else:
+        return S
+
 
 
 # stub main code