#!/usr/bin/env python
#
# Created: 20160830
# Wirawan Purwanto

"""
show-node-status.py
---------------------

Various tools to investigate node status in an SGE cluster.
This tool is a replacement and upgrade of the shell version of the tool
`node-slot-status.sh`.

Usage:

    show-node-status.py [raw|stats|help] [options] [qstat_file]

Run `show-node-status.py help` for the details.
The code is written to run on both Python 2.7 and Python 3.
"""

import os
import re
import subprocess
import sys


class ParseError(RuntimeError):
    """Raised when a line of `qstat -f` output cannot be interpreted."""
    pass


class ProgramError(RuntimeError):
    """Raised on an internal inconsistency of this program."""
    pass


MYSELF = 'show-node-status.py'

# `basestring` only exists on Python 2; fall back to `str` on Python 3.
try:
    _string_types = (basestring,)
except NameError:
    _string_types = (str,)

# A queue/host status line starts with a letter (job lines are indented,
# separator lines start with a dash).
_HOST_LINE_RX = re.compile(r'^[A-Za-z]')

# Splits "queue@kind-num.domain" into (queue, kind, num, .domain).
_QUEUE_HOST_RX = re.compile(r'^([^@]+)@([^-]+)-([^.]*)((?:\..*)?)$')


def _is_header_line(lineno, fields):
    """Tells whether this is the `queuename ...` heading on the first line."""
    # Slicing (rather than indexing) keeps a blank first line from raising
    # IndexError.
    return lineno == 1 and fields[:1] == ["queuename"]


def _is_host_status_line(line, fields):
    """Tells whether a line is a `queue@host` status record.

    Such a record has 5 fields, or 6 when the status flags are present.
    """
    return bool(_HOST_LINE_RX.search(line)) and len(fields) in (5, 6)


def _status_flags(fields):
    """Returns the status-flag field (6th column), or "" if it is absent."""
    return fields[5] if len(fields) > 5 else ""


def node_slot_stats_raw(qstat_f, show_disabled_nodes=False):
    """Prints the node stats from `qstat -f' in raw format:

    - not printing disabled nodes (unless `show_disabled_nodes` is true)
    - not showing the computational jobs that are running on these nodes

    Arguments:
    - qstat_f: iterable of text lines (without EOL) from `qstat -f`.
    - show_disabled_nodes: if true, lines whose status flags contain "d"
      are printed as well.
    """
    for lineno, line in enumerate(qstat_f, 1):
        fields = line.split()
        if _is_header_line(lineno, fields):
            print(line)
            continue

        # Valid host status field
        if not _is_host_status_line(line, fields):
            continue
        if show_disabled_nodes or "d" not in _status_flags(fields):
            print(line)


def node_slot_stats_per_machine_type(qstat_f, show_disabled_nodes=False):
    """Prints status of slot availability per machine type
    (defined as host with the same base hostname (e.g. "c6-", or "c8-").
    Originally implemented based on the naming of hosts on Turing cluster.
    In SGE terminology, "slot" means a CPU core.

    Example output (the actual table also carries the `load` and
    `load/used` columns):

        MACHTYPE    NODE CORES  used  free  resv
        c6            15   240    77   163     0
        c8            40   768   569   199     0
        cr            74  1480   988   492     0
        crhimem        3    96     0    96     0
        crphi         10   200    48   152     0
        d430          49  1568  1292   276     0
        d730          10   280    10   270     0

    (changes depending on what's disabled and the load of the cluster)

    FIXME: If a machine is covered by more than one queue, this will cause
    the counts to be overestimated.
    Must register if a machine has been encountered and not re-account
    that machine.

    However this may not be the best approach as queues are overlapping
    on machines.
    Since on Turing, the practice is not to further split a machine to
    multiple queues (i.e. a 32-core node has all the 32 cores assignable
    to both main and timed-main queues, rather than dedicating 16 for
    main and 16 for timed-main), we use a particular way to avoid
    the double-counting:

    - slots_resv: total number of reserved slots in a node (for whatever
      the sysadmin designates) -- sum them up

    - slots_used: total number of slots currently used (i.e., occupied by
      jobs) -- sum them up

    - slots_tot: total number of slots in a node -- take the maximum value
      encountered.

    Had the nodes split-dedicated to a particular queue, we have to
    take the sum of the values instead.
    """
    host_stats = collect_host_stats(qstat_f, show_disabled_nodes)
    hosttype_stats = summarize_hosttype_stats(host_stats)
    print_hosttype_stats(hosttype_stats)


def collect_host_stats(qstat_f, show_disabled_nodes=None):
    """Internal routine to collect node stats from `qstat -f` by combining
    node status that were printed for each `queue@hostname` combinations.

    Arguments:
    - qstat_f: iterable of text lines (without EOL) from `qstat -f`.
    - show_disabled_nodes: if false (the default), records whose status
      flags contain "d" are skipped.

    Returns a dict with hostname (e.g. "c8-014", domain stripped) as the
    key.  Each value is a dict with keys `slots_resv`, `slots_used`,
    `slots_tot`, `queues`, `os_arch`, `queue_type`, and `node_load`.

    Raises ParseError if the `queue@host` field does not have the
    expected `queue@kind-number[.domain]` form, and ValueError if the
    slot usage field is not made of three integers.
    """
    host_stats = {}

    for lineno, line in enumerate(qstat_f, 1):
        fields = line.split()
        if _is_header_line(lineno, fields):
            continue

        # Valid host status field
        if not _is_host_status_line(line, fields):
            continue

        # skip disabled hosts
        if "d" in _status_flags(fields) and not show_disabled_nodes:
            continue

        # This line has a format like this:
        #     main@c8-014.cm.cluster   BIP   0/10/16   9.98   linux-x64   d
        #     ^ queue & node name
        queue_node, queue_type, core_usage_combo, node_load, os_arch \
            = fields[:5]

        try:
            node_load = float(node_load)
        except ValueError:
            # A non-numeric load value is counted as zero load.
            node_load = 0.0

        # Extract more useful info
        m = _QUEUE_HOST_RX.search(queue_node)
        if not m:
            raise ParseError(
                "Invalid queue/host combo on line %d: %s"
                % (lineno, queue_node))
        queue, hostkind, hostnum, _hostdomain = m.groups()
        hostname = hostkind + "-" + hostnum

        slots_resv, slots_used, slots_tot \
            = [int(n) for n in core_usage_combo.split("/")]

        if hostname not in host_stats:
            host_stats[hostname] = {
                'slots_resv': 0,
                'slots_used': 0,
                'slots_tot': 0,
                'queues': [],
            }
        hoststat = host_stats[hostname]

        hoststat['slots_resv'] += slots_resv
        hoststat['slots_used'] += slots_used
        # FIXME assume same across queues; fix if not correct:
        hoststat['slots_tot'] = max(hoststat['slots_tot'], slots_tot)
        hoststat['os_arch'] = os_arch
        # FIXME we assume all of same queue type; fix if not correct:
        hoststat['queue_type'] = queue_type
        hoststat['queues'].append(queue)
        # FIXME we assume all have same load; fix if not correct:
        hoststat['node_load'] = node_load

    return host_stats


def summarize_hosttype_stats(host_stats):
    """Further summarize the host stats by the host type (denoted by the
    prefix of the hostname before the dash character, i.e. "c8" for
    "c8-003").

    Arguments:
    - host_stats: per-host dict as returned by `collect_host_stats`.

    Returns a dict keyed by host type.  Each value is a dict with keys
    `hosts` (list of hostnames), `host_count`, the summed `slots_resv`,
    `slots_tot`, `slots_used`, `node_load`, and `os_arch` (taken from the
    first host of that type).
    """
    hosttype_stats = {}

    for hostname in host_stats:
        hosttype = hostname.split('-')[0]
        hts = hosttype_stats.setdefault(hosttype, {'hosts': []})
        hts['hosts'].append(hostname)

    for hts in hosttype_stats.values():
        members = [host_stats[h] for h in hts['hosts']]
        hts['host_count'] = len(members)
        for key in ('slots_resv', 'slots_tot', 'slots_used', 'node_load'):
            hts[key] = sum(rec[key] for rec in members)
        # Representative architecture: that of the first host of this type.
        hts['os_arch'] = members[0]['os_arch']

    return hosttype_stats


def print_hosttype_stats(hosttype_stats):
    """Prints the per-host-type summary table, sorted by host type.

    Arguments:
    - hosttype_stats: dict as returned by `summarize_hosttype_stats`.
    """
    print("%-16s %5s %5s %5s %5s %5s  %7s %9s"
          % ("MACHTYPE", "NODES", "CORES", "used", "free", "resv",
             "load", "load/used"))
    for ht in sorted(hosttype_stats):
        hts = hosttype_stats[ht]
        if hts['slots_used'] != 0:
            load_per_used = float(hts['node_load']) / hts['slots_used']
        elif hts['node_load'] < 0.75:
            # Idle node type with negligible load.
            load_per_used = 0.0
        else:
            # Noticeable load although no slot is in use: ratio undefined.
            load_per_used = float('nan')
        print("%-16s %5d %5d %5d %5d %5d  %7.2f %9.3f"
              % (ht,
                 hts['host_count'],
                 hts['slots_tot'],
                 hts['slots_used'],
                 hts['slots_tot'] - hts['slots_used'] - hts['slots_resv'],
                 hts['slots_resv'],
                 hts['node_load'],
                 load_per_used,
                 ))


def help():
    """Prints the command-line usage of this tool."""
    msg = """\
%(CMD)s - Shows node status from SGE information

The information is mainly drawn from `qstat -f` output.

Usage: one of the following:

    %(CMD)s raw [options] [qstat_file]

Shows the raw queue/node status

    %(CMD)s
    %(CMD)s stats [options] [qstat_file]

Shows the statistic summary per node type

    %(CMD)s help

Shows this help text

Options (must be given before qstat_file):

    -s, --save
        Saves the qstat output to a file named qstat-f-YYYYmmdd-HHMM.txt
    -d
        Includes the disabled nodes
    --show-disabled-nodes=BOOL, --include-disabled-nodes=BOOL
        Includes (1/yes/true) or excludes (0/no/false) the disabled nodes
    -h, --help
        Shows this help text
""" \
    % dict(CMD=MYSELF)
    print(msg)


def main_default(argv):
    """Main default function:
    - By default we invoke qstat -f and prints the analysis.
    - If a file name is given after the action (and its options), then
      we read in the file and use that for the analysis.

    Arguments:
    - argv: full argument list, including the program name at argv[0].

    Returns the process exit code: 0 on success, 2 on a command-line
    option error.  Raises ValueError on an unknown action.
    """
    from time import localtime, strftime
    from getopt import getopt, GetoptError

    dtimestr = strftime("%Y%m%d-%H%M", localtime())

    # Read the command first--what do we want to do
    if len(argv) < 2:
        cmd = "stats"
    elif argv[1] in ('--raw', 'raw'):
        cmd = "raw"
    elif argv[1] in ('--stats', 'stats', 'stat'):
        cmd = "stats"
    elif argv[1] in ('--help', 'help', '-h'):
        help()
        return 0
    else:
        raise ValueError("Unknown action: " + argv[1])

    # Skip program name and first command:
    cmdargs_in = argv[2:]
    try:
        cmdopts, cmdargs = getopt(cmdargs_in,
                                  "dhs",
                                  ["show-disabled-nodes=",
                                   "include-disabled-nodes=",
                                   "save",
                                   "help"])
    except GetoptError as err:
        sys.stderr.write("%s\n" % (err,))
        return 2

    # Process flag argument
    show_disabled_nodes = False
    save_qstat = False
    for o, a in cmdopts:
        if o in ('-h', '--help'):
            help()
            return 0
        elif o in ('-d',):
            show_disabled_nodes = True
        elif o in ('--show-disabled-nodes', '--include-disabled-nodes'):
            show_disabled_nodes = parse_int_or_bool(a)
        elif o in ('-s', '--save'):
            save_qstat = True
        else:
            raise ProgramError(
                "Unhandled option in main program: %s %s" % (o, a))

    if len(cmdargs) > 0:
        with open(cmdargs[0], "r") as F:
            qstat_f_current = F.read().splitlines()
    else:
        qstat_f_current = pipe_out(('qstat', '-f'), split=True)

    if save_qstat:
        with open("qstat-f-%s.txt" % dtimestr, "w") as F:
            F.write("\n".join(qstat_f_current))
            F.write("\n")

    if cmd == "raw":
        node_slot_stats_raw(qstat_f_current,
                            show_disabled_nodes=show_disabled_nodes,
                            )
    elif cmd == "stats":
        node_slot_stats_per_machine_type(qstat_f_current,
                                         show_disabled_nodes=show_disabled_nodes,
                                         )
    else:
        raise ProgramError("Missing support for command: " + cmd)

    return 0


# ---------------------------------------------------------------------------
# Support tools below
# ---------------------------------------------------------------------------

def pipe_out(args, split=False, shell=False):
    """Executes a shell command, piping out the stdout to python for parsing.
    This is my customary shortcut for backtick operator.
    The result is either a single string (if split==False) or a list of
    strings with EOLs removed (if split==True)."""
    # universal_newlines=True makes the output a text string on Python 3
    # as well (it would be `bytes` otherwise, which the parsers above
    # cannot handle).
    proc = subprocess.Popen(args, stdout=subprocess.PIPE, shell=shell,
                            universal_newlines=True)
    output = proc.communicate()[0]
    if split:
        return output.splitlines()
    return output


# Internal variable: don't mess!
_str_fmt_heading_rx = None


def str_fmt_heading(fmt):
    """Replaces a printf-style formatting with one suitable for table heading:
    all non-string conversions are replaced with string conversions,
    preserving the minimum widths."""
    # Originally from: $PWQMC77/scripts/cost.py and later Cr2_analysis_cbs.py .
    global _str_fmt_heading_rx
    if _str_fmt_heading_rx is None:
        # Because of complicated regex, I verbosely write it out here:
        _str_fmt_heading_rx = re.compile(r"""
            (
              %                        # % sign
              (?:\([^)]+\))?           # optional '(keyname)' mapping key
              [-+#0 hlL]*              # optional conversion flag
              [0-9*]*                  # optional minimum field width
            )
            ((?:\.[0-9]*)?)            # optional precision
            [^-+#*0 hlL0-9.%s]         # not conv flag, dimensions, nor literal '%',
                                       # nor 's' conversion specifiers
            """, re.VERBOSE)
    # Keep the flags and width (group 1); drop the precision and swap the
    # conversion character for 's'.
    return _str_fmt_heading_rx.sub(r'\1s', fmt)


def parse_int_or_bool(S):
    """Converts a string to an integer if possible, or else to a boolean.

    Non-string input is returned unchanged.
    Raises ValueError if the string is neither an integer nor one of the
    recognized boolean words.
    """
    if not isinstance(S, _string_types):
        return S
    S = S.strip().lower()
    try:
        return int(S)
    except ValueError:
        pass
    if S in ('true', 't', 'yes', 'y', 'on'):
        return True
    if S in ('false', 'f', 'no', 'n', 'off', '-', ''):
        return False
    raise ValueError("Don't understand '%s' for boolean value" % S)


def parse_bool(S):
    """Converts a string to a boolean.

    Non-string input is returned unchanged.
    Raises ValueError if the string is not one of the recognized boolean
    words.
    """
    if not isinstance(S, _string_types):
        return S
    S = S.strip().lower()
    if S in ('true', 't', 'yes', 'y', 'on', '1'):
        return True
    if S in ('false', 'f', 'no', 'n', 'off', '0', '-', ''):
        return False
    raise ValueError("Don't understand '%s' for boolean value" % S)


# stub main code

if __name__ == "__main__" and "get_ipython" not in globals():
    # Propagate the exit code (e.g. 2 on a bad option) to the shell.
    sys.exit(main_default(sys.argv))