#!/usr/bin/env python # # Created: 20160830 # Wirawan Purwanto """ show-node-status.py --------------------- Various tools to investigate node status in an SGE cluster. This tool is a replacement and upgrade of the shell version of the tool `node-slot-status.sh`. Usage: """ import os import re import subprocess import sys class ParseError(RuntimeError): pass class ProgramError(RuntimeError): pass MYSELF = 'show-node-status.py' def node_slot_stats_raw(qstat_f, show_disabled_nodes=False): """Prints the node stats from `qstat -f' in raw format: - not printing disabled nodes - not showing the computational jobs that are running on these nodes """ FNR = 0 for L in qstat_f: FNR += 1 FLDS = L.split() status_flags = FLDS[5] if (len(FLDS) > 5) else "" if FNR == 1 and FLDS[0] == "queuename": print(L) continue # Valid host status field if re.search(r'^[A-Za-z]', L) and len(FLDS) in (5,6) \ and (show_disabled_nodes or ("d" not in status_flags)): print(L) def node_slot_stats(qstat_f, show_disabled_nodes=False): """Prints status of slot availability per machine type (defined as host with the same base hostname (e.g. "c6-", or "c8-"). Originally implemented based on the naming of hosts on Turing cluster. In SGE terminology, "slot" means a CPU core. Example output: MACHTYPE NODE CORES used free resv c6 15 240 77 163 0 c8 40 768 569 199 0 cr 74 1480 988 492 0 crhimem 3 96 0 96 0 crphi 10 200 48 152 0 d430 49 1568 1292 276 0 d730 10 280 10 270 0 (changes depending on what's disabled and the load of the cluster) FIXME: If a machine is covered by more than one queue, this will cause the counts to be overestimated. Must register if a machine has been encountered and not re-account that machine. However this may not be the best approach as queues are overlapping on machines. Since on Turing, the practice is not to further split a machine to multiple queues (i.e. a 32-core node have all the 32 cores assignable to both main and timed-main queues, rather than dedicating 16 for main and 16 for timed-main), we use a particular way to avoid the double-counting: - slots_resv: total number of reserved slots in a node (for whatever the sysadmin designates) -- sum them up - slots_used: total number of slots currently used (i.e., occupied by jobs) -- sum them up - slots_tot: total number of slots in a node -- take the maximum value encountered. Had the nodes split-dedicated to a particular queue, we have to take the sum of the values instead. """ from pprint import pprint host_stats = collect_host_stats(qstat_f, show_disabled_nodes) print_host_stats(host_stats) def node_slot_stats_per_machine_type(qstat_f, show_disabled_nodes=False): """Prints status of slot availability per machine type (defined as host with the same base hostname (e.g. "c6-", or "c8-"). Originally implemented based on the naming of hosts on Turing cluster. In SGE terminology, "slot" means a CPU core. Example output: MACHTYPE NODE CORES used free resv c6 15 240 77 163 0 c8 40 768 569 199 0 cr 74 1480 988 492 0 crhimem 3 96 0 96 0 crphi 10 200 48 152 0 d430 49 1568 1292 276 0 d730 10 280 10 270 0 (changes depending on what's disabled and the load of the cluster) FIXME: If a machine is covered by more than one queue, this will cause the counts to be overestimated. Must register if a machine has been encountered and not re-account that machine. However this may not be the best approach as queues are overlapping on machines. Since on Turing, the practice is not to further split a machine to multiple queues (i.e. a 32-core node have all the 32 cores assignable to both main and timed-main queues, rather than dedicating 16 for main and 16 for timed-main), we use a particular way to avoid the double-counting: - slots_resv: total number of reserved slots in a node (for whatever the sysadmin designates) -- sum them up - slots_used: total number of slots currently used (i.e., occupied by jobs) -- sum them up - slots_tot: total number of slots in a node -- take the maximum value encountered. Had the nodes split-dedicated to a particular queue, we have to take the sum of the values instead. """ from pprint import pprint host_stats = collect_host_stats(qstat_f, show_disabled_nodes) #pprint(host_stats) hosttype_stats = summarize_hosttype_stats(host_stats) #pprint(hosttype_stats) print_hosttype_stats(hosttype_stats) def collect_host_stats(qstat_f, show_disabled_nodes=None): """Internal routine to collect node stats from `qstat -f` by combining node status that were printed for each `queue@hostname` combinations. The result is a dict with hostname as the key.""" host_stats = {} #host_list_by_kinds = {} def host_get_stats_rec(hostname): if hostname not in host_stats: s = { 'slots_resv': 0, 'slots_used': 0, 'slots_tot': 0, 'queues': [], } host_stats[hostname] = s return host_stats[hostname] FNR = 0 for L in qstat_f: FNR += 1 FLDS = L.split() status_flags = FLDS[5] if (len(FLDS) > 5) else "" if FNR == 1 and FLDS[0] == "queuename": continue # Valid host status field if re.search(r'^[A-Za-z]', L) and len(FLDS) in (5,6): # This line has a format like this: # main@c8-014.cm.cluster BIP 0/10/16 9.98 linux-x64 d # ^ queue & node name queue_node, queue_type, core_usage_combo, node_load, os_arch \ = tuple(FLDS[0:5]) try: node_load = float(node_load) except ValueError: node_load = 0 # status flags, see above # skip disabled hosts if ("d" in status_flags) and not show_disabled_nodes: continue #if (optPrintRaw != 0) print($0) # Extract more useful info m = re.search(r'^([^@]+)@([^-]+)-([^.]*)((?:\..*)?)$', queue_node) if not m: raise ParseError, \ "Invalid queue/host combo on line %D: %s" % (FNR, queue_node) queue, hostkind, hostnum, hostdomain = m.groups() hostname = hostkind + "-" + hostnum slots_resv, slots_used, slots_tot = map(int, core_usage_combo.split("/")) hoststat = host_get_stats_rec(hostname) hoststat['slots_resv'] += slots_resv hoststat['slots_used'] += slots_used # FIXME assume same across queues; fix if not correct: hoststat['slots_tot'] = max(hoststat['slots_tot'], slots_tot) hoststat['os_arch'] = os_arch # FIXME we assume all of same queue type; fix if not correct: hoststat['queue_type'] = queue_type hoststat['queues'].append(queue) # FIXME we assume all have same load; fix if not correct: hoststat['node_load'] = node_load hoststat['status_flags'] = status_flags #return host_list_by_kinds, host_stats return host_stats def node_load_ratio(node_load, slots_used): """Ratio of node load vs slots claimed to be used. This should be close to one if the job uses the CPUs efficiently, or near zero if most jobs are interactive (i.e. lots of idling).""" return node_load / slots_used if slots_used != 0 \ else 0.0 if node_load < 0.75 \ else float('nan') def print_host_stats(host_stats): """Prints the per-host statistics gathered by `collect_host_stats`. """ hostnames = sorted(host_stats.keys()) print("%-16s %5s %5s %5s %5s %7s %9s" \ % ("HOST", "CORES", "used", "free", "resv", "load", "load/used")) for h in hostnames: hs = host_stats[h] print("%-16s %5d %5d %5d %5d %7.2f %9.3f%s" \ % (h, hs['slots_tot'], hs['slots_used'], hs['slots_tot'] - hs['slots_used'] - hs['slots_resv'], hs['slots_resv'], hs['node_load'], node_load_ratio(hs['node_load'], hs['slots_used']), " "+hs['status_flags'] if hs['status_flags'] else "" ) ) def summarize_hosttype_stats(host_stats): """Further summarize the host stats by the host type (denoted by the prefix of the hostname before the dash character, i.e. "c8" for "c8-003"). """ hosttype_stats = {} def hosttype_get_stats_rec(hosttype): if hosttype not in hosttype_stats: s = { 'hosts': [], } hosttype_stats[hosttype] = s return hosttype_stats[hosttype] for (hosttype, hostname) in [ (h.split('-')[0], h) for h in host_stats.keys() ]: #print(hosttype, hostname) hts = hosttype_get_stats_rec(hosttype) hts['hosts'].append(hostname) for hts in hosttype_stats.values(): #print hts hts['host_count'] = len(hts['hosts']) hts['slots_resv'] = sum(host_stats[h]['slots_resv'] for h in hts['hosts']) hts['slots_tot'] = sum(host_stats[h]['slots_tot'] for h in hts['hosts']) hts['slots_used'] = sum(host_stats[h]['slots_used'] for h in hts['hosts']) hts['node_load'] = sum(host_stats[h]['node_load'] for h in hts['hosts']) hts['os_arch'] = host_stats[hts['hosts'][0]]['slots_used'] return hosttype_stats def print_hosttype_stats(hosttype_stats): hosttypes = sorted(hosttype_stats.keys()) print("%-16s %5s %5s %5s %5s %5s %7s %9s" \ % ("MACHTYPE", "NODES", "CORES", "used", "free", "resv", "load", "load/used")) for ht in hosttypes: hts = hosttype_stats[ht] print("%-16s %5d %5d %5d %5d %5d %7.2f %9.3f" \ % (ht, hts['host_count'], hts['slots_tot'], hts['slots_used'], hts['slots_tot'] - hts['slots_used'] - hts['slots_resv'], hts['slots_resv'], hts['node_load'], hts['node_load'] / hts['slots_used'] if hts['slots_used'] != 0 else 0.0 if hts['node_load'] < 0.75 else float('nan') ) ) def help(): msg = """\ %(CMD)s - Shows node status from SGE information The information is mainly drawn from `qstat -f` output. Usage: one of the following: %(CMD)s raw [qstat_file] [--save] [--show-disabled-nodes] Shows the raw queue/node status %(CMD)s hoststats [qstat_file] [--save] [--show-disabled-nodes] Shows the present statistics for every node %(CMD)s %(CMD)s stats [qstat_file] [--save] [--show-disabled-nodes] Shows the statistic summary per node type """ \ % dict(CMD=MYSELF) print(msg) def main_default(argv): """Main default function: - By default we invoke qstat -f and prints the analysis. - If argv[1] is given, then we read in the file and use that for the analysis. """ from time import localtime, strftime from getopt import getopt, GetoptError dtime = localtime() dtimestr = strftime("%Y%m%d-%H%M", dtime) # Read the command first--what do we want to do if len(argv) < 2: cmd = "stats" elif argv[1] in ('--raw', 'raw'): cmd = "raw" elif argv[1] in ('--stats', 'stats', 'stat'): cmd = "stats" # old stats, a.k.a. hosttype_stats elif re.search(r'^(--)?host-?stat', argv[1]): cmd = "hoststats" elif argv[1] in ('--help', 'help', '-h'): help() return 0 else: raise ValueError, "Unknown action: "+argv[1] # Skip program name and first command: cmdargs_in = argv[2:] try: cmdopts, cmdargs = getopt(cmdargs_in, "dhs", ["show-disabled-nodes=", "include-disabled-nodes=", "save", "help"]) except GetoptError as err: sys.stderr.writelines([str(err), "\n"]) return 2 # Process flag argument show_disabled_nodes = False save_qstat = False for o,a in cmdopts: if o in ('-h', '--help'): help() return 0 elif o in ('-d',): show_disabled_nodes = True elif o in ('--show-disabled-nodes', '--include-disabled-nodes'): show_disabled_nodes = parse_int_or_bool(a) elif o in ('-s', '--save'): save_qstat = True else: raise ProgramError, "Unhandled option in main program: %s %s" % (o,a) if len(cmdargs) > 0: qstat_f_current = open(cmdargs[0], "r").read().splitlines() else: qstat_f_current = pipe_out(('qstat', '-f'), split=True) if save_qstat: with open("qstat-f-%s.txt" % dtimestr, "w") as F: F.write("\n".join(qstat_f_current)) F.write("\n") if cmd == "raw": node_slot_stats_raw(qstat_f_current, show_disabled_nodes=show_disabled_nodes, ) elif cmd == "hoststats": node_slot_stats(qstat_f_current, show_disabled_nodes=show_disabled_nodes, ) elif cmd == "stats": node_slot_stats_per_machine_type(qstat_f_current, show_disabled_nodes=show_disabled_nodes, ) else: raise ProgramError, "Missing support for command: "+cmd return 0 # --------------------------------------------------------------------------- # Support tools below # --------------------------------------------------------------------------- def pipe_out(args, split=False, shell=False): """Executes a shell command, piping out the stdout to python for parsing. This is my customary shortcut for backtick operator. The result is either a single string (if split==False) or a list of strings with EOLs removed (if split==True).""" retval = subprocess.Popen(args, stdout=subprocess.PIPE, shell=shell).communicate()[0] if not split: return retval else: return retval.splitlines() # Internal variable: don't mess! _str_fmt_heading_rx = None def str_fmt_heading(fmt): """Replaces a printf-style formatting with one suitable for table heading: all non-string conversions are replaced with string conversions, preserving the minimum widths.""" # Originally from: $PWQMC77/scripts/cost.py and later Cr2_analysis_cbs.py . # #_str_fmt_heading_rx = None # only for development purposes import re global _str_fmt_heading_rx if _str_fmt_heading_rx is None: # Because of complicated regex, I verbosely write it out here: _str_fmt_heading_rx = re.compile(r""" ( % # % sign (?:\([^)]+\))? # optional '(keyname)' mapping key [-+#0 hlL]* # optional conversion flag [0-9*]* # optional minimum field width ) ((?:\.[0-9]*)?) # optional precision [^-+#*0 hlL0-9.%s] # not conv flag, dimensions, nor literal '%', # nor 's' conversion specifiers """, re.VERBOSE) return _str_fmt_heading_rx.sub(r'\1s', fmt) def parse_int_or_bool(S): if isinstance(S, basestring): S = S.strip().lower() try: return int(S) except ValueError: if S in ('true', 't', 'yes', 'y', 'on'): return True elif S in ('false', 'f', 'no', 'n', 'off', '-', ''): return False else: raise ValueError, "Don't understand '%s' for boolean value" % S else: return S def parse_bool(S): if isinstance(S, basestring): S = S.strip().lower() if S in ('true', 't', 'yes', 'y', 'on', '1'): return True elif S in ('false', 'f', 'no', 'n', 'off', '0', '-', ''): return False else: raise ValueError, "Don't understand '%s' for boolean value" % S else: return S # stub main code if __name__ == "__main__" and not "get_ipython" in globals(): main_default(sys.argv)