* show-node-status.py: A toolbox to analyze node status returned by SGE.

Wirawan Purwanto 8 years ago
parent acfb11e010
commit f06803ba6c
  1. 236

@ -20,10 +20,16 @@ import re
import subprocess
import sys
class ParseError(RuntimeError):
    """Raised when a line of `qstat -f` output cannot be parsed."""
class ProgramError(RuntimeError):
    """Raised for errors in the program itself, e.g. a command without support."""
#----------------------- UNDER CONSTRUCTION -----------------------
#Nothing was done yet
def node_slot_stats_raw(qstat_f, show_disabled_nodes=True):
def node_slot_stats_raw(qstat_f, show_disabled_nodes=False):
"""Prints the node stats from `qstat -f' in raw format:
- not printing disabled nodes
- not showing the computational jobs that are running on these nodes
@ -44,6 +50,180 @@ def node_slot_stats_raw(qstat_f, show_disabled_nodes=True):
def node_slot_stats_per_machine_type(qstat_f, show_disabled_nodes=False):
    """Print the status of slot availability per machine type.

    A machine type is defined as the set of hosts with the same base
    hostname (e.g. "c6-" or "c8-").
    Originally implemented based on the naming of hosts on the Turing cluster.
    In SGE terminology, "slot" means a CPU core.

    Parameters:
      qstat_f: iterable over the lines of `qstat -f` output.
      show_disabled_nodes: if true, hosts flagged as disabled are included
        in the statistics; by default they are left out.

    Example output (the load-related columns are omitted here; the numbers
    change depending on what's disabled and the load of the cluster):

        MACHTYPE          NODE CORES  used  free  resv
        c6                  15   240    77   163     0
        c8                  40   768   569   199     0
        cr                  74  1480   988   492     0
        crhimem              3    96     0    96     0
        crphi               10   200    48   152     0
        d430                49  1568  1292   276     0
        d730                10   280    10   270     0

    FIXME: If a machine is covered by more than one queue, this will
    cause the counts to be overestimated. We must register whether a machine
    has been encountered and not re-account that machine.
    However, this may not be the best approach, as queues overlap on
    machines. Since on Turing the practice is not to further split a
    machine among multiple queues (i.e. a 32-core node has all 32 cores
    assignable to both the main and timed-main queues, rather than
    dedicating 16 to main and 16 to timed-main), we use a particular
    way to avoid the double-counting:

    - slots_resv: total number of reserved slots in a node (for whatever
      the sysadmin designates) -- sum them up
    - slots_used: total number of slots currently used (i.e.,
      occupied by jobs) -- sum them up
    - slots_tot: total number of slots in a node -- take the maximum
      value encountered.

    Had the nodes been split-dedicated to particular queues, we would have
    to take the sum of the values instead.
    """
    host_stats = collect_host_stats(qstat_f, show_disabled_nodes)
    hosttype_stats = summarize_hosttype_stats(host_stats)
    # The summary was computed but never emitted; print it as documented.
    print_hosttype_stats(hosttype_stats)
def collect_host_stats(qstat_f, show_disabled_nodes=None):
    """Internal routine to collect node stats from `qstat -f`.

    The node status lines printed for each `queue@hostname` are combined
    into one record per host.

    Parameters:
      qstat_f: iterable over the lines of `qstat -f` output.
      show_disabled_nodes: if true, hosts whose status flags contain "d"
        are included; otherwise they are skipped.

    Returns:
      A dict keyed by hostname. Each value is a dict with the keys
      'slots_resv' and 'slots_used' (summed over the queues of the host),
      'slots_tot' (maximum over the queues), 'queues' (queue names seen
      for the host), 'os_arch', 'queue_type' and 'node_load' (taken from
      the last line seen for the host).

    Raises:
      ParseError: if a `queue@hostname` field does not have the form
        `queue@kind-number`.
      ValueError: if the slot usage field is not three integers
        separated by "/".
    """
    host_stats = {}

    for lineno, line in enumerate(qstat_f, 1):
        fields = line.split()

        # Skip the column-title line; the `fields` guard keeps an empty
        # first line from raising IndexError.
        if lineno == 1 and fields and fields[0] == "queuename":
            continue

        # A valid host status line has a format like this:
        #   main@c8-014.cm.cluster BIP 0/10/16 9.98 linux-x64 d
        #   ^ queue & node name
        # i.e. it starts with a letter and has five fields plus an
        # optional sixth field holding the status flags.
        if not (re.search(r'^[A-Za-z]', line) and len(fields) in (5, 6)):
            continue

        queue_node, queue_type, core_usage_combo, node_load, os_arch \
            = tuple(fields[0:5])
        status_flags = fields[5] if len(fields) > 5 else ""

        try:
            node_load = float(node_load)
        except ValueError:
            # A non-numeric load field is counted as zero load.
            node_load = 0

        # Skip disabled hosts unless asked to include them.
        if ("d" in status_flags) and not show_disabled_nodes:
            continue

        # Extract more useful info
        m = re.search(r'^([^@]+)@([^-]+)-(.*)$', queue_node)
        if not m:
            raise ParseError(
                "Invalid queue/host combo on line %d: %s" % (lineno, queue_node))
        queue, hostkind, hostnum = m.groups()
        hostname = hostkind + "-" + hostnum
        slots_resv, slots_used, slots_tot = map(int, core_usage_combo.split("/"))

        hoststat = host_stats.setdefault(hostname, {
            'slots_resv': 0,
            'slots_used': 0,
            'slots_tot': 0,
            'queues': [],
        })
        hoststat['queues'].append(queue)
        hoststat['slots_resv'] += slots_resv
        hoststat['slots_used'] += slots_used
        # FIXME assume same across queues; fix if not correct:
        hoststat['slots_tot'] = max(hoststat['slots_tot'], slots_tot)
        hoststat['os_arch'] = os_arch
        # FIXME we assume all of same queue type; fix if not correct:
        hoststat['queue_type'] = queue_type
        # FIXME we assume all have same load; fix if not correct:
        hoststat['node_load'] = node_load

    return host_stats
def summarize_hosttype_stats(host_stats):
    """Further summarize the host stats by host type.

    The host type is the prefix of the hostname before the first dash
    character, e.g. "c8" for a host named "c8-014.cm.cluster".

    Parameters:
      host_stats: dict keyed by hostname whose values are dicts holding
        'slots_resv', 'slots_used', 'slots_tot', 'node_load' and 'os_arch'.

    Returns:
      A dict keyed by host type. Each value is a dict with the keys
      'hosts' (hostnames of that type), 'host_count', the per-type sums
      'slots_resv', 'slots_tot', 'slots_used' and 'node_load', and
      'os_arch' (taken from the first host of the type).
    """
    hosttype_stats = {}

    # Group the hostnames by their type prefix.
    for hostname in host_stats:
        hosttype = hostname.split('-')[0]
        hosttype_stats.setdefault(hosttype, {'hosts': []})['hosts'].append(hostname)

    for hts in hosttype_stats.values():
        members = [host_stats[h] for h in hts['hosts']]
        hts['host_count'] = len(members)
        for key in ('slots_resv', 'slots_tot', 'slots_used', 'node_load'):
            hts[key] = sum(rec[key] for rec in members)
        # Every group has at least one member, so indexing [0] is safe.
        hts['os_arch'] = members[0]['os_arch']

    return hosttype_stats
def print_hosttype_stats(hosttype_stats):
    """Print a table of slot statistics, one row per host type.

    Parameters:
      hosttype_stats: dict keyed by host type whose values are dicts
        holding 'host_count', 'slots_tot', 'slots_used', 'slots_resv'
        and 'node_load'.

    The rows are sorted by host type. The "free" column is the total
    slot count minus the used and reserved slots. The "load/used" column
    is the load divided by the number of used slots; when no slot is in
    use it is 0 for a load below 0.75 and NaN otherwise.
    """
    print("%-16s %5s %5s %5s %5s %5s %7s %9s"
          % ("MACHTYPE", "NODES", "CORES", "used", "free", "resv", "load", "load/used"))
    for ht in sorted(hosttype_stats):
        hts = hosttype_stats[ht]
        slots_tot = hts['slots_tot']
        slots_used = hts['slots_used']
        slots_resv = hts['slots_resv']
        node_load = hts['node_load']

        if slots_used != 0:
            # float() avoids integer division when the load is an int.
            load_per_used = float(node_load) / slots_used
        elif node_load < 0.75:
            load_per_used = 0.0
        else:
            # There is load but no slot in use: the ratio is undefined.
            load_per_used = float('nan')

        print("%-16s %5d %5d %5d %5d %5d %7.2f %9.3f"
              % (ht, hts['host_count'],
                 slots_tot,
                 slots_used,
                 slots_tot - slots_used - slots_resv,
                 slots_resv,
                 node_load,
                 load_per_used))
def help():
@ -65,13 +245,14 @@ stats
def main_default(argv, save_qstat=True):
def main_default(argv, save_qstat=None):
"""Main default function:
- By default we invoke qstat -f and prints the analysis.
- If argv[1] is given, then we read in the file and
use that for the analysis.
from time import localtime, strftime
from getopt import getopt
dtime = localtime()
dtimestr = strftime("%Y%m%d-%H%M", dtime)
@ -87,10 +268,23 @@ def main_default(argv, save_qstat=True):
raise ValueError, "Unknown action: "+argv[1]
# Skip program name and first command:
cmdargs = argv[2:]
cmdargs_in = argv[2:]
cmdopts, cmdargs = getopt(cmdargs_in,
# Default options
show_disabled_nodes = False
for o,a in cmdopts:
if o in ('-d',):
show_disabled_nodes = True
elif o in ('--show-disabled-nodes', '--include-disabled-nodes'):
show_disabled_nodes = parse_int_or_bool(a)
elif o in ('-s', '--save'):
save_qstat = True
if len(cmdargs) > 0:
qstat_f_current = open(cmdargs[0], "r").read().splitlines()
@ -106,11 +300,11 @@ def main_default(argv, save_qstat=True):
elif cmd == "stats":
raise "Missing support for command: "+cmd
raise ProgramError, "Missing support for command: "+cmd
@ -157,6 +351,34 @@ def str_fmt_heading(fmt):
return _str_fmt_heading_rx.sub(r'\1s', fmt)
def parse_int_or_bool(S):
    """Convert a string to an integer or a boolean value.

    A string is stripped and lowercased first. If it reads as an integer,
    that integer is returned. Otherwise 'true', 't', 'yes', 'y' and 'on'
    give True, while 'false', 'f', 'no', 'n', 'off', '-' and the empty
    string give False.

    A non-string argument is returned unchanged.

    Raises:
      ValueError: if the string is neither an integer nor one of the
        recognized boolean words.
    """
    try:
        string_types = basestring  # Python 2
    except NameError:
        string_types = str  # Python 3

    if not isinstance(S, string_types):
        return S

    word = S.strip().lower()
    try:
        return int(word)
    except ValueError:
        # Not an integer; fall through to the boolean words.
        pass

    if word in ('true', 't', 'yes', 'y', 'on'):
        return True
    if word in ('false', 'f', 'no', 'n', 'off', '-', ''):
        return False
    raise ValueError("Don't understand '%s' for boolean value" % word)
def parse_bool(S):
    """Convert a string to a boolean value.

    A string is stripped and lowercased first. 'true', 't', 'yes', 'y',
    'on' and '1' give True, while 'false', 'f', 'no', 'n', 'off', '0',
    '-' and the empty string give False.

    A non-string argument is returned unchanged.

    Raises:
      ValueError: if the string is not one of the recognized words.
    """
    try:
        string_types = basestring  # Python 2
    except NameError:
        string_types = str  # Python 3

    if not isinstance(S, string_types):
        return S

    word = S.strip().lower()
    if word in ('true', 't', 'yes', 'y', 'on', '1'):
        return True
    if word in ('false', 'f', 'no', 'n', 'off', '0', '-', ''):
        return False
    raise ValueError("Don't understand '%s' for boolean value" % word)
# stub main code