|
|
|
#!/usr/bin/env python
|
|
|
|
#
|
|
|
|
# Created: 20160830
|
|
|
|
# Wirawan Purwanto
|
|
|
|
|
|
|
|
"""
|
|
|
|
show-node-status.py
|
|
|
|
---------------------
|
|
|
|
|
|
|
|
Various tools to investigate node status in an SGE cluster.
|
|
|
|
This tool is a replacement and upgrade of the shell version of the tool
|
|
|
|
`node-slot-status.sh`.
|
|
|
|
|
|
|
|
Usage: run `show-node-status.py help` to list the available actions and options.
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
import os
|
|
|
|
import re
|
|
|
|
import subprocess
|
|
|
|
import sys
|
|
|
|
|
|
|
|
class ParseError(RuntimeError):
    """Raised when a `qstat -f` line cannot be parsed (e.g. an invalid
    queue/host combination)."""
|
|
|
|
|
|
|
|
class ProgramError(RuntimeError):
    """Raised on an internal inconsistency of this program, such as a
    command-line option or action that is accepted but not handled."""
|
|
|
|
|
|
|
|
# Program name substituted for %(CMD)s in the usage text printed by help().
MYSELF = 'show-node-status.py'
|
|
|
|
|
|
|
|
|
|
|
|
def node_slot_stats_raw(qstat_f, show_disabled_nodes=False):
    """Print the queue/host status lines of `qstat -f` output verbatim.

    Only the column header (when it is the very first line) and the
    per-`queue@host` status lines are printed:
    - disabled nodes are not printed unless `show_disabled_nodes` is true
    - the computational jobs running on these nodes are not shown

    Args:
        qstat_f: iterable of `qstat -f` output lines (the callers in this
            file pass lines whose EOLs have been removed).
        show_disabled_nodes: if true, also print the lines whose state
            field contains the "d" flag.
    """
    for lineno, line in enumerate(qstat_f, 1):
        fields = line.split()

        # The column header is echoed only when it is the first line.
        # `fields` is empty for a blank line, hence the truthiness guard
        # (the unguarded fields[0] would raise IndexError).
        if lineno == 1 and fields and fields[0] == "queuename":
            print(line)
            continue

        # A host status line starts with a letter and has five fields,
        # plus an optional sixth field holding the state flags.
        if not re.search(r'^[A-Za-z]', line) or len(fields) not in (5, 6):
            continue

        status_flags = fields[5] if len(fields) > 5 else ""
        if show_disabled_nodes or "d" not in status_flags:
            print(line)
|
|
|
|
|
|
|
|
|
|
|
|
def node_slot_stats(qstat_f, show_disabled_nodes=False):
    """Print the slot availability and load of every host.

    The `qstat -f` lines are combined per host by `collect_host_stats`
    and printed by `print_host_stats`, one row per host, under the
    columns:

        HOST  CORES  used  free  resv  load  load/used

    In SGE terminology, "slot" means a CPU core.
    (The output changes depending on what's disabled and the load of the
    cluster.)

    FIXME: If a machine is covered by more than one queue, naive summing
    would cause the counts to be overestimated.
    Since on Turing, the practice is not to further split a
    machine to multiple queues (i.e. a 32-core node have all the 32
    cores assignable to both main and timed-main queues, rather than
    dedicating 16 for main and 16 for timed-main), we use a particular
    way to avoid the double-counting:

    - slots_resv: total number of reserved slots in a node (for whatever
      the sysadmin designates) -- sum them up

    - slots_used: total number of slots currently used (i.e.,
      occupied by jobs) -- sum them up

    - slots_tot: total number of slots in a node -- take the maximum
      value encountered.
      Had the nodes split-dedicated to a particular queue, we have to
      take the sum of the values instead.

    Args:
        qstat_f: iterable of `qstat -f` output lines.
        show_disabled_nodes: if true, hosts whose state flags contain "d"
            are included as well.
    """
    host_stats = collect_host_stats(qstat_f, show_disabled_nodes)
    print_host_stats(host_stats)
|
|
|
|
|
|
|
|
|
|
|
|
def node_slot_stats_per_machine_type(qstat_f, show_disabled_nodes=False):
    """Print the slot availability summarized per machine type.

    A machine type is defined as the hosts with the same base hostname,
    i.e. the part before the dash (e.g. "c6" or "c8").
    Originally implemented based on the naming of hosts on Turing cluster.
    In SGE terminology, "slot" means a CPU core.

    Example output (the trailing "load" and "load/used" columns, which
    are printed as well, are omitted here):

        MACHTYPE         NODES CORES  used  free  resv
        c6                  15   240    77   163     0
        c8                  40   768   569   199     0
        cr                  74  1480   988   492     0
        crhimem              3    96     0    96     0
        crphi               10   200    48   152     0
        d430                49  1568  1292   276     0
        d730                10   280    10   270     0

    (changes depending on what's disabled and the load of the cluster)

    FIXME: If a machine is covered by more than one queue, this will
    cause the counts to be overestimated. Must register if a machine has
    been encountered and not re-account that machine.
    However this may not be the best approach as queues are overlapping
    on machines. Since on Turing, the practice is not to further split a
    machine to multiple queues (i.e. a 32-core node have all the 32
    cores assignable to both main and timed-main queues, rather than
    dedicating 16 for main and 16 for timed-main), we use a particular
    way to avoid the double-counting:

    - slots_resv: total number of reserved slots in a node (for whatever
      the sysadmin designates) -- sum them up

    - slots_used: total number of slots currently used (i.e.,
      occupied by jobs) -- sum them up

    - slots_tot: total number of slots in a node -- take the maximum
      value encountered.
      Had the nodes split-dedicated to a particular queue, we have to
      take the sum of the values instead.

    Args:
        qstat_f: iterable of `qstat -f` output lines.
        show_disabled_nodes: if true, hosts whose state flags contain "d"
            are included in the summary as well.
    """
    host_stats = collect_host_stats(qstat_f, show_disabled_nodes)
    hosttype_stats = summarize_hosttype_stats(host_stats)
    print_hosttype_stats(hosttype_stats)
|
|
|
|
|
|
|
|
|
|
|
|
def collect_host_stats(qstat_f, show_disabled_nodes=None):
    """Internal routine to collect node stats from `qstat -f` by
    combining node status that were printed for each `queue@hostname`
    combinations.

    Args:
        qstat_f: iterable of `qstat -f` output lines.
        show_disabled_nodes: if false (the default), lines whose state
            flags contain "d" are skipped.

    Returns:
        A dict with hostname (e.g. "c8-014", domain part removed) as the
        key.  Each value is a dict with the keys:
        - 'slots_resv', 'slots_used': summed over the queues of the host
        - 'slots_tot': maximum value seen among the queues of the host
        - 'queues': list of queue names seen for the host
        - 'os_arch', 'queue_type', 'node_load', 'status_flags': values of
          the last line seen for the host ('node_load' is 0.0 when the
          load field is not a number).

    Raises:
        ParseError: if the first field of a status line does not look
            like `queue@kind-number[.domain]`.
        ValueError: if the slot field is not three "/"-separated integers.
    """
    host_stats = {}

    for lineno, line in enumerate(qstat_f, 1):
        fields = line.split()

        # Skip the column header; `fields` is empty for a blank first line.
        if lineno == 1 and fields and fields[0] == "queuename":
            continue

        # Valid host status field
        if not re.search(r'^[A-Za-z]', line) or len(fields) not in (5, 6):
            continue

        # This line has a format like this:
        # main@c8-014.cm.cluster         BIP   0/10/16        9.98     linux-x64     d
        # ^ queue & node name                                                        ^ status flags
        queue_node, queue_type, core_usage_combo, node_load, os_arch = fields[:5]
        status_flags = fields[5] if len(fields) > 5 else ""

        # skip disabled hosts
        if "d" in status_flags and not show_disabled_nodes:
            continue

        try:
            node_load = float(node_load)
        except ValueError:
            # The load field is not numeric; count it as zero.
            node_load = 0.0

        # Extract more useful info
        m = re.search(r'^([^@]+)@([^-]+)-([^.]*)((?:\..*)?)$', queue_node)
        if not m:
            raise ParseError(
                "Invalid queue/host combo on line %d: %s" % (lineno, queue_node))
        queue, hostkind, hostnum, _hostdomain = m.groups()
        hostname = hostkind + "-" + hostnum

        slots_resv, slots_used, slots_tot = map(int, core_usage_combo.split("/"))

        if hostname not in host_stats:
            host_stats[hostname] = {
                'slots_resv': 0,
                'slots_used': 0,
                'slots_tot': 0,
                'queues': [],
            }
        hoststat = host_stats[hostname]
        hoststat['slots_resv'] += slots_resv
        hoststat['slots_used'] += slots_used
        # FIXME assume same across queues; fix if not correct:
        hoststat['slots_tot'] = max(hoststat['slots_tot'], slots_tot)
        hoststat['os_arch'] = os_arch
        # FIXME we assume all of same queue type; fix if not correct:
        hoststat['queue_type'] = queue_type
        hoststat['queues'].append(queue)
        # FIXME we assume all have same load; fix if not correct:
        hoststat['node_load'] = node_load
        hoststat['status_flags'] = status_flags

    return host_stats
|
|
|
|
|
|
|
|
|
|
|
|
def node_load_ratio(node_load, slots_used):
    """Return the ratio of the node load to the number of slots in use.

    The value should be close to one if the jobs use the CPUs
    efficiently, or near zero if most jobs are interactive (i.e. lots of
    idling).  When no slot is in use, the result is 0.0 for a load below
    0.75 and NaN otherwise.
    """
    if slots_used != 0:
        return node_load / slots_used
    if node_load < 0.75:
        return 0.0
    return float('nan')
|
|
|
|
|
|
|
|
|
|
|
|
def print_host_stats(host_stats):
    """Print one row per host from the dict built by `collect_host_stats`.

    Rows are sorted by hostname.  The "free" column is the total slot
    count minus the used and reserved slots; the state flags, if any,
    are appended after the last column.
    """
    heading_fmt = "%-16s %5s %5s %5s %5s %7s %9s"
    row_fmt = "%-16s %5d %5d %5d %5d %7.2f %9.3f%s"

    print(heading_fmt
          % ("HOST", "CORES", "used", "free", "resv", "load", "load/used"))

    for name in sorted(host_stats.keys()):
        rec = host_stats[name]
        total = rec['slots_tot']
        used = rec['slots_used']
        free = rec['slots_tot'] - rec['slots_used'] - rec['slots_resv']
        reserved = rec['slots_resv']
        load = rec['node_load']
        ratio = node_load_ratio(rec['node_load'], rec['slots_used'])
        if rec['status_flags']:
            suffix = "  " + rec['status_flags']
        else:
            suffix = ""
        print(row_fmt % (name, total, used, free, reserved, load, ratio, suffix))
|
|
|
|
|
|
|
|
|
|
|
|
def summarize_hosttype_stats(host_stats):
    """Further summarize the host stats by the host type (denoted by the
    prefix of the hostname before the dash character, i.e. "c8" for
    "c8-003").

    Args:
        host_stats: dict keyed by hostname, as returned by
            `collect_host_stats`.

    Returns:
        A dict keyed by host type.  Each value is a dict with the keys:
        - 'hosts': sorted list of the hostnames of this type
        - 'host_count': number of hosts of this type
        - 'slots_resv', 'slots_tot', 'slots_used', 'node_load': sums of
          the per-host values
        - 'os_arch': architecture of the first host of this type.
    """
    hosttype_stats = {}

    # Group the hostnames by type.  Iterating in sorted order keeps the host
    # lists, and thus the host that supplies 'os_arch', deterministic.
    for hostname in sorted(host_stats):
        hosttype = hostname.split('-')[0]
        hosttype_stats.setdefault(hosttype, {'hosts': []})['hosts'].append(hostname)

    for hts in hosttype_stats.values():
        members = [host_stats[h] for h in hts['hosts']]
        hts['host_count'] = len(members)
        for key in ('slots_resv', 'slots_tot', 'slots_used', 'node_load'):
            hts[key] = sum(rec[key] for rec in members)
        # Taken from the first host of the type (this used to copy
        # 'slots_used' by mistake).
        hts['os_arch'] = members[0]['os_arch']

    return hosttype_stats
|
|
|
|
|
|
|
|
|
|
|
|
def print_hosttype_stats(hosttype_stats):
    """Print one row per machine type from the dict built by
    `summarize_hosttype_stats`.

    Rows are sorted by machine type.  The "free" column is the total
    slot count minus the used and reserved slots; "load/used" is
    computed by `node_load_ratio`.
    """
    print("%-16s %5s %5s %5s %5s %5s %7s %9s"
          % ("MACHTYPE", "NODES", "CORES", "used", "free", "resv", "load", "load/used"))
    for ht in sorted(hosttype_stats):
        hts = hosttype_stats[ht]
        print("%-16s %5d %5d %5d %5d %5d %7.2f %9.3f"
              % (ht, hts['host_count'],
                 hts['slots_tot'],
                 hts['slots_used'],
                 hts['slots_tot'] - hts['slots_used'] - hts['slots_resv'],
                 hts['slots_resv'],
                 hts['node_load'],
                 # Same rule as for the per-host table.
                 node_load_ratio(hts['node_load'], hts['slots_used']),
                 ))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def help():
    """Print the usage message of this tool to standard output."""
    # The option syntax shown here follows the getopt specification used in
    # main_default(): the long "disabled nodes" options take a value, and
    # options are listed before the positional qstat_file.
    msg = """\
%(CMD)s - Shows node status from SGE information

The information is mainly drawn from `qstat -f` output.

Usage: one of the following:

  %(CMD)s raw [options] [qstat_file]
      Shows the raw queue/node status

  %(CMD)s hoststats [options] [qstat_file]
      Shows the present statistics for every node

  %(CMD)s
  %(CMD)s stats [options] [qstat_file]
      Shows the statistic summary per node type

  %(CMD)s help
      Shows this help message

If qstat_file is given, the `qstat -f` output is read from that file
instead of being obtained by running `qstat -f`.

Options (place them before qstat_file):

  -d
  --show-disabled-nodes=BOOL
  --include-disabled-nodes=BOOL
      Includes the disabled nodes (BOOL is an integer, or one of
      yes/no, true/false, on/off)

  -s, --save
      Saves the `qstat -f` output to qstat-f-<timestamp>.txt

  -h, --help
      Shows this help message
""" \
    % dict(CMD=MYSELF)
    print(msg)
|
|
|
|
|
|
|
|
|
|
|
|
def main_default(argv):
    """Main default function.

    - argv[1], if given, selects the action: "raw", "stats" (the default
      when argv has no action), "hoststats", or "help".
    - The remaining arguments are options plus an optional file name.
      If a file name is given, the `qstat -f` output is read from that
      file; otherwise `qstat -f` is invoked.
    - The selected analysis is then printed.

    Args:
        argv: full argument list, including the program name at argv[0].

    Returns:
        0 on success (also after printing the help message), or 2 if the
        options cannot be parsed.

    Raises:
        ValueError: if argv[1] is not a known action (or an option value
            is rejected by `parse_int_or_bool`).
        ProgramError: on an option or action that is accepted but not
            handled here.
    """
    from time import localtime, strftime
    # gnu_getopt lets options appear after the file name too; plain getopt
    # stops at the first positional argument and would silently ignore them.
    from getopt import gnu_getopt, GetoptError

    dtimestr = strftime("%Y%m%d-%H%M", localtime())

    # Read the command first--what do we want to do
    if len(argv) < 2:
        cmd = "stats"
    elif argv[1] in ('--raw', 'raw'):
        cmd = "raw"
    elif argv[1] in ('--stats', 'stats', 'stat'):
        cmd = "stats"  # old stats, a.k.a. hosttype_stats
    elif re.search(r'^(--)?host-?stat', argv[1]):
        cmd = "hoststats"
    elif argv[1] in ('--help', 'help', '-h'):
        help()
        return 0
    else:
        raise ValueError("Unknown action: "+argv[1])

    # Skip program name and first command:
    cmdargs_in = argv[2:]
    try:
        cmdopts, cmdargs = gnu_getopt(cmdargs_in,
                                      "dhs",
                                      ["show-disabled-nodes=", "include-disabled-nodes=",
                                       "save",
                                       "help"])
    except GetoptError as err:
        sys.stderr.writelines([str(err), "\n"])
        return 2

    # Process flag argument
    show_disabled_nodes = False
    save_qstat = False
    for o, a in cmdopts:
        if o in ('-h', '--help'):
            help()
            return 0
        elif o in ('-d',):
            show_disabled_nodes = True
        elif o in ('--show-disabled-nodes', '--include-disabled-nodes'):
            show_disabled_nodes = parse_int_or_bool(a)
        elif o in ('-s', '--save'):
            save_qstat = True
        else:
            raise ProgramError("Unhandled option in main program: %s %s" % (o, a))

    if len(cmdargs) > 0:
        # `with` closes the input file once it has been read.
        with open(cmdargs[0], "r") as F:
            qstat_f_current = F.read().splitlines()
    else:
        qstat_f_current = pipe_out(('qstat', '-f'), split=True)
        # NOTE(review): the indentation of the original was lost; saving is
        # assumed to apply only to freshly obtained output -- confirm.
        if save_qstat:
            with open("qstat-f-%s.txt" % dtimestr, "w") as F:
                F.write("\n".join(qstat_f_current))
                F.write("\n")

    if cmd == "raw":
        node_slot_stats_raw(qstat_f_current,
                            show_disabled_nodes=show_disabled_nodes,
                            )
    elif cmd == "hoststats":
        node_slot_stats(qstat_f_current,
                        show_disabled_nodes=show_disabled_nodes,
                        )
    elif cmd == "stats":
        node_slot_stats_per_machine_type(qstat_f_current,
                                         show_disabled_nodes=show_disabled_nodes,
                                         )
    else:
        raise ProgramError("Missing support for command: "+cmd)

    return 0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Support tools below
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
|
|
def pipe_out(args, split=False, shell=False):
    """Executes a command, piping out the stdout to python for parsing.
    This is my customary shortcut for backtick operator.

    Args:
        args: the command, passed as-is to `subprocess.Popen`.
        split: if true, return the output as a list of lines.
        shell: passed as-is to `subprocess.Popen`.

    Returns:
        Either a single string (if split==False) or a list of strings
        with EOLs removed (if split==True).  The exit status of the
        command is not checked.
    """
    # universal_newlines=True makes the output a text string on Python 3 as
    # well (it is bytes otherwise), which is what the line parsers expect.
    proc = subprocess.Popen(args, stdout=subprocess.PIPE, shell=shell,
                            universal_newlines=True)
    output = proc.communicate()[0]
    if split:
        return output.splitlines()
    return output
|
|
|
|
|
|
|
|
|
|
|
|
# Internal variable: don't mess!
|
|
|
|
_str_fmt_heading_rx = None
|
|
|
|
def str_fmt_heading(fmt):
|
|
|
|
"""Replaces a printf-style formatting with one suitable for table heading:
|
|
|
|
all non-string conversions are replaced with string conversions,
|
|
|
|
preserving the minimum widths."""
|
|
|
|
# Originally from: $PWQMC77/scripts/cost.py and later Cr2_analysis_cbs.py .
|
|
|
|
#
|
|
|
|
#_str_fmt_heading_rx = None # only for development purposes
|
|
|
|
import re
|
|
|
|
global _str_fmt_heading_rx
|
|
|
|
if _str_fmt_heading_rx is None:
|
|
|
|
# Because of complicated regex, I verbosely write it out here:
|
|
|
|
_str_fmt_heading_rx = re.compile(r"""
|
|
|
|
(
|
|
|
|
% # % sign
|
|
|
|
(?:\([^)]+\))? # optional '(keyname)' mapping key
|
|
|
|
[-+#0 hlL]* # optional conversion flag
|
|
|
|
[0-9*]* # optional minimum field width
|
|
|
|
)
|
|
|
|
((?:\.[0-9]*)?) # optional precision
|
|
|
|
[^-+#*0 hlL0-9.%s] # not conv flag, dimensions, nor literal '%',
|
|
|
|
# nor 's' conversion specifiers
|
|
|
|
""", re.VERBOSE)
|
|
|
|
return _str_fmt_heading_rx.sub(r'\1s', fmt)
|
|
|
|
|
|
|
|
|
|
|
|
def parse_int_or_bool(S):
    """Convert a string to an integer or a boolean value.

    Args:
        S: the value to convert.  A string is stripped and lowercased
            first; any other object is returned unchanged.

    Returns:
        The integer value if the string is a valid integer; otherwise
        True for 'true', 't', 'yes', 'y', 'on' and False for 'false',
        'f', 'no', 'n', 'off', '-', or an empty string.

    Raises:
        ValueError: if the string is none of the above.
    """
    try:
        string_types = basestring  # Python 2: covers str and unicode
    except NameError:
        string_types = str  # Python 3 has no basestring

    if not isinstance(S, string_types):
        return S

    S = S.strip().lower()
    try:
        return int(S)
    except ValueError:
        pass  # not an integer: try the boolean words below

    if S in ('true', 't', 'yes', 'y', 'on'):
        return True
    if S in ('false', 'f', 'no', 'n', 'off', '-', ''):
        return False
    raise ValueError("Don't understand '%s' for boolean value" % S)
|
|
|
|
|
|
|
|
|
|
|
|
def parse_bool(S):
    """Convert a string to a boolean value.

    Args:
        S: the value to convert.  A string is stripped and lowercased
            first; any other object is returned unchanged.

    Returns:
        True for 'true', 't', 'yes', 'y', 'on', '1' and False for
        'false', 'f', 'no', 'n', 'off', '0', '-', or an empty string.

    Raises:
        ValueError: if the string is none of the above.
    """
    try:
        string_types = basestring  # Python 2: covers str and unicode
    except NameError:
        string_types = str  # Python 3 has no basestring

    if not isinstance(S, string_types):
        return S

    S = S.strip().lower()
    if S in ('true', 't', 'yes', 'y', 'on', '1'):
        return True
    if S in ('false', 'f', 'no', 'n', 'off', '0', '-', ''):
        return False
    raise ValueError("Don't understand '%s' for boolean value" % S)
|
|
|
|
|
|
|
|
|
|
|
|
# stub main code
|
|
|
|
|
|
|
|
# Run only as a script, and not when the file is executed inside an IPython
# session (where `get_ipython` is defined in the global namespace).
if __name__ == "__main__" and "get_ipython" not in globals():
    # Hand main_default's return value (0, or 2 on an option error) to the
    # shell as the exit status instead of discarding it.
    sys.exit(main_default(sys.argv))
|