|
|
|
@ -50,6 +50,52 @@ def node_slot_stats_raw(qstat_f, show_disabled_nodes=False): |
|
|
|
|
print(L) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def node_slot_stats(qstat_f, show_disabled_nodes=False):
    """Prints the status of slot availability and load for every host.

    This is the per-host report: the statistics are gathered with
    `collect_host_stats` and printed, one row per host, with
    `print_host_stats`.  (The per-machine-type summary is produced by
    `node_slot_stats_per_machine_type` instead.)

    In SGE terminology, "slot" means a CPU core.

    Arguments:

    - qstat_f: passed unchanged to `collect_host_stats`
      (presumably the output of `qstat -f` -- verify against the caller).

    - show_disabled_nodes: passed unchanged to `collect_host_stats`
      (presumably selects whether disabled nodes are included -- verify
      there).

    Illustrative output (host name and numbers are made up; the columns
    are those printed by `print_host_stats`):

    HOST             CORES  used  free  resv    load load/used
    c8-001              16    12     4     0   11.80     0.983

    (changes depending on what's disabled and the load of the cluster)

    FIXME: If a machine is covered by more than one queue, this will
    cause the counts to be overestimated.  Must register if a machine has
    been encountered and not re-account that machine.
    However this may not be the best approach as queues are overlapping
    on machines.  Since on Turing, the practice is not to further split a
    machine to multiple queues (i.e. a 32-core node has all the 32
    cores assignable to both main and timed-main queues, rather than
    dedicating 16 for main and 16 for timed-main), we use a particular
    way to avoid the double-counting (NOTE: these rules are presumably
    implemented in `collect_host_stats`; confirm there):

    - slots_resv: total number of reserved slots in a node (for whatever
      the sysadmin designates) -- sum them up

    - slots_used: total number of slots currently used (i.e.,
      occupied by jobs) -- sum them up

    - slots_tot: total number of slots in a node -- take the maximum
      value encountered.
      Had the nodes been split-dedicated to a particular queue, we would
      have to take the sum of the values instead.

    """
    host_stats = collect_host_stats(qstat_f, show_disabled_nodes)
    print_host_stats(host_stats)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def node_slot_stats_per_machine_type(qstat_f, show_disabled_nodes=False): |
|
|
|
|
"""Prints status of slot availability per machine type (defined as |
|
|
|
|
host with the same base hostname (e.g. "c6-", or "c8-"). |
|
|
|
@ -174,6 +220,35 @@ def collect_host_stats(qstat_f, show_disabled_nodes=None): |
|
|
|
|
return host_stats |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def node_load_ratio(node_load, slots_used, idle_threshold=0.75):
    """Ratio of node load vs slots claimed to be used.  This should be close
    to one if the job uses the CPUs efficiently, or near zero if most jobs are
    interactive (i.e. lots of idling).

    Arguments:

    - node_load: the load figure of the node.

    - slots_used: number of slots currently claimed as used on the node.

    - idle_threshold: when no slot is in use, a load below this value is
      treated as "idle" and reported as a ratio of 0.0 (default: 0.75).

    Returns `node_load / slots_used` when `slots_used` is nonzero.
    When no slot is in use the ratio is undefined: 0.0 is returned if the
    load is below `idle_threshold`, otherwise NaN is returned to flag a
    node that carries load although no slot is claimed.
    """
    if slots_used != 0:
        return node_load / slots_used
    # No slot is claimed, so division is impossible; distinguish a quiet
    # node from one that is loaded without any accounted slot.
    if node_load < idle_threshold:
        return 0.0
    return float('nan')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def print_host_stats(host_stats):
    """Prints one table row per host from the statistics gathered by
    `collect_host_stats`.

    `host_stats` is a mapping keyed by host name; each value is indexed
    here with the keys 'slots_tot', 'slots_used', 'slots_resv' and
    'node_load'.  Rows are printed in sorted host-name order below a
    header line.  The "free" column is computed as total minus used minus
    reserved slots, and the "load/used" column comes from
    `node_load_ratio`.
    """
    header_fmt = "%-16s %5s %5s %5s %5s %7s %9s"
    row_fmt = "%-16s %5d %5d %5d %5d %7.2f %9.3f"

    # Sort before printing anything so the header never precedes a
    # sorting failure.
    ordered_hosts = sorted(host_stats)
    print(header_fmt
          % ("HOST", "CORES", "used", "free", "resv", "load", "load/used"))

    for host in ordered_hosts:
        entry = host_stats[host]
        total = entry['slots_tot']
        used = entry['slots_used']
        reserved = entry['slots_resv']
        load = entry['node_load']
        free = total - used - reserved
        ratio = node_load_ratio(load, used)
        print(row_fmt % (host, total, used, free, reserved, load, ratio))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def summarize_hosttype_stats(host_stats): |
|
|
|
|
"""Further summarize the host stats by the host type (denoted by the |
|
|
|
|
prefix of the hostname before the dash character, i.e. "c8" for |
|
|
|
@ -226,6 +301,7 @@ def print_hosttype_stats(hosttype_stats): |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def help(): |
|
|
|
|
msg = """\ |
|
|
|
|
%(CMD)s - Shows node status from SGE information |
|
|
|
@ -263,7 +339,9 @@ def main_default(argv): |
|
|
|
|
elif argv[1] in ('--raw', 'raw'): |
|
|
|
|
cmd = "raw" |
|
|
|
|
elif argv[1] in ('--stats', 'stats', 'stat'): |
|
|
|
|
cmd = "stats" |
|
|
|
|
cmd = "stats" # old stats, a.k.a. hosttype_stats |
|
|
|
|
elif re.search(r'^(--)?host-?stat', argv[1]): |
|
|
|
|
cmd = "hoststats" |
|
|
|
|
elif argv[1] in ('--help', 'help', '-h'): |
|
|
|
|
help() |
|
|
|
|
return 0 |
|
|
|
@ -311,6 +389,10 @@ def main_default(argv): |
|
|
|
|
node_slot_stats_raw(qstat_f_current, |
|
|
|
|
show_disabled_nodes=show_disabled_nodes, |
|
|
|
|
) |
|
|
|
|
elif cmd == "hoststats": |
|
|
|
|
node_slot_stats(qstat_f_current, |
|
|
|
|
show_disabled_nodes=show_disabled_nodes, |
|
|
|
|
) |
|
|
|
|
elif cmd == "stats": |
|
|
|
|
node_slot_stats_per_machine_type(qstat_f_current, |
|
|
|
|
show_disabled_nodes=show_disabled_nodes, |
|
|
|
|