From f06803ba6c47f7151722a0328528d3242210a31e Mon Sep 17 00:00:00 2001
From: Wirawan Purwanto <wpurwant@turing.hpc.odu.edu>
Date: Wed, 14 Sep 2016 10:16:35 -0400
Subject: [PATCH] * show-node-status.py: A toolbox to analyze node status
 returned by SGE.

---
 sge/show-node-status.py | 236 ++++++++++++++++++++++++++++++++++++++--
 1 file changed, 229 insertions(+), 7 deletions(-)

diff --git a/sge/show-node-status.py b/sge/show-node-status.py
index d8bddd1..775d616 100755
--- a/sge/show-node-status.py
+++ b/sge/show-node-status.py
@@ -20,10 +20,16 @@ import re
 import subprocess
 import sys
 
+class ParseError(RuntimeError):
+  """Raised when a queue@host entry in `qstat -f` output is malformed."""
+
+class ProgramError(RuntimeError):
+  """Raised on an internal inconsistency, e.g. a command with no handler."""
+
 #----------------------- UNDER CONSTRUCTION -----------------------
 #Nothing was done yet
 
-def node_slot_stats_raw(qstat_f, show_disabled_nodes=True):
+def node_slot_stats_raw(qstat_f, show_disabled_nodes=False):
   """Prints the node stats from `qstat -f' in raw format:
   - not printing disabled nodes
   - not showing the computational jobs that are running on these nodes
@@ -44,6 +50,180 @@ def node_slot_stats_raw(qstat_f, show_disabled_nodes=True):
       print(L)
 
 
+def node_slot_stats_per_machine_type(qstat_f, show_disabled_nodes=False):
+  """Prints status of slot availability per machine type (defined as
+  hosts sharing the same base hostname, e.g. "c6" or "c8").
+  Originally implemented based on the naming of hosts on Turing cluster.
+  In SGE terminology, "slot" means a CPU core.
+
+  Arguments:
+  - qstat_f: iterable of text lines from the output of `qstat -f`
+  - show_disabled_nodes: if true, hosts whose status flags contain "d"
+    are included in the counts; they are skipped by default
+
+  Example output (trailing "load" and "load/used" columns omitted):
+
+     MACHTYPE         NODES   CORES  used  free  resv
+     c6                  15     240    77   163     0
+     c8                  40     768   569   199     0
+     cr                  74    1480   988   492     0
+     crhimem              3      96     0    96     0
+     crphi               10     200    48   152     0
+     d430                49    1568  1292   276     0
+     d730                10     280    10   270     0
+
+  (changes depending on what's disabled and the load of the cluster)
+
+  NOTE on hosts served by more than one queue: `qstat -f` prints one
+  line per queue@host pair, so naively summing every column would
+  overestimate the core counts.
+  Since on Turing, the practice is not to further split a
+  machine to multiple queues (i.e. a 32-core node has all the 32
+  cores assignable to both main and timed-main queues, rather than
+  dedicating 16 for main and 16 for timed-main), we use a particular
+  way to avoid the double-counting:
+
+  - slots_resv: total number of reserved slots in a node (for whatever
+    the sysadmin designates) -- sum them up
+
+  - slots_used: total number of slots currently used (i.e.,
+    occupied by jobs) -- sum them up
+
+  - slots_tot: total number of slots in a node -- take the maximum
+    value encountered.
+    Had the nodes split-dedicated to a particular queue, we have to
+    take the sum of the values instead.
+  """
+  host_stats = collect_host_stats(qstat_f, show_disabled_nodes)
+  hosttype_stats = summarize_hosttype_stats(host_stats)
+  print_hosttype_stats(hosttype_stats)
+
+
+def collect_host_stats(qstat_f, show_disabled_nodes=None):
+  """Internal routine to collect node stats from `qstat -f` by
+  combining node status that were printed for each `queue@hostname`
+  combinations.
+
+  qstat_f is an iterable of `qstat -f` output lines. Hosts flagged
+  "d" (disabled) are skipped unless show_disabled_nodes is true.
+
+  The result is a dict with hostname as the key; each value holds
+  slots_resv and slots_used (summed over queues), slots_tot (maximum
+  over queues), queues, os_arch, queue_type and node_load.
+
+  Raises ParseError on a malformed `queue@hostname` field."""
+
+  host_stats = {}
+
+  def host_get_stats_rec(hostname):
+    if hostname not in host_stats:
+      host_stats[hostname] = {
+        'slots_resv': 0,
+        'slots_used': 0,
+        'slots_tot': 0,
+        'queues': [],
+      }
+    return host_stats[hostname]
+
+  for (FNR, L) in enumerate(qstat_f, 1):
+    FLDS = L.split()
+    # Skip the column heading (a blank first line has no fields)
+    if FNR == 1 and FLDS and FLDS[0] == "queuename":
+      continue
+
+    # Valid host status field
+    if re.search(r'^[A-Za-z]', L) and len(FLDS) in (5,6):
+      # This line has a format like this:
+      # main@c8-014.cm.cluster         BIP   0/10/16        9.98     linux-x64     d
+      # ^ queue & node name
+      queue_node, queue_type, core_usage_combo, node_load, os_arch \
+          = tuple(FLDS[0:5])
+      # Status flags are the optional sixth field
+      status_flags = FLDS[5] if (len(FLDS) > 5) else ""
+      try:
+        node_load = float(node_load)
+      except ValueError:
+        # Load is not numeric (presumably unreported); count it as zero
+        node_load = 0
+
+      # skip disabled hosts
+      if ("d" in status_flags) and not show_disabled_nodes:
+        continue
+
+      # Extract more useful info
+      m = re.search(r'^([^@]+)@([^-]+)-(.*)$', queue_node)
+      if not m:
+        raise ParseError(
+              "Invalid queue/host combo on line %d: %s" % (FNR, queue_node))
+      queue, hostkind, hostnum = m.groups()
+      hostname = hostkind + "-" + hostnum
+
+      slots_resv, slots_used, slots_tot = map(int, core_usage_combo.split("/"))
+      hoststat = host_get_stats_rec(hostname)
+      hoststat['slots_resv'] += slots_resv
+      hoststat['slots_used'] += slots_used
+      # FIXME assume same across queues; fix if not correct:
+      hoststat['slots_tot'] = max(hoststat['slots_tot'], slots_tot)
+      hoststat['os_arch'] = os_arch
+      # FIXME we assume all of same queue type; fix if not correct:
+      hoststat['queue_type'] = queue_type
+      hoststat['queues'].append(queue)
+      # FIXME we assume all have same load; fix if not correct:
+      hoststat['node_load'] = node_load
+
+  return host_stats
+
+
+def summarize_hosttype_stats(host_stats):
+  """Further summarize the host stats by the host type (denoted by the
+  prefix of the hostname before the dash character, i.e. "c8" for
+  "c8-003").
+
+  host_stats is the dict returned by collect_host_stats. The result is
+  a dict keyed by host type; each value holds hosts (list of hostnames),
+  host_count, the sums of slots_resv, slots_tot, slots_used and
+  node_load over those hosts, and os_arch (from the first host).
+  """
+  hosttype_stats = {}
+
+  # Group the hostnames by their host type
+  for hostname in host_stats:
+    hosttype = hostname.split('-')[0]
+    if hosttype not in hosttype_stats:
+      hosttype_stats[hosttype] = {'hosts': []}
+    hosttype_stats[hosttype]['hosts'].append(hostname)
+
+  for hts in hosttype_stats.values():
+    members = [ host_stats[h] for h in hts['hosts'] ]
+    hts['host_count'] = len(members)
+    hts['slots_resv'] = sum(rec['slots_resv'] for rec in members)
+    hts['slots_tot'] = sum(rec['slots_tot'] for rec in members)
+    hts['slots_used'] = sum(rec['slots_used'] for rec in members)
+    hts['node_load'] = sum(rec['node_load'] for rec in members)
+    # All hosts of one type are assumed to share the architecture
+    hts['os_arch'] = members[0]['os_arch']
+
+  return hosttype_stats
+
+
+def print_hosttype_stats(hosttype_stats):
+  """Prints a heading, then one summary row per host type (sorted by name)."""
+  row_fmt = "%-16s %5d   %5d %5d %5d %5d %7.2f %9.3f"
+  print("%-16s %5s   %5s %5s %5s %5s %7s %9s" \
+        % ("MACHTYPE", "NODES", "CORES", "used", "free", "resv", "load", "load/used"))
+  for name in sorted(hosttype_stats):
+    rec = hosttype_stats[name]
+    tot, used, resv = rec['slots_tot'], rec['slots_used'], rec['slots_resv']
+    load = rec['node_load']
+    if used != 0:
+      load_per_used = load / used
+    elif load < 0.75:
+      load_per_used = 0.0
+    else:
+      load_per_used = float('nan')
+    print(row_fmt % (name, rec['host_count'], tot, used,
+                     tot - used - resv, resv, load, load_per_used))
+
 
 
 def help():
@@ -65,13 +245,14 @@ stats
 """
 
 
-def main_default(argv, save_qstat=True):
+def main_default(argv, save_qstat=None):
   """Main default function:
   - By default we invoke qstat -f and prints the analysis.
   - If argv[1] is given, then we read in the file and 
     use that for the analysis.
   """
   from time import localtime, strftime
+  from getopt import getopt
 
   dtime = localtime()
   dtimestr = strftime("%Y%m%d-%H%M", dtime)
@@ -87,10 +268,23 @@ def main_default(argv, save_qstat=True):
     raise ValueError, "Unknown action: "+argv[1]
 
   # Skip program name and first command:
-  cmdargs = argv[2:]
+  cmdargs_in = argv[2:]
+  cmdopts, cmdargs = getopt(cmdargs_in,
+                            "ds",
+                            ["show-disabled-nodes=",
+                             "include-disabled-nodes=",
+                             "save",
+                            ])
 
   # Default options
   show_disabled_nodes = False
+  for o,a in cmdopts:
+    if o in ('-d',):
+      show_disabled_nodes = True
+    elif o in ('--show-disabled-nodes', '--include-disabled-nodes'):
+      show_disabled_nodes = parse_int_or_bool(a)
+    elif o in ('-s', '--save'):
+      save_qstat = True
 
   if len(cmdargs) > 0:
     qstat_f_current = open(cmdargs[0], "r").read().splitlines()
@@ -106,11 +300,11 @@ def main_default(argv, save_qstat=True):
                         show_disabled_nodes=show_disabled_nodes,
                        )
   elif cmd == "stats":
-    node_slots_stats_per_node_type(qstat_f_current,
-                                   show_disabled_nodes=show_disabled_nodes,
-                                  )
+    node_slot_stats_per_machine_type(qstat_f_current,
+                                     show_disabled_nodes=show_disabled_nodes,
+                                    )
   else:
-    raise  "Missing support for command: "+cmd
+    raise ProgramError, "Missing support for command: "+cmd
 
 
 
@@ -157,6 +351,34 @@ def str_fmt_heading(fmt):
   return _str_fmt_heading_rx.sub(r'\1s', fmt)
 
 
+def parse_int_or_bool(S):
+  """Parses S as an int, else as a boolean keyword; non-strings pass through."""
+  if not isinstance(S, basestring):
+    return S
+  text = S.strip().lower()
+  try:
+    return int(text)
+  except ValueError:
+    pass
+  if text in ('true', 't', 'yes', 'y', 'on'):
+    return True
+  if text in ('false', 'f', 'no', 'n', 'off', '-', ''):
+    return False
+  raise ValueError("Don't understand '%s' for boolean value" % text)
+
+
+def parse_bool(S):
+  """Parses S as a boolean keyword; non-strings pass through unchanged."""
+  if not isinstance(S, basestring):
+    return S
+  # Matching is case-insensitive and ignores surrounding whitespace
+  word = S.strip().lower()
+  if word in ('true', 't', 'yes', 'y', 'on', '1'):
+    return True
+  if word in ('false', 'f', 'no', 'n', 'off', '0', '-', ''):
+    return False
+  raise ValueError("Don't understand '%s' for boolean value" % word)
+
 
 # stub main code