#!/usr/bin/env python
#
# Created: 20160830
# Wirawan Purwanto

"""
show-node-status.py
---------------------

Various tools to investigate node status in an SGE cluster.
This tool is a replacement and upgrade of the shell version of the tool
`node-slot-status.sh`.

Usage: run `show-node-status.py help` to list the commands and options.

"""

import os
import re
import subprocess
import sys

class ParseError(RuntimeError):
  """Error raised when a line of the `qstat -f` output has a
  queue/host entry that cannot be parsed."""

class ProgramError(RuntimeError):
  """Error raised for an internal inconsistency of this program, such
  as a command-line option or command that was accepted but has no
  handler."""

# Program name that is substituted into the usage text printed by help().
MYSELF = 'show-node-status.py'


def node_slot_stats_raw(qstat_f, show_disabled_nodes=False):
  """Prints the queue/node status lines of `qstat -f' in raw format.

  Only the queue-instance status lines are printed (plus the
  "queuename ..." header when it is the very first line).  Everything
  else, such as the lines describing the computational jobs that are
  running on the nodes, is dropped.

  Arguments:
  - qstat_f: iterable of the text lines of the `qstat -f' output
    (without line terminators).
  - show_disabled_nodes: if true, also print the queue instances whose
    status flags contain "d" (disabled); they are hidden by default.
  """
  for lineno, line in enumerate(qstat_f, 1):
    fields = line.split()

    # A blank line has no fields at all.  It is never printed, and
    # skipping it here keeps the header test below from indexing into
    # an empty list when the input starts with a blank line.
    if not fields:
      continue

    if lineno == 1 and fields[0] == "queuename":
      print(line)
      continue

    # The status flags are the optional sixth column
    status_flags = fields[5] if len(fields) > 5 else ""

    # Valid host status line: starts with a letter (the queue name) in
    # the first column and has five or six columns
    if re.search(r'^[A-Za-z]', line) and len(fields) in (5, 6) \
       and (show_disabled_nodes or ("d" not in status_flags)):
      print(line)


def node_slot_stats_per_machine_type(qstat_f, show_disabled_nodes=False):
  """Prints the slot availability summed up per machine type.

  A machine type is the group of hosts sharing the same base hostname,
  i.e. the part before the dash ("c6" for "c6-001", "c8" for "c8-014").
  This follows the host naming on the Turing cluster, for which this
  tool was originally written.
  In SGE terminology, a "slot" means a CPU core.

  Arguments:
  - qstat_f: the text lines of the `qstat -f' output.
  - show_disabled_nodes: if true, the disabled nodes are included in
    the statistics.

  Example output (abridged: the two load columns that are also printed
  are left out here):

     MACHTYPE         NODES   CORES  used  free  resv
     c6                  15     240    77   163     0
     c8                  40     768   569   199     0
     cr                  74    1480   988   492     0
     crhimem              3      96     0    96     0
     crphi               10     200    48   152     0
     d430                49    1568  1292   276     0
     d730                10     280    10   270     0

  (the numbers depend on what is disabled and on the load of the
  cluster)

  Note on hosts that are served by more than one queue:
  such a host shows up once per queue in `qstat -f', so naively adding
  up every line would overestimate the counts.
  On Turing a machine is not split among queues (i.e. a 32-core node
  has all of its 32 cores assignable to both the main and timed-main
  queues, rather than 16 dedicated to main and 16 to timed-main),
  therefore the per-host numbers are combined this way:

  - slots_resv: number of reserved slots in a node (for whatever the
    sysadmin designates) -- summed over the queues

  - slots_used: number of slots currently occupied by jobs -- summed
    over the queues

  - slots_tot: total number of slots in a node -- the maximum value
    encountered is taken.
    FIXME: had the nodes been split-dedicated to particular queues,
    the sum of the values would have to be taken instead.
  """
  per_host = collect_host_stats(qstat_f, show_disabled_nodes)
  per_hosttype = summarize_hosttype_stats(per_host)
  print_hosttype_stats(per_hosttype)


def collect_host_stats(qstat_f, show_disabled_nodes=None):
  """Internal routine to collect node stats from `qstat -f` by
  combining the node status lines that were printed for each
  `queue@hostname` combination.

  Arguments:
  - qstat_f: iterable of the text lines of the `qstat -f` output.
  - show_disabled_nodes: if true, queue instances whose status flags
    contain "d" (disabled) are included; otherwise they are skipped.

  Returns a dict with the short hostname (e.g. "c8-014", without the
  domain part) as the key.  Each value is a dict with these keys:
  - 'slots_resv': reserved slots, summed over the queues of the host
  - 'slots_used': used slots, summed over the queues of the host
  - 'slots_tot': total slots, the maximum found among the queues
  - 'queues': list of queue names found for the host
  - 'os_arch', 'queue_type', 'node_load': taken from the last status
    line seen for the host ('node_load' is 0.0 if it is not a number)

  Raises ParseError if the first column of a status line does not have
  the `queue@hosttype-hostnumber[.domain]` form, and ValueError if the
  slot usage column is not made of three "/"-separated integers.
  """
  host_stats = {}

  for lineno, line in enumerate(qstat_f, 1):
    fields = line.split()

    # Skip the column header.  The `fields` check guards against a
    # blank first line, which has no first field to look at.
    if lineno == 1 and fields and fields[0] == "queuename":
      continue

    # Valid host status line: starts with a letter in the first column
    # and has five or six columns.  It has a format like this:
    # main@c8-014.cm.cluster         BIP   0/10/16        9.98     linux-x64     d
    # ^ queue & node name
    if not (re.search(r'^[A-Za-z]', line) and len(fields) in (5, 6)):
      continue

    queue_node, queue_type, core_usage_combo, node_load, os_arch \
        = fields[0:5]
    # The status flags are the optional sixth column
    status_flags = fields[5] if len(fields) > 5 else ""

    # skip disabled hosts
    if ("d" in status_flags) and not show_disabled_nodes:
      continue

    # The load column may hold a non-numeric placeholder
    try:
      node_load = float(node_load)
    except ValueError:
      node_load = 0.0

    # Split into queue name, host type, host number and domain name
    m = re.search(r'^([^@]+)@([^-]+)-([^.]*)((?:\..*)?)$', queue_node)
    if not m:
      raise ParseError(
          "Invalid queue/host combo on line %d: %s" % (lineno, queue_node))
    queue, hostkind, hostnum, _hostdomain = m.groups()
    hostname = hostkind + "-" + hostnum

    slots_resv, slots_used, slots_tot = map(int, core_usage_combo.split("/"))

    hoststat = host_stats.setdefault(hostname, {
      'slots_resv': 0,
      'slots_used': 0,
      'slots_tot': 0,
      'queues': [],
    })
    hoststat['slots_resv'] += slots_resv
    hoststat['slots_used'] += slots_used
    # FIXME assume same across queues; fix if not correct:
    hoststat['slots_tot'] = max(hoststat['slots_tot'], slots_tot)
    hoststat['os_arch'] = os_arch
    # FIXME we assume all of same queue type; fix if not correct:
    hoststat['queue_type'] = queue_type
    hoststat['queues'].append(queue)
    # FIXME we assume all have same load; fix if not correct:
    hoststat['node_load'] = node_load

  return host_stats


def summarize_hosttype_stats(host_stats):
  """Further summarize the host stats by the host type (denoted by the
  prefix of the hostname before the dash character, i.e. "c8" for
  "c8-003").

  Arguments:
  - host_stats: dict keyed by hostname; each value is a dict holding at
    least the 'slots_resv', 'slots_tot', 'slots_used' and 'node_load'
    numbers of that host (and normally 'os_arch').

  Returns a dict keyed by host type.  Each value is a dict with:
  - 'hosts': list of the hostnames of that type
  - 'host_count': number of those hosts
  - 'slots_resv', 'slots_tot', 'slots_used', 'node_load': sums over
    those hosts
  - 'os_arch': the 'os_arch' of the first host in 'hosts' (None if that
    host record has none)
  """
  hosttype_stats = {}

  # Pass 1: group the hostnames by their type prefix
  for hostname in host_stats:
    hosttype = hostname.split('-')[0]
    hts = hosttype_stats.setdefault(hosttype, {'hosts': []})
    hts['hosts'].append(hostname)

  # Pass 2: add up the per-host numbers within each group
  for hts in hosttype_stats.values():
    hosts = hts['hosts']
    hts['host_count'] = len(hosts)
    for key in ('slots_resv', 'slots_tot', 'slots_used', 'node_load'):
      hts[key] = sum(host_stats[h][key] for h in hosts)
    # Take the architecture from the first host of the group (this used
    # to pick up that host's 'slots_used' number by mistake)
    hts['os_arch'] = host_stats[hosts[0]].get('os_arch')

  return hosttype_stats


def print_hosttype_stats(hosttype_stats):
  """Prints the per-host-type summary as a table on standard output,
  one row per host type, sorted by the host type name.

  hosttype_stats is a dict keyed by host type whose values are dicts
  with the 'host_count', 'slots_tot', 'slots_used', 'slots_resv' and
  'node_load' numbers.

  The "free" column is the total minus the used and reserved slots.
  The "load/used" column is the load divided by the number of used
  slots; when no slot is used it is 0.0 for a load below 0.75 and NaN
  otherwise.
  """
  header_fmt = "%-16s %5s   %5s %5s %5s %5s %7s %9s"
  row_fmt = "%-16s %5d   %5d %5d %5d %5d %7.2f %9.3f"

  print(header_fmt % ("MACHTYPE", "NODES", "CORES", "used", "free", "resv",
                      "load", "load/used"))

  for hosttype in sorted(hosttype_stats):
    rec = hosttype_stats[hosttype]
    nodes = rec['host_count']
    total = rec['slots_tot']
    used = rec['slots_used']
    resv = rec['slots_resv']
    load = rec['node_load']

    if used != 0:
      load_per_used = load / used
    elif load < 0.75:
      load_per_used = 0.0
    else:
      load_per_used = float('nan')

    print(row_fmt % (hosttype, nodes, total, used, total - used - resv,
                     resv, load, load_per_used))


def help():
  """Prints the usage summary of this program to standard output.

  The text describes the command line as main_default() parses it: the
  long options for the disabled nodes take a value (only -d is a bare
  flag), and the options are listed ahead of the optional qstat_file.
  """
  msg = """\
%(CMD)s - Shows node status from SGE information

The information is mainly drawn from `qstat -f` output.

Usage: one of the following:

%(CMD)s raw [options] [qstat_file]
    Shows the raw queue/node status

%(CMD)s
%(CMD)s stats [options] [qstat_file]
    Shows the statistic summary per node type

%(CMD)s help
    Shows this help text

If qstat_file is given, it is read in place of running `qstat -f`.

Options (place them before qstat_file):

-d
--show-disabled-nodes=BOOL
--include-disabled-nodes=BOOL
    Includes the disabled nodes (BOOL is e.g. 1/0, yes/no, true/false)

-s, --save
    Saves the `qstat -f` output to qstat-f-YYYYmmdd-HHMM.txt
    (no effect when qstat_file is given)

-h, --help
    Shows this help text
""" \
    % dict(CMD=MYSELF)
  print(msg)


def main_default(argv):
  """Main default function:
  - By default we invoke `qstat -f` and print the analysis.
  - If a file name is given after the command, then we read in the
    file and use that for the analysis.

  argv is the full argument list including the program name:
  argv[1] is the command ("raw", "stats" or "help" and their aliases;
  "stats" is assumed when it is missing), followed by the options and
  the optional qstat file name.
  The options may be placed before or after the file name.

  Options:
  - -d: include the disabled nodes
  - --show-disabled-nodes=VAL, --include-disabled-nodes=VAL: same, with
    an explicit integer/boolean value
  - -s, --save: save the freshly obtained `qstat -f` output to a
    time-stamped file "qstat-f-YYYYmmdd-HHMM.txt" (ignored when the
    input comes from a file)
  - -h, --help: print the help text

  Returns 0 on success and 2 if the options cannot be parsed.
  Raises ValueError for an unknown command or a bad boolean value.
  """
  from time import localtime, strftime
  from getopt import gnu_getopt, GetoptError

  # Time stamp for the name of the saved qstat output
  dtimestr = strftime("%Y%m%d-%H%M", localtime())

  # Read the command first--what do we want to do
  if len(argv) < 2:
    cmd = "stats"
  elif argv[1] in ('--raw', 'raw'):
    cmd = "raw"
  elif argv[1] in ('--stats', 'stats', 'stat'):
    cmd = "stats"
  elif argv[1] in ('--help', 'help', '-h'):
    help()
    return 0
  else:
    raise ValueError("Unknown action: " + argv[1])

  # Skip program name and first command.
  # gnu_getopt is used so that options following the file name are
  # still recognized; plain getopt stops at the first non-option
  # argument and would silently ignore them.
  cmdargs_in = argv[2:]
  try:
    cmdopts, cmdargs = gnu_getopt(cmdargs_in,
                                  "dhs",
                                  ["show-disabled-nodes=",
                                   "include-disabled-nodes=",
                                   "save",
                                   "help"])
  except GetoptError as err:
    sys.stderr.writelines([str(err), "\n"])
    return 2

  # Process flag argument
  show_disabled_nodes = False
  save_qstat = False
  for o, a in cmdopts:
    if o in ('-h', '--help'):
      help()
      return 0
    elif o in ('-d',):
      show_disabled_nodes = True
    elif o in ('--show-disabled-nodes', '--include-disabled-nodes'):
      show_disabled_nodes = parse_int_or_bool(a)
    elif o in ('-s', '--save'):
      save_qstat = True
    else:
      raise ProgramError("Unhandled option in main program: %s %s" % (o, a))

  if len(cmdargs) > 0:
    # Analyze a previously saved qstat output
    with open(cmdargs[0], "r") as F:
      qstat_f_current = F.read().splitlines()
  else:
    qstat_f_current = pipe_out(('qstat', '-f'), split=True)
    if save_qstat:
      with open("qstat-f-%s.txt" % dtimestr, "w") as F:
        F.write("\n".join(qstat_f_current))
        F.write("\n")

  if cmd == "raw":
    node_slot_stats_raw(qstat_f_current,
                        show_disabled_nodes=show_disabled_nodes)
  elif cmd == "stats":
    node_slot_stats_per_machine_type(qstat_f_current,
                                     show_disabled_nodes=show_disabled_nodes)
  else:
    raise ProgramError("Missing support for command: " + cmd)

  return 0


# ---------------------------------------------------------------------------
# Support tools below
# ---------------------------------------------------------------------------

def pipe_out(args, split=False, shell=False):
  """Executes a command, piping out its stdout to python for parsing.
  This is my customary shortcut for backtick operator.

  Arguments:
  - args: the command, passed on to subprocess.Popen (a sequence of
    program name and arguments, or a string if shell is true).
  - split: whether to split the output into lines.
  - shell: passed on to subprocess.Popen.

  The result is either a single string (if split==False) or a list of
  strings with EOLs removed (if split==True).
  The output is read in text mode, so that the result is made of text
  strings (not bytes) under Python 3 as well.
  The exit status of the command is not checked.
  """
  proc = subprocess.Popen(args, stdout=subprocess.PIPE, shell=shell,
                          universal_newlines=True)
  output = proc.communicate()[0]
  if split:
    return output.splitlines()
  return output


# Cache of the compiled regex of str_fmt_heading(), filled in on the
# first call.  Internal variable: don't mess!
_str_fmt_heading_rx = None
def str_fmt_heading(fmt):
  """Converts a printf-style format string into one that is suitable
  for printing a table heading: every conversion other than the string
  conversion is turned into a string ('s') conversion which keeps the
  mapping key, flags and minimum field width, but drops the precision.
  Returns the converted format string."""
  # Originally from: $PWQMC77/scripts/cost.py and later Cr2_analysis_cbs.py .
  import re
  global _str_fmt_heading_rx
  compiled = _str_fmt_heading_rx
  if compiled is None:
    # The regex is complicated, hence it is written out verbosely:
    compiled = re.compile(r"""
      (
        %                 # % sign
        (?:\([^)]+\))?    # optional '(keyname)' mapping key
        [-+#0 hlL]*       # optional conversion flag
        [0-9*]*           # optional minimum field width
      )
      ((?:\.[0-9]*)?)     # optional precision
      [^-+#*0 hlL0-9.%s]  # not conv flag, dimensions, nor literal '%',
                          # nor 's' conversion specifiers
    """, re.VERBOSE)
    # Keep the compiled regex for the subsequent calls
    _str_fmt_heading_rx = compiled
  return compiled.sub(r'\1s', fmt)


def parse_int_or_bool(S):
  """Converts a string to an integer or a boolean value.

  A string is stripped of the surrounding whitespace and lowercased;
  then it is converted to an integer if it looks like one, otherwise to
  True ('true', 't', 'yes', 'y', 'on') or to False ('false', 'f', 'no',
  'n', 'off', '-', or an empty string).
  A non-string value is returned as is.

  Raises ValueError if the string is not understood.
  """
  try:
    string_types = basestring   # Python 2: covers str and unicode
  except NameError:
    string_types = str          # Python 3

  if not isinstance(S, string_types):
    return S

  S = S.strip().lower()
  try:
    return int(S)
  except ValueError:
    pass  # not an integer: try the boolean words below

  if S in ('true', 't', 'yes', 'y', 'on'):
    return True
  if S in ('false', 'f', 'no', 'n', 'off', '-', ''):
    return False
  raise ValueError("Don't understand '%s' for boolean value" % S)


def parse_bool(S):
  """Converts a string to a boolean value.

  A string is stripped of the surrounding whitespace and lowercased;
  then it is converted to True ('true', 't', 'yes', 'y', 'on', '1') or
  to False ('false', 'f', 'no', 'n', 'off', '0', '-', or an empty
  string).
  A non-string value is returned as is.

  Raises ValueError if the string is not understood.
  """
  try:
    string_types = basestring   # Python 2: covers str and unicode
  except NameError:
    string_types = str          # Python 3

  if not isinstance(S, string_types):
    return S

  S = S.strip().lower()
  if S in ('true', 't', 'yes', 'y', 'on', '1'):
    return True
  if S in ('false', 'f', 'no', 'n', 'off', '0', '-', ''):
    return False
  raise ValueError("Don't understand '%s' for boolean value" % S)


# stub main code

# Run the main function only when executed as a script, and not when
# the file is loaded into a namespace that defines `get_ipython`
# (presumably an interactive IPython session).
if __name__ == "__main__" and "get_ipython" not in globals():
  # Hand the return value of main_default() (0 on success, 2 on a
  # command-line option error) over to the shell as the exit status.
  sys.exit(main_default(sys.argv))