diff --git a/sge/dump-cluster-info.py b/sge/dump-cluster-info.py
index d27d8c9..dc243de 100755
--- a/sge/dump-cluster-info.py
+++ b/sge/dump-cluster-info.py
@@ -62,9 +62,12 @@ class sh(object):
 
         return 0
 
-
-globals().setdefault("NODE_LIST", [])
-globals().setdefault("NODE_BAD_LIST", set())
+_g = globals()
+_g.setdefault("NODE_LIST", [])
+#_g.setdefault("NODE_BAD_LIST", set())
+_g.setdefault("NODE_BAD_LIST", [])
+_g.setdefault("NODE_GOOD_LIST", [])
+_g.setdefault("ROOT_DIR", "cluster-info")
 
 
 def get_node_list():
@@ -95,14 +98,17 @@ def rhost_run(host, cmdline):
     return rslt
 
 
-def rhosts_pipe_out(cmdline, filename, hosts=None, rootdir="cluster-info"):
+def rhosts_pipe_out(cmdline, filename, hosts=None, rootdir=None):
     """Executes cmdline on each remote host (the list is given in and
     """
+    global ROOT_DIR
     from os.path import dirname, join, isdir
     path_join = join
     Verb = 100
     if hosts is None:
         hosts = node_list()
+    if rootdir is None:
+        rootdir = ROOT_DIR
     for H in hosts:
         host_base = H.split(".")[0]
         outfname = path_join(rootdir, host_base, filename)
@@ -137,12 +143,69 @@ def test_accessible_hosts(hosts=None):
     return good_hosts, bad_hosts
 
 
+def cpuinfo_extract_processor_names(fn, ht=False):
+    # REFS:
+    # https://access.redhat.com/discussions/480953
+    """Extracts the names of processors from /proc/cpuinfo.
+    Returns it as a list of processor names.
+
+    WARNING: Hyperthreading is detected with a lame methodology,
+    and only half of the number of cores are reported (i.e. only
+    physical cores)"""
+    A = []
+    siblings_on_socket = None
+    cores_on_socket = None
+    with open(fn, "r") as F:
+        for L in F:
+            if L.startswith("model name"):
+                modelname = L.split(":", 1)[1].strip()
+                A.append(modelname)
+            elif L.startswith("siblings"):
+                siblings_on_socket = int(L.split(":", 1)[1].strip())
+            elif L.startswith("cpu cores"):
+                cores_on_socket = int(L.split(":", 1)[1].strip())
+
+    #print "siblings: ", siblings_on_socket
+    #print "cores: ", cores_on_socket
+
+    # FIXME: Quick-and-dirty solution for hyperthreading;
+    # see Red Hat site above; not 100% reliable if there are several
+    # kinds of CPU models, which I don't think I'll ever encounter.
+    if (not ht) \
+    and siblings_on_socket is not None \
+    and cores_on_socket is not None \
+    and siblings_on_socket != cores_on_socket:
+        assert cores_on_socket*2 == siblings_on_socket
+        # ^^otherwise it's not Hyperthreading, the code has to be fixed!
+
+        A = A[0:len(A)//2] # HACK: keep one entry per physical core ('//' so the index stays an int)
+        print("Warning: hyperthreading detected in %s" % fn)
+
+    return A
+
+
+def agg_count_names(namelist):
+    """Aggregates the names in namelist to names->count mapping, as a dict.
+    Useful, e.g. for counting number of unique elements in a list.
+    """
+    A = {}
+    for C in namelist:
+        try:
+            A[C] = A[C] + 1
+        except KeyError:
+            A[C] = 1
+    return A
+
+
 # Below are the main gather tools
 
 def gather_cpuinfo(hosts=None):
     """Gather tool: for cpuinfo"""
     rhosts_pipe_out(("cat", "/proc/cpuinfo"), "cpuinfo.txt", hosts=hosts)
 
+def gather_lscpu(hosts=None):
+    """Gather tool: for lscpu"""
+    rhosts_pipe_out(("lscpu",), "lscpu.txt", hosts=hosts)
 
 def gather_lspci(hosts=None):
     """Gather tool: for lspci"""
@@ -152,5 +215,107 @@ def gather_free(hosts=None):
     """Gather tool: for free"""
     rhosts_pipe_out(("free"), "free.txt", hosts=hosts)
 
+def gather_uname_a(hosts=None):
+    """Gather tool: for uname -a"""
+    rhosts_pipe_out(("uname", "-a"), "uname-a.txt", hosts=hosts)
+
+
+#def dict_str_sorted(d):
+#    return "{" + ", ".
+
+def summarize_cpu(hosts=None):
+    """Groups hosts by the processor composition found in their cpuinfo.txt.
+
+    Reads ROOT_DIR/<host>/cpuinfo.txt for every host (default: NODE_GOOD_LIST)
+    and returns a dict keyed by the pretty-printed {model name: count} mapping;
+    each value is {"cpu_count": <that mapping>, "hosts": [<short host names>]}.
+    """
+    from os.path import join
+    from pprint import pformat
+    global ROOT_DIR
+    if hosts is None:
+        hosts = NODE_GOOD_LIST
+    hosts_base = [ H.split(".")[0] for H in hosts ]
+    getfile = lambda H, bn: join(ROOT_DIR, H, bn)
+
+    px_hosts_by_type = {}
+
+    for H in hosts_base:
+        px_names = cpuinfo_extract_processor_names(getfile(H, "cpuinfo.txt"))
+        px_group = agg_count_names(px_names)
+        #print("%s : %s" % (H, px_group))
+
+        px_group_key = pformat(px_group) # use pretty representation
+
+        try:
+            px_hosts_by_type[px_group_key]["hosts"] += [ H ]
+        except KeyError:
+            px_hosts_by_type[px_group_key] = {
+                "cpu_count": px_group,
+                "hosts": [ H ]
+            }
+
+    return px_hosts_by_type
+
+
+def print_summarize_cpu(summary):
+    """Prints per-CPU-type host and processor counts from summarize_cpu's
+    result, followed by the grand totals of processors and nodes.
+    """
+    host_types = sorted(summary.keys())
+    nproc_grand_total = 0
+    nnode_grand_total = 0
+    for T in host_types:
+        rec = summary[T]
+        nproc_per_node = sum(rec["cpu_count"].values())
+        print("%s:: %d hosts, %d procs/node, total %d procs" \
+              % (T,
+                 len(rec["hosts"]),
+                 nproc_per_node,
+                 len(rec["hosts"]) * nproc_per_node,
+                 ))
+        print("")
+        print(" " + " ".join(sorted(rec["hosts"])))
+        print("")
+        nproc_grand_total += len(rec["hosts"]) * nproc_per_node
+        nnode_grand_total += len(rec["hosts"])
+
+    print("Grand total %d procs" % nproc_grand_total)
+    print("Grand total %d nodes" % nnode_grand_total)
+
+
+def tally_summarize_cpu(summary):
+    """Tallies up the total number of processors
+    """
+
+
+def analyze_cpu_composition():
+    """Summarizes and prints the CPU composition of NODE_GOOD_LIST."""
+    summ = summarize_cpu(NODE_GOOD_LIST)
+    print_summarize_cpu(summ)
+
+
+def Gather_all():
+    """Master gathering routine, to gather everything all at once.
+    It will take some time to gather every bit of information.
+    """
+    global NODE_GOOD_LIST, NODE_BAD_LIST, NODE_LIST
+    print("Testing node accesibility...")
+    NODE_GOOD_LIST, NODE_BAD_LIST = test_accessible_hosts()
+
+    print("\nGathering cpuinfo...")
+    gather_cpuinfo(NODE_GOOD_LIST)
+
+    print("\nGathering lscpu...")
+    gather_lscpu(NODE_GOOD_LIST)
+
+    print("\nGathering lspci...")
+    gather_lspci(NODE_GOOD_LIST)
+
+    print("\nGathering free mem...")
+    gather_free(NODE_GOOD_LIST)
+
+    print("\nGathering uname...")
+    gather_uname_a(NODE_GOOD_LIST)
 
 