diff --git a/sge/dump-cluster-info.py b/sge/dump-cluster-info.py index 49f4e05..5e6b78d 100755 --- a/sge/dump-cluster-info.py +++ b/sge/dump-cluster-info.py @@ -6,6 +6,49 @@ # A tool that dumps every possibly imaginable info I want to get from # a SGE-managed cluster. +""" +This is a tool that dumps every imaginable piece of info I want to get from +an SGE-managed cluster. +This tool runs at user level, so it can only gather information that an +ordinary user can mine from the cluster. + +Currently, the information available for dumping is: + +- cpufreq +- lscpu +- lspci +- free memory (the `free` command) +- uname +- dmesg +- mount +- df + + +Typical workflow +---------------- + +To start, use the routine `Gather_all` to gather all information bits +from the compute nodes. This is an expensive gather operation; it may +take a while to complete. + +There is a tool called `test_accessible_hosts` that reads the list of +nodes from the `qhost` SGE command, then checks the availability of every +node by performing ssh into each one. + + +Analysis: CPU variety +--------------------- + +To summarize the kinds of CPUs available on the compute nodes, as well +as listing the nodes that have them, use `summarize_cpu` and +`print_summarize_cpu`. +This tool requires that the output of `gather_cpuinfo` has been saved +to `cluster-info/$HOSTNAME/cpuinfo.txt` files, where $HOSTNAME stands +for the host basename (without domain qualifier) for every compute +node. +The routine that does all of this is `analyze_cpu_composition`. +""" + import os import re import subprocess @@ -291,6 +334,20 @@ def tally_summarize_cpu(summary): def analyze_cpu_composition(): + """Performs analysis of the CPU composition of an SGE cluster. + Automatically queries the up (available) nodes and gathers the cpuinfo, + if it is necessary. 
+ """ + global NODE_GOOD_LIST, NODE_BAD_LIST + global ROOT_DIR + getfile = lambda H, bn: os.path.join(ROOT_DIR, H.split('.')[0], bn) + if len(NODE_GOOD_LIST) == 0: + print("Warning: need to test node accesibility...") + NODE_GOOD_LIST, NODE_BAD_LIST = test_accessible_hosts() + if not os.path.exists(getfile(NODE_GOOD_LIST[0], "cpuinfo.txt")): + print("Warning: need to gather cpuinfo...") + # Most likely you haven't run gather_cpuinfo then... + gather_cpu_info(NODE_GOOD_LIST) summ = summarize_cpu(NODE_GOOD_LIST) print_summarize_cpu(summ) @@ -327,4 +384,3 @@ def Gather_all(): print("\nGathering df...") gather_df(NODE_GOOD_LIST) -