* Added convenience for gathering & analyzing CPUs on the cluster.

* Documentation update.
master
Wirawan Purwanto 8 years ago
parent aa597b907c
commit 739d765f53
  1. 58
      sge/dump-cluster-info.py

@ -6,6 +6,49 @@
# A tool that dumps every possibly imaginable info I want to get from # A tool that dumps every possibly imaginable info I want to get from
# a SGE-managed cluster. # a SGE-managed cluster.
"""
This is a tool that dumps every imaginable piece of information I want to get
from an SGE-managed cluster.
This tool runs at user level, so it can only gather information that an
ordinary user can mine from the cluster.
Currently, the following information is available for dumping:
- cpufreq
- lscpu
- lspci
- free memory (the `free` command)
- uname
- dmesg
- mount
- df
Typical workflow
----------------
As a starter, use routine `Gather_all` to gather all information bits
from the compute nodes. This is an expensive gather operation; it may
take a while to complete.
There is a tool called `test_accessible_hosts` that reads the list of
nodes from the `qhost` SGE command, then checks the availability of every
node by performing an ssh into each one.
Analysis: CPU variety
---------------------
To summarize the kinds of CPUs available on the compute nodes, as well
as listing the nodes that have them, use `summarize_cpu` and
`print_summarize_cpu`.
This tool requires that the output of `gather_cpuinfo` has been saved
to `cluster-info/$HOSTNAME/cpuinfo.txt` files, where $HOSTNAME stands
for the host basename (without domain qualifier) for every compute
node.
The routine that does them all is `analyze_cpu_composition`.
"""
import os import os
import re import re
import subprocess import subprocess
@ -291,6 +334,20 @@ def tally_summarize_cpu(summary):
def analyze_cpu_composition(): def analyze_cpu_composition():
"""Performs analysis of the CPU composition of an SGE cluster.
Automatically queries the up (available) nodes and gathers the cpuinfo,
if it is necessary.
"""
global NODE_GOOD_LIST, NODE_BAD_LIST
global ROOT_DIR
getfile = lambda H, bn: os.path.join(ROOT_DIR, H.split('.')[0], bn)
if len(NODE_GOOD_LIST) == 0:
print("Warning: need to test node accesibility...")
NODE_GOOD_LIST, NODE_BAD_LIST = test_accessible_hosts()
if not os.path.exists(getfile(NODE_GOOD_LIST[0], "cpuinfo.txt")):
print("Warning: need to gather cpuinfo...")
# Most likely you haven't run gather_cpuinfo then...
gather_cpu_info(NODE_GOOD_LIST)
summ = summarize_cpu(NODE_GOOD_LIST) summ = summarize_cpu(NODE_GOOD_LIST)
print_summarize_cpu(summ) print_summarize_cpu(summ)
@ -327,4 +384,3 @@ def Gather_all():
print("\nGathering df...") print("\nGathering df...")
gather_df(NODE_GOOD_LIST) gather_df(NODE_GOOD_LIST)

Loading…
Cancel
Save