|
|
|
@ -6,6 +6,49 @@ |
|
|
|
|
# A tool that dumps every possibly imaginable info I want to get from |
|
|
|
|
# a SGE-managed cluster. |
|
|
|
|
|
|
|
|
|
""" |
|
|
|
|
This is a tool that dumps every imaginable piece of info I want to get from |
|
|
|
|
a SGE-managed cluster. |
|
|
|
|
This tool runs at user-level, so can only gather information that an |
|
|
|
|
ordinary user can mine from the cluster. |
|
|
|
|
|
|
|
|
|
Currently, the kinds of info available for dumping are: |
|
|
|
|
|
|
|
|
|
- cpufreq |
|
|
|
|
- lscpu |
|
|
|
|
- lspci |
|
|
|
|
- free memory (the `free` command) |
|
|
|
|
- uname |
|
|
|
|
- dmesg |
|
|
|
|
- mount |
|
|
|
|
- df |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Typical workflow |
|
|
|
|
---------------- |
|
|
|
|
|
|
|
|
|
As a starter, use routine `Gather_all` to gather all information bits |
|
|
|
|
from the compute nodes. This is an expensive gather operation; it may |
|
|
|
|
take a while to complete. |
|
|
|
|
|
|
|
|
|
There is a tool called `test_accessible_hosts` that reads the list of |
|
|
|
|
nodes from the `qhost` SGE command, then checks the availability of every |
|
|
|
|
node by performing ssh into each one. |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Analysis: CPU variety |
|
|
|
|
--------------------- |
|
|
|
|
|
|
|
|
|
To summarize the kinds of CPUs available on the compute nodes, as well |
|
|
|
|
as listing the nodes that have them, use `summarize_cpu` and |
|
|
|
|
`print_summarize_cpu`. |
|
|
|
|
This tool requires that the output of `gather_cpuinfo` has been saved |
|
|
|
|
to `cluster-info/$HOSTNAME/cpuinfo.txt` files, where $HOSTNAME stands |
|
|
|
|
for the host basename (without domain qualifier) for every compute |
|
|
|
|
node. |
|
|
|
|
The routine that does them all is `analyze_cpu_composition`. |
|
|
|
|
""" |
|
|
|
|
|
|
|
|
|
import os |
|
|
|
|
import re |
|
|
|
|
import subprocess |
|
|
|
@ -291,6 +334,20 @@ def tally_summarize_cpu(summary): |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def analyze_cpu_composition():
    """Performs analysis of the CPU composition of an SGE cluster.

    Automatically queries the up (available) nodes and gathers the cpuinfo,
    if it is necessary.

    Reads the module-level NODE_GOOD_LIST; when it is empty, both
    NODE_GOOD_LIST and NODE_BAD_LIST are rebound to the result of
    `test_accessible_hosts()`.  ROOT_DIR is read to locate the per-host
    `cpuinfo.txt` files.  The value returned by `summarize_cpu` is handed
    to `print_summarize_cpu`; this function itself returns None.

    If no accessible node is available even after the accessibility test,
    a warning is printed and the function returns without doing anything.
    """
    global NODE_GOOD_LIST, NODE_BAD_LIST

    def host_file(host, basename):
        # Per-host files live under ROOT_DIR/<short hostname>/<basename>;
        # the short hostname is the part before the first dot.
        return os.path.join(ROOT_DIR, host.split('.')[0], basename)

    if not NODE_GOOD_LIST:
        print("Warning: need to test node accesibility...")
        NODE_GOOD_LIST, NODE_BAD_LIST = test_accessible_hosts()

    if not NODE_GOOD_LIST:
        # Indexing NODE_GOOD_LIST[0] below would raise IndexError on an
        # empty list, and there would be nothing to summarize anyway.
        print("Warning: no accessible nodes found; nothing to analyze.")
        return

    # Only the first good node is probed, as a cheap indicator of whether
    # the cpuinfo gathering step has been run.
    if not os.path.exists(host_file(NODE_GOOD_LIST[0], "cpuinfo.txt")):
        print("Warning: need to gather cpuinfo...")
        # Most likely you haven't run gather_cpuinfo then...
        # NOTE(review): the comment above says `gather_cpuinfo` but the call
        # is to `gather_cpu_info` -- confirm which name is actually defined.
        gather_cpu_info(NODE_GOOD_LIST)

    summ = summarize_cpu(NODE_GOOD_LIST)
    print_summarize_cpu(summ)
|
|
|
|
|
|
|
|
@ -327,4 +384,3 @@ def Gather_all(): |
|
|
|
|
print("\nGathering df...") |
|
|
|
|
gather_df(NODE_GOOD_LIST) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|