* Added find-run-hosts.sh: swiss-army tool to find hosts where a job

runs, dump the process trees, etc.
master
Wirawan Purwanto 9 years ago
parent 8ae0841ca6
commit 8b99995409
  1. 158
      sge/find-run-hosts.sh

@ -0,0 +1,158 @@
#!/bin/bash
#
# 20160713
# Wirawan Purwanto
#
# find-run-hosts.sh: helper around qstat to locate the hosts of a job.
#
# Usage (as dispatched by the switchboard at the bottom of this file):
#   find-run-hosts.sh <jobnumber>         print queue/host and job lines
#   find-run-hosts.sh procs <jobnumber>   dump process trees on the job hosts
#   find-run-hosts.sh head <jobnumber>    print the master (head) node
#   find-run-hosts.sh nodes <jobnumber>   list the nodes used by the job
#
# extglob is required for the +([0-9]) pattern that recognizes a bare
# job number in the switchboard.
shopt -s extglob
function find_run_hosts()
# Arg: <jobnumber>
# Shows where a job runs: feeds the full `qstat -f` listing to
# _Find_run_hosts, which does the actual filtering.
{
  # _Find_run_hosts picks the job number up from this local variable
  # (bash dynamic scoping), so its name must not change.
  local optJobNumber
  optJobNumber=$1
  _Find_run_hosts < <(qstat -f)
}
function _Find_run_hosts()
# Processing part of find_run_hosts.
# Input (stdin): the output of `qstat -f`.
# Reads: optJobNumber from the caller's scope (the job to look for).
# Output: for every job line whose first field equals optJobNumber and
# which is listed under a queue instance, the queue-instance line
# followed by the job line itself.
# Jobs listed after the "#" banner (the pending-jobs section of
# `qstat -f`) are not attached to any host and produce no output.
{
  awk \
    -v optJobNumber="$optJobNumber" \
    '
    # Queue-instance (host status) line: starts with a letter and has
    # 5 fields, or 6 when the trailing states column is present.
    /^[A-Za-z]/ && (NF == 5 || NF == 6) {
      host_info = $0
      next
    }
    # A banner made of "#" ends the per-host listing; forget the last
    # host so that jobs listed afterwards are not attributed to it.
    /^#/ {
      host_info = ""
      next
    }
    # Job line belonging to the most recently seen queue instance.
    (host_info != "") && ($1 == optJobNumber) {
      print host_info
      print $0
    }' # end awk script
}
function find_job_owner()
# Arg: <jobnumber>
# Prints the second field of the first "owner:" line reported by
# `qstat -j <jobnumber>`.
{
  local job_id="$1"
  qstat -j "$job_id" | awk '
    {
      if ($1 == "owner:") {
        print $2
        exit
      }
    }'
}
function find_job_master_node()
# Arg: <jobnumber>
# Finds the "master" node of the job, i.e. where the batch job script
# was initially launched.
# qstat -j doesn't return the desired info, so the plain qstat listing
# is used instead: the host is taken from the queue field (field 8,
# "queue@host") of the first line whose job ID matches.
# Prints nothing when the job has no queue@host field, e.g. when it is
# still waiting in the queue; in that case field 8 holds another column
# (the slot count) and must not be mistaken for a host name.
{
  qstat \
    | awk -v optJobNumber="$1" '
      ($1 == optJobNumber) && ($8 ~ /@/) {
        queueHost = $8
        # Strip the queue name, keeping only what follows the "@".
        sub(/^.*@/, "", queueHost)
        print queueHost
        exit
      }' # end awk script
}
function list_job_nodes()
# Arg: <jobnumber>
# Prints one host name per queue-instance line emitted by find_run_hosts
# for the given job; the "queue@" prefix is removed from each name.
{
  local job_id="$1"
  find_run_hosts "$job_id" | awk '
    # Only the host status lines are of interest here; job lines
    # start with whitespace and are skipped.
    /^[A-Za-z]/ && NF >= 5 && NF <= 6 {
      node = $1
      sub(/^.*@/, "", node)
      print node
    }'
}
function dump_process_tree1()
# Arg: <host> <user>
# Runs `ps ux --forest -u <user>` on <host> through ssh and prints the
# header line plus the lines whose first column is <user>.
{
  local remote_host="$1" owner="$2"
  # The awk filter is a workaround: the ps listing also contains
  # processes of other users, notably the one running this script,
  # so only the "USER" header and the owner's rows are kept.
  ssh "$remote_host" ps ux --forest -u "$owner" \
    | awk -v JobOwner="$owner" '$1 == "USER" || $1 == JobOwner'
}
function dump_process_trees()
# Arg: <jobnumber>
# Prints the job number and its master host, then dumps the process
# tree of the job owner on the master host followed by every other
# host used by the job. Each host is dumped at most once, even when
# list_job_nodes reports it several times.
# If the master host cannot be determined, a warning goes to stderr and
# only the hosts reported by list_job_nodes are dumped.
{
  local SGE_JOB_ID="$1"
  local MASTER_HOST JOB_OWNER HOST
  local -a ALL_HOSTS=()
  # Newline-delimited record of the hosts already dumped.
  local SEEN_HOSTS=$'\n'

  echo "job_number: $SGE_JOB_ID"
  MASTER_HOST=$(find_job_master_node "$SGE_JOB_ID")
  JOB_OWNER=$(find_job_owner "$SGE_JOB_ID")

  # Collect the whole host list before running anything remotely: the
  # ssh calls made further down must not be able to consume the list
  # from stdin. Reading line by line also avoids glob expansion.
  while IFS= read -r HOST; do
    if [ -n "$HOST" ]; then
      ALL_HOSTS+=( "$HOST" )
    fi
  done < <(list_job_nodes "$SGE_JOB_ID")

  echo "master_host: $MASTER_HOST"
  if [ -n "$MASTER_HOST" ]; then
    dump_process_tree1 "$MASTER_HOST" "$JOB_OWNER"
    SEEN_HOSTS+="$MASTER_HOST"$'\n'
  else
    echo "Warning: cannot determine the master host of job $SGE_JOB_ID" >&2
  fi

  for HOST in "${ALL_HOSTS[@]}"; do
    # Skip the master host and any host that was already handled.
    case "$SEEN_HOSTS" in
      (*$'\n'"$HOST"$'\n'*) continue ;;
    esac
    SEEN_HOSTS+="$HOST"$'\n'
    echo "host: $HOST"
    dump_process_tree1 "$HOST" "$JOB_OWNER"
  done
}
# Main program switchboard
function _require_job_id()
# Arg: <value of script argument 2>
# Terminates the script with status 2 and a message on stderr when the
# job ID argument is empty.
{
  if [ -z "$1" ]; then
    echo "Job ID required as arg 2" >&2
    exit 2
  fi
}

case "$1" in
  (+([0-9]))
    # A bare job number: print the queue/host lines where the job runs.
    SGE_JOB_ID="$1"
    find_run_hosts "$SGE_JOB_ID"
    ;;
  (--process*|process*|proc|procs|px)
    # Dumps the process trees of the job owner on the hosts of the job,
    # starting with the master node.
    _require_job_id "$2"
    SGE_JOB_ID="$2"
    dump_process_trees "$SGE_JOB_ID"
    ;;
  (--head-node|--head|head|headnode|head-node|--master-node|--master|master|masternode|master-node)
    # Prints the head node of the job (i.e. its master node), where
    # the job script first started executing.
    _require_job_id "$2"
    SGE_JOB_ID="$2"
    find_job_master_node "$SGE_JOB_ID"
    ;;
  (--list-node*|--list|list|listnode*|list-node*|node|nodes)
    # Lists all the compute nodes being used by this job.
    _require_job_id "$2"
    SGE_JOB_ID="$2"
    list_job_nodes "$SGE_JOB_ID"
    ;;
  (*)
    echo "Unknown action: $1" >&2
    exit 2
    ;;
esac
#SGE_JOB_ID="$1"
#find_run_hosts "$SGE_JOB_ID"
#find_job_owner "$SGE_JOB_ID"
#find_job_head_node "$SGE_JOB_ID"
Loading…
Cancel
Save