You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
159 lines
3.3 KiB
159 lines
3.3 KiB
8 years ago
|
#!/bin/bash
#
# 20160713
# Wirawan Purwanto
#
# Helpers for locating where a batch job runs, based on `qstat` output
# (presumably Sun Grid Engine, judging from the SGE_JOB_ID variable --
# TODO confirm).
#
# The first command-line argument selects the action (see the
# "Main program switchboard" at the bottom of this file):
#   <jobnumber>                 run find_run_hosts on that job
#   process|proc|procs|px <jobnumber>
#                               run dump_process_trees on that job
#   head|master|... <jobnumber> run find_job_master_node on that job
#   list|node|nodes|... <jobnumber>
#                               run list_job_nodes on that job
# Any other first argument is reported as an unknown action (exit code 2).

# extglob is needed for the +([0-9]) pattern used in the main `case`
# statement to recognize a bare job number.
shopt -s extglob
|
||
|
|
||
|
# Find where a job runs.
# Arg: <jobnumber>
# Pipes the output of `qstat -f` into _Find_run_hosts.  The job number is
# not passed as an argument: _Find_run_hosts picks it up from the local
# variable optJobNumber declared here (bash dynamic scoping), so that
# variable name must not be changed.
find_run_hosts() {
  local optJobNumber
  optJobNumber=$1

  qstat -f \
    | _Find_run_hosts
}
|
||
|
|
||
|
function _Find_run_hosts()
# Processing part of find_run_hosts.
#
# Input (stdin): output of `qstat -f`.
# Input (variable): optJobNumber -- the job number to look for; it is read
#   from the caller's scope (find_run_hosts declares it as a local).
# Output (stdout): for every line whose first field equals the job number,
#   the most recent queue-instance ("host status") line seen above it,
#   followed by the matching line itself.
#
# A line starting with "#" is treated as a section banner that ends the
# current queue instance, so that jobs listed after it (e.g. pending jobs)
# are printed without being attributed to the last host seen.
# NOTE(review): this assumes the pending-jobs section of `qstat -f` is
# introduced by a row of "#" characters -- confirm against the local qstat.
{
  awk \
    -v optJobNumber="$optJobNumber" \
    '####
    # Section banner: what follows does not belong to the previous host.
    /^#/ {
      host_info = ""
      next
    }

    # Valid host status field
    ($0 ~ /^[A-Za-z]/) && (NF == 5 || NF == 6) {
      host_info = $0
      next
    }

    # Job entry.  NF > 0 keeps an empty job number from matching blank lines.
    (NF > 0) && ($1 == optJobNumber) {
      if (host_info != "") print host_info
      print $0
    }' # end awk script
}
|
||
|
|
||
|
|
||
|
# Arg: <jobnumber>
# Prints the second field of the first line of `qstat -j <jobnumber>`
# whose first field is "owner:".  Prints nothing if there is no such line.
find_job_owner() {
  qstat -j "$1" | awk '
    {
      if ($1 == "owner:") {
        print $2
        exit
      }
    }'
}
|
||
|
|
||
|
|
||
|
function find_job_master_node()
# Arg: <jobnumber>
# Finds the "master" node of the job, i.e. where the batch job script
# was first launched.
# Unfortunately qstat -j doesn't return the desired info, so we have to
# use the plain qstat listing to get it.
#
# Output: the host part (text after the last "@") of field 8 of the first
# qstat line whose first field equals the job number and whose field 8
# contains "@".  Prints nothing when no such line exists, e.g. when the job
# is unknown or has no queue instance assigned yet.
{
  qstat \
    | awk -v optJobNumber="$1" '
    # Require a queue instance (queue@host) in field 8.  For an entry
    # without a queue column, field 8 holds some other column and must
    # not be reported as a host name.
    ($1 == optJobNumber) && ($8 ~ /@/) {
      queueHost = $8
      sub(/^.*@/, "", queueHost)
      print queueHost
      exit
    }' # end awk script
}
|
||
|
|
||
|
|
||
|
function list_job_nodes()
# Arg: <jobnumber>
# Lists the nodes used by the job, one host name per line.
#
# Takes the output of find_run_hosts, keeps the host status lines (lines
# starting with a letter and having 5 or 6 fields), and prints the first
# field with everything up to the last "@" removed.
# Each host is printed only once, in order of first appearance, even if
# find_run_hosts reports it several times (several job entries on the same
# host, or several queues on the same host).
{
  find_run_hosts "$1" \
    | awk '
    ($0 ~ /^[A-Za-z]/) && (NF == 5 || NF == 6) {
      host = $1
      sub(/^.*@/, "", host)
      if (!(host in seen)) {
        seen[host] = 1
        print host
      }
    }' # end awk script
}
|
||
|
|
||
|
# Arg: <host> <user>
# Runs `ps ux --forest -u <user>` on <host> through ssh and prints only the
# header line (first field "USER") and the lines whose first field is <user>.
# The filtering is a workaround: the ps output also contained processes of
# other users, notably the one running this script.
dump_process_tree1() {
  local target_host=$1
  local owner=$2

  ssh "$target_host" ps ux --forest -u "$owner" \
    | awk -v JobOwner="$owner" '
      {
        if ($1 == "USER" || $1 == JobOwner) print
      }'
}
|
||
|
|
||
|
|
||
|
function dump_process_trees()
# Arg: <jobnumber>
# Prints the process trees of the job owner on every node of the job.
#
# Output (stdout):
#   job_number: <jobnumber>
#   master_host: <host>      followed by the process tree on that host
#   host: <host>             followed by the process tree, for every other
#                            host reported by list_job_nodes
# If the master host cannot be determined, a warning is written to stderr
# and the master-host process dump is skipped (instead of calling ssh with
# an empty host name); the remaining hosts are still processed.
{
  local SGE_JOB_ID="$1"
  local MASTER_HOST HOST
  local JOB_OWNER
  local -a ALL_HOSTS

  echo "job_number: $SGE_JOB_ID"
  MASTER_HOST=$(find_job_master_node "$SGE_JOB_ID")
  JOB_OWNER=$(find_job_owner "$SGE_JOB_ID")

  # Read the host list line by line rather than through an unquoted
  # $(...) expansion, so that host names are never subject to globbing.
  ALL_HOSTS=()
  while IFS= read -r HOST; do
    if [ -n "$HOST" ]; then
      ALL_HOSTS+=( "$HOST" )
    fi
  done < <(list_job_nodes "$SGE_JOB_ID")

  echo "master_host: $MASTER_HOST"
  if [ -n "$MASTER_HOST" ]; then
    dump_process_tree1 "$MASTER_HOST" "$JOB_OWNER"
  else
    echo "Warning: cannot determine the master host of job $SGE_JOB_ID" >&2
  fi

  for HOST in "${ALL_HOSTS[@]}"; do
    # The master host has already been handled above.
    if [ "$HOST" = "$MASTER_HOST" ]; then continue; fi
    echo "host: $HOST"
    dump_process_tree1 "$HOST" "$JOB_OWNER"
  done
}
|
||
|
|
||
|
|
||
|
# Main program switchboard
|
||
|
|
||
|
case "$1" in
|
||
|
(+([0-9]))
|
||
|
SGE_JOB_ID="$1"
|
||
|
find_run_hosts "$SGE_JOB_ID"
|
||
|
;;
|
||
|
(--process*|process*|proc|procs|px)
|
||
|
# Prints where the head node of the job is (i.e. the master node of the job)
|
||
|
# where the job script was first executing
|
||
|
if [ -z "$2" ]; then
|
||
|
echo "Job ID required as arg 2" >&2
|
||
|
exit 2
|
||
|
fi
|
||
|
SGE_JOB_ID="$2"
|
||
|
dump_process_trees "$SGE_JOB_ID"
|
||
|
;;
|
||
|
(--head-node|--head|head|headnode|head-node|--master-node|--master|master|masternode|master-node)
|
||
|
# Prints where the head node of the job is (i.e. the master node of the job)
|
||
|
# where the job script was first executing
|
||
|
if [ -z "$2" ]; then
|
||
|
echo "Job ID required as arg 2" >&2
|
||
|
exit 2
|
||
|
fi
|
||
|
SGE_JOB_ID="$2"
|
||
|
find_job_master_node "$SGE_JOB_ID"
|
||
|
;;
|
||
|
(--list-node*|--list|list|listnode*|list-node*|node|nodes)
|
||
|
# Lists all the compute nodes being used by this job
|
||
|
if [ -z "$2" ]; then
|
||
|
echo "Job ID required as arg 2" >&2
|
||
|
exit 2
|
||
|
fi
|
||
|
SGE_JOB_ID="$2"
|
||
|
list_job_nodes "$SGE_JOB_ID"
|
||
|
;;
|
||
|
(*)
|
||
|
echo "Unknown action: $1" >&2
|
||
|
exit 2
|
||
|
;;
|
||
|
esac
|
||
|
|
||
|
#SGE_JOB_ID="$1"
|
||
|
#find_run_hosts "$SGE_JOB_ID"
|
||
|
#find_job_owner "$SGE_JOB_ID"
|
||
|
#find_job_head_node "$SGE_JOB_ID"
|
||
|
|