#!/bin/bash
#
# find-run-hosts.sh: swiss-army tool to find the hosts where an SGE job
# runs, dump the process trees of the job owner on those hosts, etc.
#
# 20160713
# Wirawan Purwanto
#
# Provenance: payload of commit 8b99995409d0bbad7820f391faddb47b1865721e
# (Thu, 14 Jul 2016 00:24:48 -0400), which created sge/find-run-hosts.sh
# with mode 100755.
#
# Usage:
#   find-run-hosts.sh JOB_ID        queue instances on which the job runs
#   find-run-hosts.sh procs JOB_ID  process trees of the job owner per node
#   find-run-hosts.sh head JOB_ID   master node of the job
#   find-run-hosts.sh nodes JOB_ID  compute nodes used by the job

# Required by the +([0-9]) pattern in main(); must be enabled before the
# function body is parsed.
shopt -s extglob

function find_run_hosts()
# Find where a job runs.
# Arg: <job_number>
# Output: for every line of `qstat -f` that belongs to the job, the status
# line of the enclosing queue instance followed by the job line itself.
{
  qstat -f | _Find_run_hosts "$1"
}

function _Find_run_hosts()
# Processing part of the routine above.
# Takes qstat -f output on stdin.
# Arg: <job_number>  (falls back to the caller's $optJobNumber when omitted,
# which is how earlier versions passed it)
{
  local job_number="${1-${optJobNumber-}}"
  awk -v optJobNumber="$job_number" '
# Rule lines and the PENDING JOBS banner close the current queue block.
# Without this reset a pending job would be attributed to whichever queue
# instance happened to be listed last.
/^[[:space:]]*[-#]/ {
  host_info = ""
  next
}

# Valid host status field (5 columns, or 6 when a state is shown)
($0 ~ /^[A-Za-z]/) && (NF == 5 || NF == 6) {
  host_info = $0
  next
}

# Job line inside a queue block
(optJobNumber != "") && ($1 == optJobNumber) && (host_info != "") {
  print host_info
  print $0
}' # end awk script
}


function find_job_owner()
# Arg: <job_number>
# Output: value of the "owner:" field reported by `qstat -j`.
{
  qstat -j "$1" \
    | awk '$1 == "owner:" { print $2; exit }'
}


function find_job_master_node()
# Arg: <job_number>
# Finds the "master" node of the job, i.e. where the batch job script
# was first initially launched.
# Unfortunately qstat -j doesn't return the desired info, we have to
# utilize full qstat to get this info.
# Prints nothing when the job has no queue instance assigned.
{
  qstat \
    | awk -v optJobNumber="$1" '
# Field 8 is the queue column only when the job has been dispatched; for a
# waiting job that column is blank and field 8 is the slot count instead,
# so insist on the queue@host form.
($1 == optJobNumber) && ($8 ~ /@/) {
  queueHost = $8
  sub(/^.*@/, "", queueHost)
  print queueHost
  exit
}' # end awk script
}


function list_job_nodes()
# Arg: <job_number>
# Output: host names (queue name stripped) of the nodes used by the job,
# one per line, each host listed once, in qstat order.
{
  find_run_hosts "$1" \
    | awk '
($0 ~ /^[A-Za-z]/) && (NF == 5 || NF == 6) {
  host = $1
  sub(/^.*@/, "", host)
  # The same host shows up once per queue instance / task line.
  if (!(host in seen)) {
    seen[host] = 1
    print host
  }
}' # end awk script
}

function dump_process_tree1()
# Args: <host> <job_owner>
# Prints the process forest of <job_owner> on <host>, obtained over ssh.
{
  local HOST="$1" JOB_OWNER="$2"
  ssh "$HOST" ps ux --forest -u "$JOB_OWNER" \
    | awk -v JobOwner="$JOB_OWNER" '($1 == "USER") || ($1 == JobOwner) { print }'
  # Note: the awk filter is a workaround because ps also printed other
  # users, notably the one running this script. Only the header row and
  # the rows of the job owner are kept.
}


function dump_process_trees()
# Arg: <job_number>
# Dumps the process tree of the job owner on the master host first, then on
# every other host of the job.
# Returns 1 (with a message on stderr) when the master host or the owner
# cannot be determined, e.g. because the job is not running.
{
  local SGE_JOB_ID="$1"
  local MASTER_HOST HOST JOB_OWNER
  local -a ALL_HOSTS=()

  MASTER_HOST=$(find_job_master_node "$SGE_JOB_ID")
  JOB_OWNER=$(find_job_owner "$SGE_JOB_ID")
  if [[ -z "$MASTER_HOST" || -z "$JOB_OWNER" ]]; then
    echo "Job $SGE_JOB_ID: cannot determine master host or owner (not running?)" >&2
    return 1
  fi

  # Collect the hosts before looping so that ssh inside the loop cannot
  # swallow the host list from stdin.
  while IFS= read -r HOST; do
    ALL_HOSTS+=("$HOST")
  done < <(list_job_nodes "$SGE_JOB_ID")

  echo "job_number: $SGE_JOB_ID"
  echo "master_host: $MASTER_HOST"
  dump_process_tree1 "$MASTER_HOST" "$JOB_OWNER"

  for HOST in "${ALL_HOSTS[@]}"; do
    if [ "$HOST" = "$MASTER_HOST" ]; then continue; fi
    echo "host: $HOST"
    dump_process_tree1 "$HOST" "$JOB_OWNER"
  done
}


function _Usage()
# Prints a short usage summary on stdout.
{
  local prog="${0##*/}"
  printf 'Usage: %s JOB_ID\n' "$prog"
  printf '       %s procs|head|nodes JOB_ID\n' "$prog"
}

function _Require_job_id()
# Arg: <candidate job id>
# Returns 2 with a message on stderr when the argument is empty.
{
  if [ -z "$1" ]; then
    echo "Job ID required as arg 2" >&2
    return 2
  fi
}


# Main program switchboard

function main()
# Args: the command-line arguments of the script.
# Returns the status of the selected action, or 2 on a usage error.
{
  case "$1" in
  (+([0-9]))
    # Bare job number: show the queue instances on which the job runs
    find_run_hosts "$1"
    ;;
  (--process*|process*|proc|procs|px)
    # Dumps the process trees of the job owner on every node of the job
    _Require_job_id "$2" || return
    dump_process_trees "$2"
    ;;
  (--head-node|--head|head|headnode|head-node|--master-node|--master|master|masternode|master-node)
    # Prints where the head node of the job is (i.e. the master node of the
    # job) where the job script was first executing
    _Require_job_id "$2" || return
    find_job_master_node "$2"
    ;;
  (--list-node*|--list|list|listnode*|list-node*|node|nodes)
    # Lists all the compute nodes being used by this job
    _Require_job_id "$2" || return
    list_job_nodes "$2"
    ;;
  (-h|--help)
    _Usage
    ;;
  (*)
    echo "Unknown action: $1" >&2
    _Usage >&2
    return 2
    ;;
  esac
}

main "$@"