#!/bin/bash # # 20160713 # Wirawan Purwanto shopt -s extglob function find_run_hosts() # Find where a job runs. { local optJobNumber="$1" qstat -f | _Find_run_hosts } function _Find_run_hosts() # Processing part of the routine above. # Takes qstat -f output. { awk \ -v optPrintRaw="$optPrintRaw" \ -v optJobNumber="$optJobNumber" \ '#### BEGIN { STDERR = "/dev/stderr" } # Valid host status field ($0 ~ /^[A-Za-z]/) && (NF == 5 || NF == 6) { host_info = $0 queue_node = $1 core_usage_combo = $3 states = $6 # if any next } $1 == optJobNumber { print host_info print $0 }' # end awk script } function find_job_owner() # Arg: { qstat -j "$1" \ | awk '$1 == "owner:" { print $2; exit }' } function find_job_master_node() # Arg: # Finds the "master" node of the job, i.e. where the batch job script # was first initially launched. # Unfortunately qstat -j doesn't return the desired info, we have to # utilize full qstat to get this info. { qstat \ | awk -v optJobNumber="$1" ' $1 == optJobNumber { queueHost = $8 sub(/^.*@/, "", queueHost) print queueHost exit }' # end awk script } function list_job_nodes() # Arg: { find_run_hosts "$1" \ | awk ' ($0 ~ /^[A-Za-z]/) && (NF == 5 || NF == 6) { host = $1 sub(/^.*@/, "", host) print host }' # end awk script } function dump_process_tree1() # Arg: { local HOST="$1" JOB_OWNER="$2" ssh "$HOST" ps ux --forest -u "$JOB_OWNER" \ | awk -v JobOwner="$JOB_OWNER" '($1 == "USER") || ($1 == JobOwner) { print }' # Note: some workaround was needed because it also printed other users # notably the one running this job. } function dump_process_trees() { local SGE_JOB_ID="$1" local MASTER_HOST ALL_HOSTS HOST local JOB_OWNER echo "job_number: $SGE_JOB_ID" MASTER_HOST=$(find_job_master_node "$SGE_JOB_ID") JOB_OWNER=$(find_job_owner "$SGE_JOB_ID") ALL_HOSTS=( $(list_job_nodes "$SGE_JOB_ID") ) echo "master_host: $MASTER_HOST" dump_process_tree1 "$MASTER_HOST" "$JOB_OWNER" for HOST in "${ALL_HOSTS[@]}"; do if [ "$HOST" = "$MASTER_HOST" ]; then continue; fi echo "host: $HOST" dump_process_tree1 "$HOST" "$JOB_OWNER" done } # Main program switchboard case "$1" in (+([0-9])) SGE_JOB_ID="$1" find_run_hosts "$SGE_JOB_ID" ;; (--process*|process*|proc|procs|px) # Dumps all the processes belonging to this job's owner on all the # nodes allocated to the specified job. if [ -z "$2" ]; then echo "Job ID required as arg 2" >&2 exit 2 fi SGE_JOB_ID="$2" dump_process_trees "$SGE_JOB_ID" ;; (--head-node|--head|head|headnode|head-node|--master-node|--master|master|masternode|master-node) # Prints where the head node of the job is (i.e. the master node of the job) # where the job script was first executing if [ -z "$2" ]; then echo "Job ID required as arg 2" >&2 exit 2 fi SGE_JOB_ID="$2" find_job_master_node "$SGE_JOB_ID" ;; (--list-node*|--list|list|listnode*|list-node*|node|nodes) # Lists all the compute nodes being used by this job if [ -z "$2" ]; then echo "Job ID required as arg 2" >&2 exit 2 fi SGE_JOB_ID="$2" list_job_nodes "$SGE_JOB_ID" ;; (*) echo "Unknown action: $1" >&2 exit 2 ;; esac #SGE_JOB_ID="$1" #find_run_hosts "$SGE_JOB_ID" #find_job_owner "$SGE_JOB_ID" #find_job_head_node "$SGE_JOB_ID"