#!/bin/bash
#
# Copyright (c) 2006 Mellanox Technologies. All rights reserved.
#
# This Software is licensed under one of the following licenses:
#
# 1) under the terms of the "Common Public License 1.0" a copy of which is
#    available from the Open Source Initiative, see
#    http://www.opensource.org/licenses/cpl.php.
#
# 2) under the terms of the "The BSD License" a copy of which is
#    available from the Open Source Initiative, see
#    http://www.opensource.org/licenses/bsd-license.php.
#
# 3) under the terms of the "GNU General Public License (GPL) Version 2" a
#    copy of which is available from the Open Source Initiative, see
#    http://www.opensource.org/licenses/gpl-license.php.
#
# Licensee has the right to choose one of the above licenses.
#
# Redistributions of source code must retain the above copyright
# notice and one of the license notices.
#
# Redistributions in binary form must reproduce both the above copyright
# notice, one of the license notices in the documentation
# and/or other materials provided with the distribution.
#
#  $Id$
#

# --- What to run and where to look -------------------------------------
prog=srp_daemon
# Every argument given to this wrapper, joined by single spaces into one
# string. Captured here, before the option scan below consumes the
# positional parameters. NOTE: the string is later expanded unquoted, so an
# argument that itself contains whitespace is re-split at that point.
params="$*"
ibdir="/sys/class/infiniband"
log="/var/log/srp_daemon.log"

# --- State of the currently supervised child ---------------------------
pid=""             # PID of the running ${prog}; empty when none is known
last_status=1024   # placeholder that differs from the first real exit status

# --- Restart back-off (seconds) -----------------------------------------
min_sleep=1
max_sleep=30
sleep_on_failure=$min_sleep

# --- Syslog throttling ---------------------------------------------------
dump_log=1          # 1 => emit syslog messages on the next iteration
report_timeout=7200 # seconds after which a repeated failure is reported again
report_run_time=60  # a run longer than this (seconds) is always reported
last_report=0       # epoch seconds of the most recent failure report

# SIGINT (2) and SIGTERM (15) are routed to trap_handler. The function is
# defined further down; the name is only looked up when a signal arrives.
trap trap_handler INT TERM

# parse_args ARG...
#
# Scan the argument list for the two options this wrapper itself needs:
#   -i <hca_id>   InfiniBand HCA name
#   -p <port>     port number on that HCA
# Every other word is skipped; the complete, unmodified argument list is
# what gets handed to the daemon.
#
# Globals written: hca_id, port (each only when its option is present with
# a value).
#
# The loop is driven by the argument count rather than by "$1" being
# non-empty, so an empty-string argument no longer ends the scan early.
# A trailing -i/-p with no value stops the scan: "shift 2" with a single
# remaining parameter fails without shifting anything, which would
# otherwise keep the loop spinning forever on the same word.
parse_args()
{
    while [ $# -gt 0 ]
    do
        case "$1" in
            -i)
                if [ $# -lt 2 ]; then
                    break
                fi
                hca_id=$2
                shift 2
            ;;
            -p)
                if [ $# -lt 2 ]; then
                    break
                fi
                port=$2
                shift 2
            ;;
            *)
                shift
            ;;
        esac
    done
}

parse_args "$@"

# trap_handler
#
# Signal handler (installed by the script's "trap" line for signals 2 and
# 15). Sends SIGTERM to the supervised child if a PID is recorded, writes a
# syslog line tagged with this script's file name, and exits the wrapper
# with status 0.
#
# Globals read: pid  - PID of the background ${prog}; empty if none
#               prog - name of the supervised program, used in the message
trap_handler()
{
    if [ -n "$pid" ]; then
        # Best effort: the child may already be gone, so any error from
        # kill is deliberately discarded.
        kill -15 "$pid" > /dev/null 2>&1
    fi
    # ${0##*/} strips the directory part of $0 inside the shell and, unlike
    # an unquoted "basename $0", stays correct when the path has whitespace.
    logger -i -t "${0##*/}" "killing $prog."
    exit 0
}

# Supervisor loop: wait until the InfiniBand port and the ib_srp module are
# present, run ${prog} in the background, wait for it to exit, report the
# exit through syslog (rate-limited), back off, and start over. The loop
# only ends when the signal handler calls exit.
while true
do
    # Preconditions. Each missing piece is polled at its own interval.
    if [ ! -d "${ibdir}" ]; then
        sleep 5
        continue
    fi

    if [ ! -d "${ibdir}/${hca_id}/ports/${port}" ]; then
        sleep 10
        continue
    fi

    if ! /sbin/lsmod | grep -w "ib_srp" > /dev/null 2>&1; then
        sleep 60
        continue
    fi

    if [ "$dump_log" -eq 1 ]; then
        logger -i -t "${0##*/}" "starting ${prog}: [HCA=${hca_id}] [port=${port}]"
    fi

    start_time=$(date +%s)
    # params is a single space-joined string, so it has to be expanded
    # unquoted to be split back into separate arguments.
    # shellcheck disable=SC2086
    "${prog}" ${params} >> "${log}" 2>&1 &
    pid=$!
    wait "$pid"
    status=$?
    # The child has been reaped. Forget its PID so that a signal arriving
    # during the sleep below cannot make the handler send TERM to an
    # unrelated process that happens to have been given the same PID.
    pid=""
    end_time=$(date +%s)
    run_time=$((end_time - start_time))
    time_from_last_report=$((end_time - last_report))

    # Report this exit if the daemon ran for longer than report_run_time,
    # if the previous report is older than report_timeout, or if the exit
    # status differs from the previous one. Otherwise stay quiet (this
    # also silences the "starting" message of the next iteration).
    if [ "$run_time" -gt "$report_run_time" ] ||
       [ "$time_from_last_report" -gt "$report_timeout" ] ||
       [ "$status" -ne "$last_status" ]; then
        dump_log=1
    else
        dump_log=0
    fi
    if [ "$dump_log" -eq 1 ]; then
        logger -i -t "${0##*/}" "failed ${prog}: [HCA=${hca_id}] [port=${port}] [exit status=${status}]. Will try to restart ${prog} periodically. No more warnings will be issued in the next ${report_timeout} seconds if the same problem repeats"
        last_report=$end_time
    fi
    last_status=$status

    # Back off using the current delay, then compute the next one: a run
    # of more than 10 seconds resets it to min_sleep, anything shorter
    # doubles it, and the result is capped at max_sleep.
    sleep "$sleep_on_failure"
    if [ "$run_time" -gt 10 ]; then
        sleep_on_failure=${min_sleep}
    else
        sleep_on_failure=$((sleep_on_failure * 2))
    fi
    if [ "$sleep_on_failure" -gt "$max_sleep" ]; then
        sleep_on_failure=${max_sleep}
    fi
done
