Archive for October, 2009

condor_master for managing processes

October 21, 2009
#!/usr/bin/env python

from socket import socket, htons, AF_INET, SOCK_DGRAM
from array import array

def send_alive(pid, timeout=300, master_host="127.0.0.1", master_port=1271):
    """
    Send a UDP packet to the condor_master containing the
    DC_CHILDALIVE command.

    This will have the master register a trigger to fire in timeout
    seconds. When the trigger fires the pid will be killed. Each time
    the DC_CHILDALIVE is sent the trigger's timer is reset to fire in
    timeout seconds.

    DC_CHILDALIVE should be sent every timeout/3 seconds.

    pid         -- pid of the process the master should monitor
    timeout     -- seconds the master waits before killing pid
    master_host -- address the master's UDP command socket listens on
    master_port -- port the master's UDP command socket listens on
    """
    sock = socket(AF_INET, SOCK_DGRAM)
    try:
        sock.sendto(build_message(pid, timeout), (master_host, master_port))
    finally:
        # This function is called in a long-running heartbeat loop, so
        # release the descriptor explicitly instead of waiting for the
        # garbage collector. try/finally works on Python 2 and 3 alike.
        sock.close()

def build_message(pid, timeout):
    """
    Build a datagram packet to send to the condor_master.

    The packet format is (command, pid, timeout). The command is
    always DC_CHILDALIVE (the integer 60008). The pid is the pid of
    the process the master is monitoring, i.e. getpid() of this
    script. The timeout is the amount of time, in seconds, the master
    will wait before killing the pid. Each field in the packet is an
    8 byte integer in network (big-endian) byte order.

    Returns the 24 byte packet as a byte string.
    """
    # Local import so this function does not depend on edits elsewhere
    # in the file.
    import struct

    DC_CHILDALIVE = 60008

    # "!" selects network byte order and "q" an 8-byte integer, so each
    # field is encoded at full width. Packing 16-bit values with htons()
    # behind six padding bytes gives the same result for small numbers
    # but overflows for any pid or timeout above 65535.
    return struct.pack("!3q", DC_CHILDALIVE, pid, timeout)

#
# The condor_master is a daemon that can run arbitrary executables,
# monitor them for failures, restart them, watch for executable
# updates, and send obituary emails.
#
# The condor_master can monitor any program it starts for abnormal
# termination, e.g. return code != 0 or caused by a signal. It can
# also detect hung processes if they periodically send DC_CHILDALIVE
# commands.
#
# The condor_master cleans up process trees, not just the processes it
# directly started.
#
# Example usage...
#
# Run this script from the condor_master, e.g.
#  env CONDOR_CONFIG=ONLY_ENV \
#      _CONDOR_WANT_UDP_COMMAND_SOCKET=TRUE \
#      _CONDOR_NETWORK_INTERFACE=127.0.0.1 \
#      _CONDOR_MASTER_LOG=master.log \
#      _CONDOR_MASTER=$(which condor_master) \
#      _CONDOR_PROG=$(which <this script>) \
#      _CONDOR_PROG_ARGS="-some -args 3" \
#      _CONDOR_DAEMON_LIST=MASTER,PROG \
#      condor_master -p 1271 -pidfile master.pid
#
# At some point kill -STOP or -TERM this script and watch the
# condor_master react.
#
# condor_master configuration/features:
#
#  CONDOR_CONFIG=ONLY_ENV
#    - useful, only look in the ENV for config, no config file needed
#
#  _CONDOR_WANT_UDP_COMMAND_SOCKET=TRUE
#    - required, make master listen for UDP commands
#
#  _CONDOR_NETWORK_INTERFACE=<ip>
#    - useful, make master listen on <ip>
#
#  _CONDOR_MASTER_LOG=<file>
#    - required, the master's log file
#
#  _CONDOR_MASTER_DEBUG=<level>
#    - D_ALWAYS is the default; D_FULLDEBUG shows a lot more
#
#  _CONDOR_MASTER=<path to condor_master>
#    - required, master will restart itself if its executable is
#      updated
#
#  _CONDOR_PROG=<path to this script>
#    - required, the path to an executable the master will start and
#      monitor
#
#  _CONDOR_PROG_ARGS=<arguments>
#    - useful, if the executable needs the master to pass it arguments
#
#  _CONDOR_DAEMON_LIST=MASTER,PROG
#    - required, list of executables the master will monitor
#
#  _CONDOR_MAIL=/bin/mail
#    - useful, no default, if email notification of PROG hang/crash is
#      desired. don't specify and get no emails.
#
#  _CONDOR_CONDOR_ADMIN=admin@fqdn
#    - useful, the address to send emails to
#
#  _CONDOR_MASTER_BACKOFF_CONSTANT, C, default 9 seconds
#  _CONDOR_MASTER_BACKOFF_FACTOR, K, default 2 seconds
#  _CONDOR_MASTER_BACKOFF_CEILING, T, default 3600 seconds
#  _CONDOR_MASTER_RECOVER_FACTOR, default 500 seconds
#    - useful, parameters to control the exponential backoff
#      start delay = min(T, C + K^n),
#        n is the # of restarts since last recovery
#        a recovery is a run of RECOVER_FACTOR without a crash
#
#  _CONDOR_MASTER_CHECK_NEW_EXEC_INTERVAL=<seconds>
#    - useful, default 300, how often to check executable timestamps
#      and potentially restart processes
#
#  _CONDOR_MASTER_NEW_BINARY_DELAY=<seconds>
#    - useful, default 120, time waited after noticing a timestamp
#      change before restarting the executable
#
#  condor_master -p <port> -pidfile <file>
#    - specify a port for the master to listen on, default is
#      ephemeral, and specify a file where the master writes its pid,
#      default is nowhere
#      also: -t (log to stdout) -f (run in foreground)
#
# Advanced note: condor_squawk can send DC_CHILDALIVE as well, e.g.
#    $ echo "command DC_CHILDALIVE 1234 15" | condor_squawk "<127.0.0.1:1271>"
#

import os
import time

# Heartbeat forever: tell the condor_master this process is alive, then
# sleep for a third of the timeout so a couple of lost datagrams can be
# tolerated before the master decides the process is hung.
timeout = 30
heartbeat_interval = timeout/3
own_pid = os.getpid()
while True:
    send_alive(own_pid, timeout)
    time.sleep(heartbeat_interval)
Advertisements

%d bloggers like this: