/* 
** Copyright 1986, 1987, 1988, 1989, 1990, 1991 by the Condor Design Team
** 
** Permission to use, copy, modify, and distribute this software and its
** documentation for any purpose and without fee is hereby granted,
** provided that the above copyright notice appear in all copies and that
** both that copyright notice and this permission notice appear in
** supporting documentation, and that the names of the University of
** Wisconsin and the Condor Design Team not be used in advertising or
** publicity pertaining to distribution of the software without specific,
** written prior permission.  The University of Wisconsin and the Condor
** Design Team make no representations about the suitability of this
** software for any purpose.  It is provided "as is" without express
** or implied warranty.
** 
** THE UNIVERSITY OF WISCONSIN AND THE CONDOR DESIGN TEAM DISCLAIM ALL
** WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES
** OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE UNIVERSITY OF
** WISCONSIN OR THE CONDOR DESIGN TEAM BE LIABLE FOR ANY SPECIAL, INDIRECT
** OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
** OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
** OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
** OR PERFORMANCE OF THIS SOFTWARE.
** 
** Authors:  Allan Bricker and Michael J. Litzkow,
** 	         University of Wisconsin, Computer Sciences Dept.
** 
*/ 


#include <stdio.h>

#if defined(IRIX331)
#define __EXTENSIONS__
#include <signal.h>
#undef __EXTENSIONS__
#define BADSIG SIG_ERR
#else
#include <signal.h>
#endif

#include <errno.h>
#include <pwd.h>
#include <netdb.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/file.h>
#include <sys/time.h>
#include <sys/resource.h>
#include "sched.h"
#include "debug.h"
#include "except.h"
#include "expr.h"

#if defined(HPUX8)
#include "fake_flock.h"
#define _BSD
#endif
#include <sys/wait.h>

#define MAX_LINES 100

/*
** Bounded queue of long values.  tail_log() stores in it the file offsets
** at which the most recent lines of a log file begin.  insert_queue() and
** delete_queue() advance "first" and "last" modulo (size + 1), so "data"
** is used as a circular buffer.
*/
typedef struct {
	long	data[MAX_LINES + 1];	/* element storage */
	int		first;	/* index of the oldest element */
	int		last;	/* index at which the next element is stored */
	int		size;	/* max elements retained, set by init_queue() */
	int		n_elem;	/* number of elements currently held */
} QUEUE;


/* Table indexed by signal number; do_kill() uses it to log signal names. */
extern char	*SigNames[];

/*
** Old-style declarations for routines that do not return int and are
** called before (or without) a visible definition in this file.
*/
char	*param(), *rindex(), *strdup(), *prog_name();

/* Signal handlers, declared here so main() can pass them to signal(). */
int		sigchld_handler(), sigalrm_handler(), sigint_handler(),
		sighup_handler(), sigquit_handler(), restart_master();
long	delete_queue();

/* Callback handed to schedule_event() by main(). */
int		daily_housekeeping();

static char *_FileName_ = __FILE__;		/* Used by EXCEPT (see except.h)     */

/*
** Time constants, in seconds.  HOUR is parenthesized so that it behaves
** as a single operand in expressions such as "x / HOUR" or "x % HOUR".
*/
#define MINUTE	60
#define HOUR	(60 * MINUTE)

/* Larger of two values.  Each argument may be evaluated twice. */
#define MAXIMUM(a,b) ((a)>(b)?(a):(b))

/* Value passed to schedule_event() for the fields main() leaves unspecified. */
#define STAR (-1)

/* argv[0] of this process, saved by main() */
char	*MyName;

time_t	GetTimeStamp();

/*
** Per-daemon bookkeeping.  For each daemon:
**   *_Restarts   restart count: bumped by restart(), decremented toward 0
**                on every SIGALRM tick, zeroed when a new executable is seen
**   *_Pid        value returned by start_daemon() (0 if never started)
**   *_TimeStamp  modification time of the executable as last recorded
*/
int		KbdD_Restarts;
int		KbdD_Pid;
time_t	KbdD_TimeStamp;

int		SchedD_Restarts;
int		SchedD_Pid;
time_t	SchedD_TimeStamp;

int		StartD_Restarts;
int		StartD_Pid;
time_t	StartD_TimeStamp;

int		Collector_Restarts;
int		Collector_Pid;
time_t	Collector_TimeStamp;

int		Negotiator_Restarts;
int		Negotiator_Pid;
time_t	Negotiator_TimeStamp;

/* Modification time of the master's own executable */
time_t	Master_TimeStamp;
/* Descriptor on which get_lock() holds the flock() for MasterLog */
int		MasterLockFD;

/* Values loaded from the configuration file by init_params() */
char	*CollectorHost;
char	*NegotiatorHost;
int		RestartsPerHour;
/* Nonzero when -f was given or MASTER_DEBUG contains "Foreground" */
int		Foreground;
/* Nonzero when -t was given; main() then skips detach() */
int		Termlog;
char	*Master;
char	*MasterLog;
char	*Collector;
char	*Collector_Log;
char	*Negotiator;
char	*Negotiator_Log;
char	*StartD;
char	*Start_Log;
char	*KbdD;
char	*Kbd_Log;
char	*SchedD;
char	*Sched_Log;
char	*CondorAdministrator;
/* Optional PREEN command run by daily_housekeeping(); may be NULL */
char	*FS_Preen;

/* Nonzero when -n was given; start_daemon() then starts nothing */
int		NotFlag;
/* TRUE when PUBLISH_OBITUARIES starts with 't' or 'T' */
int		PublishObituaries;
/* TRUE when X_RUNS_HERE starts with 't' or 'T'; enables the KbdD */
int		X_runs_here;
/* Number of log lines appended to an obituary (OBITUARY_LOG_LENGTH) */
int		Lines;

/*
** Log the command line synopsis through dprintf() and exit with status 1.
*/
usage( name )
char	*name;	/* program name shown in the synopsis */
{
	dprintf( D_ALWAYS, "Usage: %s [-f] [-t] [-n]\n", name );
	exit( 1 );
}

/*
** Cleanup hook (main() installs it as _EXCEPT_Cleanup): stop reacting to
** SIGCHLD, then send SIGKILL to the process group of every daemon.
*/
DoCleanup()
{
	static int	*targets[] = {
		&Collector_Pid, &Negotiator_Pid, &StartD_Pid, &KbdD_Pid, &SchedD_Pid
	};
	int		i;

	(void)signal( SIGCHLD, SIG_IGN );

	for( i = 0; i < (int)(sizeof(targets) / sizeof(targets[0])); i++ ) {
		do_killpg( *targets[i], SIGKILL );
	}
}


/*
** Entry point of the master.
**
** Must be started by root.  Switches to the group of user "condor", reads
** the configuration, parses the flags
**		-f	stay in the foreground (do not fork)
**		-t	do not call detach()
**		-n	do not actually start any daemons
** takes the exclusive lock on the master log, installs the signal
** handlers, starts the daemons and then sleeps in sigpause() forever;
** all further work is done from the signal handlers.
*/
main( argc, argv )
int		argc;
char	*argv[];
{
	struct itimerval	timer;
	struct passwd		*pwd, *getpwnam();
	char				**ptr, *startem;
	int					child;

	MyName = argv[0];

	if( getuid() != 0 ) {
		dprintf( D_ALWAYS, "%s must be run as ROOT\n", MyName );
		exit( 1 );
	}

		/* Run as group condor so we can access log files even if they
		   are remotely mounted with NFS - needed because
		   root = nobody on the remote file system */
	if( (pwd=getpwnam("condor")) == NULL ) {
		EXCEPT( "condor not in passwd file" );
	}
	if( setgid(pwd->pw_gid) < 0 ) {
		EXCEPT( "setgid(%d)", pwd->pw_gid );
	}

#ifdef NFSFIX
	/*
	** A NFS bug fix prohibits root from writing to an NFS partition
	** even if he has group write permission.  We need to be Condor
	** most of the time.
	*/
	set_condor_euid(__FILE__,__LINE__);
#endif /* NFSFIX */


	config( MyName, (CONTEXT *)0 );

	init_params();
	add_to_path( "/etc" );
	add_to_path( "/usr/etc" );

	if( argc > 4 ) {
		usage( argv[0] );
	}
	_EXCEPT_Cleanup = DoCleanup;

	for( ptr=argv+1; *ptr; ptr++ ) {
		if( ptr[0][0] != '-' ) {
			usage( argv[0] );
		}
		switch( ptr[0][1] ) {
			case 'f':
				Foreground++;
				break;
			case 't':
				Termlog++;
				break;
			case 'n':
				NotFlag++;
				break;
			default:
				usage( argv[0] );
		}
	}

	dprintf_config( "MASTER", 2 );

	startem = param("START_DAEMONS");
	if( !startem || *startem == 'f' || *startem == 'F' ) {
		dprintf( D_ALWAYS, "START_DAEMONS flag was set to %s.  Exiting.\n",
			startem?startem:"(NULL)");
		exit( 0 );
	}

	if( !Termlog ) {
		detach();
	}

		/* Make sure we are the only copy of condor_master running */
	get_lock( MasterLog );

	if( !Foreground ) {
			/* Only the child carries on.  A failed fork() must not be
			** mistaken for "we are the parent": that would make the one
			** and only master exit quietly with status 0. */
		child = fork();
		if( child < 0 ) {
			EXCEPT( "fork()" );
		}
		if( child > 0 ) {
			exit( 0 );
		}
	}


	dprintf( D_ALWAYS,"*************************************************\n" );
	dprintf( D_ALWAYS,"***          CONDOR_MASTER STARTING UP        ***\n" );
	dprintf( D_ALWAYS,"***               PID = %-6d                ***\n",
																	getpid() );
	dprintf( D_ALWAYS,"*************************************************\n" );

	if( signal(SIGALRM,sigalrm_handler) == BADSIG ) {
		EXCEPT( "signal(SIGALRM,0x%x)", sigalrm_handler );
	}

	if( signal(SIGCHLD,sigchld_handler) == BADSIG ) {
		EXCEPT( "signal(SIGCHLD,0x%x)", sigchld_handler );
	}

	if( signal(SIGINT,sigint_handler) == BADSIG ) {
		EXCEPT( "signal(SIGINT,0x%x)", sigint_handler );
	}

	if( signal(SIGQUIT,sigquit_handler) == BADSIG ) {
		EXCEPT( "signal(SIGQUIT,0x%x)", sigquit_handler );
	}

	if( signal(SIGTERM,sigquit_handler) == BADSIG ) {
		EXCEPT( "signal(SIGTERM,0x%x)", sigquit_handler );
	}

	if( signal(SIGHUP,sighup_handler) == BADSIG ) {
		EXCEPT( "signal(SIGHUP,0x%x)", sighup_handler );
	}

	if( signal(SIGUSR1,restart_master) == BADSIG ) {
		EXCEPT( "signal(SIGUSR1,0x%x)", restart_master );
	}

		/* once a day at 3:30 a.m. */
	schedule_event( STAR, STAR, 3, 30, 0, daily_housekeeping );

		/* once an hour for testing */
	/*
	schedule_event( STAR, STAR, STAR, 0, 0, daily_housekeeping );
	*/

		/* The timer interval below divides by RestartsPerHour; refuse a
		** non-positive value before any daemon has been started. */
	if( RestartsPerHour <= 0 ) {
		EXCEPT( "RESTARTS_PER_HOUR must be positive (got %d)",
														RestartsPerHour );
	}

	start_all_daemons();

		/* Each tick lets sigalrm_handler() forgive one restart per daemon
		** and check the executables for new versions. */
	timer.it_interval.tv_sec = HOUR / RestartsPerHour;
	timer.it_interval.tv_usec = 0;
	timer.it_value = timer.it_interval;
	if( setitimer(ITIMER_REAL,&timer,(struct itimerval *)0) < 0 ) {
		EXCEPT( "setitimer(ITIMER_REAL,0x%x,0)", &timer );
	}

	for(;;) {
		sigpause( 0 );
	}
}


/*
** SIGALRM handler, driven by the interval timer armed in main().
**
** On every tick:
**	- forgive one restart for each daemon (never going below zero);
**	- if the master's own executable changed, re-exec the master;
**	- for each daemon that is running and whose executable changed, kill
**	  its process group and clear its restart count (sigchld_handler()
**	  will notice the death and start the new version);
**	- let event_mgr() run.
**
** Every daemon check is guarded by its pid, so a daemon that was never
** started here is neither reported as "modified" nor "killed".
*/
sigalrm_handler()
{
	KbdD_Restarts = MAXIMUM(0,KbdD_Restarts-1);
	SchedD_Restarts = MAXIMUM(0,SchedD_Restarts-1);
	StartD_Restarts = MAXIMUM(0,StartD_Restarts-1);
	Collector_Restarts = MAXIMUM(0,Collector_Restarts-1);
	Negotiator_Restarts = MAXIMUM(0,Negotiator_Restarts-1);

	if( NewExecutable(Master, &Master_TimeStamp) ) {
		restart_master();
	}

	if( Collector_Pid && NewExecutable(Collector, &Collector_TimeStamp) ) {
		dprintf(D_ALWAYS, "Collector was modified.  Killing %s\n", Collector);
		do_killpg( Collector_Pid, SIGKILL );
		Collector_Restarts = 0;
	}

	if( Negotiator_Pid && NewExecutable(Negotiator, &Negotiator_TimeStamp) ) {
		dprintf(D_ALWAYS, "Negotiator was modified.  Killing %s\n", Negotiator);
		do_killpg( Negotiator_Pid, SIGKILL );
		Negotiator_Restarts = 0;
	}

	if( KbdD_Pid && NewExecutable(KbdD, &KbdD_TimeStamp) ) {
		dprintf(D_ALWAYS, "KbdD was modified.  Killing %s\n", KbdD);
		do_killpg( KbdD_Pid, SIGKILL );
		KbdD_Restarts = 0;
	}

	if( SchedD_Pid && NewExecutable(SchedD, &SchedD_TimeStamp) ) {
		dprintf(D_ALWAYS, "SchedD was modified.  Killing %s\n", SchedD);
		do_killpg( SchedD_Pid, SIGKILL );
		SchedD_Restarts = 0;
	}

	if( StartD_Pid && NewExecutable(StartD, &StartD_TimeStamp) ) {
		dprintf(D_ALWAYS, "StartD was modified.  Killing %s\n", StartD);
		do_killpg( StartD_Pid, SIGKILL );
		StartD_Restarts = 0;
	}

	event_mgr();
}

/*
** Replace the running master with a fresh copy of its executable.
** Installed as the SIGUSR1 handler and also called by sigalrm_handler()
** when the master's executable has changed.
**
** Kills every running daemon, waits until no children are left, releases
** and closes the lock on the master log, then exec's Master.  Does not
** return to the caller on success; a failed exec is reported via EXCEPT.
*/
restart_master()
{
	int			pid;
	int			wait_errno;

	dprintf(D_ALWAYS, "RESTARTING MASTER (new executable)\n");
	(void)signal( SIGCHLD, SIG_IGN );
	if( Collector_Pid ) {
		do_killpg( Collector_Pid, SIGKILL );
		dprintf(D_ALWAYS, "Killed Collector pid = %d\n", Collector_Pid );
	}
	if( Negotiator_Pid ) {
		do_killpg( Negotiator_Pid, SIGKILL );
		dprintf(D_ALWAYS, "Killed Negotiator pid = %d\n", Negotiator_Pid );
	}
	if( StartD_Pid ) {
		do_killpg( StartD_Pid, SIGKILL );
		dprintf(D_ALWAYS, "Killed StartD pid = %d\n", StartD_Pid );
	}
	if( KbdD_Pid ) {
		do_killpg( KbdD_Pid, SIGKILL );
		dprintf(D_ALWAYS, "Killed KbdD pid = %d\n", KbdD_Pid );
	}
	if( SchedD_Pid ) {
		do_killpg( SchedD_Pid, SIGKILL );
		dprintf(D_ALWAYS, "Killed SchedD pid = %d\n", SchedD_Pid );
	}

		/* Wait until all children die */
	for(;;) {
		pid = wait( (union wait*)0 );
			/* Capture errno before logging; dprintf() may change it */
		wait_errno = errno;
		dprintf( D_ALWAYS, "Wait() returns pid %d\n", pid );
		if( pid >= 0 ) {
			continue;
		}
		if( wait_errno == ECHILD ) {
			break;				/* no children left */
		}
		if( wait_errno == EINTR ) {
			continue;			/* interrupted by a signal, try again */
		}
		errno = wait_errno;
		EXCEPT( "wait( 0 )" );
	}
	dprintf( D_ALWAYS, "Done waiting for all children\n" );

	if( flock(MasterLockFD,LOCK_UN) < 0 ) {
		dprintf( D_ALWAYS, "Can't remove lock on \"%s\"\n", MasterLog );
		EXCEPT( "flock(%d,0%o)", MasterLockFD, LOCK_UN );
	}
	dprintf( D_ALWAYS, "Unlocked file descriptor %d\n", MasterLockFD );
	(void)close( MasterLockFD );
	dprintf( D_ALWAYS, "Closed file descriptor %d\n", MasterLockFD );

	dprintf( D_ALWAYS, "Doing exec( \"%s\", \"condor_master\", 0 )\n", Master );
		/* The argument list of execl() must end with a null *pointer*;
		** a plain int 0 is not guaranteed to have the same representation
		** when passed through a variable argument list. */
	(void)execl(Master, "condor_master", (char *)0);
#ifdef NFSFIX
	/* Must be condor to write to log files. */
	set_condor_euid(__FILE__,__LINE__);
#endif /* NFSFIX */
	EXCEPT("execl(%s, condor_master, 0)", Master);
}

/* True when pid p belongs to one of the daemons started by the master. */
#define IS_DAEMON(p) ((p)==SchedD_Pid||(p)==StartD_Pid|| \
			(p)==Collector_Pid||(p)==Negotiator_Pid||(p)==KbdD_Pid)

/*
** SIGCHLD handler: collect every child whose status is available.
**
** Stopped children are skipped.  Children that are not daemons (for
** example the mailer started by obituary() or the shell started by
** daily_housekeeping()) are just logged.  For a daemon, an obituary is
** mailed (unless it died from SIGKILL or obituaries are disabled) and the
** daemon is restarted.
**
** Running out of children (ECHILD) simply ends the loop: it is a normal
** situation when the last non-daemon child is collected while no daemon
** is running.  An interrupted call (EINTR) is retried.
*/
sigchld_handler()
{
	int		pid = 0;
	union wait	status;

	while( (pid=wait3(&status,WNOHANG,(struct rusage *)0)) != 0 ) {
		if( pid == -1 ) {
			if( errno == ECHILD ) {
				break;
			}
			if( errno == EINTR ) {
				continue;
			}
			EXCEPT( "wait3(0x%x,WNOHANG,0) returns %d", &status, pid );
		}
		if( WIFSTOPPED(status) ) {
			continue;
		}
		if( !IS_DAEMON(pid) ) {
			dprintf( D_ALWAYS, "Pid %d died with ", pid );
			if( WIFEXITED(status) ) {
				dprintf( D_ALWAYS | D_NOHEADER,
									"status %d\n", WEXITSTATUS(status) );
				continue;
			}
			if( WIFSIGNALED(status) ) {
				dprintf( D_ALWAYS | D_NOHEADER,
									"signal %d\n", WTERMSIG(status) );
			}
			continue;
		}
			/* SIGKILL is what the master itself uses to stop a daemon,
			** so such a death is not reported to the administrator. */
		if( status.w_termsig != SIGKILL && PublishObituaries ) {
			obituary( pid, &status );
		}
		restart( pid );
		dprintf( D_ALWAYS | D_NOHEADER, "\n" );
	}
}

/*
** Map the pid of a daemon to the name of its log file.  A pid that does
** not belong to any daemon yields the string "Unknown Program!!!".
*/
char *
prog_log( pid )
int		pid;
{
	char	*log_file;

	if( pid == StartD_Pid ) {
		log_file = Start_Log;
	} else if( pid == KbdD_Pid ) {
		log_file = Kbd_Log;
	} else if( pid == SchedD_Pid ) {
		log_file = Sched_Log;
	} else if( pid == Collector_Pid ) {
		log_file = Collector_Log;
	} else if( pid == Negotiator_Pid ) {
		log_file = Negotiator_Log;
	} else {
		log_file = "Unknown Program!!!";
	}
	return log_file;
}

/*
** Map the pid of a daemon to the path of its executable.  A pid that does
** not belong to any daemon yields the string "Unknown Program!!!".
*/
char *
prog_name( pid )
int		pid;
{
	char	*program;

	if( pid == StartD_Pid ) {
		program = StartD;
	} else if( pid == KbdD_Pid ) {
		program = KbdD;
	} else if( pid == SchedD_Pid ) {
		program = SchedD;
	} else if( pid == Collector_Pid ) {
		program = Collector;
	} else if( pid == Negotiator_Pid ) {
		program = Negotiator;
	} else {
		program = "Unknown Program!!!";
	}
	return program;
}


/*
** Bring a dead daemon back to life.
**
** After a 30 second pause the daemon owning "pid" is identified, whatever
** is left of its process group is killed, its restart count is bumped
** (give_up() is called once the count exceeds RestartsPerHour) and a new
** copy is started.  A pid that belongs to no daemon is only logged.
*/
restart( pid )
int		pid;
{
	int		*restarts;		/* restart counter of the daemon that died */
	int		*pid_slot;		/* where the pid of the new copy is recorded */
	char	**program;		/* path of the daemon's executable */

	sleep( 30 );

	if( pid == KbdD_Pid ) {
		dprintf( D_ALWAYS, "The KbdD (process %d) died\n", pid );
		restarts = &KbdD_Restarts;
		pid_slot = &KbdD_Pid;
		program = &KbdD;
	} else if( pid == SchedD_Pid ) {
		dprintf( D_ALWAYS, "The SchedD (process %d) died\n", pid );
		restarts = &SchedD_Restarts;
		pid_slot = &SchedD_Pid;
		program = &SchedD;
	} else if( pid == StartD_Pid ) {
		dprintf( D_ALWAYS, "The StartD (process %d ) died\n", pid );
		restarts = &StartD_Restarts;
		pid_slot = &StartD_Pid;
		program = &StartD;
	} else if( pid == Collector_Pid ) {
		dprintf( D_ALWAYS, "The Collector (process %d ) died\n", pid );
		restarts = &Collector_Restarts;
		pid_slot = &Collector_Pid;
		program = &Collector;
	} else if( pid == Negotiator_Pid ) {
		dprintf( D_ALWAYS, "The Negotiator (process %d ) died\n", pid );
		restarts = &Negotiator_Restarts;
		pid_slot = &Negotiator_Pid;
		program = &Negotiator;
	} else {
		dprintf( D_ALWAYS, "Child %d died, but not a daemon -- Ignored\n", pid);
		return;
	}

	do_killpg( pid, SIGKILL ) ;
	*restarts += 1;
	if( *restarts > RestartsPerHour ) {
		give_up( *program );
	}
	*pid_slot = start_daemon( *program );
}

/*
** Deliberately empty.
** NOTE(review): presumably a stub that satisfies a reference from shared
** Condor library code -- confirm before removing.
*/
SetSyscalls()
{
}

init_params()
{
	char	*tmp;

	if( (Master = param("MASTER")) == NULL ) {
		EXCEPT( "MASTER not specified in config file" );
	}

	if( (MasterLog = param("MASTER_LOG")) == NULL ) {
		EXCEPT( "MASTER_LOG not specified in config file" );
	}

	if( (CollectorHost = param("COLLECTOR_HOST")) == NULL ) {
		EXCEPT( "COLLECTOR_HOST not specified in config file" );
	}

	if( (NegotiatorHost = param("NEGOTIATOR_HOST")) == NULL ) {
		EXCEPT( "NEGOTIATOR_HOST not specified in config file" );
	}

	if( (Collector = param("COLLECTOR")) == NULL ) {
		EXCEPT( "COLLECTOR not specified in config file" );
	}

	if( (Negotiator = param("NEGOTIATOR")) == NULL ) {
		EXCEPT( "NEGOTIATOR not specified in config file" );
	}

	if( (Collector_Log = param("COLLECTOR_LOG")) == NULL ) {
		EXCEPT( "COLLECTOR_LOG not specified in config file" );
	}

	if( (Negotiator_Log = param("NEGOTIATOR_LOG")) == NULL ) {
		EXCEPT( "NEGOTIATOR_LOG not specified in config file" );
	}

	if( (StartD = param("STARTD")) == NULL ) {
		EXCEPT( "STARTD not specified in config file" );
	}

	if( (Start_Log = param("STARTD_LOG")) == NULL ) {
		EXCEPT( "STARTD_LOG not specified in config file" );
	}

	if( (KbdD = param("KBDD")) == NULL ) {
		EXCEPT( "KBDD not specified in config file" );
	}

	if( (Kbd_Log = param("KBDD_LOG")) == NULL ) {
		EXCEPT( "KBDD_LOG not specified in config file" );
	}

	if( (SchedD = param("SCHEDD")) == NULL ) {
		EXCEPT( "SCHEDD not specified in config file" );
	}

	if( (Sched_Log = param("SCHEDD_LOG")) == NULL ) {
		EXCEPT( "SCHEDD_LOG not specified in config file" );
	}

	if( (CondorAdministrator = param("CONDOR_ADMIN")) == NULL ) {
		EXCEPT( "CONDOR_ADMIN not specified in config file" );
	}

	tmp = param("X_RUNS_HERE");
	if( tmp && (*tmp == 't' || *tmp == 'T') ) {
		X_runs_here = TRUE;
	} else {
		X_runs_here = FALSE;
	}

	tmp = param("PUBLISH_OBITUARIES");
	if( tmp && (*tmp == 't' || *tmp == 'T') ) {
		PublishObituaries = TRUE;
	} else {
		PublishObituaries = FALSE;
	}

	tmp = param("OBITUARY_LOG_LENGTH");
	if( tmp == NULL ) {
		Lines = 20;
	} else {
		Lines = atoi( tmp );
	}

	tmp = param( "RESTARTS_PER_HOUR" );
	if( tmp == NULL ) {
		RestartsPerHour = 4;
	} else {
		RestartsPerHour = atoi( tmp );
	}

	if( param("MASTER_DEBUG") ) {
		if( boolean("MASTER_DEBUG","Foreground") ) {
			Foreground++;
		}
	}

	FS_Preen = param( "PREEN" );
}


/*
** Start the daemon whose executable is "pathname" in a process group of
** its own, passing it the single argument "-f".
**
** Returns the pid (which is also the process group) of the new daemon,
** or 0 when the master was started with -n and nothing is launched.  An
** executable that cannot be run, or a failed vfork(), is fatal.
*/
start_daemon( pathname )
char	*pathname;
{
	int		pid;
	char	*shortname;

	if( NotFlag ) {
		dprintf( D_ALWAYS, "NOT Starting \"%s\"\n", pathname );
		return 0;
	}

		/* argv[0] of the daemon is the last component of its path */
	if( (shortname = rindex(pathname,'/')) != NULL ) {
		shortname += 1;
	} else {
		shortname = pathname;
	}

	if( access(pathname,X_OK) != 0 ) {
		EXCEPT( "%s: Cannot execute", pathname );
	}

	if( (pid = vfork()) < 0 ) {
		EXCEPT( "vfork()" );
	}

	if( pid == 0 ) {	/* The child */
#ifdef NFSFIX
		/* Daemons need to be started as root. */
		set_root_euid(__FILE__,__LINE__);
#endif /* NFSFIX */
		pid = getpid();
		if( setpgrp(0,pid) < 0 ) {
			EXCEPT( "setpgrp(0,%d)", pid );
		}
			/* The argument list of execl() must end with a null
			** *pointer*; a plain int 0 is not guaranteed to be passed
			** as one through a variable argument list. */
		(void)execl( pathname, shortname, "-f", (char *)0 );
#ifdef NFSFIX
		/* Must be condor to write to log files. */
		set_condor_euid(__FILE__,__LINE__);
#endif /* NFSFIX */
		EXCEPT( "execl( %s, %s, -f, 0 )", pathname, shortname );
#ifdef LINT
		return 0;
#endif /* LINT */
	} else { 			/* The parent */
		dprintf( D_ALWAYS, "Started \"%s\", pid and pgroup = %d\n",
			shortname, pid );
		return pid;
	}
}

/*
** Return nonzero when "host" has the same official name as the machine
** this process is running on.  Failure to obtain either name is fatal.
*/
static int
master_is_local_host( host )
char	*host;
{
	char	hostname[512];
	char	my_host_name[512];
	struct hostent	*hp, *gethostbyname();

		/* Get the "official" name of our own host */
	if( gethostname(hostname,sizeof(hostname)) < 0 ) {
		EXCEPT( "gethostname(0x%x,%d)", hostname, sizeof(hostname) );
	}
	if( (hp=gethostbyname(hostname)) == NULL ) {
		EXCEPT( "gethostbyname(%s)", hostname );
	}
		/* The result of gethostbyname() may be overwritten by the next
		** call, so keep a private copy.  The precision keeps the copy
		** within my_host_name[] (511 characters plus the terminator). */
	(void)sprintf( my_host_name, "%.511s", hp->h_name );

		/* Get the "official" name of the host in question */
	if( (hp=gethostbyname(host)) == NULL ) {
		EXCEPT( "gethostbyname(%s)", host );
	}

		/* hp->h_name is still valid here: no further lookup is made */
	return strcmp(my_host_name,hp->h_name) == MATCH;
}

/*
** Nonzero when this machine is the one named by COLLECTOR_HOST.
*/
collector_runs_here()
{
	return master_is_local_host( CollectorHost );
}

/*
** Nonzero when this machine is the one named by NEGOTIATOR_HOST.
*/
negotiator_runs_here()
{
	return master_is_local_host( NegotiatorHost );
}

/*
** Mail the administrator a notice that the daemon "pid" has died,
** together with the tail of that daemon's log file.
**
** Nothing is sent for the KbdD, nor for a daemon that already has more
** than one restart on record: a badly broken daemon restarts in rapid
** succession, and the resulting flood of mail could overwhelm the
** administrator's machine.
*/
obituary( pid, status )
int			pid;
union wait	*status;
{
	char	mail_cmd[512];
	char	this_host[512];
	FILE	*mail, *popen();
	char	*daemon, *daemon_log;
	int		restarts = 0;

		/* KbdD deaths are never reported */
	if( pid == KbdD_Pid ) {
		return;
	}

	if( pid == SchedD_Pid ) {
		restarts = SchedD_Restarts;
	} else if( pid == StartD_Pid ) {
		restarts = StartD_Restarts;
	} else if( pid == Collector_Pid ) {
		restarts = Collector_Restarts;
	} else if( pid == Negotiator_Pid ) {
		restarts = Negotiator_Restarts;
	} else {
		EXCEPT( "Pid %d returned by wait3(), but not a child\n", pid );
	}

	if( restarts > 1 ) {
		return;
	}

	daemon = prog_name( pid );
	daemon_log = prog_log( pid );

	dprintf( D_ALWAYS, "Sending obituary for \"%s\" to \"%s\"\n",
												daemon, CondorAdministrator );

	if( gethostname(this_host,sizeof(this_host)) < 0 ) {
		EXCEPT( "gethostname(0x%x,%d)", this_host, sizeof(this_host) );
	}

	(void)sprintf( mail_cmd, "%s %s", BIN_MAIL, CondorAdministrator );
	mail = popen( mail_cmd, "w" );
	if( mail == NULL ) {
		EXCEPT( "popen(\"%s\",\"w\")", mail_cmd );
	}

	fprintf( mail, "To: %s\n", CondorAdministrator );
	fprintf( mail, "Subject: CONDOR Problem\n" );
	fprintf( mail, "\n" );

	if( status->w_termsig == 0 ) {
		fprintf( mail,
		"\"%s\" on \"%s\" exited with status %d\n",
									daemon, this_host, status->w_retcode );
	} else {
		fprintf( mail, "\"%s\" on \"%s\" died due to signal %d\n",
									daemon, this_host, status->w_termsig );
		fprintf( mail, "(%s core was produced)\n",
									status->w_coredump ? "a" : "no" );
	}
	tail_log( mail, daemon_log, Lines );

		/* No pclose() here: it wait()s and might swallow the exit
		** notification of one of our daemons.  The mailer is collected
		** by the SIGCHLD handler instead.
		*/
	(void)fclose( mail );
}

/*
** Copy the last "lines" non-empty lines of "file" to "output".
**
** The file is scanned once; the offset of every line start is pushed
** into a bounded queue that forgets the oldest entry when full, so that
** at end of file the queue holds the starts of the most recent lines.
** Those lines are then printed in order.  A file that cannot be opened
** is reported on stderr and otherwise ignored.
*/
tail_log( output, file, lines )
FILE	*output;
char	*file;
int		lines;
{
	FILE	*log;
	int		c, prev;
	long	ftell();
	QUEUE	line_starts;

	log = fopen( file, "r" );
	if( log == NULL ) {
		fprintf( stderr, "Can't open %s\n", file );
		return;
	}

	init_queue( &line_starts, lines );

		/* A line starts at any non-newline character that follows a
		** newline (or the beginning of the file). */
	for( prev = '\n'; (c=getc(log)) != EOF; prev = c ) {
		if( prev == '\n' && c != '\n' ) {
			insert_queue( &line_starts, ftell(log) - 1 );
		}
	}

	while( !empty_queue( &line_starts ) ) {
		display_line( delete_queue( &line_starts ), log, output );
	}
	(void)fclose( log );
}

/*
** Copy one line from "input" to "output".
**
** The line starts at byte offset "loc" of input and runs up to and
** including the next newline, or to end of file when the last line is
** not newline terminated.  The end-of-file marker itself is never
** written to the output.  Always returns 0.
*/
int
display_line( loc, input, output )
long	loc;
FILE	*input;
FILE	*output;
{
	int		ch;

		/* whence 0: offset is relative to the start of the file */
	(void)fseek( input, loc, 0 );

	while( (ch=getc(input)) != EOF ) {
		(void)putc( ch, output );
		if( ch == '\n' ) {
			break;
		}
	}
	return 0;
}

init_queue( queue, size )
QUEUE	*queue;
{
	queue->first = 0;
	queue->last = 0;
	queue->size = size;
	queue->n_elem = 0;
}

/*
** Append "elem" to the queue.  When the queue already holds "size"
** elements the oldest one is dropped to make room.
*/
insert_queue( queue, elem )
QUEUE	*queue;
long	elem;
{
	int		slots = queue->size + 1;	/* indices wrap at this value */

	if( queue->n_elem != queue->size ) {
		queue->n_elem++;
	} else {
			/* full: forget the oldest element */
		queue->first = (queue->first + 1) % slots;
	}
	queue->data[queue->last] = elem;
	queue->last = (queue->last + 1) % slots;
}

/*
** Remove the oldest element from the queue and return it.  The caller
** is expected to have checked empty_queue() first.
*/
long
delete_queue( queue )
QUEUE	*queue;
{
	int		head = queue->first;

	queue->n_elem--;
	queue->first = (head + 1) % (queue->size + 1);
	return queue->data[head];
}

/*
** Return 1 when the queue holds no elements (its two indices coincide),
** 0 otherwise.
*/
empty_queue( queue )
QUEUE	*queue;
{
	if( queue->first != queue->last ) {
		return 0;
	}
	return 1;
}


/*
** Last resort when daemon "name" has been restarted more often than
** RestartsPerHour allows: mail the administrator, kill every daemon and
** terminate the master with exit status 1.  Never returns.
*/
give_up( name )
char	*name;
{
	static int	*targets[] = {
		&Collector_Pid, &Negotiator_Pid, &StartD_Pid, &KbdD_Pid, &SchedD_Pid
	};
	char	mail_cmd[512];
	char	this_host[512];
	FILE	*mail, *popen();
	int		i;

	dprintf( D_ALWAYS, "Exceeded %d restarts / hour on \"%s\"\n",
													RestartsPerHour, name );
	dprintf( D_ALWAYS, "Sending mail to \"%s\"\n", CondorAdministrator );

	if( gethostname(this_host,sizeof(this_host)) < 0 ) {
		EXCEPT( "gethostname(0x%x,%d)", this_host, sizeof(this_host) );
	}

	(void)sprintf( mail_cmd, "%s %s", BIN_MAIL, CondorAdministrator );
	mail = popen( mail_cmd, "w" );
	if( mail == NULL ) {
		EXCEPT( "popen(\"%s\",\"w\")", mail_cmd );
	}

	fprintf( mail, "To: %s\n", CondorAdministrator );
	fprintf( mail, "Subject: CONDOR Problem\n" );
	fprintf( mail, "\n" );

	fprintf( mail, "HELP!\n\n" );
	fprintf( mail,
	"The CONDOR_DaemonMaster on [%s] has exceeded %d restarts/hour for [%s]\n",
			this_host, RestartsPerHour, name);

		/* We are about to exit, so waiting for the mailer is fine here */
	(void)pclose( mail );

	(void)signal( SIGCHLD, SIG_IGN );
	for( i = 0; i < (int)(sizeof(targets) / sizeof(targets[0])); i++ ) {
		do_killpg( *targets[i], SIGKILL );
	}
	dprintf( D_ALWAYS, "*** E X I T I N G ***\n\n" );
	exit( 1 );
}

/*
** Open "file_name" and take an exclusive, non-blocking flock() on it,
** leaving the descriptor in MasterLockFD.  Failure to open the file or
** to obtain the lock (another master already holds it) is fatal.
*/
get_lock( file_name )
char	*file_name;
{
	int		open_mode;

		/* Some platforms want the file open for writing to lock it */
#if defined(AIX31) || defined(IRIX331) || defined(HPUX8)
	open_mode = O_RDWR;
#else
	open_mode = 0;
#endif

	MasterLockFD = open( file_name, open_mode, 0 );
	if( MasterLockFD < 0 ) {
		EXCEPT( "open(%s,0,0)", file_name );
	}

	if( flock(MasterLockFD,LOCK_EX|LOCK_NB) >= 0 ) {
		return;
	}
	dprintf( D_ALWAYS, "Can't get lock on file \"%s\"\n", file_name );
	EXCEPT( "flock(%d,0%o)", MasterLockFD, LOCK_EX | LOCK_NB );
}


/*
** Send signal "sig" to process group "pgrp".  A zero pgrp (daemon not
** running) is silently ignored; a signal number outside 0..NSIG-1 is
** fatal.  The outcome of killpg() itself is deliberately not examined.
*/
do_killpg( pgrp, sig )
int		pgrp;
int		sig;
{
	if( pgrp == 0 ) {
		return;
	}

	if( !(sig >= 0 && sig < NSIG) ) {
		EXCEPT( "Unknown signal (%d)", sig );
	}

		/* Become root for the kill, then go back to being condor */
#ifdef NFSFIX
	set_root_euid(__FILE__,__LINE__);
#endif /* NFSFIX */

	(void)killpg( pgrp, sig );

#ifdef NFSFIX
	set_condor_euid(__FILE__,__LINE__);
#endif /* NFSFIX */

}

/*
** Send signal "sig" to process "pid" and log the fact.  A zero pid
** (daemon not running) is silently ignored.
**
** A process that no longer exists (ESRCH) is logged and tolerated: a
** daemon may have died moments ago and not been restarted yet, which is
** no reason to bring the whole master down.  Any other failure of
** kill() is fatal.
*/
do_kill( pid, sig )
int		pid;
int		sig;
{
	int		status;
	int		kill_errno;

	if( !pid ) {
		return;
	}

#ifdef NFSFIX
	set_root_euid(__FILE__,__LINE__);
#endif /* NFSFIX */

	status = kill( pid, sig );
		/* Save errno now; the calls below may change it */
	kill_errno = errno;

#ifdef NFSFIX
	set_condor_euid(__FILE__,__LINE__);
#endif /* NFSFIX */

	if( status < 0 ) {
		if( kill_errno == ESRCH ) {
			dprintf( D_ALWAYS, "Process %d no longer exists, %s not sent\n",
													pid, SigNames[sig] );
			return;
		}
		errno = kill_errno;
		EXCEPT( "kill(%d,%d)", pid, sig );
	}
	dprintf( D_ALWAYS, "Sent %s to process %d\n", SigNames[sig], pid );
}

/*
** SIGHUP handler: re-read the configuration file, then pass the SIGHUP
** on to every running daemon so that they re-read it as well.
*/
sighup_handler()
{
	static int	*targets[] = {
		&Collector_Pid, &Negotiator_Pid, &StartD_Pid, &KbdD_Pid, &SchedD_Pid
	};
	int		i;

	dprintf( D_ALWAYS, "Re reading config file\n" );
	config( MyName, (CONTEXT *)0 );
	init_params();

	for( i = 0; i < (int)(sizeof(targets) / sizeof(targets[0])); i++ ) {
		do_kill( *targets[i], SIGHUP );
	}
	dprintf( D_ALWAYS | D_NOHEADER, "\n" );
}

/*
** SIGINT handler: kill every daemon and start a fresh set.  SIGCHLD is
** ignored while the old daemons die, so they are not restarted one by
** one by sigchld_handler().
*/
sigint_handler()
{
	static int	*targets[] = {
		&Collector_Pid, &Negotiator_Pid, &StartD_Pid, &KbdD_Pid, &SchedD_Pid
	};
	int		i;

	dprintf( D_ALWAYS, "Killing all daemons\n" );
	(void)signal( SIGCHLD, SIG_IGN );
	for( i = 0; i < (int)(sizeof(targets) / sizeof(targets[0])); i++ ) {
		do_killpg( *targets[i], SIGKILL );
	}

	dprintf( D_ALWAYS, "Restarting all daemons\n" );
	sleep( 5 );	/* NOT a good way to do this... */

	if( signal(SIGCHLD,sigchld_handler) == BADSIG ) {
		EXCEPT( "signal(SIGCHLD,0x%x)", sigchld_handler );
	}
	start_all_daemons();
}
/*
** Handler for SIGQUIT and SIGTERM: kill every daemon, mark the machine
** as CONDOR_DOWN and exit with status 0.
*/
sigquit_handler()
{
	static int	*targets[] = {
		&Collector_Pid, &Negotiator_Pid, &StartD_Pid, &SchedD_Pid, &KbdD_Pid
	};
	int		i;

	(void)signal( SIGCHLD, SIG_IGN );
	for( i = 0; i < (int)(sizeof(targets) / sizeof(targets[0])); i++ ) {
		do_killpg( *targets[i], SIGKILL );
	}

	dprintf( D_ALWAYS, "Killed by SIGQUIT\n" );
	set_machine_status( CONDOR_DOWN );
	exit( 0 );
}

/*
** Start every daemon that belongs on this machine and record, for each
** one, its pid and the current modification time of its executable (the
** time stamps are what sigalrm_handler() later compares against).
**
** The collector and the negotiator are started only on the hosts named
** for them in the configuration, the KbdD only when X_runs_here is set;
** the StartD and the SchedD are always started.
*/
start_all_daemons()
{
		/* Baseline for detecting a newly installed master */
	Master_TimeStamp = GetTimeStamp( Master );

	if( collector_runs_here() ) {
		Collector_Pid = start_daemon( Collector );
		Collector_TimeStamp = GetTimeStamp( Collector );
	}

	if( negotiator_runs_here() ) {
		Negotiator_Pid = start_daemon( Negotiator );
		Negotiator_TimeStamp = GetTimeStamp( Negotiator );
	}

	if( X_runs_here ) {
		KbdD_Pid = start_daemon( KbdD );
		KbdD_TimeStamp = GetTimeStamp( KbdD );
	}

	StartD_Pid = start_daemon( StartD );
	StartD_TimeStamp = GetTimeStamp( StartD );

	SchedD_Pid = start_daemon( SchedD );
	SchedD_TimeStamp = GetTimeStamp( SchedD );

		/* Blank line closing this group of log messages */
	dprintf( D_ALWAYS | D_NOHEADER, "\n" );
}

/*
** Return the modification time of "file", or (time_t)-1 when the file
** cannot be stat()ed.
*/
time_t
GetTimeStamp(file)
char *file;
{
	struct stat info;

	return (stat(file, &info) < 0) ? (time_t) -1 : info.st_mtime;
}

/*
** Tell whether "file" has been modified since the time recorded in *tsp.
**
** Returns TRUE, and updates *tsp to the new modification time, when the
** time stamps differ.  Returns FALSE when they match, and also when the
** file cannot be examined: a new version may be in the middle of being
** installed, in which case the change is picked up on a later call.
*/
NewExecutable(file, tsp)
char *file;
time_t *tsp;
{
	time_t current = GetTimeStamp(file);

	if( current == (time_t) -1 || current == *tsp ) {
		return FALSE;
	}

	*tsp = current;
	return TRUE;
}

/* Shell used to run the PREEN command */
char	*Shell = "/bin/sh";

/*
** Periodic housekeeping (main() schedules it through schedule_event()):
** run the configured PREEN command, if any, as "sh -c <command>" in a
** child process.
**
** The child is not waited for here; its exit is picked up by
** sigchld_handler().  For the same reason system() cannot be used: it
** calls wait(), but the child's status would be captured by our own
** sigchld_handler(), leaving the wait() inside system() hanging forever.
**
** A failed vfork() is logged and the run is skipped.
*/
daily_housekeeping()
{
	int		child_pid;

	if( FS_Preen == NULL ) {
		return;
	}

		/* Check log, spool, and execute for any junk files left lying around */
	dprintf( D_ALWAYS,
	"Calling execl( \"%s\", \"sh\", \"-c\", \"%s\", 0 )\n", Shell, FS_Preen );

	child_pid = vfork();
	if( child_pid < 0 ) {
		dprintf( D_ALWAYS, "vfork() failed (errno %d), not running \"%s\"\n",
														errno, FS_Preen );
		return;
	}

	if( child_pid == 0 ) {	/* The child */
			/* execl() needs a null *pointer* to end its argument list */
		execl( Shell, "sh", "-c", FS_Preen, (char *)0 );
		_exit( 127 );
	}

		/* The parent */
	dprintf( D_ALWAYS, "Shell pid is %d\n", child_pid );
}
