/* 
** Copyright 1986, 1987, 1988, 1989 University of Wisconsin
** 
** Permission to use, copy, modify, and distribute this software and its
** documentation for any purpose and without fee is hereby granted,
** provided that the above copyright notice appear in all copies and that
** both that copyright notice and this permission notice appear in
** supporting documentation, and that the name of the University of
** Wisconsin not be used in advertising or publicity pertaining to
** distribution of the software without specific, written prior
** permission.  The University of Wisconsin makes no representations about
** the suitability of this software for any purpose.  It is provided "as
** is" without express or implied warranty.
** 
** THE UNIVERSITY OF WISCONSIN DISCLAIMS ALL WARRANTIES WITH REGARD TO
** THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
** FITNESS. IN NO EVENT SHALL THE UNIVERSITY OF WISCONSIN  BE LIABLE FOR
** ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
** WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
** ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
** OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
** 
** Authors:  Allan Bricker and Michael J. Litzkow,
** 	         University of Wisconsin, Computer Sciences Dept.
** 
*/ 


#include <stdio.h>
#include <signal.h>
#include <netdb.h>
#include <pwd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/param.h>
#include <sys/socket.h>
#include <sys/resource.h>
#include <sys/file.h>
#include <netinet/in.h>
#include <rpc/types.h>
#include <rpc/xdr.h>
#include "sched.h"
#include "debug.h"
#include "trace.h"
#include "except.h"
#include "expr.h"
#include "proc.h"

#ifdef NDBM
#include <ndbm.h>
#else NDBM
#include "ndbm_fake.h"
#endif NDBM

/* Status codes; not referenced in the part of the file visible here. */
#define SUCCESS 1
#define CANT_RUN 0

static char *_FileName_ = __FILE__;		/* Used by EXCEPT (see except.h)     */
/* Directory holding "job_queue" and the job checkpoint files
   (see count_jobs(), negotiate() and calc_disk_needed()). */
extern char	*Spool;
/* Path of the program that permission() hands to execve(). */
extern char	*Shadow;
/* Environment passed through to the exec'd shadow. */
extern char	**environ;

/* Old-style declarations for routines that do not return int. */
CONTEXT		*create_context(), *build_context();
bool_t 		xdr_int();
XDR			*xdr_Udp_Init(), *xdr_Init();
ELEM		*create_elem();
EXPR		*create_expr();
char		*strdup();
int			prio_compar();
EXPR		*build_expr(), *scan();

/* Context that count_jobs() updates and update_central_mgr() transmits. */
extern CONTEXT	*MachineContext;
/* Not referenced in the part of the file visible here. */
extern char		*MgrHost;
/* Socket given to xdr_Udp_Init() by update_central_mgr(). */
extern int		UdpSock;
/* Limits enforced by negotiate(): jobs started per negotiation cycle and
   total jobs running. */
extern int		MaxJobStarts;
extern int		MaxJobsRunning;

/* Job queue handle opened and closed by count_jobs(). */
DBM		*Q, *OpenJobQueue();

/* Tallies rebuilt by count_jobs(); JobsRunning is also recomputed and
   then incremented by negotiate(). */
int		JobsRunning;
int		JobsIdle;
/* Distinct job owners seen during the current count_jobs() scan; the
   strings are strdup'ed by insert_owner() and freed by count_jobs(). */
char	*Owners[1024];
int		N_Owners;
/* From and Len are not referenced in the part of the file visible here. */
struct sockaddr_in	From;
int		Len;

/* Table of idle jobs and their computed priorities, filled in by
   job_prio() and sorted by negotiate(). */
#define MAX_PRIO_REC 2048
struct prio_rec {
	PROC_ID		id;
	float		prio;
} PrioRec[MAX_PRIO_REC];
int		N_PrioRecs = 0;

/* One record per shadow process forked by permission(): the shadow's pid,
   the job it runs and a strdup'ed copy of the serving host name.  Entries
   are added by add_shadow_rec() and removed by delete_shadow_rec(). */
#define MAX_SHADOW_RECS 512
struct shadow_rec {
	int			pid;
	PROC_ID		job_id;
	char		*host;
} ShadowRecs[ MAX_SHADOW_RECS ];
int	NShadowRecs;

/* Clusters the manager has rejected during the current negotiate() call;
   reset at the start of each negotiation. */
int		RejectedClusters[ 1024 ];
int		N_RejectedClusters;

/*
** Refresh the job tallies kept in MachineContext, then transmit that
** context to the central manager.  (Presumably driven by a timer in code
** outside this section -- confirm against the caller.)
*/
timeout()
{
	(void)count_jobs();
	(void)update_central_mgr();
}

/*
** Walk the job queue and publish three totals into MachineContext:
** "Running" (jobs in RUNNING state), "Idle" (jobs in IDLE or UNEXPANDED
** state) and "Users" (number of distinct job owners).  The same totals
** are left in the globals JobsRunning, JobsIdle and N_Owners.
*/
count_jobs()
{
	char	queue_path[MAXPATHLEN];
	int		count();
	ELEM	value;
	int		n;

		/* Start every scan from a clean slate */
	JobsIdle = 0;
	JobsRunning = 0;
	N_Owners = 0;

	(void)sprintf( queue_path, "%s/job_queue", Spool );
	Q = OpenJobQueue( queue_path, O_RDONLY, 0 );
	if( Q == NULL ) {
		EXCEPT( "OpenJobQueue(%s)", queue_path );
	}

		/* count() is handed to ScanJobQueue() as the per-job routine */
	LockJobQueue( Q, READER );
	ScanJobQueue( Q, count );
	CloseJobQueue( Q );

	dprintf( D_FULLDEBUG, "JobsRunning = %d\n", JobsRunning );
	dprintf( D_FULLDEBUG, "JobsIdle = %d\n", JobsIdle );
	dprintf( D_FULLDEBUG, "N_Owners = %d\n", N_Owners );

		/* Only the number of owners is needed; drop the saved names */
	n = 0;
	while( n < N_Owners ) {
		free( Owners[n] );
		n += 1;
	}

	value.type = INT;

	value.i_val = JobsRunning;
	store_stmt( build_expr("Running",&value), MachineContext );

	value.i_val = JobsIdle;
	store_stmt( build_expr("Idle",&value), MachineContext );

	value.i_val = N_Owners;
	store_stmt( build_expr("Users",&value), MachineContext );
}

/*
** qsort() comparison routine for PrioRec entries.  Produces a descending
** order by priority: returns 1 when a's prio is lower than b's, -1 when
** it is higher, and 0 otherwise.
*/
prio_compar( a, b )
struct prio_rec		*a;
struct prio_rec		*b;
{
	float	lhs = a->prio;
	float	rhs = b->prio;

		/* (a below b) yields 1, (a above b) yields -1, neither yields 0 */
	return (lhs < rhs) - (lhs > rhs);
}

/*
** Per-job routine handed to ScanJobQueue() by count_jobs().  Bumps
** JobsRunning for a RUNNING job, JobsIdle for an IDLE or UNEXPANDED job,
** and records the job's owner via insert_owner() regardless of status.
*/
count( proc )
PROC	*proc;
{
	int		state = proc->status;

	if( state == RUNNING ) {
		JobsRunning++;
	}

	if( state == UNEXPANDED || state == IDLE ) {
		JobsIdle++;
	}

	insert_owner( proc->owner );
}

job_prio( proc )
PROC	*proc;
{
	ELEM	tmp;
	float	prio;
	CONTEXT	*job_context;

	if( proc->status == RUNNING ) {
		JobsRunning += 1;
		return;
	}

		/* Job already running, or not runnable, don't bother
		   calculating priority */
	if( !(proc->status == UNEXPANDED || proc->status == IDLE) ) {
		return;
	}

	job_context = create_context();

	tmp.type = INT;
	tmp.i_val = proc->q_date;
	store_stmt( build_expr("QDate",&tmp), job_context );

	tmp.i_val = proc->status;
	store_stmt( build_expr("Status",&tmp), job_context );

	tmp.i_val = proc->prio;
	store_stmt( build_expr("UserPrio",&tmp), job_context );

	if( evaluate_float( "PRIO", &prio, MachineContext, job_context ) < 0 ) {
		EXCEPT( "Can't evaluate \"PRIO\"" );
	}

	PrioRec[N_PrioRecs].id = proc->id;
	PrioRec[N_PrioRecs].prio = prio;
	N_PrioRecs += 1;

	free_context( job_context );
}

insert_owner( owner )
char	*owner;
{
	int		i;

	for( i=0; i<N_Owners; i++ ) {
		if( strcmp(Owners[i],owner) == MATCH ) {
			return;
		}
	}
	Owners[i] = strdup( owner );
	N_Owners += 1;
}


/*
** Send a SCHEDD_INFO command followed by MachineContext to the central
** manager over the XDR stream set up on UdpSock.
**
** The three encoding steps are attempted in order and stop at the first
** failure; the stream is then destroyed exactly once.  (Previously each
** failure branch destroyed the stream and fell through, so the stream
** was used and destroyed again after it had already been torn down.)
*/
update_central_mgr()
{
	int		cmd;
	XDR		xdr, *xdrs = NULL;

	dprintf( D_FULLDEBUG, "Called update_central_mgr()\n" );

	xdrs = xdr_Udp_Init( &UdpSock, &xdr );
	xdrs->x_op = XDR_ENCODE;

	cmd = SCHEDD_INFO;
		/* || short-circuits, so nothing is sent after a failed step */
	if( !xdr_int(xdrs, &cmd) ||
		!xdr_context(xdrs,MachineContext) ||
		!xdrrec_endofrecord(xdrs,TRUE) ) {
		dprintf( D_ALWAYS, "Can't send SCHEDD_INFO to central manager\n" );
	}

	xdr_destroy( xdrs );
}

/*
** Read a job id from `xdrs' and send a kill command to the host of every
** shadow record that matches it.  A record matches when its cluster
** equals the requested cluster and either its proc equals the requested
** proc or the requested proc is -1 (i.e. every proc of the cluster).
*/
abort_job( xdrs )
XDR		*xdrs;
{
	PROC_ID	job_id;
	struct shadow_rec	*rec;
	char	*target;

	xdrs->x_op = XDR_DECODE;
	if( !xdr_proc_id(xdrs,&job_id) ) {
		dprintf( D_ALWAYS, "abort_job() can't read job_id\n" );
		return;
	}

	for( rec = ShadowRecs; rec < &ShadowRecs[NShadowRecs]; rec++ ) {

		if( rec->job_id.cluster != job_id.cluster ) {
			continue;
		}
		if( job_id.proc != -1 && rec->job_id.proc != job_id.proc ) {
			continue;
		}

		target = rec->host;
		dprintf( D_ALWAYS,
			"Found shadow record for job %d.%d, host = %s\n",
			job_id.cluster, job_id.proc, target );
		send_kill_command( target );
	}
}

/*
** Connect to the condor_startd on `host' and send it a KILL_FRGN_JOB
** command.  Every outcome is logged at D_ALWAYS.  Once the connection is
** open, the XDR stream is destroyed and the socket closed on every path.
**
** The failure messages used to name the "schedd" although the peer is
** the startd; they now name the startd like the other messages here.
*/
send_kill_command( host )
char	*host;
{
	XDR		xdr, *xdrs = NULL;
	int		sock = -1;
	int		cmd;

		/* Connect to the startd on the serving host */
	if( (sock = do_connect(host, "condor_startd", START_PORT)) < 0 ) {
		dprintf( D_ALWAYS, "Can't connect to startd on %s\n", host );
		return;
	}
	xdrs = xdr_Init( &sock, &xdr );
	xdrs->x_op = XDR_ENCODE;

	cmd = KILL_FRGN_JOB;
	if( !xdr_int(xdrs, &cmd) ) {
		dprintf( D_ALWAYS, "Can't send KILL_FRGN_JOB cmd to startd on %s\n",
			host );
	} else if( !xdrrec_endofrecord(xdrs,TRUE) ) {
		dprintf( D_ALWAYS, "Can't send xdr end_of_record to startd on %s\n",
			host );
	} else {
		dprintf( D_ALWAYS, "Sent KILL_FRGN_JOB command to startd on %s\n",
			host );
	}

		/* Single cleanup path shared by success and failure */
	xdr_destroy( xdrs );
	(void)close( sock );
}

/*
** Common exit path for negotiate(): free the job context if one is
** currently allocated, close the job queue if it was opened, then return.
** The macro expands to several statements, so it must only be used where
** a statement list is allowed (every use below is inside braces or at
** function level).
*/
#define RETURN \
	if( context ) { \
		free_context( context ); \
	} \
	if( q ) { \
		CloseJobQueue( q ); \
	} \
	return
/*
** The negotiator wants to give us permission to run a job on some
** server.  We must negotiate to try and match one of our jobs with a
** server which is capable of running it.  NOTE: We must keep job queue
** locked during this operation.
**
** Outline, as implemented below:
**   1. Open the job queue read/write and take the writer lock.
**   2. Build PrioRec[] with job_prio() and sort it with prio_compar().
**   3. Walk the jobs in that order.  For each job not belonging to an
**      already rejected cluster, read one request from the manager on
**      `xdrs' and act on it:
**        REJECTED       remember cur_cluster as rejected, go to next job
**        SEND_JOB_INFO  send JOB_INFO plus the job's context, or
**                       NO_MORE_JOBS once MaxJobStarts/MaxJobsRunning
**                       has been reached (which ends the negotiation)
**        PERMISSION     read a host name and start the job there, go to
**                       next job
**        END_NEGOTIATE  stop
**        anything else  log it and stop
**   4. When the list is exhausted, send NO_MORE_JOBS.
** Every exit goes through RETURN so the context and queue are released.
*/
negotiate( xdrs )
XDR		*xdrs;
{
	char	queue[MAXPATHLEN];
	int		i;
	int		op;
	CONTEXT	*context = NULL;
	PROC_ID	id;
	char	*host = NULL;
	DBM		*q = NULL;
	int		jobs_started = 0;
	int		jobs;
	int		cur_cluster = -1;

	dprintf( D_FULLDEBUG, "\n" );
	dprintf( D_FULLDEBUG, "Entered negotiate\n" );

		/* Open and lock the job queue */
	(void)sprintf( queue, "%s/job_queue", Spool );
	if( (q=OpenJobQueue(queue,O_RDWR,0)) == NULL ) {
		EXCEPT( "OpenJobQueue(%s)", queue );
	}

	LockJobQueue( q, WRITER );

		/* Prioritize the jobs */
		/* job_prio() also recounts JobsRunning while it scans */
	N_PrioRecs = 0;
	JobsRunning = 0;
	ScanJobQueue( q, job_prio );
	qsort( (char *)PrioRec, N_PrioRecs, sizeof(PrioRec[0]), prio_compar );
		/* Number of idle jobs at the start, used in the log messages */
	jobs = N_PrioRecs;
	/*
	** for( i=0; i<N_PrioRecs; i++ ) {
	** 	dprintf( D_FULLDEBUG, "PrioRec[%d] = %d.%d %3.10f\n",
	** 		i, PrioRec[i].id.cluster, PrioRec[i].id.proc, PrioRec[i].prio );
	** }
	*/

	N_RejectedClusters = 0;

		/* Try jobs in priority order */
		/* i is advanced inside the loop: only REJECTED, PERMISSION and
		   the rejected-cluster skip move on to the next job, so a
		   SEND_JOB_INFO request leaves us on the same job */
	for( i=0; i < N_PrioRecs;  ) {

		id = PrioRec[i].id;
		if( cluster_rejected(id.cluster) ) {
			i += 1;
			continue;
		}

			/* Wait for manager to request job info */
		if( !rcv_int(xdrs,&op,TRUE) ) {
			dprintf( D_ALWAYS, "Can't receive request from manager\n" );
			RETURN;
		}

		switch( op ) {
			case REJECTED:
					/* NOTE(review): cur_cluster is still -1 if REJECTED
					   arrives before any SEND_JOB_INFO was answered */
				mark_cluster_rejected( cur_cluster );
				i += 1;
				break;
			case SEND_JOB_INFO:
					/* Refuse to offer more jobs once either limit is hit */
				if( jobs_started >= MaxJobStarts ) {
					if( !snd_int(xdrs,NO_MORE_JOBS,TRUE) ) {
						dprintf( D_ALWAYS, "Can't send NO_MORE_JOBS to mgr\n" );
						RETURN;
					}
					dprintf( D_ALWAYS,
					"Reached MAX_JOB_STARTS - %d jobs started, %d jobs idle\n",
										jobs_started, jobs - jobs_started );
					RETURN;
				}
				if( JobsRunning >= MaxJobsRunning ) {
					if( !snd_int(xdrs,NO_MORE_JOBS,TRUE) ) {
						dprintf( D_ALWAYS, "Can't send NO_MORE_JOBS to mgr\n" );
						RETURN;
					}
					dprintf( D_ALWAYS,
					"Reached MAX_JOBS_RUNNING, %d jobs started, %d jobs idle\n",
										jobs_started, jobs - jobs_started );
					RETURN;
				}


					/* Send a job description */
					/* context stays non-NULL until it is freed below, so
					   RETURN releases it if either send fails */
				context = build_context( &id, q );
				if( !snd_int(xdrs,JOB_INFO,FALSE) ) {
					dprintf( D_ALWAYS, "Can't send JOB_INFO to mgr\n" );
					RETURN;
				}
				if( !snd_context(xdrs,context,TRUE) ) {
					dprintf( D_ALWAYS, "1.Can't send job_context to mgr\n" );
					RETURN;
				}
				free_context( context );
				context = NULL;
				dprintf( D_FULLDEBUG, "Sent job %d.%d\n", id.cluster, id.proc );
					/* Remember which cluster a later REJECTED refers to */
				cur_cluster = id.cluster;
				break;
			case PERMISSION:
					/* host is filled in by rcv_string() and freed here
					   (presumably heap-allocated by it -- confirm) */
				if( !rcv_string(xdrs,&host,TRUE) ) {
					dprintf( D_ALWAYS, "Can't receive host name from mgr\n" );
					RETURN;
				}
				permission( host, &id, q );
				free( host );
				host = NULL;
				jobs_started += 1;
				JobsRunning += 1;
				i += 1;
				break;
			case END_NEGOTIATE:
				dprintf( D_ALWAYS, "Lost priority - %d jobs started\n",
														jobs_started );
				RETURN;
			default:
				dprintf( D_ALWAYS, "Got unexpected request (%d)\n", op );
				RETURN;
		}
	}

		/* Out of jobs */
	if( !snd_int(xdrs,NO_MORE_JOBS,TRUE) ) {
		dprintf( D_ALWAYS, "Can't send NO_MORE_JOBS to mgr\n" );
		RETURN;
	}
	if( jobs_started < jobs ) {
		dprintf( D_ALWAYS,
		"Out of servers - %d jobs started, %d jobs idle\n",
							jobs_started, jobs - jobs_started );
	} else {
		dprintf( D_ALWAYS,
		"Out of jobs - %d jobs started, %d jobs idle\n",
							jobs_started, jobs - jobs_started );
	}
		/* NOTE(review): the reason for this 2 second pause is not evident
		   from this file */
	sleep( 2 );
	RETURN;

}
#undef RETURN

/*
** Build the context describing job `id' for the central manager.
**
** The job's PROC record is fetched from the open queue `q' and turned
** into three statements:
**   JOB_REQUIREMENTS = (<requirements>) && (Disk >= <calc_disk_needed>)
**   JOB_PREFERENCES  = <preferences>, or T when none are given
**   Owner            = "<owner>"
**
** Returns a newly created context; the caller releases it with
** free_context().  A failed FetchProc() is raised through EXCEPT, as in
** mark_job_running(), instead of going on to use an unfilled PROC.
**
** NOTE(review): line[] is a fixed 1024-byte buffer and the sprintf()
** calls below are unbounded, so a very long requirements or preferences
** expression would overflow it.
*/
CONTEXT	*
build_context( id, q )
DBM		*q;
PROC_ID	*id;
{
	PROC	proc;
	CONTEXT	*job_context;
	char	line[1024];

	proc.id = *id;
	if( FetchProc(q,&proc) < 0 ) {
		EXCEPT( "FetchProc(%d.%d)", proc.id.cluster, proc.id.proc );
	}
	job_context = create_context();

	(void)sprintf( line, "JOB_REQUIREMENTS = (%s) && (Disk >= %d)",
						proc.requirements, calc_disk_needed(&proc)  );
	store_stmt( scan(line), job_context );

	if( proc.preferences && proc.preferences[0] ) {
		(void)sprintf( line, "JOB_PREFERENCES = %s", proc.preferences );
	} else {
		(void)sprintf( line, "JOB_PREFERENCES = T" );
	}
	store_stmt( scan(line), job_context );

	(void)sprintf( line, "Owner = \"%s\"", proc.owner );
	store_stmt( scan(line), job_context );

		/* Release the storage attached to the fetched PROC */
	xdr_free_proc( &proc );
	return job_context;
}

/*
** We have been granted `server' for job `job_id': mark the job RUNNING
** in the open queue `q' and fork a shadow process for it.
**
** The child closes descriptor 0 and every descriptor from 3 up to the
** descriptor table size (1 and 2 stay open), then execs the program
** named by Shadow with the arguments
**     condor_shadow <server> <cluster> <proc>
** The parent logs the shadow's pid and records it with add_shadow_rec().
** A fork() or execve() failure is raised through EXCEPT.
*/
permission( server, job_id, q )
char		*server;
PROC_ID		*job_id;
DBM			*q;
{
	char	*argv[5];
		/* Large enough for any "%d" result: sign, 10 digits and NUL
		   need 12 bytes; the former 10-byte buffers could overflow */
	char	cluster[16], proc[16];
	int		pid;
	int		i, lim;

	dprintf( D_FULLDEBUG, "Got permission to run job %d.%d on %s\n",
										job_id->cluster, job_id->proc, server);


	(void)sprintf( cluster, "%d", job_id->cluster );
	(void)sprintf( proc, "%d", job_id->proc );
	argv[0] = "condor_shadow";
	argv[1] = server;
	argv[2] = cluster;
	argv[3] = proc;
	argv[4] = 0;

#ifdef NOTDEF
	{
	char	**ptr;

	dprintf( D_ALWAYS, "About to call: " );
	for( ptr = argv; *ptr; ptr++ ) {
		dprintf( D_ALWAYS | D_NOHEADER, "%s ", *ptr );
	}
	dprintf( D_ALWAYS | D_NOHEADER, "\n" );
	}
#endif NOTDEF

		/* The queue entry is updated before the shadow is started */
	mark_job_running( job_id, q );
	lim = getdtablesize();

	switch( (pid=fork()) ) {
		case -1:	/* error */
				/* Name the call that actually failed (was "vfork") */
			EXCEPT( "fork" );
			break;
		case 0:		/* the child */
			(void)close( 0 );
			for( i=3; i<lim; i++ ) {
				(void)close( i );
			}
			(void)execve( Shadow, argv, environ );
				/* Only reached when execve() fails */
			EXCEPT( "execve" );
			break;
		default:	/* the parent */
			dprintf( D_ALWAYS, "Running %d.%d on \"%s\", (shadow pid = %d)\n",
				job_id->cluster, job_id->proc, server, pid );
			add_shadow_rec( pid, job_id, server );
			break;
	}
}

/*
** Return the size, in units of 1024 bytes (rounded down), of the
** checkpoint file belonging to `proc'.
**
** An UNEXPANDED job is measured by its cluster-wide file
** "<Spool>/job<cluster>.ickpt"; the result for the most recently measured
** cluster is cached, so consecutive jobs of one cluster cost a single
** stat().  Any other job is measured by its own file
** "<Spool>/job<cluster>.ckpt.<proc>".  A stat() failure is raised through
** EXCEPT.
*/
calc_disk_needed( proc )
PROC	*proc;
{
	static int	cached_cluster = -1;
	static int	cached_kbytes;
	struct stat	st;
	char	path[MAXPATHLEN];

		/* Per-process checkpoint: never cached */
	if( proc->status != UNEXPANDED ) {
		(void)sprintf( path, "%s/job%06d.ckpt.%d",
									Spool, proc->id.cluster, proc->id.proc );
		if( stat(path,&st) < 0 ) {
			EXCEPT( "stat(%s,0x%x)", path, &st );
		}
		return st.st_size / 1024;
	}

		/* Initial checkpoint shared by the cluster: refresh the cache
		   only when a different cluster is asked for */
	if( proc->id.cluster != cached_cluster ) {
		(void)sprintf( path, "%s/job%06d.ickpt", Spool, proc->id.cluster );
		if( stat(path,&st) < 0 ) {
			EXCEPT( "stat(%s,0x%x)", path, &st );
		}
		cached_cluster = proc->id.cluster;
		cached_kbytes = st.st_size / 1024;
	}
	return cached_kbytes;
}

add_shadow_rec( pid, job_id, server )
int			pid;
PROC_ID		*job_id;
char		*server;
{
	ShadowRecs[ NShadowRecs ].pid = pid;
	ShadowRecs[ NShadowRecs ].job_id = *job_id;
	ShadowRecs[ NShadowRecs ].host = strdup( server );
	NShadowRecs++;
	/*
	** dprintf( D_FULLDEBUG, "Added shadow record for PID %d, job (%d.%d)\n",
	**	pid, job_id->cluster, job_id->proc );
	*/
}

delete_shadow_rec( pid )
int		pid;
{
	int		i;

	for( i=0; i<NShadowRecs; i++ ) {
		if( ShadowRecs[i].pid == pid ) {
			/*
			** dprintf(D_FULLDEBUG,
			**		"Deleting shadow rec for PID %d, job (%d.%d)\n",
			**	pid, ShadowRecs[i].job_id.cluster, ShadowRecs[i].job_id.proc );
			*/
			check_zombie( pid, &(ShadowRecs[i].job_id) );
			free( ShadowRecs[i].host );
			NShadowRecs -= 1;
			ShadowRecs[i] = ShadowRecs[NShadowRecs];
			return;
		}
	}
	EXCEPT( "Can't find shadow record for process %d\n", pid );
}


/*
** Change the status of job `job_id' to RUNNING in the open job queue `q'.
**
** Raised through EXCEPT: the record can't be fetched, the job is already
** marked RUNNING, or the updated record can't be stored.
**
** Two fixes over the earlier version:
**   - the StoreProc failure report used to pass the global Q (not the
**     queue actually in use) and the whole PROC struct by value to "%x"
**     conversions; it now reports the job id like the FetchProc report;
**   - the fetched PROC is released with xdr_free_proc(), as
**     build_context() does after its own FetchProc().
*/
mark_job_running( job_id, q )
PROC_ID		*job_id;
DBM			*q;
{
	PROC	proc;

	proc.id = *job_id;
	if( FetchProc(q,&proc) < 0 ) {
		EXCEPT( "FetchProc(%d.%d)", proc.id.cluster, proc.id.proc );
	}

	if( proc.status == RUNNING ) {
		EXCEPT( "Trying to run job %d.%d, but already marked RUNNING!",
			proc.id.cluster, proc.id.proc );
	}

	proc.status = RUNNING;

	if( StoreProc(q,&proc) < 0 ) {
		EXCEPT( "StoreProc(%d.%d)", proc.id.cluster, proc.id.proc );
	}

		/* Release the storage attached to the fetched PROC */
	xdr_free_proc( &proc );
}


mark_cluster_rejected( cluster )
int		cluster;
{
	int		i;

	for( i=0; i<N_RejectedClusters; i++ ) {
		if( RejectedClusters[i] == cluster ) {
			return;
		}
	}
	RejectedClusters[ N_RejectedClusters++ ] = cluster;
}

/*
** Return 1 if `cluster' is among the first N_RejectedClusters entries of
** RejectedClusters[], 0 otherwise.
*/
cluster_rejected( cluster )
int		cluster;
{
	int		*slot = RejectedClusters;
	int		*limit = RejectedClusters + N_RejectedClusters;

	while( slot < limit ) {
		if( *slot++ == cluster ) {
			return 1;
		}
	}
	return 0;
}
