/*
 * Copyright (c) 1992, 1993 by the University of Southern California
 *
 * For copying and distribution information, please see the files
 * <prm-copyr.h>.
 *
 * Written by srao 8/92
 */

#include <prm-copyr.h>

#define	     ARDP_DEFAULT_PEER	 "prm-sm" /* Default destination port name */
#define	     ARDP_DEFAULT_PORT    408     /* Default destination port number */

#include <ardp.h>

#include <stdio.h>
#ifndef MACH386
#include <stdlib.h>
#endif

#include <pprot.h>
#include <sys/socket.h> 
#include <time.h>
#include <perrno.h>
#include <pmachine.h>
#include <plog.h>

#include <list_macros.h>

#include <netdb.h>

#include <sysmngr.h>


/*
 * Process-wide state of the system manager.
 *
 * Several of these objects are not referenced anywhere in the visible code
 * of this file; presumably they satisfy externs in the PRM/ARDP libraries --
 * TODO confirm before removing any of them.
 */
int         pfs_debug;      /* debug level; main() copies the -D value here */
int         perrno;         /* not referenced in the visible code */
char        *_progname;     /* argv[0]; printed in diagnostics */
char        inad_str[32];   /* not referenced in the visible code */
char        *timestr;       /* latest ctime() result, used when logging an
			       unresponsive nodemngr */
char        buf[256];       /* main() declares its own local buf, which
			       shadows this one */
char        *hname;         /* not referenced in the visible code */
u_long      _my_taskid;     /* main() sets this to 0 */
char        nmfile[80];  /* file in which list of nodemngrs is specified */
char        logfile[80]; /* For logging errors, if logging is enabled */
u_long      num_nodes;      /* number of entries in nodelist */
prm_node_t  nodelist;       /* array of node records: one per entry of nmfile,
			       plus nodes added by PRM_NSTAT_UPDT messages */
char        p_err_string[P_ERR_STRING_SZ];  /* not referenced in the visible
					       code */
char        _my_hostname[MAXHOSTNAMELEN];   /* filled in by gethostname() in
					       main() */



main(int argc, char *argv[], char *envp[])

{
  int     debug, options, i, j, pn;
  char    *msg, *node_msg, buf[128], buf2[12], *curposn;
  u_char  op, job_exit_code;

  u_long  auth_code, nauth, req_jid, count, ipaddr;
  time_t  t0;
  u_short req_host_type[MAX_HOST_TYPES], grant_host_type[MAX_HOST_TYPES];
  u_long  ntasks;
  PTEXT   pkt, rpkt, node_pkt, pkt2, rpkt2;
  RREQ    current_req, newreq, prev_req, *pending_reqs;

  struct  job_info *job_tbl, *curjob;
  prm_node_t  curnode;
  FILE    *fd, *elfd;
  int     *alloc_tbl[MAX_HOST_TYPES], *n_allocd, found, nmflag, logflag;
  int     *tot_alloc_tbl, tot_allocd;
  u_long  ntmp;

  debug = 0;
  _my_taskid = 0;
  nmflag = logflag = 0;
  prev_req = 0;
  job_tbl = (struct job_info *)0;
  _progname=argv[0];
  argc--; argv++;
  
  while (argc > 0 && **argv == '-') {
    switch (*(argv[0]+1)) {
    case 'D':
      debug=1;    /* Default debug level */
      options |= SO_DEBUG;
      sscanf(argv[0],"-D%d", &debug);
      break;
      
    case 'f':
      argc--; argv++;
      nmflag = 1;
      sscanf(argv[0], "%s", nmfile);
      break;
      
    case 'l':
      logflag = 1;
      sscanf(argv[0], "-l%s", logfile);
      if (logfile[0]=='\0')         /* No logfilename specified */
	strcpy(logfile, SYSERRLOGFL);
      break;
      
    default:
      fprintf(stderr, "Usage: %s -f nmfile [ -D[debug-level] ] [ -l[logfile] ]\n", _progname);
      exit(ERRORCODE);
    }
    argc--; argv++;
  }
  
  pfs_debug = debug;
  
  if (nmflag) {
    if ((fd = fopen(nmfile, "r")) == NULL) {
      fprintf(stderr, "Error opening nodemanagers file!\n");
      exit(ERRORCODE);
    }
    if (logflag) 
      freopen(logfile, "a", stderr);
  }    
  else {
    fprintf(stderr, "(%s) No nodes to manage!\n", _progname);
    exit(1);
  }
 
  gethostname(_my_hostname, sizeof(_my_hostname));

#ifdef ROOT
  sprintf(buf, "#%d", PRM_SM_PRIV_PORT);
  ardp_bind_port(buf);
  ardp_set_prvport(ardp_srvport);
#endif

  sprintf(buf, "#%d", PRM_SM_UNPRIV_PORT);
  ardp_bind_port(buf);


  num_nodes = 0;
  while(fscanf(fd, "%s %s", buf, buf2) >= 0)
    ++num_nodes;

  rewind(fd);
  nodelist = (prm_node_t)calloc(num_nodes, sizeof(struct prm_node));
  
  pending_reqs = (RREQ *)calloc(num_nodes, sizeof(RREQ));
  
  for (i = 0; i < num_nodes; i++) {  /* send node status to every nodemngr */
    
    fscanf(fd, "%s", buf);
    nodelist[i].hostname = (char *)malloc(strlen(buf) + 1);
    strcpy(nodelist[i].hostname, buf);
    fscanf(fd, "%s", buf);
    nodelist[i].h_type = HOSTTYPE(buf);
    
    nodelist[i].addr = (prm_node_addr_t)malloc(PRM_AD_SZ);
    
    bzero(nodelist[i].addr, PRM_AD_SZ);
    
    newreq = ardp_rqalloc();
    newreq->pf_priority = 1;
    newreq->outpkt = pkt = ardp_ptalloc();  
    prm_headers(pkt, (u_char)PRM_NSTAT_QRY, (u_char)0, (u_char)0, 0);
    pkt->length = PRM_OPCODE_OFF + 1;
    
    sprintf(buf, "%s(%d)", nodelist[i].hostname, NODEMNGR_PORT);

    /* asynchronous request */
    if (ardp_send(newreq, buf, nodelist[i].addr, 0) == 	ARDP_SUCCESS) {
      nodelist[i].status = process_nstatus_msg(newreq); 
      if (newreq->pf_priority) {
	newreq->pf_priority = 0;
	ardp_rqfree(newreq);
      }
      pending_reqs[i] = NOREQ;
    }
    else
      pending_reqs[i] = newreq;
    
  } 
  
  for (i = 0; i < num_nodes; i++) {  /* collect responses from nodemngrs */
    if (pending_reqs[i]) {
      nodelist[i].status = process_nstatus_msg(pending_reqs[i]);
      if (pending_reqs[i]->pf_priority) {
	pending_reqs[i]->pf_priority = 0;
	ardp_rqfree(pending_reqs[i]);
      }
      else
	fprintf(stderr, "Attempt to free request structure not allocated by client (2)\n");
    }
  }
  cfree(pending_reqs, num_nodes, sizeof(RREQ));


  /* Sysmngr's Main Loop */

  for (;;) {
    
    current_req = ardp_get_nxt();
    msg = current_req->rcvd->start;
    
    if (*msg != PRM_PROTO_V) {
      PRM_wrong_proto(current_req);
      return;
    }
    
    /* Every message has an operation code giving an indication of what is
       to be done */

    op = *(msg + PRM_OPCODE_OFF);

    switch (op) {
      

    case PRM_RSC_REQ : /* Request from a jobmngr for node allocation */
      
      bcopy(msg + PRM_JOBID_OFF, &req_jid, LONG_SZ);
      req_jid = ntohl(req_jid);
      curposn = msg + PRM_TINFO_OFF;
      ntasks = 0;
#ifdef MULTI_HOST
      for (i = 0; i < MAX_HOST_TYPES; i++ ) {
	bcopy(curposn, &ntmp, LONG_SZ);
	req_host_type[i] = ntohl(ntmp);
	ntasks += req_host_type[i];
	grant_host_type[i] = 0;
	curposn += LONG_SZ;
      }
#else 
      bcopy(curposn, &ntasks, LONG_SZ); /* Number of tasks in job. By
					   default number of nodes 
					   requested is equal to
					   ntasks */
      ntasks = ntohl(ntasks);
#endif
      
      /* Tell jobmngr to backoff for 30 secs while nodes are being allocd */
      ardp_rwait(current_req, 30, 0, 0);
      
      rpkt = current_req->outpkt = ardp_ptalloc();
      msg = rpkt->start;
      *(msg + PRM_OPCODE_OFF) = PRM_RSC_GNT;
      *(msg + PRM_STATUS_OFF) = FAILURE;     /* A pessimitsic reply message. */
      rpkt->length = PRM_STATUS_OFF + 1;
      
      
      /* linearly search through the node list, entering pointers to the free 
	 ones in a table  */
      
      count = 0;
#ifdef MULTI_HOST
      for (j = 0; j < MAX_HOST_TYPES; j++ ) {
	alloc_tbl[j] = (int *)calloc(req_host_type[j], sizeof(int));
	
	i = 0;
	while ( (grant_host_type[j] < req_host_type[j]) && (i < num_nodes) ) {
	  if ( (nodelist[i].h_type == j) && (nodelist[i].status == 
					     NODE_UNUSED) ) {  
	    /* A free node of the same host type as requested */
	    *(alloc_tbl[j] + grant_host_type[j]) = i;
	    ++grant_host_type[j];
	    ++count;
	  } 
	  ++i;
	}
      }
#else
      i = 0;
      tot_alloc_tbl = (int *)calloc(ntasks, sizeof(int));
      while ( (count < ntasks) && (i < num_nodes) ) {
	if (nodelist[i].status == NODE_UNUSED) {  /* A free node */
	  tot_alloc_tbl[count] = i;
	  ++count;
	}
	++i;
      }
#endif
      
      curjob = (struct job_info *)0;
      if(count == 0)           /* No nodes were available */
	goto alloc_done;
      
      *(msg + PRM_STATUS_OFF) = SUCCESS;
      
      curjob = (struct job_info *)malloc (sizeof(struct job_info));
      curjob->jid = req_jid;
      curjob->ntasks = ntasks;
      bcopy(current_req->peer, &curjob->jmaddr, PRM_AD_SZ);
      /* Remember, jm does not have a port # allocated to it yet, so we
	 cannot send messages to it */
      
      curjob->node_ind = (int *) calloc(count, sizeof(int) );
      
      /* Position where we start filling in the internet addrs of  nodes */
      
#ifdef MULTI_HOST
      rpkt->length = NUMNODEOFFSET + LONG_SZ * MAX_HOST_TYPES; 
      n_allocd = &(curjob->num_allocd[0]);
#else
      rpkt->length = PRM_DATA_OFF;
      tot_allocd = 0;
#endif      
      
      node_pkt = ardp_ptalloc();
      node_msg = node_pkt->start;
      prm_headers(node_pkt, (u_char) PRM_AUTH_JOB, (u_char)0, (u_char)0, 
		  req_jid);
      ntmp = htonl(S_AD_SZ);
      bcopy(&ntmp, node_msg + PRM_DLEN_OFF, LONG_SZ);
      
      bcopy(current_req->peer, node_msg + PRM_DATA_OFF, S_AD_SZ);
      ((prm_node_addr_t)(node_msg + PRM_DATA_OFF))->sin_port = (u_short)0;

      auth_code = PRM_gen_auth_key();/* Generate an authorization key for jm */
      nauth = htonl(auth_code);
      bcopy(&nauth, node_msg + PRM_DATA_OFF + S_AD_SZ, LONG_SZ);

      node_pkt->length = PRM_DATA_OFF + LONG_SZ + S_AD_SZ;
      
      /* Send authcode to nodemngrs first. This is also check on the node
	 managers to ensure that node is up at this time and available to 
	 run tasks for this job. */
      
      
      curposn = msg + PRM_DATA_OFF;
      
#ifdef MULTI_HOST
      for (j = 0; j < MAX_HOST_TYPES; j++) {
	n_allocd[j] = 0;
	for(i = 0; i < grant_host_type[j]; i++) {
	  curnode = nodelist + *(alloc_tbl[j] + i);
	  pkt2 = ardp_ptalloc();
	  bcopy(node_pkt->start, pkt2->start, node_pkt->length);
	  pkt2->length = node_pkt->length;
	  
	  newreq = ardp_rqalloc();
	  newreq->pf_priority = 1;
	  newreq->outpkt = pkt2;
	  ardp_send (newreq, 0, curnode->addr, -1);
	  if( (rpkt2 = newreq->rcvd) == NOPKT) {
	    /* no reply from nodemngr. Assume node is down */
	    t0 = time(0);
	    timestr = ctime(&t0);
	    fprintf(stderr, "%s: (%s) nodemngr at %s doesn't respond!\n", 
		    timestr, _progname, inet_ntoa(curnode->addr->sin_addr));
	    fflush(stderr);
	    curnode->status = NODE_UNAVAIL;
	  }
	  else { /* Nodemngr replied. Therefor node available to run jobs */
	    bcopy(curnode->addr, msg + rpkt->length, PRM_AD_SZ);
	    rpkt->length += PRM_AD_SZ;
	    ++n_allocd[j];
	    curnode->status = NODE_JOB_ON;
	  }
	  *(curjob->node_ind+i) = i;
	  
	  if (newreq->pf_priority) {
	    newreq->pf_priority = 0;
	    ardp_rqfree(newreq); 
	  }
	  else 
	    fprintf(stderr, "Attempt to free a req struct not allocated by client (3)\n");
	} /* i */
	ntmp = htonl(n_allocd[j]);
	bcopy(&ntmp, curposn, LONG_SZ);
	curposn += LONG_SZ;
      }  /* j */ 
      free(alloc_tbl);
#else
      for(i = 0; i < count; i++) {
	curnode = nodelist + *(tot_alloc_tbl + i);
	pkt2 = ardp_ptalloc();
	bcopy(node_pkt->start, pkt2->start, node_pkt->length);
	pkt2->length = node_pkt->length;
	
	newreq = ardp_rqalloc();
	newreq->pf_priority = 1;
	newreq->outpkt = pkt2;
	ardp_send (newreq, 0, curnode->addr, -1);
	if( (rpkt2 = newreq->rcvd) == NOPKT) {
	  /* no reply from nodemngr. Assume node is down */
	  t0 = time(0);
	  timestr = ctime(&t0);
	  fprintf(stderr, "%s: (%s) nodemngr at %s doesn't respond!\n", 
		  timestr, _progname, inet_ntoa(curnode->addr->sin_addr));
	  fflush(stderr);
	  curnode->status = NODE_UNAVAIL;
	}
	else { /* Nodemngr replied. Therefor node available to run jobs */
	  bcopy(curnode->addr, msg + rpkt->length, PRM_AD_SZ);
	  rpkt->length += PRM_AD_SZ;
	  ++tot_allocd;
	  curnode->status = NODE_JOB_ON;
	}
	*(curjob->node_ind+i) = i;
	
	if (newreq->pf_priority) {
	  newreq->pf_priority = 0;
	  ardp_rqfree(newreq); 
	}
	else 
	  fprintf(stderr, "Attempt to free a req struct not allocated by client (3)\n");
      } /* i */
      curjob->tot_allocd = tot_allocd;	
      ntmp = htonl(tot_allocd);
      bcopy(&ntmp, msg + PRM_TINFO_OFF, LONG_SZ);	
      ntmp = htonl(tot_allocd * PRM_AD_SZ);
      bcopy(&ntmp, msg + PRM_DLEN_OFF, LONG_SZ);	
      free(tot_alloc_tbl);
#endif
      
      bcopy(&nauth, msg + rpkt->length, LONG_SZ);
      rpkt->length += LONG_SZ;

    alloc_done:
      ardp_respond(current_req, ARDP_R_COMPLETE);
      
      if (curjob)
	APPEND_ITEM(curjob, job_tbl);
      
      break;
    

    case PRM_NSTAT_UPDT:   /* Change status of node */
      
      bcopy(msg + PRM_JOBID_OFF, &ipaddr, LONG_SZ);
      
      for (i=0; i<num_nodes; i++) {
	if ( (nodelist[i].addr)->sin_addr.s_addr == ipaddr) {
	  nodelist[i].status = *(msg + PRM_STATUS_OFF);
	  found = TRUE;
	  break;
	}
      }
      if (!found) {  /* a new node. Add it in. */
	++num_nodes; 
	nodelist = (prm_node_t) realloc(num_nodes, sizeof(struct prm_node));
	nodelist[i].status = NODE_UNUSED;
	nodelist[i].addr = (prm_node_addr_t)malloc(PRM_AD_SZ);
	(nodelist[i].addr)->sin_addr.s_addr = ipaddr;
	(nodelist[i].addr)->sin_port = htons(NODEMNGR_PORT);
      }
      rpkt = current_req->outpkt = ardp_ptalloc();
      prm_headers(rpkt, (u_char)PRM_NSTAT_UPDT, (u_char)SUCCESS, (u_char)0, 0);
      rpkt->length = PRM_STATUS_OFF + 1;
      ardp_respond(current_req, ARDP_R_COMPLETE);

      break;

      
    case PRM_JOB_DONE:
      
      bcopy(msg + PRM_JOBID_OFF, &req_jid, LONG_SZ);
      req_jid = ntohl(req_jid);
      job_exit_code = *(msg + PRM_STATUS_OFF);

      rpkt = current_req->outpkt = ardp_ptalloc();
      prm_headers(rpkt, (u_char)PRM_JDONE_RESP, (u_char)SUCCESS, (u_char)0, 0);
      rpkt->length = PRM_STATUS_OFF + 1;
      ardp_respond(current_req, ARDP_R_COMPLETE);

      /* Find the entry in the job_tbl correspondint to this job */
      curjob = job_tbl;
      while ( curjob && (curjob->jid != req_jid) )
	curjob = curjob->next;


      if (curjob)           /* remove entry from list  */
	EXTRACT_ITEM(curjob, job_tbl);
      else
	break;
      
      /* If job exited due to insufficient resources, free resources allocated
	 to it. If job exited normally, the nodemngr will free the resources
	 and notify sysmngr. (case CHANGE_NSTAT above). */
      
      if ( (job_exit_code == J_INSF_RSC) || (job_exit_code == J_NO_LOCIO) ||
	  (job_exit_code == J_NO_NM) || (job_exit_code == J_AUTH_FAIL) ){

#ifdef MULTI_HOST
	for (j = 0; j < MAX_HOST_TYPES; j++) { 
	  for(i = 0; i < curjob->num_allocd[j]; i++) {
#else
	  for(i = 0; i < curjob->tot_allocd; i++) {
#endif
	    int ind;
	    ind = *(curjob->node_ind+i);
	    nodelist[ind].status = (u_char)NODE_UNUSED;
#ifdef MULTI_HOST  
	  }
	}
#else
	}
#endif
	free(curjob->node_ind);
      }
      free(curjob);

      break;

    default:
      break;

    }
  }
}


/*
 * process_nstatus_msg - obtain the reply to a node-status query and turn it
 * into a node status value.
 *
 *   req  the request previously handed to ardp_send()
 *
 * If the request is not yet marked ARDP_STATUS_COMPLETE, ardp_retrieve() is
 * called on it with a timeout argument of -1 (presumably "wait until done"
 * -- confirm against the ARDP documentation).  Otherwise the request is
 * unlinked from ardp_completeQ when it is found there.
 *
 * Returns the byte at PRM_ADINF_OFF of the reply when the reply is a
 * successful PRM_NSTAT_UPDT; returns NODE_UNAVAIL when there is no reply
 * packet or when the reply is anything else (the latter case is also
 * reported on stderr).
 */
process_nstatus_msg(RREQ req)
{
  extern RREQ ardp_completeQ;
  RREQ  scan;
  PTEXT reply;

  if (req->status == ARDP_STATUS_COMPLETE) {
    /* Walk the completed queue looking for this very request. */
    for (scan = ardp_completeQ; scan && (scan != req); scan = scan->next)
      ;
    if (scan) {
      EXTRACT_ITEM(req, ardp_completeQ);
    }
  }
  else {
    ardp_retrieve(req, -1);
  }

  reply = req->rcvd;
  if (reply == NOPKT)
    return NODE_UNAVAIL;        /* No reply. Assume unavailable */

  if ((*(reply->start + PRM_OPCODE_OFF) == PRM_NSTAT_UPDT) &&
      (*(reply->start + PRM_STATUS_OFF) == SUCCESS))
    return *(reply->start + PRM_ADINF_OFF);

  /* A reply arrived, but not the one we asked for. */
  fprintf(stderr, "(%s) Unknown reply code from nodemngr %s\n",
          _progname, inet_ntoa(req->peer.sin_addr) );
  fflush(stderr);
  return NODE_UNAVAIL;
}

