/*
 * *****************************************************************
 * *                                                               *
 * *    Copyright (c) Digital Equipment Corporation, 1991, 1993    *
 * *                                                               *
 * *   All Rights Reserved.  Unpublished rights  reserved  under   *
 * *   the copyright laws of the United States.                    *
 * *                                                               *
 * *   The software contained on this media  is  proprietary  to   *
 * *   and  embodies  the  confidential  technology  of  Digital   *
 * *   Equipment Corporation.  Possession, use,  duplication  or   *
 * *   dissemination of the software and media is authorized only  *
 * *   pursuant to a valid written license from Digital Equipment  *
 * *   Corporation.                                                *
 * *                                                               *
 * *   RESTRICTED RIGHTS LEGEND   Use, duplication, or disclosure  *
 * *   by the U.S. Government is subject to restrictions  as  set  *
 * *   forth in Subparagraph (c)(1)(ii)  of  DFARS  252.227-7013,  *
 * *   or  in  FAR 52.227-19, as applicable.                       *
 * *                                                               *
 * *****************************************************************
 */
/*
 * HISTORY
 */
#ifndef lint
static char	*rcsid = "@(#)$RCSfile: kern_clock.c,v $ $Revision: 4.3.3.12 $ (DEC) $Date: 1993/01/08 17:45:59 $";
#endif 
/*
 * (c) Copyright 1990, OPEN SOFTWARE FOUNDATION, INC.
 * ALL RIGHTS RESERVED
 */
/*
 * Mach Operating System
 * Copyright (c) 1989 Carnegie-Mellon University
 * Copyright (c) 1988 Carnegie-Mellon University
 * Copyright (c) 1987 Carnegie-Mellon University
 * All rights reserved.  The CMU software License Agreement specifies
 * the terms and conditions for use and redistribution.
 */
/*
 * OSF/1 Release 1.0
 */
/*
 * Copyright (C) 1988,1989 Encore Computer Corporation.  All Rights Reserved
 *
 * Property of Encore Computer Corporation.
 * This software is made available solely pursuant to the terms of
 * a software license agreement which governs its use. Unauthorized
 * duplication, distribution or sale are strictly prohibited.
 *
 */
/*
 * kern_clock.c
 *
 * Modification History:
 *
 * 28 Nov 93 -- David Mills, University of Delaware
 *	Fixed bug in updating time_adj from time_freq
 *
 * 12-Sep-93	David Mills, University of Delaware
 *	Modified hardclock() to implement a first-order, adaptive-
 *	parameter, type-II phase-lock loop to control the system clock
 *	phase and frequency. These modifications are compatible with,
 *	but do not require, the ioasic bus counter and modified
 *	microtime() routine.
 *
 * 9-Sep-93	David Mills, University of Delaware
 *	Added microset() call so hardclock() can latch the ioasic bus
 *	counter at each tick. This code is enabled by the MICRO define
 *	and should work in both Alpha and MIPS machines.
 *
 * 04-Feb-92	Jeff Denham
 *	Update POSIX.4 timer references for modified timer structure.
 *
 * 02-Jan-92	Fred Canter
 *	Add code to hardclock to count cache parity errors.
 *
 * 04-Nov-91     Jeff Denham
 *	For P1003.4, in psx4_adjust_callout(), add typecasts to P.4
 *	timer structure type when referencing proc structure fields
 *	via the c_arg field in callout structure. This is for peace of
 *	mind and eliminates compiler warnings.
 *
 * 03-May-91	Peter H. Smith
 *	Change hardcoded priority to a constant.
 *
 * 24-Apr-91     Jeff Denham
 *	For P1003.4, adjust POSIX timers when settimeofday() is called.
 *
 * 4-Apr-91     Lai-Wah Hui
 *      Add P1003.4 required extensions.  
 *      Specifically <rt_timer.h> is now included and if RT_TIMER
 *      is defined  a routine was added to psx_untimeout. psx4_untimeout
 *      will cancel a timer and store the time remaining in the thread structure.
 *
 */

#include <simple_clock.h>
#include <stat_time.h>
#include <mach_co_stats.h>
#include <cpus.h>

#ifdef	hc
pragma off (optimize);
#endif

/*
 * Copyright (c) 1982, 1986 Regents of the University of California.
 * All rights reserved.  The Berkeley software License Agreement
 * specifies the terms and conditions for redistribution.
 *

 */

#include <sys/unix_defs.h>
#include <kern/assert.h>
#include <kern/ast.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/dk.h>
#include <sys/callout.h>
#include <sys/user.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/table.h>
#include <sys/timex.h>
#if	MACH_CO_STATS
#include <mach_debug/callout_statistics.h>
#endif

#if PROFILING && PROFTYPE == 4
#include <sys/gprof.h>
#endif

#include <machine/reg.h>
#include <machine/cpu.h>
#if	!defined(romp) && !defined(mips)
#include <machine/psl.h>
#endif

#include <kern/thread.h>
#include <mach/machine.h>
#include <kern/sched.h>
#include <kern/sched_prim.h>
#include <kern/parallel.h>
#include <mach/time_value.h>
#include <kern/timer.h>
#include <sys/time.h>

#include <mach/boolean.h>
#include <rt_timer.h>

/* Simple lock taken around every calltodo/callfree manipulation below. */
decl_simple_lock_data(,callout_lock)

/*
 * callfree heads the list of unused callout entries; calltodo is the
 * dummy head of the pending-timeout list (calltodo.c_next is the first
 * pending entry).  callout and ncallout are not referenced in this
 * file; presumably they describe the callout table allocated at
 * startup -- verify against the allocator.
 */
struct callout *callfree, *callout, calltodo;
int ncallout;

/*
 * The fixtick and fixcnt variables are used with the stock Ultrix and
 * OSF/1 kernels to interpolate corrections when HZ does not evenly divide
 * the second. In the phase-lock loop model, only the fixtick variable is
 * used; its value is set at initialization.
 */
static int fixcnt = 0;	/* For systems for which hz doesn't evenly divide
			   1000000, this counts the ticks till we need to
			   correct the time with fixtick.  Not referenced
			   by the phase-lock loop code in this file, which
			   folds fixtick into time_adj instead. */

/*
 * The following defines establish the performance envelope of the
 * PLL, one to bound the maximum phase error, another to bound the
 * maximum frequency error and two others to bound the minimum and
 * maximum time between updates. The intent of these bounds is to force
 * the PLL to operate within predefined limits in order to conform to
 * the correctness models assumed by time-synchronization protocols like
 * NTP and DTSS. An excursion which exceeds these bounds is clamped to
 * the bound and operation proceeds accordingly. In practice, this can
 * occur only if something has failed or is operating out of tolerance,
 * but otherwise the PLL continues to operate in a stable mode. Note
 * that the MAXPHASE define conforms to the maximum offset allowed in
 * NTP before the system time is reset, rather than incrementally
 * adjusted.
 */
/* NOTE: MAXPHASE and MINSEC are defined here but not used in this file. */
#define MAXPHASE 128000		/* max phase error (us) */
#define MAXFREQ 100		/* max frequency error (ppm) */
#define MINSEC 16		/* min interval between updates (s) */
#define MAXSEC 1200		/* max interval between updates (s) */

/*
 * The following variables are set by the ntp_adjtime() system call. The
 * time_status variable defines the synchronization status of the system
 * clock, with codes defined in the timex.h header file. The time_offset
 * variable is set by ntp_adjtime() and used by the phase-lock loop to
 * adjust the system time in small increments. The time_constant variable,
 * which is also set by ntp_adjtime(), determines the bandwidth or
 * "stiffness" of the phase lock loop. The time_tolerance variable is the
 * maximum frequency error or tolerance of the particular platform and is
 * a property of the architecture. The time_precision variable is usually
 * equal to the kernel tick variable; however, in case of the optional
 * microtime() routine, the resolution is improved to +-1 microsecond. The
 * time_maxerror variable is the maximum error of the system clock and is
 * computed as one-half the root delay plus root dispersion. It is
 * increased by a small amount each second to reflect the clock tolerance.
 */
int time_status = TIME_BAD;	/* clock synchronization status */
long time_offset = 0;		/* time adjustment (usec, stored scaled up
				   by SHIFT_UPDATE in hardupdate()) */
long time_constant = 0;		/* pll time constant */
long time_tolerance = MAXFREQ;	/* frequency tolerance (ppm) */
long time_precision = 0;	/* clock precision (usec); rewritten by
				   hardclock() on every tick */
long time_maxerror = 0;		/* maximum error (usec); grows by
				   time_tolerance each second */
long time_esterror = 0;		/* estimated error (usec); not used by the
				   code in this file */

/*
 * The time_phase variable is the phase increment and the time_freq
 * variable is the frequency increment of the time variable at each tick
 * of the clock. The time_freq variable is set via ntp_adjtime()
 * from a value stored in a file when the synchronization daemon is
 * first started. Its value is retrieved via ntp_adjtime() and written to
 * the file about once per hour by the daemon. The time_adj variable is
 * the adjustment added to the value of tick at each timer interrupt and
 * is recomputed at each timer interrupt. The time_reftime variable is
 * the second's portion of the system time on the last call to
 * ntp_adjtime(). It is used to adjust the time_freq variable and to
 * increase the time_maxerror as the time since update increases.
 */
long time_phase = 0;		/* phase offset (us scaled by SHIFT_SCALE) */
long time_freq = 0;		/* frequency offset (scaled ppm), clamped to
				   +-(time_tolerance << SHIFT_KF) */
long time_adj = 0;		/* tick adjust (scaled 1 / HZ), added to
				   time_phase on every tick */
long time_reftime = 0;		/* time at last adjustment (s) */

/*
 * Callout statistics hooks.  With MACH_CO_STATS configured, MCO_ASSERT
 * checks its condition with assert() and MCO_STATS evaluates its
 * argument; otherwise both expand to nothing.
 */
#if	MACH_CO_STATS
#define	MCO_ASSERT(c)	assert(c)
#define	MCO_STATS(c)	(c)
#else
#define	MCO_ASSERT(c)
#define	MCO_STATS(c)
#endif

#if	UNIX_LOCKS
/*
 * A reminder about timeout handling and psignal.  The very
 * beginning of hardclock is handled on any processor, after
 * which all of the slave processors jump off to a routine
 * that does little more than check resource utilization
 * and possibly call psignal.  In that case, the psignal must
 * be rescheduled to happen on the master processor via
 * psignal_thread.
 *
 * The remainder of hardclock, including timeout handling,
 * is always done on the master processor.  We can avoid
 * passing psignal off to the psignal thread because we
 * "know" that there is no mp synchronization problem.
 *
 * One day, when proc manipulations are parallelized, this
 * ugliness will be fixed.
 *
 * Note that in the meantime, routines setting timeouts are
 * called back on the master processor!
 */
#endif

/*
 * Running softclock in a thread is strongly recommended and is the
 * configuration selected here.  The interrupt-level implementation is
 * retained only as a fallback and is compiled when SOFTCLOCK_THREAD is 0.
 */
#define SOFTCLOCK_THREAD 1

/*
 * Clock handling routines.
 *
 * This code is written to operate with two timers which run
 * independently of each other. The main clock, running at hz
 * times per second, is used to do scheduling and timeout calculations.
 * The second timer does resource utilization estimation statistically
 * based on the state of the machine phz times a second. Both functions
 * can be performed by a single clock (ie hz == phz), however the
 * statistics will be much more prone to errors. Ideally a machine
 * would have separate clocks measuring time spent in user state, system
 * state, interrupt state, and idle state. These clocks would allow a non-
 * approximate measure of resource utilization.
 */

/*
 * TODO:
 *	time of day, system/user timing, timeouts, profiling on separate timers
 *	allocate more timeout table slots when table overflows.
 */
/*
 * BUMPTIME(t, usec): add usec microseconds to the timeval pointed to by
 * t, carrying at most one second into tv_sec, then refresh *mach_tv if
 * mach_tv is set.  Not used in this file.
 * NOTE(review): the value copied to *mach_tv is the global `time', not
 * *t, so the copy is only meaningful when t is &time -- confirm callers.
 */
#define BUMPTIME(t, usec) { \
	extern struct timeval *mach_tv; \
	register struct timeval *tp = (t); \
 \
	tp->tv_usec += (usec); \
	if (tp->tv_usec >= 1000000) { \
		tp->tv_usec -= 1000000; \
		tp->tv_sec++; \
	} \
	if (mach_tv) *mach_tv = time; \
}

/*
 * The hz hardware interval timer.
 * We update the events relating to real time.
 * If this timer is also being used to gather statistics,
 * we run through the statistics gathering routine as well.
 */

/* Number of ticks accounted to the current thread per hardclock() call. */
#define NTICKS	1

/*
 * On i386, setsoftclock() just counts requests in dotimein, and
 * BASEPRI() is forced to 0 so hardclock() never runs callouts inline.
 */
#ifdef  i386
int     dotimein = 0;
#define setsoftclock()  (dotimein++)
#undef	BASEPRI
#define	BASEPRI(X)	(0)
#endif

/* Watchdog state; compiled out together with its use in hardclock(). */
#ifdef notdef
int watch_dog_on = 0;
int watch_dog_interval = 0;
int watch_dog_time = 0;
int watch_dog_last_event = 0;
int watch_dog_event;
#endif /* notdef */

/*
 * hardclock() -- handler for the hz-rate hardware interval timer.
 *
 * pc and ps are the interrupted program counter and processor status;
 * the extra argument of the ibmrt and i386 variants is not used here.
 * On every call this routine:
 *	- accounts NTICKS of idle, user or system time via clock_tick()
 *	  (and timer_bump() when STAT_TIME is configured) and, in user
 *	  mode, runs the virtual interval timer;
 *	- on a multiprocessor, hands off to slave_hardclock() on every
 *	  cpu other than master_cpu and returns;
 *	- ages the callout queue, applies the CPU resource limit and the
 *	  profiling interval timer, and gathers statistics when phz is 0;
 *	- advances `time' by tick, corrected by timedelta/tickdelta and
 *	  by the phase-lock loop (time_phase, time_adj); on rollover of
 *	  the second it recomputes time_adj and handles leap seconds;
 *	- arranges for due callouts to be run.
 */
/*ARGSUSED*/
#ifdef	ibmrt
hardclock(dev,ps,pc)
	register dev_t dev;
	caddr_t pc;
	int ps;
#endif

#ifdef	i386
hardclock(pc,ps,oldpri)
	int oldpri;
	caddr_t pc;
	int ps;
#endif

/* #if	!defined(mips) && !defined(ibmrt) && !defined(i386) */
#if	!defined(ibmrt) && !defined(i386)
hardclock(pc, ps)
	caddr_t pc;
	int ps;
#endif
{
	register struct callout *p1;
	register thread_t	thread;
#if	SIMPLE_CLOCK
#define tick	myticks
	register int myticks;
#endif

	int needsoft = 0;
	int sig;
	extern int tickdelta;
	extern long timedelta;
	extern int fixtick;
	int time_update;
	long ltemp;		/* long, like the PLL variables it is derived from */
	extern struct timeval *mach_tv;
#ifndef	multimax
	extern int doresettodr;
#endif

	thread = current_thread();

#if	SIMPLE_CLOCK
	/*
	 *	Simple hardware timer does not restart on overflow, hence
	 *	interrupts do not happen at a constant rate.  Must call
	 *	machine-dependent routine to find out how much time has
	 *	elapsed since last interrupt.
	 */
	myticks = usec_elapsed();

	/*
	 *	NOTE: tick was #define'd to myticks above.
	 */
#endif


	if (thread->state & TH_IDLE) {
#if	STAT_TIME
		timer_bump(&thread->system_timer, NTICKS*tick);
#endif
		clock_tick(NTICKS, CPU_STATE_IDLE);
	} else if (USERMODE(ps)) {
#if	STAT_TIME
		timer_bump(&thread->user_timer, NTICKS*tick);
#endif
		clock_tick(NTICKS, CPU_STATE_USER);
		/*
		 * Charge the time out based on the mode the cpu is in.
		 * Here again we fudge for the lack of proper interval timers
		 * assuming that the current state has been around at least
		 * one tick.
		 */
		if (u.u_prof.pr_scale > 1) {
			u.u_procp->p_flag |= SOWEUPC;
			aston();
		}
		/*
		 * CPU was in user state.  Increment
		 * user time counter, and process process-virtual time
		 * interval timer.
		 */
		/*
		 * Even though our thread is guaranteed not to be examining
		 * the value of this timer because we know that the thread
		 * was in user mode, some other thread in this task could
		 * be manipulating the virtual timer.
		 */
		/* s = splhigh(); XXX */
		U_TIMER_LOCK();
		if (timerisset(&u.u_timer[ITIMER_VIRTUAL].it_value) &&
		    itimerdecr(&u.u_timer[ITIMER_VIRTUAL], tick) == 0)
			sig = SIGVTALRM;
		else
			sig = 0;
		U_TIMER_UNLOCK();
		/* splx(s); */
		/* The signal is posted only after the timer lock is dropped. */
		if (sig)
			psignal(u.u_procp, sig);
	} else {
#if	STAT_TIME
		timer_bump(&thread->system_timer, NTICKS*tick);
#endif
		clock_tick(NTICKS, CPU_STATE_SYSTEM);
	}
#if NCPUS > 1
	/* Everything below runs on the master processor only. */
	if (cpu_number() != master_cpu) {
		slave_hardclock(pc, ps);
		return;
	}
#endif
#if MICRO
	/*
	 * Reset ioasic bus counter each tick for microtime()
	 */
	microset();
	time_precision = 1;
#else
	time_precision = tick;
#endif /* MICRO */
#ifdef	KTRACE
	kern_trace(400,time.tv_sec,time.tv_usec,0);
#endif  /* KTRACE */
#ifdef notdef
	if (watch_dog_on) {
		if (watch_dog_last_event != watch_dog_event) {
			watch_dog_time = 0;
			watch_dog_last_event = watch_dog_event;
		}
		else if (watch_dog_time++ > watch_dog_interval)
			panic("hardclock: watch_dog detected CPU IDLE");
	}
#endif /* notdef */

	/*
	 * Update real-time timeout queue.
	 * At front of queue are some number of events which are ``due''.
	 * The time to these is <= 0 and if negative represents the
	 * number of ticks which have passed since it was supposed to happen.
	 * The rest of the q elements (times > 0) are events yet to happen,
	 * where the time for each is given as a delta from the previous.
	 * Decrementing just the first of these serves to decrement the time
	 * to all events.
	 */
	/* s = splhigh(); XXX */
	simple_lock(&callout_lock);
	p1 = calltodo.c_next;
	while (p1) {
		if (--p1->c_time > 0)
			break;
		needsoft = 1;
		if (p1->c_time == 0)
			break;
		p1 = p1->c_next;
	}
	simple_unlock(&callout_lock);
	/* splx(s); */

	/*
	 * If the cpu is currently scheduled to a process, then
	 * charge it with resource utilization for a tick, updating
	 * statistics which run in (user+system) virtual time,
	 * such as the cpu time limit and profiling timers.
	 * This assumes that the current process has been running
	 * the entire last tick.
	 */
	if (!(thread->state & TH_IDLE))
	{
		if (u.u_rlimit[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) {
		    time_value_t	sys_time, user_time;

		    thread_read_times(thread, &user_time, &sys_time);
		    if ((sys_time.seconds + user_time.seconds + 1) >
		        u.u_rlimit[RLIMIT_CPU].rlim_cur) {
			psignal(u.u_procp, SIGXCPU);
			/*
			 * Raise the soft limit a little (while below the
			 * hard limit) so the next SIGXCPU is 5 seconds away.
			 */
			if (u.u_rlimit[RLIMIT_CPU].rlim_cur <
			    u.u_rlimit[RLIMIT_CPU].rlim_max)
				u.u_rlimit[RLIMIT_CPU].rlim_cur += 5;
			}
		}
		/* s = splhigh(); XXX */
		U_TIMER_LOCK();
		if (timerisset(&u.u_timer[ITIMER_PROF].it_value) &&
		    itimerdecr(&u.u_timer[ITIMER_PROF], tick) == 0)
			sig = SIGPROF;
		else
			sig = 0;
		U_TIMER_UNLOCK();
		/* splx(s); */
		if (sig)
			psignal(u.u_procp, SIGPROF);
	}


	/*
	 * If the alternate clock has not made itself known then
	 * we must gather the statistics.
	 */
	if (phz == 0)
		gatherstats(pc, ps);

	/*
	 * Increment the time-of-day, and schedule
	 * processing of the callouts at a very low cpu priority,
	 * so we don't keep the relatively high clock interrupt
	 * priority any longer than necessary.
	 */
	time_update = tick;
	if (timedelta < 0) {
		time_update -= tickdelta;
		timedelta += tickdelta;
	} else if (timedelta > 0) {
		time_update += tickdelta;
		timedelta -= tickdelta;
	}

	/*
	 * Compute the phase adjustment. If the low-order bits
	 * (time_phase) of the update overflow, bump the high-order
	 * bits (tick).  Negative values are negated before the right
	 * shift so that only non-negative values are shifted.
	 */
	time_phase += time_adj;
	if (time_phase < -FINEUSEC) {
		ltemp = -time_phase >> SHIFT_SCALE;
		time_phase += ltemp << SHIFT_SCALE;
		time_update -= ltemp;
	} else if (time_phase > FINEUSEC) {
		ltemp = time_phase >> SHIFT_SCALE;
		time_phase -= ltemp << SHIFT_SCALE;
		time_update += ltemp;
	}
	/* s = splhigh(); XXX */
	TIME_WRITE_LOCK();

	/*
	 * On rollover of the second the phase adjustment to be used for
	 * the next second is calculated. Also, the maximum error is
	 * increased by the tolerance. On rollover of the day the
	 * leap-warning indicator is checked and the apparent time
	 * adjusted +-1 s accordingly. The microtime() routine will
	 * ensure that reported time is always monotonic.
	 */
	time.tv_usec += time_update;
	if (time.tv_usec >= 1000000) {
		time.tv_usec -= 1000000;
		time.tv_sec++;
		time_maxerror += time_tolerance;

		/*
		 * Phase term: move a fraction of the remaining offset
		 * out of time_offset and into time_adj.
		 */
		if (time_offset < 0) {
			ltemp = -time_offset >> (SHIFT_KG + time_constant);
			time_offset += ltemp;
			time_adj = -ltemp << (SHIFT_SCALE - SHIFT_HZ -
			    SHIFT_UPDATE);
		} else {
			ltemp = time_offset >> (SHIFT_KG + time_constant);
			time_offset -= ltemp;
			time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ -
			    SHIFT_UPDATE);
		}

		/* Frequency term, then the fixed fixtick correction. */
		if (time_freq < 0)
			time_adj -= -time_freq >>
			    (SHIFT_KF + SHIFT_HZ - SHIFT_SCALE);
		else
			time_adj += time_freq >>
			    (SHIFT_KF + SHIFT_HZ - SHIFT_SCALE);
		time_adj += fixtick << (SHIFT_SCALE - SHIFT_HZ);

		/* ugly divide should be replaced */
		if (time.tv_sec % 86400 == 0) {
			switch (time_status) {

			case TIME_INS:
				/*
				 * Insert a leap second: repeat the last
				 * second of the day.  The day boundary is
				 * therefore reached again one second later,
				 * in state TIME_OOP.
				 */
				time.tv_sec--; /* !! */
				time_status = TIME_OOP;
				break;

			case TIME_DEL:
				/* Delete a leap second: skip one second. */
				time.tv_sec++;
				time_status = TIME_OK;
				break;

			case TIME_OOP:
				/*
				 * The inserted second has elapsed; leave
				 * the leap-in-progress state so the status
				 * does not stay at TIME_OOP.
				 */
				time_status = TIME_OK;
				break;

			}
		}
	}
	if (mach_tv)
		*mach_tv = time;
#if	!defined(multimax)
	/*
	 * Not conditional on mach_tv: once any adjustment in progress
	 * has finished, honor a pending request to reset the
	 * time-of-day register.
	 */
	if (timedelta == 0 && doresettodr) {
		doresettodr = 0;
		resettodr();
	}
#endif
	TIME_WRITE_UNLOCK();
	/* splx(s); */

#if	SOFTCLOCK_THREAD
	if (needsoft) {
		if (BASEPRI(ps)) {
			/*
			 * Optimization: use thread only if needed.
			 */
			softclock_scan(0);
		} else {
			extern softclock();
			thread_wakeup_one((vm_offset_t)softclock);
			aston();
		}
	}
#else	/* !SOFTCLOCK_THREAD */
/* Code retained for fallback - not recommended */
	if (needsoft) {
#ifdef	i386
		setsoftclock();
#else
		if (BASEPRI(ps)) {
			/*
			 * Save the overhead of a software interrupt;
			 * it will happen as soon as we return, so do it now.
			 */
			(void) splsoftclock();
#if	defined(sun3) || defined(sun4)
			softclock(USERMODE(ps) != 0);
#endif
#if	defined(vax) || defined(ns32000) || defined(ibmrt) || defined(__hp_osf) || defined(mips) || defined (__alpha)
			softclock(pc, ps);
#endif
		} else {
#if	defined(sun3) || defined(sun4)
			int softclock();
			softcall(softclock, USERMODE(ps) != 0);
#else
			setsoftclock();
#endif
		}
#endif	/* !i386 */
	}
#endif	/* !SOFTCLOCK_THREAD */
}
#if	SIMPLE_CLOCK
#undef	tick
#endif	

/*
 * hardupdate() -- called by ntp_adjtime to load a new clock offset into
 * the phase-lock loop (an adaptive-parameter, first-order, type-II
 * loop).
 *
 * offset is the new time offset; time_offset is set to it scaled up by
 * SHIFT_UPDATE.  The number of seconds since the previous call (taken
 * as zero while time_reftime is still 0, and limited to MAXSEC) is
 * multiplied by the offset, scaled down by twice the time constant and
 * accumulated into time_freq, which is then clamped to
 * +-(time_tolerance << SHIFT_KF).  A TIME_BAD status becomes TIME_OK.
 * Negative products are negated before the right shift; all shift
 * counts are assumed to be positive.
 */
hardupdate(offset)
long offset;
{
	long elapsed, freq_limit;
	long freq_shift = time_constant + time_constant;

	time_offset = offset << SHIFT_UPDATE;

	/* Interval since the last update, bounded for robustness. */
	elapsed = time_reftime ? time.tv_sec - time_reftime : 0;
	time_reftime = time.tv_sec;
	if (elapsed > MAXSEC)
		elapsed = MAXSEC;

	/* ugly multiply should be replaced */
	if (offset >= 0)
		time_freq += (offset * elapsed) >> freq_shift;
	else
		time_freq -= (-offset * elapsed) >> freq_shift;

	/* Keep the frequency within the platform tolerance. */
	freq_limit = time_tolerance << SHIFT_KF;
	if (time_freq < -freq_limit)
		time_freq = -freq_limit;
	else if (time_freq > freq_limit)
		time_freq = freq_limit;

	if (time_status == TIME_BAD)
		time_status = TIME_OK;
}

int	dk_ndrive = DK_NDRIVE;
/*
 * gatherstats() -- sample the cpu state for the utilization statistics.
 *
 * pc and ps are the interrupted program counter and processor status.
 * The (gross) assumption is that the cpu has been in its current state
 * for the whole of the last interval, so exactly one cp_time[] bucket
 * is incremented per call:
 *	CP_NICE	user mode and p_nice above PRIZERO
 *	CP_USER	user mode otherwise
 *	CP_IDLE	system mode, idle thread, at base priority
 *	CP_SYS	system mode otherwise
 * With PROFILING && PROFTYPE == 4, a system-mode sample also bumps the
 * kernel profiling histogram bucket for pc.
 */
/*ARGSUSED*/
gatherstats(pc, ps)
	caddr_t pc;
	int ps;
{
	register int state, pcoff;

	if (!USERMODE(ps)) {
		/*
		 * System state.  With no process running this counts as
		 * idle only when we were at base priority; a tick taken
		 * at a raised IPL (in a driver) is system time.  A
		 * running process is charged system time regardless of
		 * IPL.  This is approximate, but the lack of true
		 * interval timers makes anything else difficult.
		 */
		if ((current_thread()->state & TH_IDLE) && BASEPRI(ps))
			state = CP_IDLE;
		else
			state = CP_SYS;
#if PROFILING && PROFTYPE == 4
		{
			extern u_int *kcount;
			extern char *s_lowpc;
			extern u_long s_textsize;
			extern int profiling;

			pcoff = pc - s_lowpc;
			if (profiling < 2 && pcoff < s_textsize && kcount) {
				kcount[pcoff /
				    (HISTFRACTION * sizeof (*kcount))]++;
			}
		}
#endif	/* PROFILING && PROFTYPE == 4 */
	} else if (u.u_procp->p_nice > PRIZERO) {
		state = CP_NICE;
	} else {
		state = CP_USER;
	}

	/*
	 * The per-state tick counts are what user-level statistics
	 * programs report.
	 */
	cp_time[state]++;

	/*
	 * Per-drive dk_time[] accounting from dk_busy is deliberately
	 * not done here: no DEC disk controllers use the dk_busy field.
	 */
}

#if	!SOFTCLOCK_THREAD
/* Code retained for fallback - not recommended */
/*
 * Software priority level clock interrupt.
 * Run periodic events from timeout queue.
 *
 * This interrupt-level version is compiled only when SOFTCLOCK_THREAD
 * is 0.  The sun3/sun4 variant is told directly whether the interrupt
 * came from user mode; the other variant receives the interrupted pc
 * and ps and derives it with USERMODE(ps).
 */

#if	defined(sun3) || defined(sun4)
softclock(was_user_mode)
	int	was_user_mode;
#endif

#if	!defined(sun3) && !defined(sun4)
/*ARGSUSED*/
softclock(pc, ps)
	caddr_t pc;
	int ps;
#endif
{
#ifdef	mips
	acksoftclock();
#endif
#ifdef	KTRACE
	kern_trace(401,0,0,0);
#endif  /* KTRACE */
	/*
	 * Pop due entries (c_time <= 0) off the head of calltodo one at
	 * a time.  The queue is edited at splhigh with callout_lock
	 * held; the function itself is called after both are released.
	 */
	for (;;) {
		register struct callout *p1;
		register caddr_t arg;
		register int (*func)();
		register int a, s;

		s = splhigh();
		simple_lock(&callout_lock);
		if ((p1 = calltodo.c_next) == 0 || p1->c_time > 0) {
			simple_unlock(&callout_lock);
			splx(s);
			break;
		}
		/* Copy out what the call needs before freeing the entry. */
		arg = p1->c_arg; func = p1->c_func; a = p1->c_time;
		calltodo.c_next = p1->c_next;
		p1->c_next = callfree;
		callfree = p1;
		MCO_ASSERT(callout_statistics.cos_current_size > 0);
		MCO_STATS(callout_statistics.cos_num_softclock++);
		MCO_STATS(callout_statistics.cos_current_size--);
		MCO_STATS(callout_statistics.cos_cum_softclock_size += \
			callout_statistics.cos_current_size);
		MCO_ASSERT(callout_statistics_invariant());
		simple_unlock(&callout_lock);
		splx(s);
		(*func)(arg, a);
	}
	/*
	 * If trapped user-mode and profiling, give it
	 * a profiling tick.
	 */
#if	defined(sun3) || defined(sun4)
	if (was_user_mode) {
#else
	if (USERMODE(ps)) {
#endif
		register struct proc *p = u.u_procp;

		if (u.u_prof.pr_scale > 1) {
			p->p_flag |= SOWEUPC;
			aston();
		}
	}
}
#else	/* SOFTCLOCK_THREAD */
/*
 * Software priority level clock "interrupt", handled by a thread.
 * Run periodic events from timeout queue.
 *
 * In this configuration softclock() itself must never be called (it
 * panics).  Its address serves only as the wait event: softclock_scan()
 * passes it to assert_wait() and hardclock() passes it to
 * thread_wakeup_one().
 */
softclock()
{
	panic("softclock");
}

/*
 * softclock_scan() -- run every callout that is currently due.
 *
 * Entries at the head of calltodo whose c_time is <= 0 are unlinked one
 * at a time, returned to the free list and then invoked as
 * (*c_func)(c_arg, c_time).  The queue is examined and edited only at
 * splhigh with callout_lock held; each function is called with the lock
 * released and the priority set by splsoftclock().
 *
 * If flag is non-zero, assert_wait() on the softclock event is issued
 * after the queue has been found to hold no due entry and before
 * callout_lock is released.
 */
softclock_scan(flag)
{
	register struct callout *due;
	register caddr_t handler_arg;
	register int (*handler)();
	register int overdue;
	int saved_pri;

	saved_pri = splhigh();
	simple_lock(&callout_lock);
	while ((due = calltodo.c_next) != 0 && due->c_time <= 0) {
		/* Copy out what the call needs, then free the entry. */
		handler = due->c_func;
		handler_arg = due->c_arg;
		overdue = due->c_time;
		calltodo.c_next = due->c_next;
		due->c_next = callfree;
		callfree = due;
		MCO_ASSERT(callout_statistics.cos_current_size > 0);
		MCO_STATS(callout_statistics.cos_num_softclock++);
		MCO_STATS(callout_statistics.cos_current_size--);
		MCO_STATS(callout_statistics.cos_cum_softclock_size +=
			callout_statistics.cos_current_size);
		MCO_ASSERT(callout_statistics_invariant());

		/* Call out unlocked, then re-lock for the next look. */
		simple_unlock(&callout_lock);
		(void) splsoftclock();
		(*handler)(handler_arg, overdue);
		(void) splhigh();
		simple_lock(&callout_lock);
	}

	/* Still holding callout_lock here. */
	if (flag)
		assert_wait((vm_offset_t)softclock, FALSE);
	simple_unlock(&callout_lock);
	splx(saved_pri);
}

/*
 * softclock_thread() -- body of the kernel thread that runs callouts.
 *
 * The thread makes itself unswappable, takes the BASEPRI_SOFTCLOCK
 * priority, binds to the unix master and drops to spl0.  It then loops
 * forever: run whatever is due, then block until woken.  It never
 * returns.
 */
softclock_thread()
{
	thread_t self;

	self = current_thread();
	thread_swappable(self, FALSE);

	/*
	 * RT_SCHED: the priority is a constant from kern/sched.h rather
	 * than a hardcoded number.  Overwriting sched_pri is harmless
	 * here: the thread is running, so it is not on a run queue.
	 */
	self->sched_pri = BASEPRI_SOFTCLOCK;
	self->priority = self->sched_pri;

	unix_master();		/* XXX signals sent in timeouts */
	(void) spl0();

	while (1) {
		/* The scan asserts the wait itself before returning. */
		softclock_scan(1);
		thread_block();
	}
	/* NOTREACHED */
}

thread_t softclock_thread_ptr; /* used for debugging crash dumps */

/*
 * softclock_init() -- create the softclock kernel thread in first_task,
 * with softclock_thread() as its body, and record it in
 * softclock_thread_ptr.
 */
softclock_init()
{
	extern task_t first_task;

	softclock_thread_ptr = kernel_thread(first_task, softclock_thread);
}
#endif	/* SOFTCLOCK_THREAD */

/*
 * timeout() -- arrange that (*fun)(arg) is called in t/hz seconds.
 *
 * A t of zero or less is treated as 1.  An entry is taken from the free
 * list (the system panics if there is none) and linked into calltodo.
 * The list stores each entry's time as a delta from its predecessor, so
 * t is reduced by the positive deltas walked past and the successor's
 * delta is reduced by what is left.  Runs at splhigh with callout_lock
 * held.
 */
timeout(fun, arg, t)
	int (*fun)();
	caddr_t arg;
	register int t;
{
	register struct callout *prev, *next, *entry;
	register int saved_pri;
#if	MACH_CO_STATS
	register int pos = 0;
#endif

	saved_pri = splhigh();
#ifdef	KTRACE
	kern_trace(402,fun,arg,t);
#endif  /* KTRACE */
	simple_lock(&callout_lock);
	if (t <= 0)
		t = 1;

	/* Grab a free entry and fill it in. */
	entry = callfree;
	if (entry == NULL)
		panic("timeout table overflow");
	callfree = entry->c_next;
	entry->c_arg = arg;
	entry->c_func = fun;

	/* Find the insertion point, converting t into a delta as we go. */
	prev = &calltodo;
	while ((next = prev->c_next) != NULL && next->c_time < t) {
		if (next->c_time > 0)
			t -= next->c_time;
		MCO_STATS(pos++);
		prev = next;
	}

	/* Link in between prev and next; next becomes relative to us. */
	entry->c_time = t;
	entry->c_next = next;
	prev->c_next = entry;
	if (next != NULL)
		next->c_time -= t;

	MCO_STATS(callout_statistics.cos_num_timeout++);
	MCO_STATS(callout_statistics.cos_cum_timeout_size +=
		callout_statistics.cos_current_size);
	MCO_STATS(callout_statistics.cos_cum_timeout_pos += pos);
	MCO_STATS(callout_statistics.cos_current_size++);
	MCO_ASSERT(callout_statistics.cos_current_size > 0);
	MCO_ASSERT(callout_statistics_invariant());
	simple_unlock(&callout_lock);
	splx(saved_pri);
}

/*
 * untimeout() -- cancel a pending timeout.
 *
 * The first calltodo entry whose function and argument both match is
 * unlinked and returned to the free list; its remaining positive delta
 * is handed to its successor so later entries keep their expiry times.
 * Nothing happens if no entry matches.  Runs at splhigh with
 * callout_lock held.
 */
untimeout(fun, arg)
	int (*fun)();
	caddr_t arg;
{
	register struct callout *prev, *cur, *after;
	register int saved_pri;
#if	MACH_CO_STATS
	register int pos = 0;
#endif

#ifdef	KTRACE
	kern_trace(403,fun,arg,0);
#endif  /* KTRACE */
	saved_pri = splhigh();
	simple_lock(&callout_lock);
	MCO_STATS(callout_statistics.cos_num_untimeout++);

	prev = &calltodo;
	while ((cur = prev->c_next) != 0) {
		if (cur->c_func != fun || cur->c_arg != arg) {
			/* Not ours; keep walking. */
			MCO_STATS(pos++);
			prev = cur;
			continue;
		}

		/* Found it: pass the delta on, unlink, free. */
		after = cur->c_next;
		if (after && cur->c_time > 0)
			after->c_time += cur->c_time;
		prev->c_next = after;
		cur->c_next = callfree;
		callfree = cur;
		MCO_ASSERT(callout_statistics.cos_current_size > 0);
		MCO_STATS(callout_statistics.cos_num_untimeout_hit++);
		MCO_STATS(callout_statistics.cos_current_size--);
		MCO_STATS(callout_statistics.cos_cum_untimeout_pos += pos);
		break;
	}

	MCO_STATS(callout_statistics.cos_cum_untimeout_size +=
		callout_statistics.cos_current_size);
	MCO_ASSERT(callout_statistics_invariant());
	simple_unlock(&callout_lock);
	splx(saved_pri);
}

#if	NCPUS > 1
/*
 * untimeout_try() -- multiprocessor version of untimeout() that
 * reports whether it actually removed an entry.
 *
 * Returns TRUE if a calltodo entry matching both fun and arg was found
 * and unlinked, FALSE if there was no such entry.  The list handling is
 * the same as in untimeout().
 */
boolean_t
untimeout_try(fun, arg)
	int (*fun)();
	caddr_t arg;
{
	register struct callout *prev, *cur, *after;
	register int saved_pri;
	register boolean_t	removed = FALSE;
#if	MACH_CO_STATS
	register int pos = 0;
#endif

	saved_pri = splhigh();
	simple_lock(&callout_lock);
	MCO_STATS(callout_statistics.cos_num_untimeout++);

	prev = &calltodo;
	while ((cur = prev->c_next) != 0) {
		if (cur->c_func != fun || cur->c_arg != arg) {
			/* Not ours; keep walking. */
			MCO_STATS(pos++);
			prev = cur;
			continue;
		}

		/* Found it: pass the delta on, unlink, free. */
		after = cur->c_next;
		if (after && cur->c_time > 0)
			after->c_time += cur->c_time;
		prev->c_next = after;
		cur->c_next = callfree;
		callfree = cur;
		MCO_ASSERT(callout_statistics.cos_current_size > 0);
		MCO_STATS(callout_statistics.cos_num_untimeout_hit++);
		MCO_STATS(callout_statistics.cos_current_size--);
		MCO_STATS(callout_statistics.cos_cum_untimeout_pos += pos);
		removed = TRUE;
		break;
	}

	MCO_STATS(callout_statistics.cos_cum_untimeout_size +=
		callout_statistics.cos_current_size);
	MCO_ASSERT(callout_statistics_invariant());
	simple_unlock(&callout_lock);
	splx(saved_pri);
	return(removed);
}
#endif	/* NCPUS > 1 */

/*
 * Compute number of hz until specified time.
 * Used to compute third argument to timeout() from an
 * absolute time.
 */
hzto(tv)
	struct timeval *tv;
{
	register int ticks;
	register int sec, usec;

	int s = splhigh();
	TIME_READ_LOCK();
	sec = tv->tv_sec - time.tv_sec;
	usec = tv->tv_usec - time.tv_usec;
	TIME_READ_UNLOCK();
	splx(s);

	/*
	 * If number of seconds will fit in 32 bit arithmetic,
	 * then compute number of seconds to time and scale to
	 * ticks.  Otherwise round times greater than representible
	 * to maximum value.
	 *
	 * hz may range from 1 to 2147 without loss of (32-bit) precision.
	 * Maximum value for any timeout depends on hz.
	 * Must potentially correct for roundoff error in tick (1000000/hz)
	 * when passed as tv = { 0, tick }.
	 */
	if (sec + 1 <= INT_MAX / hz) {
		ticks = (sec * hz) + (((usec + tick - 1) * hz) / (1000*1000));
		if (ticks <= 0) ticks = 1;
	} else
		ticks = INT_MAX;
	return (ticks);
}

/*
 * profil() -- record the caller's profiling buffer description.
 *
 * args points to the four call arguments (buffer base, buffer size, pc
 * offset, pc scale), which are copied unchanged into u.u_prof.  p and
 * retval are not used.  Always returns 0.
 */
/* ARGSUSED */
profil(p, args, retval)
	struct proc *p;
	void *args;
	long *retval;
{
	register struct profil_args {
		short	*bufbase;
		unsigned long bufsize;
		unsigned long pcoffset;
		unsigned long pcscale;
	} *req;

	req = (struct profil_args *)args;

	u.u_prof.pr_base = req->bufbase;
	u.u_prof.pr_size = req->bufsize;
	u.u_prof.pr_off = req->pcoffset;
	u.u_prof.pr_scale = req->pcscale;
	return (0);
}
#if RT

/*
 * psx4_untimeout() -- cancel a pending timeout and report its tick
 * count.
 *
 * Works like untimeout(), but also sums the c_time deltas of every
 * entry walked, up to and including the matching one, and returns that
 * sum.
 *
 * NOTE(review): the sum also includes the non-positive deltas of
 * entries that are already due, and when nothing matches it is the sum
 * over the whole queue -- confirm that callers expect both.
 */
psx4_untimeout(fun, arg)
	int (*fun)();
	caddr_t arg;
{
	register struct callout *prev, *cur, *after;
	register int saved_pri;
	int total;
#if	MACH_CO_STATS
	register int pos = 0;
#endif

	saved_pri = splhigh();
	simple_lock(&callout_lock);
	MCO_STATS(callout_statistics.cos_num_untimeout++);

	total = 0;
	prev = &calltodo;
	while ((cur = prev->c_next) != 0) {
		/* Counted before the match test, so the match is included. */
		total += cur->c_time;
		if (cur->c_func != fun || cur->c_arg != arg) {
			MCO_STATS(pos++);
			prev = cur;
			continue;
		}

		/* Found it: pass the delta on, unlink, free. */
		after = cur->c_next;
		if (after && cur->c_time > 0)
			after->c_time += cur->c_time;
		prev->c_next = after;
		cur->c_next = callfree;
		callfree = cur;
		MCO_ASSERT(callout_statistics.cos_current_size > 0);
		MCO_STATS(callout_statistics.cos_num_untimeout_hit++);
		MCO_STATS(callout_statistics.cos_current_size--);
		MCO_STATS(callout_statistics.cos_cum_untimeout_pos += pos);
		break;
	}

	MCO_STATS(callout_statistics.cos_cum_untimeout_size +=
		callout_statistics.cos_current_size);
	MCO_ASSERT(callout_statistics_invariant());
	simple_unlock(&callout_lock);
	splx(saved_pri);

	return(total);
}
#endif

#if RT_TIMER
/*
 * P1003.4 psx4_adjust_callout() -- adjusts .4 timers in callout
 *	queue in response to settimeofday() calls.
 *
 * Environment:
 *  This function is called from setthetime() in bsd/kern_time.c.
 *  It is called at splhigh() with TIME_WRITE_LOCK locked.
 * 
 * Abstract:
 *  If we are in a POSIX timer environment, we want to scan the
 *  callout queue and adjust pending POSIX timers for the change
 *  in the system time. Relative timers retain their position
 *  in the callout queue but have their expected timeout times
 *  adjusted. Absolute timers are moved in the queue to reflect
 *  the change in the system time.
 *
 * Inputs:
 *  tv -- pointer to new timeval specified to settimeofday()
 *
 */

void
psx4_adjust_callout(tv)
struct timeval *tv;
{
	register struct callout abs, *p1, *p2, *a1 = &abs;
	register int pdelta = 0, adelta = 0;
	struct timeval tdelta;
	int tickdelta = 0;

	tdelta = time;
	timevalsub(&tdelta, tv);

	if (tdelta.tv_sec + 1 <= INT_MAX / hz) {
		tickdelta = (tdelta.tv_sec * hz) + (((tdelta.tv_usec + tick - 1) * hz) / (1000*1000));
	} else
		tickdelta = INT_MAX;

	abs.c_next = NULL;

	/*
	 * Step 1: Take absolute POSIX timers out of the callout queue,
	 * readjust tick values, and insert into a temporary list.
	 * POSIX timers with relative waits need to have their
	 * time values likewise adjusted.
	 */

	for (p1 = &calltodo; (p2 = p1->c_next) != NULL; ) {
		if (p2->c_func == psx4_tod_expire) { 	/* if a POSIX timer */
			if (((psx4_timer_t *)p2->c_arg)->psx4t_type & TIMER_ABSTIME) {
				/* This is an absolute wait */
				if (p2->c_next && p2->c_time > 0)
					p2->c_next->c_time += p2->c_time;
				p1->c_next = p2->c_next;
				a1->c_next = p2;
				a1 = p2;
				a1->c_time = (a1->c_time + pdelta) - adelta;
				adelta += a1->c_time;
				a1->c_next = NULL;
			}
			else {
				/* This is a relative wait -- adjust time. */
				PROC_TIMER_LOCK(((psx4_timer_t *)p2->c_arg)->psx4t_p_proc);
				timevalsub(&((psx4_timer_t *)p2->c_arg)->psx4t_timeval.it_value,
					&tdelta);
				PROC_TIMER_UNLOCK(((psx4_timer_t *)p2->c_arg)->psx4t_p_proc);
				pdelta += p2->c_time;
				p1 = p2;
			}
		}
		else {
			/* This isn't any kind of POSIX timer. Go to next. */
			pdelta += p2->c_time;
			p1 = p2;
		} 
	}

	/*
	 * Step 2: Adjust the first entry in the new absolute queue to
	 * reflect change in system time. This is done on a "tick" basis.
	 * If the time was set back, tickdelta is positive.
	 */

	if (abs.c_next != NULL)
		abs.c_next->c_time += tickdelta;

	/*
	 * Step 3: Insert adjusted entries back into callout queue, adjusting
	 * tick counts as necessary. Walk the calltodo list until it's
	 * empty, then see whether there's anything left in the absolute list.
	 */

	p1 = &calltodo;
	a1 = abs.c_next;

	while (((p2 = p1->c_next) != NULL) && a1) {

		/* 
		 * If less, insert and decrement next time by new entry.
		 * If greater, decrement new entry time by current time and
		 *   move to next entry. If there is no next entry, insert.
		 */

			if (a1->c_time < p2->c_time) {
				abs.c_next = a1->c_next; /* remove entry */
				p1->c_next = a1;
				a1->c_next = p2;
				p2->c_time -= a1->c_time;
				p1 = a1;
				a1 = abs.c_next;
			}
			else {
				a1->c_time -= p2->c_time;
				p1 = p2;
			}
	}

	if (a1 != NULL) {
		p1->c_next = a1;
	}

}
#endif /* RT_TIMER */
