

#!/bin/nawk -f -
#
# Accepts input lines of the form:
#
# job-name job-id { "start", "end" } timestamp
#
# So: ftp AA1234 start 829107402
#     telnet AB767 start 829107422
#     telnet AB767 end 829107477
#     ftp AA1234 end 829108382
#     etc..
#
# DO NOT USE a '+' or a '$' in the job-name or job-id fields. The timestamps
# should be in seconds (or any other simple integer). The end record for any
# job must come AFTER the start, but otherwise we don't care much about
# order since we sort.
#
# It divides the jobs up by duration, according to the specified granularity.
# That is, the granularity determines the length of a 'tick' in seconds, and
# jobs that take the same number of ticks are lumped together.
#
# The job-name and the # of ticks it took determine a job's 'class', which
# is stored as something like 'ftp+7' for 'ftp jobs that took 7 ticks'.
# The start times for all the jobs in the world are stored away, and then
# sorted, according to class. The intervals between all the start times are
# checked, and used to determine a max and a min value for the possible
# gaps between job-starts in a given class. This essentially simulates
# the input load by assuming the job start times in the input are
# UNIFORMLY distributed between the max and min interval. This is likely
# to be false. Most times, say, you (in real life) start a job once or
# twice a minute, but then there's a 2 hour gap by accident. This guy will
# start jobs up at random intervals of 1 minute to 2 hours, based on
# that input. Edit the output to whack the outliers, perhaps. Or feel
# free to beef up the statistical modelling in this thing or workload(1) ;)
#
# But do see the comments in the BEGIN section for some variations.
#
# Then a parallel {} specification is emitted, with a bunch of clauses
# each a 'random {}' specification for the job specified. The class name
# with the + edited out, for example class 'ftp+7' is run as command
# 'ftp 7', and the max and min times are, well, the max and min times to
# use for sequencing job starts.
#
# To generate an Nx load, edit the 'parallel {' at the beginning to
# 'parallel N {'
#
# To run the same synthetic job over and over again, wrap the whole file
# up in 'sequential N {' and '}' at the end.
#
BEGIN {
	granularity=30;
	seed="pid"; # seed for the 'random' specifications

	# For job classes with only one job, you can't compute any intervals
	# between jobs, so you just use these values as the max/min for
	# the random {} spec.

	maxtime=100;
	mintime=0;

	verbose=0;  # A debug flag, really

	# If you want to use average start times, set these up..
	# this computes the average interval between job starts, and
	# models the workload as uniformly distributed inter-job-start
	# gaps between [avg - delta , avg + delta]
	use_averages = 0;
	# If you're using averages, the average interval + or - this delta
	# will be the max and min times, not the actual observed max/min
	averages_delta = 3;

	# If you want to chop some range of the head and tail, set these
	# This model counts all the inter-job-start intervals for a job
	# class, and chops off the bottom (low_percentile)% and the top
	# (high_percentile)% from that list, and then models the workload
	# as uniformly distributed inter-job-start gaps between the bottom
	# and the top of the remaining list. Roughly, it chops off the
	# outlying observed inter-job-start intervals at the indicated
	# percentages.
	use_percentiles = 1;
	low_percentile = 45;
	high_percentile = 55;

	if(use_averages && use_percentiles) {
		print "ERROR: Can't use both averages and percentiles";
		# NOTE: in awk, 'exit' inside BEGIN skips the input but the
		# END rule still runs, so an (empty) 'parallel {}' spec is
		# still printed. Exit nonzero so callers can at least detect
		# the misconfiguration from the exit status.
		exit 1;
	}
}

# Start record: remember this job's start timestamp, keyed by
# "job-name+job-id" (fields 1 and 2), for matching against its end record.
$3 == "start" {
	start[$1 "+" $2] = $4;
}

# End record: compute the job's duration, bucket it into ticks by the
# configured granularity, and file the job's start time under its class
# ("job-name+ticks") via store_job(). Complains (as a '#' comment in the
# output) about ends without starts and about negative durations.
$3 == "end" {
	key = $1 "+" $2;
	# Use 'in' rather than '> 0' so a legitimate start timestamp of 0
	# is not mistaken for a missing start record.
	if(key in start) {
		duration = $4 - start[key];
		if(duration >= 0) {
			duration = int(duration / granularity);
			if(verbose)
				print "# Job " $1 "+" $2 " took " \
					duration " ticks.";
			class = $1 "+" duration;
			store_job(class, start[key]);
		} else {
			print "# Error at line " NR " Job " $1 " " $2 " took " \
				duration " seconds?";
		}
		# Consume the start record so a stray duplicate 'end' for
		# the same job-id is reported, not silently matched again.
		delete start[key];
	} else {
		print "# Error at line " NR " no start record for job " \
			$1 " " $2;
	}
}

#
# Emit the synthetic workload: one 'random {}' clause per job class,
# all wrapped in a single 'parallel {}' specification.
#
END {
	# Compute minimums[]/maximums[] (inter-job-start bounds) per class.
	do_min_max();

	print "parallel {";
	for(c in minimums) {
		# Turn the class name back into a command: 'ftp+7' -> 'ftp 7'.
		cmd = c;
		gsub(/\+/," ",cmd);
		if(next_id[c] == 2) {
			# next_id of 2 means exactly one job was stored, so
			# there were no intervals to measure; fall back to
			# the global mintime/maxtime defaults from BEGIN.
			print "\t# Only one job in class " c;
			print "\trandom 1 " seed " " mintime " " maxtime " {"
			print "\t\tcommand \"" cmd "\"";
			print "\t}";
			continue;
		}
		# We had multiple jobs in the class..
		# (next_id[c]-1 is the job count; see store_job.)
		print "\t# " next_id[c]-1 " jobs in class " c;
		print "\trandom " next_id[c]-1 " " seed " " minimums[c] " " \
			maximums[c] " {";
		print "\t\tcommand \"" cmd "\"";
		print "\t}"
	}
	print "}"
}

#
# File start time 's' under job class 'c'. Start times live in the
# global starttimes[] array under keys of the form 'class$index';
# next_id[c] is one past the highest index used so far, so a class
# holding N jobs has next_id[c] == N+1.
#
function store_job(c, s,   id)
{
	# A brand-new class starts at slot 1; otherwise take the next
	# free slot. Either way, advance the counter past it.
	id = (next_id[c] > 0) ? next_id[c] : 1;
	next_id[c] = id + 1;

	starttimes[c "$" id] = s;
	classes[c] = 1; # It exists
}


# We have divided all the jobs input into classes, and stored the
# start times for every job in every class. Walk the list of all
# the starttimes, looking for the min and the max intervals between
# start times.
#
# A class with N jobs (next_id[c] == N+1) yields N-1 intervals.
# Depending on the flags set in BEGIN, the per-class minimums[] and
# maximums[] come from the raw observed intervals, from the average
# interval +/- averages_delta, or from a percentile chop of the
# sorted interval list.

function do_min_max(c, i, mn, mx, total, lo, hi, interval, njobs)
{
	for(c in classes) {
		mn = 1000000;
		mx = 0;
		total = 0;
		njobs = next_id[c] - 1;
		quicksort(c, starttimes, 1, njobs);
		for(i = 1; i < njobs; i++) {
			interval = starttimes[c "$" (i+1)] - \
				starttimes[c "$" i];
			total += interval;
			if(interval < mn) {
				mn = interval;
			}
			if(interval > mx) {
				mx = interval;
			}

			if(use_percentiles) {
				intervals["X$" i] = interval;
			}
		}

		# Average inter-job-start interval. There are njobs-1
		# intervals. (The original 'int(total / next_id[c]-1)'
		# parsed as 'int(total/next_id[c]) - 1' because '-' binds
		# tighter than concatenation-free arithmetic grouping, and
		# it also divided by the wrong count.) Guard the single-job
		# case, which has no intervals, to avoid dividing by zero;
		# such classes use the global mintime/maxtime anyway.
		if(njobs > 1)
			averages[c] = int(total / (njobs - 1));
		else
			averages[c] = 0;
		if(use_averages) {
			minimums[c] = averages[c] - averages_delta;
			maximums[c] = averages[c] + averages_delta;
		} else if(use_percentiles && next_id[c] > 2) {
			if(verbose) {
				print "INTERVALS for class " c;
				for(i = 1; i < njobs; i++)
					print intervals["X$" i];
			}
			quicksort("X", intervals, 1, njobs - 1);
			if(verbose) {
				print "POST SORT INTERVALS for class " c;
				for(i = 1; i < njobs; i++)
					print intervals["X$" i];
			}

			# Chop off the low & the high ends. Clamp the
			# indices into [1, njobs-1] so small classes don't
			# index an unset (empty-string) interval entry.
			lo = int((low_percentile * njobs) / 100);
			hi = int((high_percentile * njobs) / 100);
			if(lo < 1)
				lo = 1;
			if(hi > njobs - 1)
				hi = njobs - 1;
			minimums[c] = intervals["X$" lo];
			maximums[c] = intervals["X$" hi];
		} else {
			minimums[c] = mn;
			maximums[c] = mx;
		}
	}
}


#
# Recursive quicksort over A[class "$" left] .. A[class "$" right],
# lifted from The AWK Book. A randomly chosen element is parked at the
# left end as the pivot, the range is partitioned around it, and the
# two halves are sorted recursively.
#
function quicksort(class, A, left, right,     last, pos) {
	# Ranges of zero or one element are already sorted.
	if(right <= left)
		return;
	# Pick a random pivot position in [left, right] and move it to 'left'.
	pos = left + int((right - left + 1)*rand());
	swap(class, A, left, pos);
	last = left;
	for(pos = left+1; pos <= right; pos++) {
		if(A[class "$" pos] < A[class "$" left]) {
			++last;
			swap(class, A, last, pos);
		}
	}
	# Drop the pivot into its final slot, then recurse on both sides.
	swap(class, A, left, last);
	quicksort(class, A, left, last-1);
	quicksort(class, A, last+1, right);
}

# Exchange elements i and j of array A within the given class prefix.
function swap(class, A, i, j,     tmp) {
	tmp = A[class "$" i];
	A[class "$" i] = A[class "$" j];
	A[class "$" j] = tmp;
}

