/*
	B-SORT		A sorting program using the B-tree algorithm
	VRS 1.12   2-AUG-82		J.Riley

	This is a self-contained file except for the standard included
	definitions and is part of the  larger development of a message-
	retrieval system. Since the algorithms involved are well known
	it is felt that putting this program in public domain would serve
	a useful purpose, namely provide some examples of some programming
	techniques using the 'C' language. The compiler used is the BDS
	vs 1.46 and use is made of at least one special feature of this
	compiler(see 'printf' function).

	Nominally this is a sorting program and can certainly be used for
	that purpose. However, since the main benefit of using a B-tree is
	when using disk-based structures, it is quite possible that this
	will not be your hot-dog sort utility. It is relatively fast though
	and I would be interested in any comparisons anyone makes. A later
	effort is planned to build a virtual memory support package which
	will then allow the 'nodes' of the b-tree to be stored as disk
	sectors and at that point this program will allow sorting of
	arbitrarily large text files. Again, that is still not the main
	objective for it, information retrieval is.

	An introduction to the B-tree concept is not attempted here. The
	primary reference and, in fact, the bareface source of this program
	is N. Wirth's wonderful book "Algorithms+Data Structures=Programs".
	I passed this book up several times in the past, but when I was 
	looking for a treatment of this subject, I found that it had a very
	nice balance between explanation and example(the best way I learn)
	and since 'C' and PASCAL are 'kissin cousins', it was 'straight
	forward' to do the transcribing. So go to god and read section
	4.51 if you want to understand what is going on here. Also you can
	get a little contrast of the styles which result from the different
	features of the two languages(especially w re pointer variables).

	Things to note in this program which you may not already be
	familiar with are the following:

		1) A nested data structure(PAGE) is used. This is possible
		   in PASCAL and PL/I too and makes life a lot more pleasant.
		   Try doing the same thing in BASIC or FORTRAN, what a
		   mess! In a sense it would be easier in assembler.

		2) Note that the tree structure used is a balanced one
		   so that no single branch gets long at the expense of
		   others. To see the depth level of the tree, turn on
		   the SPRINT parameter and note the first column of the
		   output on any sort. There is a trade-off here, though
		   in that it takes longer to build the tree than if it
		   were just a binary tree(not an AVL though). Thats why
		   b-trees are best for retrieval rather than simple
		   sorts.

		3) The logic uses a rather interesting recursive structure
		   in which 'emerging' items are handed back through higher
		   levels until perhaps to the root of the tree. See the
		   variable called 'u' in function 'search'.

		4) The parameter KEYPTR allows the option of including the
		   string storage area in the nodes themselves as opposed
		   to allocated areas. The latter is faster since no move-
		   ment of strings is necessary after they have been given
		   initial allocations. However, for disk-based use the
		   strings(keys) would need to be in the nodes. To see the
		   difference in performance(big!) just undefine(comment
		   out) this parameter.

		5) One peculiarity of the 'C' language came up in an earlier
		   version of B-sort, it would not sort itself! Clue, this
		   had to do with the way that 'printf' works. See if you
		   can figure out how this happened and how it was fixed.

	I hope this program is useful for some in learning more about the
	'C' language and an interesting algorithm(the key to better software).
	If you make any improvements give me a copy. If you mutilate it, keep
	it to yourself.
				Jack Riley (303) 499-9169 RCPM, Boulder, CO.
*/

#include "bdscio.h"	/* version with ALLOC_ON false*/

#DEFINE N		2  /* half-size of b-tree node */

/* options for customizing the program */

/* #DEFINE SPRINT */ 		/* provides statistics on keys in output */
#DEFINE KEYPTR		/* uses pointer references to strings instead of
			   arrays in the b-tree nodes */
/* #DEFINE DIAGNOSE */	/* turns on voluminous trace information on actions
			   taken by program logic.. perhaps instructive */
/* special assigned values */

#DEFINE alloc		sbrk /* only need simple allocation method */
#DEFINE KEYSIZE		80
#DEFINE MAX_LEN		20
#DEFINE MAXPRINT	3000

/* dependent parameters */

#DEFINE NN		N+N
#DEFINE NM1		N-1
#DEFINE NM2		N-2
#DEFINE	NP1		N+1

/* structure definitions */

#DEFINE ITEM 		struct sitem
#DEFINE PAGE 		struct spage

ITEM {
#ifndef KEYPTR
	CHAR KEY[KEYSIZE];
#endif
#ifdef KEYPTR
	CHAR *KEY;
#endif
	PAGE *P;
#ifdef SPRINT
	UNSIGNED COUNT;
#endif
	} oneitem;

PAGE {
	UNSIGNED M;
	PAGE *P0;
	ITEM E[NN];
	} onepage;

/* external variables */

PAGE *root,*q;
ITEM g;
FILE infile,l_buffer;
CHAR infilnam[MAX_LEN], instrg[MAXLINE], o_flg;
INT sizitem,sizpage, maxcount, nkeys, tokcount;

/* beginning of programs */

use_err() /* Usage message: */

       { printfc("\nERROR: Invalid parameter specification\n\n");
       printfc("Usage: b-sort(vr 1.12) <filename> <flag(s)>\n\n");
       printfc("       -o <filename> = Write output to named file\n");
       exit(0); }

main(argc,argv)
int argc;
char **argv;
{
	char *arg;

	o_flg=FALSE;		/* output file flag */

	if (argc < 2) use_err();

	strcpy(infilnam,*++argv);

	if( fopen(infilnam,infile) == ERROR )
		{ printfc("\nERROR: Unable to open input file: %s\n",infilnam);
		  EXIT(0); }

	if(--argc > 0)
	   if(*(arg=*++argv) == '-')
		if(tolower(*++arg) == 'o')
			{ o_flg++;
			if(--argc == 0)
				{printfc("\nneed output file name");use_err();}
			if(fcreat(*++argv,l_buffer) == ERROR)
			 {printfc("ERROR: Unable to create output file - %s\n",
						*argv); exit(0);}
			}

	root=NULL; sizitem=sizeof(oneitem); sizpage=sizeof(onepage);
	tokcount=nkeys=0;

#ifdef DIAGNOSE
	printf("\n&root=%x,g=%x,sizi=%d,sizp=%d",&root,g,sizitem,sizpage);
#endif

	while( fgets(instrg,infile) )
		{
		if( trim(instrg) <= 0 ) continue;

		instrg[KEYSIZE-1]='\0';

#ifdef DIAGNOSE
		printf("\n\nsearch key= %s",instrg);
#endif
		if( search(instrg,root,g) )
			{ q=root; 
			 if( (root=alloc(sizpage)) == ERROR)
				{ printfc("\nERROR unable to allocate page");
				  EXIT(0); }
#ifdef DIAGNOSE
printf("  root=%x, q=%x",root,q);
#endif

			root->M=1; root->P0=q; movmem(g, root->E[0] , sizitem);

			}
		}

	printfc("\nEnd of input\n");

	printfc("\nnumber of unique keys=%d, total key count=%d\n",
					nkeys,tokcount);
	if(!o_flg) pause();

	maxcount=MAXPRINT; printtree(root,1);

	printf("\n");
	if(o_flg)
		{ putc(CPMEOF,l_buffer); fflush(l_buffer); fclose(l_buffer); }
}
CHAR search(x,a,v)
CHAR *x;
PAGE *a;
ITEM *v;
{
	INT i,k,l,r,cmp; PAGE *q,*b; ITEM u; CHAR *t;

/*	Search for key x on page a */

	if(a==NULL) 		/* ITEM with key x is not in tree */
		{ 
		++tokcount; ++nkeys; defkey(&v->KEY,x);

#ifdef DIAGNOSE
printf("\n             a ==  null v(=%x)->KEY=%s",v,v->KEY);
#endif

#ifdef SPRINT
		v->COUNT=1;
#endif
		v->P=NULL;  return (TRUE) ; /* TRUE means not found */
		}
	else
		{ l=0; r=a->M-1;  /* binary array search */
		do {
		    k=(l+r)/2;
		    cmp= strcmp(x,t=(a->E[k].KEY));

#ifdef DIAGNOSE
printf("\ncmp=%d,r=%d,l=%d,a(=%x)->P0=%x/E[k=%d].P=%x/E[k].KEY=%x=%s",
			cmp,r,l,a,a->P0,k,a->E[k].P,t,t );
#endif

		    if( cmp <= 0) r=k-1;
		    if( cmp >= 0) l=k+1;

		   } while ( r >= l );

		if( cmp == 0 ) /* found it, bump counter */
			{ ++tokcount;
#ifdef SPRINT
			  ++a->E[k].COUNT;
#endif
			  return(FALSE); }

		else    /* test if item is not on this page   */
			{
			q = ( r < 0 ) ? a->P0 : a->E[r].P;

			if( !search(x,q,u) ) return(FALSE);
			}
		}

/* ---- insert an item */

	if (a->M < NN)
		{ /* page not full yet, add to it. 'Bump' items from r+1 to
							M-1 */
		movmem( a->E[r+1] , a->E[r+2] , sizitem*((a->M++)-r-1) );
		movmem( u , a->E[r+1]  , sizitem );

		return(FALSE);
		}
	else
		{ /* page full, split it and push center item upward in tree */
		if( (b = alloc(sizpage)) == ERROR )
			{ printf("\nOut of memory"); EXIT(0) ; }

#ifdef DIAGNOSE
printf("\n\n ******  new node at %x",b);
#endif

		if ( r <= NM1 ) /* put new item in old page */
			{
			if ( r == NM1 )  movmem( u , v , sizitem );
			else
				{ /* 'bump' down items from r+2 to N-1 */

				movmem( a->E[NM1] , v , sizitem );
				movmem( a->E[r+1] , a->E[r+2] ,
							sizitem*(NM2-r) );
				movmem( u , a->E[r+1]  , sizitem );
				}
			movmem( a->E[N] , b->E[0] , sizitem*N );
			}
		else
			{/* move upper N items and new item to new page */

			movmem( a->E[N] , v , sizitem ) ;

			if( (r -= N) > 0 )	
				movmem( a->E[NP1] , b->E[0] , sizitem*r );

			movmem( u , b->E[r] , sizitem );

			if( (i = NM1-r) > 0 )
				movmem( a->E[NP1+r] , b->E[r+1], sizitem * i );
			}

		a->M = b->M = N ; b->P0 = v->P; v->P = b ;
		}

	return (TRUE);
}
printtree(p,l)
PAGE *p;
INT l;
{
	INT i,j;  CHAR *t;

	if(maxcount <= 0) return;

	if ( p != NULL )
		{

		printtree(p->P0, l+1 );

		for ( i=0; i <= (j=p->M-1) ; ++i )
			{ --maxcount;
			printf("\n");
#ifdef SPRINT
			printf(" %d %d ",l,p->E[i].COUNT );
#endif
			prints(p->E[i].KEY);

			printtree(p->E[i].P,l+1); 
			}
		}
}
/* defkey allows use of the KEYPTR parameter to specify whether to allocate
   strings to the node which refers to them or to their own locations  */

defkey(a,b)
char **a,*b;
{
#ifdef KEYPTR
	if ( (*a=alloc(strlen(b)+1)) == ERROR )
		{printfc("\ninsufficient string storage in defkey\n");EXIT(0);}
	strcpy(*a,b);
#endif
#ifndef KEYPTR
	strcpy(a,b);
#endif
}

/* general purpose support programs */

trim(strg)
char *strg;
{
	INT l;

	l=strlen(strg);
	while ( strg[l] <= ' ' )
		{ if( l <= 0 ) break;  strg[l--]='\0'; }
	return(l);
}
prints(str)
char *str;
{
	if(o_flg)
		fputs(str,l_buffer);
	else
		puts(str);		/* and print out the line	 */
}
/* Note: The following two functions were obtained from the BDS STDLIB2.C
	 file and may be quite dependent on this compiler since the 'C'
	 language does specify where arguments are to be found. Their
	 purpose is to provide output to one of two special devices based
	 on the 'o-flg' switch given at run-time.  */

printfc(format)
char *format;
{
	char line[MAXLINE];
	_spr(line,&format);	/* use "_spr" to form the output */

	puts(line);		/* and print out the line	 */
}
printf(format)
char *format;
{
	char line[MAXLINE];
	_spr(line,&format);	/* use "_spr" to form the output */

	prints(line);
}
/*  a special version of fputs to surpress those nasty double '\r's that
    sometimes get in */

fputs(s,iobuf)
char *s;
struct _buf *iobuf;
{
	char c;
	while (c = *s++) {
		if (c == '\r' || c == '\0') return;

		if (c == '\n') putc('\r',iobuf);
		if (putc(c,iobuf) == ERROR) return ERROR;
	}
	return OK;
}
