/*
	uniqbib - uniq a bibliographic database to eliminate duplicates.

	syntax: uniqbib [ -v ] [ file ]

	-v means verbose: prints feedback.

	Only one file may be explicitly named (use cat f1 f2 ... | uniqbib
	for > 1 file).

	Strategy:
	Read input file, writing file position and summary information
	for each entry.  Sort on summary info.  For each set of entries
	having identical summary information, compare directly, and write
	out those that are different.  (For those entries that have unique
	summary info, the entry is unique, of course.)

	The summary information is trivial:  the sum of the characters in
	the text of the entry.  This is invariant with respect to order of
	fields (records that are identical except for order of fields get
	identical checksums this way).

	7 April 1988	Paul DuBois	dubois@rhesus.primate.wisc.edu

	Change History:

	08/10/88 Fixed failure to check for citations that are too long.
		Added -v flag.  Put better error messages in BERead
		(prints text of citation seen so far, input line number
		where error occurred).  Removed inability to uniq from
		a pipe or redirected input (citations are copied to
		holding file during reading phase 1).
*/

# include	<stdio.h>
# include	<signal.h>
# include	"bibinfo.h"


/* Number of citations read during pass one; reported by UniqBib. */
int	count = 0;
/* Non-zero when -v was given: progress messages go to stderr. */
int	verbose = 0;

/* Removes the temporary files; registered with setpanichook in main. */
int	cleanup ();
/* Handler installed in main for SIGINT, SIGHUP, SIGQUIT and SIGTERM. */
int	onintr ();

/* Name of the temporary file of "checksum position" lines. */
char	keyFile[BUFSIZ];
/*
	Name of the temporary copy of stdin.  Only filled in when no
	input file is named, so it remains an empty string otherwise.
*/
char	holdFile[BUFSIZ];


/*
	Program entry point.

	argv may hold an optional "-v" flag followed by at most one
	input file name.  With no file name the citations are read from
	stdin and copied to a temporary hold file so that they can be
	re-read by seeking during the second pass.

	Exits 0 on success (including empty input) and 1 when reading
	the citations fails.  Every other failure is reported through
	panic(), which is presumed not to return (the code following
	each call relies on that).
*/

main (argc, argv)
int	argc;
char	**argv;
{
BibEnt	*be;
FILE	*kf, *hf;
char	buf[BUFSIZ];
char	*cp;
long	sum;
int	len;
int	result;
int	empty = 1;
int	useHold = 0;
long	holdPos = 0;

/*
	Arrange for cleanup of temp files in
	the event of abnormal termination.
*/

	setpanichook (cleanup);
	signal (SIGINT, onintr);
	signal (SIGHUP, onintr);
	signal (SIGQUIT, onintr);
	signal (SIGTERM, onintr);

/*
	Process arguments and set up files.  If a file is named, use
	that for input, else read whatever's on stdin and arrange to
	hold a copy in a seekable temp file.

	NOTE(review): both temp file names are predictable
	(/tmp + process id) and are opened with plain fopen; consider
	mkstemp if this is ever run in a hostile shared /tmp.
*/

	--argc;
	++argv;
	if (argc > 0 && strcmp (*argv, "-v") == 0)
	{
		++verbose;
		--argc;
		++argv;
	}
	if (argc == 0)		/* reading from pipe or redirection */
	{
		sprintf (holdFile, "/tmp/ubh%05d", getpid ());
		if ((hf = fopen (holdFile, "w")) == NULL)
			panic ("Can't open temporary hold file");
		++useHold;
	}
	else if (argc == 1)
	{
		if (freopen (argv[0], "r", stdin) == NULL)
			panic ("Can't open: %s", argv[0]);
	}
	else
		panic ("Usage: uniqbib [-v] file");

	sprintf (keyFile, "/tmp/ub%05d", getpid ());
	if ((kf = fopen (keyFile, "w")) == NULL)
		panic ("Can't open temporary key file.");

/*
	Ready to make first pass through input.  Read each citation,
	compute the key and write out the key and file position of the
	citation to the key file.  If reading a pipe or redirected input,
	write out the citation to a holding file so it can be re-read.
	In that case, the file position must be fixed, since the position
	in the holding file may well be different (e.g., if there are
	multiple blank lines between citations in the original input).
*/

	if ((be = BEAlloc ()) == NULL)
		panic ("Out of memory.");
	if (verbose)
		fprintf (stderr, "Reading citations\n");
	while ((result = BERead (stdin, be)) > 0)
	{
		++count;
		if (verbose)
			fprintf (stderr, ".");
		empty = 0;
		sum = 0;
		cp = BEText (be);
		len = BELen (be);
		if (useHold)
		{
			BEWrite (hf, be);
			BEFilPos (be) = holdPos;
			holdPos += len + 1;	/* +1 for preceding newline */
		}
		/* key = sum of the characters, so field order is irrelevant */
		while (len-- > 0)
			sum += *cp++;
		/* %ld, not the obsolete %D: both values are longs */
		fprintf (kf, "%ld %ld\n", sum, BEFilPos (be));
	}
	/* a failed close means buffered data never reached the file */
	if (fclose (kf) == EOF)
		panic ("Can't write temporary key file.");
	if (useHold)	/* if using hold file, close and attach to stdin */
	{
		if (fclose (hf) == EOF)
			panic ("Can't write temporary hold file.");
		if (freopen (holdFile, "r", stdin) == NULL)
			panic ("Can't reopen hold file.");
	}
	if (result < 0 || empty)
	{
		cleanup ();
		/* a negative BERead result is a read failure, not success */
		exit (result < 0 ? 1 : 0);
	}
	if (verbose)
		fprintf (stderr, "\nPass 1 done (%d citations)\n", count);

/*
	Pass two.  Sort the keys so duplicates will cluster, and uniq
	them.
*/

	if (verbose)
		fprintf (stderr, "Sorting keys\n");
	sprintf (buf, "exec sort -o %s %s", keyFile, keyFile);
	/* UniqBib depends on sorted keys, so a failed sort is fatal */
	if (system (buf) != 0)
		panic ("Can't sort key file.");
	if (verbose)
		fprintf (stderr, "Sort done\n");
	if ((kf = fopen (keyFile, "r")) == NULL)
		panic ("Can't reopen key file.");
	UniqBib (kf);
	fclose (kf);
	if (verbose)
		fprintf (stderr, "\nDone\n");
	cleanup ();
	exit (0);
}



/*
	The ugly heart of the program.

	Read checksum/file-position pairs from f.  It's sorted on checksum,
	so that all groups of identical checksums will cluster.  The
	bib entries within each cluster may or *may not* be content-identical,
	so the algorithm hangs onto each entry until it either knows it's
	unique or that it's a dup, as follows:

	First read one line and hold onto it for a reference.  Then read rest
	of lines.  If checksum is different, flush the reference and restart
	with the next line after the reference as the new reference.  If the
	checksum is the same, then do a direct compare of the bib entries
	themselves.  If they're the same, skip the reference and restart with
	the next line after the reference as the new reference.  If they are
	different, just keep reading.  (Eventually one will be found that's
	either a different checksum or identical, or EOF will be reached, so
	the reference can be either flushed or skipped.)

	When restarting so that the reference is bounced to the next
	line in the summary file, check first whether the comparison
	is that line.  If so, don't bother to reread that line or to
	fetch the bibliographic entry itself.  Since except in perverse
	cases the comparison almost always becomes the next reference, this
	is a big win.

*/

/*
	f is the sorted key file, open for reading.  Each line holds a
	checksum and the seek position of the matching citation in the
	citation file attached to stdin.  Citations found to be unique
	are written to stdout; a summary line is written to stderr.
	No value is returned.
*/

UniqBib (f)
FILE	*f;
{
long	refPos, comPos;		/* reference and comparison seek positions */
long	refCkSum, comCkSum;	/* reference and comparison checksums */
int	getNextRef = 1;		/* non-zero if need to read ref sum, pos */
long	refOff = 0, comOff;	/* offset of line after reference, comparison */
int	nCompares = 0;		/* number of comparisons made with ref */
BibEnt	b1, *beRef = &b1;
BibEnt	b2, *beCom = &b2;
BibEnt	*beTmp;
int	nondups = 0;		/* number of citations written to stdout */

	if (verbose)
		fprintf (stderr, "Comparing keys\n");
	for (;;)
	{
		if (verbose)
			fprintf (stderr, ".");
		if (getNextRef)
		{
			getNextRef = 0;
			if (nCompares == 1)	/* make comparison next ref */
			{
				/*
					Exactly one comparison was made, so the
					comparison is the line right after the
					old reference: reuse it (swapping the two
					entry buffers) instead of re-reading.
				*/
				refCkSum = comCkSum;
				refPos = comPos;
				refOff = comOff;
				beTmp = beRef;
				beRef = beCom;
				beCom = beTmp;
			}
			else	/* seek to correct spot, read summary */
			{	/* info and entry from bib file */
				fseek (f, refOff, 0);	/* 0: from start of file */
				/* %ld, not the obsolete %D: both targets are longs */
				if (fscanf (f, "%ld %ld\n", &refCkSum, &refPos)
						!= 2)
					break;	/* no more refs in file */
				refOff = ftell (f);
				fseek (stdin, refPos, 0);
				/* main shows BERead can return < 0 as well as 0 */
				if (BERead (stdin, beRef) <= 0)
					panic ("Can't read reference entry.");
			}
			nCompares = 0;
		}
		if (fscanf (f, "%ld %ld\n", &comCkSum, &comPos) != 2)
		{
			/* nothing left to compare against: ref is unique */
			BEWrite (stdout, beRef);	/* flush reference */
			++nondups;
			++getNextRef;
			continue;
		}
		comOff = ftell (f);
		fseek (stdin, comPos, 0);
		if (BERead (stdin, beCom) <= 0)
			panic ("Can't read comparison entry.");
		++nCompares;
		if (refCkSum != comCkSum)		/* different - flush */
		{
			BEWrite (stdout, beRef);	/* current reference */
			++nondups;
		}
		else if (BECmp (beRef, beCom))		/* same sum, entries */
			continue;			/* differ: keep ref */
		/* ref was flushed, or is a duplicate and is dropped */
		++getNextRef;
	}
	fprintf (stderr, "%d citations (%d + %d duplicates)\n",
			count, nondups, count-nondups);
}


/*
	Remove the temporary key file and then the temporary hold file.
	Failures from unlink are deliberately ignored.
*/

cleanup ()
{
char	*tmpName[2];
int	i;

	tmpName[0] = keyFile;
	tmpName[1] = holdFile;
	for (i = 0; i < 2; i++)
		(void) unlink (tmpName[i]);
}

/*
	Signal handler: report the interruption through panic().
	Presumably panic runs the hook registered with setpanichook
	(cleanup) before terminating -- confirm against panic's source.
*/

onintr ()
{
	panic ("\nInterrupted...");
}
