/*
 * The combine utility is a product of Harris, Inc. and is provided for
 * unrestricted use provided that this legend is included on all tape
 * media and as a part of the software program in whole or part.  Users
 * may copy, modify, license or distribute the combine utility without charge.
 * 
 * THE COMBINE UTILITY IS PROVIDED AS IS WITH NO WARRANTIES OF ANY KIND
 * INCLUDING THE WARRANTIES OF DESIGN, MERCHANTABILITY AND FITNESS FOR A
 * PARTICULAR PURPOSE, OR ARISING FROM A COURSE OF DEALING, USAGE OR TRADE
 * PRACTICE.
 * 
 * The combine utility is provided with no support and without any obligation
 * on the part of Harris, Inc. to assist in its use, correction,
 * modification or enhancement.
 * 
 * HARRIS, INC. SHALL HAVE NO LIABILITY WITH RESPECT TO THE
 * INFRINGEMENT OF COPYRIGHTS, TRADE SECRETS OR ANY PATENTS BY THE COMBINE
 * UTILITY OR ANY PART THEREOF.
 * 
 * In no event will Harris, Inc. be liable for any lost revenue
 * or profits or other special, indirect and consequential damages, even if
 * Harris has been advised of the possibility of such damages.
 * 
 * Harris Computer Systems Division
 * 2101 W Cypress Creek Rd
 * Fort Lauderdale, Florida 33309
 */
#include <stdio.h>
#include <ctype.h>
#include "util.h"
#include "combine.h"
/*
 * pass1: Perform the file read and table build pass
 *
 * This routine reads all of the records of all of the input files
 * into the cache, symbol table, and record array.
 *
 * This routine alternately reads one record from each file. Assuming
 * that the files have similar contents, this technique improves the
 * chance that a record from the second and third files will be in
 * the cache.
 *
 * Return value:
 *	This procedure has no return value.
 */

void pass1 () {

	int	i;		/* Misc. variable */

	int	index;		/* Index into record array */

	int	files_left;	/* number of files left to do */

	bool file_done[MAX_FILE_COUNT];/* TRUE if EOT reached on file */

	int	status;		/* Misc. status variable */



       /*
	* Initialize completion parameters.
	*/

	files_left = file_count;
	for (i = 0; i < file_count; ++i) {
		file_done[i] = FALSE;
	}


	/*
	 * For each iteration of the file read the nth record of the each file.
	 */

	for (index = BEGIN_INDEX + 1; files_left != 0; ++index) {

		/*
		 * Handle each file.
		 */

		for (i = 0; i < file_count; ++i) {

			if (!file_done[i]) {

				if (index + 1 >= files[i].record_array_alloc) {
					files[i].record_array_alloc += RA_INCR;
					files[i].record = (record_type *)
						realloc(files[i].record,
						files[i].record_array_alloc * sizeof (record_type));
					if ( files[i].record == 0 ){
						error( "out of memory -- files are way too big");
					}
				}

				status = pass1_read_record (&files[i], index);

				if (status < 0) {
					file_done[i] = TRUE;
					files[i].record_array_size = index + 1;
					--files_left;
					if (files_left == 0) {
						break;
					}
				}

			}

		}

	}

}
/*
 * pass1_read_record: read a record for pass 1
 *
 * This routine reads a record into the cache, symbol table, and record
 * array.
 * For each record read, a unique hash value is computed.
 * That hash value is an index into the symbol table.
 *
 * Return value:
 *	 0:   record read as requested.
 *	-1:   end of the file was reached.
 */

int	pass1_read_record (file_ptr, index)
	file_type * file_ptr;	/* input -- Description of the file
				   to read record from. */

	int	index;		/* input -- Record number of the record
				   currently being read. */

{

	static	cache_entry_type * cache_ptr = 0;
				/* Pointer to the cache entry which is used to read into. This cache buffer is linked onto the cache list if this is the first time the record is read. */

	register int   *end_int_ptr;
				/* Pointer to last integer in hashing algorithm */

	cache_entry_type * found_cache_ptr;
				/* Pointer to the cache entry found by hashing */

	register int   *found_int_ptr;/* co-erced pointer to found record */

	int	hash_code;	/* index into symbol table */

	int	i;		/* Misc. variable */

	register int   *int_ptr;/* Co-erced pointer to read record */

	char	mbuffer[LINE_LENGTH];/* Misc. buffer */

	int	probe_count;	/* Number a rehashes attempted. */

	int	quotient;	/* quotient of sum/table size.
				   This value is used as the increment when
				   re-hashing into the symbol table */

	cache_entry_type * reread_cache_ptr;
				/* Pointer to the cache buffer used for cache
				   re-read operation. */

	file_type * reread_file_ptr;
				/* Pointer to the file to read from on a
				   cache re-read operation. */

	int	reread_index;	/* Record number to read on a cache re-read operation. */

	rfa_type rfa;		/* RFA of record read */

	int	status;		/* Misc. status variable */

	register int	sum;	/* Sum of all bytes in a record */


	/*
	 * If we don't have a cache buffer laying around,
	 *    delink one from end of list.
	 */

	if (cache_ptr == 0) {
		deq_tail_dll (cache_head_ptr, cache_tail_ptr, cache_ptr,
				cache_next_ptr, cache_prev_ptr);
		if (cache_ptr -> hash_code != HASH_FREE_ENTRY) {
			sym_tab_cache_ptr[cache_ptr -> hash_code] =
				(cache_entry_type *) CACHE_NOT_IN_CACHE;
		}
	}

	/*
	 * Read the next record into our local cache entry.
	 */

	rfa = ftell(file_ptr->seq_fd);
	if (p1_debug) {
		printf ("index: %d hash_col: %d cache_miss: %d",
				index, hash_collisions, cache_miss);
	}

	status = read_into_cache( file_ptr -> seq_fd, rfa, cache_ptr );
	if (status < 0) {
		return (-1);
	}

	/*
	 * Pad to end of word with blanks, this allows hashing and comparison
	 * algorithms to be word oriented rather than byte oriented.
	 *
	 * Compress the record based on input parameters.
	 */

	if (cache_ptr -> record_length >= 0) {

		if (compress_records) {
			pass1_record_compress (cache_ptr -> recordp,
					&cache_ptr -> record_length);
		}

		/* Pad to the end of the word with blanks */
		for (i = 0; i < sizeof (int) - 1; ++i) {
			cache_ptr->recordp[cache_ptr->record_length + i] = ' ';
		}
	}

	/*
	 * Compute hash code for the record.
	 *
	 * The value is shifted on each iteration to ensure that the
	 * magnitude of the sum is large enough. Otherwise when the sum is
	 * divided by the size of the symbol table, the remainder will be
	 * clustered toward the beginning of the table.
	 */

	sum = 0;
	int_ptr = (int *) cache_ptr -> recordp;
	end_int_ptr = int_ptr +
		((cache_ptr->record_length + sizeof (int) - 1) / sizeof (int));
	for (; int_ptr < end_int_ptr; int_ptr++) {

	/*	if (p1_debug) {
			printf ("sum: %x ", sum);
		}*/
	/*
	 * The next statement does a left rotate of 7. The add operation is
	 * used instead of a logical-or operation since the right shift
	 * is an arithmetic shift which propogates the sign bit. An
	 * or operation into 25 bits of propogated sign bit loses a lot
	 * of significance.
	 */
		sum = (sum >> (sizeof (int) * 8 - 7)) + (sum << 7);/* left rotate 7 */
	/*	if (p1_debug) {
			printf ("sum<<7: %x ", sum);
		} */
		sum += *int_ptr;
	/*	if (p1_debug) {
			printf ("sum:%x value:%X\n", sum, *int_ptr);
		} */
	}
	sum = abs (sum);
	if (p1_debug) {
		printf ("sum: %d ", sum);
	}

	hash_code = sum % sym_tab_size;
	quotient = sum / sym_tab_size;
	if ((quotient % sym_tab_size) == 0) {
		quotient = 1;
	}

	/*
	 * Loop until a unique hash code is found.
	 */

	for (probe_count = 0; probe_count < sym_tab_size;
			++probe_count, ++hash_collisions) {
		do {
			hash_code = (hash_code + quotient) % sym_tab_size;
		} while (hash_code == 0);/* ensure sym tab loc 0 is never used */

		/*
		 * if symbol table entry is used but record is not is cache,
		 *     read record into cache by flushing the least recently
		 *	entry entry from the cache.
		 */

		if (p1_debug) {
			printf ("hash_code: %d\n", hash_code);
		}

		if ((int) sym_tab_cache_ptr[hash_code] == CACHE_NOT_IN_CACHE) {

			cache_miss++;
			if (p1_debug) {
				printf ("cache_miss\n");
			}

			deq_tail_dll (cache_head_ptr, cache_tail_ptr, reread_cache_ptr,
					cache_next_ptr, cache_prev_ptr);

			if (reread_cache_ptr -> hash_code != HASH_FREE_ENTRY) {
				sym_tab_cache_ptr[reread_cache_ptr -> hash_code] =
					(cache_entry_type *) CACHE_NOT_IN_CACHE;
			}

			for (i = 0; i < file_count; ++i) {
				if (files[i].sym_tab_index[hash_code] != 0) {
					reread_file_ptr = &files[i];
					break;
				}
			}

			if (i == file_count) {
				error ("Consistency error on rehash read");
			}

			reread_index = abs (reread_file_ptr -> sym_tab_index[hash_code]);

			reread_into_cache( reread_file_ptr, reread_index,
				reread_cache_ptr);

			if (reread_cache_ptr -> record_length >= 0) {

				if (compress_records) {
					pass1_record_compress (
						reread_cache_ptr -> recordp,
						&reread_cache_ptr -> record_length);
				}

				for (i = 0; i < sizeof (int) - 1; ++i) {
					reread_cache_ptr->
					recordp[reread_cache_ptr->record_length + i] = ' ';
				}
			}

			if (p1_debug) {
				printf ("%d %s\n",
					reread_cache_ptr -> record_length,
					reread_cache_ptr -> recordp);
			}

			sym_tab_cache_ptr[hash_code] = reread_cache_ptr;
			reread_cache_ptr -> hash_code = hash_code;
			enq_head_dll (cache_head_ptr, cache_tail_ptr,
					reread_cache_ptr,
					cache_next_ptr, cache_prev_ptr);
		}

		/*
		 * if this is the first time this hash code has been used,
		 *     point the symbol table to the cache entry,
		 *     insert the cache entry in the front of the cache.
		 *     break out of loop.
		 */

		if (sym_tab_cache_ptr[hash_code] == CACHE_FREE_ENTRY) {
			if (p1_debug) {
				printf ("cache_free_entry\n");
			}
			sym_tab_cache_ptr[hash_code] = cache_ptr;
			cache_ptr -> hash_code = hash_code;
			enq_head_dll (cache_head_ptr, cache_tail_ptr, cache_ptr,
					cache_next_ptr, cache_prev_ptr);
			cache_ptr = 0;
			break;

		/*
		 * if this hash code is used and the record is in cache,
		 *     Compare our record with the one in the cache.
		 *     if the records are the same,
		 *	  move cache entry to front of list,
		 *	  break out of the loop.
		 */

		} else {
			found_cache_ptr = sym_tab_cache_ptr[hash_code];
			if (p1_debug) {
				printf ("len1: %d len2: %d\n",
					cache_ptr -> record_length,
					found_cache_ptr -> record_length);
			}
			if (cache_ptr->record_length ==
			    found_cache_ptr->record_length) {

				found_int_ptr = (int *) found_cache_ptr->recordp;
				int_ptr = (int *) cache_ptr -> recordp;
				end_int_ptr = int_ptr +
					((cache_ptr -> record_length +
					sizeof (int) - 1) / sizeof (int));
				for (; int_ptr < end_int_ptr; int_ptr++) {
				/* printf( "v1: %x v2:%x\n", *int_ptr, *found_int_ptr ) ; */
					if (*int_ptr != *found_int_ptr) {
						break;
					}
					found_int_ptr++;
				}

				if (int_ptr >= end_int_ptr) {
					rem_dll (cache_head_ptr,
						 cache_tail_ptr,
						 found_cache_ptr,
						 cache_next_ptr,
						 cache_prev_ptr);
					enq_head_dll (cache_head_ptr,
						      cache_tail_ptr,
						      found_cache_ptr,
						      cache_next_ptr,
						      cache_prev_ptr);
					break;
				}

			}
		}

	}

	if (probe_count == sym_tab_size) {
		error ("Symbol table overflow");
	}

	/*
	 * If the record array for the file has overflowed,
	 *     fatal error.
	 */

	if (index + 1 >= file_ptr -> record_array_alloc) {
		error ("record array size computed wrong");
	}

	 /*
	  * Install record into record array.
	  */

	file_ptr -> record[index].rfa = rfa;
	file_ptr -> record[index].value[0] = -hash_code;
	file_ptr -> record[index].value[1] = -hash_code;

	/*
	 * Install line into symbol table.
	 */

	if (file_ptr -> sym_tab_index[hash_code] == 0) {
		file_ptr -> sym_tab_index[hash_code] = index;
	}
	else {
		file_ptr -> sym_tab_index[hash_code] = -index;
	}

	return (0);

}
/*
 * pass1_record_compress: compress out insignificant bytes in a record.
 *
 * This routine handles options which compares only a portion of the
 * record. For the -b option, all occurrences of multiple blanks are
 * compressed into a single blank. For the -c option, all columns not
 * specified are removed.
 *
 * Return value:
 *	This procedure has no return value.
 */

void pass1_record_compress (record_ptr, record_len_ptr)
	char   *record_ptr;	/* input/output */
				/* Record to be compressed in place. */

	int    *record_len_ptr; /* input/output */
				/* Length of record. */

{

	int	i;		/* Misc. variable */

	int	j;		/* Misc. variable */

	int	k;		/* Misc. variable */

	bool space;		/* TRUE if currently doing a space or tab sequence */


	/*
	 * Handle column ranges:
	 */
	if (column_count != 0) {
		k = 0;
		for (i = 0; i < column_count; ++i) {

			for (j = first_column[i];
				j <= last_column[i] && j < *record_len_ptr;
				++j) {

				record_ptr[k] = record_ptr[j];
				++k;
			}

		}

		*record_len_ptr = k;

	}

	/*
	 * Handle blank compression:
	 *
	 * Blank compression converts all groups of blanks and horizontal
	 * tabs to a single blank.
	 */

	if (blank_compress) {
		k = 0;
		space = FALSE;

		for (i = 0; i < *record_len_ptr; ++i) {

			if (record_ptr[i] == ' ' || record_ptr[i] == '\t') {
				space = TRUE;
			} else {
				if (space) {
					record_ptr[k++] = ' ';
					space = FALSE;
				}
				record_ptr[k++] = record_ptr[i];
			}

		}

		if (space) {
			record_ptr[k++] = ' ';
		}

		*record_len_ptr = k;

	}

	/*
	 * Handle blank removal:
	 *
	 * Blank removal removes all blanks and horizontal tabs.
	 */

	if (blank_remove) {
		k = 0;
		space = FALSE;

		for (i = 0; i < *record_len_ptr; ++i) {

			if (record_ptr[i] == ' ' || record_ptr[i] == '\t') {
				space = TRUE;
			} else {
				if (space) {
					space = FALSE;
				}
				record_ptr[k++] = record_ptr[i];
			}

		}

		*record_len_ptr = k;

	}

}
