 /* 
  * UAE - The Un*x Amiga Emulator
  * 
  * Common code needed by all the various graphics systems.
  * 
  * (c) 1996 Bernd Schmidt, Ed Hanway, Samuel Devulder
  */

#include "sysconfig.h"
#include "sysdeps.h"

#include "config.h"
#include "options.h"
#include "memory.h"
#include "custom.h"
#include "newcpu.h"
#include "keyboard.h"
#include "xwin.h"
#include "keybuf.h"

#define	RED 	0
#define	GRN	1
#define	BLU	2

/*
 * dither matrix
 */
static UBYTE dither[4][4] =
{
  {0,8,2,10},
  {12,4,14,6},
  {3,11,1,9},
  {14 /* 15 */,7,13,5}
};

unsigned long doMask(int p, int bits, int shift)
{
    /* p is a value from 0 to 15 (Amiga color value)
     * scale to 0..255, shift to align msb with mask, and apply mask */

    unsigned long val = p * 0x11111111UL;
    val >>= (32 - bits);
    val <<= shift;

    return val;
}

void alloc_colors64k(int rw, int gw, int bw, int rs, int gs, int bs)
{
    int i;
    for(i=0; i<4096; i++) {
	int r = i >> 8;
	int g = (i >> 4) & 0xF;
	int b = i & 0xF;
	xcolors[i] = doMask(r, rw, rs) | doMask(g, gw, gs) | doMask(b, bw, bs);
    }
}

static int allocated[4096];

void alloc_colors256(allocfunc_type allocfunc)
{
    int nb_cols[3]; /* r,g,b */
    int maxcol = 256;
    int i,j,k,l,t;
    int diff[4096];

    xcolnr *map;
    
    map = (xcolnr *)malloc(sizeof(xcolnr) * maxcol);
    
    /*
     * compute #cols per components
     */
    for(i = 1; i*i*i <= maxcol; ++i)
	;
    --i;

    nb_cols[RED] = i;
    nb_cols[GRN] = i;
    nb_cols[BLU] = i;

    /*
     * set the colormap
     */
    l=0;
    for(i = 0; i < nb_cols[RED]; ++i) {
	int r = (i * 15 + (nb_cols[RED] - 1)/2) / (nb_cols[RED] - 1);
	for(j = 0; j < nb_cols[GRN]; ++j) {
	    int g = (j * 15 + (nb_cols[GRN] - 1)/2) / (nb_cols[GRN] - 1);
	    for(k = 0; k < nb_cols[BLU]; ++k) { 
		int b = (k * 15 + (nb_cols[BLU] - 1)/2) / (nb_cols[BLU] - 1);
		int result;
		result = allocfunc(r, g, b, map + l);
		l++;
	    }
	}
    }
/*    printf("%d color(s) lost\n",maxcol - l);*/

    /*
     * for each component compute the mapping
     */
    {
	int diffr, diffg, diffb, maxdiff = 0, won = 0, lost;
	int r, d = 8;
	for(r=0; r<16; ++r) {
	    int cr, g, q;
	  
	    k  = nb_cols[RED]-1;
	    cr = (r * k) / 15;
	    q  = (r * k) % 15;
	    if(q > d && cr < k) ++cr;
	    diffr = abs(cr*k-r);
	    for(g=0; g<16; ++g) {
		int cg, b;
	      
		k  = nb_cols[GRN]-1;
		cg = (g * k) / 15;
		q  = (g * k) % 15;
		if(q > d && cg < k) ++cg;
		diffg = abs(cg*k-g);
		for(b=0; b<16; ++b) {
		    int cb, rgb = (r<<8) | (g<<4) | b;

		    k  = nb_cols[BLU]-1;
		    cb = (b * k) / 15;
		    q  = (b * k) % 15;
		    if(q > d && cb < k) ++cb;
		    diffb = abs(cb*k-b);
		    xcolors[rgb] = map[(cr*nb_cols[GRN]+cg)*nb_cols[BLU]+cb];
		    diff[rgb] = diffr+diffg+diffb;
		    if (diff[rgb] > maxdiff)
			maxdiff = diff[rgb];
		}
	    }
	}
	while (maxdiff > 0 && l < maxcol) {
	    int newmaxdiff = 0;
	    lost = 0; won++;
	    for(r = 15; r >= 0; r--) {
		int cr, g, q;
	  
		for(g = 15; g >= 0; g--) {
		    int cg, b;
	      
		    for(b = 15; b >= 0; b--) {
			int cb, rgb = (r<<8) | (g<<4) | b;

			if (diff[rgb] == maxdiff) {
			    int result;
			
			    if (l >= maxcol)
				lost++;
			    else {
				result = allocfunc(r, g, b, xcolors + rgb);
				l++;
			    }
			    diff[rgb] = 0;
			} else if (diff[rgb] > newmaxdiff)
				newmaxdiff = diff[rgb];
			
		    }
		}
	    }
	    maxdiff = newmaxdiff;
	}
/*	printf("%d color(s) lost, %d stages won\n",lost, won);*/
    }
    free (map);
}

/*
 * This dithering process works by letting UAE run internaly in 12bit
 * mode and doing the dithering on the fly when rendering to the display.
 * The dithering algorithm is quite fast but uses lot of memory (4*8*2^12 =
 * 128Kb). I don't think that is a trouble right now, but when UAE will 
 * emulate AGA and work internaly in 24bit mode, that dithering algorithm
 * will need 4*8*2^24 = 512Mb. Obviously that fast algorithm will not be
 * tractable. However, we could then use an other algorithm, slower, but 
 * far more reasonable (I am thinking about the one that is used in DJPEG).
 */

UBYTE cidx[4][8*4096]; /* fast, but memory hungry =:-( */

/*
 * Compute dithering structures
 */
void setup_greydither(int bits, allocfunc_type allocfunc)
{
    int i,j,k,l,t;
    int maxcol = 1 << bits;
    xcolnr *map;
    
    for (i = 0; i < 4096; i++)
	xcolors[i] = i;
    
    map = (xcolnr *)malloc(sizeof(xcolnr) * maxcol);

    /*
     * set the colormap
     */
    for(i = 0; i < maxcol; ++i) {
	int c, result;
	c = (15 * i + (maxcol-1)/2) / (maxcol - 1);
        result = allocfunc(c, c, c, map + i);
	/* @@@ check for errors */
    }

    /*
     * for each componant compute the mapping
     */
    for(i=0;i<4;++i) {
	for(j=0;j<4;++j) {
	    int r, d = dither[i][j]*17;
	    for(r=0; r<16; ++r) {
		int g;
		for(g=0; g<16; ++g) {
		    int  b;
		    for(b=0; b<16; ++b) {
			int rgb = (r<<8) | (g<<4) | b;
			int c,p,q;
			
			c = (77  * r + 
			     151 * g + 
			     28  * b) / 15; /* c in 0..256 */
			
			k = maxcol-1;  
			p = (c * k) / 256;
			q = (c * k) % 256;
			if(q / k > d / k && p < k) ++p;
			cidx[i][rgb + (j+4)*4096] = 
			    cidx[i][rgb + j*4096] = map[p];
		    }
		}
	    }
	}
    }
    free (map);
}

void setup_dither(int bits, allocfunc_type allocfunc)
{
    int nb_cols[3]; /* r,g,b */
    int maxcol = 1 << bits;
    int i,j,k,l,t;

    xcolnr *map;
    int *redvals, *grnvals, *bluvals;
    
    map = (xcolnr *)malloc(sizeof(xcolnr) * maxcol);

    for (i = 0; i < 4096; i++)
	xcolors[i] = i;
    
    /*
     * compute #cols per components
     */
    for(i = 1; i*i*i <= maxcol; ++i)
	;
    --i;

    nb_cols[RED] = i;
    nb_cols[GRN] = i;
    nb_cols[BLU] = i;

    if(nb_cols[RED]*(++i)*nb_cols[BLU] <= maxcol) {
	nb_cols[GRN] = i;
	if((i)*nb_cols[GRN]*nb_cols[BLU] <= maxcol) nb_cols[RED] = i;
    }

    redvals = (int *)malloc(sizeof(int) * maxcol);
    grnvals = redvals + nb_cols[RED];
    bluvals = grnvals + nb_cols[BLU];
    /*
     * set the colormap
     */
    l=0;
    for(i = 0; i < nb_cols[RED]; ++i) {
	int r = (i * 15 + (nb_cols[RED] - 1)/2) / (nb_cols[RED] - 1);
	redvals[i] = r;
	for(j = 0; j < nb_cols[GRN]; ++j) {
	    int g = (j * 15 + (nb_cols[GRN] - 1)/2) / (nb_cols[GRN] - 1);
	    grnvals[j] = g;
	    for(k = 0; k < nb_cols[BLU]; ++k) { 
		int b = (k * 15 + (nb_cols[BLU] - 1)/2) / (nb_cols[BLU] - 1);
		int result;
		bluvals[k] = b;
		result = allocfunc(r, g, b, map + l);
		l++;
	    }
	}
    }
    fprintf(stderr, "%d color(s) lost\n",maxcol - l);

    /*
     * for each component compute the mapping
     */
    {
	int r;
	for(r=0; r<16; ++r) {
	    int g;
	    for(g=0; g<16; ++g) {
		int b;
		for(b=0; b<16; ++b) {
		    int rederr = 0, grnerr = 0, bluerr = 0;
		    int rgb = (r<<8) | (g<<4) | b;

		    for(i=0;i<4;++i) for(j=0;j<4;++j) {
			int d = dither[i][j];
			int cr, cg, cb, k, q;
#if 0 /* Slightly different algorithm. Needs some tuning. */
			k  = nb_cols[RED]-1;
			cr = r * k / 15;
			q  = r * k - 15*cr;
			if (cr < 0) cr = 0;
			else
			    if(q / k > d / k && rederr <= 0) ++cr;
			if (cr > k) cr = k;
			rederr += redvals[cr]-r;
	      
			k  = nb_cols[GRN]-1;
			cg = g * k / 15;
			q  = g * k - 15*cg;
			if (cg < 0) cg = 0;
			else
			    if (q / k > d / k && grnerr <= 0) ++cg;
			if (cg > k) cg = k;
			grnerr += grnvals[cg]-g;

			k  = nb_cols[BLU]-1;
			cb = b * k / 15;
			q  = b * k - 15*cb;
			if (cb < 0) cb = 0;
			else
			    if (q / k > d / k && bluerr <= 0) ++cb;
			if (cb > k) cb = k;
			bluerr += bluvals[cb]-b;
#else
			k  = nb_cols[RED]-1;
			cr = r * k / 15;
			q  = r * k - 15*cr;
			if (cr < 0) cr = 0;
			else
			    if(q / k > d / k) ++cr;
			if (cr > k) cr = k;
	      
			k  = nb_cols[GRN]-1;
			cg = g * k / 15;
			q  = g * k - 15*cg;
			if (cg < 0) cg = 0;
			else
			    if (q / k > d / k) ++cg;
			if (cg > k) cg = k;

			k  = nb_cols[BLU]-1;
			cb = b * k / 15;
			q  = b * k - 15*cb;
			if (cb < 0) cb = 0;
			else
			    if (q / k > d / k) ++cb;
			if (cb > k) cb = k;
#endif
			cidx[i][rgb + (j+4)*4096] = cidx[i][rgb + j*4096] = map[(cr*nb_cols[GRN]+cg)*nb_cols[BLU]+cb];
		    }
		}
	    }
	}
    }
    free (map);
}

#ifndef X86_ASSEMBLY
/*
 * Dither the line.
 * Make sure you call this only with (len & 3) == 0, or you'll just make
 * yourself unhappy.
 */

void DitherLine(UBYTE *l, UWORD *r4g4b4, int x, int y, UWORD len, int bits)
{
    UBYTE *dith = cidx[y&3]+(x&3)*4096;
    UBYTE d = 0;
    int bitsleft = 8;
    
    while(len) {
	int v;
	v = dith[0*4096 + *r4g4b4++];
	bitsleft -= bits;
	d |= (v << bitsleft);
	if (!bitsleft)
	    *l++ = d, bitsleft = 8, d = 0;
	
	v = dith[1*4096 + *r4g4b4++];
	bitsleft -= bits;
	d |= (v << bitsleft);
	if (!bitsleft)
	    *l++ = d, bitsleft = 8, d = 0;
	
	v = dith[2*4096 + *r4g4b4++];
	bitsleft -= bits;
	d |= (v << bitsleft);
	if (!bitsleft)
	    *l++ = d, bitsleft = 8, d = 0;
	
	v = dith[3*4096 + *r4g4b4++];
	bitsleft -= bits;
	d |= (v << bitsleft);
	if (!bitsleft)
	    *l++ = d, bitsleft = 8, d = 0;
	len -= 4;
    }
}
#endif
