;
; This code implements the basic idct on a 8x8 pixel block.
; Basically, it's the same as in the JPEG engine, with the sole difference
; that it's inlined and register-wise a little bit more optimized there.
;
; This is a complete rewrite in assembler. Heavy stuff. Lotsa work.
;
; Michael Rausch  14-4-94  1:14:00
;

;
; The whole code handles D-Frames not very well, but I'll fix it one day.
;


DCTSIZE	EQU	8	; a block is DCTSIZE x DCTSIZE 16-bit coefficients

PASS1_BITS EQU	2	; extra scale (in bits) left on the data between pass 1 and pass 2
CONST_BITS EQU	13	; the FIX_ constants below are the named value times 1<<CONST_BITS


; Fixed-point multipliers: FIX_a_bcd = round(a.bcd * 8192).
; A leading underscore in the name means the constant holds the NEGATED
; value, so it can be used directly as a muls operand.
; Comment columns: running number, sign, magnitude in hex.
; The trailing "u" is presumably a marker for the three constants used by
; the even part -- TODO confirm.
FIX_0_298631336 EQU	2446	;1	+	$98e	100110001110
_FIX_0_390180644 EQU	-3196	;2	-	$c7c
FIX_0_541196100 EQU	4433	;3	+	$1151	u
FIX_0_765366865 EQU	6270	;4	+	$187e	u
_FIX_0_899976223 EQU	-7373	;5	-	$1ccd
FIX_1_175875602 EQU	9633	;6	+	$25a1
FIX_1_501321110 EQU	12299	;7	+	$300b
_FIX_1_847759065 EQU	-15137	;8	-	$3b21	u
_FIX_1_961570560 EQU	-16069	;9	-	$3ec5
FIX_2_053119869 EQU	16819	;10	+	$41b3
_FIX_2_562915447 EQU	-20995	;11	-	$5203
FIX_3_072711026 EQU	25172	;12	+	$6254

; FIX_1_847759065-FIX_0_765366865 = 2* FIX_0_541196100
; (exact for the real-valued constants; the rounded integers above give
;  15137-6270 = 8867 versus 2*4433 = 8866)


; **************************************************************************

;
; jrevdct -- in-place inverse DCT of one 8x8 block of 16-bit coefficients.
;
; In:	a0 = address of the block (8 rows of DCTSIZE words each).
;	ri_regs must already have been pushed (see @j_rev_dct); the epilogue
;	at idct2_ready pops them again before the rts.
; Out:	the block is overwritten with the transformed values.
; Uses:	d0-d7, a0, a1, a3, a5 (what is preserved for the caller depends on
;	ri_regs, which is defined outside this file).
;
; Pass 1 (idct1) transforms the 8 rows and leaves the data scaled up by
; PASS1_BITS. Pass 2 (idct2) transforms the 8 columns and removes the
; remaining scale (CONST_BITS+PASS1_BITS+3 bits).
;
; The odd part exists in 15 specialised variants odd1_PQRS, where the four
; digits tell which of the coefficients 7,5,3,1 are non-zero. The variants
; are shared by both passes: the o.... labels fetch the coefficients of a
; row, the o2.... labels those of a column, and every variant leaves via
; jmp (a5) to compose1 or compose2 with
;	d1 = tmp0, d2 = tmp1, d5 = tmp2, d6 = tmp3.
;
; Stack while a row is processed in pass 1:
;	 0(sp)	saved loop counter d7
;	 4(sp)	saved block address a0
;	 8(sp)	tmp10, tmp11, tmp12, tmp13 (the 16 bytes reserved on entry)
;
jrevdct:

	sub.w	#16,sp			; scratch for tmp10..tmp13 of pass 1

	move.l	a0,-(sp)		; block start, needed again for pass 2
	lea	compose1(pc),a5		; continuation used by the odd part
	moveq	#DCTSIZE-1,d7
idct1:	move.l	d7,-(sp)

; fetch coefficients 1..7 of the row and OR them together
	lea	2(a0),a1
	move.l	(a1)+,d2		; c1:c2
	move.l	d2,d0
	move.l	(a1)+,d4		; c3:c4
	move.l	(a1)+,d3		; c5:c6
	or.l	d4,d0
	or.w	(a1)+,d0		; c7
	or.l	d3,d0
	bne.s	idct1_no_ac0

; all AC terms are zero: every output is the DC term, scaled by PASS1_BITS
	move.w	(a0),d0
	lsl.w	#PASS1_BITS,d0
	move.w	d0,d1
	swap	d0
	move.w	d1,d0			; same value in both halves
	REPT	4
	move.l	d0,(a0)+
	ENDR
	bra	idct1_next
idct1_no_ac0:

; even part; the low words of d2, d3, d4 are c2, c6, c4
	move.w	d2,d1			; c2
	add.w	d3,d1			; c2 + c6
	muls	#FIX_0_541196100,d1	; z1
	muls	#_FIX_1_847759065,d3
	add.l	d1,d3			; tmp2
	muls	#FIX_0_765366865,d2
	add.l	d1,d2			; tmp3


	move.w	(a0),d0
	ext.l	d0			; c0
	ext.l	d4			; c4
	move.l	d0,d5
	sub.l	d4,d5			; c0 - c4
	add.l	d0,d4			; c0 + c4

; ((x<<5)+4)<<8 = (x<<CONST_BITS) + (1<<(CONST_BITS-PASS1_BITS-1)),
; i.e. scale up and pre-add the rounding term of the pass 1 descale
	lsl.l	#5,d4
	lsl.l	#5,d5
	addq.l	#1<<2,d4
	addq.l	#1<<2,d5
	lsl.l	#8,d4
	lsl.l	#8,d5

; Only two longs (d7 and a0) lie above the scratch area, so it starts at
; 8(sp). A larger offset would make the tmp13 store run past the 16
; reserved bytes into the registers saved by the entry stub.
	lea	8(sp),a1
	move.l	d4,d0
	add.l	d2,d4
	move.l	d4,(a1)+	; tmp10
	sub.l	d2,d0
	move.l	d5,d1
	add.l	d3,d5
	move.l	d5,(a1)+	; tmp11
	sub.l	d3,d1
	move.l	d1,(a1)+	; tmp12
	move.l	d0,(a1)+	; tmp13

; pick the odd-part variant from the zero/non-zero pattern of c7,c5,c3,c1
odd_part1:
	move.w	7*2(a0),d1	;7
	beq	o0xxx
o1xxx:	move.w	5*2(a0),d2	;5
	beq	o10xx
o11xx:	move.w	3*2(a0),d3	;3
	beq	o110x
o111x:	move.w	1*2(a0),d4	;1
	bne.s	odd1_1111


;    7531
odd1_1110:
	move.w	d2,d6
	move.w	d1,d0
	moveq	#0,d4
	bra.s	abk_2

;    7531
; general case: d1 = c7, d2 = c5, d3 = c3, d4 = c1
odd1_1111:
	move.w	d2,d6
	add.w	d4,d6			; z4 = c5 + c1
	move.w	d1,d0
	add.w	d4,d0			; z1 = c7 + c1
	muls	#FIX_1_501321110,d4
abk_2:	move.w	d1,d5
	add.w	d3,d5			; z3 = c7 + c3
	move.w	d5,d7
	add.w	d6,d7
	muls	#FIX_1_175875602,d7	; z5
	muls	#_FIX_1_961570560,d5
	muls	#_FIX_0_390180644,d6
	add.l	d7,d5
	add.l	d7,d6
	move.w	d2,d7
	add.w	d3,d7			; z2 = c5 + c3
	muls	#FIX_0_298631336,d1
	muls	#FIX_2_053119869,d2
	muls	#FIX_3_072711026,d3
	muls	#_FIX_0_899976223,d0
	muls	#_FIX_2_562915447,d7
	add.l	d0,d1
	add.l	d7,d2
	add.l	d5,d1			; tmp0
	add.l	d6,d2			; tmp1
	add.l	d3,d5
	add.l	d4,d6
	add.l	d7,d5			; tmp2
	add.l	d0,d6			; tmp3
	jmp	(a5)

o2110x:	move.w	1*DCTSIZE*2(a0),d4	;1
	bne.s	odd1_1101

;    7531
odd1_1100:
	move.w	d2,d6
	move.w	d1,d3
	moveq	#0,d4
	bra.s	abk_3

o110x:	move.w	1*2(a0),d4	;1
	beq.s	odd1_1100

;    7531
odd1_1101:
	move.w	d2,d6
	move.w	d1,d3
	add.w	d4,d6
	add.w	d4,d3
	muls	#FIX_1_501321110,d4
abk_3:
	move.w	d1,d5
	move.w	d5,d7
	add.w	d6,d7
	muls	#FIX_1_175875602,d7
	muls	#_FIX_1_961570560,d5
	muls	#_FIX_0_390180644,d6
	add.l	d7,d5
	add.l	d7,d6
	move.w	d2,d0
	muls	#FIX_0_298631336,d1
	muls	#FIX_2_053119869,d2
	muls	#_FIX_0_899976223,d3
	muls	#_FIX_2_562915447,d0
	add.l	d3,d1
	add.l	d0,d2
	add.l	d5,d1
	add.l	d6,d2
	add.l	d4,d6
	add.l	d0,d5
	add.l	d3,d6
	jmp	(a5)

o10xx:	move.w	3*2(a0),d3	;3
	beq	o100x
o101x:	move.w	1*2(a0),d4	;1
	beq.s	odd1_1010

;    7531
odd1_1011:
	move.w	d1,d5
	add.w	d3,d5
	move.w	d1,d0
	move.w	d4,d6
	add.w	d4,d0
	muls	#FIX_1_501321110,d4
	move.w	d5,d7
	add.w	d6,d7
	muls	#_FIX_0_390180644,d6
abk_4:	muls	#FIX_1_175875602,d7
	muls	#_FIX_1_961570560,d5
	add.l	d7,d6
	add.l	d7,d5
	move.w	d3,d7
	muls	#FIX_0_298631336,d1
	muls	#FIX_3_072711026,d3
	muls	#_FIX_0_899976223,d0
	muls	#_FIX_2_562915447,d7
	add.l	d0,d1
	move.l	d6,d2
	add.l	d5,d1
	add.l	d7,d2
	add.l	d3,d5
	add.l	d4,d6
	add.l	d7,d5
	add.l	d0,d6
	jmp	(a5)

o210xx:	move.w	3*DCTSIZE*2(a0),d3	;3
	beq	o2100x
o2101x:	move.w	1*DCTSIZE*2(a0),d4	;1
	bne.s	odd1_1011

;    7531
odd1_1010:
	move.w	d1,d5
	add.w	d3,d5
	move.w	d1,d0
	moveq	#0,d4
	move.w	d5,d7
	moveq	#0,d6
	bra.s	abk_4

o100x:	move.w	1*2(a0),d4	;1
	beq.s	odd1_1000

;    7531
odd1_1001:
	move.w	d1,d0
	add.w	d4,d0
	move.w	d1,d5
	move.w	d4,d6
	move.w	d0,d7
	muls	#FIX_1_175875602,d7
	muls	#_FIX_1_961570560,d5
	muls	#_FIX_0_390180644,d6
	add.l	d7,d5
	add.l	d7,d6
	muls	#FIX_0_298631336,d1
	muls	#FIX_1_501321110,d4
	muls	#_FIX_0_899976223,d0
	add.l	d0,d1
	add.l	d5,d1
	move.l	d6,d2
	add.l	d4,d6
	add.l	d0,d6
	jmp	(a5)

o2100x:	move.w	1*DCTSIZE*2(a0),d4	;1
	bne.s	odd1_1001

;    7531
; single coefficient: the four results are c7 times pre-summed constants
odd1_1000:
	move.w	d1,d2
	move.w	d1,d5
	move.w	d1,d6
	muls	#FIX_1_175875602,d2
	muls	#FIX_1_175875602+_FIX_0_899976223,d6
	muls	#FIX_1_175875602+_FIX_1_961570560,d5
	muls	#FIX_1_175875602+_FIX_0_899976223+_FIX_1_961570560+FIX_0_298631336,d1
	jmp	(a5)


o0xxx:	move.w	5*2(a0),d2	;5
	beq	o00xx
o01xx:	move.w	3*2(a0),d3	;3
	beq	o010x
o011x:	move.w	1*2(a0),d4	;1
	beq.s	odd1_0110

;    7531
odd1_0111:	; opt8
	move.w	d2,d6
	add.w	d4,d6
	move.w	d4,d1
	muls	#FIX_1_501321110,d4
	muls	#_FIX_0_899976223,d1
abk_1:	move.w	d2,d0
	add.w	d3,d0
	move.w	d3,d5

	move.w	d5,d7
	add.w	d6,d7
	muls	#FIX_1_175875602,d7
	muls	#_FIX_1_961570560,d5
	muls	#_FIX_0_390180644,d6				; ???? 2
	add.l	d7,d5
	add.l	d7,d6

	muls	#FIX_2_053119869,d2
	muls	#FIX_3_072711026,d3
	muls	#_FIX_2_562915447,d0
	add.l	d0,d2
	add.l	d6,d2
	add.l	d4,d6
	add.l	d1,d6
	add.l	d5,d1
	add.l	d3,d5
	add.l	d0,d5
	jmp	(a5)

o20xxx:	move.w	5*DCTSIZE*2(a0),d2	;5
	beq	o200xx
o201xx:	move.w	3*DCTSIZE*2(a0),d3	;3
	beq.s	o2010x
o2011x:	move.w	1*DCTSIZE*2(a0),d4	;1
	bne.s	odd1_0111

;    7531
odd1_0110:
	move.w	d2,d6
	moveq.l	#0,d1
	moveq.l	#0,d4
	bra.s	abk_1

o010x:	move.w	1*2(a0),d4	;1
	beq.s	odd1_0100

;    7531
odd1_0101:
	move.w	d2,d6
	move.w	d2,d7
	add.w	d4,d6
	move.w	d4,d1
	move.w	d6,d5
	muls	#FIX_1_175875602,d5
	muls	#_FIX_0_390180644+FIX_1_175875602,d6
	muls	#FIX_2_053119869+_FIX_2_562915447,d2
	muls	#FIX_1_501321110,d4
	muls	#_FIX_0_899976223,d1
	muls	#_FIX_2_562915447,d7
	add.l	d6,d2
	add.l	d1,d6
	add.l	d5,d1
	add.l	d7,d5
	add.l	d4,d6
	jmp	(a5)

o2010x:	move.w	1*DCTSIZE*2(a0),d4	;1
	bne.s	odd1_0101

;    7531
odd1_0100:
	move.w	d2,d6
	move.w	d2,d1
	move.w	d2,d5
	muls	#FIX_1_175875602,d1
	muls	#FIX_1_175875602+_FIX_2_562915447,d5
	muls	#FIX_1_175875602+_FIX_0_390180644,d6
	muls	#FIX_1_175875602+_FIX_2_562915447+_FIX_0_390180644+FIX_2_053119869,d2
	jmp	(a5)

o00xx:	move.w	3*2(a0),d5	;3
	beq.s	o000x
o001x:	move.w	1*2(a0),d4	;1
	beq.s	odd1_0010

;    7531
odd1_0011:	; opt12
	move.w	d5,d2
	move.w	d5,d3
	move.w	d4,d1
	move.w	d4,d6
	move.w	d3,d7
	add.w	d4,d7
	muls	#FIX_1_175875602,d7
	muls	#_FIX_1_961570560,d5
	muls	#_FIX_0_390180644,d6
	add.l	d7,d5
	add.l	d7,d6
	muls	#_FIX_2_562915447+FIX_3_072711026,d3
	muls	#_FIX_0_899976223+FIX_1_501321110,d4
	muls	#_FIX_0_899976223,d1
	muls	#_FIX_2_562915447,d2
	add.l	d5,d1
	add.l	d6,d2
	add.l	d3,d5
	add.l	d4,d6
	jmp	(a5)

o200xx:	move.w	3*DCTSIZE*2(a0),d5	;3
	beq	o2000x
o2001x:	move.w	1*DCTSIZE*2(a0),d4	;1
	bne.s	odd1_0011

;    7531
odd1_0010:
	move.w	d5,d6
	move.w	d5,d2
	move.w	d5,d1
	muls	#FIX_1_175875602,d6
	muls	#FIX_1_175875602+_FIX_2_562915447,d2
	muls	#FIX_1_175875602+_FIX_1_961570560,d1
	muls	#FIX_1_175875602+_FIX_2_562915447+_FIX_1_961570560+FIX_3_072711026,d5
	jmp	(a5)

o000x:	move.w	1*2(a0),d6	;1
	beq.s	odd1_0000

;    7531
odd1_0001:	; opt 14
	move.w	d6,d5
	move.w	d6,d1
	move.w	d6,d2
	muls	#FIX_1_175875602,d5
	muls	#FIX_1_175875602+_FIX_0_899976223,d1
	muls	#FIX_1_175875602+_FIX_0_390180644,d2
	muls	#FIX_1_175875602+_FIX_0_899976223+_FIX_0_390180644+FIX_1_501321110,d6
	jmp	(a5)

; priority: 14 12 8 0 

;    7531
; pass 1 only, no odd coefficients: the outputs are just tmp10..tmp13,
; mirrored (out0=out7=tmp10, out1=out6=tmp11, out2=out5=tmp12, out3=out4=tmp13)
odd1_0000:
	moveq	#CONST_BITS-PASS1_BITS,d7	; optimized compose !

	lea	8(sp),a1	; scratch area, see the layout in the header
	move.l	(a1)+,d0	; tmp10
	lsl.l	#16-(CONST_BITS-PASS1_BITS),d0	; descaled value into the high word
	move.l	(a1)+,d1	; tmp11
	lsr.l	d7,d1				; descaled value into the low word
	move.w	d1,d0

	move.l	(a1)+,d2	; tmp12
	lsl.l	#16-(CONST_BITS-PASS1_BITS),d2
	move.l	(a1)+,d3	; tmp13
	lsr.l	d7,d3
	move.w	d3,d2

	move.l	d0,(a0)+	; out0:out1
	swap	d0
	move.l	d2,(a0)+	; out2:out3
	swap	d2
	move.l	d2,(a0)+	; out4:out5
	move.l	d0,(a0)+	; out6:out7

	move.l	(sp)+,d7
	dbra	d7,idct1
	bra.s	idct1_ready

; keep 1 2 5 6

; pass 1 output stage: combine tmp10..tmp13 with the odd part
; (d1 = tmp0, d2 = tmp1, d5 = tmp2, d6 = tmp3), descale by
; CONST_BITS-PASS1_BITS and store the row two words at a time
compose1: moveq	#CONST_BITS-PASS1_BITS,d7

	lea	8(sp),a1	; scratch area, see the layout in the header
	move.l	(a1)+,d4	; tmp10
	sub.l	d6,d4		; out7 = tmp10 - tmp3
	add.l	d6,d6
	add.l	d4,d6		; out0 = tmp10 + tmp3
	lsl.l	#16-(CONST_BITS-PASS1_BITS),d6
	move.l	(a1)+,d3	; tmp11
	sub.l	d5,d3		; out6
	add.l	d5,d5
	add.l	d3,d5		; out1
	lsr.l	d7,d5
	move.w	d5,d6
	move.l	d6,(a0)+	; out0:out1
	move.l	(a1)+,d6	; tmp12
	sub.l	d2,d6		; out5
	add.l	d2,d2
	add.l	d6,d2		; out2
	lsl.l	#16-(CONST_BITS-PASS1_BITS),d2
	move.l	(a1)+,d5	; tmp13
	sub.l	d1,d5		; out4
	add.l	d1,d1
	add.l	d5,d1		; out3
	lsr.l	d7,d1
	move.w	d1,d2
	move.l	d2,(a0)+	; out2:out3
	lsl.l	#16-(CONST_BITS-PASS1_BITS),d5
	lsr.l	d7,d6
	move.w	d6,d5
	move.l	d5,(a0)+	; out4:out5
	lsl.l	#16-(CONST_BITS-PASS1_BITS),d3
	lsr.l	d7,d4
	move.w	d4,d3
	move.l	d3,(a0)+	; out6:out7

idct1_next:
	move.l	(sp)+,d7
	dbra	d7,idct1

idct1_ready:
; *******************************************************
; Pass 2: columns. a0 walks along the first row, one word per column;
; the rows of a column are DCTSIZE*2 bytes apart.

	move.l	(sp)+,a0		; back to the start of the block
	lea	compose2(pc),a5
	moveq	#DCTSIZE-1,d7
idct2:	move.l	d7,-(sp)


odd_part2:
	move.w	7*DCTSIZE*2(a0),d1	;7
	beq	o20xxx
o21xxx:	move.w	5*DCTSIZE*2(a0),d2	;5
	beq	o210xx
o211xx:	move.w	3*DCTSIZE*2(a0),d3	;3
	beq	o2110x
o2111x:	move.w	1*DCTSIZE*2(a0),d4	;1
	beq	odd1_1110
	bra	odd1_1111

o2000x:	move.w	1*DCTSIZE*2(a0),d6	;1
	bne	odd1_0001

; pass 2, no odd coefficients: even part only. The constants are divided
; by 4 up front, which replaces the asr.l #2 used in compose2.
odd0_0000:
	move.w	2*DCTSIZE*2(a0),d2
	move.w	4*DCTSIZE*2(a0),d4
	move.w	6*DCTSIZE*2(a0),d3
	move.w	d2,d0
	add.w	d3,d0
	muls	#FIX_0_541196100/4,d0
	muls	#_FIX_1_847759065/4,d3
	add.l	d0,d3			; tmp2
	muls	#FIX_0_765366865/4,d2
	add.l	d0,d2			; tmp3

	move.w	(a0),d0
	add.w	#1<<(PASS1_BITS+3-1),d0		; precalc from the descaling part below
	ext.l	d4
	ext.l	d0
	move.l	d0,d5
	sub.l	d4,d5
	add.l	d0,d4

	moveq	#CONST_BITS-2,d0
	lsl.l	d0,d4			; tmp0
	lsl.l	d0,d5			; tmp1

; each swap takes the high word, i.e. the final shift right by 16
	move.l	d4,d0
	add.l	d2,d4
	swap 	d4
	move.w	d4,(a0)+		; row 0, then on to the next column
	sub.l	d2,d0
	move.w	d4,7*DCTSIZE*2-2(a0)	; -2 compensates the (a0)+ above
	swap	d0
	move.w	d0,3*DCTSIZE*2-2(a0)
	move.l	d5,d4
	move.w	d0,4*DCTSIZE*2-2(a0)
	add.l	d3,d5
	swap 	d5
	sub.l	d3,d4
	move.w	d5,1*DCTSIZE*2-2(a0)
	swap	d4
	move.w	d5,6*DCTSIZE*2-2(a0)
	move.w	d4,2*DCTSIZE*2-2(a0)
	move.w	d4,5*DCTSIZE*2-2(a0)

	move.l	(sp)+,d7
	dbra	d7,idct2
	bra	idct2_ready


; pass 2 output stage: even part of the column, combined with the odd part
; delivered in d1 = tmp0, d2 = tmp1, d5 = tmp2, d6 = tmp3
compose2:
	move.w	2*DCTSIZE*2(a0),d3
	move.w	4*DCTSIZE*2(a0),d4
	move.w	6*DCTSIZE*2(a0),d7

	move.w	d3,d0
	add.w	d7,d0
	muls	#FIX_0_541196100,d0
	muls	#_FIX_1_847759065,d7
	add.l	d0,d7			; tmp2
	muls	#FIX_0_765366865,d3
	add.l	d0,d3			; tmp3

	asr.l	#2,d7
	asr.l	#2,d3
	move.l	d7,a3			; park tmp2, d7 is needed again

	move.w	(a0),d0
	add.w	#1<<(PASS1_BITS+3-1),d0	; precalc from the descaling part below
	ext.l	d4
	ext.l	d0
	move.l	d0,d7
	sub.l	d4,d7
	add.l	d0,d4

	moveq	#CONST_BITS-2,d0
	lsl.l	d0,d4			; tmp0
	lsl.l	d0,d7			; tmp1


; bring the odd part to the same scale as the even part
	asr.l	#2,d6
	asr.l	#2,d5
	asr.l	#2,d2
	asr.l	#2,d1


	move.l	d4,d0
	add.l	d3,d4			; tmp10
	sub.l	d3,d0			; tmp13

	move.l	d7,d3
	add.l	a3,d7			; tmp11
	sub.l	a3,d3			; tmp12


	sub.l	d6,d4
	add.l	d6,d6
	add.l	d4,d6

	swap	d6				; moveq	#CONST_BITS+PASS1_BITS+3  -2   ,d6   ;   asr.l d6,d3
	move.w	d6,(a0)+		; row 0, then on to the next column
	swap	d4
	move.w	d4,7*DCTSIZE*2-2(a0)

	sub.l	d1,d0
	add.l	d1,d1
	add.l	d0,d1

	swap	d1
	move.w	d1,3*DCTSIZE*2-2(a0)
	swap	d0
	move.w	d0,4*DCTSIZE*2-2(a0)

	sub.l	d5,d7
	add.l	d5,d5
	add.l	d7,d5

	swap	d5
	move.w	d5,1*DCTSIZE*2-2(a0)
	swap	d7
	move.w	d7,6*DCTSIZE*2-2(a0)

	sub.l	d2,d3
	add.l	d2,d2
	add.l	d3,d2

	swap	d2
	move.w	d2,2*DCTSIZE*2-2(a0)
	swap	d3
	move.w	d3,5*DCTSIZE*2-2(a0)

idct2_next:
	move.l	(sp)+,d7
	dbra	d7,idct2

idct2_ready:
	add.w	#16,sp			; release the pass 1 scratch area
;	movem.l	(sp)+,JREVDCTREGS

	movem.l	(sp)+,ri_regs		; pushed by the entry stub
	rts


	XDEF	@j_rev_dct
; Exported entry point of the inverse DCT.
; In:	a0 = address of the 8x8 block of 16-bit coefficients; the block is
;	transformed in place.
; Saves ri_regs (a register list defined outside this file) and continues
; in jrevdct, whose epilogue restores ri_regs and returns to our caller.
; NOTE(review): the '@' prefix looks like a register-argument C calling
; convention -- confirm against the compiler in use.
@j_rev_dct:
	movem.l	ri_regs,-(sp)
	bra	jrevdct


	ifeq	1
; **************************************************************************

; Pre compute singleton coefficient IDCT values.
;
; void init_pre_idct(void)

;	XDEF @init_pre_idct
; Build the PreIDCT table: for each of the 64 coefficient positions, the
; IDCT of a block that contains the value 2048 at that position and zero
; everywhere else. Table entry n is a complete block of 64 words (128
; bytes) at PreIDCT+n*64*2.
;
; This routine sits inside the "ifeq 1 ... endc" section and is therefore
; not assembled at present.
;
; In:	nothing
; Out:	PreIDCT filled
; Uses:	d0, a0 plus whatever @j_rev_dct changes; d2/a2 are saved here.
;	NOTE(review): relies on @j_rev_dct preserving d2 and a2, which
;	depends on ri_regs -- confirm.
@init_pre_idct:
	movem.l	d2/a2,-(sp)

; clear the whole table: 64*64 words = 64*64*2 bytes, 16 bytes per iteration
	lea	PreIDCT,a2
	move.w	#64*64*2/16-1,d2
preidctclr:
	clr.l	(a2)+
	clr.l	(a2)+
	clr.l	(a2)+
	clr.l	(a2)+
	dbra	d2,preidctclr

	lea	PreIDCT+63*64*2,a2	; start with the last table entry
	moveq	#63,d2			; d2 = coefficient index, 63 down to 0
preidctloop:
	move.w	d2,d0
	add.w	d0,d0			; word index -> byte offset
	move.w	#2048,(a2,d0.w)		; the single non-zero coefficient
	move.l	a2,a0
	bsr	@j_rev_dct		; transform this entry in place
	sub.w	#64*2,a2		; previous entry: one whole block back
	dbra	d2,preidctloop

	movem.l	(sp)+,d2/a2
	rts

; ************************************************************************************

; Perform the inverse DCT on one block of coefficients.
;
; void j_rev_dct_sparse (DCTBLOCK data, int pos)  

;	XDEF	@j_rev_dct_sparse
; Inverse DCT of a block in which only one coefficient is non-zero.
;
; This routine sits inside the "ifeq 1 ... endc" section and is therefore
; not assembled at present.
;
; In:	a0 = address of the block (64 words), transformed in place
;	d0 = pos, the index (0..63) of the only non-zero coefficient
; Uses:	d0, d1, a0, a1; d2/d3 are saved on the non-DC path.
@j_rev_dct_sparse:

	tst.l	d0
	bne	itsnotthedc

; the single element to cope with is the dc coefficient

	move.w	(a0),d1
	bpl.s	scale_dc
	subq.w	#3+4,d1				; "implement" the rounding error
scale_dc:addq.w	#4,d1
	asr.w	#3,d1

	move.w	d1,d0				; extend to longword
	swap	d0
	move.w	d1,d0

	moveq	#7,d1				; 8 rows of 4 longs
set_dc:	move.l	d0,(a0)+
	move.l	d0,(a0)+
	move.l	d0,(a0)+
	move.l	d0,(a0)+
	dbra	d1,set_dc

	rts					; not that pretty
;	bra	exit_jrds
itsnotthedc:
	movem.l	d2/d3,-(sp)

; Some other coefficient. 

; pos counts words (the table lookup below uses pos*128, one 64-word entry
; per position), so it has to be doubled to address the coefficient
	move.w	d0,d1
	add.w	d1,d1
	move.w	(a0,d1.w),d1		; get coeff

	lea	PreIDCT,a1		; get precalculated DCT
	lsl.l	#7,d0			; pos * 64 words * 2 bytes
	add.l	d0,a1

; NOTE(review): the table is built from the value 2048 = 1<<11, but the
; products are shifted right by only CONST_BITS-PASS1_BITS-8 = 3 bits --
; confirm the intended scaling before enabling this code.
	moveq	#CONST_BITS-PASS1_BITS-8,d3	; scale down

	moveq.l	#31,d0			; 32 iterations, two words each
set_ac:	move.w	d1,d2
	muls	(a1)+,d2
	lsr.l	d3,d2
	move.w	d2,(a0)+
	move.w	d1,d2
	muls	(a1)+,d2
	lsr.l	d3,d2
	move.w	d2,(a0)+
	dbra	d0,set_ac

	movem.l	(sp)+,d2/d3
exit_jrds: rts

; ************************************************************************************

	section bss,BSS

;
; Precomputed idct value arrays
;
; 64 entries of 64 words each, one entry per coefficient position;
; filled by @init_pre_idct and read by @j_rev_dct_sparse.
;
PreIDCT: ds.w	64*64

	endc


;	END