
#include "stdafx.h"
#include <windows.h>
#include "TestObj.h"
#include "Tests.h"

IMPLEMENT_DYNCREATE(CT_MemCpyFloat, CTestObject)

/////////////////////////////////////////////////////////////////////////////
CT_MemCpyFloat::CT_MemCpyFloat()
{
	m_TestType = 8;
	m_bSelfWarming = TRUE;
}

CT_MemCpyFloat::CT_MemCpyFloat(UINT nTestType, BOOL bSelfWarming)
{
	m_TestType = nTestType;
	m_bSelfWarming = bSelfWarming;
}

CT_MemCpyFloat::~CT_MemCpyFloat()
{
}

/////////////////////////////////////////////////////////////////////////////
static void CT_MemCpyFloatC(
			  void* pDest,
			  void* pData,
			  UINT dwByteCount)
{
	//memcpy(pDest, pData, dwByteCount);

	_asm{
	cld						;2 (Not pairable)

	mov ecx, dwByteCount	;1
	mov edi, pDest			;0

	shr ecx, 2;				;1
	mov esi, pData			;0

	rep movsd		;13+ecx = 13+256 = 269
	// Total clocks = 273
	// Measured == 306 - 38 overhead (measured) = 273
	}
}

/////////////////////////////////////////////////////////////////////////////
static void CT_MemCpyFloatPent8(
			  void* pDest,
			  void* pData,
			  UINT dwByteCount)
{
	// Quick & dirty way to handle the trailing bytes
	DWORD extra = dwByteCount & (8-1);
	if(extra){
		CopyMemory((char*)pDest+dwByteCount-extra, pData, extra);
	}

	// 8 bytes per iteration
	_asm 
	{
		mov esi,pData;          
		mov ecx,dwByteCount;
		mov edi,pDest;          
		sub ecx, 8						; ecx counts samples
		jl DoneCopy;					; Nothing to do!

		fild qword ptr[esi+ecx]			; Load the first one
		sub ecx, 8
		jl CleanUp;
								
	LoopCopy:
		//	if we put fild AFTER fistp, its latency cycle will be eaten  
		//	while "sub ecx,8" is executing
		fistp qword ptr[edi+ecx+8]		; 6 clocks
		fild qword ptr[esi+ecx]			; 2+1 
		sub	ecx, 8						; 1
		 jge LoopCopy					; 0 
	CleanUp:
		fistp qword ptr[edi+ecx+8]		; Write the remaining 8 bytes
		 	
	DoneCopy:
	}
}

/////////////////////////////////////////////////////////////////////////////
static void CT_MemCpyFloatPent8sw(
			  void* pDest,
			  void* pData,
			  UINT dwByteCount)
{
	// Quick & dirty way to handle the trailing bytes
	DWORD extra = dwByteCount & (8-1);
	if(extra){
		CopyMemory((char*)pDest+dwByteCount-extra, pData, extra);
	}

	// 8 bytes per iteration
	_asm 
	{
		mov esi,pData;          
		mov ecx,dwByteCount;
		mov edi,pDest;   
		sub ecx, 256
		jl DonePreWarm
        
	ALIGN 16
	PreWarm:
		// Pre-warm the read buffer
							;clocks
		mov al, [esi]		;1
		mov bl, [esi+32]	;1
		mov al, [esi+64]	;1
		mov bl, [esi+96]	;1
		mov al, [esi+128]	;1
		mov bl, [esi+160]	;1
		mov al, [esi+192]	;1
		mov bl, [esi+224]	;1
		
		add esi, 256		;1	The nop will force the code
		 nop				;0	to pair better.
		
		sub ecx, 256		;1
		 jg PreWarm			;0

	DonePreWarm:
		mov ecx, dwByteCount;
		mov esi, pData;

		sub ecx, 8						; ecx counts samples
		jl DoneCopy;					; Nothing to do!

		fild qword ptr[esi+ecx]			; Load the first one
		sub ecx, 8
		jl CleanUp;
								
	LoopCopy:
		//	if we put fild AFTER fistp, its latency cycle will be eaten  
		//	while "sub ecx,8" is executing
		fistp qword ptr[edi+ecx+8]		; 6 clocks
		fild qword ptr[esi+ecx]			; 2+1 
		sub	ecx, 8						; 1
		 jge LoopCopy					; 0 
	CleanUp:
		fistp qword ptr[edi+ecx+8]		; Write the remaining 8 bytes
		 	
	DoneCopy:
	}
}

/////////////////////////////////////////////////////////////////////////////
static void CT_MemCpyFloatPent16(
			  void* pDest,
			  void* pData,
			  UINT dwByteCount)
{
	// Quick & dirty way to handle the trailing bytes
	DWORD extra = dwByteCount & (16-1);
	if(extra){
		CopyMemory((char*)pDest+dwByteCount-extra, pData, extra);
	}

	// 16 bytes per iteration
	_asm 
	{
		mov esi,pData;          
		mov ecx,dwByteCount;
		mov edi,pDest;          
		sub ecx, 16					;ecx counts samples
		jl DoneCopy;				; Nothing to do!

	ALIGN 16
	LoopCopy:
		fild qword ptr[esi+ecx]		;1+2	 
		fild qword ptr[esi+ecx+8]	;1+2
		  fxch st(1)				;0
		fistp qword ptr[edi+ecx]	;6		 
		fistp qword ptr[edi+ecx+8]	;6
		sub	ecx, 16					;1
		 jge LoopCopy				;0 (paired)
		 	
	DoneCopy:
	}
}

/////////////////////////////////////////////////////////////////////////////
static void CT_MemCpyFloatPent16sw(
			  void* pDest,
			  void* pData,
			  UINT dwByteCount)
{
	// Quick & dirty way to handle the trailing bytes
	DWORD extra = dwByteCount & (16-1);
	if(extra){
		CopyMemory((char*)pDest+dwByteCount-extra, pData, extra);
	}

	// 16 bytes per iteration
	_asm 
	{
		mov esi,pData;          
		mov ecx,dwByteCount;
		mov edi,pDest;          

		sub ecx, 256
		jl DonePreWarm
        
	ALIGN 16
	PreWarm:
		// Pre-warm the read buffer
							;clocks
		mov al, [esi]		;1
		mov bl, [esi+32]	;1
		mov al, [esi+64]	;1
		mov bl, [esi+96]	;1
		mov al, [esi+128]	;1
		mov bl, [esi+160]	;1
		mov al, [esi+192]	;1
		mov bl, [esi+224]	;1
		
		add esi, 256		;1	The nop will force the code
		 nop				;0	to pair better.
		
		sub ecx, 256		;1
		 jg PreWarm			;0

	DonePreWarm:
		mov ecx, dwByteCount;
		mov esi, pData;

		sub ecx, 16					;ecx counts samples
		jl DoneCopy;				; Nothing to do!

	ALIGN 16
	LoopCopy:
		fild qword ptr[esi+ecx]		;1+2	 
		fild qword ptr[esi+ecx+8]	;1+2	
		  fxch st(1)				;0
		fistp qword ptr[edi+ecx]	;6		 
		fistp qword ptr[edi+ecx+8]	;6		
		sub	ecx, 16					;1
		 jge LoopCopy				;0 (paired)
		 	
	DoneCopy:
	}
}

/////////////////////////////////////////////////////////////////////////////
static void CT_MemCpyFloatPent32(
			  void* pDest,
			  void* pData,
			  UINT dwByteCount)
{
	// Quick & dirty way to handle the trailing bytes
	DWORD extra = dwByteCount & (32-1);
	if(extra){
		CopyMemory((char*)pDest+dwByteCount-extra, pData, extra);
	}

	// 32 bytes per iteration
	_asm 
	{
		mov esi,pData;          
		mov ecx,dwByteCount;
		mov edi,pDest;          
		sub ecx, 32					;ecx counts samples
		jl DoneCopy;				; Nothing to do!
	ALIGN 16
	LoopCopy:
		fild qword ptr[esi+ecx]		;1+2	a 
		fild qword ptr[esi+ecx+8]	;1+2	b a
		fild qword ptr[esi+ecx+16]	;1+2	c b a	
		  fxch st(2)				;0		a b c
		fild qword ptr[esi+ecx+24]	;1+2	d a b c
		  fxch st(2)				;0		b a d c
		fistp qword ptr[edi+ecx+8]	;6		a d c
		fistp qword ptr[edi+ecx]	;6		d c
		fistp qword ptr[edi+ecx+24]	;6		c
		fistp qword ptr[edi+ecx+16]	;6

		sub	ecx, 32					;1
		 jge LoopCopy				;0 (paired)
		 	
	DoneCopy:
	}
}

/////////////////////////////////////////////////////////////////////////////
static void CT_MemCpyFloatPent32sw(
			  void* pDest,
			  void* pData,
			  UINT dwByteCount)
{
	// Quick & dirty way to handle the trailing bytes
	DWORD extra = dwByteCount & (32-1);
	if(extra){
		CopyMemory((char*)pDest+dwByteCount-extra, pData, extra);
	}

	// 32 bytes per iteration
	_asm 
	{
		mov esi,pData;          
		mov ecx,dwByteCount;
		mov edi,pDest;      
		
		sub ecx, 256
		jl DonePreWarm
        
	ALIGN 16
	PreWarm:
		// Pre-warm the read buffer
							;clocks
		mov al, [esi]		;1
		mov bl, [esi+32]	;1
		mov al, [esi+64]	;1
		mov bl, [esi+96]	;1
		mov al, [esi+128]	;1
		mov bl, [esi+160]	;1
		mov al, [esi+192]	;1
		mov bl, [esi+224]	;1
		
		add esi, 256		;1	The nop will force the code
		 nop				;0	to pair better.
		
		sub ecx, 256		;1
		 jg PreWarm			;0

	DonePreWarm:
		mov ecx, dwByteCount;
		mov esi, pData;

		sub ecx, 32					;ecx counts samples
		jl DoneCopy;				; Nothing to do!

	ALIGN 16
	LoopCopy:
		fild qword ptr[esi+ecx]		;1+2	a 
		fild qword ptr[esi+ecx+8]	;1+2	b a
		fild qword ptr[esi+ecx+16]	;1+2	c b a	
		  fxch st(2)				;0		a b c
		fild qword ptr[esi+ecx+24]	;1+2	d a b c
		  fxch st(2)				;0		b a d c
		fistp qword ptr[edi+ecx+8]	;6		a d c
		fistp qword ptr[edi+ecx]	;6		d c
		fistp qword ptr[edi+ecx+24]	;6		c
		fistp qword ptr[edi+ecx+16]	;6

		sub	ecx, 32					;1
		 jge LoopCopy				;0 (paired)
		 	
	DoneCopy:
	}
}

/////////////////////////////////////////////////////////////////////////////
static void CT_MemCpyFloatPent64(
			  void* pDest,
			  void* pData,
			  UINT dwByteCount)
{
	// Quick & dirty way to handle the trailing bytes
	DWORD extra = dwByteCount & (64-1);
	if(extra){
		CopyMemory((char*)pDest+dwByteCount-extra, pData, extra);
	}

	// 64 bytes per iteration
	_asm 
	{
		mov esi,pData;          
		mov ecx,dwByteCount;
		mov edi,pDest;          
		sub ecx, 64				//ecx counts samples
		jl DoneCopy;            // Nothing to do!

	ALIGN 16
	LoopCopy:
		fild qword ptr[esi+ecx]		;1+2	a 
		fild qword ptr[esi+ecx+8]	;1+2	b a	
		fild qword ptr[esi+ecx+16]	;1+2	c b a
		fild qword ptr[esi+ecx+24]	;1+2	d c b a
		fild qword ptr[esi+ecx+32]	;1+2	e d c b a
		fild qword ptr[esi+ecx+40]	;1+2	f e d c b a
		fild qword ptr[esi+ecx+48]	;1+2	g f e d c b a
		  fxch st(6)				;0		a f e d c b g
		fild qword ptr[esi+ecx+56]	;1+2	h a f e d c b g
		  fxch st(6)				;0		b a f e d c h g
		fistp qword ptr[edi+ecx+8]	;6		a f e d c h g
		fistp qword ptr[edi+ecx]	;6		f e d c h g
		fistp qword ptr[edi+ecx+40]	;6		e d c h g
		fistp qword ptr[edi+ecx+32]	;6		d c h g
		fistp qword ptr[edi+ecx+24]	;6		c h g
		fistp qword ptr[edi+ecx+16]	;6		h g
		fistp qword ptr[edi+ecx+56]	;6		g
		fistp qword ptr[edi+ecx+48]	;6

		sub	ecx, 64					;1
		 jge LoopCopy	            ;0 (paired)
		 	
	DoneCopy:
	}
}

/////////////////////////////////////////////////////////////////////////////
static void CT_MemCpyFloatPent64sw(
			  void* pDest,
			  void* pData,
			  UINT dwByteCount)
{
	// Quick & dirty way to handle the trailing bytes
	DWORD extra = dwByteCount & (64-1);
	if(extra){
		CopyMemory((char*)pDest+dwByteCount-extra, pData, extra);
	}

	// 64 bytes per iteration
	_asm 
	{
		mov esi,pData;          
		mov ecx,dwByteCount;
		mov edi,pDest;          

		sub ecx, 256
		jl DonePreWarm
        
	ALIGN 16
	PreWarm:
		// Pre-warm the read buffer
							;clocks
		mov al, [esi]		;1
		mov bl, [esi+32]	;1
		mov al, [esi+64]	;1
		mov bl, [esi+96]	;1
		mov al, [esi+128]	;1
		mov bl, [esi+160]	;1
		mov al, [esi+192]	;1
		mov bl, [esi+224]	;1
		
		add esi, 256		;1	The nop will force the code
		 nop				;0	to pair better.
		
		sub ecx, 256		;1
		 jg PreWarm			;0

	DonePreWarm:
		mov ecx, dwByteCount;
		mov esi, pData;

		sub ecx, 64				//ecx counts samples
		jl DoneCopy;            // Nothing to do!

	ALIGN 16
	LoopCopy:
		fild qword ptr[esi+ecx]		;1+2	a 
		fild qword ptr[esi+ecx+8]	;1+2	b a	
		fild qword ptr[esi+ecx+16]	;1+2	c b a
		fild qword ptr[esi+ecx+24]	;1+2	d c b a
		fild qword ptr[esi+ecx+32]	;1+2	e d c b a
		fild qword ptr[esi+ecx+40]	;1+2	f e d c b a
		fild qword ptr[esi+ecx+48]	;1+2	g f e d c b a
		  fxch st(6)				;0		a f e d c b g
		fild qword ptr[esi+ecx+56]	;1+2	h a f e d c b g
		  fxch st(6)				;0		b a f e d c h g
		fistp qword ptr[edi+ecx+8]	;6		a f e d c h g
		fistp qword ptr[edi+ecx]	;6		f e d c h g
		fistp qword ptr[edi+ecx+40]	;6		e d c h g
		fistp qword ptr[edi+ecx+32]	;6		d c h g
		fistp qword ptr[edi+ecx+24]	;6		c h g
		fistp qword ptr[edi+ecx+16]	;6		h g
		fistp qword ptr[edi+ecx+56]	;6		g
		fistp qword ptr[edi+ecx+48]	;6

		sub	ecx, 64					;1
		 jge LoopCopy	            ;0 (paired)
		 	
	DoneCopy:
	}
}

/////////////////////////////////////////////////////////////////////////////
void CT_MemCpyFloat::RunC(){
	CT_MemCpyFloatC(m_pucC2, m_pucC1, m_dwCount);
}

void CT_MemCpyFloat::RunPent(){

	if(m_bSelfWarming){
		switch(m_TestType){
			case 16:	CT_MemCpyFloatPent16sw(m_pucP2, m_pucP1, m_dwCount);	break;
			case 32:	CT_MemCpyFloatPent32sw(m_pucP2, m_pucP1, m_dwCount);	break;
			case 64:	CT_MemCpyFloatPent64sw(m_pucP2, m_pucP1, m_dwCount);	break;
			default:	CT_MemCpyFloatPent8sw(m_pucP2, m_pucP1, m_dwCount);	
		}
	} else {
		switch(m_TestType){
			case 16:	CT_MemCpyFloatPent16(m_pucP2, m_pucP1, m_dwCount);	break;
			case 32:	CT_MemCpyFloatPent32(m_pucP2, m_pucP1, m_dwCount);	break;
			case 64:	CT_MemCpyFloatPent64(m_pucP2, m_pucP1, m_dwCount);	break;
			default:	CT_MemCpyFloatPent8(m_pucP2, m_pucP1, m_dwCount);
		}	
	}
}

void CT_MemCpyFloat::RunMMX(){
}

/////////////////////////////////////////////////////////////////////////////
void CT_MemCpyFloat::RunTest()
{

	// Allocate the buffers
	m_MBC1.Allocate(&m_pucC1, m_dwCount);
	m_MBC2.Allocate(&m_pucC2, m_dwCount);
	m_MBP1.Allocate(&m_pucP1, m_dwCount);
	m_MBP2.Allocate(&m_pucP2, m_dwCount);

	// Initialize the buffers
	ZeroAllBuffers();
	for(DWORD i=0; i < m_dwCount;i++)
	{
		m_pucC1[i] = m_pucP1[i] = (char)i%255;
	}

	// Run Tests
	CTestObject::RunTest();

	// Verify Outputs
	m_tPent.diff = CompareBuffers(m_pucC2, m_pucP2, m_dwCount);

	// Output the results
	m_tC.dwFlags = RESULT_TIME;
	m_tPent.dwFlags = RESULT_TIME | RESULT_DIFF;

	if(m_bSelfWarming){
		switch(m_TestType){
			case 16:	ShowResults("MemCpyFloat16sw()");	break;
			case 32:	ShowResults("MemCpyFloat32sw()");	break;
			case 64:	ShowResults("MemCpyFloat64sw()");	break;
			default:	ShowResults("MemCpyFloat8sw()");	
		}
	} else {
		switch(m_TestType){
			case 16:	ShowResults("MemCpyFloat16()"); break;
			case 32:	ShowResults("MemCpyFloat32()");	break;
			case 64:	ShowResults("MemCpyFloat64()");	break;
			default:	ShowResults("MemCpyFloat8()");
		}	
	}
}