unit zp5timer;

interface

{ Written for BPW 7.0 }

{ This contains an object declartion (p5timerobj) that can be used to    }
{ access the Pentium internal counting registers. It is VERY helpful for }
{ identfing hardware and software bottlenecks in your code.              }
{ Obviously it can only be used on a Pentium.                            }

{ The reference for this article is Pentium Secrets by Terje Mathisen     }
{ BYTE Magazine, July 1994, p191-192                                      }
{ I don't reccommend you to understand this without that article !        }

{ All this unit is is a wrapper around his hard work, together some other }
{ magic I found in the MS-Developer's Network                             }

{ RDMSR and WRMSR only execute in ring 0, and Windows 3.1 only runs tasks }
{ in ring 3, so if you just causally try out the code in the BYTE article }
{ under Windows it will UAE. The trick is using a call gate to change the }
{ ring in which you are executing.                                        }

{ For credits on the call gate magic (name of wizard: Matt Pietrek) see   }
{ the unit ZLDT.                                                          }

{ Obvious improvements would be:                                          }
{ - This code should check to see if a PENTIUM is actually running,       }
{    I haven't checked into how to do this.                               }
{ - A mode that only benchmarks one register would improve the            }
{    timing overhead, when you are only interested in one register anyway.}
{ - More descriptive long names for the registers would be nice, I don't  }
{   understand several of them ( but I might not want to either ).        }

{ This module is hereby placed in the public domain.                      }
{ I will not be held responsible for any errors than this module may      }
{ contain or any damage that it may cause.                                }
{ Author: Mike Wise, 20 July 1994, Luebeck, Germany                       }
{         CompuServe 100014,170                                           }

{ Test and Illustration routines }
procedure testp5clean;
procedure testp5fast;
procedure testp5parallel;

{ Specifys if cycles or events (i.e. instructions) will be monitored     }
type p5countType=(P5_EVENTS,P5_CYCLES);

{ Specifys which code privilege level is to be monitored      }
{ If you are not interested in system interrupts use P5_RING3 }
type p5RingMonitor=(P5_RING012,P5_RING0123,P5_RING3);

{ An internal type, it would be hidden if TP had such stuff }
type p5timerreg=object idx:integer;
                       mode:p5countType;
                       ring:p5RingMonitor;
                       ctrl:word;
                       diff:longint;
                       procedure report( var tx:text; longnames:wordbool );
                       end;

{ The timer object that you use in your code }
type p5timerobj=object reg1:p5timerreg;
                       reg2:p5timerreg;
                       isset:wordbool;
                       isstarted:wordbool;
                       isevaluated:wordbool;
                       constructor init;
                       procedure setup( idx1:byte; mode1:p5countType; monitor1:p5RingMonitor;
                                        idx2:byte; mode2:p5countType; monitor2:p5RingMonitor );
                       procedure start;
                       procedure stop;
                       procedure evaluate;
                       procedure report(var tx:text);
                       procedure reportlong(var tx:text);
                       destructor done;
                       end;

{ These are declared here instead of in the implementation    }
{ part so that you can start and stop the timers directly     }
{ Then a start/stop cycle takes 10 instructions instead of 15 }

var p5starreg1lo:longint;
    p5starreg1hi:longint;{ not used in evaluation }
    p5endereg1lo:longint;
    p5endereg1hi:longint;{ not used in evaluation }
    p5starreg2lo:longint;
    p5starreg2hi:longint;{ not used in evaluation }
    p5endereg2lo:longint;
    p5endereg2hi:longint;{ not used in evaluation }
    RDMSR:procedure;
    WRMSR:procedure;

procedure startP5counters;
{ This inline proc stores the current state of the p5 counters in some global vars }
{ that represent the START of the timing.                                          }
inline(
 $FA                       {  cli                 }
/$66/$B9/$13/$00/$00/$00   {  mov ecx,00000013    }
/$ff/$1e/    >RDMSR        {  call RDMSR          }
/$66/$A3/    >p5starreg2lo {  mov [star.lo],eax   }
/$66/$89/$16/>p5starreg2hi {  mov [star.hi],edx   }
/$66/$49                   {  dec ecx             }
/$ff/$1e/    >RDMSR        {  call RDMSR          }
/$66/$A3/    >p5starreg1lo {  mov [star.lo],eax   }
/$66/$89/$16/>p5starreg1hi {  mov [star.hi],edx   }
/$FB                       {  sti                 }
);

procedure stopP5counters;
{ This inline proc stores the current state of the p5 counters in some global vars }
{ that represent the END of the timing.                                            }
inline(
 $FA                       {  cli                 }
/$66/$B9/$13/$00/$00/$00   {  mov ecx,00000013    }
/$ff/$1e/    >RDMSR        {  call RDMSR          }
/$66/$A3/    >p5endereg2lo {  mov [star.lo],eax   }
/$66/$89/$16/>p5endereg2hi {  mov [star.hi],edx   }
/$66/$49                   {  dec ecx             }
/$ff/$1e/    >RDMSR        {  call RDMSR          }
/$66/$A3/    >p5endereg1lo {  mov [ende.lo],eax   }
/$66/$89/$16/>p5endereg1hi {  mov [ende.hi],edx   }
/$FB                       {  sti                 }
);

implementation

uses zldt,wintypes,winprocs,win31;

procedure setP5counters( regspec:longint );
{ This proc tells the P5 registers what to count                 }
{ This is not pure inline because it does not have to be fast    }
{ I could of done it with ASM if asm supported 386 code (arrrg!) }
begin
inline(
 $66/$B9/$11/$00/$00/$00  {  mov    ecx,00000011     }
/$ff/$1e/    >RDMSR       {  call RDMSR              }
/$66/$25/$00/$FE/$00/$FE  {  and    eax,FE00FE00     }
/$66/$0B/$46/<regspec     {  or     eax,[bp+regspec] }
/$66/$B9/$11/$00/$00/$00  {  mov    ecx,00000011     }
/$ff/$1e/    >WRMSR       {  call RDMSR              }
);
end;


type ring0codeObj=object codeMem:pointer;
                         codeSelector:word;
                         constructor init;
                         procedure setup;
                         destructor done;
                         end;

const ringer:ring0codeObj=();{ Zero inited }

constructor p5timerobj.init;
begin
     isstarted := false;
     isset     := false;
     isevaluated := false;
     { Default setup is instructions executed in reg 1 }
     {       and v-pipe instructions executed in reg 2 }
     setup(  $16,P5_EVENTS,P5_RING3, $17,P5_EVENTS,P5_RING3 );
end;

function makeCtrlWord( mode:p5countType; monitor:p5RingMonitor ):word;
var retval:word;
begin
     retval := 0;
     case mode of
     P5_EVENTS:retval := $000;
     P5_CYCLES:retval := $100;
     end;
     case monitor of
     P5_RING012  :retval := retval or $040;
     P5_RING3    :retval := retval or $080;
     P5_RING0123 :retval := retval or $0c0;
     end;
     makeCtrlWord := retval;
end;

procedure p5timerobj.setup( idx1:byte; mode1:p5countType; monitor1:p5RingMonitor;
                            idx2:byte; mode2:p5countType; monitor2:p5RingMonitor );
var regspec:longint;
begin
     reg1.mode := mode1;
     reg1.ring := monitor1;
     reg1.ctrl := makeCtrlWord( mode1,monitor1 );
     reg1.idx := idx1;
     reg2.mode := mode2;
     reg2.ring := monitor2;
     reg2.ctrl := makeCtrlWord( mode2,monitor2 );
     reg2.idx := idx2;
     regspec := longint(reg2.idx+reg2.ctrl) shl 16 + (reg1.idx+reg1.ctrl);
     isstarted := false;
     isevaluated := false;
     isset     := true;
     ringer.setup;
     setP5counters( regspec );
end;

procedure p5timerobj.start;
begin
     isstarted := true;
     isevaluated := false;
     startP5counters;
end;

procedure p5timerobj.stop;
begin
     stopP5counters;
     isstarted := false;
     isevaluated := false;
end;

procedure p5timerobj.evaluate;
begin
     reg1.diff := p5endereg1lo-p5starreg1lo;
     if reg1.diff<0 then inc(reg1.diff,$7ffffff);{ wrap around }
     reg2.diff := p5endereg2lo-p5starreg2lo;
     if reg2.diff<0 then inc(reg2.diff,$7ffffff);{ wrap around }
     isevaluated := true;
end;

const longP5RegNames:array[0..$29] of Pchar=(
{ 0}'Data read',
{ 1}'Data write',
{ 2}'Data TLB',
{ 3}'Data read miss',
{ 4}'Data write miss',
{ 5}'Write (hit) to M or E state lines',
{ 6}'Data Cache lines written back',
{ 7}'Data Cache snoops',
{ 8}'Data Cache snoop hits',
{ 9}'Memory Access in both pipes',
{ A}'Bank conflicts',
{ B}'Misaligned Data memory references',
{ C}'Code read',
{ D}'Code TLB miss',
{ E}'Code Cache miss',
{ F}'Any segment register load',
{10}'',
{11}'',
{12}'Branches',
{13}'BTB (branch target buffer) hits',
{14}'Taken branch or BTB hit',
{15}'Pipeline flushes',
{16}'Instructions executed',
{17}'Instructions executed in the v-pipe',
{18}'Bus utilization (clocks)',
{19}'Pipeline stalled by write backup',
{1A}'Pipeline stalled by data memory read',
{1B}'Pipeline stalled by write to E or M line',
{1C}'Locked bus cycle',
{1D}'I/O read or write cycle',
{1E}'Noncacheable memory references',
{1F}'AGI (Addres Generation Interlock)',
{20}'',
{21}'',
{22}'Floatint-point operations',
{23}'Breakpoint 0 match',
{24}'Breakpoint 1 match',
{25}'Breakpoint 2 match',
{26}'Breakpoint 3 match',
{27}'Hardware interupts',
{28}'Data read or data write',
{29}'Data read miss or data write miss'
);
const shortP5RegNames:array[0..$29] of Pchar=(
{ 0}'Data R',
{ 1}'Data W',
{ 2}'Data TLB',
{ 3}'Data R miss',
{ 4}'Data W miss',
{ 5}'Write to M/E',
{ 6}'D.Cache wbacks',
{ 7}'D.Cache snoops',
{ 8}'D.Cache snoop hits',
{ 9}'Mem Access both pipes',
{ A}'Bank conflicts',
{ B}'Misaligned Data',
{ C}'Code read',
{ D}'Code TLB miss',
{ E}'C.Cache miss',
{ F}'Segreg load',
{10}'',
{11}'',
{12}'Branches',
{13}'BTB hits',
{14}'Took or hit',
{15}'Pipe flushes',
{16}'Instructions',
{17}'V-Pipe Instructions',
{18}'Bus clocks',
{19}'P. stalled write',
{1A}'P. stalled read',
{1B}'P. stalled M/E',
{1C}'Locked bus',
{1D}'I/O RW cycs',
{1E}'Noncacheable mem refs',
{1F}'AGIs',
{20}'',
{21}'',
{22}'Flops',
{23}'Bkpt. 0',
{24}'Bkpt. 1',
{25}'Bkpt. 2',
{26}'Bkpt. 3',
{27}'HW irpts.',
{28}'Data RW',
{29}'Data RW miss'
);
const counternames:array[p5countType] of Pchar=(
'Events',
'Cycles');

const monitornames:array[p5RingMonitor] of Pchar=(
'RING 012',
'RING 0123',
'RING 3');

procedure p5timerreg.report( var tx:text; longnames:wordbool );
begin
     write(tx,diff,' ',counternames[mode],' of ');
     if longnames then write(tx,longp5regnames[idx ])
                  else write(tx,shortp5regnames[idx ]);
     write(tx,' occured');
     writeln(tx);
end;


procedure p5timerobj.report(var tx:text);
begin
     if not isevaluated then evaluate;
     reg1.report(tx,false);
     reg2.report(tx,false);
end;
procedure p5timerobj.reportlong(var tx:text);
begin
     if not isevaluated then evaluate;
     reg1.report(tx,true);
     reg2.report(tx,true);
end;

destructor p5timerobj.done;
begin
     ringer.done;
end;


{--------------- ring0codeObj Object Code --------------------------------}


procedure dummy;far;
begin
end;

constructor ring0codeObj.init;
begin
     RDMSR := dummy;
     WRMSR := dummy;
end;

destructor ring0codeObj.done;
var LDT:LDTmesser;
begin
     { Free all that system gunk that we allocated }
     if codeSelector<>0 then begin

        { Free the call gates }
           LDT.init;
           LDT.freeCallGate( @RDMSR );
           LDT.freeCallGate( @WRMSR );
           LDT.done;

        FreeSelector( codeSelector );
        GlobalFree( GlobalHandle( hiword(longint(codeMem) ) ));
        end;

     codeSelector := 0;
     codeMem := nil;
     RDMSR   := dummy;
     WRMSR   := dummy;
end;

{ Code bytes       }
{/$0f/$30   WRMSR  }
{/$0f/$32   RDMSR  }
{ $cb       RETF   }
type byteptr=^byte;
procedure code1( var mp:bytePtr; b1:byte );
begin
     mp^ := b1; inc( mp );
end;
procedure code2( var mp:bytePtr; b1,b2:byte );
begin
     mp^ := b1; inc( mp );
     mp^ := b2; inc( mp );
end;

procedure ring0codeObj.setup;
{ This routine }
{ 1. Allocated memory to hold the RDMSR, WRMSR routines    }
{ 2. Writes (like a compiler) the code into this place     }
{ 2. Gets a code selector for the routines.                }
{ 3. Creates a ring0 call gates for these routines so that }
{    they can execute at ring 0.                           }
const CODESIZE  = 32;
      WRMSR_OFS =  0;
      RDMSR_OFS = 16;
var LDT:LDTmesser;
    codeMem1:byteptr;
begin
     if codeSelector<>0 then exit;{ already did it }

     { Allocate room }
        LDT.init;
        codeMem := GlobalLock( GlobalAlloc( GHND, CODESIZE ) );

     { "Compile" it }
        codeMem1 := codeMem;
        inc(codeMem1,WRMSR_OFS);
        code2( codeMem1, $0f, $30 );{ WRMSR }
        code1( codeMem1, $cb      );{ RETF  }

        codeMem1 := codeMem;
        inc(codeMem1,RDMSR_OFS);
        code2( codeMem1, $0f, $32 );{ RDMSR }
        code1( codeMem1, $cb      );{ RETF  }

     { Get a code selector }
        codeSelector := AllocSelector( 0 );
        Win31.PrestoChangoSelector( hiword(longint(codeMem)), codeSelector );

     { Make the ring 3 addresses, we won't keep them however }
        @WRMSR := ptr( codeSelector, WRMSR_OFS );
        @RDMSR := ptr( codeSelector, RDMSR_OFS );

     { Replace their addresses with call gates for ring 0 execution }
        @WRMSR := LDT.createCallGate16( @WRMSR, 0,0 );
        @RDMSR := LDT.createCallGate16( @RDMSR, 0,0 );

        LDT.done;
end;

procedure testp5clean;
{ This is a test routine that shows how to use the timerobj }
var p5:p5timerobj;
    cycsUsed,instructionsUsed:longint;
begin
     p5.init;

     { Here we specify what is to be timed.                                        }
     {    $16 => Instructions Executed (see longP5names above or BYTE Article)     }
     { We specify that register 1 will count how many cycles are used, whereas     }
     { register 2 will count how many instructions are executed                    }

     p5.setup(  $16,P5_CYCLES,P5_RING3, $16,P5_EVENTS,P5_RING3 );
     p5.start;

     { This is what gets timed, play with it to see how the results changes }
     asm inc ax;
         inc bx;
         inc cx;
         inc dx;
         inc ax;
         inc ax;
         end;

     p5.stop;
     p5.evaluate;
     cycsUsed         := p5.reg1.diff;
     instructionsUsed := p5.reg2.diff;

     p5.report(output);{ produces output on wincrt }

     p5.done;
end;

procedure testp5fast;
{ This is a test routine identical to one above, but using                            }
{ routines directly to hold down the timer overhead being reflected in the statistics }
var p5:p5timerobj;
    cycsUsed,instructionsUsed:longint;
begin
     p5.init;

     p5.setup(  $16,P5_CYCLES,P5_RING3, $16,P5_EVENTS,P5_RING3 );

     startP5counters;

     { This is what gets timed, play with it to see how the results changes  }
     asm inc ax;
         inc bx;
         inc cx;
         inc dx;
         inc ax;
         inc ax;
         end;

     stopP5counters;

     p5.evaluate;
     cycsUsed         := p5.reg1.diff;
     instructionsUsed := p5.reg2.diff;

     p5.report(output);{ produces output on wincrt }

     p5.done;
end;

procedure testp5parallel;
{ This is a test routine that shows how to calculate how full the V-pipe was }
{ i.e. how "parallelizable" your code was on the Pentium.                    }
var p5:p5timerobj;
    cycsUsed,instructionsUsed:longint;
    vpipe,upipe:longint;
    pctpar:real;
begin
     p5.init;

     p5.setup(  $16,P5_EVENTS,P5_RING3, $17,P5_EVENTS,P5_RING3 );

     startP5counters;

     { This is what gets timed, play with it to see how the results changes.  }
     { The following instructions should be completely parallelizable since   }
     { there is no data dependencies and they are v-pipe capable instructions.}
     asm inc ax; inc bx; inc cx; inc dx;
         inc ax; inc bx; inc cx; inc dx;
         inc ax; inc bx; inc cx; inc dx;
         inc ax; inc bx; inc cx; inc dx;
         inc ax; inc bx; inc cx; inc dx;
         inc ax; inc bx; inc cx; inc dx;
         inc ax; inc bx; inc cx; inc dx;
         inc ax; inc bx; inc cx; inc dx;
         inc ax; inc bx; inc cx; inc dx;
         inc ax; inc bx; inc cx; inc dx;
         inc ax; inc bx; inc cx; inc dx;
         inc ax; inc bx; inc cx; inc dx;
         inc ax; inc bx; inc cx; inc dx;
         inc ax; inc bx; inc cx; inc dx;
         inc ax; inc bx; inc cx; inc dx;
         inc ax; inc bx; inc cx; inc dx;
         inc ax; inc bx; inc cx; inc dx;
         inc ax; inc bx; inc cx; inc dx;
         inc ax; inc bx; inc cx; inc dx;
         inc ax; inc bx; inc cx; inc dx;
         inc ax; inc bx; inc cx; inc dx;
         inc ax; inc bx; inc cx; inc dx;
         end;

     stopP5counters;

     p5.evaluate;

     vpipe := p5.reg2.diff;
     upipe := p5.reg1.diff-vpipe;

     pctpar := 100*vpipe/upipe;

     p5.report(output);{ produces output on wincrt }

     writeln;
     writeln('U-Pipe instructions:',upipe);
     writeln('V-Pipe instructions:',vpipe);
     writeln('Parallelized:',pctpar:5:1,' %');

     p5.done;
end;

begin
     ringer.init;
end.