Some of the implementation choices
----------------------------------

gbz80:

Load from direct space:
  Alternatives:
  1.  Via HL
	ld hl,#dir
	ld x,(hl)
	inc hl
	ld y,(hl)
  2.  Via a
  	ld a,(dir)
	ld x,a
	ld a,(dir+1)
	ld y,a
  1 is bad when x or y involve HL (1b)
  					8	16	32
     1 = 12 + n*(8+8) - 8  		20	36	68
     1b = n*(12+12+8)			32	64	128
     2 = n*(16+4)			20	40	80
  So choose 2.

  Hmm.  (2) is too hard to support in the current model.

On stack word push
   1.	 lda  hl,x(sp)
	 ld   a,(hl+)
	 ld   h,(hl)
	 ld   l,a
	 push hl
   2.	 lda  hl,x(sp)
	 ld   e,(hl)
	 inc  hl
	 ld   d,(hl)
	 push de
   1 = d + 8 + 8 + 4
   2 = d + 8 + 8 + 8

Structure member get:
   Normally fetch pair
   Then add pair and constant with result in hl

   ld	l,c	; 4
   ld	h,b	; 4
   inc  hl ..	; 6	= 8 + 6n
or
   ld	l,c	; 4
   ld	h,b	; 4
   ld	a,#0x06	; 7
   add	a,c	; 4
   ld	l,a	; 4
   ld	a,#0x00 ; 7
   adc	a,b	; 4
   ld	h,a	; 4	= 38
alt: (only when result=hl and left, right = pair, const)
   ld	   hl,#const	; 10
   add	   hl,pair	; 11	= 21

So (1) is best for n <= 2, (2) is just bad, (3) is good for n > 2

How about:
    pair = pair + constant:
1:
    ld	a,#0x08	; 7
    add	a,c	; 4
    ld	c,a	; 4
    ld	a,#0x00	; 7
    adc	a,b	; 4
    ld	b,a	; 4	= 30
2:
	ld	hl,#const	; 10
	add	hl,pair		; 11
	ld	c,l		; 4
	ld	b,h		; 4	= 29
One cycle.  If I cache HL later it will throw away the advantage.  Choose 1.

PlusIncr on pairs:
1:
	 inc	pair		; 6 	= 6n
2:
	ld	a,#0x04		; 7
	add	a,c		; 4
	ld	c,a		; 4
	ld	a,#0x00		; 7
	adc	a,b		; 4
	ld	b,a		; 4 	= 30
So for n <= 5, (1) is at least as good (the two cost the same at n = 5).

Frame pointer:
It's nice to use HL as the temp register, but what if I used it as the
frame pointer instead of ix?

Instead of:
	ld	e,5(ix)		; 19
	ld	d,6(ix)		; 19	= 38

	ld	hl,#5		; 10
	add	hl,sp		; 11
	ld	e,(hl)		; 7
	inc	hl		; 6
	ld	d,(hl)		; 7	= 41

Things get better when you access the same set over and over, as you get rid
of the setup.  But they get worse when both ops are on the stack/in
direct space.  Easiest this way for now.  iy may benefit...

cmpGt:
	ld	l,#0x80		; 7
	ld	a,-1(ix)        ; 19
	xor	a,#0x80		; 7
	ld	h,a		; 4
	ld	a,#0x02		; 7
	sub	a,-2(ix)	; 19
	ld	a,l		; 4
	sbc	a,h		; 4 = 71

vs
        ld      hl,0x8002	; 10
        ld      a,-2(ix)	; 19
        xor     0x80		; 7
        ld      d,a		; 4
        ld      e,-1(ix)	; 19
        sbc     hl,de		; 15 = 74

Why is there the whole xor thing going on?

cmpGt using sub:
        left    right   l-r  	c       expect
        0       0       0       0       false
        -1      0       -1	0       false
        1       0       1       0       true
        0       -1      1       1       true
        0       1       -1      1       false

With top most bits xored
        80h     80h     0       0	false
        7fh     80h     FFh     1       false
        81h     80h     01h     0       true
        80h     7fh     01h     0       true
        80h     81h     FFh     1       false

r-l instead - ah.
        80h     80h     0       0	false
        7fh     80h     01h     0       false
        81h     80h     FFh     1       true
        80h     7fh     FFh     1       true
        80h     81h     01h     0       false

How about using the sign bit and no XOR on r-l?
        0       0        0              false
        FFh     0   	 01h            false
        01h     0        FFh            true
        0       FFh      FFh            true
        0       01h      01h            false  - works

cmpEq:
        ld      hl,#nn		; 10
	ld	c,(ix+-5)	; 19
	ld	b,(ix+-4)	; 19
	or	a		; 4
	sbc	hl,bc		; 15
	jp	nz,l19          ; 10 = 77

	ld	a,-82(ix)	; 19
	cp	a,#0x01		; 7
	jp	nz,00129$	; 10
	ld	a,-81(ix)	; 19
	or	a,a		; 4
	jp	nz,00129$	; 10 = 69

Add:
	ld	a,c             ; 4
	add	a,#0x04         ; 7
	ld	-4(ix),a        ; 19
	ld	a,b             ; 4
	adc	a,#0x00         ; 7
	ld	-3(ix),a        ; 19 = 60
vs
        ld      hl,#4           ; 10
        add     hl,bc           ; 11
        ld      -4(ix),l        ; 19
        ld      -3(ix),h        ; 19 = 59

Same argument as above - not worth the extra cycle.

Pending optimisations:
        iTemp1 = @iTemp2
        iTemp3 = iTemp1

        iTemp4 = something in direct space
        ...
        push    iTemp4

Swaps:
        ld      hl,bc           ; 8
        ld      bc,de           ; 8
        ld      de,hl           ; 8

vs
        push    bc		; 11
        ld      bc,de           ; 8
        pop     de              ; 10

Swaps 2:
        ld      a,h
        ld      h,b
        ld      b,a
        ld      a,l
        ld      l,c
        ld      c,a             ; 6*4 = 24

Cleaning up the arguments to a call:
         ld     iy,#n           ; 14
         add    iy,sp           ; 15
         ld     sp,iy           ; 10 = 39

         pop    af              ; 5/byte


So for 8 bytes and above use the first form.

Pointer assign:
        ld      hl,bc           ; 4+4
        ld      e,(hl)          ; 7
        inc     hl              ; 6
        ld      d,(hl)          ; 7

vs:
        ld      a,(bc)          ; 7
        ld      e,a             ; 4
        inc     bc              ; 6
        ld      a,(bc)          ; 7
        ld      d,a             ; 4

Same cost.  Not worth it, although it does free up HL.

Shift left signed on HL
      sla  l
      rl   h                    ; 8+8 = 16

      add  hl,hl                ; 11