git.gag.com Git - fw/sdcc/blob - doc/choices.txt

   1 Some of the implementation choices
   2 ----------------------------------
   3
   4 gbz80:
   5
   6 Load from direct space:
   7   Alternatives:
   8   1.  Via HL
   9         ld hl,#dir
  10         ld x,(hl)
  11         inc hl
  12         ld y,(hl)
  13   2.  Via a
  14         ld a,(dir)
  15         ld x,a
  16         ld a,(dir+1)
  17         ld y,a
  18   1 is bad when x or y involve HL (1b)
  19                                         8       16      32
  20      1 = 12 + n*(8+8) - 8               20      36      68
  21      1b = n*(12+12+8)                   32      64      128
  22      2 = n*(16+4)                       20      40      80
  23   So choose 2.
  24
  25   Hmm.  (2) is too hard to support in the current model.
  26
  27 On stack word push
  28    1.    lda  hl,x(sp)
  29          ld   a,(hl+)
  30          ld   h,(hl)
  31          ld   l,a
  32          push hl
  33    2.    lda  hl,x(sp)
  34          ld   e,(hl)
  35          inc  hl
  36          ld   d,(hl)
  37    1 = d + 8 + 8 + 4
  38    2 = d + 8 + 8 + 8
  39
  40 Structure member get:
  41    Normally fetch pair
  42    Then add pair and constant with result in hl
  43
  44    ld   l,c     ; 4
  45    ld   h,b     ; 4
  46    inc  hl ..   ; 6     = 8 + 6n
  47 or
  48    ld   l,c     ; 4
  49    ld   h,b     ; 4
  50    ld   a,#0x06 ; 7
  51    add  a,c     ; 4
  52    ld   l,a     ; 4
  53    ld   a,#0x00 ; 7
  54    adc  a,b     ; 4
  55    ld   h,a     ; 4     = 38
  56 alt: (only when result=hl and left, right = pair, const)
  57    ld      hl,#const    ; 10
  58    add     hl,pair      ; 11    = 21
  59
  60 So (1) is best for n <= 2, (2) is just bad, (3) is good for n > 2
  61
  62 How about:
  63     pair = pair + constant:
  64 1:
  65     ld  a,#0x08 ; 7
  66     add a,c     ; 4
  67     ld  c,a     ; 4
  68     ld  a,#0x00 ; 7
  69     adc a,b     ; 4
  70     ld  b,a     ; 4     = 30
  71 2:
  72         ld      hl,#const       ; 10
  73         add     hl,pair         ; 11
  74         ld      c,l             ; 4
  75         ld      b,h             ; 4     = 29
  76 One cycle.  If I cache HL later it will throw away the advantage.  Choose 1.
  77
  78 PlusIncr on pairs:
  79 1:
  80          inc    pair            ; 6     = 6n
  81 2:
  82         ld      a,#0x04         ; 7
  83         add     a,c             ; 4
  84         ld      c,a             ; 4
  85         ld      a,#0x00         ; 7
  86         adc     a,b             ; 4
  87         ld      b,a             ; 4     = 30
  88 So for n <= 5, (1) is better.
  89
  90 Frame pointer:
  91 It's nice to use HL as the temp register, but what if I used it as the
  92 frame pointer instead of ix?
  93
  94 Instead of:
  95         ld      e,5(ix)         ; 19
  96         ld      d,6(ix)         ; 19    = 38
  97
  98         ld      hl,#5           ; 10
  99         add     hl,sp           ; 11
 100         ld      e,(hl)          ; 7
 101         inc     hl              ; 6
 102         ld      d,(hl)          ; 7     = 41
 103
 104 Things get better when you access the same set repeatedly, as you get rid
 105 of the setup.  But they get worse when both ops are on the stack/in
 106 direct space.  Easiest this way for now.  iy may benefit...
 107
 108 cmpGt:
 109         ld      l,#0x80         ; 7
 110         ld      a,-1(ix)        ; 19
 111         xor     a,#0x80         ; 7
 112         ld      h,a             ; 4
 113         ld      a,#0x02         ; 7
 114         sub     a,-2(ix)        ; 19
 115         ld      a,l             ; 4
 116         sbc     a,h             ; 4 = 71
 117
 118 vs
 119         ld      hl,0x8002       ; 10
 120         ld      a,-2(ix)        ; 19
 121         xor     0x80            ; 7
 122         ld      d,a             ; 4
 123         ld      e,-1(ix)        ; 19
 124         sbc     hl,de           ; 15 = 74
 125
 126 Why is there the whole xor thing going on?
 127
 128 cmpGt using sub:
 129         left    right   l-r     c       expect
 130         0       0       0       0       false
 131         -1      0       -1      0       false
 132         1       0       1       0       true
 133         0       -1      1       1       true
 134         0       1       -1      1       false
 135
 136 With top most bits xored
 137         80h     80h     0       0       false
 138         7fh     80h     FFh     1       false
 139         81h     80h     01h     0       true
 140         80h     7fh     01h     0       true
 141         80h     81h     FFh     1       false
 142
 143 r-l instead - ah.
 144         80h     80h     0       0       false
 145         7fh     80h     01h     0       false
 146         81h     80h     FFh     1       true
 147         80h     7fh     FFh     1       true
 148         80h     81h     01h     0       false
 149
 150 How about using the sign bit and no XOR on r-l?
 151         0       0        0              false
 152         FFh     0        01h            false
 153         01h     0        FFh            true
 154         0       FFh      FFh            true
 155         0       01h      01h            false  - works
 156
 157 cmpEq:
 158         ld      hl,#nn          ; 10
 159         ld      c,(ix+-5)       ; 19
 160         ld      b,(ix+-4)       ; 19
 161         or      a               ; 4
 162         sbc     hl,bc           ; 15
 163         jp      nz,l19          ; 10 = 77
 164
 165         ld      a,-82(ix)       ; 19
 166         cp      a,#0x01         ; 7
 167         jp      nz,00129$       ; 10
 168         ld      a,-81(ix)       ; 19
 169         or      a,a             ; 7
 170         jp      nz,00129$       ; 10 = 72
 171
 172 Add:
 173         ld      a,c             ; 4
 174         add     a,#0x04         ; 7
 175         ld      -4(ix),a        ; 19
 176         ld      a,b             ; 4
 177         adc     a,#0x00         ; 7
 178         ld      -3(ix),a        ; 19 = 60
 179 vs
 180         ld      hl,#4           ; 10
 181         add     hl,bc           ; 11
 182         ld      -4(ix),l        ; 19
 183         ld      -3(ix),h        ; 19 = 59
 184
 185 Same argument as above - not worth the extra cycle.
 186
 187 Pending optimisations:
 188         iTemp1 = @iTemp2
 189         iTemp3 = iTemp1
 190
 191         iTemp4 = something in direct space
 192         ...
 193         push    iTemp4
 194
 195 Swaps:
 196         ld      hl,bc           ; 8
 197         ld      bc,de           ; 8
 198         ld      de,hl           ; 8
 199
 200 vs
 201         push    bc              ; 11
 202         ld      bc,de           ; 8
 203         pop     de              ; 11
 204
 205 Swaps 2:
 206         ld      a,h
 207         ld      h,b
 208         ld      b,a
 209         ld      a,l
 210         ld      l,c
 211         ld      c,a             ; 6*4 = 24
 212
 213 Cleaning up the arguments to a call:
 214          ld     iy,#n           ; 14
 215          add    iy,sp           ; 15
 216          ld     sp,iy           ; 10 = 39
 217
 218          pop    af              ; 5/byte
 219
 220
 221 So for 8 bytes and above use the first form.
 222
 223 Pointer assign:
 224         ld      hl,bc           ; 4+4
 225         ld      e,(hl)          ; 7
 226         inc     hl              ; 6
 227         ld      d,(hl)          ; 7
 228
 229 vs:
 230         ld      a,(bc)          ; 7
 231         ld      e,a             ; 4
 232         inc     bc              ; 6
 233         ld      a,(bc)          ; 7
 234         ld      d,a             ; 4
 235
 236 Same cost.  Not worth it, although it does free up HL.
 237
 238 Shift left signed on HL
 239       sla  l
 240       rl   h                    ; 8+8 = 16
 241
 242       add  hl,hl                ; 11