From a71e3529acf4739d5c8590d2507cd0929efb52a1 Mon Sep 17 00:00:00 2001
From: michaelh <michaelh@4a8a32a2-be11-0410-ad9d-d568d2c75423>
Date: Mon, 27 Aug 2001 06:49:01 +0000
Subject: [PATCH] Optimised mul, added asm string functions

git-svn-id: https://sdcc.svn.sourceforge.net/svnroot/sdcc/trunk/sdcc@1179 4a8a32a2-be11-0410-ad9d-d568d2c75423
---
 device/include/asm/z80/features.h |   7 ++
 device/lib/_memcpy.c              |   3 +
 device/lib/_strcmp.c              |   5 +
 device/lib/_strcpy.c              |   4 +
 device/lib/z80/Makefile           |   9 +-
 device/lib/z80/README             |   5 +
 device/lib/z80/asm_strings.s      | 190 ++++++++++++++++--------------
 device/lib/z80/mul.s              |  43 +++----
 src/z80/gen.c                     |  74 ++++++++----
 src/z80/profile.txt               |  49 ++++++++
 10 files changed, 256 insertions(+), 133 deletions(-)

diff --git a/device/include/asm/z80/features.h b/device/include/asm/z80/features.h
index 616eb02a..28866ec1 100644
--- a/device/include/asm/z80/features.h
+++ b/device/include/asm/z80/features.h
@@ -9,4 +9,11 @@
 #define _SDCC_MANGLES_SUPPORT_FUNS	1
 #define _SDCC_Z80_STYLE_LIB_OPT		1
 
+/* Set to 1 if the port supplies its own asm version, which compiles out
+ * the generic C one.  Use 0 for a more authentic dhrystone test. */
+#define _SDCC_PORT_PROVIDES_MEMCPY	1
+#define _SDCC_PORT_PROVIDES_STRCMP	1
+/* Register allocator is as good as hand coded asm.  Cool. */
+#define _SDCC_PORT_PROVIDES_STRCPY	0
+
 #endif
diff --git a/device/lib/_memcpy.c b/device/lib/_memcpy.c
index ce087f2f..ec1ba04f 100644
--- a/device/lib/_memcpy.c
+++ b/device/lib/_memcpy.c
@@ -24,6 +24,8 @@
 #include "string.h" 
 #include <sdcc-lib.h>
 
+#if !_SDCC_PORT_PROVIDES_MEMCPY
+
 #define NULL (void *)0
 
 #pragma NOINDUCTION
@@ -72,3 +74,4 @@ void _generic * memcpy (
 	return(ret);
 #endif
 }
+#endif
diff --git a/device/lib/_strcmp.c b/device/lib/_strcmp.c
index 40f1e3d0..571f6961 100644
--- a/device/lib/_strcmp.c
+++ b/device/lib/_strcmp.c
@@ -24,6 +24,8 @@
 #include "string.h" 
 #include <sdcc-lib.h>
 
+#if !_SDCC_PORT_PROVIDES_STRCMP
+
 #define NULL (void *)0
 
 int strcmp (
@@ -56,3 +58,6 @@ int strcmp (
 	return( ret );
 #endif
 }
+
+#endif
+
diff --git a/device/lib/_strcpy.c b/device/lib/_strcpy.c
index 36ed8160..6c2f9434 100644
--- a/device/lib/_strcpy.c
+++ b/device/lib/_strcpy.c
@@ -24,6 +24,8 @@
 #include "string.h" 
 #include <sdcc-lib.h>
 
+#if !_SDCC_PORT_PROVIDES_STRCPY
+
 #define NULL (void *)0
 
 char _generic *strcpy (
@@ -45,3 +47,5 @@ char _generic *strcpy (
     return d;
 #endif
 }
+
+#endif
diff --git a/device/lib/z80/Makefile b/device/lib/z80/Makefile
index 7cae7cf0..de056322 100644
--- a/device/lib/z80/Makefile
+++ b/device/lib/z80/Makefile
@@ -5,10 +5,14 @@ TOPDIR = ../../..
 SCC = $(TOPDIR)/bin/sdcc -mz80
 SAS = $(TOPDIR)/bin/as-z80
 
-OBJ = div.o mul.o putchar.o printf.o shift.o stubs.o # asm_strings.o string.s
+OBJ = div.o mul.o putchar.o printf.o shift.o stubs.o \
+	asm_strings.o
+
 LIB = z80.lib
 CC = $(SCC)
 AS = $(SAS)
+ASFLAGS = -plosgff
+
 CFLAGS = -I../../include -I.
 
 all: $(LIB) crt0.o
@@ -20,6 +24,9 @@ $(LIB): $(OBJ) Makefile _dummy
 .c.o:
 	$(CC) $(CFLAGS) -c $<
 
+.s.o:
+	$(AS) $(ASFLAGS) $@ $<
+
 _dummy:
 
 clean:
diff --git a/device/lib/z80/README b/device/lib/z80/README
index a34ea453..259ca6dc 100644
--- a/device/lib/z80/README
+++ b/device/lib/z80/README
@@ -2,3 +2,8 @@ sdcc/device/lib/z80
 -------------------
 
 Z80 specific routines.
+
+Notes:
+* Cost of ld r,(ix+n):	19
+* Cost of ld r,(hl); inc hl: 7+6 = 13 and you don't have to pop ix
+        
\ No newline at end of file
diff --git a/device/lib/z80/asm_strings.s b/device/lib/z80/asm_strings.s
index fd263927..a6527fb9 100644
--- a/device/lib/z80/asm_strings.s
+++ b/device/lib/z80/asm_strings.s
@@ -3,100 +3,118 @@
 
 	;; Why - because I want a better dhrystone score :)
 
+        ;; strcpy is disabled as the C version is almost as good.
+        ;; Just the setup and return is slower.
+        .if 0
 ; char *strcpy(char *dest, const char *source)
 _strcpy::
-	push	de
-	push	ix
-	ld	ix,#0
-	add	ix,sp
-	ld	l,6(ix)
-	ld	h,7(ix)
-	ld	e,8(ix)
-	ld	d,9(ix)
-
-	push	hl
-1$:	
-	ld	a,(de)
-	ld	(hl),a
-	inc	hl
-	inc	de
-	or	a,a
-	jr	nz,1$
-
-	pop	hl
-	pop	ix
-	pop	de
-	ret
+        ;; Fall through to the correct type
+__strcpy_rrf_s::
+        ld      a,#5
+        rst     0x08
+__strcpy_rrx_s::
+        ld      hl,#2
+        add     hl,sp
+        ld      e,(hl)          ; DE = dest (first argument)
+        inc     hl
+        ld      d,(hl)
+        inc     hl
+        ld      c,(hl)          ; BC = source (second argument)
+        inc     hl
+        ld      b,(hl)
+        ld      l,e             ; Return value is dest, not source
+        ld      h,d
+1$:     ld      a,(bc)          ; *de = *bc, including the terminator
+        ld      (de),a
+        inc     bc              ; 16 bit inc leaves the flags alone
+        inc     de
+        or      a               ; Stop once the zero byte has been copied
+        jp      nz,1$
 
+        ret
+        ;; Notes on strcpy styles:
+        ;;   *de = *hl; hl++; de++; or a; ret z; jp - slower as jp is
+        ;; same cost as conditional jump, so condition on ret is more expensive.
+        ;;   *de = *bc; bc++; de++; or a, jp nz - OK
+        ;; Can't use LDI as need to check for end of string.
+        ;; Above also matches the z88dk version.
+        .endif
+                
 ; void *memcpy(void *dest, const void *source, int count)
-_memcpy::
-	push	de
-	push	bc
-	push	ix
-	ld	ix,#0
-	add	ix,sp
-	ld	l,8(ix)
-	ld	h,9(ix)
-	ld	e,10(ix)
-	ld	d,11(ix)
-	ld	c,12(ix)
-	ld	b,13(ix)
+_memcpy::       
+        ;; Fall through to correct type
+__memcpy_rrf_s::                
+        ld      a,#5
+        rst     0x08
+__memcpy_rrx_s::       
+        ;; Using LDIR:  do; *DE = *HL; HL++; DE++; BC--; while BC != 0
+        ;; All registers are already saved.
+        ld      hl,#2
+        add     hl,sp
+        ld      e,(hl)          ; DE = dest
+        inc     hl
+        ld      d,(hl)
+        inc     hl
+        ld      a,(hl)          ; A = low byte of source
+        inc     hl
+        ld      b,(hl)          ; B = high byte of source
+        inc     hl
+        ld      c,(hl)          ; C = low byte of count
+        inc     hl
+        ld      h,(hl)          ; H = high byte of count
+        ld      l,a             ; Shuffle so that HL = source, BC = count
+        ld      a,h
+        ld      h,b
+        ld      b,a             ; A still holds the high byte of count
 
-	inc	b
-	inc	c
-	push	hl
+        ;; Pending: could optimise this check to occur earlier.
+        push    de              ; Keep dest: ldir moves DE and HL past it
+        or      c               ; A | C == 0 only when count is zero
+        jr      z,1$            ; ldir with BC = 0 would copy 64k
 
-	jr	2$
-1$:
-	ld	a,(de)
-	ld	(hl),a
-	inc	de
-	inc	hl
-2$:
-	dec	c
-	jr	nz,1$
-	dec	b
-	jr	nz,1$	
-
-	pop	hl
-	pop	ix
-	pop	bc
-	pop	de
-	ret
+        ldir
+1$:     pop     hl              ; Return dest on both paths
+        ret
 
 ; int strcmp(const char *s1, const char *s2) 
 _strcmp::
-	push	de
-	push	ix
-	ld	ix,#0
-	add	ix,sp
-	ld	e,6(ix)
-	ld	d,7(ix)
-	ld	l,8(ix)
-	ld	h,9(ix)
+        ;; Fall through to the correct style
+        ;; Fall through to correct type
+__strcmp_rrf_s::                
+        ld      a,#5
+        rst     0x08
+__strcmp_rrx_s::       
+        ld      hl,#2
+        add     hl,sp
+        
+        ld      e,(hl)
+        inc     hl
+        ld      d,(hl)
+        inc     hl
+        ld      a,(hl)
+        inc     hl
+        ld      h,(hl)
+        ld      l,a
+        
+1$:     
+        ld      a,(de)
+        sub     (hl)
+
+        ;; Normally not taken, so use a jr (12/7) instead of jp (10)
+        jr      nz,2$
 
-	jr	1$
-2$:	
-	ld	a,(de)
-	sub	(hl)
-	jr	nz,4$
-	;; A == 0
-	cp	(hl)
-	jr	z,3$
-1$:	
-	inc	de
-	inc	hl
-	jr	2$
+        ;; A == 0
+        cp      (hl)
 
-3$:
-	ld	hl,#0
-	jr	5$
-4$:
-	ld	hl,#1
-	jr	nc,5$
-	ld	hl,#-1
-5$:
-	pop	ix
-	pop	de
-	ret
-	
\ No newline at end of file
+        inc     de
+        inc     hl
+        ;; Normally taken.  Flag from the cp above.
+        jp      nz,1$
+2$:     
+        ;; Extend with the borrow from the sub (clear on the equal path),
+        ;; not bit 7 of A, which is wrong once bytes differ by over 127.
+        ld      l,a
+        sbc     a
+        ld      h,a
+        ret
+        
diff --git a/device/lib/z80/mul.s b/device/lib/z80/mul.s
index 4f3e793a..7e7cceff 100644
--- a/device/lib/z80/mul.s
+++ b/device/lib/z80/mul.s
@@ -79,25 +79,26 @@ __muluint_rrx_hds::
 	;; Register used: AF,BC,DE,HL
 .mul16:
 .mulu16:
-	LD	HL,#0x00	; Product = 0
-	LD	A,#15		; Count = bit length - 1
-	;; Shift-and-add algorithm
-	;; If MSB of multiplier is 1, add multiplicand to partial product
-	;; Shift partial product, multiplier left 1 bit
-.mlp:
-	SLA	E		; Shift multiplier left 1 bit
-	RL	D
-	jp	NC,.mlp1	; Jump if MSB of multiplier = 0
-	ADD	HL,BC		; Add multiplicand to partial product
-.mlp1:
-	ADD	HL,HL		; Shift partial product left
-	DEC	A
-	jp	NZ,.mlp		; Continue until count = 0
-	;; Add multiplicand one last time if MSB of multiplier is 1
-	BIT	7,D		; Get MSB of multiplier
-	JR	Z,.mend		; Exit if MSB of multiplier is 0
-	ADD	HL,BC		; Add multiplicand to product
-.mend:
-				; HL = result
-	ret
+        ld      hl,#0           ; HL = running product
+        ld      a,b             ; A:C = value whose bits are tested
+        ; ld c,c
+        ld      b,#16           ; B = pass count for djnz, one per bit
 
+        ;; Optimise for the case when this side has 8 bits of data or
+        ;; less.  This is often the case with support address calls.
+        or      a               ; High byte zero?
+        jp      nz,1$
+        
+        ld      b,#8            ; Only eight passes are needed
+        ld      a,c             ; Test the low byte's bits directly
+1$:
+        ;; Taken from z88dk, which originally borrowed from the
+        ;; Spectrum rom.
+        add     hl,hl           ; Shift the product left
+        rl      c               ; Shift A:C left, top bit of A into carry
+        rla                     ;DLE 27/11/98
+        jr      nc,2$           ; Bit clear: nothing to add this pass
+        add     hl,de           ; Bit set: add DE into the product
+2$:     
+        djnz    1$              ; B--, loop until every bit is done
+        ret
diff --git a/src/z80/gen.c b/src/z80/gen.c
index 0482186c..3ed83308 100644
--- a/src/z80/gen.c
+++ b/src/z80/gen.c
@@ -1,6 +1,33 @@
 /*-------------------------------------------------------------------------
   gen.c - Z80 specific code generator.
+     
+  Michael Hope <michaelh@juju.net.nz> 2000
+  Based on the mcs51 generator -
+      Sandeep Dutta . sandeep.dutta@usa.net (1998)
+   and -  Jean-Louis VERN.jlvern@writeme.com (1999)
+
+  This program is free software; you can redistribute it and/or modify it
+  under the terms of the GNU General Public License as published by the
+  Free Software Foundation; either version 2, or (at your option) any
+  later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
 
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+
+  In other words, you are welcome to use, share and improve this program.
+  You are forbidden to forbid anyone else to use, share and improve
+  what you give them.   Help stamp out software-hoarding!
+
+-------------------------------------------------------------------------*/
+
+/*
   Benchmarks on dhry.c 2.1 with 32766 loops and a 10ms clock:
                                        ticks dhry  size
   Base with asm strcpy / strcmp / memcpy: 23198 141 1A14
@@ -29,32 +56,29 @@
   5. Optimised strcmp further		21660 151 228C
   6. Optimised memcpy by unroling	20885 157 2201
   7. After turning loop induction on	19862 165 236D
+  8. Same as 7 but with more info	
+  9. With asm optimised strings		17030 192 2223
 
-  Michael Hope <michaelh@juju.net.nz> 2000
-  Based on the mcs51 generator -
-      Sandeep Dutta . sandeep.dutta@usa.net (1998)
-   and -  Jean-Louis VERN.jlvern@writeme.com (1999)
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms of the GNU General Public License as published by the
-  Free Software Foundation; either version 2, or (at your option) any
-  later version.
-
-  This program is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
-
-
-  You should have received a copy of the GNU General Public License
-  along with this program; if not, write to the Free Software
-  Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
-
-  In other words, you are welcome to use, share and improve this program.
-  You are forbidden to forbid anyone else to use, share and improve
-  what you give them.   Help stamp out software-hoarding!
-
--------------------------------------------------------------------------*/
+  10 and below are with asm strings off.
+  
+  Apparent advantage of turning on regparams:
+  1.  Cost of push
+        Decent case is push of a constant 
+          - ld hl,#n; push hl: (10+11)*nargs
+  2.  Cost of pull from stack
+        Using asm with ld hl, etc
+          - ld hl,#2; add hl,sp; (ld bc,(hl); hl+=2)*nargs
+            10+11+(7+6+7+6)*nargs
+  3.  Cost of fixing stack
+          - pop hl*nargs
+            10*nargs
+  
+  So cost is (10+11+7+6+7+10)*nargs+10+11 
+      = 51*nargs+21
+      = 123 for mul, div, strcmp, strcpy
+  Saving of (98298+32766+32766+32766)*123 = 24181308
+  At 192 d/s for 682411768t, speed up to 199.  Hmm.
+*/
 
 #include <stdio.h>
 #include <stdlib.h>
diff --git a/src/z80/profile.txt b/src/z80/profile.txt
index 3f076e3b..7c05ec00 100644
--- a/src/z80/profile.txt
+++ b/src/z80/profile.txt
@@ -112,3 +112,52 @@ __divulong 72840 0.01
 _memcpy 80800956 10.16
 _strcmp 97216722 12.22
 ; 795663339 t-states
+
+-- 8
+; Function total-ticks total-calls ticks-per-call total-percent
+0000 71 0 0 0.00
+_main 122823011 1 122823011 15.45
+_Proc_1 69267324 32766 2114 8.71
+_Proc_2 16514064 32766 504 2.08
+_Proc_3 13368528 32766 408 1.68
+_Proc_4 11009376 32766 336 1.38
+_Proc_5 4914900 32766 150 0.62
+_Proc_6 18840450 32766 575 2.37
+_Func_1 13466826 98298 137 1.69
+_Func_3 5308092 32766 162 0.67
+_Proc_7 22313646 98298 227 2.81
+_Proc_8 83553300 32766 2550 10.51
+_Func_2 32438340 32766 990 4.08
+_strcpy 46497792 32768 1419 5.85
+__mulsint_rrf_s 88861392 98298 904 11.18
+__divsint_rrf_s 67760088 32766 2068 8.52
+__rlulong_rrf_s 13056 128 102 0.00
+__divulong 52549 2 26274 0.01
+_memcpy 80800956 32766 2466 10.16
+_strcmp 97216722 32766 2967 12.23
+; 795020510 t-states
+; So the lib functions except mul, div etc take 28% of the time.
+
+-- 9
+; Function total-ticks total-calls ticks-per-call total-percent
+0000 71 0 0 0.00
+_main 122864113 1 122864113 18.00
+_Proc_1 96102678 32766 2933 14.08
+_Proc_2 16514064 32766 504 2.42
+_Proc_3 13368528 32766 408 1.96
+_Proc_4 11009376 32766 336 1.61
+_Proc_5 4914900 32766 150 0.72
+_Proc_6 18840450 32766 575 2.76
+_Func_1 13466826 98298 137 1.97
+_Func_3 5308092 32766 162 0.78
+_Proc_7 22313646 98298 227 3.27
+_Proc_8 83553300 32766 2550 12.24
+_Func_2 32438340 32766 990 4.75
+_strcpy 46497792 32768 1419 6.81
+__mulsint_rrf_s 88861392 98298 904 13.02
+__divsint_rrf_s 67760088 32766 2068 9.93
+__rlulong_rrf_s 13056 128 102 0.00
+__divulong 52213 2 26106 0.01
+__memcpy_rrf_s 3702558 32766 113 0.54
+__strcmp_rrf_s 34830258 32766 1063 5.10
+; 682411768 t-states
-- 
2.47.2