From a71e3529acf4739d5c8590d2507cd0929efb52a1 Mon Sep 17 00:00:00 2001 From: michaelh Date: Mon, 27 Aug 2001 06:49:01 +0000 Subject: [PATCH] Optimised mul, added asm string functions git-svn-id: https://sdcc.svn.sourceforge.net/svnroot/sdcc/trunk/sdcc@1179 4a8a32a2-be11-0410-ad9d-d568d2c75423 --- device/include/asm/z80/features.h | 7 ++ device/lib/_memcpy.c | 3 + device/lib/_strcmp.c | 5 + device/lib/_strcpy.c | 4 + device/lib/z80/Makefile | 9 +- device/lib/z80/README | 5 + device/lib/z80/asm_strings.s | 190 ++++++++++++++++-------------- device/lib/z80/mul.s | 43 +++---- src/z80/gen.c | 74 ++++++++---- src/z80/profile.txt | 49 ++++++++ 10 files changed, 256 insertions(+), 133 deletions(-) diff --git a/device/include/asm/z80/features.h b/device/include/asm/z80/features.h index 616eb02a..28866ec1 100644 --- a/device/include/asm/z80/features.h +++ b/device/include/asm/z80/features.h @@ -9,4 +9,11 @@ #define _SDCC_MANGLES_SUPPORT_FUNS 1 #define _SDCC_Z80_STYLE_LIB_OPT 1 +/* A value of 1 means the port supplies its own asm routine and the generic C version in device/lib is compiled out; set these to 0 to make the dhrystone test more authentic. + */ +#define _SDCC_PORT_PROVIDES_MEMCPY 1 +#define _SDCC_PORT_PROVIDES_STRCMP 1 +/* Register allocator is as good as hand coded asm. Cool. 
*/ +#define _SDCC_PORT_PROVIDES_STRCPY 0 + #endif diff --git a/device/lib/_memcpy.c b/device/lib/_memcpy.c index ce087f2f..ec1ba04f 100644 --- a/device/lib/_memcpy.c +++ b/device/lib/_memcpy.c @@ -24,6 +24,8 @@ #include "string.h" #include +#if !_SDCC_PORT_PROVIDES_MEMCPY + #define NULL (void *)0 #pragma NOINDUCTION @@ -72,3 +74,4 @@ void _generic * memcpy ( return(ret); #endif } +#endif diff --git a/device/lib/_strcmp.c b/device/lib/_strcmp.c index 40f1e3d0..571f6961 100644 --- a/device/lib/_strcmp.c +++ b/device/lib/_strcmp.c @@ -24,6 +24,8 @@ #include "string.h" #include +#if !_SDCC_PORT_PROVIDES_STRCMP + #define NULL (void *)0 int strcmp ( @@ -56,3 +58,6 @@ int strcmp ( return( ret ); #endif } + +#endif + diff --git a/device/lib/_strcpy.c b/device/lib/_strcpy.c index 36ed8160..6c2f9434 100644 --- a/device/lib/_strcpy.c +++ b/device/lib/_strcpy.c @@ -24,6 +24,8 @@ #include "string.h" #include +#if !_SDCC_PORT_PROVIDES_STRCPY + #define NULL (void *)0 char _generic *strcpy ( @@ -45,3 +47,5 @@ char _generic *strcpy ( return d; #endif } + +#endif diff --git a/device/lib/z80/Makefile b/device/lib/z80/Makefile index 7cae7cf0..de056322 100644 --- a/device/lib/z80/Makefile +++ b/device/lib/z80/Makefile @@ -5,10 +5,14 @@ TOPDIR = ../../.. SCC = $(TOPDIR)/bin/sdcc -mz80 SAS = $(TOPDIR)/bin/as-z80 -OBJ = div.o mul.o putchar.o printf.o shift.o stubs.o # asm_strings.o string.s +OBJ = div.o mul.o putchar.o printf.o shift.o stubs.o \ + asm_strings.o + LIB = z80.lib CC = $(SCC) AS = $(SAS) +ASFLAGS = -plosgff + CFLAGS = -I../../include -I. all: $(LIB) crt0.o @@ -20,6 +24,9 @@ $(LIB): $(OBJ) Makefile _dummy .c.o: $(CC) $(CFLAGS) -c $< +.s.o: + $(AS) $(ASFLAGS) $@ $< + _dummy: clean: diff --git a/device/lib/z80/README b/device/lib/z80/README index a34ea453..259ca6dc 100644 --- a/device/lib/z80/README +++ b/device/lib/z80/README @@ -2,3 +2,8 @@ sdcc/device/lib/z80 ------------------- Z80 specific routines. 
+ +Notes: +* Cost of ld r,(ix+n): 19 +* Cost of ld r,(hl); inc hl: 7+6 = 13 and you don't have to pop ix + \ No newline at end of file diff --git a/device/lib/z80/asm_strings.s b/device/lib/z80/asm_strings.s index fd263927..a6527fb9 100644 --- a/device/lib/z80/asm_strings.s +++ b/device/lib/z80/asm_strings.s @@ -3,100 +3,118 @@ ;; Why - because I want a better dhrystone score :) + ;; strcpy is disabled as the C version is almost as good. + ;; Just the setup and return is slower. NOTE(review): as written, the copy loop below never increments bc or de, and HL (the return value) is loaded from bc, the pointer being read, where dest (de) is presumably intended - fix both before re-enabling. + .if 0 ; char *strcpy(char *dest, const char *source) _strcpy:: - push de - push ix - ld ix,#0 - add ix,sp - ld l,6(ix) - ld h,7(ix) - ld e,8(ix) - ld d,9(ix) - - push hl -1$: - ld a,(de) - ld (hl),a - inc hl - inc de - or a,a - jr nz,1$ - - pop hl - pop ix - pop de - ret + ;; Fall through to the correct type +__strcpy_rrf_s:: + ld a,#5 + rst 0x08 +__strcpy_rrx_s:: + ld hl,#2 + add hl,sp + ld e,(hl) + inc hl + ld d,(hl) + inc hl + ld c,(hl) + inc hl + ld b,(hl) + ;; Setup the return value + ld l,c + ld h,b +1$: + ld a,(bc) + ld (de),a + or a + jp nz,1$ + ret + ;; Notes on strcpy styles: + ;; *de = *hl; hl++; de++; or a; ret z; jp - slower as jp is + ;; same cost as conditional jump, so condition on ret is more expensive. + ;; *de = *bc; bc++; de++; or a, jp nz - OK + ;; Can't use LDI as need to check for end of string. + ;; Above also matches the z88dk version. + .endif + ; void *memcpy(void *dest, const void *source, int count) -_memcpy:: - push de - push bc - push ix - ld ix,#0 - add ix,sp - ld l,8(ix) - ld h,9(ix) - ld e,10(ix) - ld d,11(ix) - ld c,12(ix) - ld b,13(ix) +_memcpy:: + ;; Fall through to correct type +__memcpy_rrf_s:: + ld a,#5 + rst 0x08 +__memcpy_rrx_s:: + ;; Using LDIR + ;; LDIR: do; *DE = *HL; DE++; HL++; BC--; while BC != 0 + + ;; All registers are already saved. 
+ ld hl,#2 + add hl,sp + ld e,(hl) + inc hl + ld d,(hl) + inc hl + ld a,(hl) + inc hl + ld b,(hl) + inc hl + ld c,(hl) + inc hl + ld h,(hl) + ld l,a + ld a,h + ld h,b + ld b,a - inc b - inc c - push hl + ;; Pending: could optimise this check to occur earlier. + or c + ret z - jr 2$ -1$: - ld a,(de) - ld (hl),a - inc de - inc hl -2$: - dec c - jr nz,1$ - dec b - jr nz,1$ - - pop hl - pop ix - pop bc - pop de - ret + ldir + ret ; int strcmp(const char *s1, const char *s2) _strcmp:: - push de - push ix - ld ix,#0 - add ix,sp - ld e,6(ix) - ld d,7(ix) - ld l,8(ix) - ld h,9(ix) + ;; Fall through to the correct style + ;; Fall through to correct type +__strcmp_rrf_s:: + ld a,#5 + rst 0x08 +__strcmp_rrx_s:: + ld hl,#2 + add hl,sp + + ld e,(hl) + inc hl + ld d,(hl) + inc hl + ld a,(hl) + inc hl + ld h,(hl) + ld l,a + +1$: + ld a,(de) + sub (hl) + + ;; Normally not taken, so use a jr (12/7) instead of jp (10) + jr nz,2$ - jr 1$ -2$: - ld a,(de) - sub (hl) - jr nz,4$ - ;; A == 0 - cp (hl) - jr z,3$ -1$: - inc de - inc hl - jr 2$ + ;; A == 0 + cp (hl) -3$: - ld hl,#0 - jr 5$ -4$: - ld hl,#1 - jr nc,5$ - ld hl,#-1 -5$: - pop ix - pop de - ret - \ No newline at end of file + inc de + inc hl + ;; Normally taken. Flag from the cp above. 
+ jp nz,1$ +2$: + ;; Sign extend + ld l,a + rla + sbc a + ld h,a + ret + diff --git a/device/lib/z80/mul.s b/device/lib/z80/mul.s index 4f3e793a..7e7cceff 100644 --- a/device/lib/z80/mul.s +++ b/device/lib/z80/mul.s @@ -79,25 +79,26 @@ __muluint_rrx_hds:: ;; Register used: AF,BC,DE,HL .mul16: .mulu16: - LD HL,#0x00 ; Product = 0 - LD A,#15 ; Count = bit length - 1 - ;; Shift-and-add algorithm - ;; If MSB of multiplier is 1, add multiplicand to partial product - ;; Shift partial product, multiplier left 1 bit -.mlp: - SLA E ; Shift multiplier left 1 bit - RL D - jp NC,.mlp1 ; Jump if MSB of multiplier = 0 - ADD HL,BC ; Add multiplicand to partial product -.mlp1: - ADD HL,HL ; Shift partial product left - DEC A - jp NZ,.mlp ; Continue until count = 0 - ;; Add multiplicand one last time if MSB of multiplier is 1 - BIT 7,D ; Get MSB of multiplier - JR Z,.mend ; Exit if MSB of multiplier is 0 - ADD HL,BC ; Add multiplicand to product -.mend: - ; HL = result - ret + ld hl,#0 + ld a,b + ; ld c,c + ld b,#16 + ;; Optimise for the case when this side has 8 bits of data or + ;; less. This is often the case with support address calls. + or a + jp nz,1$ + + ld b,#8 + ld a,c +1$: + ;; Taken from z88dk, which originally borrowed from the + ;; Spectrum rom. + add hl,hl + rl c + rla ;DLE 27/11/98 + jr nc,2$ + add hl,de +2$: + djnz 1$ + ret diff --git a/src/z80/gen.c b/src/z80/gen.c index 0482186c..3ed83308 100644 --- a/src/z80/gen.c +++ b/src/z80/gen.c @@ -1,6 +1,33 @@ /*------------------------------------------------------------------------- gen.c - Z80 specific code generator. + + Michael Hope 2000 + Based on the mcs51 generator - + Sandeep Dutta . sandeep.dutta@usa.net (1998) + and - Jean-Louis VERN.jlvern@writeme.com (1999) + + This program is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by the + Free Software Foundation; either version 2, or (at your option) any + later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + + In other words, you are welcome to use, share and improve this program. + You are forbidden to forbid anyone else to use, share and improve + what you give them. Help stamp out software-hoarding! + +-------------------------------------------------------------------------*/ + +/* Benchmarks on dhry.c 2.1 with 32766 loops and a 10ms clock: ticks dhry size Base with asm strcpy / strcmp / memcpy: 23198 141 1A14 @@ -29,32 +56,29 @@ 5. Optimised strcmp further 21660 151 228C 6. Optimised memcpy by unroling 20885 157 2201 7. After turning loop induction on 19862 165 236D + 8. Same as 7 but with more info + 9. With asm optimised strings 17030 192 2223 - Michael Hope 2000 - Based on the mcs51 generator - - Sandeep Dutta . sandeep.dutta@usa.net (1998) - and - Jean-Louis VERN.jlvern@writeme.com (1999) - - This program is free software; you can redistribute it and/or modify it - under the terms of the GNU General Public License as published by the - Free Software Foundation; either version 2, or (at your option) any - later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
- - In other words, you are welcome to use, share and improve this program. - You are forbidden to forbid anyone else to use, share and improve - what you give them. Help stamp out software-hoarding! - --------------------------------------------------------------------------*/ + 10 and below are with asm strings off. + + Apparent advantage of turning on regparams: + 1. Cost of push + Decent case is push of a constant + - ld hl,#n; push hl: (10+11)*nargs + 2. Cost of pull from stack + Using asm with ld hl, etc + - ld hl,#2; add hl,sp; (ld bc,(hl); hl+=2)*nargs + 10+11+(7+6+7+6)*nargs + 3. Cost of fixing stack + - pop hl*nargs + 10*nargs + + So cost is (10+11+7+6+7+10)*nargs+10+11 + = 51*nargs+21 + = 123 for mul, div, strcmp, strcpy + Saving of (98298+32766+32766+32766)*123 = 24181308 + At 192 d/s for 682411768t, speed up to 199. Hmm. +*/ #include #include diff --git a/src/z80/profile.txt b/src/z80/profile.txt index 3f076e3b..7c05ec00 100644 --- a/src/z80/profile.txt +++ b/src/z80/profile.txt @@ -112,3 +112,52 @@ __divulong 72840 0.01 _memcpy 80800956 10.16 _strcmp 97216722 12.22 ; 795663339 t-states + +-- 8 +; Function total-ticks total-calls ticks-per-call total-percent +0000 71 0 0 0.00 +_main 122823011 1 122823011 15.45 +_Proc_1 69267324 32766 2114 8.71 +_Proc_2 16514064 32766 504 2.08 +_Proc_3 13368528 32766 408 1.68 +_Proc_4 11009376 32766 336 1.38 +_Proc_5 4914900 32766 150 0.62 +_Proc_6 18840450 32766 575 2.37 +_Func_1 13466826 98298 137 1.69 +_Func_3 5308092 32766 162 0.67 +_Proc_7 22313646 98298 227 2.81 +_Proc_8 83553300 32766 2550 10.51 +_Func_2 32438340 32766 990 4.08 +_strcpy 46497792 32768 1419 5.85 +__mulsint_rrf_s 88861392 98298 904 11.18 +__divsint_rrf_s 67760088 32766 2068 8.52 +__rlulong_rrf_s 13056 128 102 0.00 +__divulong 52549 2 26274 0.01 +_memcpy 80800956 32766 2466 10.16 +_strcmp 97216722 32766 2967 12.23 +; 795020510 t-states +; So the lib functions except mul, div etc take 28% of the time. 
+ +-- 9 +; Function total-ticks total-calls ticks-per-call total-percent +0000 71 0 0 0.00 +_main 122864113 1 122864113 18.00 +_Proc_1 96102678 32766 2933 14.08 +_Proc_2 16514064 32766 504 2.42 +_Proc_3 13368528 32766 408 1.96 +_Proc_4 11009376 32766 336 1.61 +_Proc_5 4914900 32766 150 0.72 +_Proc_6 18840450 32766 575 2.76 +_Func_1 13466826 98298 137 1.97 +_Func_3 5308092 32766 162 0.78 +_Proc_7 22313646 98298 227 3.27 +_Proc_8 83553300 32766 2550 12.24 +_Func_2 32438340 32766 990 4.75 +_strcpy 46497792 32768 1419 6.81 +__mulsint_rrf_s 88861392 98298 904 13.02 +__divsint_rrf_s 67760088 32766 2068 9.93 +__rlulong_rrf_s 13056 128 102 0.00 +__divulong 52213 2 26106 0.01 +__memcpy_rrf_s 3702558 32766 113 0.54 +__strcmp_rrf_s 34830258 32766 1063 5.10 +; 682411768 t-states -- 2.30.2