X-Git-Url: https://git.gag.com/?a=blobdiff_plain;f=lib%2Fmatch.c;h=ac32ed0f6e34fa5eb21bc020a94c0f370cd47f24;hb=f277ee25feefa0c56bcc71d96d99a78a4c17bffb;hp=66cb1ca124b1447ae8014060bdfa92e620673f5a;hpb=0d5a754ba8f2cc33e98d48d559f0ad6cd18a43a3;p=debian%2Fgzip diff --git a/lib/match.c b/lib/match.c index 66cb1ca..ac32ed0 100644 --- a/lib/match.c +++ b/lib/match.c @@ -1,11 +1,11 @@ /* match.s -- optional optimized asm version of longest match in deflate.c - Copyright (C) 2002, 2006 Free Software Foundation, Inc. + Copyright (C) 2002, 2006, 2009-2012 Free Software Foundation, Inc. Copyright (C) 1992-1993 Jean-loup Gailly This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2, or (at your option) + the Free Software Foundation; either version 3, or (at your option) any later version. This program is distributed in the hope that it will be useful, @@ -28,8 +28,6 @@ * David Mosberger-Tang . */ -/* $Id: match.c,v 1.1 2006/11/20 08:40:34 eggert Exp $ */ - /* Preprocess with -DNO_UNDERLINE if your C compiler does not prefix * external symbols with an underline character '_'. */ @@ -61,7 +59,7 @@ * This version assumes static allocation of the arrays (-DDYN_ALLOC not used). */ - .file "match.S" + .file "match.S" #define MAX_MATCH 258 #define MAX_MATCH2 $128 /* MAX_MATCH/2-1 */ @@ -69,13 +67,13 @@ #define WSIZE $32768 #define MAX_DIST WSIZE - MAX_MATCH - MIN_MATCH - 1 - .globl _match_init - .globl _longest_match + .globl _match_init + .globl _longest_match - .text + .text _match_init: - ret + ret /*----------------------------------------------------------------------- * Set match_start to the longest match starting at the given string and @@ -92,8 +90,8 @@ _longest_match: /* int longest_match(cur_match) */ /* return address */ /* esp+16 */ push %ebp /* esp+12 */ push %edi /* esp+8 */ - push %esi /* esp+4 */ - push %ebx /* esp */ + push %esi /* esp+4 */ + push %ebx /* esp */ /* * match equ esi @@ -102,21 +100,21 @@ _longest_match: /* int longest_match(cur_match) */ * best_len equ ebx * limit equ edx */ - mov cur_match,%esi + mov cur_match,%esi mov _max_chain_length,%ebp /* chain_length = max_chain_length */ - mov _strstart,%edi - mov %edi,%edx - sub MAX_DIST,%edx /* limit = strstart-MAX_DIST */ - jae limit_ok - sub %edx,%edx /* limit = NIL */ + mov _strstart,%edi + mov %edi,%edx + sub MAX_DIST,%edx /* limit = strstart-MAX_DIST */ + jae limit_ok + sub %edx,%edx /* limit = NIL */ limit_ok: add $2+_window,%edi /* edi = offset(window+strstart+2) */ mov _prev_length,%ebx /* best_len = prev_length */ movw -3(%ebx,%edi),%ax /* ax = scan[best_len-1..best_len] */ movw -2(%edi),%cx /* cx = scan[0..1] */ - cmp _good_match,%ebx /* do we have a good match already? */ + cmp _good_match,%ebx /* do we have a good match already? */ jb do_scan - shr $2,%ebp /* chain_length >>= 2 */ + shr $2,%ebp /* chain_length >>= 2 */ jmp do_scan .align 4 @@ -152,8 +150,8 @@ mismatch: subb -2(%esi),%cl /* cl = 0 if first bytes equal */ xchg %edi,%eax /* edi = scan+2, eax = end of scan */ sub %edi,%eax /* eax = len */ - sub %eax,%esi /* esi = cur_match + 2 + offset(window) */ - sub $2+_window,%esi /* esi = cur_match */ + sub %eax,%esi /* esi = cur_match + 2 + offset(window) */ + sub $2+_window,%esi /* esi = cur_match */ subb $1,%cl /* set carry if cl == 0 (cannot use DEC) */ adc $0,%eax /* eax = carry ? len+1 : len */ cmp %ebx,%eax /* len > best_len ? */ @@ -164,7 +162,7 @@ mismatch: jl long_loop the_end: mov %ebx,%eax /* result = eax = best_len */ - pop %ebx + pop %ebx pop %esi pop %edi pop %ebp @@ -276,15 +274,15 @@ maxmatch: #define WSIZE 32768 #define MAX_DIST (WSIZE - MAX_MATCH - MIN_MATCH - 1) - GLOBAL (_match_init) - GLOBAL (_longest_match) + GLOBAL (_match_init) + GLOBAL (_longest_match) - TEXT + TEXT - FILE ("match.S") + FILE ("match.S") _match_init: - rts + rts /*----------------------------------------------------------------------- * Set match_start to the longest match starting at the given string and @@ -306,91 +304,91 @@ _match_init: #endif _longest_match: - movel IndBase(4,Stack_Pointer),Cur_Match - moveml imm(pushreg),predec(Stack_Pointer) - movel _max_chain_length,Chain_Length - movel _prev_length,Best_Len - movel imm(_prev),Prev_Address - movel imm(_window+MIN_MATCH),Match_Ini - movel _strstart,Limit - movel Match_Ini,Scan_Ini - addl Limit,Scan_Ini - subw imm(MAX_DIST),Limit - bhis L__limit_ok - clrl Limit + movel IndBase(4,Stack_Pointer),Cur_Match + moveml imm(pushreg),predec(Stack_Pointer) + movel _max_chain_length,Chain_Length + movel _prev_length,Best_Len + movel imm(_prev),Prev_Address + movel imm(_window+MIN_MATCH),Match_Ini + movel _strstart,Limit + movel Match_Ini,Scan_Ini + addl Limit,Scan_Ini + subw imm(MAX_DIST),Limit + bhis L__limit_ok + clrl Limit L__limit_ok: - cmpl invert_maybe(_good_match,Best_Len) - blos L__length_ok - lsrl imm(2),Chain_Length + cmpl invert_maybe(_good_match,Best_Len) + blos L__length_ok + lsrl imm(2),Chain_Length L__length_ok: - subql imm(1),Chain_Length + subql imm(1),Chain_Length #ifdef UNALIGNED_OK - movew IndBase(-MIN_MATCH,Scan_Ini),Scan_Start - movew IndBaseNdxw(-MIN_MATCH-1,Scan_Ini,Best_Len),Scan_End + movew IndBase(-MIN_MATCH,Scan_Ini),Scan_Start + movew IndBaseNdxw(-MIN_MATCH-1,Scan_Ini,Best_Len),Scan_End #else - moveb IndBase(-MIN_MATCH,Scan_Ini),Scan_Start - lslw imm(8),Scan_Start - moveb IndBase(-MIN_MATCH+1,Scan_Ini),Scan_Start - moveb IndBaseNdxw(-MIN_MATCH-1,Scan_Ini,Best_Len),Scan_End - lslw imm(8),Scan_End - moveb IndBaseNdxw(-MIN_MATCH,Scan_Ini,Best_Len),Scan_End + moveb IndBase(-MIN_MATCH,Scan_Ini),Scan_Start + lslw imm(8),Scan_Start + moveb IndBase(-MIN_MATCH+1,Scan_Ini),Scan_Start + moveb IndBaseNdxw(-MIN_MATCH-1,Scan_Ini,Best_Len),Scan_End + lslw imm(8),Scan_End + moveb IndBaseNdxw(-MIN_MATCH,Scan_Ini,Best_Len),Scan_End #endif - bras L__do_scan + bras L__do_scan L__long_loop: #ifdef UNALIGNED_OK - movew IndBaseNdxw(-MIN_MATCH-1,Scan_Ini,Best_Len),Scan_End + movew IndBaseNdxw(-MIN_MATCH-1,Scan_Ini,Best_Len),Scan_End #else - moveb IndBaseNdxw(-MIN_MATCH-1,Scan_Ini,Best_Len),Scan_End - lslw imm(8),Scan_End - moveb IndBaseNdxw(-MIN_MATCH,Scan_Ini,Best_Len),Scan_End + moveb IndBaseNdxw(-MIN_MATCH-1,Scan_Ini,Best_Len),Scan_End + lslw imm(8),Scan_End + moveb IndBaseNdxw(-MIN_MATCH,Scan_Ini,Best_Len),Scan_End #endif L__short_loop: - lslw imm(1),Cur_Match - movew IndBaseNdxl(0,Prev_Address,Cur_Match),Cur_Match - cmpw invert_maybe(Limit,Cur_Match) - dbls Chain_Length,L__do_scan - bras L__return + lslw imm(1),Cur_Match + movew IndBaseNdxl(0,Prev_Address,Cur_Match),Cur_Match + cmpw invert_maybe(Limit,Cur_Match) + dbls Chain_Length,L__do_scan + bras L__return L__do_scan: - movel Match_Ini,Match - addl Cur_Match,Match + movel Match_Ini,Match + addl Cur_Match,Match #ifdef UNALIGNED_OK - cmpw invert_maybe(IndBaseNdxw(-MIN_MATCH-1,Match,Best_Len),Scan_End) - bne L__short_loop - cmpw invert_maybe(IndBase(-MIN_MATCH,Match),Scan_Start) - bne L__short_loop + cmpw invert_maybe(IndBaseNdxw(-MIN_MATCH-1,Match,Best_Len),Scan_End) + bne L__short_loop + cmpw invert_maybe(IndBase(-MIN_MATCH,Match),Scan_Start) + bne L__short_loop #else - moveb IndBaseNdxw(-MIN_MATCH-1,Match,Best_Len),Scan_Test - lslw imm(8),Scan_Test - moveb IndBaseNdxw(-MIN_MATCH,Match,Best_Len),Scan_Test - cmpw invert_maybe(Scan_Test,Scan_End) - bne L__short_loop - moveb IndBase(-MIN_MATCH,Match),Scan_Test - lslw imm(8),Scan_Test - moveb IndBase(-MIN_MATCH+1,Match),Scan_Test - cmpw invert_maybe(Scan_Test,Scan_Start) - bne L__short_loop + moveb IndBaseNdxw(-MIN_MATCH-1,Match,Best_Len),Scan_Test + lslw imm(8),Scan_Test + moveb IndBaseNdxw(-MIN_MATCH,Match,Best_Len),Scan_Test + cmpw invert_maybe(Scan_Test,Scan_End) + bne L__short_loop + moveb IndBase(-MIN_MATCH,Match),Scan_Test + lslw imm(8),Scan_Test + moveb IndBase(-MIN_MATCH+1,Match),Scan_Test + cmpw invert_maybe(Scan_Test,Scan_Start) + bne L__short_loop #endif - movew imm((MAX_MATCH-MIN_MATCH+1)-1),Loop_Counter - movel Scan_Ini,Scan + movew imm((MAX_MATCH-MIN_MATCH+1)-1),Loop_Counter + movel Scan_Ini,Scan L__scan_loop: - cmpmb postinc(Match),postinc(Scan) - dbne Loop_Counter,L__scan_loop - - subl Scan_Ini,Scan - addql imm(MIN_MATCH-1),Scan - cmpl invert_maybe(Best_Len,Scan) - bls L__short_loop - movel Scan,Best_Len - movel Cur_Match,_match_start - cmpl invert_maybe(_nice_match,Best_Len) - blos L__long_loop + cmpmb postinc(Match),postinc(Scan) + dbne Loop_Counter,L__scan_loop + + subl Scan_Ini,Scan + addql imm(MIN_MATCH-1),Scan + cmpl invert_maybe(Best_Len,Scan) + bls L__short_loop + movel Scan,Best_Len + movel Cur_Match,_match_start + cmpl invert_maybe(_nice_match,Best_Len) + blos L__long_loop L__return: - moveml postinc(Stack_Pointer),imm(popreg) - rts + moveml postinc(Stack_Pointer),imm(popreg) + rts #else @@ -495,188 +493,188 @@ L__return: #define cgeu cmp.ge.unc #define cneu cmp.ne.unc - .global longest_match - .proc longest_match - .align 32 + .global longest_match + .proc longest_match + .align 32 longest_match: // -- Cycle: 0 - .prologue + .prologue {.mmi - alloc r2=ar.pfs,R_INPUT,R_LOCAL,R_OUTPUT,R_ROTATING - .rotr scan[MLAT+2], match[MLAT+2], shscan0[SHLAT+1], \ - shscan1[SHLAT+1], shmatch0[SHLAT+1], shmatch1[SHLAT+1] - .rotp lc[MLAT+SHLAT+2] - mova s_vspec_cmatch=in0 // cur_match from input register - add s_tm1=@gprel(strstart),gp // a(a(strstart)) + alloc r2=ar.pfs,R_INPUT,R_LOCAL,R_OUTPUT,R_ROTATING + .rotr scan[MLAT+2], match[MLAT+2], shscan0[SHLAT+1], \ + shscan1[SHLAT+1], shmatch0[SHLAT+1], shmatch1[SHLAT+1] + .rotp lc[MLAT+SHLAT+2] + mova s_vspec_cmatch=in0 // cur_match from input register + add s_tm1=@gprel(strstart),gp // a(a(strstart)) }{.mmi - add s_tm3=@gprel(prev_length),gp // a(a(prev_length)) - add s_tm5=@ltoff(window),gp // a(a(window)) - add s_tm6=@ltoff(prev),gp // a(a(prev)) - ;; + add s_tm3=@gprel(prev_length),gp // a(a(prev_length)) + add s_tm5=@ltoff(window),gp // a(a(window)) + add s_tm6=@ltoff(prev),gp // a(a(prev)) + ;; }{.mmb // Cycle: 1 - ld4 s_vstrstart=[s_tm1] // strstart - ld4 s_vbestlen=[s_tm3] // best_len = prev_length - brp.loop.imp .cmploop,.cmploop+48 + ld4 s_vstrstart=[s_tm1] // strstart + ld4 s_vbestlen=[s_tm3] // best_len = prev_length + brp.loop.imp .cmploop,.cmploop+48 }{.mli - add s_tm2=@gprel(max_chain_length),gp // a(a(max_chain_length)) - movl s_wmask=WSIZE-1 - ;; + add s_tm2=@gprel(max_chain_length),gp // a(a(max_chain_length)) + movl s_wmask=WSIZE-1 + ;; }{.mmi // Cycle: 2 - ld8 s_aprev=[s_tm6] // a(prev) - ld8 s_awindow=[s_tm5] // a(window) - .save pr, s_prsave - movi0 s_prsave=pr // save predicates + ld8 s_aprev=[s_tm6] // a(prev) + ld8 s_awindow=[s_tm5] // a(window) + .save pr, s_prsave + movi0 s_prsave=pr // save predicates }{.mmi - add s_tm4=@gprel(good_match),gp // a(a(good_match)) - add s_tm7=@ltoff(nice_match),gp // a(a(nice_match)) - add s_tm8=@ltoff(match_start),gp // a(match_start) - ;; + add s_tm4=@gprel(good_match),gp // a(a(good_match)) + add s_tm7=@ltoff(nice_match),gp // a(a(nice_match)) + add s_tm8=@ltoff(match_start),gp // a(match_start) + ;; }{.mmi // Cycle: 3 - ld8 s_anicematch=[s_tm7] // a(nice_match) - ld8 s_amatchstart=[s_tm8] // a(match_start) - .save ar.lc, s_lcsave - movi0 s_lcsave=ar.lc // save loop count register + ld8 s_anicematch=[s_tm7] // a(nice_match) + ld8 s_amatchstart=[s_tm8] // a(match_start) + .save ar.lc, s_lcsave + movi0 s_lcsave=ar.lc // save loop count register }{.mmi - .body - add s_tm1=-(MAX_MATCH + MIN_MATCH),s_wmask // maxdist - cmp.eq p_ll,p0=r0,r0 // parallel compare initialized as 'true' - mova s_vcurmatch=s_vspec_cmatch - ;; + .body + add s_tm1=-(MAX_MATCH + MIN_MATCH),s_wmask // maxdist + cmp.eq p_ll,p0=r0,r0 // parallel compare initialized as 'true' + mova s_vcurmatch=s_vspec_cmatch + ;; }{.mmi // Cycle: 4 - ld4 s_vchainlen=[s_tm2] // chain_length=max_chain_length - ld4 s_tm4=[s_tm4] // v(good_match) - add s_ascan=s_awindow,s_vstrstart // scan=window + strstart + ld4 s_vchainlen=[s_tm2] // chain_length=max_chain_length + ld4 s_tm4=[s_tm4] // v(good_match) + add s_ascan=s_awindow,s_vstrstart // scan=window + strstart }{.mmi - sub s_vlimit=s_vstrstart, s_tm1 // limit=strstart - MAX_DIST - add s_amatch=s_awindow,s_vspec_cmatch // match=window + cur_match - and s_vspec_cmatch =s_vspec_cmatch,s_wmask - ;; + sub s_vlimit=s_vstrstart, s_tm1 // limit=strstart - MAX_DIST + add s_amatch=s_awindow,s_vspec_cmatch // match=window + cur_match + and s_vspec_cmatch =s_vspec_cmatch,s_wmask + ;; }{.mmi // Cycle: 5 - add s_amatblen=s_amatch,s_vbestlen // - cneu p_bn2,p0=2,s_vbestlen // set if bestlen != 2 - add s_ascanend=s_ascan,s_vbestlen // compute a(scan) + best_len + add s_amatblen=s_amatch,s_vbestlen // + cneu p_bn2,p0=2,s_vbestlen // set if bestlen != 2 + add s_ascanend=s_ascan,s_vbestlen // compute a(scan) + best_len }{.mmi - ld1 s_vscan0=[s_ascan],1 // NB: s_ascan++ - ld1 s_vmatch0=[s_amatch],1 - cgtu p0,p_no=s_vlimit,r0 // is result positive ? - ;; + ld1 s_vscan0=[s_ascan],1 // NB: s_ascan++ + ld1 s_vmatch0=[s_amatch],1 + cgtu p0,p_no=s_vlimit,r0 // is result positive ? + ;; }{.mmi // Cycle: 6 - ld1.nt1 s_vscan1=[s_ascan],2 // NB: s_ascan+3 in total - ld1.nt1 s_vmatch1=[s_amatch],2 - add s_awinbest=s_awindow,s_vbestlen // - ;; + ld1.nt1 s_vscan1=[s_ascan],2 // NB: s_ascan+3 in total + ld1.nt1 s_vmatch1=[s_amatch],2 + add s_awinbest=s_awindow,s_vbestlen // + ;; }{.mmi // Cycle: 7 - ld1.nt1 s_vscanend=[s_ascanend],-1 // scan_end=scan[best_len] - ld1.nt1 s_vmatbst=[s_amatblen],-1 + ld1.nt1 s_vscanend=[s_ascanend],-1 // scan_end=scan[best_len] + ld1.nt1 s_vmatbst=[s_amatblen],-1 (p_no) mova s_vlimit=r0 - ;; + ;; }{.mmi // Cycle: 8 (p_bn2) ld1.nt1 s_vscanend1=[s_ascanend],1 // scan_end1=scan[best_len-1] (p_bn2) ld1.nt1 s_vmatbst1=[s_amatblen] - shladd s_vspec_cmatch =s_vspec_cmatch,1,s_aprev + shladd s_vspec_cmatch =s_vspec_cmatch,1,s_aprev }{.mmi - cgeu p_shf,p0=s_vbestlen,s_tm4 // is (prev_length >= good_match) ? - ;; + cgeu p_shf,p0=s_vbestlen,s_tm4 // is (prev_length >= good_match) ? + ;; }{.mmi // Cycle: 9 - ld1.nt1 s_vscan3=[s_ascan] - ld2.nt1 s_vspec_cmatch=[s_vspec_cmatch] - mova s_vlen=3 + ld1.nt1 s_vscan3=[s_ascan] + ld2.nt1 s_vspec_cmatch=[s_vspec_cmatch] + mova s_vlen=3 }{.mmi (p_shf) shr.u s_vchainlen=s_vchainlen,2 // (cur_len) >> 2 - ;; + ;; }{.mmi // Cycle: 10 - ld1.nt1 s_vmatch3=[s_amatch] - // p_ll switched on as soon as we get a mismatch: - cmp.eq.and p_ll,p0=s_vmatch0,s_vscan0 - cmp.eq.and p_ll,p0=s_vmatbst,s_vscanend + ld1.nt1 s_vmatch3=[s_amatch] + // p_ll switched on as soon as we get a mismatch: + cmp.eq.and p_ll,p0=s_vmatch0,s_vscan0 + cmp.eq.and p_ll,p0=s_vmatbst,s_vscanend }{.mib - cmp.eq.and p_ll,p0=s_vmatch1,s_vscan1 + cmp.eq.and p_ll,p0=s_vmatch1,s_vscan1 (p_bn2) cmp.eq.and p_ll,p0=s_vmatbst1,s_vscanend1 (p_ll) br.cond.dpnt.many .test_more - ;; + ;; } .next_iter: {.mmi // Cycle 0 - add s_amatch=s_awindow,s_vspec_cmatch // match=window + cur_match - mov s_vcurmatch=s_vspec_cmatch // current value - add s_vchainlen=-1,s_vchainlen // --chain_length + add s_amatch=s_awindow,s_vspec_cmatch // match=window + cur_match + mov s_vcurmatch=s_vspec_cmatch // current value + add s_vchainlen=-1,s_vchainlen // --chain_length }{.mib - cmp.le.unc p_end,p0=s_vspec_cmatch,s_vlimit - and s_vspec_cmatch=s_vspec_cmatch,s_wmask + cmp.le.unc p_end,p0=s_vspec_cmatch,s_vlimit + and s_vspec_cmatch=s_vspec_cmatch,s_wmask (p_end) br.cond.dptk.many .terminate - ;; + ;; }{.mmi // Cycle 1 - ld1 s_vmatch0=[s_amatch],1 // load match[0] - // compute prev[cur_match]: - shladd s_vspec_cmatch=s_vspec_cmatch,1,s_aprev - cmp.eq.unc p_end,p0=s_vchainlen,r0 + ld1 s_vmatch0=[s_amatch],1 // load match[0] + // compute prev[cur_match]: + shladd s_vspec_cmatch=s_vspec_cmatch,1,s_aprev + cmp.eq.unc p_end,p0=s_vchainlen,r0 } {.mib - nop.m 0 - add s_amatblen=s_awinbest,s_vcurmatch // match=window + cur_match + nop.m 0 + add s_amatblen=s_awinbest,s_vcurmatch // match=window + cur_match (p_end) br.cond.dptk.many .terminate - ;; + ;; }{.mmi // Cycle 2 (short) - ld2.nt1 s_vspec_cmatch=[s_vspec_cmatch] // get next cur_match - ;; + ld2.nt1 s_vspec_cmatch=[s_vspec_cmatch] // get next cur_match + ;; }{.mmi // Cycle 3 (short) - ld1.nt1 s_vmatbst=[s_amatblen],-1 // load match[best_len] - cmp.ne.unc p_ll,p0=r0,r0 // parallel compare initialized as 'false' - ;; + ld1.nt1 s_vmatbst=[s_amatblen],-1 // load match[best_len] + cmp.ne.unc p_ll,p0=r0,r0 // parallel compare initialized as 'false' + ;; }{.mmi // Cycle 4 (short) - // load match[1] - - note: match += 3 (in total): - ld1.nt1 s_vmatch1=[s_amatch],2 - ;; - // Cycle 5 (short) + // load match[1] - - note: match += 3 (in total): + ld1.nt1 s_vmatch1=[s_amatch],2 + ;; + // Cycle 5 (short) (p_bn2) ld1.nt1 s_vmatbst1=[s_amatblen] // load match[best_len-1] }{.mib // Here we (MOST LIKELY) pay a L2-fetch stall - // p_ll switched on as soon as we get a mismatch: - cmp.ne.or p_ll,p0=s_vmatch0,s_vscan0 - cmp.ne.or p_ll,p0=s_vmatbst,s_vscanend + // p_ll switched on as soon as we get a mismatch: + cmp.ne.or p_ll,p0=s_vmatch0,s_vscan0 + cmp.ne.or p_ll,p0=s_vmatbst,s_vscanend (p_ll) br.cond.dptk.many .next_iter - ;; + ;; }{.mmi // Cycle 6 - ld1.nt1 s_vmatch3=[s_amatch] - mova s_vlen=3 - nop.i 0 + ld1.nt1 s_vmatch3=[s_amatch] + mova s_vlen=3 + nop.i 0 }{.mib - cmp.ne.or p_ll,p0=s_vmatch1,s_vscan1 + cmp.ne.or p_ll,p0=s_vmatch1,s_vscan1 (p_bn2) cmp.ne.or p_ll,p0=s_vmatbst1,s_vscanend1 (p_ll) br.cond.dptk.many .next_iter - ;; + ;; } // We have passed the first hurdle - Are there additional matches ??? .test_more: {.mmi // Cycle 0 - and s_tm3=7,s_ascan // get byte offset - and s_tm4=7,s_amatch // get byte offset - movi0 ar.ec=MLAT+SHLAT+2 // NB: One trip more than usual + and s_tm3=7,s_ascan // get byte offset + and s_tm4=7,s_amatch // get byte offset + movi0 ar.ec=MLAT+SHLAT+2 // NB: One trip more than usual }{.mib - cmp.ne.unc p_no,p0=s_vscan3,s_vmatch3 // does not next one differ? + cmp.ne.unc p_no,p0=s_vscan3,s_vmatch3 // does not next one differ? (p_no) br.cond.dptk.many .only3 - ;; + ;; }{.mmi // Cycle 1 - and s_tm1=-8,s_ascan // get aligned address - shladd s_tm3=s_tm3,3,r0 - movi0 ar.lc=31 // 32 times around the loop (8B at a time) + and s_tm1=-8,s_ascan // get aligned address + shladd s_tm3=s_tm3,3,r0 + movi0 ar.lc=31 // 32 times around the loop (8B at a time) }{.mib - and s_tm2=-8,s_amatch // get aligned address - shladd s_tm4=s_tm4,3,r0 - nop.b 0 - ;; + and s_tm2=-8,s_amatch // get aligned address + shladd s_tm4=s_tm4,3,r0 + nop.b 0 + ;; }{.mmi // Cycle 2 - ld8.nt1 scan[1]=[s_tm1],8 // load first chunk - sub s_tm5=64,s_tm3 // 64 - amount - movi0 pr.rot=1<<16 + ld8.nt1 scan[1]=[s_tm1],8 // load first chunk + sub s_tm5=64,s_tm3 // 64 - amount + movi0 pr.rot=1<<16 }{.mmi - ld8.nt1 match[1]=[s_tm2],8 // load first chunk - sub s_tm6=64,s_tm4 // 64 - amount - add s_vlen=-8,s_vlen // will be updated at least once - ;; + ld8.nt1 match[1]=[s_tm2],8 // load first chunk + sub s_tm6=64,s_tm4 // 64 - amount + add s_vlen=-8,s_vlen // will be updated at least once + ;; } - .align 32 + .align 32 .cmploop: {.mmi // Cycle 0 (lc[0]) ld8 scan[0]=[s_tm1],8 // next scan chunk @@ -686,84 +684,84 @@ longest_match: (lc[MLAT+SHLAT+1]) cmp.ne.unc p_no,p0=s_tm7,s_tm8 // break search if != (lc[MLAT]) first shmatch0[0]=match[MLAT+1],s_tm4 (p_no) br.cond.dpnt.many .mismatch - ;; + ;; }{.mii // Cycle 1 (lc[0]) ld8 match[0]=[s_tm2],8 - // shift left(le) or right(be): + // shift left(le) or right(be): (lc[MLAT]) second shscan1[0]=scan[MLAT],s_tm5 (lc[MLAT]) second shmatch1[0]=match[MLAT],s_tm6 }{.mmb (lc[MLAT+SHLAT]) or s_tm7=shscan0[SHLAT],shscan1[SHLAT] (lc[MLAT+SHLAT]) or s_tm8=shmatch0[SHLAT],shmatch1[SHLAT] - br.ctop.dptk.many .cmploop - ;; + br.ctop.dptk.many .cmploop + ;; }{.mfi - mov s_vlen=258 - nop.f 0 + mov s_vlen=258 + nop.f 0 }{.mfi - nop.f 0 // realign - ;; + nop.f 0 // realign + ;; } .mismatch: {.mii // Cycle 0 (short) (p_no) pcmp1.eq s_tm2=s_tm7,s_tm8 // find first non-matching character - nop.i 0 - ;; - // Cycle 1 (short) + nop.i 0 + ;; + // Cycle 1 (short) (p_no) count s_tm1=s_tm2 - ;; + ;; }{.mib // Cycle 2 (short) (p_no) add s_vlen=s_vlen,s_tm1 // effective length - nop.i 0 - clrrrb - ;; + nop.i 0 + clrrrb + ;; } .only3: {.mib // Cycle 0 (short) - cmp.gt.unc p0,p_nbs=s_vlen,s_vbestlen // (len > best_len) ? + cmp.gt.unc p0,p_nbs=s_vlen,s_vbestlen // (len > best_len) ? (p_nbs) br.cond.dpnt.many .next_iter // if not, re-iternate - ;; + ;; }{.mmi // Cycle 1 (short) - ld4 s_tm7=[s_anicematch] // nice_match - st4 [s_amatchstart]= s_vcurmatch - add s_ascanend=s_ascan,s_vlen // reset with best_len - ;; + ld4 s_tm7=[s_anicematch] // nice_match + st4 [s_amatchstart]= s_vcurmatch + add s_ascanend=s_ascan,s_vlen // reset with best_len + ;; }{.mmi // Cycle 2 (short) - mova s_vbestlen=s_vlen - add s_ascanend=-3,s_ascanend // remember extra offset - ;; + mova s_vbestlen=s_vlen + add s_ascanend=-3,s_ascanend // remember extra offset + ;; }{.mmi // Cycle 3 (short) - ld1 s_vscanend=[s_ascanend],-1 // scan_end=scan[best_len] - add s_awinbest=s_awindow,s_vbestlen // update with new best_len - cmp.ne.unc p_bn2,p0=2,s_vbestlen // set if bestlen != 2 - ;; + ld1 s_vscanend=[s_ascanend],-1 // scan_end=scan[best_len] + add s_awinbest=s_awindow,s_vbestlen // update with new best_len + cmp.ne.unc p_bn2,p0=2,s_vbestlen // set if bestlen != 2 + ;; }{.mib // Cycle 4 (short) - // scan_end1=scan[best_len-1] NB: s_ascanend reset: - ld1.nt1 s_vscanend1=[s_ascanend],1 - cmp.lt.unc p_nnc,p0=s_vlen,s_tm7 // compare with nice_match + // scan_end1=scan[best_len-1] NB: s_ascanend reset: + ld1.nt1 s_vscanend1=[s_ascanend],1 + cmp.lt.unc p_nnc,p0=s_vlen,s_tm7 // compare with nice_match (p_nnc) br.cond.dptk.many .next_iter - ;; + ;; } .terminate: {.mii // Cycle 0/1 - nop.m 0 - movi0 ar.lc=s_lcsave - movi0 pr=s_prsave,-1 + nop.m 0 + movi0 ar.lc=s_lcsave + movi0 pr=s_prsave,-1 }{.mbb - nop.m 0 - nop.b 0 - br.ret.sptk.many rp // ret0 is identical to best_len - ;; + nop.m 0 + nop.b 0 + br.ret.sptk.many rp // ret0 is identical to best_len + ;; } - .endp + .endp - .global match_init - .proc match_init + .global match_init + .proc match_init match_init: - sub ret0=ret0,ret0 - br.ret.sptk.many rp - .endp + sub ret0=ret0,ret0 + br.ret.sptk.many rp + .endp # else error: this asm version is for 386 or 680x0 or ia64 only