git.gag.com Git - debian/gnuradio/blob - gnuradio-core/src/lib/filter/short_dotprod_mmx64.S

   1 #
   2 # Copyright 2002,2005 Free Software Foundation, Inc.
   3 #
   4 # This file is part of GNU Radio
   5 #
   6 # GNU Radio is free software; you can redistribute it and/or modify
   7 # it under the terms of the GNU General Public License as published by
   8 # the Free Software Foundation; either version 3, or (at your option)
   9 # any later version.
  10 #
  11 # GNU Radio is distributed in the hope that it will be useful,
  12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 # GNU General Public License for more details.
  15 #
  16 # You should have received a copy of the GNU General Public License
  17 # along with GNU Radio; see the file COPYING.  If not, write to
  18 # the Free Software Foundation, Inc., 51 Franklin Street,
  19 # Boston, MA 02110-1301, USA.
  20 #
  21
  22 # SIMD MMX dot product
  23 # Equivalent to the following C code:
  24 # long dotprod(signed short *a,signed short *b,int cnt)
  25 # {
  26 #       long sum = 0;
  27 #       cnt *= 4;
  28 #       while(cnt--)
  29 #               sum += *a++ * *b++;
  30 #       return sum;
  31 # }
  32 # a and b should also be 64-bit aligned, or speed will suffer greatly
  33 # Copyright 1999, Phil Karn KA9Q
  34 # May be used under the terms of the GNU public license
  35
  36 #include "assembly.h"
  37
  38
  39         .file   "short_dotprod_mmx64.S"
  40         .version        "01.01"
  41 .text
  42         .p2align 3
  43 .globl GLOB_SYMB(short_dotprod_mmx)
  44         DEF_FUNC_HEAD(short_dotprod_mmx)
  45 GLOB_SYMB(short_dotprod_mmx):
  46
  47         # a: rdi, b: rsi, cnt: rdx
  48
  49         pxor %mm0,%mm0          # clear running sum (in two 32-bit halves)
  50
  51 # MMX dot product loop unrolled 4 times, crunching 16 terms per loop
  52         .p2align 4
  53 .Loop1mmx:      sub $4,%rdx
  54         jl   .Loop1Done
  55
  56         movq (%rdi),%mm1        # mm1 = a[3],a[2],a[1],a[0]
  57         pmaddwd (%rsi),%mm1     # mm1 = b[3]*a[3]+b[2]*a[2],b[1]*a[1]+b[0]*a[0]
  58         paddd %mm1,%mm0
  59
  60         movq 8(%rdi),%mm1
  61         pmaddwd 8(%rsi),%mm1
  62         paddd %mm1,%mm0
  63
  64         movq 16(%rdi),%mm1
  65         pmaddwd 16(%rsi),%mm1
  66         paddd %mm1,%mm0
  67
  68         movq 24(%rdi),%mm1
  69         add $32,%rdi
  70         pmaddwd 24(%rsi),%mm1
  71         add $32,%rsi
  72         paddd %mm1,%mm0
  73
  74         jmp .Loop1mmx
  75 .Loop1Done:
  76
  77         add $4,%rdx
  78
  79 # MMX dot product loop, not unrolled, crunching 4 terms per loop
  80 # This could be redone as Duff's Device on the unrolled loop above
  81 .Loop2: sub $1,%rdx
  82         jl   .Loop2Done
  83
  84         movq (%rdi),%mm1
  85         add $8,%rdi
  86         pmaddwd (%rsi),%mm1
  87         add $8,%rsi
  88         paddd %mm1,%mm0
  89         jmp .Loop2
  90 .Loop2Done:
  91
  92         movd %mm0,%edx          # right-hand word to edx
  93         punpckhdq %mm0,%mm0     # left-hand word to right side of %mm0
  94         movd %mm0,%eax
  95         addl %edx,%eax          # running sum now in %eax
  96         emms                    # done with MMX
  97
  98         retq
  99
 100 FUNC_TAIL(short_dotprod_mmx)
 101         .ident  "Hand coded x86_64 MMX assembly"
 102
 103 #if defined(__linux__) && defined(__ELF__)
 104 .section .note.GNU-stack,"",%progbits
 105 #endif