git.gag.com Git - debian/gnuradio/blob - gnuradio-core/src/lib/filter/short_dotprod_mmx.S

   1 #
   2 # Copyright 2002 Free Software Foundation, Inc.
   3 #
   4 # This file is part of GNU Radio
   5 #
   6 # GNU Radio is free software; you can redistribute it and/or modify
   7 # it under the terms of the GNU General Public License as published by
   8 # the Free Software Foundation; either version 3, or (at your option)
   9 # any later version.
  10 #
  11 # GNU Radio is distributed in the hope that it will be useful,
  12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 # GNU General Public License for more details.
  15 #
  16 # You should have received a copy of the GNU General Public License
  17 # along with GNU Radio; see the file COPYING.  If not, write to
  18 # the Free Software Foundation, Inc., 51 Franklin Street,
  19 # Boston, MA 02110-1301, USA.
  20 #
  21
  22 # SIMD MMX dot product
  23 # Equivalent to the following C code:
  24 # long dotprod(signed short *a,signed short *b,int cnt)
  25 # {
  26 #       long sum = 0;
  27 #       cnt *= 4;
  28 #       while(cnt--)
  29 #               sum += *a++ * *b++;
  30 #       return sum;
  31 # }
  32 # a and b should also be 64-bit aligned, or speed will suffer greatly
  33 # Copyright 1999, Phil Karn KA9Q
  34 # May be used under the terms of the GNU public license
  35
  36 #include "assembly.h"
  37
  38
  39         .file   "short_dotprod_mmx.S"
  40         .version        "01.01"
  41 .text
  42         .p2align 3                      # align the function entry to 8 bytes
     #
     # short_dotprod_mmx -- MMX dot product of two arrays of signed 16-bit words.
     #
     # Syntax/target: GNU as, AT&T operand order, 32-bit x86 with MMX.
     # Arguments are read from the stack relative to %ebp:
     #    8(%ebp)  a    pointer to the first array of signed shorts
     #   12(%ebp)  b    pointer to the second array of signed shorts
     #   16(%ebp)  cnt  number of 4-short (8-byte) groups, not the element count
     # Result: %eax = sum of a[i]*b[i] over 4*cnt elements.  The sum is kept in
     #   two 32-bit lanes (paddd) and folded with addl, so it wraps at 32 bits.
     #   A cnt of zero or less skips both loops and yields 0.
     # Registers: %esi, %edi, %ecx and %ebx are pushed on entry and popped on
     #   exit; %mm0/%mm1 are used as scratch and emms is issued before returning.
     # GLOB_SYMB, DEF_FUNC_HEAD and FUNC_TAIL come from assembly.h (not visible
     #   here); presumably they handle symbol naming and ELF type/size bookkeeping
     #   -- TODO confirm against assembly.h.
     #
  43 .globl GLOB_SYMB(short_dotprod_mmx)
  44         DEF_FUNC_HEAD(short_dotprod_mmx)
  45 GLOB_SYMB(short_dotprod_mmx):
  46         pushl %ebp                      # standard frame: args live at 8(%ebp) and up
  47         movl %esp,%ebp
  48         pushl %esi                      # preserve the registers this routine overwrites
  49         pushl %edi
  50         pushl %ecx
  51         pushl %ebx
  52         movl 8(%ebp),%esi       # esi = a
  53         movl 12(%ebp),%edi      # edi = b
  54         movl 16(%ebp),%ecx      # ecx = cnt (groups of 4 shorts)
  55         pxor %mm0,%mm0          # clear running sum (in two 32-bit halves)
  56
  57 # MMX dot product loop unrolled 4 times, crunching 16 terms per loop
  58         .p2align 4                      # align the hot loop top to 16 bytes
  59 .Loop1mmx:      subl $4,%ecx            # claim 4 groups (16 shorts) for this pass
  60         jl   .Loop1Done                 # fewer than 4 groups left: go to the tail loop
  61
  62         movq (%esi),%mm1        # mm1 = a[3],a[2],a[1],a[0]
  63         pmaddwd (%edi),%mm1     # mm1 = b[3]*a[3]+b[2]*a[2],b[1]*a[1]+b[0]*a[0]
  64         paddd %mm1,%mm0         # add both 32-bit partial sums into mm0
  65
  66         movq 8(%esi),%mm1       # same three steps for a[4..7], b[4..7]
  67         pmaddwd 8(%edi),%mm1
  68         paddd %mm1,%mm0
  69
  70         movq 16(%esi),%mm1      # a[8..11], b[8..11]
  71         pmaddwd 16(%edi),%mm1
  72         paddd %mm1,%mm0
  73
  74         movq 24(%esi),%mm1      # a[12..15], b[12..15]
  75         addl $32,%esi           # a += 16 shorts; presumably interleaved with the MMX ops for scheduling
  76         pmaddwd 24(%edi),%mm1   # edi has not been advanced yet, so offset 24 is still correct
  77         addl $32,%edi           # b += 16 shorts
  78         paddd %mm1,%mm0
  79
  80         jmp .Loop1mmx
  81 .Loop1Done:
  82
  83         addl $4,%ecx            # undo the final subl: ecx = leftover groups (0..3 when cnt >= 0)
  84
  85 # MMX dot product loop, not unrolled, crunching 4 terms per loop
  86 # This could be redone as Duff's Device on the unrolled loop above
  87 .Loop2: subl $1,%ecx            # claim one group of 4 shorts
  88         jl   .Loop2Done         # none left
  89
  90         movq (%esi),%mm1        # mm1 = next 4 shorts of a
  91         addl $8,%esi            # a += 4 shorts
  92         pmaddwd (%edi),%mm1     # multiply by the next 4 shorts of b, add adjacent pairs
  93         addl $8,%edi            # b += 4 shorts
  94         paddd %mm1,%mm0         # accumulate into the two 32-bit halves of mm0
  95         jmp .Loop2
  96 .Loop2Done:
  97
  98         movd %mm0,%ebx          # ebx = low 32-bit half of the running sum
  99         punpckhdq %mm0,%mm0     # copy the high 32-bit half of mm0 into its low half
 100         movd %mm0,%eax          # eax = former high half
 101         addl %ebx,%eax          # running sum now in %eax
 102         emms                    # done with MMX
 103
 104         popl %ebx               # restore saved registers in reverse push order
 105         popl %ecx
 106         popl %edi
 107         popl %esi
 108         movl %ebp,%esp          # esp already equals ebp after the four pops; kept as the usual epilogue
 109         popl    %ebp
 110         ret                     # result is returned in %eax
 111
 112 FUNC_TAIL(short_dotprod_mmx)
 113         .ident  "Hand coded x86 MMX assembly"
 114
 115 #if defined(__linux__) && defined(__ELF__)
 116 .section .note.GNU-stack,"",%progbits
 117 #endif