shll $1, %ecx # count * 2
.p2align 4
-.3dnow_float_dotprod_really_simple_loop1:
+.Loop1:
movq (%eax), %mm0
pfmul (%edx), %mm0
pfadd %mm0, %mm4
addl $8, %edx
addl $8, %eax
decl %ecx
- jne .3dnow_float_dotprod_really_simple_loop1
+ jne .Loop1
# at this point mm4 contains partial sums
pxor %mm5, %mm5 # mm5 = 0 0
.p2align 4
-.3dnow_float_dotprod_simple_loop1:
+.Loop1:
movq 0(%eax), %mm0
movq 8(%eax), %mm1
addl $16, %edx
addl $16, %eax
decl %ecx
- jne .3dnow_float_dotprod_simple_loop1
+ jne .Loop1
# at this point mm4 and mm5 contain partial sums
#
.p2align 4
-.ccomplex_dotprod_3dnow_loop1:
+.Loop1:
# complex prod: C += A * B, w/ temp Z, mmPN=$80000000
#
.L1_test:
decl %ecx
- jge .ccomplex_dotprod_3dnow_loop1
+ jge .Loop1
# We've handled the bulk of multiplies up to here.
# Let's see if original n_2_ccomplex_blocks was odd.
#
.p2align 4
-.ccomplex_dotprod_3dnow64_loop1:
+.Loop1:
# complex prod: C += A * B, w/ temp Z, mmPN=$80000000
#
.L1_test:
dec %rax
- jge .ccomplex_dotprod_3dnow64_loop1
+ jge .Loop1
# We've handled the bulk of multiplies up to here.
# Let's see if original n_2_ccomplex_blocks was odd.
#
.p2align 4
-.ccomplex_dotprod_3dnowext_loop1:
+.Loop1:
# complex prod: C += A * B, w/ temp Z
#
.L1_test:
decl %ecx
- jge .ccomplex_dotprod_3dnowext_loop1
+ jge .Loop1
# We've handled the bulk of multiplies up to here.
# Let's see if original n_2_ccomplex_blocks was odd.
#
.p2align 4
-.ccomplex_dotprod_3dnowext64_loop1:
+.Loop1:
# complex prod: C += A * B, w/ temp Z
#
.L1_test:
dec %rax
- jge .ccomplex_dotprod_3dnowext64_loop1
+ jge .Loop1
# We've handled the bulk of multiplies up to here.
# Let's see if original n_2_ccomplex_blocks was odd.
#
.p2align 4
-.ccomplex_dotprod_sse_loop1:
+.Loop1:
# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000
#
.L1_test:
decl %ecx
- jge .ccomplex_dotprod_sse_loop1
+ jge .Loop1
# We've handled the bulk of multiplies up to here.
# Let's see if original n_2_ccomplex_blocks was odd.
#
.p2align 4
-.ccomplex_dotprod_sse64_loop1:
+.Loop1:
# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000
#
.L1_test:
dec %rax
- jge .ccomplex_dotprod_sse64_loop1
+ jge .Loop1
# We've handled the bulk of multiplies up to here.
# Let's see if original n_2_ccomplex_blocks was odd.
#
.p2align 4
-.complex_dotprod_3dnow_loop1:
+.Loop1:
pfmul 0(%edx), %mm0
pfadd %mm2, %mm6
.L1_test:
decl %ecx
- jge .complex_dotprod_3dnow_loop1
+ jge .Loop1
# We've handled the bulk of multiplies up to here.
# Now accumulate the final two additions and see if original
#
.p2align 4
-.complex_dotprod_3dnow64_loop1:
+.Loop1:
pfmul 0(%rsi), %mm0
pfadd %mm2, %mm6
.L1_test:
dec %rax
- jge .complex_dotprod_3dnow64_loop1
+ jge .Loop1
# We've handled the bulk of multiplies up to here.
# Now accumulate the final two additions and see if original
#
.p2align 4
-.complex_dotprod_3dnowext_loop1:
+.Loop1:
pfmul 0(%edx), %mm0
pfadd %mm2, %mm6
.L1_test:
decl %ecx
- jge .complex_dotprod_3dnowext_loop1
+ jge .Loop1
# We've handled the bulk of multiplies up to here.
# Now accumulate the final two additions and see if original
#
.p2align 4
-.complex_dotprod_3dnowext64_loop1:
+.Loop1:
pfmul 0(%rsi), %mm0
pfadd %mm2, %mm6
.L1_test:
dec %rax
- jge .complex_dotprod_3dnowext64_loop1
+ jge .Loop1
# We've handled the bulk of multiplies up to here.
# Now accumulate the final two additions and see if original
jmp .L1_test
.p2align 4
-.complex_dotprod_sse_loop1:
+.Loop1:
pxor %mm0, %mm0
punpcklwd 0(%eax), %mm0
addps %xmm0, %xmm4
.L1_test:
decl %ecx
- jge .complex_dotprod_sse_loop1
+ jge .Loop1
# set up for primary loop which is unrolled 4 times
# hence enter loop at top
.p2align 4
-.complex_dotprod_sse_loop2:
+.Loop2:
mulps (%edx), %xmm0
addps %xmm2, %xmm6
addl $0x40, %edx
addl $0x10, %eax
decl %ecx
- jne .complex_dotprod_sse_loop2
+ jne .Loop2
# OK, now we've done with all the multiplies, but
# we still need to handle the unaccumulated
jmp .L1_test
.p2align 4
-.complex_dotprod_sse64_loop1:
+.Loop1:
pxor %mm0, %mm0
punpcklwd 0(%rdi), %mm0
addps %xmm0, %xmm4
.L1_test:
dec %rax
- jge .complex_dotprod_sse64_loop1
+ jge .Loop1
# set up for primary loop which is unrolled 4 times
# hence enter loop at top
.p2align 4
-.complex_dotprod_sse64_loop2:
+.Loop2:
mulps (%rsi), %xmm0
addps %xmm2, %xmm6
add $0x40, %rsi
add $0x10, %rdi
dec %rdx
- jne .complex_dotprod_sse64_loop2
+ jne .Loop2
# OK, now we've done with all the multiplies, but
# we still need to handle the unaccumulated
#
.p2align 4
-.fcomplex_dotprod_3dnow_loop1:
+.Loop1:
pfmul 0(%edx), %mm0
pfadd %mm2, %mm6
.L1_test:
decl %ecx
- jge .fcomplex_dotprod_3dnow_loop1
+ jge .Loop1
# We've handled the bulk of multiplies up to here.
# Now accumulate the final two additions and see if original
#
.p2align 4
-.fcomplex_dotprod_3dnow64_loop1:
+.Loop1:
pfmul 0(%rsi), %mm0
pfadd %mm2, %mm6
.L1_test:
dec %rax
- jge .fcomplex_dotprod_3dnow64_loop1
+ jge .Loop1
# We've handled the bulk of multiplies up to here.
# Now accumulate the final two additions and see if original
jmp .L1_test
.p2align 4
-.fcomplex_dotprod_sse_loop1:
+.Loop1:
movlps 0(%eax), %xmm0
shufps $0x50, %xmm0, %xmm0 # b01010000
addps %xmm0, %xmm4
.L1_test:
decl %ecx
- jge .fcomplex_dotprod_sse_loop1
+ jge .Loop1
# set up for primary loop which is unrolled 4 times
# hence enter loop at top
.p2align 4
-.fcomplex_dotprod_sse_loop2:
+.Loop2:
addps %xmm2, %xmm6
movlps 0x10(%eax), %xmm2
addl $0x40, %edx
addl $0x20, %eax
decl %ecx
- jne .fcomplex_dotprod_sse_loop2
+ jne .Loop2
# OK, now we've done with all the multiplies, but
# we still need to handle the unaccumulated
jmp .L1_test
.p2align 4
-.fcomplex_dotprod_sse64_loop1:
+.Loop1:
movlps 0(%rdi), %xmm0
shufps $0x50, %xmm0, %xmm0 # b01010000
addps %xmm0, %xmm4
.L1_test:
dec %rax
- jge .fcomplex_dotprod_sse64_loop1
+ jge .Loop1
# set up for primary loop which is unrolled 4 times
# hence enter loop at top
.p2align 4
-.fcomplex_dotprod_sse64_loop2:
+.Loop2:
addps %xmm2, %xmm6
movlps 0x10(%rdi), %xmm2
add $0x40, %rsi
add $0x20, %rdi
dec %rdx
- jne .fcomplex_dotprod_sse64_loop2
+ jne .Loop2
# OK, now we've done with all the multiplies, but
# we still need to handle the unaccumulated
#
.p2align 4
-.float_dotprod_3dnow_loop1:
+.Loop1:
pfmul 0(%edx), %mm0
pfadd %mm2, %mm6
movq 16(%eax), %mm2
addl $32, %eax
.L1_test:
decl %ecx
- jge .float_dotprod_3dnow_loop1
+ jge .Loop1
# We've handled the bulk of multiplies up to here.
# Now accumulate the final two additions and see if original
#
.p2align 4
-.float_dotprod_3dnow64_loop1:
+.Loop1:
pfmul 0(%rdi), %mm0
pfadd %mm2, %mm6
movq 16(%rsi), %mm2
add $32, %rsi
.L1_test:
dec %rax
- jge .float_dotprod_3dnow64_loop1
+ jge .Loop1
# We've handled the bulk of multiplies up to here.
# Now accumulate the final two additions and see if original
jmp .L1_test
.p2align 4
-.float_dotprod_sse_loop1:
+.Loop1:
movaps (%eax), %xmm0
mulps (%edx), %xmm0
addl $0x10, %edx
addps %xmm0, %xmm4
.L1_test:
decl %ecx
- jge .float_dotprod_sse_loop1
+ jge .Loop1
# set up for primary loop which is unrolled 4 times
# hence enter loop at top
.p2align 4
-.float_dotprod_sse_loop2:
+.Loop2:
mulps (%edx), %xmm0
addps %xmm2, %xmm6
movaps 0x20(%eax), %xmm2
addl $0x40, %edx
addl $0x40, %eax
decl %ecx
- jne .float_dotprod_sse_loop2
+ jne .Loop2
# OK, now we've done with all the multiplies, but
# we still need to handle the unaccumulated
jmp .L1_test
.p2align 4
-.float_dotprod_sse64_loop1:
+.Loop1:
movaps (%rsi), %xmm0
mulps (%rdi), %xmm0
add $0x10, %rdi
addps %xmm0, %xmm4
.L1_test:
dec %rax
- jge .float_dotprod_sse64_loop1
+ jge .Loop1
# set up for primary loop which is unrolled 4 times
# hence enter loop at top
.p2align 4
-.float_dotprod_sse64_loop2:
+.Loop2:
mulps (%rdi), %xmm0
addps %xmm2, %xmm6
movaps 0x20(%rsi), %xmm2
add $0x40, %rdi
add $0x40, %rsi
dec %rdx
- jne .float_dotprod_sse64_loop2
+ jne .Loop2
# OK, now we've done with all the multiplies, but
# we still need to handle the unaccumulated