jmp .L1_test
.p2align 4
-.loop1:
+.float_dotprod_sse64_loop1:
movaps (%rsi), %xmm0
mulps (%rdi), %xmm0
add $0x10, %rdi
addps %xmm0, %xmm4
.L1_test:
dec %rax
- jge .loop1
+ jge .float_dotprod_sse64_loop1
# set up for primary loop which is unrolled 4 times
# hence enter loop at top
.p2align 4
-.loop2:
+.float_dotprod_sse64_loop2:
mulps (%rdi), %xmm0
addps %xmm2, %xmm6
movaps 0x20(%rsi), %xmm2
add $0x40, %rdi
add $0x40, %rsi
dec %rdx
- jne .loop2
+ jne .float_dotprod_sse64_loop2
# OK, now we've done with all the multiplies, but
# we still need to handle the unaccumulated