; ----------------------------------------------------------------------
; Benchmark buffers (NASM syntax, zero-initialised .bss).
;
; Three back-to-back arrays of `size` bytes each (4000h = 16 KiB).
; The first array is aligned to 64 bytes and `size` is a multiple of 64,
; so src1, src2 and dst all start on a 64-byte boundary.
;
; Each <name>_end label is the address one past the last byte of its
; array.  The kernel addresses elements as [<name>_end + rax] with rax
; counting up from -size towards 0, so the *_end labels (not the start
; labels) are what the code actually references.
; ----------------------------------------------------------------------
SECTION .bss
alignb 64                       ; alignb pads with resb; plain `align` would pad with NOP bytes
%assign size 4000h              ; bytes per array
src1:       resb size
src1_end:
src2:       resb size
src2_end:
dst:        resb size
dst_end:
SECTION .text
global _start

; ----------------------------------------------------------------------
; _start -- process entry point: timed  dst[] = src1[] + 0 * src2[]  kernel
;
; Syntax : NASM (Intel operand order), x86-64, Linux syscall convention.
; Needs  : AVX + FMA3 (256-bit vmovaps, vfmadd231ps).
;
; Registers
;   rax   byte offset relative to the *_end labels; runs from -size up to 0
;   rcx   outer passes still to run
;   ymm1  working vector (8 packed single-precision floats)
;   ymm2  all-zero multiplicand, so the FMA leaves ymm1 numerically
;         unchanged while still issuing a second 32-byte load per vector
;
; Timing: every outer pass walks size/32 vectors, and OUTER_PASSES is
; chosen so the whole run processes TOTAL_VEC_ITERS = 3.6e9 vectors.
; On a core clocked at 3.6 GHz the elapsed time in seconds is therefore
; (approximately) the average number of clocks spent per vector.
;
; NOTE(review): the memory operands use absolute label addresses plus an
; index register, so this presumably has to be linked as a non-PIE
; executable -- confirm against the build command.
;
; Never returns: leaves through the exit system call with status 0.
; ----------------------------------------------------------------------
SYS_EXIT        equ 60                  ; Linux x86-64 syscall number for exit
TOTAL_VEC_ITERS equ 3600000000          ; total 32-byte vectors processed per run
%assign vec_bytes 32                    ; bytes in one ymm register
OUTER_PASSES    equ TOTAL_VEC_ITERS / (size / vec_bytes)   ; = 7031250 when size = 4000h

_start:
        vzeroupper
        vxorps      ymm2, ymm2, ymm2    ; ymm2 = 0.0 in every lane
        mov         ecx, OUTER_PASSES   ; 32-bit write zero-extends into rcx

.pass:
        mov         rax, -size          ; restart at the first byte of each array

align 16                                ; keep the hot loop entry 16-byte aligned
.vec:
%assign unroll 1                        ; vectors handled per trip through .vec
%assign i 0                             ; byte displacement of the current unrolled copy

%rep unroll
        vmovaps     ymm1, [i+src1_end+rax]          ; ymm1 = src1 vector
        vfmadd231ps ymm1, ymm2, [i+src2_end+rax]    ; ymm1 += ymm2 * src2 vector
        vmovaps     [i+dst_end+rax], ymm1           ; dst vector = ymm1
%assign i i+vec_bytes
%endrep

        add         rax, vec_bytes*unroll
        jnz         .vec                ; ZF set once rax reaches 0 (end of arrays)

        dec         rcx
        jnz         .pass

_exit:
        vzeroupper
        mov         eax, SYS_EXIT
        xor         edi, edi            ; exit status 0
        syscall
ICAgIFNFQ1RJT04gLmJzcwphbGlnbiA2NAogICAgJWFzc2lnbiBzaXplIDQwMDBoCiAgICBzcmMxOiAgICAgICByZXNiIHNpemUKICAgIHNyYzFfZW5kOgogICAgc3JjMjogICAgICAgcmVzYiBzaXplCiAgICBzcmMyX2VuZDoKICAgIGRzdDogICAgICAgIHJlc2Igc2l6ZQogICAgZHN0X2VuZDoKCiAgICBTRUNUSU9OIC50ZXh0CiAgICBnbG9iYWwgX3N0YXJ0Cgpfc3RhcnQ6CiAgICB2emVyb3VwcGVyCiAgICB2eG9ycHMgICAgICB5bW0yLCB5bW0yLCB5bW0yCiAgICBtb3YgICAgICAgICByY3gsIDcwMzEyNTAgOyAzLjZHSHogLT4gMSBzZWMvY2xvY2sKCi5MMToKICAgIG1vdiAgICAgICAgIHJheCwgLXNpemUKCmFsaWduIDE2Ci5MMjoKICAgICVhc3NpZ24gdW5yb2xsIDEKICAgICVhc3NpZ24gaSAwCgogICAgJXJlcCB1bnJvbGwKCiAgICB2bW92YXBzICAgICB5bW0xLCBbaStzcmMxX2VuZCtyYXhdCiAgICB2Zm1hZGQyMzFwcyB5bW0xLCB5bW0yLCBbaStzcmMyX2VuZCtyYXhdCiAgICB2bW92YXBzICAgICBbaStkc3RfZW5kK3JheF0sIHltbTEKCiAgICAlYXNzaWduIGkgaSszMgogICAgJWVuZHJlcAoKICAgIGFkZCAgICAgICAgIHJheCwgMzIqdW5yb2xsCiAgICBqbmUgICAgICAgICAuTDIKCiAgICBkZWMgICAgICAgICByY3gKICAgIGpuZSAgICAgICAgIC5MMQoKX2V4aXQ6CiAgICB2emVyb3VwcGVyCiAgICBtb3YgcmF4LCA2MAogICAgbW92IHJkaSwgMAogICAgc3lzY2FsbAo=