    ; Zero-initialised working buffers: two inputs (src1, src2) and one output
    ; (dst), each `size` bytes long. Every buffer has a matching *_end label so
    ; code can index it with a negative offset that counts up towards zero.
    ;
    ; The section is aligned to 64 bytes and `size` is a multiple of 64, so each
    ; buffer starts 64-byte aligned; that also satisfies the 32-byte alignment
    ; that aligned 256-bit loads/stores (vmovaps ymm) require.
    SECTION .bss
    alignb 64                           ; alignb (resb fill) is the BSS-safe form; plain align pads with nop
    %assign size 4000h                  ; bytes per buffer (16 KiB)
    src1:       resb size
    src1_end:
    src2:       resb size
    src2_end:
    dst:        resb size
    dst_end:

    ;-----------------------------------------------------------------------
    ; _start: load + FMA + store throughput loop over the .bss buffers.
    ;
    ; One pass walks src1, src2 and dst in 32-byte (one ymm) steps computing
    ;       dst[k] = ymm2 * src2[k] + src1[k]
    ; with ymm2 zeroed once up front. Passes repeat until TOTAL_VECS 32-byte
    ; groups have been processed in total.
    ;
    ; Calibration: TOTAL_VECS is 3.6e9, so on a core that really runs at
    ; 3.6 GHz the wall-clock run time in seconds equals the average number of
    ; core clocks spent per 32-byte group.
    ;
    ; Registers: rax  = byte offset relative to the *_end labels, -size .. 0
    ;            rcx  = passes remaining
    ;            ymm1 = working vector, ymm2 = constant multiplier (all zero)
    ; Control falls through to whatever follows the outer loop.
    ;
    ; NOTE: operands use absolute [disp32 + rax] addressing (one register, no
    ; RIP-relative form is possible with an index), so the object presumably
    ; has to be linked as a non-PIE executable.
    ;-----------------------------------------------------------------------
    SECTION .text
    global _start

    %assign VEC_BYTES   32                          ; bytes per ymm access
    %assign unroll      1                           ; ymm groups per inner-loop iteration
    %assign TOTAL_VECS  3600000000                  ; 32-byte groups processed overall
    %assign PASSES      TOTAL_VECS / (size / VEC_BYTES)

    ; rax must land exactly on 0 for the inner loop to terminate inside the
    ; buffers, and rcx must start non-zero or `dec rcx` wraps around.
    %if size % (VEC_BYTES * unroll) != 0
        %error "size must be a multiple of 32*unroll bytes"
    %endif
    %if PASSES == 0
        %error "size is too large for the configured TOTAL_VECS"
    %endif

_start:
    vzeroupper
    vxorps      ymm2, ymm2, ymm2        ; ymm2 = 0.0 in every lane
    mov         rcx, PASSES

.pass:
    mov         rax, -size              ; restart at the beginning of each buffer

align 16
.vec:
    %assign i 0

    %rep unroll

    vmovaps     ymm1, [i+src1_end+rax]
    vfmadd231ps ymm1, ymm2, [i+src2_end+rax]    ; ymm1 += ymm2 * src2 chunk
    vmovaps     [i+dst_end+rax], ymm1

    %assign i i+VEC_BYTES
    %endrep

    add         rax, VEC_BYTES*unroll
    jnz         .vec                    ; offset reached 0 => end of buffers

    dec         rcx
    jnz         .pass

; _exit: terminate the process with status 0 through a raw `syscall`.
; Assumes Linux x86-64 syscall numbering, where 60 is exit.
; In: nothing.  Does not return.
%assign SYS_EXIT 60
_exit:
    vzeroupper                          ; clear upper ymm halves before leaving AVX code
    mov         eax, SYS_EXIT           ; 32-bit write zero-extends into rax
    xor         edi, edi                ; exit status 0
    syscall
