global _start
section .data
        ;; Initialised data: benchmark start time, hex output buffer and
        ;; the nibble -> ASCII lookup table used by writehex.
        timelo  dd      0                       ;; low dword of the starting time-stamp counter
        timehi  dd      0                       ;; high dword of the starting time-stamp counter
        b2hout  db      0,0,0,0,0,0,0,0,0       ;; 8 hex digits + 1 newline
        b2hlut  db      '0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f'

section .bss
        ;; Uninitialised storage belongs in .bss: resb inside .data makes
        ;; NASM warn and emit the zero bytes into the output file.
        ;; Each 1-byte pad shifts the following buffer one byte further
        ;; from the 8-byte alignment of strbuf1.
        alignb  8
        strbuf1 resb    256                     ;; aligned
        pad1    resb    1
        strbuf2 resb    256                     ;; misaligned by 1
        pad2    resb    1
        strbuf3 resb    256                     ;; misaligned by 2
        pad3    resb    1
        strbuf4 resb    256                     ;; misaligned by 3
section .text
;; Program entry point: time strlen_3 and strlen_4 on strings of length
;; 255 and 251, 100000 loop iterations each, then terminate.
;;
;; run_bench function, length, iterations
;; Pushes the three arguments right-to-left and calls benchmark, which
;; removes them itself (ret 12).
%macro run_bench 3
        push    %3
        push    %2
        push    %1
        call    benchmark
%endmacro

_start:
        run_bench strlen_3, 255, 100000
        run_bench strlen_3, 251, 100000
        run_bench strlen_4, 255, 100000
        run_bench strlen_4, 251, 100000
        call    exit                    ;; does not return
benchmark: ;; function, string length, iterations
        ;; Fills the four string buffers with strings of the requested
        ;; length, then calls `function` on each buffer once per
        ;; iteration and prints the low 32 bits of the elapsed
        ;; time-stamp-counter delta as hex.
        ;; Stack after the two pushes below:
        ;;   [esp + 12] = function, [esp + 16] = length, [esp + 20] = iterations
        ;; The callee removes its three arguments (ret 12).
        push    ebx
        push    esi
        mov     esi, [esp + 12]         ;; esi = function under test
        mov     ebx, [esp + 20]         ;; ebx = remaining iterations
        push    dword [esp + 16]        ;; length (address is formed before esp moves)
        call    makestring
        rdtsc                           ;; start time -> timehi:timelo
        mov     dword [timehi], edx
        mov     dword [timelo], eax
.again:
        ;; Queue one argument per call; each call pops its own (ret 4),
        ;; so strbuf4 is measured first and strbuf1 last.
        push    strbuf1
        push    strbuf2
        push    strbuf3
        push    strbuf4
        call    esi
        call    esi
        call    esi
        call    esi
        dec     ebx
        jnz     .again
        rdtsc
        sub     eax, dword [timelo]     ;; edx:eax = elapsed ticks
        sbb     edx, dword [timehi]
        push    eax                     ;; only the low dword is printed; the
        call    writehex                ;; high dword in edx is not output
        pop     esi
        pop     ebx
        ret     12
align 16
;; strlen_1 - string length using the repne scasb string instruction.
;; In:    [esp + 4] = pointer to a NUL-terminated string
;; Out:   eax = length (the argument is popped by the callee, ret 4)
;; Saves: edi.  Clobbers: ecx, edx, flags.
strlen_1:
        push    edi
        mov     edi, [esp + 8]          ;; string pointer (1 push + return address)
        xor     eax, eax                ;; al = 0, the byte to search for
        mov     edx, edi
        neg     edx                     ;; edx = -start
        mov     ecx, -1                 ;; no practical limit on the scan count
        cld                             ;; scan towards higher addresses
        repne scasb                     ;; leaves edi one past the NUL
        lea     eax, [edi + edx - 1]    ;; (edi - start) - 1 = length
        pop     edi
        ret     4
align 16
;; strlen_2 - string length using a plain one-byte-per-iteration loop.
;; In:    [esp + 4] = pointer to a NUL-terminated string
;; Out:   eax = length (the argument is popped by the callee, ret 4)
;; Clobbers: flags only.
strlen_2:
        mov     eax, [esp + 4]
        sub     eax, 1                  ;; pre-decrement: the loop increments first
        jmp     .next                   ;; hop over the alignment padding
align 16
.next:
        inc     eax
        cmp     byte [eax], 0
        jne     .next
        sub     eax, [esp + 4]          ;; address of NUL - start = length
        ret     4
align 16
;; strlen_3 - string length, scanning two dwords (8 bytes) per iteration.
;; In:    [esp + 4] = pointer to a NUL-terminated string
;; Out:   eax = length (the argument is popped by the callee, ret 4)
;; Saves: ebx, esi, edi.  Clobbers: ecx, edx, flags.
;;
;; Zero-byte test for a dword x:  (x - 01010101h) & ~x & 80808080h
;; is non-zero exactly when x contains a zero byte, and its lowest set
;; bit is bit 7 of the first zero byte.
;;
;; The head loop aligns the pointer to 8 bytes, not 4: the main loop
;; loads 8 bytes at a time, and only an 8-byte-aligned chunk is
;; guaranteed to lie within the page that holds the byte being examined.
;; With 4-byte alignment the second dword could fall on the following,
;; possibly unmapped, page when the terminator is in the first dword.
strlen_3:
        push    ebx
        push    esi
        push    edi
        mov     eax, [esp + 16]         ;; string pointer (3 pushes + return address)
        mov     ebx, -01010101h         ;; adding this subtracts 01010101h
.aligning:
        test    eax, 7
        jz      .scan
        mov     dl, [eax]
        test    dl, dl
        jz      .found                  ;; terminator found before reaching alignment
        inc     eax
        jmp     .aligning
align 32
.scan:
        mov     esi, [eax]
        mov     edi, [eax + 4]
        lea     eax, [eax + 8]          ;; eax = end of the chunk just loaded
        lea     ecx, [esi + ebx]
        lea     edx, [edi + ebx]
        not     esi
        not     edi
        and     ecx, esi
        and     edx, edi
        and     ecx, 80808080h
        and     edx, 80808080h
        test    ecx, ecx                ;; flags currently reflect edx, so re-test ecx
        jnz     .sub8
        test    edx, edx
        jz      .scan
        lea     eax, [eax - 4]          ;; zero byte is in the second dword
        mov     ecx, edx
        jmp     .bytesearch
.sub8:
        lea     eax, [eax - 8]          ;; zero byte is in the first dword
.bytesearch:
        ;; eax = address of the dword, ecx = its marker mask.
        ;; Walk the mask bytes from lowest to highest to locate the NUL.
        test    cl, cl
        jnz     .found
        inc     eax
        test    ch, ch
        jnz     .found
        shr     ecx, 16
        inc     eax
        test    cl, cl
        jnz     .found
        inc     eax                     ;; only the top byte is left
.found:
        sub     eax, [esp + 16]         ;; address of NUL - start = length
        pop     edi
        pop     esi
        pop     ebx
        ret     4
align 16
;; strlen_4 - string length, scanning two dwords (8 bytes) per iteration
;; and locating the terminator with bsf.
;; In:    [esp + 4] = pointer to a NUL-terminated string
;; Out:   eax = length (the argument is popped by the callee, ret 4)
;; Saves: ebx, esi, edi.  Clobbers: ecx, edx, flags.
;;
;; Zero-byte test for a dword x:  (x - 01010101h) & ~x & 80808080h
;; is non-zero exactly when x contains a zero byte, and its lowest set
;; bit is bit 7 of the first zero byte, so bsf / 8 is that byte's index.
;;
;; The head is examined one byte at a time until the pointer is 8-byte
;; aligned. A dword load at the unaligned start address could read past
;; the terminator into a following, possibly unmapped, page; the same
;; applies to the 8-byte main loop when the pointer is only 4-byte
;; aligned. An aligned 8-byte chunk always lies within a single page.
;; The entry point is aligned to 16 like the other strlen variants.
strlen_4:
        push    ebx
        push    esi
        push    edi
        mov     eax, [esp + 16]         ;; string pointer (3 pushes + return address)
        mov     ebx, -01010101h         ;; adding this subtracts 01010101h
.head:
        test    eax, 7
        jz      .scan
        cmp     byte [eax], 0
        je      .found                  ;; terminator found before reaching alignment
        inc     eax
        jmp     .head
align 16
.scan:
        mov     esi, [eax]
        mov     edi, [eax + 4]
        add     eax, 8                  ;; eax = end of the chunk just loaded
        lea     ecx, [esi + ebx]
        lea     edx, [edi + ebx]
        not     esi
        not     edi
        and     ecx, esi
        and     edx, edi
        and     ecx, 80808080h
        jnz     .foundlo
        and     edx, 80808080h
        jz      .scan
.foundhi:
        ;; Zero byte in the second dword, which starts at eax - 4.
        bsf     edx, edx                ;; bit index 7, 15, 23 or 31
        sub     eax, [esp + 16]
        shr     edx, 3                  ;; -> byte index 0..3
        lea     eax, [eax + edx - 4]
        pop     edi
        pop     esi
        pop     ebx
        ret     4
.foundlo:
        ;; Zero byte in the first dword, which starts at eax - 8.
        bsf     ecx, ecx
        sub     eax, [esp + 16]
        shr     ecx, 3
        lea     eax, [eax + ecx - 8]
        pop     edi
        pop     esi
        pop     ebx
        ret     4
.found:
        ;; eax already points at the terminator.
        sub     eax, [esp + 16]
        pop     edi
        pop     esi
        pop     ebx
        ret     4
;; makestring - fill strbuf1..strbuf4 with a string of the given size.
;; In:    [esp + 4] = size (the argument is popped by the callee, ret 4)
;;
;; queue_fill buffer
;; Pushes one (size, pointer) argument pair for fillstring; the size
;; is taken from eax.
%macro queue_fill 1
        push    eax
        push    %1
%endmacro

makestring:
        mov     eax, [esp + 4]          ;; size
        ;; All four argument pairs are queued first. Each fillstring
        ;; call removes the pair on top (ret 8), so strbuf4 is filled
        ;; first and strbuf1 last.
        queue_fill strbuf1
        queue_fill strbuf2
        queue_fill strbuf3
        queue_fill strbuf4
        call    fillstring
        call    fillstring
        call    fillstring
        call    fillstring
        ret     4
;; fillstring - write a NUL-terminated string of `size` non-zero bytes.
;; In:    [esp + 4] = destination pointer
;;        [esp + 8] = size; size + 1 bytes are written
;;        (both arguments are popped by the callee, ret 8)
;; Clobbers: eax (al), ecx, edx, flags.
;; The buffer is filled from the last character back to the first with
;; the values 1, 2, ..., 255, 1, ... so no byte before the terminator
;; is zero.
fillstring:
        mov     edx, [esp + 4]          ;; ptr
        mov     ecx, [esp + 8]          ;; size
        mov     al, 1                   ;; first fill value
        mov     byte [edx + ecx], 0     ;; terminator
        jmp     .next
.store:
        mov     byte [edx + ecx], al
        add     al, 1
        jnz     .next
        mov     al, 1                   ;; wrapped past 255: skip the value 0
.next:
        dec     ecx
        jns     .store                  ;; stop once the index drops below 0
        ret     8
;; writehex - print a 32-bit value as 8 lowercase hex digits and a newline.
;; In:    [esp + 4] = value (the argument is popped by the callee, ret 4)
;; Out:   eax = whatever the int 80h call leaves there
;; Saves: ebx, esi, edi.  Clobbers: ecx, edx, flags.
;; The text is built in b2hout and written to file descriptor 1.
writehex:
        push    ebx
        push    esi
        push    edi
        mov     esi, [esp + 16]         ;; binary value (3 pushes + return address)
        mov     edi, b2hout             ;; output cursor
        mov     ecx, 32 - 4             ;; shift count for the top nibble
.digit:
        mov     eax, esi
        shr     eax, cl
        and     eax, 0fh                ;; isolate the current nibble
        mov     al, byte [b2hlut + eax] ;; nibble -> ASCII digit
        mov     byte [edi], al
        inc     edi
        sub     ecx, 4
        jns     .digit                  ;; runs for shifts 28, 24, ..., 0
        mov     byte [edi], 10          ;; trailing newline
        ;; Linux int 80h: eax = 4 (write), ebx = fd, ecx = buffer, edx = length
        mov     eax, 4
        mov     ebx, 1
        mov     ecx, b2hout
        mov     edx, 9
        int     80h
        pop     edi
        pop     esi
        pop     ebx
        ret     4
;; exit - terminate the process with status 0; does not return.
exit:
        xor     ebx, ebx                ;; exit status 0
        mov     eax, 01h                ;; Linux int 80h: eax = 1 (exit)
        int     80h
Z2xvYmFsIF9zdGFydAoKCgpzZWN0aW9uIC5kYXRhCgoJdGltZWxvCWRkCTAKCXRpbWVoaQlkZAkwCgliMmhvdXQJZGIJMCwwLDAsMCwwLDAsMCwwLDAKCgoJYjJobHV0CWRiCScwJywnMScsJzInLCczJywnNCcsJzUnLCc2JywnNycsJzgnLCc5JywnYScsJ2InLCdjJywnZCcsJ2UnLCdmJwphbGlnbiA4CglzdHJidWYxCXJlc2IJMjU2CglwYWQxCXJlc2IJMSA7OyBtaXNhbGlnbmVkIGJ5IDEKCXN0cmJ1ZjIJcmVzYgkyNTYKCXBhZDIJcmVzYgkxIDs7IG1pc2FsaWduZWQgYnkgMgoJc3RyYnVmMwlyZXNiCTI1NgoJcGFkMwlyZXNiCTEgOzsgbWlzYWxpZ25lZCBieSAzCglzdHJidWY0CXJlc2IJMjU2CgpzZWN0aW9uIC50ZXh0CgoKX3N0YXJ0OgoKCSBwdXNoCTEwMDAwMAoJIHB1c2gJMjU1CgkgcHVzaAlzdHJsZW5fMwoJY2FsbAliZW5jaG1hcmsKCSBwdXNoCTEwMDAwMAoJIHB1c2gJMjUxCgkgcHVzaAlzdHJsZW5fMwoJY2FsbAliZW5jaG1hcmsKCgkgcHVzaAkxMDAwMDAKCSBwdXNoCTI1NQoJIHB1c2gJc3RybGVuXzQKCWNhbGwJYmVuY2htYXJrCgkgcHVzaAkxMDAwMDAKCSBwdXNoCTI1MQoJIHB1c2gJc3RybGVuXzQKCWNhbGwJYmVuY2htYXJrCgoJY2FsbAlleGl0CgoKCmJlbmNobWFyazogOzsgZnVuY3Rpb24sIHN0cmluZyBsZW5ndGgsIGl0ZXJhdGlvbnMKCXB1c2gJZWJ4CglwdXNoCWVzaQoJbW92CWVieCwgW2VzcCArIDIwXQoJbW92CWVjeCwgW2VzcCArIDE2XQoJbW92CWVzaSwgW2VzcCArIDEyXQoJcHVzaAllY3gKCWNhbGwJbWFrZXN0cmluZwoJcmR0c2MKCW1vdglkd29yZCBbdGltZWxvXSwgZWF4Cgltb3YJZHdvcmQgW3RpbWVoaV0sIGVkeAoubG9vcDoKCXB1c2gJc3RyYnVmMQoJcHVzaAlzdHJidWYyCglwdXNoCXN0cmJ1ZjMKCXB1c2gJc3RyYnVmNAoJY2FsbAllc2kKCWNhbGwJZXNpCgljYWxsCWVzaQoJY2FsbAllc2kKCXN1YgllYngsIDEKCWpuegkubG9vcAoJcmR0c2MKCXN1YgllYXgsIGR3b3JkIFt0aW1lbG9dCglzYmIJZWR4LCBkd29yZCBbdGltZWhpXQoJcHVzaAllYXgKCTs7cHVzaAllZHgKCTs7Y2FsbAl3cml0ZWhleAoJY2FsbAl3cml0ZWhleAkKCXBvcAllc2kKCXBvcAllYngKCXJldAkxMgoKYWxpZ24gMTYKc3RybGVuXzE6CglwdXNoCWVkaQoJbW92CWVjeCwgLTEKCXhvcgllZHgsIGVkeAoJbW92CWVkaSwgW2VzcCArIDhdIDs7IHN0ciBwdHIKCWNsZAoJeG9yCWVheCwgZWF4CglzdWIJZWR4LCBlZGkKCXJlcG5lIHNjYXNiCglsZWEJZWF4LCBbZWRpICsgZWR4IC0gMV0KCXBvcAllZGkKCXJldAk0CgoKYWxpZ24gMTYKc3RybGVuXzI6Cgltb3YJZWF4LCBbZXNwICsgNF0KCWRlYwllYXgKCWptcAkubG9vcAphbGlnbiAxNgoubG9vcDoKCWFkZAllYXgsIDEKCXRlc3QJYnl0ZSBbZWF4XSwgMGZmaAoJam56CS5sb29wCglzdWIJZWF4LCBbZXNwICsgNF0KCXJldAk0CgoKYWxpZ24gMTYKc3RybGVuXzM6CglwdXNoCWVieAoJcHVzaAllc2kKCXB1c2gJZWRpCiAgICAgICAgbW92ICAgICBlYXgsIFtlc3AgKyAxNl0g
CiAgICAgICAgbW92ICAgICBlYngsIC0wMTAxMDEwMWggCi5hbGlnbmluZzogCiAgICAgICAgdGVzdCAgICBlYXgsIDMgCiAgICAgICAganogICAgICAuc2NhbiAKICAgICAgICBtb3YgICAgIGRsLCBbZWF4XSAKICAgICAgICB0ZXN0ICAgIGRsLCBkbCAKICAgICAgICBqeiAgICAgIC5mb3VuZCAKICAgICAgICBpbmMgICAgIGVheCAKICAgICAgICBqbXAgICAgIC5hbGlnbmluZyAKYWxpZ24gMzIgCi5zY2FuOiAKCW1vdgllc2ksIFtlYXhdIAoJbW92CWVkaSwgW2VheCArIDRdIAoJbGVhCWVheCwgW2VheCArIDhdIAoJbGVhCWVjeCwgW2VzaSArIGVieF0gICA7ISAKCWxlYQllZHgsIFtlZGkgKyBlYnhdIAoJbm90CWVzaSAKCW5vdAllZGkgCglhbmQJZWN4LCBlc2kgCglhbmQJZWR4LCBlZGkgCglhbmQJZWN4LCAkODA4MDgwODAgCglhbmQJZWR4LCAkODA4MDgwODAgCgl0ZXN0CWVjeCwgZWN4ICAgICAgICAgOyEhIAoJam56CS5zdWI4IAoJdGVzdAllZHgsIGVkeCAKCWp6CS5zY2FuIAoJbGVhCWVheCwgW2VheC00XSAKCW1vdgllY3gsIGVkeCAKCWptcAkuYnl0ZXNlYXJjaCAKLnN1Yjg6IAoJbGVhCWVheCxbZWF4LThdIAouYnl0ZXNlYXJjaDogCgl0ZXN0CWNsLCBjbCAKCWpueiAgICAgLmZvdW5kIAoJaW5jCWVheCAKCXRlc3QJY2gsIGNoIAoJam56CS5mb3VuZCAKCXNocgllY3gsIDE2IAoJaW5jCWVheCAKCXRlc3QJY2wsIGNsIAoJam56CS5mb3VuZCAKCWluYwllYXggCi5mb3VuZDogCglzdWIJZWF4LCBbZXNwICsgMTZdCglwb3AJZWRpCglwb3AJZXNpCglwb3AJZWJ4CiAgICAgICAgcmV0CTQKCgpzdHJsZW5fNDoKCXB1c2gJZWJ4CglwdXNoCWVzaQoJcHVzaAllZGkKCW1vdgllYXgsIFtlc3AgKyAxNl0KCW1vdgllYngsIC0wMTAxMDEwMWgKCXRlc3QJZWF4LCAzCglqegkuc2NhbgoJbW92CWVkeCwgW2VheF0KCXRlc3QJZGwsIGRsCglqegkuZm91bmQKCWluYwllYXgKCXRlc3QJZWF4LCAzCglqegkuc2NhbgoJdGVzdAlkaCwgZGgKCWp6CS5mb3VuZAoJaW5jCWVheAoJc2hyCWVkeCwgMTYKCXRlc3QJZWF4LCAzCglqegkuc2NhbgoJdGVzdAlkbCwgZGwKCWp6CS5mb3VuZAoJaW5jCWVheAoJam1wCS5zY2FuCmFsaWduIDE2Ci5zY2FuOgoJbW92CWVzaSwgW2VheF0KCW1vdgllZGksIFtlYXggKyA0XQoJYWRkCWVheCwgOAoJbGVhCWVjeCwgW2VzaSArIGVieF0KCWxlYQllZHgsIFtlZGkgKyBlYnhdCglub3QJZXNpCglub3QJZWRpCglhbmQJZWN4LCBlc2kKCWFuZAllZHgsIGVkaQoJYW5kCWVjeCwgODA4MDgwODBoCglqbnoJLmZvdW5kbG8KCWFuZAllZHgsIDgwODA4MDgwaAoJanoJLnNjYW4KLmZvdW5kaGk6Cglic2YJZWR4LCBlZHgKCXN1YgllYXgsIFtlc3AgKyAxNl0KCXNocgllZHgsIDMKCWxlYQllYXgsIFtlYXggKyBlZHggLSA0XQoJcG9wCWVkaQoJcG9wCWVzaQoJcG9wCWVieAoJcmV0CTQKLmZvdW5kbG86Cglic2YJZWN4LCBlY3gKCXN1YgllYXgsIFtlc3AgKyAxNl0KCXNocgllY3gsIDMKCWxlYQllYXgsIFtlYXggKyBlY3gg
LSA4XQoJcG9wCWVkaQoJcG9wCWVzaQoJcG9wCWVieAoJcmV0CTQKLmZvdW5kOgoJc3ViCWVheCwgW2VzcCArIDE2XQoJcG9wCWVkaQoJcG9wCWVzaQoJcG9wCWVieAoJcmV0CTQKCQptYWtlc3RyaW5nOgoJbW92CWVheCwgW2VzcCArIDRdIDs7IHNpemUKCXB1c2gJZWF4CglwdXNoCXN0cmJ1ZjEKCXB1c2gJZWF4CglwdXNoCXN0cmJ1ZjIKCXB1c2gJZWF4CglwdXNoCXN0cmJ1ZjMKCXB1c2gJZWF4CglwdXNoCXN0cmJ1ZjQKCWNhbGwJZmlsbHN0cmluZwoJY2FsbAlmaWxsc3RyaW5nCgljYWxsCWZpbGxzdHJpbmcKCWNhbGwJZmlsbHN0cmluZwoJcmV0CTQKCmZpbGxzdHJpbmc6Cgltb3YJYWwsIDEKCW1vdgllZHgsIFtlc3AgKyA0XSA7OyBwdHIKCW1vdgllY3gsIFtlc3AgKyA4XSA7OyBzaXplCgltb3YJYnl0ZSBbZWR4ICsgZWN4XSwgMAoubG9vcDoKCWRlYwllY3gKCWpzCS5kb25lCgltb3YJYnl0ZSBbZWR4ICsgZWN4XSwgYWwKCWFkZAlhbCwgMQoJam56CS5za2lwCgltb3YJYWwsIDEKLnNraXA6CglqbXAJLmxvb3AKLmRvbmU6CglyZXQJOAoKd3JpdGVoZXg6CglwdXNoCWVieAoJcHVzaAllc2kKCXB1c2gJZWRpCgltb3YJZWF4LCBbZXNwICsgMTZdIDs7IGJpbmFyeSB2YWx1ZQoJbW92CWVjeCwgMzIgLSA0Cgltb3YJZXNpLCBiMmhsdXQKCW1vdgllZGksIGIyaG91dAoubG9vcDoKCW1vdgllZHgsIGVheAoJc2hyCWVkeCwgY2wKCWFuZAllZHgsIDBmaCAKCW1vdglkbCwgYnl0ZSBbZXNpICsgZWR4XQoJbW92CWJ5dGUgW2VkaV0sIGRsCglhZGQJZWRpLCAxCglzdWIJZWN4LCA0CglqbnMJLmxvb3AKCW1vdglieXRlIFtlZGldLCAxMAoJOzsKCW1vdgllYXgsIDQKCW1vdgllYngsIDEKCW1vdgllY3gsIGIyaG91dAoJbW92CWVkeCwgOQoJaW50CTgwaAoJcG9wCWVkaQoJcG9wCWVzaQoJcG9wCWVieAoJcmV0CTQKCmV4aXQ6CgoJbW92CWVheCwgMDFoCgl4b3IJZWJ4LCBlYngKCWludAk4MGg=