global _start
section .data
;; rdtsc timestamp captured by benchmark before the timed loop (low / high dword).
timelo  dd      0
timehi  dd      0
;; writehex output line: 8 hex digits followed by one newline byte.
b2hout  db      0,0,0,0,0,0,0,0,0
;; Nibble value (0..15) -> lowercase ASCII hex digit.
b2hlut  db      '0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f'

;; The string buffers are only ever reserved, never initialised, so they
;; belong in .bss: resb inside .data makes NASM warn and emit zero bytes
;; into the object file. alignb is the alignment directive for reserved space.
section .bss
        alignb  8
strbuf1 resb    256
pad1    resb    1       ;; misaligned by 1
strbuf2 resb    256
pad2    resb    1       ;; misaligned by 2
strbuf3 resb    256
pad3    resb    1       ;; misaligned by 3
strbuf4 resb    256
;; Slack after the last buffer: the word-at-a-time scanners load 8 bytes per
;; step and may read a few bytes beyond a terminator stored in the final
;; byte of strbuf4.
strslack resb   8
section .text
;; RUN_BENCHMARK function, string length, iterations
;; Pushes the three arguments right to left and calls benchmark, which
;; removes them itself on return (ret 12).
%macro RUN_BENCHMARK 3
        push    %3
        push    %2
        push    %1
        call    benchmark
%endmacro

;; Program entry point: time strlen_3 and strlen_4 on 255- and 251-byte
;; strings, 100000 rounds each, then terminate through exit.
_start:
        RUN_BENCHMARK strlen_3, 255, 100000
        RUN_BENCHMARK strlen_3, 251, 100000
        RUN_BENCHMARK strlen_4, 255, 100000
        RUN_BENCHMARK strlen_4, 251, 100000
        call    exit                    ;; issues the exit system call
;; benchmark: function, string length, iterations
;;
;; Rebuilds the four test strings at the requested length (makestring), then
;; runs `iterations` rounds; each round calls `function` once per buffer.
;; The rdtsc difference across the loop is computed as edx:eax and the low
;; dword is printed with writehex.
;;
;; In:    three dword stack arguments, removed on return (ret 12)
;; Saves: ebx, esi
;; Uses:  eax, ecx, edx
%define BENCH_ARG_FUNCTION      12      ;; offsets are valid after the two
%define BENCH_ARG_LENGTH        16      ;; register saves at function entry
%define BENCH_ARG_ITERATIONS    20

benchmark:
        push    ebx
        push    esi
        mov     esi, [esp + BENCH_ARG_FUNCTION]         ;; esi = routine under test
        mov     ecx, [esp + BENCH_ARG_LENGTH]
        mov     ebx, [esp + BENCH_ARG_ITERATIONS]       ;; ebx = rounds left
        push    ecx
        call    makestring                              ;; pops its argument

        rdtsc                                           ;; start timestamp
        mov     [timelo], eax
        mov     [timehi], edx
.loop:
        ;; Queue one pointer per call. The callee is expected to pop its own
        ;; argument (strlen_3 and strlen_4 return with ret 4), so the four
        ;; calls consume strbuf4, strbuf3, strbuf2, strbuf1 in that order.
        push    strbuf1
        push    strbuf2
        push    strbuf3
        push    strbuf4
        call    esi
        call    esi
        call    esi
        call    esi
        sub     ebx, 1
        jnz     .loop

        rdtsc                                           ;; end timestamp
        sub     eax, [timelo]
        sbb     edx, [timehi]                           ;; edx:eax = elapsed ticks
        push    eax
        ;; Only the low dword is reported. The two lines below, when enabled,
        ;; print the high dword first.
        ;;push  edx
        ;;call  writehex
        call    writehex
        pop     esi
        pop     ebx
        ret     12
align 16
;; strlen_1: string length via repne scasb.
;;
;; In:    [esp + 4] = pointer to a zero-terminated string (popped by ret 4)
;; Out:   eax = number of bytes before the terminator
;; Saves: edi
;; Uses:  ecx, edx, flags; clears the direction flag
strlen_1:
        push    edi
        mov     edi, [esp + 8]          ;; str ptr (one saved register above it)
        xor     eax, eax                ;; al = 0, the byte scasb searches for
        xor     edx, edx
        sub     edx, edi                ;; edx = -str, folded into the lea below
        mov     ecx, -1                 ;; no effective limit on the scan
        cld                             ;; scan towards higher addresses
        repne scasb                     ;; stops with edi one past the terminator
        lea     eax, [edi + edx - 1]    ;; (edi - str) - 1 = length
        pop     edi
        ret     4
align 16
;; strlen_2: string length, one byte per iteration.
;;
;; In:   [esp + 4] = pointer to a zero-terminated string (popped by ret 4)
;; Out:  eax = number of bytes before the terminator
;; Uses: flags only; no other register is touched
strlen_2:
        mov     eax, [esp + 4]
        dec     eax                     ;; the loop increments before it tests
        jmp     .next                   ;; skip the alignment padding below
align 16
.next:
        add     eax, 1
        cmp     byte [eax], 0
        jne     .next
        sub     eax, [esp + 4]          ;; terminator address - str = length
        ret     4
;; strlen_3: string length, 8 bytes per probe, two probes per loop iteration.
;;
;; In:    [esp + 4] = pointer to a zero-terminated string (popped by ret 4)
;; Out:   eax = number of bytes before the terminator
;; Saves: ebx, esi, edi
;; Uses:  ecx, edx, flags
;;
;; Zero-byte test for a dword x:  (x - 01010101h) & ~x & 80808080h
;; is non-zero exactly when x contains a zero byte, and its lowest set bit is
;; bit 7 of the first zero byte. Higher bits can be set spuriously by borrow,
;; which is why the result is always read with bsf.
;;
;; Memory safety: every multi-byte load is naturally aligned and the 8-byte
;; probes start on an 8-byte boundary, so no load extends beyond the aligned
;; 8-byte block that holds the terminator and therefore never reaches into a
;; following page. The unaligned head is read one byte at a time.

;; One 8-byte probe at [eax]; advances eax by 8.
;; Branches to .foundlo when the low dword holds a zero byte (mask in ecx).
;; Otherwise falls out with ZF clear iff the high dword does (mask in edx).
;; Expects ebx = -01010101h. Clobbers ecx, edx, esi, edi.
%macro STRLEN3_PROBE8 0
        mov     esi, [eax]
        mov     edi, [eax + 4]
        add     eax, 8
        lea     ecx, [esi + ebx]
        lea     edx, [edi + ebx]
        not     esi
        not     edi
        and     ecx, esi
        and     edx, edi
        and     ecx, 80808080h
        jnz     .foundlo
        and     edx, 80808080h
%endmacro

align 16
strlen_3:
        push    ebx
        push    esi
        push    edi
        mov     eax, [esp + 16]         ;; eax = cursor, starts at str
        mov     ebx, -01010101h         ;; constant for the zero-byte test
.head:
        ;; Byte steps until the cursor is 4-byte aligned (at most three).
        test    eax, 3
        jz      .align8
        cmp     byte [eax], 0
        je      .found
        inc     eax
        jmp     .head
.align8:
        ;; If the cursor is 4- but not 8-byte aligned, test one dword alone.
        test    eax, 4
        jz      .scan
        mov     edi, [eax]
        add     eax, 4
        lea     edx, [edi + ebx]
        not     edi
        and     edx, edi
        and     edx, 80808080h
        jnz     .foundhi                ;; eax is 4 past the dword, as .foundhi expects
        jmp     .scan
align 16
.scan:
        STRLEN3_PROBE8
        jnz     .foundhi
        STRLEN3_PROBE8
        jz      .scan
.foundhi:
        ;; Zero byte in the dword that ends 4 bytes before eax... i.e. the
        ;; dword starting at eax - 4; edx holds its mask.
        bsf     edx, edx
        sub     eax, [esp + 16]
        shr     edx, 3                  ;; bit index -> byte index within the dword
        lea     eax, [eax + edx - 4]
        pop     edi
        pop     esi
        pop     ebx
        ret     4
.foundlo:
        ;; Zero byte in the dword starting at eax - 8; ecx holds its mask.
        bsf     ecx, ecx
        sub     eax, [esp + 16]
        shr     ecx, 3
        lea     eax, [eax + ecx - 8]
        pop     edi
        pop     esi
        pop     ebx
        ret     4
.found:
        ;; Terminator met during the byte-wise head; eax points at it.
        sub     eax, [esp + 16]
        pop     edi
        pop     esi
        pop     ebx
        ret     4
;; strlen_4: string length, 8 bytes per probe, four probes per loop iteration.
;;
;; In:    [esp + 4] = pointer to a zero-terminated string (popped by ret 4)
;; Out:   eax = number of bytes before the terminator
;; Saves: ebx, esi, edi
;; Uses:  ecx, edx, flags
;;
;; Zero-byte test for a dword x:  (x - 01010101h) & ~x & 80808080h
;; is non-zero exactly when x contains a zero byte, and its lowest set bit is
;; bit 7 of the first zero byte. Higher bits can be set spuriously by borrow,
;; which is why the result is always read with bsf.
;;
;; Memory safety: every multi-byte load is naturally aligned and the 8-byte
;; probes start on an 8-byte boundary, so no load extends beyond the aligned
;; 8-byte block that holds the terminator and therefore never reaches into a
;; following page. The unaligned head is read one byte at a time.

;; One 8-byte probe at [eax]; advances eax by 8.
;; Branches to .foundlo when the low dword holds a zero byte (mask in ecx).
;; Otherwise falls out with ZF clear iff the high dword does (mask in edx).
;; Expects ebx = -01010101h. Clobbers ecx, edx, esi, edi.
%macro STRLEN4_PROBE8 0
        mov     esi, [eax]
        mov     edi, [eax + 4]
        add     eax, 8
        lea     ecx, [esi + ebx]
        lea     edx, [edi + ebx]
        not     esi
        not     edi
        and     ecx, esi
        and     edx, edi
        and     ecx, 80808080h
        jnz     .foundlo
        and     edx, 80808080h
%endmacro

align 16                                ;; same entry alignment as the other strlen variants
strlen_4:
        push    ebx
        push    esi
        push    edi
        mov     eax, [esp + 16]         ;; eax = cursor, starts at str
        mov     ebx, -01010101h         ;; constant for the zero-byte test
.head:
        ;; Byte steps until the cursor is 4-byte aligned (at most three).
        test    eax, 3
        jz      .align8
        cmp     byte [eax], 0
        je      .found
        inc     eax
        jmp     .head
.align8:
        ;; If the cursor is 4- but not 8-byte aligned, test one dword alone.
        test    eax, 4
        jz      .scan
        mov     edi, [eax]
        add     eax, 4
        lea     edx, [edi + ebx]
        not     edi
        and     edx, edi
        and     edx, 80808080h
        jnz     .foundhi                ;; eax is 4 past the dword, as .foundhi expects
        jmp     .scan
.found:
        ;; Terminator met during the byte-wise head; eax points at it.
        sub     eax, [esp + 16]
        pop     edi
        pop     esi
        pop     ebx
        ret     4
align 16
.scan:
        STRLEN4_PROBE8
        jnz     .foundhi
        STRLEN4_PROBE8
        jnz     .foundhi
        STRLEN4_PROBE8
        jnz     .foundhi
        STRLEN4_PROBE8
        jz      .scan
.foundhi:
        ;; Zero byte in the dword starting at eax - 4; edx holds its mask.
        bsf     edx, edx
        sub     eax, [esp + 16]
        shr     edx, 3                  ;; bit index -> byte index within the dword
        lea     eax, [eax + edx - 4]
        pop     edi
        pop     esi
        pop     ebx
        ret     4
.foundlo:
        ;; Zero byte in the dword starting at eax - 8; ecx holds its mask.
        bsf     ecx, ecx
        sub     eax, [esp + 16]
        shr     ecx, 3
        lea     eax, [eax + ecx - 8]
        pop     edi
        pop     esi
        pop     ebx
        ret     4
;; makestring: size
;;
;; Fills strbuf1..strbuf4 with a zero-terminated test string of `size`
;; bytes each by calling fillstring once per buffer.
;;
;; In:   [esp + 4] = size (popped by ret 4)
;; Uses: eax, ecx, edx (the latter two through fillstring)
makestring:
        mov     eax, [esp + 4]          ;; size

        ;; fillstring overwrites al, so all four (ptr, size) pairs are queued
        ;; before the first call. Each call pops one pair; the buffers are
        ;; therefore filled in the order strbuf4, strbuf3, strbuf2, strbuf1.
        push    eax
        push    strbuf1
        push    eax
        push    strbuf2
        push    eax
        push    strbuf3
        push    eax
        push    strbuf4
%rep 4
        call    fillstring
%endrep
        ret     4
;; fillstring: ptr, size
;;
;; Writes `size` non-zero bytes at ptr followed by a zero terminator at
;; ptr[size], i.e. size + 1 bytes in total. The buffer is filled from the
;; last byte down to the first with the values 1, 2, ..., 255, 1, 2, ...
;;
;; In:   [esp + 4] = ptr, [esp + 8] = size (both popped by ret 8)
;; Uses: al, ecx, edx, flags
fillstring:
        mov     edx, [esp + 4]          ;; ptr
        mov     ecx, [esp + 8]          ;; size, reused as the descending index
        mov     al, 1                   ;; next fill value
        mov     byte [edx + ecx], 0     ;; terminator
.store:
        dec     ecx
        js      .done                   ;; index went below zero: buffer is full
        mov     [edx + ecx], al
        add     al, 1
        jnz     .store
        mov     al, 1                   ;; wrapped to 0: restart at 1 so no zero byte is stored
        jmp     .store
.done:
        ret     8
;; writehex: binary value
;;
;; Formats the 32-bit argument as 8 lowercase hex digits, most significant
;; first, appends a newline in b2hout and hands the 9 bytes to system call 4
;; on descriptor 1 through int 80h (presumably the 32-bit Linux ABI, where
;; call 4 is write and descriptor 1 is standard output).
;;
;; In:    [esp + 4] = value (popped by ret 4)
;; Out:   eax = whatever the system call returned
;; Saves: ebx, edi
;; Uses:  ecx, edx, flags
%define WRITEHEX_SYSCALL        4
%define WRITEHEX_FD             1
%define WRITEHEX_LINE_BYTES     9       ;; 8 digits + newline
%define WRITEHEX_NEWLINE        10

writehex:
        push    ebx
        push    edi
        mov     edx, [esp + 12]         ;; binary value
        mov     edi, b2hout
        mov     ecx, 8                  ;; digits left
.digit:
        rol     edx, 4                  ;; bring the next-highest nibble into bits 0..3
        mov     eax, edx
        and     eax, 0fh
        mov     al, [b2hlut + eax]
        mov     [edi], al
        inc     edi
        dec     ecx
        jnz     .digit
        mov     byte [edi], WRITEHEX_NEWLINE

        mov     eax, WRITEHEX_SYSCALL
        mov     ebx, WRITEHEX_FD
        mov     ecx, b2hout
        mov     edx, WRITEHEX_LINE_BYTES
        int     80h
        pop     edi
        pop     ebx
        ret     4
;; exit: issues system call 1 through int 80h with ebx = 0 (presumably the
;; 32-bit Linux exit call with status 0). There is no ret after the int, so
;; the routine relies on the call not returning.
%define EXIT_SYSCALL    1

exit:
        xor     ebx, ebx                ;; first system-call argument = 0
        mov     eax, EXIT_SYSCALL
        int     80h
Z2xvYmFsIF9zdGFydAoKCgpzZWN0aW9uIC5kYXRhCgoJdGltZWxvCWRkCTAKCXRpbWVoaQlkZAkwCgliMmhvdXQJZGIJMCwwLDAsMCwwLDAsMCwwLDAKCgoJYjJobHV0CWRiCScwJywnMScsJzInLCczJywnNCcsJzUnLCc2JywnNycsJzgnLCc5JywnYScsJ2InLCdjJywnZCcsJ2UnLCdmJwphbGlnbiA4CglzdHJidWYxCXJlc2IJMjU2CglwYWQxCXJlc2IJMSA7OyBtaXNhbGlnbmVkIGJ5IDEKCXN0cmJ1ZjIJcmVzYgkyNTYKCXBhZDIJcmVzYgkxIDs7IG1pc2FsaWduZWQgYnkgMgoJc3RyYnVmMwlyZXNiCTI1NgoJcGFkMwlyZXNiCTEgOzsgbWlzYWxpZ25lZCBieSAzCglzdHJidWY0CXJlc2IJMjU2CgpzZWN0aW9uIC50ZXh0CgoKX3N0YXJ0OgoKCSBwdXNoCTEwMDAwMAoJIHB1c2gJMjU1CgkgcHVzaAlzdHJsZW5fMwoJY2FsbAliZW5jaG1hcmsKCSBwdXNoCTEwMDAwMAoJIHB1c2gJMjUxCgkgcHVzaAlzdHJsZW5fMwoJY2FsbAliZW5jaG1hcmsKCgkgcHVzaAkxMDAwMDAKCSBwdXNoCTI1NQoJIHB1c2gJc3RybGVuXzQKCWNhbGwJYmVuY2htYXJrCgkgcHVzaAkxMDAwMDAKCSBwdXNoCTI1MQoJIHB1c2gJc3RybGVuXzQKCWNhbGwJYmVuY2htYXJrCgoJY2FsbAlleGl0CgoKCmJlbmNobWFyazogOzsgZnVuY3Rpb24sIHN0cmluZyBsZW5ndGgsIGl0ZXJhdGlvbnMKCXB1c2gJZWJ4CglwdXNoCWVzaQoJbW92CWVieCwgW2VzcCArIDIwXQoJbW92CWVjeCwgW2VzcCArIDE2XQoJbW92CWVzaSwgW2VzcCArIDEyXQoJcHVzaAllY3gKCWNhbGwJbWFrZXN0cmluZwoJcmR0c2MKCW1vdglkd29yZCBbdGltZWxvXSwgZWF4Cgltb3YJZHdvcmQgW3RpbWVoaV0sIGVkeAoubG9vcDoKCXB1c2gJc3RyYnVmMQoJcHVzaAlzdHJidWYyCglwdXNoCXN0cmJ1ZjMKCXB1c2gJc3RyYnVmNAoJY2FsbAllc2kKCWNhbGwJZXNpCgljYWxsCWVzaQoJY2FsbAllc2kKCXN1YgllYngsIDEKCWpuegkubG9vcAoJcmR0c2MKCXN1YgllYXgsIGR3b3JkIFt0aW1lbG9dCglzYmIJZWR4LCBkd29yZCBbdGltZWhpXQoJcHVzaAllYXgKCTs7cHVzaAllZHgKCTs7Y2FsbAl3cml0ZWhleAoJY2FsbAl3cml0ZWhleAkKCXBvcAllc2kKCXBvcAllYngKCXJldAkxMgoKYWxpZ24gMTYKc3RybGVuXzE6CglwdXNoCWVkaQoJbW92CWVjeCwgLTEKCXhvcgllZHgsIGVkeAoJbW92CWVkaSwgW2VzcCArIDhdIDs7IHN0ciBwdHIKCWNsZAoJeG9yCWVheCwgZWF4CglzdWIJZWR4LCBlZGkKCXJlcG5lIHNjYXNiCglsZWEJZWF4LCBbZWRpICsgZWR4IC0gMV0KCXBvcAllZGkKCXJldAk0CgoKYWxpZ24gMTYKc3RybGVuXzI6Cgltb3YJZWF4LCBbZXNwICsgNF0KCWRlYwllYXgKCWptcAkubG9vcAphbGlnbiAxNgoubG9vcDoKCWFkZAllYXgsIDEKCXRlc3QJYnl0ZSBbZWF4XSwgMGZmaAoJam56CS5sb29wCglzdWIJZWF4LCBbZXNwICsgNF0KCXJldAk0CgoKYWxpZ24gMTYKc3RybGVuXzM6CglwdXNoCWVieAoJcHVzaAllc2kKCXB1c2gJZWRpCgltb3YJZWF4LCBbZXNwICsgMTZdCgltb3YJZWJ4LCAt
MDEwMTAxMDFoCgl0ZXN0CWVheCwgMwoJanoJLnNjYW4KCW1vdgllZHgsIFtlYXhdCgl0ZXN0CWRsLCBkbAoJanoJLmZvdW5kCglpbmMJZWF4Cgl0ZXN0CWVheCwgMwoJanoJLnNjYW4KCXRlc3QJZGgsIGRoCglqegkuZm91bmQKCWluYwllYXgKCXNocgllZHgsIDE2Cgl0ZXN0CWVheCwgMwoJanoJLnNjYW4KCXRlc3QJZGwsIGRsCglqegkuZm91bmQKCWluYwllYXgKCWptcAkuc2NhbgphbGlnbiAxNgouc2NhbjoKCW1vdgllc2ksIFtlYXhdCgltb3YJZWRpLCBbZWF4ICsgNF0KCWFkZAllYXgsIDgKCWxlYQllY3gsIFtlc2kgKyBlYnhdCglsZWEJZWR4LCBbZWRpICsgZWJ4XQoJbm90CWVzaQoJbm90CWVkaQoJYW5kCWVjeCwgZXNpCglhbmQJZWR4LCBlZGkKCWFuZAllY3gsIDgwODA4MDgwaAoJam56CS5mb3VuZGxvCglhbmQJZWR4LCA4MDgwODA4MGgKCWpuegkuZm91bmRoaQoJbW92CWVzaSwgW2VheF0KCW1vdgllZGksIFtlYXggKyA0XQoJYWRkCWVheCwgOAoJbGVhCWVjeCwgW2VzaSArIGVieF0KCWxlYQllZHgsIFtlZGkgKyBlYnhdCglub3QJZXNpCglub3QJZWRpCglhbmQJZWN4LCBlc2kKCWFuZAllZHgsIGVkaQoJYW5kCWVjeCwgODA4MDgwODBoCglqbnoJLmZvdW5kbG8KCWFuZAllZHgsIDgwODA4MDgwaAoJanoJLnNjYW4JCi5mb3VuZGhpOgoJYnNmCWVkeCwgZWR4CglzdWIJZWF4LCBbZXNwICsgMTZdCglzaHIJZWR4LCAzCglsZWEJZWF4LCBbZWF4ICsgZWR4IC0gNF0KCXBvcAllZGkKCXBvcAllc2kKCXBvcAllYngKCXJldAk0Ci5mb3VuZGxvOgoJYnNmCWVjeCwgZWN4CglzdWIJZWF4LCBbZXNwICsgMTZdCglzaHIJZWN4LCAzCglsZWEJZWF4LCBbZWF4ICsgZWN4IC0gOF0KCXBvcAllZGkKCXBvcAllc2kKCXBvcAllYngKCXJldAk0Ci5mb3VuZDoKCXN1YgllYXgsIFtlc3AgKyAxNl0KCXBvcAllZGkKCXBvcAllc2kKCXBvcAllYngKCXJldAk0CgoKc3RybGVuXzQ6CglwdXNoCWVieAoJcHVzaAllc2kKCXB1c2gJZWRpCgltb3YJZWF4LCBbZXNwICsgMTZdCgltb3YJZWJ4LCAtMDEwMTAxMDFoCgl0ZXN0CWVheCwgMwoJanoJLnNjYW4KCW1vdgllZHgsIFtlYXhdCgl0ZXN0CWRsLCBkbAoJanoJLmZvdW5kCglpbmMJZWF4Cgl0ZXN0CWVheCwgMwoJanoJLnNjYW4KCXRlc3QJZGgsIGRoCglqegkuZm91bmQKCWluYwllYXgKCXNocgllZHgsIDE2Cgl0ZXN0CWVheCwgMwoJanoJLnNjYW4KCXRlc3QJZGwsIGRsCglqegkuZm91bmQKCWluYwllYXgKCWptcAkuc2NhbgouZm91bmQ6CglzdWIJZWF4LCBbZXNwICsgMTZdCglwb3AJZWRpCglwb3AJZXNpCglwb3AJZWJ4CglyZXQJNAkKYWxpZ24gMTYKLnNjYW46Cgltb3YJZXNpLCBbZWF4XQoJbW92CWVkaSwgW2VheCArIDRdCglhZGQJZWF4LCA4CglsZWEJZWN4LCBbZXNpICsgZWJ4XQoJbGVhCWVkeCwgW2VkaSArIGVieF0KCW5vdAllc2kKCW5vdAllZGkKCWFuZAllY3gsIGVzaQoJYW5kCWVkeCwgZWRpCglhbmQJZWN4LCA4MDgwODA4MGgKCWpuegkuZm91bmRsbwoJYW5kCWVkeCwgODA4MDgw
ODBoCglqbnoJLmZvdW5kaGkKCW1vdgllc2ksIFtlYXhdCgltb3YJZWRpLCBbZWF4ICsgNF0KCWFkZAllYXgsIDgKCWxlYQllY3gsIFtlc2kgKyBlYnhdCglsZWEJZWR4LCBbZWRpICsgZWJ4XQoJbm90CWVzaQoJbm90CWVkaQoJYW5kCWVjeCwgZXNpCglhbmQJZWR4LCBlZGkKCWFuZAllY3gsIDgwODA4MDgwaAoJam56CS5mb3VuZGxvCglhbmQJZWR4LCA4MDgwODA4MGgKCWpuegkuZm91bmRoaQoJbW92CWVzaSwgW2VheF0KCW1vdgllZGksIFtlYXggKyA0XQoJYWRkCWVheCwgOAoJbGVhCWVjeCwgW2VzaSArIGVieF0KCWxlYQllZHgsIFtlZGkgKyBlYnhdCglub3QJZXNpCglub3QJZWRpCglhbmQJZWN4LCBlc2kKCWFuZAllZHgsIGVkaQoJYW5kCWVjeCwgODA4MDgwODBoCglqbnoJLmZvdW5kbG8KCWFuZAllZHgsIDgwODA4MDgwaAoJam56CS5mb3VuZGhpCgltb3YJZXNpLCBbZWF4XQoJbW92CWVkaSwgW2VheCArIDRdCglhZGQJZWF4LCA4CglsZWEJZWN4LCBbZXNpICsgZWJ4XQoJbGVhCWVkeCwgW2VkaSArIGVieF0KCW5vdAllc2kKCW5vdAllZGkKCWFuZAllY3gsIGVzaQoJYW5kCWVkeCwgZWRpCglhbmQJZWN4LCA4MDgwODA4MGgKCWpuegkuZm91bmRsbwoJYW5kCWVkeCwgODA4MDgwODBoCglqegkuc2NhbgkJCi5mb3VuZGhpOgoJYnNmCWVkeCwgZWR4CglzdWIJZWF4LCBbZXNwICsgMTZdCglzaHIJZWR4LCAzCglsZWEJZWF4LCBbZWF4ICsgZWR4IC0gNF0KCXBvcAllZGkKCXBvcAllc2kKCXBvcAllYngKCXJldAk0Ci5mb3VuZGxvOgoJYnNmCWVjeCwgZWN4CglzdWIJZWF4LCBbZXNwICsgMTZdCglzaHIJZWN4LCAzCglsZWEJZWF4LCBbZWF4ICsgZWN4IC0gOF0KCXBvcAllZGkKCXBvcAllc2kKCXBvcAllYngKCXJldAk0CgoJCm1ha2VzdHJpbmc6Cgltb3YJZWF4LCBbZXNwICsgNF0gOzsgc2l6ZQoJcHVzaAllYXgKCXB1c2gJc3RyYnVmMQoJcHVzaAllYXgKCXB1c2gJc3RyYnVmMgoJcHVzaAllYXgKCXB1c2gJc3RyYnVmMwoJcHVzaAllYXgKCXB1c2gJc3RyYnVmNAoJY2FsbAlmaWxsc3RyaW5nCgljYWxsCWZpbGxzdHJpbmcKCWNhbGwJZmlsbHN0cmluZwoJY2FsbAlmaWxsc3RyaW5nCglyZXQJNAoKZmlsbHN0cmluZzoKCW1vdglhbCwgMQoJbW92CWVkeCwgW2VzcCArIDRdIDs7IHB0cgoJbW92CWVjeCwgW2VzcCArIDhdIDs7IHNpemUKCW1vdglieXRlIFtlZHggKyBlY3hdLCAwCi5sb29wOgoJZGVjCWVjeAoJanMJLmRvbmUKCW1vdglieXRlIFtlZHggKyBlY3hdLCBhbAoJYWRkCWFsLCAxCglqbnoJLnNraXAKCW1vdglhbCwgMQouc2tpcDoKCWptcAkubG9vcAouZG9uZToKCXJldAk4Cgp3cml0ZWhleDoKCXB1c2gJZWJ4CglwdXNoCWVzaQoJcHVzaAllZGkKCW1vdgllYXgsIFtlc3AgKyAxNl0gOzsgYmluYXJ5IHZhbHVlCgltb3YJZWN4LCAzMiAtIDQKCW1vdgllc2ksIGIyaGx1dAoJbW92CWVkaSwgYjJob3V0Ci5sb29wOgoJbW92CWVkeCwgZWF4CglzaHIJZWR4LCBjbAoJYW5kCWVkeCwgMGZoIAoJbW92CWRsLCBieXRlIFtlc2kg
KyBlZHhdCgltb3YJYnl0ZSBbZWRpXSwgZGwKCWFkZAllZGksIDEKCXN1YgllY3gsIDQKCWpucwkubG9vcAoJbW92CWJ5dGUgW2VkaV0sIDEwCgk7OwoJbW92CWVheCwgNAoJbW92CWVieCwgMQoJbW92CWVjeCwgYjJob3V0Cgltb3YJZWR4LCA5CglpbnQJODBoCglwb3AJZWRpCglwb3AJZXNpCglwb3AJZWJ4CglyZXQJNAoKZXhpdDoKCgltb3YJZWF4LCAwMWgKCXhvcgllYngsIGVieAoJaW50CTgwaA==