;; Target: 32-bit x86 Linux (int 80h system calls), NASM syntax.
global _start

section .data
;; Start timestamp (low / high dword of the TSC) stored by benchmark.
timelo      dd 0
timehi      dd 0
;; writehex output buffer: 8 hex digits plus a trailing newline.
b2hout      db 0,0,0,0,0,0,0,0,0
;; Nibble -> ASCII hex digit table used by writehex.
b2hlut      db '0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f'
align 8, db 0
;; strlen_mmx: entry (ptr & 7) is OR-ed into the first aligned qword so the
;; bytes that precede the string start can never compare equal to zero.
STRINGTBL   dq 0, 0xff, 0xffff, 0xffffff, 0xffffffff, 0xffffffffff, 0xffffffffffff, 0xffffffffffffff
align 8, db 0
;; strlen_sse: entry (ptr & 15) keeps only the compare-mask bits of the bytes
;; that lie below the next 16-byte boundary.
STRLENMASK  dd 0x0000ffff, 0x00007fff, 0x00003fff, 0x00001fff, 0x00000fff, 0x000007ff, 0x000003ff, 0x000001ff, 0x000000ff, 0x0000007f, 0x0000003f, 0x0000001f, 0x0000000f, 0x00000007, 0x00000003, 0x00000001
align 16, db 0
;; Not referenced by the code visible in this file.
teststr     db 0x01, 0x05, 0x09, 0x0d, 0x11, 0x15, 0x19, 0x1d, 0xf1, 0xf5, 0xf9, 0xfd, 0x81, 0x85, 0x89, 0x8d, 0x01, 0x05, 0x09, 0x0d, 0x11, 0x15, 0x19, 0x1d, 0xf1, 0xf5, 0xf9, 0xfd, 0x81, 0x85, 0x89, 0x8d, 0

;; The string buffers carry no initial data, so they are reserved in .bss
;; (resb inside .data makes NASM warn and emit zero bytes into the image).
;; Each padN byte receives the terminator when a full 1024-byte string is
;; written, and it also shifts the following buffer by one more byte.
section .bss
alignb 16
strbuf1     resb 1024   ;; 16-byte aligned
pad1        resb 1      ;; next buffer is misaligned by 1
strbuf2     resb 1024
pad2        resb 1      ;; next buffer is misaligned by 2
strbuf3     resb 1024
pad3        resb 1      ;; next buffer is misaligned by 3
strbuf4     resb 1024
pad4        resb 64     ;; terminator plus slack for block-wise reads past the end
section .rodata
align 4, db 0
;; Zero-terminated table of the strlen implementations to benchmark.
;; Kept in read-only data rather than in the executable section.
FUNCS       dd strlen_sse, strlen_mmx, strlen_opt, 0

section .text
;; _start: program entry point.
;; For every length 1024, 512, ..., 1 it prints the length in hex and then
;; runs benchmark once per FUNCS entry.
;; Register roles (all survive the calls made here):
;;   ebp = iterations per measurement
;;   ebx = current string length
;;   esi = cursor into FUNCS
_start:
        mov     ebp, 100000             ;; iterations
        mov     ebx, 1024               ;; length
.loop:
        push    ebx
        call    writehex                ;; print the length being measured
        mov     esi, FUNCS
.next:
        push    ebp                     ;; arg 3: iterations
        push    ebx                     ;; arg 2: string length
        mov     eax, [esi]
        push    eax                     ;; arg 1: function under test
        call    benchmark               ;; callee pops its arguments (ret 12)
        add     esi, 4
        mov     eax, [esi]
        test    eax, eax                ;; a zero entry terminates the table
        jnz     .next
        shr     ebx, 1                  ;; halve the length; stop once it reaches 0
        jnz     .loop
        call    exit                    ;; does not return
;; benchmark(func, length, iterations)
;; Stack arguments, popped by the callee (ret 12).
;; Fills the four string buffers with `length` bytes each, then calls `func`
;; on all four buffers `iterations` times and prints the low 32 bits of the
;; elapsed time-stamp-counter delta through writehex.
;; Preserves ebx and esi; eax, ecx and edx are overwritten.
benchmark:
        push    ebx
        push    esi
        mov     esi, [esp + 12]         ;; function under test
        mov     ecx, [esp + 16]         ;; string length
        mov     ebx, [esp + 20]         ;; remaining iterations
        push    ecx
        call    makestring
        rdtsc                           ;; start timestamp -> timehi:timelo
        mov     [timelo], eax
        mov     [timehi], edx
.iterate:
        ;; One argument per call below; each call pops its own argument,
        ;; so strbuf4 is measured first and strbuf1 last.
        push    strbuf1
        push    strbuf2
        push    strbuf3
        push    strbuf4
%rep 4
        call    esi
%endrep
        dec     ebx
        jnz     .iterate
        rdtsc
        sub     eax, [timelo]           ;; edx:eax = elapsed ticks
        sbb     edx, [timehi]
        push    eax                     ;; only the low dword is reported
        call    writehex
        pop     esi
        pop     ebx
        ret     12
align 16
;; strlen_sse(s)
;; In:    [esp + 4] = pointer to a zero-terminated string (popped by ret 4)
;; Out:   eax = length in bytes
;; Clobb: ecx, edx, xmm0, xmm1, flags
;; The first 16 bytes are examined with an unaligned load; everything after
;; the next 16-byte boundary is scanned with aligned loads.
;; NOTE(review): the unaligned first load reads s[0..15] even when the string
;; ends earlier, so it can touch the following 16-byte block - confirm that
;; callers never pass a short string that ends right before an unmapped page.
strlen_sse:
        mov     ecx, [esp + 4]          ;; ecx = s
        pxor    xmm1, xmm1              ;; xmm1 = all-zero comparand
        movdqu  xmm0, [ecx]             ;; xmm0 = s[0..15]
        mov     edx, ecx
        pcmpeqb xmm0, xmm1
        and     edx, 15                 ;; edx = misalignment of s
        add     ecx, 16
        pmovmskb eax, xmm0              ;; bit i set <=> s[i] == 0
        and     ecx, -16                ;; ecx = first 16-byte boundary above s
        and     eax, [STRLENMASK + edx * 4] ;; keep only the bytes below that boundary
        jz      .scan
        ;; The load started at s itself, so the bit index is the length.
        ;; (The previous code rebased it on the aligned boundary and returned
        ;; index - misalignment for every misaligned short string.)
        bsf     eax, eax
        ret     4
align 16
.scan:
        movdqa  xmm0, [ecx]             ;; aligned 16-byte block
        pcmpeqb xmm0, xmm1
        add     ecx, 16                 ;; ecx now points past the block just tested
        pmovmskb eax, xmm0
        test    eax, eax
        jz      .scan
        bsf     eax, eax                ;; index of the terminator inside the block
        sub     ecx, [esp + 4]          ;; ecx = (block start + 16) - s
        lea     eax, [ecx + eax - 16]   ;; length = (block start - s) + index
        ret     4
align 16
;; strlen_mmx(s)
;; In:    [esp + 4] = pointer to a zero-terminated string (popped by ret 4)
;; Out:   eax = length in bytes
;; Clobb: ecx, edx, mm0, mm1, flags; executes emms before returning
;; Scans aligned 8-byte blocks; the next block is loaded while the current
;; compare result is still being tested.
;; NOTE(review): because of that preload, the block following the one that
;; holds the terminator is always read - confirm that memory is mapped.
strlen_mmx:
mov eax, [esp + 4] ;; eax = s
pxor mm1, mm1 ;; mm1 = all-zero comparand
mov ecx, eax
mov edx, eax ;; edx keeps s for the final subtraction
and ecx, -8 ;; ecx = aligned block containing s
and eax, 7 ;; eax = misalignment, index into STRINGTBL
movq mm0, [ecx]
por mm0, [STRINGTBL+eax*8] ;; force the bytes in front of s to 0xff so they never match
jmp .scan
align 16
.scan:
add ecx, 8 ;; ecx = block after the one held in mm0
pcmpeqb mm0, mm1 ;; 0xff in every byte that was zero
packsswb mm0, mm0 ;; signed-saturating pack: one result byte per pair of source bytes
movd eax, mm0 ;; eax = 4 result bytes covering the 8 source bytes
movq mm0, [ecx] ;; preload the next block
test eax, eax
jz .scan
;; A result byte is 0x7f or 0xff when the lower byte of its pair was zero
;; and 0x80 when only the upper one was, so (lowest set bit) >> 2 is the
;; byte offset of the terminator inside the block.
bsf eax, eax
shr eax, 2
lea eax, [ecx+eax-8] ;; address of the terminator (ecx is one block ahead)
sub eax, edx ;; length = terminator address - s
emms
ret 4
;; Constants for the "does this dword contain a zero byte" test:
;; (x - 01010101h) & ~x & 80808080h is non-zero iff some byte of x is zero,
;; and its lowest set bit marks the first such byte.
OPT_LOW_ONES    equ 01010101h
OPT_HIGH_BITS   equ 80808080h

;; One unrolled step of the main loop: load the next two dwords, advance eax
;; by 8, leave via .foundlo when the first dword holds a zero byte, otherwise
;; fall through with the flags and edx describing the second dword.
%macro STRLEN_OPT_STEP 0
        mov     esi, [eax]
        mov     edi, [eax + 4]
        add     eax, 8
        lea     ecx, [esi + ebx]        ;; x - 01010101h
        lea     edx, [edi + ebx]
        not     esi
        not     edi
        and     ecx, esi
        and     edx, edi
        and     ecx, OPT_HIGH_BITS
        jnz     .foundlo
        and     edx, OPT_HIGH_BITS
%endmacro

align 16
;; strlen_opt(s)
;; In:    [esp + 4] = pointer to a zero-terminated string (popped by ret 4)
;; Out:   eax = length in bytes
;; Clobb: ecx, edx, flags; ebx, esi and edi are saved and restored
;; Integer-only version: up to three leading bytes are tested one by one
;; until the pointer is 4-byte aligned, then 8 bytes are tested per step in a
;; loop unrolled four times.
strlen_opt:
        push    ebx
        push    esi
        push    edi
        mov     eax, [esp + 16]         ;; eax = s (three saved registers + return address)
        mov     ebx, -OPT_LOW_ONES
        test    eax, 3
        jz      .scan
        ;; Unaligned head: fetch four bytes once and inspect them in order.
        mov     edx, [eax]
        test    dl, dl
        jz      .found
        inc     eax
        test    eax, 3
        jz      .scan
        test    dh, dh
        jz      .found
        inc     eax
        shr     edx, 16                 ;; third byte moves into dl
        test    eax, 3
        jz      .scan
        test    dl, dl
        jz      .found
        inc     eax
        jmp     .scan
.found:
        sub     eax, [esp + 16]         ;; eax points at the terminator
        pop     edi
        pop     esi
        pop     ebx
        ret     4
align 16
.scan:
%rep 3
        STRLEN_OPT_STEP
        jnz     .foundhi
%endrep
        STRLEN_OPT_STEP
        jz      .scan
.foundhi:
        bsf     edx, edx                ;; bit 7/15/23/31 of the second dword
        sub     eax, [esp + 16]
        shr     edx, 3                  ;; -> byte index 0..3
        lea     eax, [eax + edx - 4]    ;; eax was already advanced by 8
        pop     edi
        pop     esi
        pop     ebx
        ret     4
.foundlo:
        bsf     ecx, ecx                ;; bit 7/15/23/31 of the first dword
        sub     eax, [esp + 16]
        shr     ecx, 3
        lea     eax, [eax + ecx - 8]
        pop     edi
        pop     esi
        pop     ebx
        ret     4
;; Fill one buffer with the size that makestring received.
;; fillstring pops both arguments, so [esp + 4] is makestring's own size
;; argument again after every call. The size is reloaded each time because
;; fillstring overwrites al.
%macro MAKESTRING_FILL 1
        mov     eax, [esp + 4]          ;; size
        push    eax
        push    %1                      ;; destination buffer
        call    fillstring
%endmacro

;; makestring(size)
;; In:    [esp + 4] = number of string bytes per buffer (popped by ret 4)
;; Writes a `size`-byte zero-terminated string into strbuf4, strbuf3,
;; strbuf2 and strbuf1, in that order.
;; Clobb: eax, ecx, edx, flags
makestring:
        MAKESTRING_FILL strbuf4
        MAKESTRING_FILL strbuf3
        MAKESTRING_FILL strbuf2
        MAKESTRING_FILL strbuf1
        ret     4
;; fillstring(ptr, size)
;; In:    [esp + 4] = destination, [esp + 8] = number of string bytes
;;        (both popped by ret 8)
;; Stores a terminating zero at ptr[size], then fills ptr[size-1] down to
;; ptr[0] with the repeating byte sequence 1, 2, ..., 255, 1, ... so that no
;; zero byte appears inside the string.
;; Clobb: al, ecx, edx, flags
fillstring:
        mov     edx, [esp + 4]          ;; ptr
        mov     ecx, [esp + 8]          ;; size, used as a descending index
        mov     al, 1                   ;; next fill value
        mov     byte [edx + ecx], 0     ;; terminator
        jmp     .step
.store:
        mov     [edx + ecx], al
        inc     al
        jnz     .step
        mov     al, 1                   ;; wrapped to 0: restart at 1
.step:
        dec     ecx
        jns     .store                  ;; stop once the index drops below 0
        ret     8
HEX_DIGITS      equ 8                   ;; hex digits in a 32-bit value
HEX_NEWLINE     equ 10                  ;; line feed appended after the digits
HEX_SYS_WRITE   equ 4                   ;; eax value for the write call issued via int 80h
HEX_STDOUT      equ 1                   ;; file descriptor passed in ebx

;; writehex(value)
;; In:    [esp + 4] = 32-bit value (popped by ret 4)
;; Formats the value as 8 lowercase hex digits plus a newline in b2hout and
;; writes those 9 bytes to file descriptor 1.
;; Preserves ebx, esi, edi; eax, ecx and edx are overwritten.
writehex:
        push    ebx
        push    esi
        push    edi
        mov     edx, [esp + 16]         ;; value to print
        mov     edi, b2hout             ;; output cursor
        mov     ecx, HEX_DIGITS
.digit:
        rol     edx, 4                  ;; bring the next most-significant nibble to the bottom
        mov     esi, edx
        and     esi, 0fh
        mov     al, [b2hlut + esi]      ;; nibble -> ASCII
        mov     [edi], al
        inc     edi
        dec     ecx
        jnz     .digit
        mov     byte [edi], HEX_NEWLINE
        ;; write(1, b2hout, 9)
        mov     eax, HEX_SYS_WRITE
        mov     ebx, HEX_STDOUT
        mov     ecx, b2hout
        mov     edx, HEX_DIGITS + 1
        int     80h
        pop     edi
        pop     esi
        pop     ebx
        ret     4
EXIT_SYSCALL    equ 01h                 ;; eax value for the exit call issued via int 80h

;; exit: terminate the process with status 0. Does not return.
exit:
        xor     ebx, ebx                ;; exit status 0
        mov     eax, EXIT_SYSCALL
        int     80h
Z2xvYmFsIF9zdGFydAoKc2VjdGlvbiAuZGF0YQoJdGltZWxvCWRkCTAKCXRpbWVoaQlkZAkwCgliMmhvdXQJZGIJMCwwLDAsMCwwLDAsMCwwLDAKCWIyaGx1dAlkYgknMCcsJzEnLCcyJywnMycsJzQnLCc1JywnNicsJzcnLCc4JywnOScsJ2EnLCdiJywnYycsJ2QnLCdlJywnZicKYWxpZ24gOAoJU1RSSU5HVEJMCWRxCTAsIDB4ZmYsIDB4ZmZmZiwgMHhmZmZmZmYsIDB4ZmZmZmZmZmYsIDB4ZmZmZmZmZmZmZiwgMHhmZmZmZmZmZmZmZmYsIDB4ZmZmZmZmZmZmZmZmZmYKYWxpZ24gOAoJU1RSTEVOTUFTSwlkZAkweDAwMDBmZmZmLCAweDAwMDA3ZmZmLCAweDAwMDAzZmZmLCAweDAwMDAxZmZmLCAweDAwMDAwZmZmLCAweDAwMDAwN2ZmLCAweDAwMDAwM2ZmLCAweDAwMDAwMWZmLCAweDAwMDAwMGZmLCAweDAwMDAwMDdmLCAweDAwMDAwMDNmLCAweDAwMDAwMDFmLCAweDAwMDAwMDBmLCAweDAwMDAwMDA3LCAweDAwMDAwMDAzLCAweDAwMDAwMDAxCmFsaWduIDE2Cgl0ZXN0c3RyIGRiIDB4MDEsIDB4MDUsIDB4MDksIDB4MGQsIDB4MTEsIDB4MTUsIDB4MTksIDB4MWQsIDB4ZjEsIDB4ZjUsIDB4ZjksIDB4ZmQsIDB4ODEsIDB4ODUsIDB4ODksIDB4OGQsIDB4MDEsIDB4MDUsIDB4MDksIDB4MGQsIDB4MTEsIDB4MTUsIDB4MTksIDB4MWQsIDB4ZjEsIDB4ZjUsIDB4ZjksIDB4ZmQsIDB4ODEsIDB4ODUsIDB4ODksIDB4OGQsIDAKYWxpZ24gMTYKCXN0cmJ1ZjEJcmVzYgkxMDI0CglwYWQxCXJlc2IJMSA7OyBtaXNhbGlnbmVkIGJ5IDEKCXN0cmJ1ZjIJcmVzYgkxMDI0CglwYWQyCXJlc2IJMSA7OyBtaXNhbGlnbmVkIGJ5IDIKCXN0cmJ1ZjMJcmVzYgkxMDI0CglwYWQzCXJlc2IJMSA7OyBtaXNhbGlnbmVkIGJ5IDMKCXN0cmJ1ZjQJcmVzYgkxMDI0CglwYWQ0CXJlc2IJNjQKCnNlY3Rpb24gLnRleHQKCUZVTkNTCWRkCXN0cmxlbl9zc2UsIHN0cmxlbl9tbXgsIHN0cmxlbl9vcHQsIDAKX3N0YXJ0OgoJbW92CWVicCwgMTAwMDAwIDs7IGl0ZXJhdGlvbnMKCW1vdgllYngsIDEwMjQgOzsgbGVuZ3RoCi5sb29wOgoJcHVzaAllYngKCWNhbGwJd3JpdGVoZXgKCW1vdgllc2ksIEZVTkNTCi5uZXh0CglwdXNoCWVicAoJcHVzaAllYngKCW1vdgllYXgsIFtlc2ldCglwdXNoCWVheAoJY2FsbAliZW5jaG1hcmsKCWFkZAllc2ksIDQKCW1vdgllYXgsIFtlc2ldCgljbXAJZWF4LCAwCglqbnoJLm5leHQKCXNocgllYngsIDEKCWpuegkubG9vcAoJCgljYWxsCWV4aXQKCgoKYmVuY2htYXJrOiA7OyBmdW5jdGlvbiwgc3RyaW5nIGxlbmd0aCwgaXRlcmF0aW9ucwoJcHVzaAllYngKCXB1c2gJZXNpCgltb3YJZWJ4LCBbZXNwICsgMjBdCgltb3YJZWN4LCBbZXNwICsgMTZdCgltb3YJZXNpLCBbZXNwICsgMTJdCglwdXNoCWVjeAoJY2FsbAltYWtlc3RyaW5nCglyZHRzYwoJbW92CWR3b3JkIFt0aW1lbG9dLCBlYXgKCW1vdglkd29yZCBbdGltZWhpXSwgZWR4Ci5sb29wOgoJcHVzaAlzdHJidWYxCglwdXNoCXN0cmJ1ZjIKCXB1c2gJc3RyYnVmMwoJcHVzaAlzdHJi
dWY0CgljYWxsCWVzaQoJY2FsbAllc2kKCWNhbGwJZXNpCgljYWxsCWVzaQoJc3ViCWVieCwgMQoJam56CS5sb29wCglyZHRzYwoJc3ViCWVheCwgZHdvcmQgW3RpbWVsb10KCXNiYgllZHgsIGR3b3JkIFt0aW1laGldCglwdXNoCWVheAoJOztwdXNoCWVkeAoJOztjYWxsCXdyaXRlaGV4CgljYWxsCXdyaXRlaGV4CQoJcG9wCWVzaQoJcG9wCWVieAoJcmV0CTEyCgoKYWxpZ24gMTYKc3RybGVuX3NzZToKCW1vdgllY3gsIFtlc3AgKyA0XQoJcHhvcgl4bW0xLCB4bW0xCgltb3ZkcXUJeG1tMCwgW2VjeF0KCW1vdgllZHgsIGVjeAoJcGNtcGVxYgl4bW0wLCB4bW0xCglhbmQJZWR4LCAxNQoJYWRkCWVjeCwgMTYKCXBtb3Ztc2tiIGVheCwgeG1tMAoJYW5kCWVjeCwgLTE2CglhbmQgZWF4LCBbU1RSTEVOTUFTSyArIGVkeCAqIDRdCglqeiAuc2NhbgoJYnNmCWVheCwgZWF4CglzdWIJZWN4LCBbZXNwICsgNF0KCWxlYQllYXgsIFtlY3ggKyBlYXggLSAxNl0KCXJldAk0CmFsaWduIDE2Ci5zY2FuOgoJbW92ZHFhCXhtbTAsIFtlY3hdCglwY21wZXFiCXhtbTAsIHhtbTEKCWFkZAllY3gsIDE2CglwbW92bXNrYgllYXgsIHhtbTAKCXRlc3QJZWF4LCBlYXgKCWp6CS5zY2FuCglic2YJZWF4LCBlYXgKCXN1YgllY3gsIFtlc3AgKyA0XQoJbGVhCWVheCwgW2VjeCArIGVheCAtIDE2XQoJcmV0IDQKCmFsaWduIDE2CnN0cmxlbl9tbXg6Cgltb3YgICAgIGVheCwgW2VzcCArIDRdIAoJcHhvciAgICAgbW0xLCBtbTEgCgltb3YgICAgICBlY3gsIGVheCAKCW1vdiAgICAgIGVkeCwgZWF4IAoJYW5kICAgICAgZWN4LCAtOCAKCWFuZCAgICAgIGVheCwgNyAKCW1vdnEgICAgIG1tMCwgW2VjeF0gCglwb3IgICAgICBtbTAsIFtTVFJJTkdUQkwrZWF4KjhdCglqbXAJLnNjYW4KYWxpZ24gMTYKLnNjYW46IAoJYWRkICAgICAgZWN4LCA4IAoJcGNtcGVxYiAgbW0wLCBtbTEgCglwYWNrc3N3YiBtbTAsIG1tMCAKCW1vdmQgICAgIGVheCwgbW0wIAoJbW92cSAgICAgbW0wLCBbZWN4XSAKCXRlc3QgICAgIGVheCwgZWF4IAoJanogICAgICAgLnNjYW4KCWJzZiAgICAgIGVheCwgZWF4IAoJc2hyICAgICAgZWF4LCAyIAoJbGVhICAgICAgZWF4LCBbZWN4K2VheC04XSAKCXN1YiAgICAgIGVheCwgZWR4IAoJZW1tcwoJcmV0IDQKCmFsaWduIDE2CnN0cmxlbl9vcHQ6CglwdXNoCWVieAoJcHVzaAllc2kKCXB1c2gJZWRpCgltb3YJZWF4LCBbZXNwICsgMTZdCgltb3YJZWJ4LCAtMDEwMTAxMDFoCgl0ZXN0CWVheCwgMwoJanoJLnNjYW4KCW1vdgllZHgsIFtlYXhdCgl0ZXN0CWRsLCBkbAoJanoJLmZvdW5kCglpbmMJZWF4Cgl0ZXN0CWVheCwgMwoJanoJLnNjYW4KCXRlc3QJZGgsIGRoCglqegkuZm91bmQKCWluYwllYXgKCXNocgllZHgsIDE2Cgl0ZXN0CWVheCwgMwoJanoJLnNjYW4KCXRlc3QJZGwsIGRsCglqegkuZm91bmQKCWluYwllYXgKCWptcAkuc2NhbgouZm91bmQ6CglzdWIJZWF4LCBbZXNwICsgMTZdCglwb3AJZWRpCglwb3AJZXNpCglwb3AJZWJ4CglyZXQJNAkK
YWxpZ24gMTYKLnNjYW46Cgltb3YJZXNpLCBbZWF4XQoJbW92CWVkaSwgW2VheCArIDRdCglhZGQJZWF4LCA4CglsZWEJZWN4LCBbZXNpICsgZWJ4XQoJbGVhCWVkeCwgW2VkaSArIGVieF0KCW5vdAllc2kKCW5vdAllZGkKCWFuZAllY3gsIGVzaQoJYW5kCWVkeCwgZWRpCglhbmQJZWN4LCA4MDgwODA4MGgKCWpuegkuZm91bmRsbwoJYW5kCWVkeCwgODA4MDgwODBoCglqbnoJLmZvdW5kaGkKCW1vdgllc2ksIFtlYXhdCgltb3YJZWRpLCBbZWF4ICsgNF0KCWFkZAllYXgsIDgKCWxlYQllY3gsIFtlc2kgKyBlYnhdCglsZWEJZWR4LCBbZWRpICsgZWJ4XQoJbm90CWVzaQoJbm90CWVkaQoJYW5kCWVjeCwgZXNpCglhbmQJZWR4LCBlZGkKCWFuZAllY3gsIDgwODA4MDgwaAoJam56CS5mb3VuZGxvCglhbmQJZWR4LCA4MDgwODA4MGgKCWpuegkuZm91bmRoaQoJbW92CWVzaSwgW2VheF0KCW1vdgllZGksIFtlYXggKyA0XQoJYWRkCWVheCwgOAoJbGVhCWVjeCwgW2VzaSArIGVieF0KCWxlYQllZHgsIFtlZGkgKyBlYnhdCglub3QJZXNpCglub3QJZWRpCglhbmQJZWN4LCBlc2kKCWFuZAllZHgsIGVkaQoJYW5kCWVjeCwgODA4MDgwODBoCglqbnoJLmZvdW5kbG8KCWFuZAllZHgsIDgwODA4MDgwaAoJam56CS5mb3VuZGhpCgltb3YJZXNpLCBbZWF4XQoJbW92CWVkaSwgW2VheCArIDRdCglhZGQJZWF4LCA4CglsZWEJZWN4LCBbZXNpICsgZWJ4XQoJbGVhCWVkeCwgW2VkaSArIGVieF0KCW5vdAllc2kKCW5vdAllZGkKCWFuZAllY3gsIGVzaQoJYW5kCWVkeCwgZWRpCglhbmQJZWN4LCA4MDgwODA4MGgKCWpuegkuZm91bmRsbwoJYW5kCWVkeCwgODA4MDgwODBoCglqegkuc2NhbgkJCi5mb3VuZGhpOgoJYnNmCWVkeCwgZWR4CglzdWIJZWF4LCBbZXNwICsgMTZdCglzaHIJZWR4LCAzCglsZWEJZWF4LCBbZWF4ICsgZWR4IC0gNF0KCXBvcAllZGkKCXBvcAllc2kKCXBvcAllYngKCXJldAk0Ci5mb3VuZGxvOgoJYnNmCWVjeCwgZWN4CglzdWIJZWF4LCBbZXNwICsgMTZdCglzaHIJZWN4LCAzCglsZWEJZWF4LCBbZWF4ICsgZWN4IC0gOF0KCXBvcAllZGkKCXBvcAllc2kKCXBvcAllYngKCXJldAk0CgoJCm1ha2VzdHJpbmc6Cgltb3YJZWF4LCBbZXNwICsgNF0gOzsgc2l6ZQoJcHVzaAllYXgKCXB1c2gJc3RyYnVmMQoJcHVzaAllYXgKCXB1c2gJc3RyYnVmMgoJcHVzaAllYXgKCXB1c2gJc3RyYnVmMwoJcHVzaAllYXgKCXB1c2gJc3RyYnVmNAoJY2FsbAlmaWxsc3RyaW5nCgljYWxsCWZpbGxzdHJpbmcKCWNhbGwJZmlsbHN0cmluZwoJY2FsbAlmaWxsc3RyaW5nCglyZXQJNAoKZmlsbHN0cmluZzoKCW1vdglhbCwgMQoJbW92CWVkeCwgW2VzcCArIDRdIDs7IHB0cgoJbW92CWVjeCwgW2VzcCArIDhdIDs7IHNpemUKCW1vdglieXRlIFtlZHggKyBlY3hdLCAwCi5sb29wOgoJZGVjCWVjeAoJanMJLmRvbmUKCW1vdglieXRlIFtlZHggKyBlY3hdLCBhbAoJYWRkCWFsLCAxCglqbnoJLnNraXAKCW1vdglhbCwgMQouc2tpcDoKCWptcAkubG9vcAouZG9uZToK
CXJldAk4Cgp3cml0ZWhleDoKCXB1c2gJZWJ4CglwdXNoCWVzaQoJcHVzaAllZGkKCW1vdgllYXgsIFtlc3AgKyAxNl0gOzsgYmluYXJ5IHZhbHVlCgltb3YJZWN4LCAzMiAtIDQKCW1vdgllc2ksIGIyaGx1dAoJbW92CWVkaSwgYjJob3V0Ci5sb29wOgoJbW92CWVkeCwgZWF4CglzaHIJZWR4LCBjbAoJYW5kCWVkeCwgMGZoIAoJbW92CWRsLCBieXRlIFtlc2kgKyBlZHhdCgltb3YJYnl0ZSBbZWRpXSwgZGwKCWFkZAllZGksIDEKCXN1YgllY3gsIDQKCWpucwkubG9vcAoJbW92CWJ5dGUgW2VkaV0sIDEwCgk7OwoJbW92CWVheCwgNAoJbW92CWVieCwgMQoJbW92CWVjeCwgYjJob3V0Cgltb3YJZWR4LCA5CglpbnQJODBoCglwb3AJZWRpCglwb3AJZXNpCglwb3AJZWJ4CglyZXQJNAoKZXhpdDoKCgltb3YJZWF4LCAwMWgKCXhvcgllYngsIGVieAoJaW50CTgwaA==