global _start
 
 
 
section .data

	timelo	dd	0	;; low dword of the TSC value benchmark samples before its timed loop
	timehi	dd	0	;; high dword of that TSC value
	b2hout	db	0,0,0,0,0,0,0,0,0	;; writehex output line: 8 hex digits + newline

	b2hlut	db	'0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f'

;; The string buffers have no initial contents (makestring fills them before
;; every timed run), so they are reserved in .bss.  RESB inside .data makes
;; NASM warn and zero-fill the space into the object file instead.
section .bss

alignb 8
	strbuf1	resb	256	;; starts on an 8-byte boundary
	pad1	resb	1	;; shifts strbuf2 to offset 1 past a dword boundary
	strbuf2	resb	256
	pad2	resb	1	;; shifts strbuf3 to offset 2 past a dword boundary
	strbuf3	resb	256
	pad3	resb	1	;; shifts strbuf4 to offset 3 past a dword boundary
	strbuf4	resb	256
 
section .text
 
 
;; Program entry point.  Times strlen_3 and then strlen_4, each on strings of
;; 255 and 251 bytes with 100000 timed-loop iterations, then terminates.

;; RUN_BENCH function, string length, iterations
;; Pushes the three benchmark arguments (last one first) and calls benchmark,
;; which removes them from the stack itself.
%macro RUN_BENCH 3
	push	%3
	push	%2
	push	%1
	call	benchmark
%endmacro

_start:
	RUN_BENCH	strlen_3, 255, 100000
	RUN_BENCH	strlen_3, 251, 100000

	RUN_BENCH	strlen_4, 255, 100000
	RUN_BENCH	strlen_4, 251, 100000

	call	exit			;; does not return
 
 
 
;;-----------------------------------------------------------------------
;; benchmark(function, string length, iterations)
;;
;; Fills strbuf1..strbuf4 with `string length` non-zero bytes, then runs
;; `iterations` passes of a loop that calls `function` once on each of the
;; four buffers, and prints the low 32 bits of the elapsed time-stamp
;; counter delta as one hex line (via writehex).
;;
;; In (at entry): [esp + 4]  = function; takes one stack argument (string
;;                             pointer), must pop it itself and must
;;                             preserve ebx and esi
;;                [esp + 8]  = string length; the buffers are 256 bytes, so
;;                             it must not exceed 255 (not checked here)
;;                [esp + 12] = iterations; 0 skips the timed loop
;; Out:   nothing meaningful; the three arguments are popped (ret 12)
;; Saved: ebx, esi
;;-----------------------------------------------------------------------
benchmark: ;; function, string length, iterations
	push	ebx
	push	esi
	mov	ebx, [esp + 20]		;; ebx = iterations (loop counter)
	mov	ecx, [esp + 16]		;; ecx = string length
	mov	esi, [esp + 12]		;; esi = function under test
	push	ecx
	call	makestring		;; (re)build all four test strings
	rdtsc
	mov	dword [timelo], eax	;; remember the start time stamp
	mov	dword [timehi], edx
	test	ebx, ebx
	jz	.stop			;; a zero count would otherwise wrap to 2^32 passes
.loop:
	push	strbuf1			;; queue one argument per call below;
	push	strbuf2			;; each call pops its own pointer
	push	strbuf3
	push	strbuf4
	call	esi
	call	esi
	call	esi
	call	esi
	sub	ebx, 1
	jnz	.loop
.stop:
	rdtsc
	sub	eax, dword [timelo]	;; edx:eax = end - start (64-bit)
	sbb	edx, dword [timehi]
	push	eax			;; only the low dword is printed
	;;push	edx
	;;call	writehex
	call	writehex
	pop	esi
	pop	ebx
	ret	12
 
;;-----------------------------------------------------------------------
;; strlen_1(str) -- string length using the REPNE SCASB string instruction.
;; In:    [esp + 4] = str; popped on return (ret 4)
;; Out:   eax = number of bytes before the first zero byte
;; Saved: edi
;;-----------------------------------------------------------------------
align 16
strlen_1:
	push	edi
	mov	edi, [esp + 8]		;; edi = str (argument sits above the saved edi)
	xor	eax, eax		;; al = 0, the byte to search for
	mov	ecx, -1			;; no practical limit on the scan
	cld				;; scan towards higher addresses
	repne scasb			;; ecx drops by one per byte examined
	not	ecx			;; ecx = bytes examined = length + 1
	lea	eax, [ecx - 1]		;; leave the terminator out of the count
	pop	edi
	ret	4
 
 
;;-----------------------------------------------------------------------
;; strlen_2(str) -- string length, one byte per loop iteration.
;; In:    [esp + 4] = str; popped on return (ret 4)
;; Out:   eax = number of bytes before the first zero byte
;; Only eax and the flags are modified.
;;-----------------------------------------------------------------------
align 16
strlen_2:
	mov	eax, [esp + 4]
	sub	eax, 1			;; start one byte early: the loop advances first
	jmp	.next_byte		;; hop over the alignment padding
align 16
.next_byte:
	inc	eax
	cmp	byte [eax], 0
	jne	.next_byte
	sub	eax, [esp + 4]		;; terminator address - start = length
	ret	4
 
 
;;-----------------------------------------------------------------------
;; strlen_3(str) -- string length; walks byte by byte up to a dword
;; boundary, then examines two dwords (8 bytes) per loop iteration.
;; In:    [esp + 4] = str; popped on return (ret 4)
;; Out:   eax = number of bytes before the first zero byte
;; Saved: ebx, esi, edi      Modified: ecx, edx, flags
;; NOTE(review): .scan loads 8 bytes from an address that is only known to
;; be 4-byte aligned, so it can read up to 7 bytes beyond the terminator --
;; presumably every buffer passed in must leave that much readable; confirm.
;;-----------------------------------------------------------------------
align 16
strlen_3:
	push	ebx
	push	esi
	push	edi
        mov     eax, [esp + 16] ;; eax = str (above the three saved registers)
        mov     ebx, -01010101h ;; adding ebx subtracts 1 from every byte of a dword
.aligning: 
        ;; Check single bytes until eax reaches a dword boundary.
        test    eax, 3 
        jz      .scan 
        mov     dl, [eax] 
        test    dl, dl 
        jz      .found 
        inc     eax 
        jmp     .aligning 
align 32 
.scan: 
	;; For each loaded dword x compute (x - 01010101h) & ~x & 80808080h.
	;; The result is non-zero exactly when x contains a zero byte, and its
	;; lowest set bit lies in the first zero byte (higher bits may also be set).
	mov	esi, [eax] 
	mov	edi, [eax + 4] 
	lea	eax, [eax + 8] 	;; advance past both dwords without touching flags
	lea	ecx, [esi + ebx]   ;; ecx = low dword - 01010101h
	lea	edx, [edi + ebx] 	;; edx = high dword - 01010101h
	not	esi 
	not	edi 
	and	ecx, esi 
	and	edx, edi 
	and	ecx, $80808080 
	and	edx, $80808080 
	test	ecx, ecx         ;; zero byte in the low dword?
	jnz	.sub8 
	test	edx, edx 
	jz	.scan 
	lea	eax, [eax-4] 	;; hit in the high dword: eax = its address
	mov	ecx, edx 
	jmp	.bytesearch 
.sub8: 
	lea	eax,[eax-8] 	;; hit in the low dword: eax = its address
.bytesearch: 
	;; ecx = flag bits of the dword at eax; find the lowest flagged byte.
	test	cl, cl 
	jnz     .found 
	inc	eax 
	test	ch, ch 
	jnz	.found 
	shr	ecx, 16 
	inc	eax 
	test	cl, cl 
	jnz	.found 
	inc	eax 		;; none of bytes 0..2, so it is byte 3
.found: 
	;; eax = address of the terminator
	sub	eax, [esp + 16]
	pop	edi
	pop	esi
	pop	ebx
        ret	4
 
 
;;-----------------------------------------------------------------------
;; strlen_4(str) -- string length; same two-dwords-per-iteration test as
;; strlen_3, but the misaligned head is checked from a single dword load
;; and the zero byte is located with BSF instead of byte tests.
;; In:    [esp + 4] = str; popped on return (ret 4)
;; Out:   eax = number of bytes before the first zero byte
;; Saved: ebx, esi, edi      Modified: ecx, edx, flags
;; NOTE(review): for a misaligned str the head does an unaligned 4-byte
;; load although at most 3 of those bytes are examined, and .scan loads
;; 8 bytes from an address only known to be 4-byte aligned -- presumably
;; the caller must guarantee readable memory past the terminator; confirm.
;;-----------------------------------------------------------------------
strlen_4:
	push	ebx
	push	esi
	push	edi
	mov	eax, [esp + 16]		;; eax = str (above the three saved registers)
	mov	ebx, -01010101h		;; adding ebx subtracts 1 from every byte of a dword
	test	eax, 3
	jz	.scan
	;; Misaligned start: fetch the first bytes with one load, then test
	;; them one at a time until eax reaches a dword boundary.
	mov	edx, [eax]
	test	dl, dl			;; byte 0
	jz	.found
	inc	eax
	test	eax, 3
	jz	.scan
	test	dh, dh			;; byte 1
	jz	.found
	inc	eax
	shr	edx, 16			;; bring byte 2 down into dl
	test	eax, 3
	jz	.scan
	test	dl, dl			;; byte 2
	jz	.found
	inc	eax
	jmp	.scan			;; hop over the alignment padding
align 16
.scan:
	;; For each loaded dword x compute (x - 01010101h) & ~x & 80808080h.
	;; The result is non-zero exactly when x contains a zero byte, and its
	;; lowest set bit is bit 7 of the first zero byte.
	mov	esi, [eax]
	mov	edi, [eax + 4]
	add	eax, 8
	lea	ecx, [esi + ebx]
	lea	edx, [edi + ebx]
	not	esi
	not	edi
	and	ecx, esi
	and	edx, edi
	and	ecx, 80808080h
	jnz	.foundlo
	and	edx, 80808080h
	jz	.scan
.foundhi:
	;; Zero byte in the high dword, which starts at eax - 4.
	bsf	edx, edx		;; bit index of the lowest flag
	sub	eax, [esp + 16]
	shr	edx, 3			;; bit index -> byte index within the dword
	lea	eax, [eax + edx - 4]
	pop	edi
	pop	esi
	pop	ebx
	ret	4
.foundlo:
	;; Zero byte in the low dword, which starts at eax - 8.
	bsf	ecx, ecx
	sub	eax, [esp + 16]
	shr	ecx, 3
	lea	eax, [eax + ecx - 8]
	pop	edi
	pop	esi
	pop	ebx
	ret	4
.found:
	;; Terminator found while aligning: eax already points at it.
	sub	eax, [esp + 16]
	pop	edi
	pop	esi
	pop	ebx
	ret	4
 
;;-----------------------------------------------------------------------
;; makestring(size) -- fill strbuf4, strbuf3, strbuf2 and strbuf1 (in that
;; order) with `size` non-zero bytes plus a terminator, using fillstring.
;; In:    [esp + 4] = size; popped on return (ret 4)
;; Modified: eax, ecx, edx, flags (by fillstring)
;;-----------------------------------------------------------------------
makestring:
	;; fillstring pops both of its arguments, so esp is back at the return
	;; address before each group and [esp + 4] is always `size`.
	push	dword [esp + 4]
	push	strbuf4
	call	fillstring

	push	dword [esp + 4]
	push	strbuf3
	call	fillstring

	push	dword [esp + 4]
	push	strbuf2
	call	fillstring

	push	dword [esp + 4]
	push	strbuf1
	call	fillstring

	ret	4
 
;;-----------------------------------------------------------------------
;; fillstring(ptr, size) -- write `size` non-zero bytes followed by a zero
;; terminator at ptr.  The bytes are stored from the last position back to
;; the first with the values 1, 2, ..., 255, 1, 2, ...
;; In:    [esp + 4] = ptr, [esp + 8] = size; both popped on return (ret 8)
;; Modified: al, ecx, edx, flags
;;-----------------------------------------------------------------------
fillstring:
	mov	edx, [esp + 4]		;; edx = ptr
	mov	ecx, [esp + 8]		;; ecx = size, then the index being filled
	mov	al, 1			;; first fill value
	mov	byte [edx + ecx], 0	;; terminator goes right after the data
	jmp	.step
.store:
	mov	[edx + ecx], al
	inc	al
	jnz	.step
	mov	al, 1			;; wrapped past 255: skip 0 so no early terminator
.step:
	dec	ecx
	jns	.store			;; index still >= 0: keep filling
	ret	8
 
;;-----------------------------------------------------------------------
;; writehex(value) -- print a 32-bit value as 8 lowercase hex digits and a
;; newline on file descriptor 1, using the int 80h write system call.
;; In:    [esp + 4] = value; popped on return (ret 4)
;; Out:   eax = whatever the system call returned (not checked)
;; Saved: ebx, edi           Modified: ecx, edx, flags
;;-----------------------------------------------------------------------
SYS_WRITE	equ	4
STDOUT		equ	1
HEX_DIGITS	equ	8
HEX_LINE_LEN	equ	HEX_DIGITS + 1		;; digits plus the newline

writehex:
	push	ebx
	push	edi
	mov	eax, [esp + 12]		;; eax = value (above the two saved registers)
	mov	edi, b2hout		;; edi = output cursor
	mov	ecx, HEX_DIGITS
.digit:
	rol	eax, 4			;; bring the next most-significant nibble to the bottom
	mov	edx, eax
	and	edx, 0fh
	mov	dl, [b2hlut + edx]	;; nibble -> ASCII digit
	mov	[edi], dl
	inc	edi
	dec	ecx
	jnz	.digit
	mov	byte [edi], 10		;; newline after the last digit

	mov	eax, SYS_WRITE
	mov	ebx, STDOUT
	mov	ecx, b2hout
	mov	edx, HEX_LINE_LEN
	int	80h
	pop	edi
	pop	ebx
	ret	4
 
;; exit -- terminate the process with status 0 through the int 80h exit
;; system call (number 1).  Takes no arguments.
SYS_EXIT	equ	1

exit:
	xor	ebx, ebx		;; ebx = exit status 0
	mov	eax, SYS_EXIT
	int	80h
				Z2xvYmFsIF9zdGFydAoKCgpzZWN0aW9uIC5kYXRhCgoJdGltZWxvCWRkCTAKCXRpbWVoaQlkZAkwCgliMmhvdXQJZGIJMCwwLDAsMCwwLDAsMCwwLDAKCgoJYjJobHV0CWRiCScwJywnMScsJzInLCczJywnNCcsJzUnLCc2JywnNycsJzgnLCc5JywnYScsJ2InLCdjJywnZCcsJ2UnLCdmJwphbGlnbiA4CglzdHJidWYxCXJlc2IJMjU2CglwYWQxCXJlc2IJMSA7OyBtaXNhbGlnbmVkIGJ5IDEKCXN0cmJ1ZjIJcmVzYgkyNTYKCXBhZDIJcmVzYgkxIDs7IG1pc2FsaWduZWQgYnkgMgoJc3RyYnVmMwlyZXNiCTI1NgoJcGFkMwlyZXNiCTEgOzsgbWlzYWxpZ25lZCBieSAzCglzdHJidWY0CXJlc2IJMjU2CgpzZWN0aW9uIC50ZXh0CgoKX3N0YXJ0OgoKCSBwdXNoCTEwMDAwMAoJIHB1c2gJMjU1CgkgcHVzaAlzdHJsZW5fMwoJY2FsbAliZW5jaG1hcmsKCSBwdXNoCTEwMDAwMAoJIHB1c2gJMjUxCgkgcHVzaAlzdHJsZW5fMwoJY2FsbAliZW5jaG1hcmsKCgkgcHVzaAkxMDAwMDAKCSBwdXNoCTI1NQoJIHB1c2gJc3RybGVuXzQKCWNhbGwJYmVuY2htYXJrCgkgcHVzaAkxMDAwMDAKCSBwdXNoCTI1MQoJIHB1c2gJc3RybGVuXzQKCWNhbGwJYmVuY2htYXJrCgoJY2FsbAlleGl0CgoKCmJlbmNobWFyazogOzsgZnVuY3Rpb24sIHN0cmluZyBsZW5ndGgsIGl0ZXJhdGlvbnMKCXB1c2gJZWJ4CglwdXNoCWVzaQoJbW92CWVieCwgW2VzcCArIDIwXQoJbW92CWVjeCwgW2VzcCArIDE2XQoJbW92CWVzaSwgW2VzcCArIDEyXQoJcHVzaAllY3gKCWNhbGwJbWFrZXN0cmluZwoJcmR0c2MKCW1vdglkd29yZCBbdGltZWxvXSwgZWF4Cgltb3YJZHdvcmQgW3RpbWVoaV0sIGVkeAoubG9vcDoKCXB1c2gJc3RyYnVmMQoJcHVzaAlzdHJidWYyCglwdXNoCXN0cmJ1ZjMKCXB1c2gJc3RyYnVmNAoJY2FsbAllc2kKCWNhbGwJZXNpCgljYWxsCWVzaQoJY2FsbAllc2kKCXN1YgllYngsIDEKCWpuegkubG9vcAoJcmR0c2MKCXN1YgllYXgsIGR3b3JkIFt0aW1lbG9dCglzYmIJZWR4LCBkd29yZCBbdGltZWhpXQoJcHVzaAllYXgKCTs7cHVzaAllZHgKCTs7Y2FsbAl3cml0ZWhleAoJY2FsbAl3cml0ZWhleAkKCXBvcAllc2kKCXBvcAllYngKCXJldAkxMgoKYWxpZ24gMTYKc3RybGVuXzE6CglwdXNoCWVkaQoJbW92CWVjeCwgLTEKCXhvcgllZHgsIGVkeAoJbW92CWVkaSwgW2VzcCArIDhdIDs7IHN0ciBwdHIKCWNsZAoJeG9yCWVheCwgZWF4CglzdWIJZWR4LCBlZGkKCXJlcG5lIHNjYXNiCglsZWEJZWF4LCBbZWRpICsgZWR4IC0gMV0KCXBvcAllZGkKCXJldAk0CgoKYWxpZ24gMTYKc3RybGVuXzI6Cgltb3YJZWF4LCBbZXNwICsgNF0KCWRlYwllYXgKCWptcAkubG9vcAphbGlnbiAxNgoubG9vcDoKCWFkZAllYXgsIDEKCXRlc3QJYnl0ZSBbZWF4XSwgMGZmaAoJam56CS5sb29wCglzdWIJZWF4LCBbZXNwICsgNF0KCXJldAk0CgoKYWxpZ24gMTYKc3RybGVuXzM6CglwdXNoCWVieAoJcHVzaAllc2kKCXB1c2gJZWRpCiAgICAgICAgbW92ICAgICBlYXgsIFtlc3AgKyAx
Nl0gCiAgICAgICAgbW92ICAgICBlYngsIC0wMTAxMDEwMWggCi5hbGlnbmluZzogCiAgICAgICAgdGVzdCAgICBlYXgsIDMgCiAgICAgICAganogICAgICAuc2NhbiAKICAgICAgICBtb3YgICAgIGRsLCBbZWF4XSAKICAgICAgICB0ZXN0ICAgIGRsLCBkbCAKICAgICAgICBqeiAgICAgIC5mb3VuZCAKICAgICAgICBpbmMgICAgIGVheCAKICAgICAgICBqbXAgICAgIC5hbGlnbmluZyAKYWxpZ24gMzIgCi5zY2FuOiAKCW1vdgllc2ksIFtlYXhdIAoJbW92CWVkaSwgW2VheCArIDRdIAoJbGVhCWVheCwgW2VheCArIDhdIAoJbGVhCWVjeCwgW2VzaSArIGVieF0gICA7ISAKCWxlYQllZHgsIFtlZGkgKyBlYnhdIAoJbm90CWVzaSAKCW5vdAllZGkgCglhbmQJZWN4LCBlc2kgCglhbmQJZWR4LCBlZGkgCglhbmQJZWN4LCAkODA4MDgwODAgCglhbmQJZWR4LCAkODA4MDgwODAgCgl0ZXN0CWVjeCwgZWN4ICAgICAgICAgOyEhIAoJam56CS5zdWI4IAoJdGVzdAllZHgsIGVkeCAKCWp6CS5zY2FuIAoJbGVhCWVheCwgW2VheC00XSAKCW1vdgllY3gsIGVkeCAKCWptcAkuYnl0ZXNlYXJjaCAKLnN1Yjg6IAoJbGVhCWVheCxbZWF4LThdIAouYnl0ZXNlYXJjaDogCgl0ZXN0CWNsLCBjbCAKCWpueiAgICAgLmZvdW5kIAoJaW5jCWVheCAKCXRlc3QJY2gsIGNoIAoJam56CS5mb3VuZCAKCXNocgllY3gsIDE2IAoJaW5jCWVheCAKCXRlc3QJY2wsIGNsIAoJam56CS5mb3VuZCAKCWluYwllYXggCi5mb3VuZDogCglzdWIJZWF4LCBbZXNwICsgMTZdCglwb3AJZWRpCglwb3AJZXNpCglwb3AJZWJ4CiAgICAgICAgcmV0CTQKCgpzdHJsZW5fNDoKCXB1c2gJZWJ4CglwdXNoCWVzaQoJcHVzaAllZGkKCW1vdgllYXgsIFtlc3AgKyAxNl0KCW1vdgllYngsIC0wMTAxMDEwMWgKCXRlc3QJZWF4LCAzCglqegkuc2NhbgoJbW92CWVkeCwgW2VheF0KCXRlc3QJZGwsIGRsCglqegkuZm91bmQKCWluYwllYXgKCXRlc3QJZWF4LCAzCglqegkuc2NhbgoJdGVzdAlkaCwgZGgKCWp6CS5mb3VuZAoJaW5jCWVheAoJc2hyCWVkeCwgMTYKCXRlc3QJZWF4LCAzCglqegkuc2NhbgoJdGVzdAlkbCwgZGwKCWp6CS5mb3VuZAoJaW5jCWVheAoJam1wCS5zY2FuCmFsaWduIDE2Ci5zY2FuOgoJbW92CWVzaSwgW2VheF0KCW1vdgllZGksIFtlYXggKyA0XQoJYWRkCWVheCwgOAoJbGVhCWVjeCwgW2VzaSArIGVieF0KCWxlYQllZHgsIFtlZGkgKyBlYnhdCglub3QJZXNpCglub3QJZWRpCglhbmQJZWN4LCBlc2kKCWFuZAllZHgsIGVkaQoJYW5kCWVjeCwgODA4MDgwODBoCglqbnoJLmZvdW5kbG8KCWFuZAllZHgsIDgwODA4MDgwaAoJanoJLnNjYW4KLmZvdW5kaGk6Cglic2YJZWR4LCBlZHgKCXN1YgllYXgsIFtlc3AgKyAxNl0KCXNocgllZHgsIDMKCWxlYQllYXgsIFtlYXggKyBlZHggLSA0XQoJcG9wCWVkaQoJcG9wCWVzaQoJcG9wCWVieAoJcmV0CTQKLmZvdW5kbG86Cglic2YJZWN4LCBlY3gKCXN1YgllYXgsIFtlc3AgKyAxNl0KCXNocgllY3gsIDMKCWxlYQllYXgsIFtlYXggKyBl
Y3ggLSA4XQoJcG9wCWVkaQoJcG9wCWVzaQoJcG9wCWVieAoJcmV0CTQKLmZvdW5kOgoJc3ViCWVheCwgW2VzcCArIDE2XQoJcG9wCWVkaQoJcG9wCWVzaQoJcG9wCWVieAoJcmV0CTQKCQptYWtlc3RyaW5nOgoJbW92CWVheCwgW2VzcCArIDRdIDs7IHNpemUKCXB1c2gJZWF4CglwdXNoCXN0cmJ1ZjEKCXB1c2gJZWF4CglwdXNoCXN0cmJ1ZjIKCXB1c2gJZWF4CglwdXNoCXN0cmJ1ZjMKCXB1c2gJZWF4CglwdXNoCXN0cmJ1ZjQKCWNhbGwJZmlsbHN0cmluZwoJY2FsbAlmaWxsc3RyaW5nCgljYWxsCWZpbGxzdHJpbmcKCWNhbGwJZmlsbHN0cmluZwoJcmV0CTQKCmZpbGxzdHJpbmc6Cgltb3YJYWwsIDEKCW1vdgllZHgsIFtlc3AgKyA0XSA7OyBwdHIKCW1vdgllY3gsIFtlc3AgKyA4XSA7OyBzaXplCgltb3YJYnl0ZSBbZWR4ICsgZWN4XSwgMAoubG9vcDoKCWRlYwllY3gKCWpzCS5kb25lCgltb3YJYnl0ZSBbZWR4ICsgZWN4XSwgYWwKCWFkZAlhbCwgMQoJam56CS5za2lwCgltb3YJYWwsIDEKLnNraXA6CglqbXAJLmxvb3AKLmRvbmU6CglyZXQJOAoKd3JpdGVoZXg6CglwdXNoCWVieAoJcHVzaAllc2kKCXB1c2gJZWRpCgltb3YJZWF4LCBbZXNwICsgMTZdIDs7IGJpbmFyeSB2YWx1ZQoJbW92CWVjeCwgMzIgLSA0Cgltb3YJZXNpLCBiMmhsdXQKCW1vdgllZGksIGIyaG91dAoubG9vcDoKCW1vdgllZHgsIGVheAoJc2hyCWVkeCwgY2wKCWFuZAllZHgsIDBmaCAKCW1vdglkbCwgYnl0ZSBbZXNpICsgZWR4XQoJbW92CWJ5dGUgW2VkaV0sIGRsCglhZGQJZWRpLCAxCglzdWIJZWN4LCA0CglqbnMJLmxvb3AKCW1vdglieXRlIFtlZGldLCAxMAoJOzsKCW1vdgllYXgsIDQKCW1vdgllYngsIDEKCW1vdgllY3gsIGIyaG91dAoJbW92CWVkeCwgOQoJaW50CTgwaAoJcG9wCWVkaQoJcG9wCWVzaQoJcG9wCWVieAoJcmV0CTQKCmV4aXQ6CgoJbW92CWVheCwgMDFoCgl4b3IJZWJ4LCBlYngKCWludAk4MGg=