global _start
 
 
 
section .data

	timelo	dd	0	;; low dword of the rdtsc value stored by benchmark
	timehi	dd	0	;; high dword of the rdtsc value stored by benchmark
	b2hout	db	0,0,0,0,0,0,0,0,0	;; writehex output: 8 hex digits + newline


	b2hlut	db	'0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f'

;; The string buffers have no initial contents, so they are reserved in .bss.
;; (resb inside .data makes NASM warn and emit the zero bytes into the object.)
;; The 1-byte pads stagger the buffers so they start at offsets 0, 1, 2 and 3
;; modulo 4 from the 8-byte aligned base.
section .bss
alignb 8
	strbuf1	resb	256 ;; aligned
	pad1	resb	1 ;; next buffer is misaligned by 1
	strbuf2	resb	256
	pad2	resb	1 ;; next buffer is misaligned by 2
	strbuf3	resb	256
	pad3	resb	1 ;; next buffer is misaligned by 3
	strbuf4	resb	256
	pad4	resb	8 ;; slack: strlen_3/strlen_4 load 8 bytes per step and can read past the terminator
 
section .text
 
 
ITERATIONS	equ	100000		;; loop count handed to benchmark
LEN_LONG	equ	255		;; first string length measured
LEN_SHORT	equ	251		;; second string length measured

;; RUN_BENCH function, length, iterations
;; Pushes the three benchmark arguments (last one first) and calls benchmark,
;; which removes them from the stack itself.
%macro RUN_BENCH 3
	push	%3
	push	%2
	push	%1
	call	benchmark
%endmacro

;; Program entry: measure strlen_3 and strlen_4 at both string lengths, then exit.
_start:
	RUN_BENCH	strlen_3, LEN_LONG, ITERATIONS
	RUN_BENCH	strlen_3, LEN_SHORT, ITERATIONS

	RUN_BENCH	strlen_4, LEN_LONG, ITERATIONS
	RUN_BENCH	strlen_4, LEN_SHORT, ITERATIONS

	call	exit
 
 
 
;; benchmark: time repeated calls of a string-length routine with rdtsc.
;; Stack arguments on entry (all three are removed by this routine, ret 12):
;;   [esp + 4]  = function; it receives one pointer on the stack and pops it itself
;;   [esp + 8]  = string length handed to makestring
;;   [esp + 12] = iterations; every iteration calls the function once per buffer
;; Writes the low 32 bits of the elapsed rdtsc count through writehex.
;; ebx and esi are saved and restored. They hold the loop counter and the call
;; target across the calls, so the function under test must preserve them.
;; An iteration count of 0 now skips the loop; previously the counter wrapped
;; and the loop ran 2^32 times.
benchmark: ;; function, string length, iterations
	push	ebx
	push	esi
	mov	ebx, [esp + 20]		;; iterations
	mov	ecx, [esp + 16]		;; string length
	mov	esi, [esp + 12]		;; function under test
	push	ecx
	call	makestring		;; fill strbuf1..strbuf4 with strings of that length
	rdtsc
	mov	dword [timelo], eax	;; start timestamp
	mov	dword [timehi], edx
	test	ebx, ebx
	jz	.stop			;; nothing to run: do not let the counter wrap
.loop:
	push	strbuf1
	push	strbuf2
	push	strbuf3
	push	strbuf4
	call	esi			;; each call pops one of the four pointers
	call	esi
	call	esi
	call	esi
	sub	ebx, 1
	jnz	.loop
.stop:
	rdtsc
	sub	eax, dword [timelo]	;; edx:eax = elapsed count
	sbb	edx, dword [timehi]
	push	eax
	;;push	edx
	;;call	writehex
	call	writehex		;; only the low dword is printed
	pop	esi
	pop	ebx
	ret	12
 
align 16
;; strlen_1: string length using repne scasb.
;; In:  [esp + 4] = pointer to a zero-terminated string (popped here: ret 4)
;; Out: eax = number of bytes before the terminating zero
;; edi is saved and restored; ecx and edx are overwritten.
strlen_1:
	push	edi
	mov	ecx, -1			;; repne count: start from the maximum
	xor	edx, edx
	mov	edi, [esp + 8] ;; str ptr
	cld				;; scan towards higher addresses
	xor	eax, eax		;; al = 0, the byte scasb compares against
	sub	edx, edi		;; edx = -ptr
	repne scasb			;; ends with edi one past the zero byte
	lea	eax, [edi + edx - 1]	;; (edi - ptr) - 1 = length
	pop	edi
	ret	4
 
 
align 16
;; strlen_2: string length checking one byte per loop iteration.
;; In:  [esp + 4] = pointer to a zero-terminated string (popped here: ret 4)
;; Out: eax = number of bytes before the terminating zero
;; Only eax is written.
strlen_2:
	mov	eax, [esp + 4]
	dec	eax			;; start one below so the loop can increment first
	jmp	.loop
align 16
.loop:
	add	eax, 1
	test	byte [eax], 0ffh	;; ZF is set only when the byte is zero
	jnz	.loop
	sub	eax, [esp + 4]		;; address of the zero byte - start = length
	ret	4
 
 
align 16
;; strlen_3: string length scanning two dwords (8 bytes) per step, with the
;; step written out twice per loop pass.
;; In:  [esp + 4] = pointer to a zero-terminated string (popped here: ret 4)
;; Out: eax = number of bytes before the terminating zero
;; ebx, esi and edi are saved and restored; ecx and edx are overwritten.
;; Zero-byte test for a dword x: (x - 01010101h) & ~x & 80808080h is non-zero
;; exactly when x contains a zero byte, and its lowest set bit is bit 7 of the
;; first such byte.
;; NOTE(review): the prologue's dword load and the 8-byte steps can read bytes
;; located after the terminator -- confirm the callers' buffers tolerate that.
strlen_3:
	push	ebx
	push	esi
	push	edi
	mov	eax, [esp + 16]		;; string pointer (above 3 saved registers + return address)
	mov	ebx, -01010101h		;; adding ebx subtracts 01010101h
	test	eax, 3
	jz	.scan			;; already 4-byte aligned
	mov	edx, [eax]		;; load the first 4 bytes from the unaligned address
	test	dl, dl			;; byte 0
	jz	.found
	inc	eax
	test	eax, 3
	jz	.scan
	test	dh, dh			;; byte 1
	jz	.found
	inc	eax
	shr	edx, 16			;; move byte 2 into dl
	test	eax, 3
	jz	.scan
	test	dl, dl			;; byte 2
	jz	.found
	inc	eax			;; three bytes skipped without hitting alignment, so eax is aligned now
	jmp	.scan
align 16
.scan:
	mov	esi, [eax]		;; low dword of this 8-byte step
	mov	edi, [eax + 4]		;; high dword
	add	eax, 8			;; eax now points just past the step
	lea	ecx, [esi + ebx]	;; low - 01010101h
	lea	edx, [edi + ebx]	;; high - 01010101h
	not	esi
	not	edi
	and	ecx, esi
	and	edx, edi
	and	ecx, 80808080h
	jnz	.foundlo		;; zero byte in the low dword
	and	edx, 80808080h
	jnz	.foundhi		;; zero byte in the high dword
	mov	esi, [eax]		;; second copy of the same step
	mov	edi, [eax + 4]
	add	eax, 8
	lea	ecx, [esi + ebx]
	lea	edx, [edi + ebx]
	not	esi
	not	edi
	and	ecx, esi
	and	edx, edi
	and	ecx, 80808080h
	jnz	.foundlo
	and	edx, 80808080h
	jz	.scan			;; no zero byte in these 8 bytes: keep scanning
.foundhi:
	bsf	edx, edx		;; bit index of the lowest marker bit
	sub	eax, [esp + 16]		;; offset just past the 8-byte step
	shr	edx, 3			;; bit index -> byte index 0..3
	lea	eax, [eax + edx - 4]	;; back to the high dword, plus the byte index
	pop	edi
	pop	esi
	pop	ebx
	ret	4
.foundlo:
	bsf	ecx, ecx		;; bit index of the lowest marker bit
	sub	eax, [esp + 16]		;; offset just past the 8-byte step
	shr	ecx, 3			;; bit index -> byte index 0..3
	lea	eax, [eax + ecx - 8]	;; back to the low dword, plus the byte index
	pop	edi
	pop	esi
	pop	ebx
	ret	4
.found:
	sub	eax, [esp + 16]		;; eax points at the zero byte found in the prologue
	pop	edi
	pop	esi
	pop	ebx
	ret	4
 
 
;; strlen_4: same algorithm as strlen_3, with the 8-byte step written out four
;; times per loop pass and the prologue exit (.found) placed before the loop.
;; In:  [esp + 4] = pointer to a zero-terminated string (popped here: ret 4)
;; Out: eax = number of bytes before the terminating zero
;; ebx, esi and edi are saved and restored; ecx and edx are overwritten.
;; Zero-byte test for a dword x: (x - 01010101h) & ~x & 80808080h is non-zero
;; exactly when x contains a zero byte, and its lowest set bit is bit 7 of the
;; first such byte.
;; NOTE(review): unlike strlen_1..strlen_3 there is no align 16 in front of this
;; entry point -- confirm whether that is intended for the comparison.
;; NOTE(review): the prologue's dword load and the 8-byte steps can read bytes
;; located after the terminator -- confirm the callers' buffers tolerate that.
strlen_4:
	push	ebx
	push	esi
	push	edi
	mov	eax, [esp + 16]		;; string pointer (above 3 saved registers + return address)
	mov	ebx, -01010101h		;; adding ebx subtracts 01010101h
	test	eax, 3
	jz	.scan			;; already 4-byte aligned
	mov	edx, [eax]		;; load the first 4 bytes from the unaligned address
	test	dl, dl			;; byte 0
	jz	.found
	inc	eax
	test	eax, 3
	jz	.scan
	test	dh, dh			;; byte 1
	jz	.found
	inc	eax
	shr	edx, 16			;; move byte 2 into dl
	test	eax, 3
	jz	.scan
	test	dl, dl			;; byte 2
	jz	.found
	inc	eax			;; three bytes skipped without hitting alignment, so eax is aligned now
	jmp	.scan
.found:
	sub	eax, [esp + 16]		;; eax points at the zero byte found in the prologue
	pop	edi
	pop	esi
	pop	ebx
	ret	4			;; pops the pointer argument
align 16
.scan:
	mov	esi, [eax]		;; low dword of this 8-byte step
	mov	edi, [eax + 4]		;; high dword
	add	eax, 8			;; eax now points just past the step
	lea	ecx, [esi + ebx]	;; low - 01010101h
	lea	edx, [edi + ebx]	;; high - 01010101h
	not	esi
	not	edi
	and	ecx, esi
	and	edx, edi
	and	ecx, 80808080h
	jnz	.foundlo		;; zero byte in the low dword
	and	edx, 80808080h
	jnz	.foundhi		;; zero byte in the high dword
	mov	esi, [eax]		;; second copy of the same step
	mov	edi, [eax + 4]
	add	eax, 8
	lea	ecx, [esi + ebx]
	lea	edx, [edi + ebx]
	not	esi
	not	edi
	and	ecx, esi
	and	edx, edi
	and	ecx, 80808080h
	jnz	.foundlo
	and	edx, 80808080h
	jnz	.foundhi
	mov	esi, [eax]		;; third copy
	mov	edi, [eax + 4]
	add	eax, 8
	lea	ecx, [esi + ebx]
	lea	edx, [edi + ebx]
	not	esi
	not	edi
	and	ecx, esi
	and	edx, edi
	and	ecx, 80808080h
	jnz	.foundlo
	and	edx, 80808080h
	jnz	.foundhi
	mov	esi, [eax]		;; fourth copy
	mov	edi, [eax + 4]
	add	eax, 8
	lea	ecx, [esi + ebx]
	lea	edx, [edi + ebx]
	not	esi
	not	edi
	and	ecx, esi
	and	edx, edi
	and	ecx, 80808080h
	jnz	.foundlo
	and	edx, 80808080h
	jz	.scan			;; no zero byte in these 8 bytes: keep scanning
.foundhi:
	bsf	edx, edx		;; bit index of the lowest marker bit
	sub	eax, [esp + 16]		;; offset just past the 8-byte step
	shr	edx, 3			;; bit index -> byte index 0..3
	lea	eax, [eax + edx - 4]	;; back to the high dword, plus the byte index
	pop	edi
	pop	esi
	pop	ebx
	ret	4
.foundlo:
	bsf	ecx, ecx		;; bit index of the lowest marker bit
	sub	eax, [esp + 16]		;; offset just past the 8-byte step
	shr	ecx, 3			;; bit index -> byte index 0..3
	lea	eax, [eax + ecx - 8]	;; back to the low dword, plus the byte index
	pop	edi
	pop	esi
	pop	ebx
	ret	4
 
 
;; makestring: fill strbuf4, strbuf3, strbuf2 and strbuf1 (in that order) with
;; test strings of the requested length by calling fillstring once per buffer.
;; In: [esp + 4] = string length (popped here: ret 4)
;; fillstring removes its own two arguments, so esp is back at its entry value
;; after every call and [esp + 4] keeps addressing the length.
makestring:
	mov	eax, [esp + 4]		;; length
	push	eax
	push	strbuf4
	call	fillstring
	push	dword [esp + 4]		;; al was overwritten by fillstring: re-read the length
	push	strbuf3
	call	fillstring
	push	dword [esp + 4]
	push	strbuf2
	call	fillstring
	push	dword [esp + 4]
	push	strbuf1
	call	fillstring
	ret	4
 
;; fillstring: write a zero-terminated test string into a buffer.
;; In: [esp + 4] = buffer pointer, [esp + 8] = string length (both popped: ret 8)
;; Stores 0 at buffer[length], then walks from the last character down to the
;; first storing 1, 2, 3, ...; after 255 the value restarts at 1, so no zero
;; byte is written inside the string.
;; Writes al, ecx and edx.
fillstring:
	mov	edx, [esp + 4]		;; buffer
	mov	ecx, [esp + 8]		;; length, used as the index
	mov	byte [edx + ecx], 0	;; terminator
	mov	al, 1			;; first fill value
	jmp	.next
.store:
	mov	byte [edx + ecx], al
	inc	al
	jnz	.next
	inc	al			;; wrapped to 0: restart at 1
.next:
	sub	ecx, 1
	jns	.store			;; index still >= 0: another byte to write
	ret	8
 
;; writehex: write a 32-bit value as 8 lowercase hex digits and a newline.
;; In: [esp + 4] = value (popped here: ret 4)
;; Digits come from b2hlut; the 9 output bytes are built in b2hout and passed to
;; int 80h with eax = 4, ebx = 1, ecx = b2hout, edx = 9.
;; ebx, esi and edi are saved and restored; eax, ecx and edx are overwritten.
writehex:
	push	ebx
	push	esi
	push	edi
	mov	esi, [esp + 16]		;; value to print
	mov	edi, b2hout		;; output cursor
	mov	ecx, 8			;; digits left
.digit:
	rol	esi, 4			;; rotate the next most-significant nibble into bits 0-3
	mov	eax, esi
	and	eax, 0fh
	mov	al, byte [b2hlut + eax]
	mov	byte [edi], al
	inc	edi
	dec	ecx
	jnz	.digit
	mov	byte [edi], 10		;; newline after the 8 digits
	;;
	mov	eax, 4
	mov	ebx, 1
	mov	ecx, b2hout
	mov	edx, 9
	int	80h
	pop	edi
	pop	esi
	pop	ebx
	ret	4
 
SYS_EXIT	equ	01h		;; value loaded into eax for the int 80h call below

;; exit: issue int 80h with eax = SYS_EXIT and ebx = 0.
exit:
	xor	ebx, ebx		;; ebx = 0
	mov	eax, SYS_EXIT
	int	80h
				Z2xvYmFsIF9zdGFydAoKCgpzZWN0aW9uIC5kYXRhCgoJdGltZWxvCWRkCTAKCXRpbWVoaQlkZAkwCgliMmhvdXQJZGIJMCwwLDAsMCwwLDAsMCwwLDAKCgoJYjJobHV0CWRiCScwJywnMScsJzInLCczJywnNCcsJzUnLCc2JywnNycsJzgnLCc5JywnYScsJ2InLCdjJywnZCcsJ2UnLCdmJwphbGlnbiA4CglzdHJidWYxCXJlc2IJMjU2CglwYWQxCXJlc2IJMSA7OyBtaXNhbGlnbmVkIGJ5IDEKCXN0cmJ1ZjIJcmVzYgkyNTYKCXBhZDIJcmVzYgkxIDs7IG1pc2FsaWduZWQgYnkgMgoJc3RyYnVmMwlyZXNiCTI1NgoJcGFkMwlyZXNiCTEgOzsgbWlzYWxpZ25lZCBieSAzCglzdHJidWY0CXJlc2IJMjU2CgpzZWN0aW9uIC50ZXh0CgoKX3N0YXJ0OgoKCSBwdXNoCTEwMDAwMAoJIHB1c2gJMjU1CgkgcHVzaAlzdHJsZW5fMwoJY2FsbAliZW5jaG1hcmsKCSBwdXNoCTEwMDAwMAoJIHB1c2gJMjUxCgkgcHVzaAlzdHJsZW5fMwoJY2FsbAliZW5jaG1hcmsKCgkgcHVzaAkxMDAwMDAKCSBwdXNoCTI1NQoJIHB1c2gJc3RybGVuXzQKCWNhbGwJYmVuY2htYXJrCgkgcHVzaAkxMDAwMDAKCSBwdXNoCTI1MQoJIHB1c2gJc3RybGVuXzQKCWNhbGwJYmVuY2htYXJrCgoJY2FsbAlleGl0CgoKCmJlbmNobWFyazogOzsgZnVuY3Rpb24sIHN0cmluZyBsZW5ndGgsIGl0ZXJhdGlvbnMKCXB1c2gJZWJ4CglwdXNoCWVzaQoJbW92CWVieCwgW2VzcCArIDIwXQoJbW92CWVjeCwgW2VzcCArIDE2XQoJbW92CWVzaSwgW2VzcCArIDEyXQoJcHVzaAllY3gKCWNhbGwJbWFrZXN0cmluZwoJcmR0c2MKCW1vdglkd29yZCBbdGltZWxvXSwgZWF4Cgltb3YJZHdvcmQgW3RpbWVoaV0sIGVkeAoubG9vcDoKCXB1c2gJc3RyYnVmMQoJcHVzaAlzdHJidWYyCglwdXNoCXN0cmJ1ZjMKCXB1c2gJc3RyYnVmNAoJY2FsbAllc2kKCWNhbGwJZXNpCgljYWxsCWVzaQoJY2FsbAllc2kKCXN1YgllYngsIDEKCWpuegkubG9vcAoJcmR0c2MKCXN1YgllYXgsIGR3b3JkIFt0aW1lbG9dCglzYmIJZWR4LCBkd29yZCBbdGltZWhpXQoJcHVzaAllYXgKCTs7cHVzaAllZHgKCTs7Y2FsbAl3cml0ZWhleAoJY2FsbAl3cml0ZWhleAkKCXBvcAllc2kKCXBvcAllYngKCXJldAkxMgoKYWxpZ24gMTYKc3RybGVuXzE6CglwdXNoCWVkaQoJbW92CWVjeCwgLTEKCXhvcgllZHgsIGVkeAoJbW92CWVkaSwgW2VzcCArIDhdIDs7IHN0ciBwdHIKCWNsZAoJeG9yCWVheCwgZWF4CglzdWIJZWR4LCBlZGkKCXJlcG5lIHNjYXNiCglsZWEJZWF4LCBbZWRpICsgZWR4IC0gMV0KCXBvcAllZGkKCXJldAk0CgoKYWxpZ24gMTYKc3RybGVuXzI6Cgltb3YJZWF4LCBbZXNwICsgNF0KCWRlYwllYXgKCWptcAkubG9vcAphbGlnbiAxNgoubG9vcDoKCWFkZAllYXgsIDEKCXRlc3QJYnl0ZSBbZWF4XSwgMGZmaAoJam56CS5sb29wCglzdWIJZWF4LCBbZXNwICsgNF0KCXJldAk0CgoKYWxpZ24gMTYKc3RybGVuXzM6CglwdXNoCWVieAoJcHVzaAllc2kKCXB1c2gJZWRpCgltb3YJZWF4LCBbZXNwICsgMTZdCgltb3YJZWJ4
LCAtMDEwMTAxMDFoCgl0ZXN0CWVheCwgMwoJanoJLnNjYW4KCW1vdgllZHgsIFtlYXhdCgl0ZXN0CWRsLCBkbAoJanoJLmZvdW5kCglpbmMJZWF4Cgl0ZXN0CWVheCwgMwoJanoJLnNjYW4KCXRlc3QJZGgsIGRoCglqegkuZm91bmQKCWluYwllYXgKCXNocgllZHgsIDE2Cgl0ZXN0CWVheCwgMwoJanoJLnNjYW4KCXRlc3QJZGwsIGRsCglqegkuZm91bmQKCWluYwllYXgKCWptcAkuc2NhbgphbGlnbiAxNgouc2NhbjoKCW1vdgllc2ksIFtlYXhdCgltb3YJZWRpLCBbZWF4ICsgNF0KCWFkZAllYXgsIDgKCWxlYQllY3gsIFtlc2kgKyBlYnhdCglsZWEJZWR4LCBbZWRpICsgZWJ4XQoJbm90CWVzaQoJbm90CWVkaQoJYW5kCWVjeCwgZXNpCglhbmQJZWR4LCBlZGkKCWFuZAllY3gsIDgwODA4MDgwaAoJam56CS5mb3VuZGxvCglhbmQJZWR4LCA4MDgwODA4MGgKCWpuegkuZm91bmRoaQoJbW92CWVzaSwgW2VheF0KCW1vdgllZGksIFtlYXggKyA0XQoJYWRkCWVheCwgOAoJbGVhCWVjeCwgW2VzaSArIGVieF0KCWxlYQllZHgsIFtlZGkgKyBlYnhdCglub3QJZXNpCglub3QJZWRpCglhbmQJZWN4LCBlc2kKCWFuZAllZHgsIGVkaQoJYW5kCWVjeCwgODA4MDgwODBoCglqbnoJLmZvdW5kbG8KCWFuZAllZHgsIDgwODA4MDgwaAoJanoJLnNjYW4JCi5mb3VuZGhpOgoJYnNmCWVkeCwgZWR4CglzdWIJZWF4LCBbZXNwICsgMTZdCglzaHIJZWR4LCAzCglsZWEJZWF4LCBbZWF4ICsgZWR4IC0gNF0KCXBvcAllZGkKCXBvcAllc2kKCXBvcAllYngKCXJldAk0Ci5mb3VuZGxvOgoJYnNmCWVjeCwgZWN4CglzdWIJZWF4LCBbZXNwICsgMTZdCglzaHIJZWN4LCAzCglsZWEJZWF4LCBbZWF4ICsgZWN4IC0gOF0KCXBvcAllZGkKCXBvcAllc2kKCXBvcAllYngKCXJldAk0Ci5mb3VuZDoKCXN1YgllYXgsIFtlc3AgKyAxNl0KCXBvcAllZGkKCXBvcAllc2kKCXBvcAllYngKCXJldAk0CgoKc3RybGVuXzQ6CglwdXNoCWVieAoJcHVzaAllc2kKCXB1c2gJZWRpCgltb3YJZWF4LCBbZXNwICsgMTZdCgltb3YJZWJ4LCAtMDEwMTAxMDFoCgl0ZXN0CWVheCwgMwoJanoJLnNjYW4KCW1vdgllZHgsIFtlYXhdCgl0ZXN0CWRsLCBkbAoJanoJLmZvdW5kCglpbmMJZWF4Cgl0ZXN0CWVheCwgMwoJanoJLnNjYW4KCXRlc3QJZGgsIGRoCglqegkuZm91bmQKCWluYwllYXgKCXNocgllZHgsIDE2Cgl0ZXN0CWVheCwgMwoJanoJLnNjYW4KCXRlc3QJZGwsIGRsCglqegkuZm91bmQKCWluYwllYXgKCWptcAkuc2NhbgouZm91bmQ6CglzdWIJZWF4LCBbZXNwICsgMTZdCglwb3AJZWRpCglwb3AJZXNpCglwb3AJZWJ4CglyZXQJNAkKYWxpZ24gMTYKLnNjYW46Cgltb3YJZXNpLCBbZWF4XQoJbW92CWVkaSwgW2VheCArIDRdCglhZGQJZWF4LCA4CglsZWEJZWN4LCBbZXNpICsgZWJ4XQoJbGVhCWVkeCwgW2VkaSArIGVieF0KCW5vdAllc2kKCW5vdAllZGkKCWFuZAllY3gsIGVzaQoJYW5kCWVkeCwgZWRpCglhbmQJZWN4LCA4MDgwODA4MGgKCWpuegkuZm91bmRsbwoJYW5kCWVkeCwgODA4
MDgwODBoCglqbnoJLmZvdW5kaGkKCW1vdgllc2ksIFtlYXhdCgltb3YJZWRpLCBbZWF4ICsgNF0KCWFkZAllYXgsIDgKCWxlYQllY3gsIFtlc2kgKyBlYnhdCglsZWEJZWR4LCBbZWRpICsgZWJ4XQoJbm90CWVzaQoJbm90CWVkaQoJYW5kCWVjeCwgZXNpCglhbmQJZWR4LCBlZGkKCWFuZAllY3gsIDgwODA4MDgwaAoJam56CS5mb3VuZGxvCglhbmQJZWR4LCA4MDgwODA4MGgKCWpuegkuZm91bmRoaQoJbW92CWVzaSwgW2VheF0KCW1vdgllZGksIFtlYXggKyA0XQoJYWRkCWVheCwgOAoJbGVhCWVjeCwgW2VzaSArIGVieF0KCWxlYQllZHgsIFtlZGkgKyBlYnhdCglub3QJZXNpCglub3QJZWRpCglhbmQJZWN4LCBlc2kKCWFuZAllZHgsIGVkaQoJYW5kCWVjeCwgODA4MDgwODBoCglqbnoJLmZvdW5kbG8KCWFuZAllZHgsIDgwODA4MDgwaAoJam56CS5mb3VuZGhpCgltb3YJZXNpLCBbZWF4XQoJbW92CWVkaSwgW2VheCArIDRdCglhZGQJZWF4LCA4CglsZWEJZWN4LCBbZXNpICsgZWJ4XQoJbGVhCWVkeCwgW2VkaSArIGVieF0KCW5vdAllc2kKCW5vdAllZGkKCWFuZAllY3gsIGVzaQoJYW5kCWVkeCwgZWRpCglhbmQJZWN4LCA4MDgwODA4MGgKCWpuegkuZm91bmRsbwoJYW5kCWVkeCwgODA4MDgwODBoCglqegkuc2NhbgkJCi5mb3VuZGhpOgoJYnNmCWVkeCwgZWR4CglzdWIJZWF4LCBbZXNwICsgMTZdCglzaHIJZWR4LCAzCglsZWEJZWF4LCBbZWF4ICsgZWR4IC0gNF0KCXBvcAllZGkKCXBvcAllc2kKCXBvcAllYngKCXJldAk0Ci5mb3VuZGxvOgoJYnNmCWVjeCwgZWN4CglzdWIJZWF4LCBbZXNwICsgMTZdCglzaHIJZWN4LCAzCglsZWEJZWF4LCBbZWF4ICsgZWN4IC0gOF0KCXBvcAllZGkKCXBvcAllc2kKCXBvcAllYngKCXJldAk0CgoJCm1ha2VzdHJpbmc6Cgltb3YJZWF4LCBbZXNwICsgNF0gOzsgc2l6ZQoJcHVzaAllYXgKCXB1c2gJc3RyYnVmMQoJcHVzaAllYXgKCXB1c2gJc3RyYnVmMgoJcHVzaAllYXgKCXB1c2gJc3RyYnVmMwoJcHVzaAllYXgKCXB1c2gJc3RyYnVmNAoJY2FsbAlmaWxsc3RyaW5nCgljYWxsCWZpbGxzdHJpbmcKCWNhbGwJZmlsbHN0cmluZwoJY2FsbAlmaWxsc3RyaW5nCglyZXQJNAoKZmlsbHN0cmluZzoKCW1vdglhbCwgMQoJbW92CWVkeCwgW2VzcCArIDRdIDs7IHB0cgoJbW92CWVjeCwgW2VzcCArIDhdIDs7IHNpemUKCW1vdglieXRlIFtlZHggKyBlY3hdLCAwCi5sb29wOgoJZGVjCWVjeAoJanMJLmRvbmUKCW1vdglieXRlIFtlZHggKyBlY3hdLCBhbAoJYWRkCWFsLCAxCglqbnoJLnNraXAKCW1vdglhbCwgMQouc2tpcDoKCWptcAkubG9vcAouZG9uZToKCXJldAk4Cgp3cml0ZWhleDoKCXB1c2gJZWJ4CglwdXNoCWVzaQoJcHVzaAllZGkKCW1vdgllYXgsIFtlc3AgKyAxNl0gOzsgYmluYXJ5IHZhbHVlCgltb3YJZWN4LCAzMiAtIDQKCW1vdgllc2ksIGIyaGx1dAoJbW92CWVkaSwgYjJob3V0Ci5sb29wOgoJbW92CWVkeCwgZWF4CglzaHIJZWR4LCBjbAoJYW5kCWVkeCwgMGZoIAoJbW92CWRsLCBieXRlIFtl
c2kgKyBlZHhdCgltb3YJYnl0ZSBbZWRpXSwgZGwKCWFkZAllZGksIDEKCXN1YgllY3gsIDQKCWpucwkubG9vcAoJbW92CWJ5dGUgW2VkaV0sIDEwCgk7OwoJbW92CWVheCwgNAoJbW92CWVieCwgMQoJbW92CWVjeCwgYjJob3V0Cgltb3YJZWR4LCA5CglpbnQJODBoCglwb3AJZWRpCglwb3AJZXNpCglwb3AJZWJ4CglyZXQJNAoKZXhpdDoKCgltb3YJZWF4LCAwMWgKCXhvcgllYngsIGVieAoJaW50CTgwaA==