global _start
 
section .data
	;; rdtsc value captured by benchmark just before its timed loop
	timelo	dd	0			;; low dword (eax)
	timehi	dd	0			;; high dword (edx)
	;; writehex output buffer: 8 hex digits followed by one newline byte
	b2hout	db	0,0,0,0,0,0,0,0,0
	;; writehex lookup table: nibble value -> lowercase ASCII hex digit
	b2hlut	db	'0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f'
align 8
	;; strlen_mmx: entry k has its low k bytes set to 0xff.  It is OR-ed into the
	;; first aligned qword so that the k bytes in front of the string start can
	;; never compare equal to zero.
	STRINGTBL	dq	0, 0xff, 0xffff, 0xffffff, 0xffffffff, 0xffffffffff, 0xffffffffffff, 0xffffffffffffff
align 8
	;; strlen_sse: entry k keeps the low (16 - k) bits of the first pmovmskb
	;; result, i.e. the bytes in front of the next 16-byte boundary.
	STRLENMASK	dd	0x0000ffff, 0x00007fff, 0x00003fff, 0x00001fff, 0x00000fff, 0x000007ff, 0x000003ff, 0x000001ff, 0x000000ff, 0x0000007f, 0x0000003f, 0x0000001f, 0x0000000f, 0x00000007, 0x00000003, 0x00000001
align 16
	;; 32 nonzero bytes plus a terminator (not referenced by the code in this file)
	teststr db 0x01, 0x05, 0x09, 0x0d, 0x11, 0x15, 0x19, 0x1d, 0xf1, 0xf5, 0xf9, 0xfd, 0x81, 0x85, 0x89, 0x8d, 0x01, 0x05, 0x09, 0x0d, 0x11, 0x15, 0x19, 0x1d, 0xf1, 0xf5, 0xf9, 0xfd, 0x81, 0x85, 0x89, 0x8d, 0

;; The string buffers are uninitialized storage, so they live in .bss: `resb`
;; inside .data makes NASM warn and emit the zero bytes into the object file.
;; Each 1-byte pad shifts the following buffer one more byte off 16-byte
;; alignment, and also receives the terminator of a full 1024-byte string.
section .bss align=16
alignb 16
	strbuf1	resb	1024
	pad1	resb	1 ;; strbuf2 is misaligned by 1
	strbuf2	resb	1024
	pad2	resb	1 ;; strbuf3 is misaligned by 2
	strbuf3	resb	1024
	pad3	resb	1 ;; strbuf4 is misaligned by 3
	strbuf4	resb	1024
	pad4	resb	64 ;; slack behind the last buffer for block-wise reads
 
section .text
	;; zero-terminated table of the strlen routines to benchmark
	FUNCS	dd	strlen_sse, strlen_mmx, strlen_opt, 0

;; Program entry point.
;; For every string length 1024, 512, ..., 1: print the length with writehex,
;; then call benchmark(function, length, iterations) for each entry of FUNCS.
;; Ends by calling exit.
;; Registers: ebp = iteration count, ebx = current length, esi = cursor in FUNCS
;; (ebx and esi are saved/restored by writehex and benchmark).
_start:
	mov	ebp, 100000 ;; iterations
	mov	ebx, 1024 ;; length
.loop:
	push	ebx
	call	writehex		;; callee removes its argument (ret 4)
	mov	esi, FUNCS
.next:
	push	ebp			;; arg 3: iterations
	push	ebx			;; arg 2: string length
	mov	eax, [esi]
	push	eax			;; arg 1: routine under test
	call	benchmark		;; callee removes all three arguments (ret 12)
	add	esi, 4
	mov	eax, [esi]
	test	eax, eax		;; a zero entry ends the table
	jnz	.next
	shr	ebx, 1			;; halve the length; stop once it reaches 0
	jnz	.loop

	call	exit
 
 
 
;; benchmark(function, string length, iterations)
;; Arguments are passed on the stack and removed by this routine (ret 12).
;; Fills the four string buffers via makestring, then calls `function` on each
;; buffer once per iteration and prints the low 32 bits of the elapsed rdtsc
;; count with writehex.  ebx and esi are preserved.
benchmark:
	push	ebx
	push	esi
	mov	esi, [esp + 12]		;; esi = routine under test
	mov	ecx, [esp + 16]		;; ecx = string length
	mov	ebx, [esp + 20]		;; ebx = iterations left
	push	ecx
	call	makestring
	rdtsc				;; start timestamp -> timehi:timelo
	mov	[timelo], eax
	mov	[timehi], edx
.again:
	;; queue one argument per call; every call pops its own argument
	push	strbuf1
	push	strbuf2
	push	strbuf3
	push	strbuf4
%rep 4
	call	esi
%endrep
	sub	ebx, 1
	jnz	.again
	rdtsc
	sub	eax, [timelo]		;; edx:eax = elapsed count
	sbb	edx, [timehi]
	push	eax			;; only the low dword is printed; edx is not used
	call	writehex
	pop	esi
	pop	ebx
	ret	12
 
 
align 16
;; strlen_sse(s) -> eax = number of bytes in front of the first zero byte.
;; The argument is on the stack and removed by this routine (ret 4).
;; Clobbers ecx, edx, xmm0, xmm1 and flags.
;;
;; Head: one unaligned 16-byte compare at s, restricted by STRLENMASK to the
;; bytes in front of the next 16-byte boundary.  Tail: aligned 16-byte blocks.
;; NOTE(review): the head load always reads 16 bytes starting at s, so up to 15
;; bytes behind a short string's terminator are touched -- this assumes they are
;; readable; confirm for callers other than the benchmark buffers.
strlen_sse:
	mov	ecx, [esp + 4]
	pxor	xmm1, xmm1		;; xmm1 = 16 zero bytes to compare against
	movdqu	xmm0, [ecx]
	mov	edx, ecx
	pcmpeqb	xmm0, xmm1
	and	edx, 15			;; edx = s & 15, index into STRLENMASK
	add	ecx, 16
	pmovmskb eax, xmm0		;; bit i set <=> s[i] == 0
	and	ecx, -16		;; ecx = first 16-byte boundary above s
	and eax, [STRLENMASK + edx * 4]
	jz .scan
	;; The load started at s itself, so the bit index already is the length.
	;; (The previous `ecx - s + index - 16` form subtracted s & 15 from it and
	;; was wrong for every misaligned string that ends inside the head.)
	bsf	eax, eax
	ret	4
align 16
.scan:
	movdqa	xmm0, [ecx]
	pcmpeqb	xmm0, xmm1
	add	ecx, 16			;; ecx now points one block past the one tested
	pmovmskb	eax, xmm0
	test	eax, eax
	jz	.scan
	bsf	eax, eax		;; index of the zero byte inside the block
	sub	ecx, [esp + 4]
	lea	eax, [ecx + eax - 16]	;; (block start - s) + index
	ret 4
 
align 16
;; strlen_mmx(s) -> eax = number of bytes in front of the first zero byte.
;; The argument is on the stack and removed by this routine (ret 4).
;; Works on aligned 8-byte blocks in MMX registers; executes emms before
;; returning.  Clobbers ecx, edx, mm0, mm1 and flags.
;; Note: the loop loads the following qword before it tests the current one,
;; so the 8 bytes after the block holding the terminator are read as well.
strlen_mmx:
	mov     eax, [esp + 4] 		;; eax = s
	pxor     mm1, mm1 		;; mm1 = 8 zero bytes to compare against
	mov      ecx, eax 
	mov      edx, eax 		;; edx keeps s for the final subtraction
	and      ecx, -8 		;; ecx = aligned qword containing s
	and      eax, 7 		;; eax = s & 7, index into STRINGTBL
	movq     mm0, [ecx] 
	por      mm0, [STRINGTBL+eax*8]	;; force the bytes in front of s to 0xff (nonzero)
	jmp	.scan			;; skip the alignment padding
align 16
.scan: 
	add      ecx, 8 		;; ecx = qword after the one held in mm0
	pcmpeqb  mm0, mm1 		;; 0xff in every byte that was zero
	packsswb mm0, mm0 		;; signed-saturate each word to a byte: still nonzero if either byte matched
	movd     eax, mm0 		;; eax = one result byte per pair of string bytes
	movq     mm0, [ecx] 		;; load the next qword early
	test     eax, eax 
	jz       .scan
	;; A result byte is 0x7f/0xff when the even byte of its pair matched (lowest
	;; set bit 8j) and 0x80 when only the odd byte matched (lowest set bit
	;; 8j + 7), so (bit index >> 2) is the byte index inside the qword.
	bsf      eax, eax 
	shr      eax, 2 
	lea      eax, [ecx+eax-8] 	;; address of the zero byte (ecx is one qword ahead)
	sub      eax, edx 		;; minus s = length
	emms
	ret 4
 
align 16
;; strlen_opt(s) -> eax = number of bytes in front of the first zero byte.
;; Integer-only version.  The argument is on the stack and removed by this
;; routine (ret 4); ebx, esi and edi are saved and restored.
;; Clobbers ecx, edx and flags.
;;
;; Up to three leading bytes are tested one at a time until the pointer is
;; 4-byte aligned.  After that two dwords are tested per step, four steps per
;; loop pass, with (x - 01010101h) & ~x & 80808080h, which is nonzero exactly
;; when x contains a zero byte; its lowest set bit marks the first one.

%define OPT_ARG_S	esp + 16	;; address of the argument after the three pushes
%define OPT_ONES	01010101h
%define OPT_HIGHBITS	80808080h

;; Test the 8 bytes at eax and advance eax by 8.
;; Leaves through .foundlo when the low dword holds a zero byte; otherwise
;; falls through with ZF clear exactly when the high dword holds one (edx = its mask).
%macro OPT_PROBE8 0
	mov	esi, [eax]
	mov	edi, [eax + 4]
	add	eax, 8
	lea	ecx, [esi + ebx]
	lea	edx, [edi + ebx]
	not	esi
	not	edi
	and	ecx, esi
	and	edx, edi
	and	ecx, OPT_HIGHBITS
	jnz	.foundlo
	and	edx, OPT_HIGHBITS
%endmacro

;; Common epilogue: restore the saved registers and drop the argument.
%macro OPT_RETURN 0
	pop	edi
	pop	esi
	pop	ebx
	ret	4
%endmacro

strlen_opt:
	push	ebx
	push	esi
	push	edi
	mov	eax, [OPT_ARG_S]
	mov	ebx, -OPT_ONES		;; adding ebx subtracts 1 from every byte
	test	eax, 3
	jz	.scan
	mov	edx, [eax]		;; the (up to three) unaligned leading bytes
	test	dl, dl			;; byte 0
	jz	.found
	inc	eax
	test	eax, 3
	jz	.scan
	test	dh, dh			;; byte 1
	jz	.found
	inc	eax
	shr	edx, 16
	test	eax, 3
	jz	.scan
	test	dl, dl			;; byte 2
	jz	.found
	inc	eax
	jmp	.scan
.found:
	sub	eax, [OPT_ARG_S]	;; eax points at the zero byte
	OPT_RETURN
align 16
.scan:
%rep 3
	OPT_PROBE8
	jnz	.foundhi
%endrep
	OPT_PROBE8
	jz	.scan
.foundhi:
	bsf	edx, edx		;; bit 7/15/23/31 of the first zero byte
	sub	eax, [OPT_ARG_S]
	shr	edx, 3			;; -> byte index 0..3
	lea	eax, [eax + edx - 4]	;; eax was 4 past the start of the high dword
	OPT_RETURN
.foundlo:
	bsf	ecx, ecx
	sub	eax, [OPT_ARG_S]
	shr	ecx, 3
	lea	eax, [eax + ecx - 8]	;; eax was 8 past the start of the low dword
	OPT_RETURN
 
 
;; makestring(size)
;; Fills strbuf1..strbuf4 with a string of `size` bytes each by calling
;; fillstring once per buffer.  The argument is on the stack and removed by
;; this routine (ret 4).

;; Queue one (ptr, size) argument pair for fillstring; the size is taken from eax.
%macro MAKESTRING_QUEUE 1
	push	eax
	push	%1
%endmacro

makestring:
	mov	eax, [esp + 4] ;; size
	;; All four pairs are queued first, because fillstring overwrites al.
	;; The pair pushed last (strbuf4) is consumed by the first call.
	MAKESTRING_QUEUE strbuf1
	MAKESTRING_QUEUE strbuf2
	MAKESTRING_QUEUE strbuf3
	MAKESTRING_QUEUE strbuf4
%rep 4
	call	fillstring		;; pops its own pair (ret 8)
%endrep
	ret	4
 
;; fillstring(ptr, size)
;; Stores a zero byte at ptr[size], then fills ptr[size-1] down to ptr[0] with
;; the values 1, 2, ..., 255, 1, 2, ... so that no zero byte appears in front of
;; the terminator.  Both arguments are on the stack and removed by this routine
;; (ret 8).  Clobbers al, ecx, edx and flags.
fillstring:
	mov	edx, [esp + 4] ;; ptr
	mov	ecx, [esp + 8] ;; size
	mov	al, 1
	mov	byte [edx + ecx], 0	;; terminator
	jmp	.step
.store:
	mov	[edx + ecx], al
	add	al, 1
	jnz	.step
	mov	al, 1			;; wrapped from 255 to 0: restart at 1
.step:
	dec	ecx
	jns	.store			;; continue while the index is still >= 0
	ret	8
 
;; writehex(value)
;; Formats the 32-bit argument as 8 lowercase hex digits plus a newline in
;; b2hout and writes those 9 bytes to file descriptor 1 with the int 80h write
;; call.  The argument is on the stack and removed by this routine (ret 4).
;; ebx is saved and restored; esi and edi are not touched.

%define WH_DIGITS	8		;; hex digits in a dword
%define WH_NEWLINE	10
%define WH_SYS_WRITE	4		;; int 80h call number placed in eax
%define WH_STDOUT	1

writehex:
	push	ebx
	mov	eax, [esp + 8] ;; binary value
	xor	ecx, ecx		;; ecx = index of the next output digit
.digit:
	rol	eax, 4			;; bring the next most significant nibble to bits 0..3
	mov	edx, eax
	and	edx, 0fh
	mov	dl, [b2hlut + edx]
	mov	[b2hout + ecx], dl
	inc	ecx
	cmp	ecx, WH_DIGITS
	jb	.digit
	mov	byte [b2hout + WH_DIGITS], WH_NEWLINE
	;;
	mov	eax, WH_SYS_WRITE
	mov	ebx, WH_STDOUT
	mov	ecx, b2hout
	mov	edx, WH_DIGITS + 1
	int	80h
	pop	ebx
	ret	4
 
;; exit: ends the process with status 0 through the Linux int 80h interface
;; (call number 1 in eax, status in ebx).  Does not return.
;; A base64-encoded duplicate of this source file that followed this routine
;; was removed: it is not NASM input and prevented the file from assembling.
exit:

	mov	eax, 01h		;; call number 1: exit
	xor	ebx, ebx		;; status = 0
	int	80h