global _start



section .data

	;; time-stamp counter snapshot (low/high dword) taken by benchmark
	;; just before its timed loop
	timelo	dd	0
	timehi	dd	0
	;; writehex output buffer: 8 hex digits followed by a newline
	b2hout	db	0,0,0,0,0,0,0,0,0


	;; nibble value -> lowercase hex digit
	b2hlut	db	'0123456789abcdef'

;; The string buffers carry no initial contents, so they are reserved in
;; .bss rather than .data: resb inside an initialized section makes NASM
;; warn and emit the bytes as zeros into the output file.
section .bss

alignb 8
	;; Each buffer is 256 bytes. The one-byte pads shift every following
	;; buffer one byte further off the 8-byte boundary, so the benchmark
	;; exercises four different start alignments.
	strbuf1	resb	256	;; 8-byte aligned
	pad1	resb	1
	strbuf2	resb	256	;; misaligned by 1
	pad2	resb	1
	strbuf3	resb	256	;; misaligned by 2
	pad3	resb	1
	strbuf4	resb	256	;; misaligned by 3

section .text


;; Program entry point.
;; Times strlen_3 and then strlen_4, each with string lengths 255 and 251,
;; printing one hex line per run, then terminates the process.

%define BENCH_ITERATIONS 100000

;; run_benchmark function, string_length
;; Pushes benchmark's three arguments (iterations first, function last)
;; and calls it; benchmark removes the arguments itself.
%macro run_benchmark 2
	push	BENCH_ITERATIONS
	push	%2
	push	%1
	call	benchmark
%endmacro

_start:

	run_benchmark strlen_3, 255
	run_benchmark strlen_3, 251

	run_benchmark strlen_4, 255
	run_benchmark strlen_4, 251

	call	exit



;; benchmark(function, string length, iterations)
;;
;; Fills the four string buffers with strings of the requested length, then
;; times `iterations` rounds of four calls to `function` (one per buffer)
;; with rdtsc and prints the low 32 bits of the elapsed tick count as hex.
;;
;; Stack arguments, as seen by the caller (last pushed first):
;;   function    - routine to time; it receives one pointer on the stack and
;;                 must pop it on return and preserve ebx and esi, because
;;                 the loop counter and the function pointer live there
;;   length      - string length handed to makestring
;;   iterations  - number of rounds; the loop tests after decrementing, so
;;                 passing 0 wraps around instead of running zero rounds
;; The three arguments are removed on return (ret 12).
benchmark: ;; function, string length, iterations
	push	ebx
	push	esi
	;; two saved registers + return address = 12 bytes below the arguments
	mov	ebx, [esp + 20]		;; ebx = iterations (loop counter)
	mov	ecx, [esp + 16]		;; ecx = string length
	mov	esi, [esp + 12]		;; esi = function under test
	push	ecx
	call	makestring		;; makestring pops its own argument
	;; start timestamp
	rdtsc
	mov	dword [timelo], eax
	mov	dword [timehi], edx
.loop:
	;; queue one argument per call; every call consumes the top one, so
	;; the buffers are visited in the order strbuf4, 3, 2, 1
	push	strbuf1
	push	strbuf2
	push	strbuf3
	push	strbuf4
	call	esi
	call	esi
	call	esi
	call	esi
	sub	ebx, 1
	jnz	.loop
	;; end timestamp; edx:eax = elapsed ticks as a 64-bit difference
	rdtsc
	sub	eax, dword [timelo]
	sbb	edx, dword [timehi]
	push	eax
	;; the high dword is computed but deliberately not printed:
	;;push	edx
	;;call	writehex
	call	writehex	
	pop	esi
	pop	ebx
	ret	12

;; strlen_1(s) -- string length via the repne scasb string instruction.
;; In:   [esp + 4] = pointer to a zero-terminated string
;; Out:  eax = number of bytes before the terminator
;; The argument is popped on return; edi is preserved, ecx and edx are not.
align 16
strlen_1:
	push	edi
	mov	edi, [esp + 8]		;; edi = s (scan cursor)
	mov	edx, edi
	neg	edx			;; edx = -s, turns the end cursor into a length
	or	ecx, -1			;; no practical limit on the scan count
	sub	eax, eax		;; al = 0, the byte to search for
	cld				;; scan upwards
	repne scasb			;; stops with edi one past the terminator
	lea	eax, [edi + edx - 1]	;; (edi - s) - 1 = length
	pop	edi
	ret	4


;; strlen_2(s) -- straightforward byte-at-a-time string length.
;; In:   [esp + 4] = pointer to a zero-terminated string
;; Out:  eax = number of bytes before the terminator
;; The argument is popped on return; only eax is modified.
align 16
strlen_2:
	mov	eax, [esp + 4]
	sub	eax, 1			;; pre-decrement: the loop increments first
	jmp	.next			;; hop over the alignment padding
align 16
.next:
	inc	eax
	cmp	byte [eax], 0
	jne	.next
	sub	eax, [esp + 4]		;; terminator address - s = length
	ret	4


;; strlen_3(s) -- string length, scanning two dwords (8 bytes) per step,
;; with the scan loop unrolled twice.
;; In:   [esp + 4] = pointer to a zero-terminated string
;; Out:  eax = number of bytes before the terminator
;; The argument is popped on return; ebx, esi and edi are preserved,
;; ecx and edx are not.
;;
;; The leading bytes are checked one at a time until the cursor is 8-byte
;; aligned. Every 8-byte read in the scan loop then lies inside a single
;; aligned 8-byte block, so a read can run past the terminator but never
;; into a page the string does not occupy. (Starting the wide reads on a
;; cursor that is only 4-byte aligned, or peeking at the head with a
;; misaligned dword load, can touch the following page and fault.)
;;
;; Zero-byte test for a dword x: (x - 01010101h) & ~x & 80808080h is
;; non-zero exactly when x contains a zero byte, and its lowest set bit is
;; bit 7 of the first zero byte.
align 16
strlen_3:
	push	ebx
	push	esi
	push	edi
	mov	eax, [esp + 16]		;; eax = read cursor, starts at s
	mov	ebx, -01010101h		;; adding ebx subtracts 1 from every byte lane
.head:
	test	eax, 7
	jz	.scan			;; cursor is 8-byte aligned
	cmp	byte [eax], 0
	je	.found
	inc	eax
	jmp	.head
align 16
.scan:
	;; first 8-byte pair
	mov	esi, [eax]
	mov	edi, [eax + 4]
	add	eax, 8
	lea	ecx, [esi + ebx]
	lea	edx, [edi + ebx]
	not	esi
	not	edi
	and	ecx, esi
	and	edx, edi
	and	ecx, 80808080h
	jnz	.foundlo
	and	edx, 80808080h
	jnz	.foundhi
	;; second 8-byte pair
	mov	esi, [eax]
	mov	edi, [eax + 4]
	add	eax, 8
	lea	ecx, [esi + ebx]
	lea	edx, [edi + ebx]
	not	esi
	not	edi
	and	ecx, esi
	and	edx, edi
	and	ecx, 80808080h
	jnz	.foundlo
	and	edx, 80808080h
	jz	.scan	
.foundhi:
	;; terminator is in the high dword, which starts at cursor - 4
	bsf	edx, edx		;; bit index 7/15/23/31 ...
	sub	eax, [esp + 16]
	shr	edx, 3			;; ... becomes byte index 0..3
	lea	eax, [eax + edx - 4]
	pop	edi
	pop	esi
	pop	ebx
	ret	4
.foundlo:
	;; terminator is in the low dword, which starts at cursor - 8
	bsf	ecx, ecx
	sub	eax, [esp + 16]
	shr	ecx, 3
	lea	eax, [eax + ecx - 8]
	pop	edi
	pop	esi
	pop	ebx
	ret	4
.found:
	;; terminator found by the byte-wise head loop; cursor points at it
	sub	eax, [esp + 16]
	pop	edi
	pop	esi
	pop	ebx
	ret	4


;; strlen_4(s) -- string length, scanning two dwords (8 bytes) per step,
;; with the scan loop unrolled four times.
;; In:   [esp + 4] = pointer to a zero-terminated string
;; Out:  eax = number of bytes before the terminator
;; The argument is popped on return; ebx, esi and edi are preserved,
;; ecx and edx are not.
;;
;; The leading bytes are checked one at a time until the cursor is 8-byte
;; aligned. Every 8-byte read in the scan loop then lies inside a single
;; aligned 8-byte block, and each pair is tested before the next one is
;; loaded, so a read can run past the terminator but never into a page the
;; string does not occupy. (Starting the wide reads on a cursor that is
;; only 4-byte aligned, or peeking at the head with a misaligned dword
;; load, can touch the following page and fault.)
;;
;; Zero-byte test for a dword x: (x - 01010101h) & ~x & 80808080h is
;; non-zero exactly when x contains a zero byte, and its lowest set bit is
;; bit 7 of the first zero byte.
;;
;; The entry point is aligned to 16 bytes like the other strlen variants.
align 16
strlen_4:
	push	ebx
	push	esi
	push	edi
	mov	eax, [esp + 16]		;; eax = read cursor, starts at s
	mov	ebx, -01010101h		;; adding ebx subtracts 1 from every byte lane
.head:
	test	eax, 7
	jz	.scan			;; cursor is 8-byte aligned
	cmp	byte [eax], 0
	je	.found
	inc	eax
	jmp	.head
.found:
	;; terminator found by the byte-wise head loop; cursor points at it
	sub	eax, [esp + 16]
	pop	edi
	pop	esi
	pop	ebx
	ret	4	
align 16
.scan:
	;; first 8-byte pair
	mov	esi, [eax]
	mov	edi, [eax + 4]
	add	eax, 8
	lea	ecx, [esi + ebx]
	lea	edx, [edi + ebx]
	not	esi
	not	edi
	and	ecx, esi
	and	edx, edi
	and	ecx, 80808080h
	jnz	.foundlo
	and	edx, 80808080h
	jnz	.foundhi
	;; second 8-byte pair
	mov	esi, [eax]
	mov	edi, [eax + 4]
	add	eax, 8
	lea	ecx, [esi + ebx]
	lea	edx, [edi + ebx]
	not	esi
	not	edi
	and	ecx, esi
	and	edx, edi
	and	ecx, 80808080h
	jnz	.foundlo
	and	edx, 80808080h
	jnz	.foundhi
	;; third 8-byte pair
	mov	esi, [eax]
	mov	edi, [eax + 4]
	add	eax, 8
	lea	ecx, [esi + ebx]
	lea	edx, [edi + ebx]
	not	esi
	not	edi
	and	ecx, esi
	and	edx, edi
	and	ecx, 80808080h
	jnz	.foundlo
	and	edx, 80808080h
	jnz	.foundhi
	;; fourth 8-byte pair
	mov	esi, [eax]
	mov	edi, [eax + 4]
	add	eax, 8
	lea	ecx, [esi + ebx]
	lea	edx, [edi + ebx]
	not	esi
	not	edi
	and	ecx, esi
	and	edx, edi
	and	ecx, 80808080h
	jnz	.foundlo
	and	edx, 80808080h
	jz	.scan		
.foundhi:
	;; terminator is in the high dword, which starts at cursor - 4
	bsf	edx, edx		;; bit index 7/15/23/31 ...
	sub	eax, [esp + 16]
	shr	edx, 3			;; ... becomes byte index 0..3
	lea	eax, [eax + edx - 4]
	pop	edi
	pop	esi
	pop	ebx
	ret	4
.foundlo:
	;; terminator is in the low dword, which starts at cursor - 8
	bsf	ecx, ecx
	sub	eax, [esp + 16]
	shr	ecx, 3
	lea	eax, [eax + ecx - 8]
	pop	edi
	pop	esi
	pop	ebx
	ret	4

	
;; makestring(size)
;; Fills strbuf1..strbuf4 with a zero-terminated string of `size` bytes each
;; by calling fillstring once per buffer.
;; In:   [esp + 4] = size
;; The argument is popped on return.

;; queue_fill buffer
;; Pushes one (size, pointer) argument pair for fillstring; size is in eax.
%macro queue_fill 1
	push	eax
	push	%1
%endmacro

makestring:
	mov	eax, [esp + 4]		;; eax = size
	;; All four argument pairs are queued before the first call because
	;; fillstring overwrites al. Each call pops the pair on top, so the
	;; buffers are filled in the order strbuf4, 3, 2, 1.
	queue_fill strbuf1
	queue_fill strbuf2
	queue_fill strbuf3
	queue_fill strbuf4
%rep 4
	call	fillstring
%endrep
	ret	4

;; fillstring(ptr, size)
;; Writes a zero terminator at ptr[size], then fills ptr[size-1] down to
;; ptr[0] with the byte values 1, 2, ..., 255, 1, 2, ... (never zero).
;; In:   [esp + 4] = ptr, [esp + 8] = size
;; Both arguments are popped on return; al, ecx and edx are modified.
fillstring:
	mov	edx, [esp + 4]		;; edx = ptr
	mov	ecx, [esp + 8]		;; ecx = size, then index of the next byte
	mov	al, 1			;; al = next fill value
	mov	byte [edx + ecx], 0	;; terminator
	jmp	.step
.store:
	mov	[edx + ecx], al
	inc	al
	jnz	.step
	inc	al			;; wrapped to 0: skip it, restart at 1
.step:
	sub	ecx, 1
	jns	.store			;; index still >= 0
	ret	8

;; writehex(value)
;; Formats a 32-bit value as 8 lowercase hex digits plus a newline in
;; b2hout and writes those 9 bytes to file descriptor 1 with the int 80h
;; write call (eax = 4).
;; In:   [esp + 4] = value
;; Out:  eax = whatever the system call returned
;; The argument is popped on return; ebx and edi are preserved.
writehex:
	push	ebx
	push	edi
	mov	eax, [esp + 12]		;; eax = value
	mov	edi, b2hout		;; edi = output cursor
	mov	ecx, 8			;; digits left to emit
.digit:
	rol	eax, 4			;; bring the next most significant nibble to the bottom
	mov	edx, eax
	and	edx, 0fh
	mov	dl, [b2hlut + edx]	;; nibble -> hex character
	mov	[edi], dl
	inc	edi
	dec	ecx
	jnz	.digit
	mov	byte [edi], 10		;; trailing newline
	;; write(1, b2hout, 9)
	mov	edx, 9
	mov	ecx, b2hout
	mov	ebx, 1
	mov	eax, 4
	int	80h
	pop	edi
	pop	ebx
	ret	4

;; exit -- terminates the process with status 0 through int 80h (eax = 1).
;; Takes no arguments.
SYS_EXIT	equ	1

exit:
	xor	ebx, ebx		;; status = 0
	mov	eax, SYS_EXIT
	int	80h