fork download
  ;; ----------------------------------------------------------------------
  ;; Static data for the strlen benchmark (32-bit x86, NASM syntax).
  ;;
  ;; Initialised tables and scratch words live in .data; the large string
  ;; buffers are reserved in .bss so they take no room in the object file
  ;; (reserving with resb inside .data makes NASM warn and emit zeros).
  ;; ----------------------------------------------------------------------
  global _start

  section .data
  timelo:         dd 0                    ;; low  dword of the starting timestamp
  timehi:         dd 0                    ;; high dword of the starting timestamp
  b2hout:         times 9 db 0            ;; 8 hex digits + 1 trailing byte
  b2hlut:         db '0123456789abcdef'   ;; nibble value -> ASCII hex digit

          align   8, db 0
  ;; Entry k has its k lowest bytes set to 0xff (k = 0..7).
  STRINGTBL:      dq 0, 0xff, 0xffff, 0xffffff, 0xffffffff, 0xffffffffff, 0xffffffffffff, 0xffffffffffffff

          align   8, db 0
  ;; Entry k is 0xffff >> k (k = 0..15).
  STRLENMASK:     dd 0x0000ffff, 0x00007fff, 0x00003fff, 0x00001fff, 0x00000fff, 0x000007ff, 0x000003ff, 0x000001ff, 0x000000ff, 0x0000007f, 0x0000003f, 0x0000001f, 0x0000000f, 0x00000007, 0x00000003, 0x00000001

          align   16, db 0
  ;; 32 non-zero bytes followed by a terminating zero.
  teststr:        db 0x01, 0x05, 0x09, 0x0d, 0x11, 0x15, 0x19, 0x1d, 0xf1, 0xf5, 0xf9, 0xfd, 0x81, 0x85, 0x89, 0x8d, 0x01, 0x05, 0x09, 0x0d, 0x11, 0x15, 0x19, 0x1d, 0xf1, 0xf5, 0xf9, 0xfd, 0x81, 0x85, 0x89, 0x8d, 0

  section .bss
          alignb  16
  ;; Four 1024-byte buffers. The single pad byte after each buffer shifts
  ;; the next one by one more byte relative to a 16-byte boundary.
  strbuf1:        resb 1024               ;; 16-byte aligned
  pad1:           resb 1                  ;; misaligned by 1
  strbuf2:        resb 1024
  pad2:           resb 1                  ;; misaligned by 2
  strbuf3:        resb 1024
  pad3:           resb 1                  ;; misaligned by 3
  strbuf4:        resb 1024
  pad4:           resb 64                 ;; slack after the last buffer

  section .text

  ITERATIONS      equ 100000              ;; benchmark loop count per measurement
  MAX_LENGTH      equ 1024                ;; first (largest) string length tested

  ;; Zero-terminated list of the routines to benchmark.
  FUNCS:  dd strlen_sse, strlen_mmx, strlen_opt, 0

  ;; ----------------------------------------------------------------------
  ;; _start -- program entry point.
  ;;
  ;; For each length MAX_LENGTH, MAX_LENGTH/2, ..., 1: print the length
  ;; with writehex, then call benchmark once for every entry of FUNCS.
  ;; Finishes by calling exit.
  ;;
  ;; Register roles:
  ;;   ebp = iteration count handed to benchmark
  ;;   ebx = current string length (halved after each pass)
  ;;   esi = cursor into FUNCS
  ;; ----------------------------------------------------------------------
  _start:
          mov     ebp, ITERATIONS
          mov     ebx, MAX_LENGTH
  .per_length:
          push    ebx
          call    writehex                ;; print the length being measured
          mov     esi, FUNCS
  .per_func:
          push    ebp                     ;; arg 3: iterations
          push    ebx                     ;; arg 2: string length
          push    dword [esi]             ;; arg 1: routine under test
          call    benchmark               ;; callee pops all three arguments
          add     esi, 4
          cmp     dword [esi], 0          ;; a zero entry ends the table
          jne     .per_func
          shr     ebx, 1                  ;; next length; falls out of the loop at 0
          jnz     .per_length

          call    exit

  ;; ----------------------------------------------------------------------
  ;; benchmark -- time one string-length routine and print the result.
  ;;
  ;;   In:    [esp + 4]  = address of the routine to call
  ;;          [esp + 8]  = string length used to fill the buffers
  ;;          [esp + 12] = number of loop iterations
  ;;   Stack: callee pops its three arguments (ret 12)
  ;;   Saved: ebx, esi
  ;;
  ;; Fills the four string buffers through makestring, then runs the loop
  ;; between two rdtsc reads. Every iteration pushes the four buffer
  ;; addresses and calls the routine four times, so each call consumes one
  ;; pushed address (strbuf4 first, strbuf1 last). Only the low 32 bits of
  ;; the 64-bit timestamp difference are printed with writehex.
  ;; ----------------------------------------------------------------------
  benchmark:
          push    ebx
          push    esi
          mov     esi, [esp + 12]         ;; routine under test
          mov     ecx, [esp + 16]         ;; string length
          mov     ebx, [esp + 20]         ;; remaining iterations
          push    ecx
          call    makestring

          rdtsc                           ;; starting timestamp -> timehi:timelo
          mov     [timelo], eax
          mov     [timehi], edx
  .round:
          push    strbuf1
          push    strbuf2
          push    strbuf3
          push    strbuf4
  %rep 4
          call    esi                     ;; each call pops one buffer address
  %endrep
          dec     ebx
          jnz     .round

          rdtsc
          sub     eax, [timelo]           ;; edx:eax = elapsed count
          sbb     edx, [timehi]
          push    eax                     ;; the high dword in edx is not printed
          call    writehex
          pop     esi
          pop     ebx
          ret     12

  ;; ----------------------------------------------------------------------
  ;; strlen_sse -- length of a zero-terminated string, 16 bytes per step.
  ;;
  ;;   In:    [esp + 4] = pointer to the string (any alignment)
  ;;   Out:   eax = number of bytes before the first zero byte
  ;;   Stack: callee pops its argument (ret 4)
  ;;   Clobb: ecx, edx, xmm0, xmm1, flags
  ;;
  ;; Only aligned 16-byte loads are issued. The first load covers the
  ;; aligned block that contains s[0]; the compare mask is shifted right by
  ;; the offset of s inside that block, which discards the bytes in front
  ;; of the string and makes bit i of the mask correspond to s[i]. A hit in
  ;; this first block is therefore already the length, for every alignment.
  ;; ----------------------------------------------------------------------
          align   16
  strlen_sse:
          mov     edx, [esp + 4]          ;; edx = s
          pxor    xmm1, xmm1              ;; xmm1 = sixteen zero bytes
          mov     ecx, edx
          and     edx, -16                ;; edx = aligned block holding s[0]
          and     ecx, 15                 ;; cl  = offset of s inside that block
          movdqa  xmm0, [edx]
          add     edx, 16                 ;; edx = next aligned block
          pcmpeqb xmm0, xmm1
          pmovmskb eax, xmm0              ;; one mask bit per byte of the block
          shr     eax, cl                 ;; bit i <=> s[i]
          test    eax, eax                ;; shr leaves the flags alone when cl = 0
          jz      .scan
          bsf     eax, eax                ;; index of the first zero byte = length
          ret     4

          align   16
  .scan:
          movdqa  xmm0, [edx]
          pcmpeqb xmm0, xmm1
          add     edx, 16                 ;; edx = end of the block just tested
          pmovmskb eax, xmm0
          test    eax, eax
          jz      .scan
          bsf     eax, eax                ;; offset of the zero inside its block
          sub     edx, [esp + 4]          ;; edx = (block end) - s
          lea     eax, [edx + eax - 16]   ;; (block start - s) + offset
          ret     4

  ;; ----------------------------------------------------------------------
  ;; strlen_mmx -- length of a zero-terminated string, 8 bytes per step.
  ;;
  ;;   In:    [esp + 4] = pointer to the string (any alignment)
  ;;   Out:   eax = number of bytes before the first zero byte
  ;;   Stack: callee pops its argument (ret 4)
  ;;   Clobb: ecx, edx, mm0, mm1, flags (emms is executed before returning)
  ;;
  ;; The first qword is loaded from the string address rounded down to 8;
  ;; OR-ing it with STRINGTBL[s & 7] forces the bytes in front of the string
  ;; to 0xff so they can never match zero.
  ;;
  ;; packsswb narrows the four compare words to four signed-saturated
  ;; bytes. bsf on the resulting dword followed by a shift right by 2 gives
  ;; the index (0..7) of the first zero byte within the qword.
  ;;
  ;; NOTE: the loop fetches the following qword before it tests the current
  ;; one, so the routine reads one aligned qword beyond the qword that
  ;; holds the terminator.
  ;; ----------------------------------------------------------------------
          align   16
  strlen_mmx:
          mov     edx, [esp + 4]          ;; edx = s, kept for the final subtraction
          pxor    mm1, mm1                ;; mm1 = eight zero bytes
          mov     eax, edx
          mov     ecx, edx
          and     eax, 7                  ;; eax = offset of s inside its qword
          and     ecx, -8                 ;; ecx = qword holding s[0]
          movq    mm0, [ecx]
          por     mm0, [STRINGTBL + eax*8]
          jmp     .probe                  ;; hop over the alignment padding

          align   16
  .probe:
          add     ecx, 8                  ;; ecx = end of the qword held in mm0
          pcmpeqb mm0, mm1
          packsswb mm0, mm0
          movd    eax, mm0
          movq    mm0, [ecx]              ;; fetch the next qword ahead of the test
          test    eax, eax
          jz      .probe

          bsf     eax, eax
          shr     eax, 2                  ;; byte index inside the matching qword
          lea     eax, [ecx + eax - 8]    ;; address of the zero byte
          sub     eax, edx                ;; minus s = length
          emms
          ret     4

  ;; ----------------------------------------------------------------------
  ;; strlen_opt -- length of a zero-terminated string using only integer
  ;; registers, eight bytes per step in the main loop (unrolled four times).
  ;;
  ;;   In:    [esp + 4] = pointer to the string (any alignment)
  ;;   Out:   eax = number of bytes before the first zero byte
  ;;   Stack: callee pops its argument (ret 4)
  ;;   Saved: ebx, esi, edi        Clobb: ecx, edx, flags
  ;;
  ;; Zero-byte test on a dword x:  (x - 01010101h) & ~x & 80808080h.
  ;; The lowest set bit of the result marks the first zero byte, so
  ;; bsf followed by a shift right by 3 yields its byte index.
  ;; ----------------------------------------------------------------------
  OPT_MINUS_ONES  equ -01010101h          ;; added to x to subtract 1 from every byte
  OPT_HIGH_BITS   equ 80808080h           ;; top bit of every byte

  ;; Tests the dwords at [eax] and [eax + 4] and advances eax by 8.
  ;; Jumps to .in_low when the first dword holds a zero byte; otherwise
  ;; leaves ZF clear exactly when the second dword holds one (mask in edx).
  %macro OPT_TEST_PAIR 0
          mov     esi, [eax]
          mov     edi, [eax + 4]
          add     eax, 8
          lea     ecx, [esi + ebx]
          lea     edx, [edi + ebx]
          not     esi
          not     edi
          and     ecx, esi
          and     edx, edi
          and     ecx, OPT_HIGH_BITS
          jnz     .in_low
          and     edx, OPT_HIGH_BITS
  %endmacro

          align   16
  strlen_opt:
          push    ebx
          push    esi
          push    edi
          mov     eax, [esp + 16]         ;; eax = s (the three pushes moved the argument)
          mov     ebx, OPT_MINUS_ONES

          ;; Head: examine up to three bytes one at a time until eax is a
          ;; multiple of 4. All of them come from a single dword load at s.
          test    eax, 3
          jz      .scan
          mov     edx, [eax]
          test    dl, dl                  ;; s[0]
          jz      .in_head
          inc     eax
          test    eax, 3
          jz      .scan
          test    dh, dh                  ;; s[1]
          jz      .in_head
          inc     eax
          shr     edx, 16
          test    eax, 3
          jz      .scan
          test    dl, dl                  ;; s[2]
          jz      .in_head
          inc     eax
          jmp     .scan

  .in_head:                               ;; eax points at the zero byte
          sub     eax, [esp + 16]
          pop     edi
          pop     esi
          pop     ebx
          ret     4

          align   16
  .scan:
  %rep 3
          OPT_TEST_PAIR
          jnz     .in_high
  %endrep
          OPT_TEST_PAIR
          jz      .scan

  .in_high:                               ;; zero byte in the dword at eax - 4
          bsf     edx, edx
          shr     edx, 3                  ;; byte index inside that dword
          sub     eax, [esp + 16]
          lea     eax, [eax + edx - 4]
          pop     edi
          pop     esi
          pop     ebx
          ret     4

  .in_low:                                ;; zero byte in the dword at eax - 8
          bsf     ecx, ecx
          shr     ecx, 3
          sub     eax, [esp + 16]
          lea     eax, [eax + ecx - 8]
          pop     edi
          pop     esi
          pop     ebx
          ret     4

  ;; ----------------------------------------------------------------------
  ;; makestring -- fill all four string buffers with a string of one length.
  ;;
  ;;   In:    [esp + 4] = string length in bytes
  ;;   Stack: callee pops its argument (ret 4)
  ;;
  ;; All four (size, buffer) argument pairs are pushed before the first
  ;; call because fillstring overwrites al. Each fillstring call pops one
  ;; pair, so the buffers are filled in the order strbuf4 ... strbuf1.
  ;; ----------------------------------------------------------------------
  %macro MAKESTRING_ARGS 1
          push    eax                     ;; size
          push    %1                      ;; buffer
  %endmacro

  makestring:
          mov     eax, [esp + 4]          ;; eax = requested length
          MAKESTRING_ARGS strbuf1
          MAKESTRING_ARGS strbuf2
          MAKESTRING_ARGS strbuf3
          MAKESTRING_ARGS strbuf4
  %rep 4
          call    fillstring
  %endrep
          ret     4

  ;; ----------------------------------------------------------------------
  ;; fillstring -- write a zero-terminated string of non-zero bytes.
  ;;
  ;;   In:    [esp + 4] = destination buffer
  ;;          [esp + 8] = size (number of non-zero bytes to write)
  ;;   Stack: callee pops both arguments (ret 8)
  ;;   Clobb: eax (al), ecx, edx, flags
  ;;
  ;; Stores the terminator at buffer[size], then fills buffer[size-1] down
  ;; to buffer[0] with the values 1, 2, 3, ... The counter restarts at 1
  ;; whenever it wraps, so a zero is never written into the string body.
  ;; ----------------------------------------------------------------------
  fillstring:
          mov     edx, [esp + 4]          ;; edx = buffer
          mov     ecx, [esp + 8]          ;; ecx = index one past the next store
          mov     al, 1                   ;; next fill value
          mov     byte [edx + ecx], 0     ;; terminator
          jmp     .step
  .store:
          mov     [edx + ecx], al
          add     al, 1
          jnz     .step
          mov     al, 1                   ;; wrapped to 0: restart at 1
  .step:
          dec     ecx
          jns     .store                  ;; stop once the index drops below 0
          ret     8

  ;; ----------------------------------------------------------------------
  ;; writehex -- print a 32-bit value as 8 lowercase hex digits and a
  ;; newline.
  ;;
  ;;   In:    [esp + 4] = value to print
  ;;   Stack: callee pops its argument (ret 4)
  ;;   Saved: ebx, esi, edi        Clobb: eax, ecx, edx, flags
  ;;
  ;; The text is assembled in b2hout and handed to int 80h with eax = 4,
  ;; ebx = 1 (Linux i386 write to file descriptor 1). The result of the
  ;; system call is not checked.
  ;; ----------------------------------------------------------------------
  WRITEHEX_SYSCALL equ 4                  ;; eax value for int 80h
  WRITEHEX_FD      equ 1                  ;; ebx value for int 80h
  WRITEHEX_DIGITS  equ 8                  ;; hex digits in a dword
  WRITEHEX_BYTES   equ 9                  ;; digits + newline

  writehex:
          push    ebx
          push    esi
          push    edi
          mov     esi, [esp + 16]         ;; esi = value (three pushes moved the argument)
          mov     edi, b2hout             ;; edi = output cursor
          mov     ecx, WRITEHEX_DIGITS
  .digit:
          rol     esi, 4                  ;; bring the next highest nibble to the bottom
          mov     edx, esi
          and     edx, 0fh
          mov     al, [b2hlut + edx]
          mov     [edi], al
          inc     edi
          dec     ecx
          jnz     .digit
          mov     byte [edi], 10          ;; newline after the digits

          mov     eax, WRITEHEX_SYSCALL
          mov     ebx, WRITEHEX_FD
          mov     ecx, b2hout
          mov     edx, WRITEHEX_BYTES
          int     80h

          pop     edi
          pop     esi
          pop     ebx
          ret     4

  ;; ----------------------------------------------------------------------
  ;; exit -- end the program through int 80h with eax = 1 and ebx = 0
  ;; (Linux i386 exit with status 0).
  ;; ----------------------------------------------------------------------
  EXIT_SYSCALL    equ 01h                 ;; eax value for int 80h

  exit:
          xor     ebx, ebx                ;; status 0
          mov     eax, EXIT_SYSCALL
          int     80h
Success #stdin #stdout 0.58s 148KB
stdin
Standard input is empty
stdout
00000400
0641c23e
0a80fde9
0e697b1c
00000200
03a59e7f
05e5d3ca
07c14d6b
00000100
027d47a5
036d9bc1
046b9386
00000080
014b8d72
024a1ab9
02523cc5
00000040
0102d80f
015da569
01430cfb
00000020
00b62eae
01115d9b
00f0b372
00000010
009eeeef
00c1b732
00af4376
00000008
00912c52
00ae8d0f
00aa7af1
00000004
0090e2de
00a613f9
008cf805
00000002
0091004e
00a5cb0a
008bb17a
00000001
0091360d
00a5b3f5
0079f4ec