fork download
  1. #include <stdint.h>
  2. #include <stdio.h>
  3. #include <string.h>
  4. #include <immintrin.h>
  5. #include <math.h>
  6.  
  /*
   * State for eight MD5 computations carried out side by side, one per
   * 32-bit lane of a 256-bit vector.
   */
  struct md5x8_context {
      /* Per-step additive constants, each replicated across all eight lanes
       * (written by md5x8_pre_init). */
      __m256i k[64];
      /* Initial chaining values, each replicated across all eight lanes
       * (written by md5x8_pre_init, copied out by md5x8_init). */
      __m256i iv[4];

      /* Running chaining state; lane i of every word belongs to message i. */
      __m256i a;
      __m256i b;
      __m256i c;
      __m256i d;
  };
  13.  
  14. void md5x8_pre_init(struct md5x8_context* ctx) {
  15. for (uint32_t i = 0; i < 64; ++i) {
  16. uint32_t x = floor(fabs(sin(i + 1)) * (double)0x100000000ull);
  17. ctx->k[i] = _mm256_set1_epi32(x);
  18. }
  19. ctx->iv[0] = _mm256_set1_epi32(0x67452301);
  20. ctx->iv[1] = _mm256_set1_epi32(0xefcdab89);
  21. ctx->iv[2] = _mm256_set1_epi32(0x98badcfe);
  22. ctx->iv[3] = _mm256_set1_epi32(0x10325476);
  23. }
  24.  
  25. void md5x8_init(struct md5x8_context* ctx) {
  26. ctx->a = ctx->iv[0];
  27. ctx->b = ctx->iv[1];
  28. ctx->c = ctx->iv[2];
  29. ctx->d = ctx->iv[3];
  30. }
  31.  
  32. inline __m256i md5x8_fx(int s, __m256i a, __m256i b, __m256i k, __m256i x, __m256i f) {
  33. f = _mm256_add_epi32(_mm256_add_epi32(f, a), _mm256_add_epi32(k, x));
  34. return _mm256_add_epi32(b, _mm256_or_si256(_mm256_slli_epi32(f, s), _mm256_srli_epi32(f, 32 - s)));
  35. }
  36.  
  37. inline __m256i md5x8_f1(int s, __m256i a, __m256i b, __m256i c, __m256i d, __m256i k, __m256i x) {
  38. return md5x8_fx(s, a, b, k, x, _mm256_or_si256(_mm256_and_si256(b, c), _mm256_andnot_si256(b, d)));
  39. }
  40.  
  41. inline __m256i md5x8_f2(int s, __m256i a, __m256i b, __m256i c, __m256i d, __m256i k, __m256i x) {
  42. return md5x8_fx(s, a, b, k, x, _mm256_or_si256(_mm256_and_si256(d, b), _mm256_andnot_si256(d, c)));
  43. }
  44.  
  45. inline __m256i md5x8_f3(int s, __m256i a, __m256i b, __m256i c, __m256i d, __m256i k, __m256i x) {
  46. return md5x8_fx(s, a, b, k, x, _mm256_xor_si256(_mm256_xor_si256(b, c), d));
  47. }
  48.  
  49. inline __m256i md5x8_f4(int s, __m256i a, __m256i b, __m256i c, __m256i d, __m256i k, __m256i x) {
  50. return md5x8_fx(s, a, b, k, x, _mm256_xor_si256(c, _mm256_or_si256(b, _mm256_xor_si256(d, _mm256_set1_epi32(0xFFFFFFFF)))));
  51. }
  52.  
  53. void md5x8_raw_update(struct md5x8_context* ctx, const uint8_t blocks[8][64])
  54. {
  55. __m256i x[16] = {0};
  56. for (size_t i = 0; i < 2; ++i) {
  57. __m256i x0 = _mm256_loadu_si256((const __m256i*)blocks[0] + i);
  58. __m256i x1 = _mm256_loadu_si256((const __m256i*)blocks[1] + i);
  59. __m256i x2 = _mm256_loadu_si256((const __m256i*)blocks[2] + i);
  60. __m256i x3 = _mm256_loadu_si256((const __m256i*)blocks[3] + i);
  61. __m256i x4 = _mm256_loadu_si256((const __m256i*)blocks[4] + i);
  62. __m256i x5 = _mm256_loadu_si256((const __m256i*)blocks[5] + i);
  63. __m256i x6 = _mm256_loadu_si256((const __m256i*)blocks[6] + i);
  64. __m256i x7 = _mm256_loadu_si256((const __m256i*)blocks[7] + i);
  65.  
  66. __m256i t0 = _mm256_unpacklo_epi32(x0, x1);
  67. __m256i t1 = _mm256_unpackhi_epi32(x0, x1);
  68. __m256i t2 = _mm256_unpacklo_epi32(x2, x3);
  69. __m256i t3 = _mm256_unpackhi_epi32(x2, x3);
  70. __m256i t4 = _mm256_unpacklo_epi32(x4, x5);
  71. __m256i t5 = _mm256_unpackhi_epi32(x4, x5);
  72. __m256i t6 = _mm256_unpacklo_epi32(x6, x7);
  73. __m256i t7 = _mm256_unpackhi_epi32(x6, x7);
  74.  
  75. __m256i s0 = _mm256_unpacklo_epi64(t0, t2);
  76. __m256i s1 = _mm256_unpackhi_epi64(t0, t2);
  77. __m256i s2 = _mm256_unpacklo_epi64(t1, t3);
  78. __m256i s3 = _mm256_unpackhi_epi64(t1, t3);
  79. __m256i s4 = _mm256_unpacklo_epi64(t4, t6);
  80. __m256i s5 = _mm256_unpackhi_epi64(t4, t6);
  81. __m256i s6 = _mm256_unpacklo_epi64(t5, t7);
  82. __m256i s7 = _mm256_unpackhi_epi64(t5, t7);
  83.  
  84. x[i * 8 + 0] = _mm256_permute2x128_si256(s0, s4, 0x20);
  85. x[i * 8 + 1] = _mm256_permute2x128_si256(s1, s5, 0x20);
  86. x[i * 8 + 2] = _mm256_permute2x128_si256(s2, s6, 0x20);
  87. x[i * 8 + 3] = _mm256_permute2x128_si256(s3, s7, 0x20);
  88. x[i * 8 + 4] = _mm256_permute2x128_si256(s0, s4, 0x31);
  89. x[i * 8 + 5] = _mm256_permute2x128_si256(s1, s5, 0x31);
  90. x[i * 8 + 6] = _mm256_permute2x128_si256(s2, s6, 0x31);
  91. x[i * 8 + 7] = _mm256_permute2x128_si256(s3, s7, 0x31);
  92. }
  93.  
  94. __m256i a = ctx->a;
  95. __m256i b = ctx->b;
  96. __m256i c = ctx->c;
  97. __m256i d = ctx->d;
  98.  
  99. a = md5x8_f1(7, a, b, c, d, ctx->k[0], x[0]);
  100. d = md5x8_f1(12, d, a, b, c, ctx->k[1], x[1]);
  101. c = md5x8_f1(17, c, d, a, b, ctx->k[2], x[2]);
  102. b = md5x8_f1(22, b, c, d, a, ctx->k[3], x[3]);
  103. a = md5x8_f1(7, a, b, c, d, ctx->k[4], x[4]);
  104. d = md5x8_f1(12, d, a, b, c, ctx->k[5], x[5]);
  105. c = md5x8_f1(17, c, d, a, b, ctx->k[6], x[6]);
  106. b = md5x8_f1(22, b, c, d, a, ctx->k[7], x[7]);
  107. a = md5x8_f1(7, a, b, c, d, ctx->k[8], x[8]);
  108. d = md5x8_f1(12, d, a, b, c, ctx->k[9], x[9]);
  109. c = md5x8_f1(17, c, d, a, b, ctx->k[10], x[10]);
  110. b = md5x8_f1(22, b, c, d, a, ctx->k[11], x[11]);
  111. a = md5x8_f1(7, a, b, c, d, ctx->k[12], x[12]);
  112. d = md5x8_f1(12, d, a, b, c, ctx->k[13], x[13]);
  113. c = md5x8_f1(17, c, d, a, b, ctx->k[14], x[14]);
  114. b = md5x8_f1(22, b, c, d, a, ctx->k[15], x[15]);
  115.  
  116. a = md5x8_f2(5, a, b, c, d, ctx->k[16], x[1]);
  117. d = md5x8_f2(9, d, a, b, c, ctx->k[17], x[6]);
  118. c = md5x8_f2(14, c, d, a, b, ctx->k[18], x[11]);
  119. b = md5x8_f2(20, b, c, d, a, ctx->k[19], x[0]);
  120. a = md5x8_f2(5, a, b, c, d, ctx->k[20], x[5]);
  121. d = md5x8_f2(9, d, a, b, c, ctx->k[21], x[10]);
  122. c = md5x8_f2(14, c, d, a, b, ctx->k[22], x[15]);
  123. b = md5x8_f2(20, b, c, d, a, ctx->k[23], x[4]);
  124. a = md5x8_f2(5, a, b, c, d, ctx->k[24], x[9]);
  125. d = md5x8_f2(9, d, a, b, c, ctx->k[25], x[14]);
  126. c = md5x8_f2(14, c, d, a, b, ctx->k[26], x[3]);
  127. b = md5x8_f2(20, b, c, d, a, ctx->k[27], x[8]);
  128. a = md5x8_f2(5, a, b, c, d, ctx->k[28], x[13]);
  129. d = md5x8_f2(9, d, a, b, c, ctx->k[29], x[2]);
  130. c = md5x8_f2(14, c, d, a, b, ctx->k[30], x[7]);
  131. b = md5x8_f2(20, b, c, d, a, ctx->k[31], x[12]);
  132.  
  133. a = md5x8_f3(4, a, b, c, d, ctx->k[32], x[5]);
  134. d = md5x8_f3(11, d, a, b, c, ctx->k[33], x[8]);
  135. c = md5x8_f3(16, c, d, a, b, ctx->k[34], x[11]);
  136. b = md5x8_f3(23, b, c, d, a, ctx->k[35], x[14]);
  137. a = md5x8_f3(4, a, b, c, d, ctx->k[36], x[1]);
  138. d = md5x8_f3(11, d, a, b, c, ctx->k[37], x[4]);
  139. c = md5x8_f3(16, c, d, a, b, ctx->k[38], x[7]);
  140. b = md5x8_f3(23, b, c, d, a, ctx->k[39], x[10]);
  141. a = md5x8_f3(4, a, b, c, d, ctx->k[40], x[13]);
  142. d = md5x8_f3(11, d, a, b, c, ctx->k[41], x[0]);
  143. c = md5x8_f3(16, c, d, a, b, ctx->k[42], x[3]);
  144. b = md5x8_f3(23, b, c, d, a, ctx->k[43], x[6]);
  145. a = md5x8_f3(4, a, b, c, d, ctx->k[44], x[9]);
  146. d = md5x8_f3(11, d, a, b, c, ctx->k[45], x[12]);
  147. c = md5x8_f3(16, c, d, a, b, ctx->k[46], x[15]);
  148. b = md5x8_f3(23, b, c, d, a, ctx->k[47], x[2]);
  149.  
  150. a = md5x8_f4(6, a, b, c, d, ctx->k[48], x[0]);
  151. d = md5x8_f4(10, d, a, b, c, ctx->k[49], x[7]);
  152. c = md5x8_f4(15, c, d, a, b, ctx->k[50], x[14]);
  153. b = md5x8_f4(21, b, c, d, a, ctx->k[51], x[5]);
  154. a = md5x8_f4(6, a, b, c, d, ctx->k[52], x[12]);
  155. d = md5x8_f4(10, d, a, b, c, ctx->k[53], x[3]);
  156. c = md5x8_f4(15, c, d, a, b, ctx->k[54], x[10]);
  157. b = md5x8_f4(21, b, c, d, a, ctx->k[55], x[1]);
  158. a = md5x8_f4(6, a, b, c, d, ctx->k[56], x[8]);
  159. d = md5x8_f4(10, d, a, b, c, ctx->k[57], x[15]);
  160. c = md5x8_f4(15, c, d, a, b, ctx->k[58], x[6]);
  161. b = md5x8_f4(21, b, c, d, a, ctx->k[59], x[13]);
  162. a = md5x8_f4(6, a, b, c, d, ctx->k[60], x[4]);
  163. d = md5x8_f4(10, d, a, b, c, ctx->k[61], x[11]);
  164. c = md5x8_f4(15, c, d, a, b, ctx->k[62], x[2]);
  165. b = md5x8_f4(21, b, c, d, a, ctx->k[63], x[9]);
  166.  
  167. ctx->a = _mm256_add_epi32(ctx->a, a);
  168. ctx->b = _mm256_add_epi32(ctx->b, b);
  169. ctx->c = _mm256_add_epi32(ctx->c, c);
  170. ctx->d = _mm256_add_epi32(ctx->d, d);
  171. }
  172.  
  173. void md5x8_final(struct md5x8_context* ctx, uint8_t out[8][32]) {
  174. __m256i x0 = ctx->a;
  175. __m256i x1 = ctx->b;
  176. __m256i x2 = ctx->c;
  177. __m256i x3 = ctx->d;
  178. __m256i x4 = _mm256_set1_epi32(0);
  179. __m256i x5 = _mm256_set1_epi32(0);
  180. __m256i x6 = _mm256_set1_epi32(0);
  181. __m256i x7 = _mm256_set1_epi32(0);
  182.  
  183. __m256i t0 = _mm256_unpacklo_epi32(x0, x1);
  184. __m256i t1 = _mm256_unpackhi_epi32(x0, x1);
  185. __m256i t2 = _mm256_unpacklo_epi32(x2, x3);
  186. __m256i t3 = _mm256_unpackhi_epi32(x2, x3);
  187. __m256i t4 = _mm256_unpacklo_epi32(x4, x5);
  188. __m256i t5 = _mm256_unpackhi_epi32(x4, x5);
  189. __m256i t6 = _mm256_unpacklo_epi32(x6, x7);
  190. __m256i t7 = _mm256_unpackhi_epi32(x6, x7);
  191.  
  192. __m256i s0 = _mm256_unpacklo_epi64(t0, t2);
  193. __m256i s1 = _mm256_unpackhi_epi64(t0, t2);
  194. __m256i s2 = _mm256_unpacklo_epi64(t1, t3);
  195. __m256i s3 = _mm256_unpackhi_epi64(t1, t3);
  196. __m256i s4 = _mm256_unpacklo_epi64(t4, t6);
  197. __m256i s5 = _mm256_unpackhi_epi64(t4, t6);
  198. __m256i s6 = _mm256_unpacklo_epi64(t5, t7);
  199. __m256i s7 = _mm256_unpackhi_epi64(t5, t7);
  200.  
  201. _mm256_storeu_si256((__m256i*)(out + 0), _mm256_permute2x128_si256(s0, s4, 0x20));
  202. _mm256_storeu_si256((__m256i*)(out + 1), _mm256_permute2x128_si256(s1, s5, 0x20));
  203. _mm256_storeu_si256((__m256i*)(out + 2), _mm256_permute2x128_si256(s2, s6, 0x20));
  204. _mm256_storeu_si256((__m256i*)(out + 3), _mm256_permute2x128_si256(s3, s7, 0x20));
  205. _mm256_storeu_si256((__m256i*)(out + 4), _mm256_permute2x128_si256(s0, s4, 0x31));
  206. _mm256_storeu_si256((__m256i*)(out + 5), _mm256_permute2x128_si256(s1, s5, 0x31));
  207. _mm256_storeu_si256((__m256i*)(out + 6), _mm256_permute2x128_si256(s2, s6, 0x31));
  208. _mm256_storeu_si256((__m256i*)(out + 7), _mm256_permute2x128_si256(s3, s7, 0x31));
  209. }
  210.  
  211.  
  /*
   * Append MD5 padding in place to a message that fits in a single block:
   * a 0x80 marker right after the message, and the message length in bits as
   * a 64-bit little-endian integer in bytes 56..63.
   *
   * block   64-byte buffer whose first `length` bytes are the message
   * length  message length in bytes
   *
   * Nothing is checked ("unsafe"):
   *   - length must be at most 55 so the marker and the length field both
   *     fit without overlapping;
   *   - the bytes between the marker and offset 56 are not cleared here, so
   *     the caller has to supply them as zero.
   *
   * The length field is written one byte at a time rather than through a
   * casted uint64_t pointer, which would break strict aliasing, could be
   * misaligned, and would depend on host byte order.
   */
  void unsafe_pad_block(uint8_t block[64], size_t length) {
      const uint64_t bit_count = (uint64_t)length * 8;

      block[length] = 0x80;
      for (size_t i = 0; i < 8; ++i) {
          block[56 + i] = (uint8_t)(bit_count >> (8 * i));
      }
  }
  216.  
  217. int main() {
  218. struct md5x8_context ctx;
  219. md5x8_pre_init(&ctx);
  220.  
  221. uint8_t data[8][64] = {
  222. "We are the others",
  223. "We are the cast-outs",
  224. "We're the outsiders",
  225. "But you can't hide us",
  226. };
  227.  
  228. unsafe_pad_block(data[0], 17);
  229. unsafe_pad_block(data[1], 20);
  230. unsafe_pad_block(data[2], 19);
  231. unsafe_pad_block(data[3], 21);
  232.  
  233. uint8_t digests[8][32] = {0};
  234.  
  235. md5x8_init(&ctx);
  236. md5x8_raw_update(&ctx, data);
  237. md5x8_final(&ctx, digests);
  238.  
  239. for (size_t i = 0; i < 8; ++i) {
  240. for (size_t j = 0; j < 16; ++j)
  241. printf("%02X ", digests[i][j]);
  242. printf("\n");
  243. }
  244.  
  245. return 0;
  246. }
Compilation error #stdin compilation error #stdout 0s 0KB
stdin
Standard input is empty
compilation info
prog.c: In function ‘md5x8_fx’:
prog.c:32:1: warning: AVX vector return without AVX enabled changes the ABI [-Wpsabi]
 inline __m256i md5x8_fx(int s, __m256i a, __m256i b, __m256i k, __m256i x, __m256i f) {
 ^~~~~~
prog.c: In function ‘md5x8_f1’:
prog.c:37:16: note: The ABI for passing parameters with 32-byte alignment has changed in GCC 4.6
 inline __m256i md5x8_f1(int s, __m256i a, __m256i b, __m256i c, __m256i d, __m256i k, __m256i x) {
                ^~~~~~~~
In file included from /usr/lib/gcc/x86_64-linux-gnu/8/include/immintrin.h:43,
                 from prog.c:4:
prog.c: In function ‘md5x8_fx’:
/usr/lib/gcc/x86_64-linux-gnu/8/include/avx2intrin.h:119:1: error: inlining failed in call to always_inline ‘_mm256_add_epi32’: target specific option mismatch
 _mm256_add_epi32 (__m256i __A, __m256i __B)
 ^~~~~~~~~~~~~~~~
prog.c:34:12: note: called from here
     return _mm256_add_epi32(b, _mm256_or_si256(_mm256_slli_epi32(f, s), _mm256_srli_epi32(f, 32 - s)));
            ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
In file included from /usr/lib/gcc/x86_64-linux-gnu/8/include/immintrin.h:43,
                 from prog.c:4:
/usr/lib/gcc/x86_64-linux-gnu/8/include/avx2intrin.h:574:1: error: inlining failed in call to always_inline ‘_mm256_or_si256’: target specific option mismatch
 _mm256_or_si256 (__m256i __A, __m256i __B)
 ^~~~~~~~~~~~~~~
prog.c:34:12: note: called from here
     return _mm256_add_epi32(b, _mm256_or_si256(_mm256_slli_epi32(f, s), _mm256_srli_epi32(f, 32 - s)));
            ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
In file included from /usr/lib/gcc/x86_64-linux-gnu/8/include/immintrin.h:43,
                 from prog.c:4:
/usr/lib/gcc/x86_64-linux-gnu/8/include/avx2intrin.h:682:1: error: inlining failed in call to always_inline ‘_mm256_slli_epi32’: target specific option mismatch
 _mm256_slli_epi32 (__m256i __A, int __B)
 ^~~~~~~~~~~~~~~~~
prog.c:34:12: note: called from here
     return _mm256_add_epi32(b, _mm256_or_si256(_mm256_slli_epi32(f, s), _mm256_srli_epi32(f, 32 - s)));
            ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
In file included from /usr/lib/gcc/x86_64-linux-gnu/8/include/immintrin.h:43,
                 from prog.c:4:
/usr/lib/gcc/x86_64-linux-gnu/8/include/avx2intrin.h:773:1: error: inlining failed in call to always_inline ‘_mm256_srli_epi32’: target specific option mismatch
 _mm256_srli_epi32 (__m256i __A, int __B)
 ^~~~~~~~~~~~~~~~~
prog.c:34:12: note: called from here
     return _mm256_add_epi32(b, _mm256_or_si256(_mm256_slli_epi32(f, s), _mm256_srli_epi32(f, 32 - s)));
            ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
In file included from /usr/lib/gcc/x86_64-linux-gnu/8/include/immintrin.h:43,
                 from prog.c:4:
/usr/lib/gcc/x86_64-linux-gnu/8/include/avx2intrin.h:119:1: error: inlining failed in call to always_inline ‘_mm256_add_epi32’: target specific option mismatch
 _mm256_add_epi32 (__m256i __A, __m256i __B)
 ^~~~~~~~~~~~~~~~
prog.c:33:9: note: called from here
     f = _mm256_add_epi32(_mm256_add_epi32(f, a), _mm256_add_epi32(k, x));
         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
In file included from /usr/lib/gcc/x86_64-linux-gnu/8/include/immintrin.h:43,
                 from prog.c:4:
/usr/lib/gcc/x86_64-linux-gnu/8/include/avx2intrin.h:119:1: error: inlining failed in call to always_inline ‘_mm256_add_epi32’: target specific option mismatch
 _mm256_add_epi32 (__m256i __A, __m256i __B)
 ^~~~~~~~~~~~~~~~
prog.c:33:9: note: called from here
     f = _mm256_add_epi32(_mm256_add_epi32(f, a), _mm256_add_epi32(k, x));
         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
In file included from /usr/lib/gcc/x86_64-linux-gnu/8/include/immintrin.h:43,
                 from prog.c:4:
/usr/lib/gcc/x86_64-linux-gnu/8/include/avx2intrin.h:119:1: error: inlining failed in call to always_inline ‘_mm256_add_epi32’: target specific option mismatch
 _mm256_add_epi32 (__m256i __A, __m256i __B)
 ^~~~~~~~~~~~~~~~
prog.c:33:9: note: called from here
     f = _mm256_add_epi32(_mm256_add_epi32(f, a), _mm256_add_epi32(k, x));
         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
stdout
Standard output is empty