fork download
  1. #include <iostream>
  2. #include <chrono>
  3. #include <ratio>
  4. #include <random>
  5. #include <algorithm>
  6. #include <numeric>
  7. #include <functional>
  8. #include <stdlib.h>
  9. #include <x86intrin.h>
  10. using namespace std;
  11.  
  12. using time_clk = chrono::steady_clock;
  13. using timepoint = chrono::time_point<time_clk>;
  14. using duration = chrono::duration<double, nano>;
  15.  
  16. constexpr size_t kMegebyte = 1024 * 1024;
  17. constexpr size_t size = 4 * kMegebyte;
  18. alignas(64) uint8_t bytes[size];
  19. alignas(64) unsigned mtable[0x10000];
  20.  
  21. void fillMtable()
  22. {
  23. union
  24. {
  25. uint64_t dep;
  26. uint8_t v[8];
  27. };
  28.  
  29. for (size_t i = 0; i < 0x10000; ++i)
  30. {
  31. dep = _pdep_u64(i, 0x0606060606060606) | 0x0101010101010101;
  32. mtable[i] = accumulate(begin(v), end(v), 1u, multiplies<unsigned>{});
  33. }
  34. }
  35.  
  36. template<unsigned x>
  37. unsigned bexp(unsigned pow)
  38. {
  39. unsigned acc1 = x;
  40. unsigned acc2 = 1;
  41.  
  42. for (; pow; pow >>= 1, acc1 *= acc1)
  43. {
  44. if (pow & 1)
  45. acc2 *= acc1;
  46. }
  47.  
  48. return acc2;
  49. }
  50.  
  51. int main()
  52. {
  53. constexpr size_t bits = 3;
  54. mt19937 rnd;
  55. auto ud = uniform_int_distribution<size_t>(0, (1 << (bits - 1)) - 1);
  56. generate_n(bytes, size, [&]{ return ud(rnd) * 2 + 1; });
  57.  
  58. timepoint start1 = time_clk::now();
  59. const auto prod1 = accumulate(begin(bytes), end(bytes), 1u, multiplies<unsigned>{});
  60. timepoint end1 = time_clk::now();
  61. duration d1 = end1 - start1;
  62. cout << "accumulate: res = " << prod1 << ", time = " << d1.count() / size << " ns\n";
  63.  
  64. timepoint start2 = time_clk::now();
  65. unsigned p1 = 1u, p2 = 1u, p3 = 1u, p4 = 1u;
  66. for (size_t i = 0; i < size; i += 4)
  67. {
  68. p1 *= bytes[i + 0];
  69. p2 *= bytes[i + 1];
  70. p3 *= bytes[i + 2];
  71. p4 *= bytes[i + 3];
  72. }
  73. const auto prod2 = p1 * p2 * p3 * p4;
  74. timepoint end2 = time_clk::now();
  75. duration d2 = end2 - start2;
  76. cout << " accum * 4: res = " << prod2 << ", time = " << d2.count() / size << " ns\n";
  77.  
  78. auto m128 = reinterpret_cast<__m128i*>(bytes);
  79. timepoint start3 = time_clk::now();
  80. __m256i pv1 = _mm256_set1_epi32(1);
  81. __m256i pv2 = _mm256_set1_epi32(1);
  82. __m256i pv3 = _mm256_set1_epi32(1);
  83. __m256i pv4 = _mm256_set1_epi32(1);
  84. __m256i pv5 = _mm256_set1_epi32(1);
  85. __m256i pv6 = _mm256_set1_epi32(1);
  86. __m256i pv7 = _mm256_set1_epi32(1);
  87. __m256i pv8 = _mm256_set1_epi32(1);
  88. for (size_t i = 0; i < size / 16; i += 8)
  89. {
  90. const __m256i t1 = _mm256_mullo_epi16(_mm256_cvtepu8_epi16(m128[i + 0]),
  91. _mm256_cvtepu8_epi16(m128[i + 1]));
  92. pv1 = _mm256_mullo_epi32(pv1, _mm256_and_si256(t1, _mm256_set1_epi32(0xFFFF)));
  93. pv2 = _mm256_mullo_epi32(pv2, _mm256_srli_epi32(t1, 16));
  94. const __m256i t2 = _mm256_mullo_epi16(_mm256_cvtepu8_epi16(m128[i + 2]),
  95. _mm256_cvtepu8_epi16(m128[i + 3]));
  96. pv3 = _mm256_mullo_epi32(pv3, _mm256_and_si256(t2, _mm256_set1_epi32(0xFFFF)));
  97. pv4 = _mm256_mullo_epi32(pv4, _mm256_srli_epi32(t2, 16));
  98. const __m256i t3 = _mm256_mullo_epi16(_mm256_cvtepu8_epi16(m128[i + 4]),
  99. _mm256_cvtepu8_epi16(m128[i + 5]));
  100. pv5 = _mm256_mullo_epi32(pv5, _mm256_and_si256(t3, _mm256_set1_epi32(0xFFFF)));
  101. pv6 = _mm256_mullo_epi32(pv6, _mm256_srli_epi32(t3, 16));
  102. const __m256i t4 = _mm256_mullo_epi16(_mm256_cvtepu8_epi16(m128[i + 6]),
  103. _mm256_cvtepu8_epi16(m128[i + 7]));
  104. pv7 = _mm256_mullo_epi32(pv7, _mm256_and_si256(t4, _mm256_set1_epi32(0xFFFF)));
  105. pv8 = _mm256_mullo_epi32(pv8, _mm256_srli_epi32(t4, 16));
  106. }
  107. pv1 = _mm256_mullo_epi32(pv1, pv2);
  108. pv3 = _mm256_mullo_epi32(pv3, pv4);
  109. pv5 = _mm256_mullo_epi32(pv5, pv6);
  110. pv7 = _mm256_mullo_epi32(pv7, pv8);
  111. pv1 = _mm256_mullo_epi32(pv1, pv3);
  112. pv5 = _mm256_mullo_epi32(pv5, pv7);
  113. pv1 = _mm256_mullo_epi32(pv1, pv5);
  114. __m128i hi = _mm256_extracti128_si256(pv1, 1);
  115. __m128i lo = _mm256_extracti128_si256(pv1, 0);
  116. lo = _mm_mullo_epi32(hi, lo);
  117. const auto prod3 = unsigned(_mm_extract_epi32(lo, 0) * _mm_extract_epi32(lo, 1) *
  118. _mm_extract_epi32(lo, 2) * _mm_extract_epi32(lo, 3));
  119. timepoint end3 = time_clk::now();
  120. duration d3 = end3 - start3;
  121. cout << " AVX2: res = " << prod3 << ", time = " << d3.count() / size << " ns\n";
  122.  
  123. auto qwords = reinterpret_cast<uint64_t*>(bytes);
  124. timepoint start4 = time_clk::now();
  125. unsigned acc1 = 1;
  126. unsigned acc3 = 0;
  127. unsigned acc5 = 0;
  128. unsigned acc7 = 0;
  129. for (size_t i = 0; i < size / 8; i += 2)
  130. {
  131. auto compr = (qwords[i] << 4) | qwords[i + 1];
  132. constexpr uint64_t lsb = 0x1111111111111111;
  133. if ((compr & lsb) != lsb) // if there is at least one even value
  134. {
  135. auto b = reinterpret_cast<uint8_t*>(qwords + i);
  136. acc1 *= accumulate(b, b + 16, acc1, multiplies<unsigned>{});
  137. if (!acc1)
  138. break;
  139. }
  140. else
  141. {
  142. const auto b2 = compr & 0x2222222222222222;
  143. const auto b4 = compr & 0x4444444444444444;
  144. const auto b24 = b4 & (b2 * 2);
  145. const unsigned c7 = __builtin_popcountll(b24);
  146. acc3 += __builtin_popcountll(b2) - c7;
  147. acc5 += __builtin_popcountll(b4) - c7;
  148. acc7 += c7;
  149. }
  150. }
  151. const auto prod4 = acc1 * bexp<3>(acc3) * bexp<5>(acc5) * bexp<7>(acc7);
  152. timepoint end4 = time_clk::now();
  153. duration d4 = end4 - start4;
  154. cout << "binary exp: res = " << prod4 << ", time = " << d4.count() / size << " ns\n";
  155.  
  156. auto words = reinterpret_cast<uint16_t*>(bytes);
  157. timepoint start5 = time_clk::now();
  158. auto prod5 = 1u;
  159. for (size_t i = 0; i < size / 2; i += 4)
  160. {
  161. const auto t1 = uint32_t(words[i + 0]) * words[i + 1];
  162. const auto t2 = uint32_t(words[i + 2]) * words[i + 3];
  163. const auto t3 = uint64_t(t1 & 0xFF00FF) * (t2 & 0xFF00FF);
  164. prod5 *= uint32_t(t3 >> 32) * uint32_t(t3 & 0xFFFF);
  165. }
  166. timepoint end5 = time_clk::now();
  167. duration d5 = end5 - start5;
  168. cout << "2 mul in 1: res = " << prod5 << ", time = " << d5.count() / size << " ns\n";
  169.  
  170. fillMtable();
  171. timepoint start6 = time_clk::now();
  172. unsigned a1 = 1;
  173. unsigned a2 = 1;
  174. for (size_t i = 0; i < size / 8; i += 2)
  175. {
  176. constexpr uint64_t lsb = 0x0101010101010101;
  177. if ((qwords[i] & lsb) != lsb || (qwords[i + 1] & lsb) != lsb)
  178. { // if there is at least one even value
  179. auto b = reinterpret_cast<uint8_t*>(qwords + i);
  180. acc1 *= accumulate(b, b + 16, acc1, multiplies<unsigned>{});
  181. if (!acc1)
  182. break;
  183. }
  184. else
  185. {
  186. a1 *= mtable[_pext_u64(qwords[i + 0], 0x0606060606060606)];
  187. a2 *= mtable[_pext_u64(qwords[i + 1], 0x0606060606060606)];
  188. }
  189. }
  190. const auto prod6 = a1 * a2;
  191. timepoint end6 = time_clk::now();
  192. duration d6 = end6 - start6;
  193. cout << "mult table: res = " << prod6 << ", time = " << d6.count() / size << " ns\n";
  194.  
  195. return 0;
  196. }
  197.  
Compilation error #stdin compilation error #stdout 0s 0KB
stdin
Standard input is empty
compilation info
prog.cpp: In function ‘void fillMtable()’:
prog.cpp:31:46: error: ‘_pdep_u64’ was not declared in this scope
         dep = _pdep_u64(i, 0x0606060606060606) | 0x0101010101010101;
                                              ^
prog.cpp: In function ‘int main()’:
prog.cpp:78:34: error: expected type-specifier before ‘__m128i’
     auto m128 = reinterpret_cast<__m128i*>(bytes);
                                  ^
prog.cpp:78:34: error: expected ‘>’ before ‘__m128i’
prog.cpp:78:34: error: expected ‘(’ before ‘__m128i’
prog.cpp:78:34: error: ‘__m128i’ was not declared in this scope
prog.cpp:78:42: error: expected primary-expression before ‘>’ token
     auto m128 = reinterpret_cast<__m128i*>(bytes);
                                          ^
prog.cpp:78:50: error: expected ‘)’ before ‘;’ token
     auto m128 = reinterpret_cast<__m128i*>(bytes);
                                                  ^
prog.cpp:80:5: error: ‘__m256i’ was not declared in this scope
     __m256i pv1 = _mm256_set1_epi32(1);
     ^
prog.cpp:80:13: error: expected ‘;’ before ‘pv1’
     __m256i pv1 = _mm256_set1_epi32(1);
             ^
prog.cpp:81:13: error: expected ‘;’ before ‘pv2’
     __m256i pv2 = _mm256_set1_epi32(1);
             ^
prog.cpp:82:13: error: expected ‘;’ before ‘pv3’
     __m256i pv3 = _mm256_set1_epi32(1);
             ^
prog.cpp:83:13: error: expected ‘;’ before ‘pv4’
     __m256i pv4 = _mm256_set1_epi32(1);
             ^
prog.cpp:84:13: error: expected ‘;’ before ‘pv5’
     __m256i pv5 = _mm256_set1_epi32(1);
             ^
prog.cpp:85:13: error: expected ‘;’ before ‘pv6’
     __m256i pv6 = _mm256_set1_epi32(1);
             ^
prog.cpp:86:13: error: expected ‘;’ before ‘pv7’
     __m256i pv7 = _mm256_set1_epi32(1);
             ^
prog.cpp:87:13: error: expected ‘;’ before ‘pv8’
     __m256i pv8 = _mm256_set1_epi32(1);
             ^
prog.cpp:90:15: error: ‘__m256i’ does not name a type
         const __m256i t1 = _mm256_mullo_epi16(_mm256_cvtepu8_epi16(m128[i + 0]),
               ^
prog.cpp:92:9: error: ‘pv1’ was not declared in this scope
         pv1 = _mm256_mullo_epi32(pv1, _mm256_and_si256(t1, _mm256_set1_epi32(0xFFFF)));
         ^
prog.cpp:92:56: error: ‘t1’ was not declared in this scope
         pv1 = _mm256_mullo_epi32(pv1, _mm256_and_si256(t1, _mm256_set1_epi32(0xFFFF)));
                                                        ^
prog.cpp:92:84: error: ‘_mm256_set1_epi32’ was not declared in this scope
         pv1 = _mm256_mullo_epi32(pv1, _mm256_and_si256(t1, _mm256_set1_epi32(0xFFFF)));
                                                                                    ^
prog.cpp:92:85: error: ‘_mm256_and_si256’ was not declared in this scope
         pv1 = _mm256_mullo_epi32(pv1, _mm256_and_si256(t1, _mm256_set1_epi32(0xFFFF)));
                                                                                     ^
prog.cpp:92:86: error: ‘_mm256_mullo_epi32’ was not declared in this scope
         pv1 = _mm256_mullo_epi32(pv1, _mm256_and_si256(t1, _mm256_set1_epi32(0xFFFF)));
                                                                                      ^
prog.cpp:93:9: error: ‘pv2’ was not declared in this scope
         pv2 = _mm256_mullo_epi32(pv2, _mm256_srli_epi32(t1, 16));
         ^
prog.cpp:93:63: error: ‘_mm256_srli_epi32’ was not declared in this scope
         pv2 = _mm256_mullo_epi32(pv2, _mm256_srli_epi32(t1, 16));
                                                               ^
prog.cpp:94:15: error: ‘__m256i’ does not name a type
         const __m256i t2 = _mm256_mullo_epi16(_mm256_cvtepu8_epi16(m128[i + 2]),
               ^
prog.cpp:96:9: error: ‘pv3’ was not declared in this scope
         pv3 = _mm256_mullo_epi32(pv3, _mm256_and_si256(t2, _mm256_set1_epi32(0xFFFF)));
         ^
prog.cpp:96:56: error: ‘t2’ was not declared in this scope
         pv3 = _mm256_mullo_epi32(pv3, _mm256_and_si256(t2, _mm256_set1_epi32(0xFFFF)));
                                                        ^
prog.cpp:97:9: error: ‘pv4’ was not declared in this scope
         pv4 = _mm256_mullo_epi32(pv4, _mm256_srli_epi32(t2, 16));
         ^
prog.cpp:98:15: error: ‘__m256i’ does not name a type
         const __m256i t3 = _mm256_mullo_epi16(_mm256_cvtepu8_epi16(m128[i + 4]),
               ^
prog.cpp:100:9: error: ‘pv5’ was not declared in this scope
         pv5 = _mm256_mullo_epi32(pv5, _mm256_and_si256(t3, _mm256_set1_epi32(0xFFFF)));
         ^
prog.cpp:100:56: error: ‘t3’ was not declared in this scope
         pv5 = _mm256_mullo_epi32(pv5, _mm256_and_si256(t3, _mm256_set1_epi32(0xFFFF)));
                                                        ^
prog.cpp:101:9: error: ‘pv6’ was not declared in this scope
         pv6 = _mm256_mullo_epi32(pv6, _mm256_srli_epi32(t3, 16));
         ^
prog.cpp:102:15: error: ‘__m256i’ does not name a type
         const __m256i t4 = _mm256_mullo_epi16(_mm256_cvtepu8_epi16(m128[i + 6]),
               ^
prog.cpp:104:9: error: ‘pv7’ was not declared in this scope
         pv7 = _mm256_mullo_epi32(pv7, _mm256_and_si256(t4, _mm256_set1_epi32(0xFFFF)));
         ^
prog.cpp:104:56: error: ‘t4’ was not declared in this scope
         pv7 = _mm256_mullo_epi32(pv7, _mm256_and_si256(t4, _mm256_set1_epi32(0xFFFF)));
                                                        ^
prog.cpp:105:9: error: ‘pv8’ was not declared in this scope
         pv8 = _mm256_mullo_epi32(pv8, _mm256_srli_epi32(t4, 16));
         ^
prog.cpp:107:5: error: ‘pv1’ was not declared in this scope
     pv1 = _mm256_mullo_epi32(pv1, pv2);
     ^
prog.cpp:107:35: error: ‘pv2’ was not declared in this scope
     pv1 = _mm256_mullo_epi32(pv1, pv2);
                                   ^
prog.cpp:107:38: error: ‘_mm256_mullo_epi32’ was not declared in this scope
     pv1 = _mm256_mullo_epi32(pv1, pv2);
                                      ^
prog.cpp:108:5: error: ‘pv3’ was not declared in this scope
     pv3 = _mm256_mullo_epi32(pv3, pv4);
     ^
prog.cpp:108:35: error: ‘pv4’ was not declared in this scope
     pv3 = _mm256_mullo_epi32(pv3, pv4);
                                   ^
prog.cpp:109:5: error: ‘pv5’ was not declared in this scope
     pv5 = _mm256_mullo_epi32(pv5, pv6);
     ^
prog.cpp:109:35: error: ‘pv6’ was not declared in this scope
     pv5 = _mm256_mullo_epi32(pv5, pv6);
                                   ^
prog.cpp:110:5: error: ‘pv7’ was not declared in this scope
     pv7 = _mm256_mullo_epi32(pv7, pv8);
     ^
prog.cpp:110:35: error: ‘pv8’ was not declared in this scope
     pv7 = _mm256_mullo_epi32(pv7, pv8);
                                   ^
prog.cpp:114:13: error: expected ‘;’ before ‘hi’
     __m128i hi = _mm256_extracti128_si256(pv1, 1);
             ^
prog.cpp:115:13: error: expected ‘;’ before ‘lo’
     __m128i lo = _mm256_extracti128_si256(pv1, 0);
             ^
prog.cpp:116:5: error: ‘lo’ was not declared in this scope
     lo = _mm_mullo_epi32(hi, lo);
     ^
prog.cpp:116:26: error: ‘hi’ was not declared in this scope
     lo = _mm_mullo_epi32(hi, lo);
                          ^
prog.cpp:116:32: error: ‘_mm_mullo_epi32’ was not declared in this scope
     lo = _mm_mullo_epi32(hi, lo);
                                ^
prog.cpp:117:56: error: ‘_mm_extract_epi32’ was not declared in this scope
     const auto prod3 = unsigned(_mm_extract_epi32(lo, 0) * _mm_extract_epi32(lo, 1) *
                                                        ^
prog.cpp:186:69: error: ‘_pext_u64’ was not declared in this scope
             a1 *= mtable[_pext_u64(qwords[i + 0], 0x0606060606060606)];
                                                                     ^
stdout
Standard output is empty