// Micro-benchmark comparing several ways of multiplying together every byte of
// a 4 MiB buffer, with the product taken modulo 2^32 (unsigned wrap-around).
//
// The buffer is filled with odd values in {1, 3, 5, 7}; the packed/SIMD
// variants below rely on every byte being that small.
//
// Build note: the AVX2 (_mm256_*) and BMI2 (_pdep_u64/_pext_u64) intrinsics
// used here are only available when those instruction-set extensions are
// enabled for the target (with GCC/Clang typically -mavx2 -mbmi2).
#include <algorithm>
#include <chrono>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <functional>
#include <iostream>
#include <iterator>
#include <numeric>
#include <random>
#include <ratio>
#include <stdlib.h>
#include <x86intrin.h>

using time_clk = std::chrono::steady_clock;
using timepoint = std::chrono::time_point<time_clk>;
using duration = std::chrono::duration<double, std::nano>;

constexpr std::size_t kMegabyte = 1024 * 1024;
// Named kBufferSize rather than `size`: a global `size` combined with
// `using namespace std;` is ambiguous with std::size from C++17 onwards.
constexpr std::size_t kBufferSize = 4 * kMegabyte;

// Bits 1 and 2 of every byte: the only bits that vary between 1, 3, 5 and 7.
constexpr std::uint64_t kPairMask = 0x0606060606060606;
// Bit 0 of every byte: set in every odd value.
constexpr std::uint64_t kByteLsb = 0x0101010101010101;

alignas(64) std::uint8_t bytes[kBufferSize];
alignas(64) unsigned mtable[0x10000];

// Reads a T from raw bytes without violating strict aliasing.
template <typename T>
inline T loadAs(const std::uint8_t* p)
{
    T value;
    std::memcpy(&value, p, sizeof(T));
    return value;
}

// Fills mtable so that mtable[k] is the product of the eight odd bytes whose
// bits 1..2 are given, two bits per byte, by the 16-bit index k.
void fillMtable()
{
    for (std::size_t i = 0; i < 0x10000; ++i)
    {
        // Scatter the 16 index bits into bits 1..2 of each byte, then set bit 0.
        const std::uint64_t dep = _pdep_u64(i, kPairMask) | kByteLsb;
        unsigned product = 1u;
        // Extract the bytes with shifts instead of reading an inactive union member.
        for (unsigned shift = 0; shift < 64; shift += 8)
            product *= static_cast<unsigned>((dep >> shift) & 0xFF);
        mtable[i] = product;
    }
}

// Returns x raised to `pow` modulo 2^32, by binary exponentiation.
template<unsigned x>
unsigned bexp(unsigned pow)
{
    unsigned acc1 = x;   // x^(2^k) for the current bit k
    unsigned acc2 = 1;   // product of the powers selected so far
    for (; pow; pow >>= 1, acc1 *= acc1)
    {
        if (pow & 1)
            acc2 *= acc1;
    }
    return acc2;
}

// Prints one result line: the label, the product and the time per byte in ns.
static void report(const char* label, unsigned result, duration elapsed)
{
    std::cout << label << ": res = " << result
              << ", time = " << elapsed.count() / kBufferSize << " ns\n";
}

int main()
{
    constexpr std::size_t bits = 3;
    std::mt19937 rnd;   // default-seeded, so every run sees the same data
    auto ud = std::uniform_int_distribution<std::size_t>(0, (1 << (bits - 1)) - 1);
    std::generate_n(bytes, kBufferSize,
                    [&]{ return static_cast<std::uint8_t>(ud(rnd) * 2 + 1); });

    // 1. Plain std::accumulate.
    timepoint start1 = time_clk::now();
    const unsigned prod1 = std::accumulate(std::begin(bytes), std::end(bytes), 1u,
                                           std::multiplies<unsigned>{});
    timepoint end1 = time_clk::now();
    report("accumulate", prod1, end1 - start1);

    // 2. Four independent accumulators to shorten the dependency chain.
    timepoint start2 = time_clk::now();
    unsigned p1 = 1u, p2 = 1u, p3 = 1u, p4 = 1u;
    for (std::size_t i = 0; i < kBufferSize; i += 4)
    {
        p1 *= bytes[i + 0];
        p2 *= bytes[i + 1];
        p3 *= bytes[i + 2];
        p4 *= bytes[i + 3];
    }
    const unsigned prod2 = p1 * p2 * p3 * p4;
    timepoint end2 = time_clk::now();
    report(" accum * 4", prod2, end2 - start2);

    // 3. AVX2: widen bytes to 16 bits, multiply pairs of vectors, then fold the
    //    low and high 16-bit halves of each 32-bit lane into 32-bit accumulators.
    auto m128 = reinterpret_cast<const __m128i*>(bytes);
    timepoint start3 = time_clk::now();
    const __m256i lowHalf = _mm256_set1_epi32(0xFFFF);
    __m256i pv1 = _mm256_set1_epi32(1);
    __m256i pv2 = _mm256_set1_epi32(1);
    __m256i pv3 = _mm256_set1_epi32(1);
    __m256i pv4 = _mm256_set1_epi32(1);
    __m256i pv5 = _mm256_set1_epi32(1);
    __m256i pv6 = _mm256_set1_epi32(1);
    __m256i pv7 = _mm256_set1_epi32(1);
    __m256i pv8 = _mm256_set1_epi32(1);
    for (std::size_t i = 0; i < kBufferSize / 16; i += 8)
    {
        const __m256i t1 = _mm256_mullo_epi16(_mm256_cvtepu8_epi16(m128[i + 0]),
                                              _mm256_cvtepu8_epi16(m128[i + 1]));
        pv1 = _mm256_mullo_epi32(pv1, _mm256_and_si256(t1, lowHalf));
        pv2 = _mm256_mullo_epi32(pv2, _mm256_srli_epi32(t1, 16));
        const __m256i t2 = _mm256_mullo_epi16(_mm256_cvtepu8_epi16(m128[i + 2]),
                                              _mm256_cvtepu8_epi16(m128[i + 3]));
        pv3 = _mm256_mullo_epi32(pv3, _mm256_and_si256(t2, lowHalf));
        pv4 = _mm256_mullo_epi32(pv4, _mm256_srli_epi32(t2, 16));
        const __m256i t3 = _mm256_mullo_epi16(_mm256_cvtepu8_epi16(m128[i + 4]),
                                              _mm256_cvtepu8_epi16(m128[i + 5]));
        pv5 = _mm256_mullo_epi32(pv5, _mm256_and_si256(t3, lowHalf));
        pv6 = _mm256_mullo_epi32(pv6, _mm256_srli_epi32(t3, 16));
        const __m256i t4 = _mm256_mullo_epi16(_mm256_cvtepu8_epi16(m128[i + 6]),
                                              _mm256_cvtepu8_epi16(m128[i + 7]));
        pv7 = _mm256_mullo_epi32(pv7, _mm256_and_si256(t4, lowHalf));
        pv8 = _mm256_mullo_epi32(pv8, _mm256_srli_epi32(t4, 16));
    }
    pv1 = _mm256_mullo_epi32(pv1, pv2);
    pv3 = _mm256_mullo_epi32(pv3, pv4);
    pv5 = _mm256_mullo_epi32(pv5, pv6);
    pv7 = _mm256_mullo_epi32(pv7, pv8);
    pv1 = _mm256_mullo_epi32(pv1, pv3);
    pv5 = _mm256_mullo_epi32(pv5, pv7);
    pv1 = _mm256_mullo_epi32(pv1, pv5);
    __m128i hi = _mm256_extracti128_si256(pv1, 1);
    __m128i lo = _mm256_extracti128_si256(pv1, 0);
    lo = _mm_mullo_epi32(hi, lo);
    // _mm_extract_epi32 yields int; convert each lane to unsigned first so the
    // final products wrap modulo 2^32 instead of overflowing a signed int.
    const unsigned prod3 = static_cast<unsigned>(_mm_extract_epi32(lo, 0)) *
                           static_cast<unsigned>(_mm_extract_epi32(lo, 1)) *
                           static_cast<unsigned>(_mm_extract_epi32(lo, 2)) *
                           static_cast<unsigned>(_mm_extract_epi32(lo, 3));
    timepoint end3 = time_clk::now();
    report(" AVX2", prod3, end3 - start3);

    // 4. Count how many 3s, 5s and 7s occur, then raise each to its count.
    //    Two qwords are packed into one by placing the first in the high
    //    nibbles; this is only valid while every byte value is below 16.
    timepoint start4 = time_clk::now();
    unsigned acc1 = 1;
    unsigned acc3 = 0;
    unsigned acc5 = 0;
    unsigned acc7 = 0;
    for (std::size_t i = 0; i < kBufferSize / 8; i += 2)
    {
        const std::uint8_t* chunk = bytes + i * 8;
        const std::uint64_t compr =
            (loadAs<std::uint64_t>(chunk) << 4) | loadAs<std::uint64_t>(chunk + 8);
        constexpr std::uint64_t lsb = 0x1111111111111111;
        if ((compr & lsb) != lsb) // if there is at least one even value
        {
            // Fold the 16 bytes into the running product. The accumulator is
            // passed as the initial value, so plain assignment is required;
            // `*=` here would square the running product.
            acc1 = std::accumulate(chunk, chunk + 16, acc1, std::multiplies<unsigned>{});
            if (!acc1)
                break;   // the product is zero and can never recover
        }
        else
        {
            const auto b2 = compr & 0x2222222222222222;   // bit 1 set: value 3 or 7
            const auto b4 = compr & 0x4444444444444444;   // bit 2 set: value 5 or 7
            const auto b24 = b4 & (b2 * 2);               // both set: value 7
            const unsigned c7 = __builtin_popcountll(b24);
            acc3 += __builtin_popcountll(b2) - c7;
            acc5 += __builtin_popcountll(b4) - c7;
            acc7 += c7;
        }
    }
    const unsigned prod4 = acc1 * bexp<3>(acc3) * bexp<5>(acc5) * bexp<7>(acc7);
    timepoint end4 = time_clk::now();
    report("binary exp", prod4, end4 - start4);

    // 5. Two byte products per scalar multiplication: multiplying two 16-bit
    //    words leaves b0*b2 in bits 0..7 and b1*b3 in bits 16..23 (small inputs).
    timepoint start5 = time_clk::now();
    unsigned prod5 = 1u;
    for (std::size_t i = 0; i < kBufferSize / 2; i += 4)
    {
        const std::uint8_t* chunk = bytes + i * 2;
        const std::uint32_t w0 = loadAs<std::uint16_t>(chunk + 0);
        const std::uint32_t w1 = loadAs<std::uint16_t>(chunk + 2);
        const std::uint32_t w2 = loadAs<std::uint16_t>(chunk + 4);
        const std::uint32_t w3 = loadAs<std::uint16_t>(chunk + 6);
        const std::uint32_t t1 = w0 * w1;
        const std::uint32_t t2 = w2 * w3;
        const auto t3 = std::uint64_t(t1 & 0xFF00FF) * (t2 & 0xFF00FF);
        prod5 *= std::uint32_t(t3 >> 32) * std::uint32_t(t3 & 0xFFFF);
    }
    timepoint end5 = time_clk::now();
    report("2 mul in 1", prod5, end5 - start5);

    // 6. Lookup table: compress the two varying bits of each of 8 bytes into a
    //    16-bit index with pext and fetch the precomputed product.
    fillMtable();   // table construction is deliberately left outside the timed region
    timepoint start6 = time_clk::now();
    unsigned a1 = 1;
    unsigned a2 = 1;
    for (std::size_t i = 0; i < kBufferSize / 8; i += 2)
    {
        const std::uint8_t* chunk = bytes + i * 8;
        const std::uint64_t q0 = loadAs<std::uint64_t>(chunk);
        const std::uint64_t q1 = loadAs<std::uint64_t>(chunk + 8);
        if ((q0 & kByteLsb) != kByteLsb || (q1 & kByteLsb) != kByteLsb)
        { // if there is at least one even value
            // Fold into this section's own accumulator (a1), not the
            // accumulator left over from the "binary exp" section.
            a1 = std::accumulate(chunk, chunk + 16, a1, std::multiplies<unsigned>{});
            if (!a1)
                break;   // the product is zero and can never recover
        }
        else
        {
            a1 *= mtable[_pext_u64(q0, kPairMask)];
            a2 *= mtable[_pext_u64(q1, kPairMask)];
        }
    }
    const unsigned prod6 = a1 * a2;
    timepoint end6 = time_clk::now();
    report("mult table", prod6, end6 - start6);

    return 0;
}
Standard input is empty
prog.cpp: In function ‘void fillMtable()’:
prog.cpp:31:46: error: ‘_pdep_u64’ was not declared in this scope
dep = _pdep_u64(i, 0x0606060606060606) | 0x0101010101010101;
^
prog.cpp: In function ‘int main()’:
prog.cpp:78:34: error: expected type-specifier before ‘__m128i’
auto m128 = reinterpret_cast<__m128i*>(bytes);
^
prog.cpp:78:34: error: expected ‘>’ before ‘__m128i’
prog.cpp:78:34: error: expected ‘(’ before ‘__m128i’
prog.cpp:78:34: error: ‘__m128i’ was not declared in this scope
prog.cpp:78:42: error: expected primary-expression before ‘>’ token
auto m128 = reinterpret_cast<__m128i*>(bytes);
^
prog.cpp:78:50: error: expected ‘)’ before ‘;’ token
auto m128 = reinterpret_cast<__m128i*>(bytes);
^
prog.cpp:80:5: error: ‘__m256i’ was not declared in this scope
__m256i pv1 = _mm256_set1_epi32(1);
^
prog.cpp:80:13: error: expected ‘;’ before ‘pv1’
__m256i pv1 = _mm256_set1_epi32(1);
^
prog.cpp:81:13: error: expected ‘;’ before ‘pv2’
__m256i pv2 = _mm256_set1_epi32(1);
^
prog.cpp:82:13: error: expected ‘;’ before ‘pv3’
__m256i pv3 = _mm256_set1_epi32(1);
^
prog.cpp:83:13: error: expected ‘;’ before ‘pv4’
__m256i pv4 = _mm256_set1_epi32(1);
^
prog.cpp:84:13: error: expected ‘;’ before ‘pv5’
__m256i pv5 = _mm256_set1_epi32(1);
^
prog.cpp:85:13: error: expected ‘;’ before ‘pv6’
__m256i pv6 = _mm256_set1_epi32(1);
^
prog.cpp:86:13: error: expected ‘;’ before ‘pv7’
__m256i pv7 = _mm256_set1_epi32(1);
^
prog.cpp:87:13: error: expected ‘;’ before ‘pv8’
__m256i pv8 = _mm256_set1_epi32(1);
^
prog.cpp:90:15: error: ‘__m256i’ does not name a type
const __m256i t1 = _mm256_mullo_epi16(_mm256_cvtepu8_epi16(m128[i + 0]),
^
prog.cpp:92:9: error: ‘pv1’ was not declared in this scope
pv1 = _mm256_mullo_epi32(pv1, _mm256_and_si256(t1, _mm256_set1_epi32(0xFFFF)));
^
prog.cpp:92:56: error: ‘t1’ was not declared in this scope
pv1 = _mm256_mullo_epi32(pv1, _mm256_and_si256(t1, _mm256_set1_epi32(0xFFFF)));
^
prog.cpp:92:84: error: ‘_mm256_set1_epi32’ was not declared in this scope
pv1 = _mm256_mullo_epi32(pv1, _mm256_and_si256(t1, _mm256_set1_epi32(0xFFFF)));
^
prog.cpp:92:85: error: ‘_mm256_and_si256’ was not declared in this scope
pv1 = _mm256_mullo_epi32(pv1, _mm256_and_si256(t1, _mm256_set1_epi32(0xFFFF)));
^
prog.cpp:92:86: error: ‘_mm256_mullo_epi32’ was not declared in this scope
pv1 = _mm256_mullo_epi32(pv1, _mm256_and_si256(t1, _mm256_set1_epi32(0xFFFF)));
^
prog.cpp:93:9: error: ‘pv2’ was not declared in this scope
pv2 = _mm256_mullo_epi32(pv2, _mm256_srli_epi32(t1, 16));
^
prog.cpp:93:63: error: ‘_mm256_srli_epi32’ was not declared in this scope
pv2 = _mm256_mullo_epi32(pv2, _mm256_srli_epi32(t1, 16));
^
prog.cpp:94:15: error: ‘__m256i’ does not name a type
const __m256i t2 = _mm256_mullo_epi16(_mm256_cvtepu8_epi16(m128[i + 2]),
^
prog.cpp:96:9: error: ‘pv3’ was not declared in this scope
pv3 = _mm256_mullo_epi32(pv3, _mm256_and_si256(t2, _mm256_set1_epi32(0xFFFF)));
^
prog.cpp:96:56: error: ‘t2’ was not declared in this scope
pv3 = _mm256_mullo_epi32(pv3, _mm256_and_si256(t2, _mm256_set1_epi32(0xFFFF)));
^
prog.cpp:97:9: error: ‘pv4’ was not declared in this scope
pv4 = _mm256_mullo_epi32(pv4, _mm256_srli_epi32(t2, 16));
^
prog.cpp:98:15: error: ‘__m256i’ does not name a type
const __m256i t3 = _mm256_mullo_epi16(_mm256_cvtepu8_epi16(m128[i + 4]),
^
prog.cpp:100:9: error: ‘pv5’ was not declared in this scope
pv5 = _mm256_mullo_epi32(pv5, _mm256_and_si256(t3, _mm256_set1_epi32(0xFFFF)));
^
prog.cpp:100:56: error: ‘t3’ was not declared in this scope
pv5 = _mm256_mullo_epi32(pv5, _mm256_and_si256(t3, _mm256_set1_epi32(0xFFFF)));
^
prog.cpp:101:9: error: ‘pv6’ was not declared in this scope
pv6 = _mm256_mullo_epi32(pv6, _mm256_srli_epi32(t3, 16));
^
prog.cpp:102:15: error: ‘__m256i’ does not name a type
const __m256i t4 = _mm256_mullo_epi16(_mm256_cvtepu8_epi16(m128[i + 6]),
^
prog.cpp:104:9: error: ‘pv7’ was not declared in this scope
pv7 = _mm256_mullo_epi32(pv7, _mm256_and_si256(t4, _mm256_set1_epi32(0xFFFF)));
^
prog.cpp:104:56: error: ‘t4’ was not declared in this scope
pv7 = _mm256_mullo_epi32(pv7, _mm256_and_si256(t4, _mm256_set1_epi32(0xFFFF)));
^
prog.cpp:105:9: error: ‘pv8’ was not declared in this scope
pv8 = _mm256_mullo_epi32(pv8, _mm256_srli_epi32(t4, 16));
^
prog.cpp:107:5: error: ‘pv1’ was not declared in this scope
pv1 = _mm256_mullo_epi32(pv1, pv2);
^
prog.cpp:107:35: error: ‘pv2’ was not declared in this scope
pv1 = _mm256_mullo_epi32(pv1, pv2);
^
prog.cpp:107:38: error: ‘_mm256_mullo_epi32’ was not declared in this scope
pv1 = _mm256_mullo_epi32(pv1, pv2);
^
prog.cpp:108:5: error: ‘pv3’ was not declared in this scope
pv3 = _mm256_mullo_epi32(pv3, pv4);
^
prog.cpp:108:35: error: ‘pv4’ was not declared in this scope
pv3 = _mm256_mullo_epi32(pv3, pv4);
^
prog.cpp:109:5: error: ‘pv5’ was not declared in this scope
pv5 = _mm256_mullo_epi32(pv5, pv6);
^
prog.cpp:109:35: error: ‘pv6’ was not declared in this scope
pv5 = _mm256_mullo_epi32(pv5, pv6);
^
prog.cpp:110:5: error: ‘pv7’ was not declared in this scope
pv7 = _mm256_mullo_epi32(pv7, pv8);
^
prog.cpp:110:35: error: ‘pv8’ was not declared in this scope
pv7 = _mm256_mullo_epi32(pv7, pv8);
^
prog.cpp:114:13: error: expected ‘;’ before ‘hi’
__m128i hi = _mm256_extracti128_si256(pv1, 1);
^
prog.cpp:115:13: error: expected ‘;’ before ‘lo’
__m128i lo = _mm256_extracti128_si256(pv1, 0);
^
prog.cpp:116:5: error: ‘lo’ was not declared in this scope
lo = _mm_mullo_epi32(hi, lo);
^
prog.cpp:116:26: error: ‘hi’ was not declared in this scope
lo = _mm_mullo_epi32(hi, lo);
^
prog.cpp:116:32: error: ‘_mm_mullo_epi32’ was not declared in this scope
lo = _mm_mullo_epi32(hi, lo);
^
prog.cpp:117:56: error: ‘_mm_extract_epi32’ was not declared in this scope
const auto prod3 = unsigned(_mm_extract_epi32(lo, 0) * _mm_extract_epi32(lo, 1) *
^
prog.cpp:186:69: error: ‘_pext_u64’ was not declared in this scope
a1 *= mtable[_pext_u64(qwords[i + 0], 0x0606060606060606)];
^
Standard output is empty