#include <iostream>
#include <random>
#include <ctime>
#include <cassert>
#include <chrono>
#include <immintrin.h>
const int Q = 30000000;
const int N = 1 << 24;
using T = uint32_t;
T a[2 * N];
const T identity_element = 0;
T reduce(T u, T v) { return u + v; }
__attribute__((target("sse4.1"))) __m128i reduce(__m128i u, __m128i v) { return _mm_add_epi32(u, v); }
__attribute__((target("avx2"))) __m256i reduce(__m256i u, __m256i v) { return _mm256_add_epi32(u, v); }
static_assert(sizeof(T) == 4, "Segment tree elements must be 32-bit");
T query_recursive_inner(int v, int vl, int vr, int l, int r)
{
if(l >= r) return identity_element;
if(l <= vl && vr <= r) return a[v];
int vm = (vl + vr) >> 1;
return reduce(query_recursive_inner(v << 1, vl, vm, l, std::min(r, vm)), query_recursive_inner((v << 1) | 1, vm, vr, std::max(l, vm), r));
}
T query_recursive_inner(int l, int r)
{
return query_recursive_inner(1, 0, N, l, r + 1);
}
T query_recursive_outer(int v, int vl, int vr, int l, int r)
{
if(vl == l && vr == r) return a[v];
int vm = (vl + vr) >> 1;
if(r <= vm) return query_recursive_outer(v << 1, vl, vm, l, r);
if(l >= vm) return query_recursive_outer((v << 1) | 1, vm, vr, l, r);
return reduce(query_recursive_outer(v << 1, vl, vm, l, vm), query_recursive_outer((v << 1) | 1, vm, vr, vm, r));
}
T query_recursive_outer(int l, int r)
{
return query_recursive_outer(1, 0, N, l, r + 1);
}
T query_bottom_up(int l, int r)
{
l += N;
r += N;
T ans = identity_element;
while(l <= r)
{
if(l & 1)
{
ans = reduce(ans, a[l]);
l++;
}
if(!(r & 1))
{
ans = reduce(ans, a[r]);
r--;
}
l >>= 1;
r >>= 1;
}
return ans;
}
int ffs(unsigned int x) { return sizeof(unsigned int) * 8 - 1 - __builtin_clz(x); }
__attribute__((target("avx2"))) T query_parallel(int l, int r)
{
if(l == r) return a[l + N];
int mbit = ffs(l ^ r);
int reset = ((1 << mbit) - 1);
int m = r & ~reset;
using vecint = T __attribute__((vector_size(32)));
__m256i identity_vec = _mm256_set1_epi32(identity_element);
vecint vec_ans = (vecint)identity_vec;
__m256i indexes = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
if((l & reset) != 0) {
int ll = l - 1 + N;
int rr = m - 1 + N;
int modbit = 0;
int maxmodbit = ffs(ll ^ rr) + 1;
vecint ll_vec = (vecint)_mm256_srav_epi32(_mm256_set1_epi32(ll), indexes);
#define LOOP(content) if(modbit + 8 <= maxmodbit) { \
vec_ans = (vecint)reduce((__m256i)vec_ans, _mm256_i32gather_epi32((int*)a, (__m256i)(((ll_vec & 1) - 1) & (ll_vec | 1)), 4)); \
ll_vec >>= 8; \
modbit += 8; \
content \
}
LOOP(LOOP(LOOP(LOOP())))
#undef LOOP
__m256i tmp = _mm256_i32gather_epi32((int*)a, (__m256i)(((ll_vec & 1) - 1) & (ll_vec | 1)), 4);
__m256i mask = _mm256_cmpgt_epi32(_mm256_set1_epi32(maxmodbit & 7), indexes);
vec_ans = (vecint)reduce((__m256i)vec_ans, _mm256_blendv_epi8(identity_vec, tmp, mask));
}
else
vec_ans[0] = reduce(vec_ans[0], a[(l + N) >> mbit]);
if((r & reset) != reset)
{
int ll = m + N;
int rr = r + 1 + N;
int modbit = 0;
int maxmodbit = ffs(ll ^ rr) + 1;
vecint rr_vec = (vecint)_mm256_srav_epi32(_mm256_set1_epi32(rr), indexes);
#define LOOP(content) if(modbit + 8 <= maxmodbit) { \
vec_ans = (vecint)reduce((__m256i)vec_ans, _mm256_i32gather_epi32((int*)a, (__m256i)(~((rr_vec & 1) - 1) & (rr_vec - 1)), 4)); \
rr_vec >>= 8; \
modbit += 8; \
content \
}
LOOP(LOOP(LOOP(LOOP())))
#undef LOOP
__m256i tmp = _mm256_i32gather_epi32((int*)a, (__m256i)(~((rr_vec & 1) - 1) & (rr_vec - 1)), 4);
__m256i mask = _mm256_cmpgt_epi32(_mm256_set1_epi32(maxmodbit & 7), indexes);
vec_ans = (vecint)reduce((__m256i)vec_ans, _mm256_blendv_epi8(identity_vec, tmp, mask));
}
else
vec_ans[0] = reduce(vec_ans[0], a[(r + N) >> mbit]);
// vec_ans = 7 6 5 4 3 2 1 0
__m128i low128 = _mm256_castsi256_si128((__m256i)vec_ans); // 3 2 1 0
__m128i high128 = _mm256_extractf128_si256((__m256i)vec_ans, 1); // 7 6 5 4
__m128i ans128 = reduce(low128, high128); // 7+3 6+2 5+1 4+0
T ans = identity_element;
for(int i = 0; i < 4; i++) ans = reduce(ans, ((T __attribute__((vector_size(16))))ans128)[i]);
return ans;
}
T pref[N];
T query_prefix(int l, int r) { return pref[r] - (l == 0 ? 0 : pref[l - 1]); }
T fenwick[N];
T query_fenwick(int r)
{
T ans = 0;
for (; r >= 0; r = (r & (r + 1)) - 1) ans += fenwick[r];
return ans;
}
T query_fenwick(int l, int r)
{
return query_fenwick(r) - (l == 0 ? 0 : query_fenwick(l - 1));
}
int main()
{
//std::mt19937 rng(1447);
std::mt19937 rng(std::chrono::steady_clock::now().time_since_epoch().count());
std::pair<int, int>* queries = new std::pair<int, int>[Q];
for(int i = 0; i < Q; i++)
{
int l = rng() % N;
int r = rng() % N;
if(l > r) std::swap(l, r);
queries[i] = {l, r};
}
for(int i = 0; i < N; i++) a[N + i] = rng() % (32768);
for(int i = N - 1; i >= 1; i--) a[i] = reduce(a[i * 2], a[i * 2 + 1]);
a[0] = identity_element;
for(int i = 0; i < N; i++) pref[i] = (i == 0 ? 0 : pref[i - 1]) + a[N + i];
for(int i = 0; i < N; i++)
for (int j = i; j < N; j = (j | (j + 1)))
fenwick[j] += a[N + i];
#define CHECK(func) { \
auto clock_start = clock(); \
T checksum = 0; \
for(int i = 0; i < Q; i++) { \
checksum += func(queries[i].first, queries[i].second); \
} \
std::cout << #func << ": " << (double)(clock() - clock_start) / CLOCKS_PER_SEC << " seconds (checksum: " << checksum << ")" << std::endl; \
}
CHECK(query_recursive_inner)
CHECK(query_recursive_outer)
CHECK(query_bottom_up)
CHECK(query_parallel)
CHECK(query_fenwick)
CHECK(query_prefix)
delete [] queries;
return 0;
}
I2luY2x1ZGUgPGlvc3RyZWFtPgojaW5jbHVkZSA8cmFuZG9tPgojaW5jbHVkZSA8Y3RpbWU+CiNpbmNsdWRlIDxjYXNzZXJ0PgojaW5jbHVkZSA8Y2hyb25vPgojaW5jbHVkZSA8aW1taW50cmluLmg+Cgpjb25zdCBpbnQgUSA9IDMwMDAwMDAwOwpjb25zdCBpbnQgTiA9IDEgPDwgMjQ7Cgp1c2luZyBUID0gdWludDMyX3Q7ClQgYVsyICogTl07Cgpjb25zdCBUIGlkZW50aXR5X2VsZW1lbnQgPSAwOwpUIHJlZHVjZShUIHUsIFQgdikgeyByZXR1cm4gdSArIHY7IH0KX19hdHRyaWJ1dGVfXygodGFyZ2V0KCJzc2U0LjEiKSkpIF9fbTEyOGkgcmVkdWNlKF9fbTEyOGkgdSwgX19tMTI4aSB2KSB7IHJldHVybiBfbW1fYWRkX2VwaTMyKHUsIHYpOyB9Cl9fYXR0cmlidXRlX18oKHRhcmdldCgiYXZ4MiIpKSkgX19tMjU2aSByZWR1Y2UoX19tMjU2aSB1LCBfX20yNTZpIHYpIHsgcmV0dXJuIF9tbTI1Nl9hZGRfZXBpMzIodSwgdik7IH0KCnN0YXRpY19hc3NlcnQoc2l6ZW9mKFQpID09IDQsICJTZWdtZW50IHRyZWUgZWxlbWVudHMgbXVzdCBiZSAzMi1iaXQiKTsKClQgcXVlcnlfcmVjdXJzaXZlX2lubmVyKGludCB2LCBpbnQgdmwsIGludCB2ciwgaW50IGwsIGludCByKSAKewoJaWYobCA+PSByKSByZXR1cm4gaWRlbnRpdHlfZWxlbWVudDsKCWlmKGwgPD0gdmwgJiYgdnIgPD0gcikgcmV0dXJuIGFbdl07CglpbnQgdm0gPSAodmwgKyB2cikgPj4gMTsKCXJldHVybiByZWR1Y2UocXVlcnlfcmVjdXJzaXZlX2lubmVyKHYgPDwgMSwgdmwsIHZtLCBsLCBzdGQ6Om1pbihyLCB2bSkpLCBxdWVyeV9yZWN1cnNpdmVfaW5uZXIoKHYgPDwgMSkgfCAxLCB2bSwgdnIsIHN0ZDo6bWF4KGwsIHZtKSwgcikpOwp9CgpUIHF1ZXJ5X3JlY3Vyc2l2ZV9pbm5lcihpbnQgbCwgaW50IHIpIAp7CglyZXR1cm4gcXVlcnlfcmVjdXJzaXZlX2lubmVyKDEsIDAsIE4sIGwsIHIgKyAxKTsKfQoKVCBxdWVyeV9yZWN1cnNpdmVfb3V0ZXIoaW50IHYsIGludCB2bCwgaW50IHZyLCBpbnQgbCwgaW50IHIpIAp7CglpZih2bCA9PSBsICYmIHZyID09IHIpIHJldHVybiBhW3ZdOwoJCglpbnQgdm0gPSAodmwgKyB2cikgPj4gMTsKCWlmKHIgPD0gdm0pIHJldHVybiBxdWVyeV9yZWN1cnNpdmVfb3V0ZXIodiA8PCAxLCB2bCwgdm0sIGwsIHIpOwoJaWYobCA+PSB2bSkgcmV0dXJuIHF1ZXJ5X3JlY3Vyc2l2ZV9vdXRlcigodiA8PCAxKSB8IDEsIHZtLCB2ciwgbCwgcik7CglyZXR1cm4gcmVkdWNlKHF1ZXJ5X3JlY3Vyc2l2ZV9vdXRlcih2IDw8IDEsIHZsLCB2bSwgbCwgdm0pLCBxdWVyeV9yZWN1cnNpdmVfb3V0ZXIoKHYgPDwgMSkgfCAxLCB2bSwgdnIsIHZtLCByKSk7Cn0KClQgcXVlcnlfcmVjdXJzaXZlX291dGVyKGludCBsLCBpbnQgcikgCnsKCXJldHVybiBxdWVyeV9yZWN1cnNpdmVfb3V0ZXIoMSwgMCwgTiwgbCwgciArIDEpOwp9CgpUIHF1ZXJ5X2JvdHRvbV91cChpbnQgbCwgaW50IHIpIAp7CglsICs9IE47CglyICs9IE47CglUIGFucyA9IGlkZW50aXR5X2VsZW1lbnQ7Cgl3aGlsZShsIDw9IHIpIAoJewoJCWlmKGwgJiAxKSAKCQl7CgkJCWFucyA9IHJlZHVjZShhbnMsIGFbbF0pOwoJCQlsKys7CgkJfQoJCWlmKCEociAmIDEpKSAKCQl7CgkJCWFucyA9IHJlZHVjZShhbnMsIGFbcl0pOwoJCQlyLS07CgkJfQoJCWwgPj49IDE7CgkJciA+Pj0gMTsKCX0KCXJldHVybiBhbnM7Cn0KCgppbnQgZmZzKHVuc2lnbmVkIGludCB4KSB7IHJldHVybiBzaXplb2YodW5zaWduZWQgaW50KSAqIDggLSAxIC0gX19idWlsdGluX2Nseih4KTsgfQoKX19hdHRyaWJ1dGVfXygodGFyZ2V0KCJhdngyIikpKSBUIHF1ZXJ5X3BhcmFsbGVsKGludCBsLCBpbnQgcikgCnsKCWlmKGwgPT0gcikgcmV0dXJuIGFbbCArIE5dOwoKCWludCBtYml0ID0gZmZzKGwgXiByKTsKCWludCByZXNldCA9ICgoMSA8PCBtYml0KSAtIDEpOwoJaW50IG0gPSByICYgfnJlc2V0OwoKCXVzaW5nIHZlY2ludCA9IFQgX19hdHRyaWJ1dGVfXygodmVjdG9yX3NpemUoMzIpKSk7CglfX20yNTZpIGlkZW50aXR5X3ZlYyA9IF9tbTI1Nl9zZXQxX2VwaTMyKGlkZW50aXR5X2VsZW1lbnQpOwoJdmVjaW50IHZlY19hbnMgPSAodmVjaW50KWlkZW50aXR5X3ZlYzsKCV9fbTI1NmkgaW5kZXhlcyA9IF9tbTI1Nl9zZXRfZXBpMzIoNywgNiwgNSwgNCwgMywgMiwgMSwgMCk7CgoJaWYoKGwgJiByZXNldCkgIT0gMCkgewoJCWludCBsbCA9IGwgLSAxICsgTjsKCQlpbnQgcnIgPSBtIC0gMSArIE47CgoJCWludCBtb2RiaXQgPSAwOwoJCWludCBtYXhtb2RiaXQgPSBmZnMobGwgXiBycikgKyAxOwoKCQl2ZWNpbnQgbGxfdmVjID0gKHZlY2ludClfbW0yNTZfc3Jhdl9lcGkzMihfbW0yNTZfc2V0MV9lcGkzMihsbCksIGluZGV4ZXMpOwoKI2RlZmluZSBMT09QKGNvbnRlbnQpIGlmKG1vZGJpdCArIDggPD0gbWF4bW9kYml0KSB7IFwKCQkJdmVjX2FucyA9ICh2ZWNpbnQpcmVkdWNlKChfX20yNTZpKXZlY19hbnMsIF9tbTI1Nl9pMzJnYXRoZXJfZXBpMzIoKGludCopYSwgKF9fbTI1NmkpKCgobGxfdmVjICYgMSkgLSAxKSAmIChsbF92ZWMgfCAxKSksIDQpKTsgXAoJCQlsbF92ZWMgPj49IDg7IFwKCQkJbW9kYml0ICs9IDg7IFwKCQkJY29udGVudCBcCgkJfQoJCUxPT1AoTE9PUChMT09QKExPT1AoKSkpKQojdW5kZWYgTE9PUAoKCQlfX20yNTZpIHRtcCA9IF9tbTI1Nl9pMzJnYXRoZXJfZXBpMzIoKGludCopYSwgKF9fbTI1NmkpKCgobGxfdmVjICYgMSkgLSAxKSAmIChsbF92ZWMgfCAxKSksIDQpOwoJCV9fbTI1NmkgbWFzayA9IF9tbTI1Nl9jbXBndF9lcGkzMihfbW0yNTZfc2V0MV9lcGkzMihtYXhtb2RiaXQgJiA3KSwgaW5kZXhlcyk7CgkJdmVjX2FucyA9ICh2ZWNpbnQpcmVkdWNlKChfX20yNTZpKXZlY19hbnMsIF9tbTI1Nl9ibGVuZHZfZXBpOChpZGVudGl0eV92ZWMsIHRtcCwgbWFzaykpOwoJfSAKCWVsc2UgCgkJdmVjX2Fuc1swXSA9IHJlZHVjZSh2ZWNfYW5zWzBdLCBhWyhsICsgTikgPj4gbWJpdF0pOwoJCgoJaWYoKHIgJiByZXNldCkgIT0gcmVzZXQpIAoJewoJCWludCBsbCA9IG0gKyBOOwoJCWludCByciA9IHIgKyAxICsgTjsKCgkJaW50IG1vZGJpdCA9IDA7CgkJaW50IG1heG1vZGJpdCA9IGZmcyhsbCBeIHJyKSArIDE7CgoJCXZlY2ludCBycl92ZWMgPSAodmVjaW50KV9tbTI1Nl9zcmF2X2VwaTMyKF9tbTI1Nl9zZXQxX2VwaTMyKHJyKSwgaW5kZXhlcyk7CgojZGVmaW5lIExPT1AoY29udGVudCkgaWYobW9kYml0ICsgOCA8PSBtYXhtb2RiaXQpIHsgXAoJCQl2ZWNfYW5zID0gKHZlY2ludClyZWR1Y2UoKF9fbTI1NmkpdmVjX2FucywgX21tMjU2X2kzMmdhdGhlcl9lcGkzMigoaW50KilhLCAoX19tMjU2aSkofigocnJfdmVjICYgMSkgLSAxKSAmIChycl92ZWMgLSAxKSksIDQpKTsgXAoJCQlycl92ZWMgPj49IDg7IFwKCQkJbW9kYml0ICs9IDg7IFwKCQkJY29udGVudCBcCgkJfQoJCUxPT1AoTE9PUChMT09QKExPT1AoKSkpKQojdW5kZWYgTE9PUAoKCQlfX20yNTZpIHRtcCA9IF9tbTI1Nl9pMzJnYXRoZXJfZXBpMzIoKGludCopYSwgKF9fbTI1NmkpKH4oKHJyX3ZlYyAmIDEpIC0gMSkgJiAocnJfdmVjIC0gMSkpLCA0KTsKCQlfX20yNTZpIG1hc2sgPSBfbW0yNTZfY21wZ3RfZXBpMzIoX21tMjU2X3NldDFfZXBpMzIobWF4bW9kYml0ICYgNyksIGluZGV4ZXMpOwoJCXZlY19hbnMgPSAodmVjaW50KXJlZHVjZSgoX19tMjU2aSl2ZWNfYW5zLCBfbW0yNTZfYmxlbmR2X2VwaTgoaWRlbnRpdHlfdmVjLCB0bXAsIG1hc2spKTsKCX0gCgllbHNlIAoJCXZlY19hbnNbMF0gPSByZWR1Y2UodmVjX2Fuc1swXSwgYVsociArIE4pID4+IG1iaXRdKTsKCQoJLy8gdmVjX2FucyA9IDcgNiA1IDQgMyAyIDEgMAoJX19tMTI4aSBsb3cxMjggPSBfbW0yNTZfY2FzdHNpMjU2X3NpMTI4KChfX20yNTZpKXZlY19hbnMpOyAvLyAzIDIgMSAwCglfX20xMjhpIGhpZ2gxMjggPSBfbW0yNTZfZXh0cmFjdGYxMjhfc2kyNTYoKF9fbTI1NmkpdmVjX2FucywgMSk7IC8vIDcgNiA1IDQKCV9fbTEyOGkgYW5zMTI4ID0gcmVkdWNlKGxvdzEyOCwgaGlnaDEyOCk7IC8vIDcrMyA2KzIgNSsxIDQrMAoJVCBhbnMgPSBpZGVudGl0eV9lbGVtZW50OwoJZm9yKGludCBpID0gMDsgaSA8IDQ7IGkrKykgYW5zID0gcmVkdWNlKGFucywgKChUIF9fYXR0cmlidXRlX18oKHZlY3Rvcl9zaXplKDE2KSkpKWFuczEyOClbaV0pOwoJcmV0dXJuIGFuczsKfQoKVCBwcmVmW05dOwpUIHF1ZXJ5X3ByZWZpeChpbnQgbCwgaW50IHIpIHsgcmV0dXJuIHByZWZbcl0gLSAobCA9PSAwID8gMCA6IHByZWZbbCAtIDFdKTsgfQoKVCBmZW53aWNrW05dOwoKVCBxdWVyeV9mZW53aWNrKGludCByKQp7CglUIGFucyA9IDA7Cglmb3IgKDsgciA+PSAwOyByID0gKHIgJiAociArIDEpKSAtIDEpIGFucyArPSBmZW53aWNrW3JdOwoJcmV0dXJuIGFuczsKfQoKVCBxdWVyeV9mZW53aWNrKGludCBsLCBpbnQgcikKewoJcmV0dXJuIHF1ZXJ5X2ZlbndpY2socikgLSAobCA9PSAwID8gMCA6IHF1ZXJ5X2ZlbndpY2sobCAtIDEpKTsKfQoKaW50IG1haW4oKSAKewoJLy9zdGQ6Om10MTk5Mzcgcm5nKDE0NDcpOwoJc3RkOjptdDE5OTM3IHJuZyhzdGQ6OmNocm9ubzo6c3RlYWR5X2Nsb2NrOjpub3coKS50aW1lX3NpbmNlX2Vwb2NoKCkuY291bnQoKSk7CgoJc3RkOjpwYWlyPGludCwgaW50PiogcXVlcmllcyA9IG5ldyBzdGQ6OnBhaXI8aW50LCBpbnQ+W1FdOwoJZm9yKGludCBpID0gMDsgaSA8IFE7IGkrKykgCgl7CgkJaW50IGwgPSBybmcoKSAlIE47CgkJaW50IHIgPSBybmcoKSAlIE47CgkJaWYobCA+IHIpIHN0ZDo6c3dhcChsLCByKTsKCQlxdWVyaWVzW2ldID0ge2wsIHJ9OwoJfQoJZm9yKGludCBpID0gMDsgaSA8IE47IGkrKykgYVtOICsgaV0gPSBybmcoKSAlICgzMjc2OCk7Cglmb3IoaW50IGkgPSBOIC0gMTsgaSA+PSAxOyBpLS0pIGFbaV0gPSByZWR1Y2UoYVtpICogMl0sIGFbaSAqIDIgKyAxXSk7CglhWzBdID0gaWRlbnRpdHlfZWxlbWVudDsKCglmb3IoaW50IGkgPSAwOyBpIDwgTjsgaSsrKSBwcmVmW2ldID0gKGkgPT0gMCA/IDAgOiBwcmVmW2kgLSAxXSkgKyBhW04gKyBpXTsKCQoJZm9yKGludCBpID0gMDsgaSA8IE47IGkrKykKCQlmb3IgKGludCBqID0gaTsgaiA8IE47IGogPSAoaiB8IChqICsgMSkpKSAKCQkJZmVud2lja1tqXSArPSBhW04gKyBpXTsKCiNkZWZpbmUgQ0hFQ0soZnVuYykgeyBcCgkJYXV0byBjbG9ja19zdGFydCA9IGNsb2NrKCk7IFwKCQlUIGNoZWNrc3VtID0gMDsgXAoJCWZvcihpbnQgaSA9IDA7IGkgPCBROyBpKyspIHsgXAoJCQljaGVja3N1bSArPSBmdW5jKHF1ZXJpZXNbaV0uZmlyc3QsIHF1ZXJpZXNbaV0uc2Vjb25kKTsgXAoJCX0gXAoJCXN0ZDo6Y291dCA8PCAjZnVuYyA8PCAiOiAiIDw8IChkb3VibGUpKGNsb2NrKCkgLSBjbG9ja19zdGFydCkgLyBDTE9DS1NfUEVSX1NFQyA8PCAiIHNlY29uZHMgKGNoZWNrc3VtOiAiIDw8IGNoZWNrc3VtIDw8ICIpIiA8PCBzdGQ6OmVuZGw7IFwKCX0KCglDSEVDSyhxdWVyeV9yZWN1cnNpdmVfaW5uZXIpCglDSEVDSyhxdWVyeV9yZWN1cnNpdmVfb3V0ZXIpCglDSEVDSyhxdWVyeV9ib3R0b21fdXApCglDSEVDSyhxdWVyeV9wYXJhbGxlbCkKCUNIRUNLKHF1ZXJ5X2ZlbndpY2spCglDSEVDSyhxdWVyeV9wcmVmaXgpCgkKCWRlbGV0ZSBbXSBxdWVyaWVzOwoKCXJldHVybiAwOwp9