enum {
kMaxSize = 80000000,
kMinSize = 1000000,
kRadix = 1500,
kRadix3 = 325,
kRadix3Sq = kRadix3 * kRadix3,
kMaxCounters = (kMaxSize + kRadix - 1) / kRadix,
};
struct IndexValue
{
unsigned index;
unsigned value;
};
unsigned g_input[kMaxSize];
unsigned g_index[kMaxSize];
struct IndexValue g_sort[kMaxSize];
struct IndexValue g_sort2[kMaxSize];
unsigned g_counters[kMaxCounters];
unsigned g_output[kMaxSize];
void reorder3(const unsigned size)
{
const unsigned min_bucket1 = size / kRadix3;
const unsigned large_buckets1 = size % kRadix3;
g_counters[0] = 0;
for (unsigned i = 1; i <= large_buckets1; ++i)
g_counters[i] = g_counters[i - 1] + min_bucket1 + 1;
for (unsigned i = large_buckets1 + 1; i < kRadix3; ++i)
g_counters[i] = g_counters[i - 1] + min_bucket1;
for (unsigned i = 0; i < size; ++i)
{
const unsigned dst = g_counters[g_index[i] % kRadix3]++;
g_sort[dst].index = g_index[i] / kRadix3;
g_sort[dst].value = g_input[i];
__builtin_prefetch(&g_sort[dst + 4].value, 1);
}
const unsigned min_bucket2 = size / kRadix3Sq * kRadix3;
const unsigned large_buckets2 = size / kRadix3 % kRadix3;
g_counters[0] = 0;
for (unsigned i = 1; i <= large_buckets2; ++i)
g_counters[i] = g_counters[i - 1] + min_bucket2 + kRadix3;
g_counters[large_buckets2 + 1] = g_counters[large_buckets2] +
min_bucket2 + large_buckets1;
for (unsigned i = large_buckets2 + 2; i < kRadix3; ++i)
g_counters[i] = g_counters[i - 1] + min_bucket2;
for (unsigned i = 0; i < size; ++i)
{
const unsigned dst = g_counters[g_sort[i].index % kRadix3]++;
g_sort2[dst].index = g_sort[i].index / kRadix3;
g_sort2[dst].value = g_sort[i].value;
__builtin_prefetch(&g_sort2[dst + 4].value, 1);
}
g_counters[0] = 0;
for (unsigned i = 1; i < (size + kRadix3Sq - 1) / kRadix3Sq; ++i)
g_counters[i] = g_counters[i - 1] + kRadix3Sq;
for (unsigned i = 0; i < size; ++i)
{
const unsigned dst = g_counters[g_sort2[i].index]++;
g_output[dst] = g_sort2[i].value;
__builtin_prefetch(&g_output[dst + 4], 1);
}
}
void reorder2(const unsigned size)
{
const unsigned min_bucket = size / kRadix;
const unsigned large_buckets = size % kRadix;
g_counters[0] = 0;
for (unsigned i = 1; i <= large_buckets; ++i)
g_counters[i] = g_counters[i - 1] + min_bucket + 1;
for (unsigned i = large_buckets + 1; i < kRadix; ++i)
g_counters[i] = g_counters[i - 1] + min_bucket;
for (unsigned i = 0; i < size; ++i)
{
const unsigned dst = g_counters[g_index[i] % kRadix]++;
g_sort[dst].index = g_index[i] / kRadix;
g_sort[dst].value = g_input[i];
__builtin_prefetch(&g_sort[dst + 1].value, 1);
}
g_counters[0] = 0;
for (unsigned i = 1; i < (size + kRadix - 1) / kRadix; ++i)
g_counters[i] = g_counters[i - 1] + kRadix;
for (unsigned i = 0; i < size; ++i)
{
const unsigned dst = g_counters[g_sort[i].index]++;
g_output[dst] = g_sort[i].value;
__builtin_prefetch(&g_output[dst + 1], 1);
}
}
void reorder1(const unsigned size)
{
for (unsigned i = 0; i < size; ++i)
g_output[g_index[i]] = g_input[i];
}
#include <iostream>
#include <random>
#include <chrono>
#include <algorithm>
#include <numeric>
using namespace std;
int main()
{
mt19937_64 rng(random_device{}());
for (unsigned size = kMaxSize; size > kMinSize; size -= size / 4)
//for (unsigned size = 64*1024*kRadix; size > kMinSize; size -= size / 2)
{
iota(begin(g_index), begin(g_index) + size, 0);
shuffle(begin(g_index), begin(g_index) + size, rng);
copy(begin(g_index), begin(g_index) + size, begin(g_input));
const auto start = chrono::steady_clock::now();
reorder3(size);
const chrono::nanoseconds time = chrono::steady_clock::now() - start;
const auto b = begin(g_output);
const auto e = begin(g_output) + size;
bool ok = adjacent_find(b, e, [&](auto l, auto r){return l + 1 != r;})
== e;
cout << size << ' ' << time.count() / double(size)
<< ' ' << boolalpha << ok << '\n';
}
}