enum {
    kMaxSize = 80000000,
    kMinSize = 1000000,
    kRadix = 1500,
    kRadix3 = 325,
    kRadix3Sq = kRadix3 * kRadix3,
    kMaxCounters = (kMaxSize + kRadix - 1) / kRadix,
};

struct IndexValue
{
    unsigned index;
    unsigned value;
};

unsigned g_input[kMaxSize];
unsigned g_index[kMaxSize];
struct IndexValue g_sort[kMaxSize];
struct IndexValue g_sort2[kMaxSize];
unsigned g_counters[kMaxCounters];
unsigned g_output[kMaxSize];

void reorder3(const unsigned size)
{
    const unsigned min_bucket1 = size / kRadix3;
    const unsigned large_buckets1 = size % kRadix3;
    g_counters[0] = 0;

    for (unsigned i = 1; i <= large_buckets1; ++i)
        g_counters[i] = g_counters[i - 1] + min_bucket1 + 1;

    for (unsigned i = large_buckets1 + 1; i < kRadix3; ++i)
        g_counters[i] = g_counters[i - 1] + min_bucket1;

    for (unsigned i = 0; i < size; ++i)
    {
        const unsigned dst = g_counters[g_index[i] % kRadix3]++;
        g_sort[dst].index = g_index[i] / kRadix3;
        g_sort[dst].value = g_input[i];
        __builtin_prefetch(&g_sort[dst + 4].value, 1);
    }

    const unsigned min_bucket2 = size / kRadix3Sq * kRadix3;
    const unsigned large_buckets2 = size / kRadix3 % kRadix3;
    g_counters[0] = 0;

    for (unsigned i = 1; i <= large_buckets2; ++i)
        g_counters[i] = g_counters[i - 1] + min_bucket2 + kRadix3;

    g_counters[large_buckets2 + 1] = g_counters[large_buckets2] +
                                     min_bucket2 + large_buckets1;

    for (unsigned i = large_buckets2 + 2; i < kRadix3; ++i)
        g_counters[i] = g_counters[i - 1] + min_bucket2;

    for (unsigned i = 0; i < size; ++i)
    {
        const unsigned dst = g_counters[g_sort[i].index % kRadix3]++;
        g_sort2[dst].index = g_sort[i].index / kRadix3;
        g_sort2[dst].value = g_sort[i].value;
        __builtin_prefetch(&g_sort2[dst + 4].value, 1);
    }

    g_counters[0] = 0;

    for (unsigned i = 1; i < (size + kRadix3Sq - 1) / kRadix3Sq; ++i)
        g_counters[i] = g_counters[i - 1] + kRadix3Sq;

    for (unsigned i = 0; i < size; ++i)
    {
        const unsigned dst = g_counters[g_sort2[i].index]++;
        g_output[dst] = g_sort2[i].value;
        __builtin_prefetch(&g_output[dst + 4], 1);
    }
}

void reorder2(const unsigned size)
{
    const unsigned min_bucket = size / kRadix;
    const unsigned large_buckets = size % kRadix;
    g_counters[0] = 0;

    for (unsigned i = 1; i <= large_buckets; ++i)
        g_counters[i] = g_counters[i - 1] + min_bucket + 1;

    for (unsigned i = large_buckets + 1; i < kRadix; ++i)
        g_counters[i] = g_counters[i - 1] + min_bucket;

    for (unsigned i = 0; i < size; ++i)
    {
        const unsigned dst = g_counters[g_index[i] % kRadix]++;
        g_sort[dst].index = g_index[i] / kRadix;
        g_sort[dst].value = g_input[i];
        __builtin_prefetch(&g_sort[dst + 1].value, 1);
    }

    g_counters[0] = 0;

    for (unsigned i = 1; i < (size + kRadix - 1) / kRadix; ++i)
        g_counters[i] = g_counters[i - 1] + kRadix;

    for (unsigned i = 0; i < size; ++i)
    {
        const unsigned dst = g_counters[g_sort[i].index]++;
        g_output[dst] = g_sort[i].value;
        __builtin_prefetch(&g_output[dst + 1], 1);
    }
}

void reorder1(const unsigned size)
{
    for (unsigned i = 0; i < size; ++i)
        g_output[g_index[i]] = g_input[i];
}

#include <iostream>
#include <random>
#include <chrono>
#include <algorithm>
#include <numeric>
using namespace std;

int main()
{
    mt19937_64 rng(random_device{}());

    for (unsigned size = kMaxSize; size > kMinSize; size -= size / 4)
    //for (unsigned size = 64*1024*kRadix; size > kMinSize; size -= size / 2)
    {
        iota(begin(g_index), begin(g_index) + size, 0);
        shuffle(begin(g_index), begin(g_index) + size, rng);
        copy(begin(g_index), begin(g_index) + size, begin(g_input));

        const auto start = chrono::steady_clock::now();
        reorder3(size);
        const chrono::nanoseconds time = chrono::steady_clock::now() - start;

        const auto b = begin(g_output);
        const auto e = begin(g_output) + size;

        bool ok = adjacent_find(b, e, [&](auto l, auto r){return l + 1 != r;})
                  == e;

        cout << size << ' ' << time.count() / double(size)
        << ' ' << boolalpha << ok << '\n';
    }
}