enum {
    kMaxSize = 110000000,
    kMinSize = 1000000,
    kRadix = 1500,
    kMaxCounters = (kMaxSize + kRadix - 1) / kRadix,
};

struct IndexValue
{
    unsigned index;
    unsigned value;
};

unsigned g_input[kMaxSize];
unsigned g_index[kMaxSize];
struct IndexValue g_sort[kMaxSize];
unsigned g_counters[kMaxCounters];
unsigned g_output[kMaxSize];

void reorder2(const unsigned size)
{
    const unsigned min_bucket = size / kRadix;
    const unsigned large_buckets = size % kRadix;
    g_counters[0] = 0;

    for (unsigned i = 1; i <= large_buckets; ++i)
        g_counters[i] = g_counters[i - 1] + min_bucket + 1;

    for (unsigned i = large_buckets + 1; i < kRadix; ++i)
        g_counters[i] = g_counters[i - 1] + min_bucket;

    for (unsigned i = 0; i < size; ++i)
    {
        const unsigned dst = g_counters[g_index[i] % kRadix]++;
        g_sort[dst].index = g_index[i] / kRadix;
        g_sort[dst].value = g_input[i];
        __builtin_prefetch(&g_sort[dst + 1].value, 1);
    }

    g_counters[0] = 0;

    for (unsigned i = 1; i < (size + kRadix - 1) / kRadix; ++i)
        g_counters[i] = g_counters[i - 1] + kRadix;

    for (unsigned i = 0; i < size; ++i)
    {
        const unsigned dst = g_counters[g_sort[i].index]++;
        g_output[dst] = g_sort[i].value;
        __builtin_prefetch(&g_output[dst + 1], 1);
    }
}

void reorder1(const unsigned size)
{
    for (unsigned i = 0; i < size; ++i)
        g_output[g_index[i]] = g_input[i];
}

#include <iostream>
#include <random>
#include <chrono>
#include <algorithm>
#include <numeric>
using namespace std;

int main()
{
    mt19937_64 rng(random_device{}());

    for (unsigned size = kMaxSize; size > kMinSize; size -= size / 4)
    //for (unsigned size = 64*1024*kRadix; size > kMinSize; size -= size / 2)
    {
        iota(begin(g_index), begin(g_index) + size, 0);
        shuffle(begin(g_index), begin(g_index) + size, rng);
        copy(begin(g_index), begin(g_index) + size, begin(g_input));

        const auto start = chrono::steady_clock::now();
        reorder2(size);
        const chrono::nanoseconds time = chrono::steady_clock::now() - start;

        const auto b = begin(g_output);
        const auto e = begin(g_output) + size;

        bool ok = adjacent_find(b, e, [&](auto l, auto r){return l + 1 != r;})
                  == e;

        cout << size << ' ' << time.count() / double(size)
        << ' ' << boolalpha << ok << '\n';
    }
}