#include <iostream>
#include <random>
#include <chrono>
#include <vector>
#include <deque>
#include <array>
#include <algorithm>
#include <numeric>
using namespace std;

enum {
    kPageW = 10000,
    kSortingBits = 8,
    kTooLarge = 1u << kSortingBits,
};

struct WidthPos
{
    uint32_t width = 0;
    uint32_t pos = 0;

    WidthPos(uint32_t w = 0, uint32_t p = 0)
            : width(w)
            , pos(p)
    {}
};

bool operator < (WidthPos l, WidthPos r)
{ return l.width > r.width; }

using Vec = vector<uint32_t>;
using SVec = vector<WidthPos>;

uint64_t g_complexity = 0ull;

constexpr uint64_t ceilDiv(uint64_t dividend, uint64_t divisor)
{
    return (dividend + divisor - 1) / divisor;
}

uint32_t getMinRows(const Vec& input)
{
    const auto sum = accumulate(begin(input), end(input), 0ull);
    return max(1u, uint32_t(ceilDiv(sum, kPageW)));
}

uint32_t getMaxColumns(const Vec& input)
{
    const auto sum = accumulate(begin(input), end(input), 0ull);
    return uint32_t(input.size() / max(1ull, sum / kPageW));
}

uint32_t preproc(const Vec& input, SVec& sorted)
{
    sorted.clear();
    sorted.resize(input.size());

    auto sum = 0ull;
    array<uint32_t, 1 + kTooLarge> cnt_1 {};

    for (uint32_t i = 0; i != input.size(); ++i)
    {
        sum += input[i];

        if (input[i] >= kTooLarge)
            sorted[cnt_1[0]++] = {input[i], i};
        else
            ++cnt_1[kTooLarge - input[i]];
    }

    sort(begin(sorted), begin(sorted) + cnt_1[0]);
    partial_sum(begin(cnt_1), end(cnt_1), begin(cnt_1));

    for (uint32_t i = 0; i != input.size(); ++i)
    {
        if (input[i] < kTooLarge)
        {
            auto& cnt = cnt_1[kTooLarge - 1 - input[i]];
            sorted[cnt++] = {input[i], i};
            __builtin_prefetch(4 + &sorted[cnt], 1);
        }
    }

    return uint32_t(input.size() / max(1ull, sum / kPageW));
}

uint32_t arrangeAcrossG(const Vec& input) // iterate by width
{
    SVec sorted;
    vector<bool> seen_columns;

    for (auto columns = preproc(input, sorted); columns; --columns)
    {
        seen_columns.clear();
        seen_columns.resize(columns);
        uint32_t sum = 0;
        uint32_t use_cnt = columns;

        for (auto x: sorted)
        {
            ++g_complexity;
            const auto column = x.pos % columns;

            if (!seen_columns[column])
            {
                sum += x.width;
                --use_cnt;

                if (sum > kPageW)
                    break;
            }

            seen_columns[column] = true;

            if (!use_cnt)
                return columns;
        }
    }

    return  0;
}

uint32_t preprocX(const Vec& input, SVec& sorted)
{
    sorted.clear();
    sorted.resize(input.size());

    auto sum = 0ull;
    array<uint32_t, 1 + kTooLarge> cnt_1 {};

    for (uint32_t i = 0; i != input.size(); ++i)
    {
        sum += input[i];

        if (input[i] >= kTooLarge)
            sorted[cnt_1[0]++] = {input[i], i};
        else
            ++cnt_1[kTooLarge - input[i]];
    }

    sort(begin(sorted), begin(sorted) + cnt_1[0]);
    partial_sum(begin(cnt_1), end(cnt_1), begin(cnt_1));

    for (uint32_t i = 0; i != input.size(); ++i)
    {
        if (input[i] < kTooLarge)
        {
            auto& cnt = cnt_1[kTooLarge - 1 - input[i]];
            sorted[cnt++] = {input[i], i};
            __builtin_prefetch(4 + &sorted[cnt], 1);
        }
    }

    return max(1u, uint32_t(ceilDiv(sum, kPageW)));
}

uint32_t arrangeDownG(const Vec& input) // iterate by width
{
    SVec sorted;
    vector<bool> seen_columns;

    for (auto rows = preprocX(input, sorted); rows <= input.size(); ++rows)
    {
        auto columns =  uint32_t(ceilDiv(input.size(), rows));
        seen_columns.clear();
        seen_columns.resize(columns);
        uint32_t sum = 0;
        uint32_t use_cnt = columns;

        for (auto x: sorted)
        {
            ++g_complexity;
            const auto column = x.pos / rows;

            if (!seen_columns[column])
            {
                sum += x.width;
                --use_cnt;

                if (sum > kPageW)
                    break;
            }

            seen_columns[column] = true;

            if (!use_cnt)
                return columns;
        }
    }

    return  0;
}

uint32_t arrangeAcrossC(const Vec& input) // iterate by columns
{
    for (auto columns = getMaxColumns(input); columns; --columns)
    {
        uint32_t sum = 0;

        for (uint32_t c = 0; sum <= kPageW && c < columns; ++c)
        {
            uint32_t cw = 0;

            for (uint32_t i = c; i < input.size(); i += columns)
            {
                cw = max(cw, input[i]);
                ++g_complexity;
            }

            sum += cw;
        }

        if (sum <= kPageW)
            return columns;
    }

    return 0;
}

uint32_t arrangeAcrossR(const Vec& input) // iterate by rows
{
    auto columns = getMaxColumns(input);
    std::vector<uint32_t> widths(columns);

    for (; columns; --columns)
    {
        uint32_t sum = 0;
        size_t column = 0;

        for (size_t i = 0; sum <= kPageW && i < input.size(); ++i, ++column)
        {
            if (column == columns)
            {
                column = 0;
            }

            if (input[i] > widths[column])
            {
                sum += input[i] - widths[column];
                widths[column] = input[i];
            }

            ++g_complexity;
        }

        if (sum <= kPageW)
        {
            return columns;
        }

        fill(begin(widths), begin(widths) + columns, 0u);
    }

    return 0;
}

uint32_t arrangeDownR(const Vec& input) // iterate by rows
{
    auto rows = getMinRows(input);
    auto columns =  uint32_t(ceilDiv(input.size(), rows));
    std::vector<uint32_t> widths(columns);

    while (rows <= input.size())
    {
        uint32_t sum = 0;

        for (uint32_t row = 0; row < rows; ++row)
        {
            uint32_t column = 0;
            uint32_t i = row;

            for (; sum <= kPageW && i < input.size(); i += rows, ++column)
            {
                if (input[i] > widths[column])
                {
                    sum += input[i] - widths[column];
                    widths[column] = input[i];
                }

                ++g_complexity;
            }

            if (sum > kPageW)
            {
                break;
            }
        }

        if (sum <= kPageW)
        {
            return columns;
        }

        ++rows;
        columns =  uint32_t(ceilDiv(input.size(), rows));
        fill(begin(widths), begin(widths) + columns, 0u);
    }

    return 0;
}

uint32_t arrangeDownNaive(const Vec& input)
{
    const size_t size = input.size();

    for (size_t rows = 1; rows <= size; ++rows)
    {
        size_t index = 0;
        uint32_t text_w = 0;

        for (uint32_t c = 1; ; ++c)
        {
            uint32_t column_w = 0;

            for (uint32_t r = 0; r < rows && index < size; ++r, ++index)
                column_w = max(column_w, input[index]);

            text_w += column_w;

            if (text_w > kPageW)
                break;

            if (index >= size)
                return c;
        }
    }

    return 0;
}

uint32_t arrangeDownRMQ(Vec& input)
{
    auto rows = getMinRows(input);
    uint32_t ranges = 1;

    while (rows <= input.size())
    {
        if (rows >= ranges * 2)
        { // lazily update RMQ data
            transform(begin(input) + ranges, end(input), begin(input),
                      begin(input), [](auto l, auto r) {return max(l, r);}
            );

            ranges *= 2;
        }
        else
        { // use RMQ to get widths of columns and check if all columns fit
            uint32_t sum = 0;

            for (uint32_t i = 0; sum <= kPageW && i < input.size(); i += rows)
            {
                ++g_complexity;
                auto j = i + rows - ranges;

                if (j < input.size())
                    sum += max(input[i], input[j]);
                else
                    sum += input[i];
            }

            if (sum <= kPageW)
            {
                return uint32_t(ceilDiv(input.size(), rows));
            }

            ++rows;
        }
    }

    return 0;
}

struct DeckEntry
{
    uint32_t v;
    uint32_t p;

    DeckEntry(uint32_t value = 0, uint32_t pos = 0)
            : v(value)
            , p(pos)
    {}
};

using Deck = deque<DeckEntry>;

void push(Deck& deck, uint32_t value, uint32_t pos)
{
    while (!deck.empty() && deck.back().v <= value)
        deck.pop_back();

    deck.emplace_back(value, pos);
}

uint32_t maxAfterPos(Deck& deck, uint32_t pos)
{
    if (pos == deck.front().p)
        deck.pop_front();

    return deck.front().v;
}

uint32_t preprocRMQ(Vec& input, uint32_t rows)
{
    if (rows < 4)
        return 1;

    uint32_t back_pos = 0;
    size_t front_pos = 0;
    uint32_t ranges = 1;
    Deck deck;

    while (rows >>= 1)
      ranges <<= 1;

    for (; back_pos != ranges - 1; ++back_pos)
    {
        push(deck, input[back_pos], back_pos);
    }

    for (; back_pos != input.size(); ++front_pos, ++back_pos)
    {
        push(deck, input[back_pos], back_pos);
        input[front_pos] = maxAfterPos(deck, back_pos - ranges);
    }

    for (; front_pos != input.size(); ++front_pos, ++back_pos)
    {
        input[front_pos] = maxAfterPos(deck, back_pos - ranges);
    }

    return ranges;
}

uint32_t arrangeDownRMQ2(Vec& input)
{
    auto rows = getMinRows(input);
    uint32_t ranges = preprocRMQ(input, rows);

    while (rows <= input.size())
    {
        if (rows >= ranges * 2)
        { // lazily update RMQ data
            transform(begin(input) + ranges, end(input), begin(input),
                      begin(input), [](auto l, auto r) {return max(l, r);}
            );

            ranges *= 2;
            //g_complexity += input.size() - ranges;
        }
        else
        { // use RMQ to get widths of columns and check if all columns fit
            uint32_t sum = 0;

            for (uint32_t i = 0; sum <= kPageW && i < input.size(); i += rows)
            {
                ++g_complexity;
                auto j = i + rows - ranges;

                if (j < input.size())
                    sum += max(input[i], input[j]);
                else
                    sum += input[i];
            }

            if (sum <= kPageW)
            {
                return uint32_t(ceilDiv(input.size(), rows));
            }

            ++rows;
        }
    }

    return 0;
}

class TwoStk
{
    vector<uint32_t> in_;
    vector<uint32_t> out_;
    uint32_t in_max_;

    void inout()
    {
        in_max_ = 0;
        uint32_t out_max = 0;

        while (!in_.empty())
        {
            const auto x = in_.back();
            in_.pop_back();
            out_max = max(out_max, x);
            out_.push_back(out_max);
        }
    }

public:
    TwoStk(uint32_t size)
            : in_max_(0)
    {
        in_.reserve(size);
        out_.reserve(size);
    }

    void push(uint32_t x)
    {
        in_.push_back(x);
        in_max_ = max(in_max_, x);
    }

    uint32_t pop()
    {
        if (out_.empty())
        {
            inout();
        }

        const auto res = out_.back();
        out_.pop_back();
        return max(res, in_max_);
    }
};

uint32_t preprocRMQ3(Vec& input, uint32_t rows)
{
    if (rows < 4)
        return 1;

    uint32_t back_pos = 0;
    size_t front_pos = 0;
    uint32_t ranges = 1;

    while (rows >>= 1)
      ranges <<= 1;

    TwoStk ts(ranges);

    for (; back_pos != ranges - 1; ++back_pos)
    {
        ts.push(input[back_pos]);
    }

    for (; back_pos != input.size(); ++front_pos, ++back_pos)
    {
        ts.push(input[back_pos]);
        input[front_pos] = ts.pop();
    }

    for (; front_pos != input.size(); ++front_pos, ++back_pos)
    {
        input[front_pos] = ts.pop();
    }

    return ranges;
}

uint32_t arrangeDownRMQ3(Vec& input)
{
    auto rows = getMinRows(input);
    uint32_t ranges = preprocRMQ3(input, rows);

    while (rows <= input.size())
    {
        if (rows >= ranges * 2)
        { // lazily update RMQ data
            transform(begin(input) + ranges, end(input), begin(input),
                      begin(input), [](auto l, auto r) {return max(l, r);}
            );

            ranges *= 2;
            //g_complexity += input.size() - ranges;
        }
        else
        { // use RMQ to get widths of columns and check if all columns fit
            uint32_t sum = 0;

            for (uint32_t i = 0; sum <= kPageW && i < input.size(); i += rows)
            {
                ++g_complexity;
                auto j = i + rows - ranges;

                if (j < input.size())
                    sum += max(input[i], input[j]);
                else
                    sum += input[i];
            }

            if (sum <= kPageW)
            {
                return uint32_t(ceilDiv(input.size(), rows));
            }

            ++rows;
        }
    }

    return 0;
}

int main()
{
    mt19937_64 rng(random_device{}());
    geometric_distribution<uint32_t> distrib(0.02); // max value up to ~1000

    for (unsigned size = 3; size < 1000000u; size *= 2)
    {
        const unsigned repeat = max(1u, 100000u / size);
        chrono::duration<double> time {};
        g_complexity = 0ull;
        Vec input(size);
        uint32_t res = 0u;

        for (unsigned i = 0; i < repeat; ++i)
        {
            generate(begin(input), end(input), [&] {return 1 + distrib(rng);});
            auto start = chrono::steady_clock::now();

            res = arrangeDownG(input);

            time += chrono::steady_clock::now() - start;
        }

        cout << size << ' '
        << time.count() / repeat << ' '
        << g_complexity / repeat << ' '
        << res << '\n';
    }
}
