#pragma GCC optimize("Ofast,unroll-loops")
//#pragma GCC target("avx,avx2,fma")
#pragma GCC target("avx")
#include <bits/stdc++.h>
class Timer {
std::chrono::time_point<std::chrono::steady_clock> timePoint;
size_t value;
public:
void start() { timePoint = std::chrono::steady_clock::now(); }
void finish() {
auto curr = std::chrono::steady_clock::now();
auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(curr - timePoint);
value = elapsed.count();
}
size_t operator()() const { return value; }
};
namespace {
template<int n, typename T>
void mult(const T *__restrict a, const T *__restrict b, T *__restrict res) {
if (n <= 64) { // if length is small then naive multiplication if faster
for (int i = 0; i < n; i++) {
for (int j = 0; j < n; j++) {
res[i + j] += a[i] * b[j];
}
}
} else {
const int mid = n / 2;
alignas(64) T btmp[n], E[n] = {};
auto atmp = btmp + mid;
for (int i = 0; i < mid; i++) {
atmp[i] = a[i] + a[i + mid]; // atmp(x) - sum of two halfs a(x)
btmp[i] = b[i] + b[i + mid]; // btmp(x) - sum of two halfs b(x)
}
mult<mid>(atmp, btmp, E); // Calculate E(x) = (alow(x) + ahigh(x)) * (blow(x) + bhigh(x))
mult<mid>(a + 0, b + 0, res); // Calculate rlow(x) = alow(x) * blow(x)
mult<mid>(a + mid, b + mid, res + n); // Calculate rhigh(x) = ahigh(x) * bhigh(x)
for (int i = 0; i < mid; i++) { // Then, calculate rmid(x) = E(x) - rlow(x) - rhigh(x) and write in memory
const auto tmp = res[i + mid];
res[i + mid] += E[i] - res[i] - res[i + 2 * mid];
res[i + 2 * mid] += E[i + mid] - tmp - res[i + 3 * mid];
}
}
}
}
#define isz(x) (int)(x).size()
using namespace std;
using cd = complex<double>;
const double PI = acos(-1);
void fft(vector<cd> & a, bool invert) {
int n = a.size();
for (int i = 1, j = 0; i < n; i++) {
int bit = n >> 1;
for (; j & bit; bit >>= 1)
j ^= bit;
j ^= bit;
if (i < j)
swap(a[i], a[j]);
}
for (int len = 2; len <= n; len <<= 1) {
double ang = 2 * PI / len * (invert ? -1 : 1);
cd wlen(cos(ang), sin(ang));
for (int i = 0; i < n; i += len) {
cd w(1);
for (int j = 0; j < len / 2; j++) {
cd u = a[i+j], v = a[i+j+len/2] * w;
a[i+j] = u + v;
a[i+j+len/2] = u - v;
w *= wlen;
}
}
}
if (invert) {
for (cd & x : a)
x /= n;
}
}
vector<int> multiply(vector<int> const& a, vector<int> const& b) {
vector<cd> fa(a.begin(), a.end()), fb(b.begin(), b.end());
int n = 1;
while (n < a.size() + b.size())
n <<= 1;
fa.resize(n);
fb.resize(n);
fft(fa, false);
fft(fb, false);
for (int i = 0; i < n; i++)
fa[i] *= fb[i];
fft(fa, true);
vector<int> result(n);
for (int i = 0; i < n; i++)
result[i] = round(fa[i].real());
return result;
}
template<int NMAX>
void test() {
Timer timer;
timer.start();
static int a[NMAX], b[NMAX], c[2*NMAX];
for (int i = 0; i < NMAX; i++) a[i] = b[i] = 1;
mult<NMAX>(a,b,c);
timer.finish();
std::cout << "NMAX = " << std::setw(10) << NMAX << ", karatsuba: " << std::setw(5) << timer() << "ms,";
timer.start();
vector<int> va(NMAX,1), vb(NMAX,1);
auto vc = multiply(va, vb);
timer.finish();
std::cout << " FFT: " << std::setw(5) << timer() << "ms" << std::endl;
for (int i = 0; i < isz(vc); i++) {
if (vc[i] != c[i]) {
std::cout << "i = " << i << ", vc[i] = " << vc[i] << ", c[i] " << c[i] << std::endl;
std::exit(0);
}
assert(vc[i] == c[i]);
}
}
int main() {
test<(1 << 19)>();
test<(1 << 18)>();
test<(1 << 17)>();
test<(1 << 16)>();
test<(1 << 15)>();
test<(1 << 14)>();
test<(1 << 13)>();
test<(1 << 12)>();
test<(1 << 11)>();
test<(1 << 10)>();
return 0;
}