#include <iostream>
#include <future>
#include <atomic>
using namespace std;
constexpr unsigned kBoardSize = 16;
constexpr unsigned kThreads = 4;
constexpr unsigned kFirstBitInd = 21;
constexpr uint64_t kFirstBit = 1ull << 21;
constexpr uint64_t kAllColumns = ((1ull << kBoardSize) - 1) << kFirstBitInd;
std::atomic<unsigned> gRow = ATOMIC_VAR_INIT(kBoardSize / 2);
uint64_t solve_1(unsigned start_column)
{
uint64_t stk[kBoardSize];
uint64_t free_columns = kAllColumns;
uint64_t free_diag_1 = ~0ull;
uint64_t free_diag_2 = ~0ull;
uint64_t cursor = kFirstBit << start_column;
uint64_t solutions = 0;
unsigned row = 0;
do { // while (row)
while (cursor)
{
const uint64_t last_bit = cursor & -cursor;
if (!(free_columns ^ last_bit)) // or if (row == kBoardSize - 1)
{
++solutions;
break;
}
// next row
stk[row] = cursor;
++row;
free_columns ^= last_bit;
free_diag_1 ^= last_bit;
free_diag_2 ^= last_bit;
free_diag_1 <<= 1;
free_diag_2 >>= 1;
cursor = free_columns & free_diag_1 & free_diag_2;
}
// previous row
--row;
cursor = stk[row];
const uint64_t last_bit = cursor & -cursor;
free_columns ^= last_bit;
free_diag_1 >>= 1;
free_diag_2 <<= 1;
free_diag_1 ^= last_bit;
free_diag_2 ^= last_bit;
cursor ^= last_bit;
} while (row);
return solutions;
}
uint64_t solve_task()
{
uint64_t sum = 0;
unsigned my_row = 0;
while ((my_row = atomic_fetch_add(&gRow, 1u)) < kBoardSize)
{
auto res = solve_1(kBoardSize - my_row - 1);
if ((kBoardSize & 1) == 0 || my_row != kBoardSize / 2)
res *= 2;
sum += res;
}
return sum;
}
int main()
{
std::future<uint64_t> fut[kThreads];
uint64_t sum = 0;
for (auto& slot: fut)
slot = async(std::launch::async, solve_task);
for (auto& slot: fut)
sum += slot.get();
cout << sum << '\n';
}