#include <vector>
#include <algorithm>
#include <iostream>
#include <numeric>
#include <cmath>
// https://g...content-available-to-author-only...b.com/Cyan4973/FiniteStateEntropy
#include "fse.h"
using namespace std;
// Optimized for the case of many duplicates
template<class ForwardIt>
ForwardIt rem_dupl(ForwardIt first, ForwardIt last)
{
if (first == last)
return last;
auto value = *first;
bool dupl = false;
ForwardIt result = first;
while (++first != last) {
if (value == *first) {
dupl = true;
}
else {
if (!dupl) {
*result++ = value;
}
value = std::move(*first);
dupl = false;
}
}
if (!dupl) {
*result++ = value;
}
return result;
}
double est_compr_bits(const vector<char>& vc) {
vector<char> compressed(2 * vc.size());
auto compr_res = FSE_compress(
compressed.data(), compressed.size(),
vc.data(), vc.size()
);
if (FSE_isError(compr_res)) {
std::cout << "FSE error: " << FSE_getErrorName(compr_res) << "\n";
return 0.;
}
return 8. * compr_res;
}
// https://g...content-available-to-author-only...b.com/badboy/6267743
uint64_t hash64shift(uint64_t key)
{
key = (~key) + (key << 21); // key = (key << 21) - key - 1;
key = key ^ (key >> 24);
key = (key + (key << 3)) + (key << 8); // key * 265
key = key ^ (key >> 14);
key = (key + (key << 2)) + (key << 4); // key * 21
key = key ^ (key >> 28);
key = key + (key << 31);
return key;
}
using Ty = uint32_t;
constexpr double h_factor = 2.1;
constexpr double passes = 10. - 2.;
constexpr size_t key_count = 2'000'000'000;
constexpr double key_bytes = 50.;
constexpr double index_bytes = 32.;
constexpr double avail_bytes = 1024 * 1024 * 1024;
constexpr double MB = 1024 * 1024;
int main() {
// Partial algorithm for hash compression
vector<Ty> v(key_count);
iota(begin(v), end(v), 0);
const auto h = 1 | static_cast<Ty>(key_count * h_factor);
transform(begin(v), end(v), begin(v), [](Ty x) {
return hash64shift(x) % h;
});
sort(begin(v), end(v));
const auto uniq_end = rem_dupl(begin(v), end(v));
v.erase(uniq_end, end(v));
adjacent_difference(begin(v), end(v), begin(v));
const auto max_diff = *max_element(begin(v), end(v));
vector<char> vc(v.size());
copy(begin(v), end(v), begin(vc));
const auto bits = est_compr_bits(vc);
// Calculations
const auto uniq_keys = uniq_end - begin(v);
const auto uniq_frac = static_cast<double>(uniq_keys) / key_count;
const auto dupl_frac = pow((1 - uniq_frac), passes);
const auto uncompr_bytes = dupl_frac * key_count * key_bytes;
const auto bits_per_key = bits / uniq_keys;
const auto compressed_bytes = bits_per_key * key_count / 8;
const auto bitmap_bytes = key_count / 8;
const auto total_bytes = compressed_bytes + uncompr_bytes + bitmap_bytes;
const auto index_entries = (avail_bytes - total_bytes) / index_bytes;
const auto keys_per_ind = key_count / index_entries;
// Printing results
//for (auto x: vc) { cout << (int)x << '\n'; }
cout << "Bits per key: " << bits_per_key << '\n';
cout << "Compressed MB: " << compressed_bytes / MB << '\n';
cout << "Uncompressed MB: " << uncompr_bytes / MB << '\n';
cout << "Bitmap MB: " << bitmap_bytes / MB << '\n';
cout << "MB used: " << total_bytes / MB << '\n';
cout << "Index entries: " << index_entries << '\n';
cout << "Indexed fragment size: " << keys_per_ind << '\n';
cout << "Unique hash fraction: " << uniq_frac << '\n';
cout << "Maxumum hash difference: " << max_diff << '\n';
}