#include <immintrin.h>

#include <chrono>
#include <cmath>
#include <cstdint>
#include <cstring>
//#include <fastmath.h>
#include <iomanip>
#include <iostream>
#include <random>
#include <vector>
// Short aliases for the numeric types used throughout the benchmark.
using f64 = double;
using s64 = std::int64_t;   // signed 64-bit integer (from <cstdint>)
using u64 = std::uint64_t;  // unsigned 64-bit integer (from <cstdint>)

// Number of full passes each benchmark makes over the sample vector.
static constexpr u64 cycles = 24;
// Number of generated samples; also the inclusive upper bound of their values.
static constexpr u64 sample_max = 1000000;
/// Computes the square root of `x` with the scalar SSE2 square-root intrinsic.
///
/// @param x Value to take the square root of.
/// @return  sqrt(x) as produced by `_mm_sqrt_sd`.
f64 sse_sqrt(const f64 x) {
    // Place x in the low lane of a vector register. A packed load
    // (_mm_load_pd) must not be used here: it reads 16 bytes from a
    // 16-byte-aligned address, but x is a single 8-byte value.
    const __m128d lane = _mm_set_sd(x);
    // Square-root the low lane only and extract it without pointer casts.
    return _mm_cvtsd_f64(_mm_sqrt_sd(lane, lane));
}
/// Approximates sqrt(x) using the "fast inverse square root" bit trick.
///
/// The bit pattern of `x` is turned into a first guess for 1/sqrt(x) by an
/// integer subtraction from a magic constant, refined by a single iteration
/// of the form y * (1.5 - 0.5 * x * y * y), and finally multiplied by `x`.
///
/// @param x Value to take the square root of. NOTE(review): the bit trick is
///          presumably only meaningful for non-negative finite x; confirm
///          before using with other inputs.
/// @return  An approximation of sqrt(x); not exact (only one refinement step).
f64 carmack_sqrt(const f64 x) {
    static_assert(sizeof(f64) == sizeof(s64),
                  "bit reinterpretation requires equally sized types");

    // Reinterpret the bits of x as an integer. memcpy is the well-defined way
    // to do this; reading an inactive union member is undefined in C++.
    s64 bits;
    std::memcpy(&bits, &x, sizeof bits);

    // Magic-constant initial estimate of 1/sqrt(x).
    bits = 0x5FE6EB50C7B537AALL - (bits >> 1);

    f64 y;
    std::memcpy(&y, &bits, sizeof y);

    // One refinement step of the inverse-square-root estimate.
    const f64 xhalf = 0.5 * x;
    y = y * (1.5 - xhalf * y * y);

    // x * (1/sqrt(x)) == sqrt(x)
    return y * x;
}
int main(int /* argc */, char ** /*argv*/) {
std::random_device r;
std::default_random_engine e1(r());
std::uniform_int_distribution<u64> uniform_dist(1, sample_max);
std::vector<u64> input_samples(sample_max);
for (auto& s: input_samples)
s = uniform_dist(e1);
// std::sqrt
{
std::cout << "> Measuring std::sqrt.\r\n> Please wait . . .\r\n";
f64 acc = 0;
auto t1 = std::chrono::high_resolution_clock::now();
for (u64 cycle = 0; cycle < cycles; ++cycle) {
for (u64 sample : input_samples) {
acc += std::sqrt(static_cast<f64>(sample));
}
}
auto t2 = std::chrono::high_resolution_clock::now();
std::chrono::nanoseconds total = t2 - t1;
std::cout << "Accumulated result: " << std::setprecision(19) << acc << "\n";
std::cout << "> Total execution time: " <<
std::chrono::duration_cast<std::chrono::milliseconds>(total).count() << " ms.\r\n";
}
// SEE optimized SQRT
{
std::cout << "> Measuring SSE optimized SQRT.\r\n> Please wait . . .\r\n";
f64 acc = 0;
auto t1 = std::chrono::high_resolution_clock::now();
for (u64 cycle = 0; cycle < cycles; ++cycle) {
for (u64 sample : input_samples) {
acc += sse_sqrt(static_cast<f64>(sample));
}
}
auto t2 = std::chrono::high_resolution_clock::now();
std::chrono::nanoseconds total = t2 - t1;
std::cout << "Accumulated result: " << std::setprecision(19) << acc << "\n";
std::cout << "> Total execution time: " << std::chrono::duration_cast<std::chrono::milliseconds>(total).count() << " ms.\r\n";
}
// Carmack optimized SQRT
{
std::cout << "> Measuring Carmack optimized SQRT.\r\n> Please wait . . .\r\n";
f64 acc = 0;
auto t1 = std::chrono::high_resolution_clock::now();
for (u64 cycle = 0; cycle < cycles; ++cycle) {
for (u64 sample : input_samples) {
acc += carmack_sqrt(static_cast<f64>(sample));
}
}
auto t2 = std::chrono::high_resolution_clock::now();
std::chrono::nanoseconds total = t2 - t1;
std::cout << "Accumulated result: " << std::setprecision(19) << acc << "\n";
std::cout << "> Total execution time: " << std::chrono::duration_cast<std::chrono::milliseconds>(total).count() << " ms.\r\n";
}
// Control: dummy run, not calling SQRT
{
std::cout << "> Dummy run without SQRT.\r\n> Please wait . . .\r\n";
f64 acc = 0;
auto t1 = std::chrono::high_resolution_clock::now();
for (u64 cycle = 0; cycle < cycles; ++cycle) {
for (u64 sample : input_samples) {
acc += sample;
}
}
auto t2 = std::chrono::high_resolution_clock::now();
std::chrono::nanoseconds total = t2 - t1;
std::cout << "Accumulated result: " << std::setprecision(19) << acc << "\n";
std::cout << "> Total execution time: " << std::chrono::duration_cast<std::chrono::milliseconds>(total).count() << " ms.\r\n";
}
// std::cout << "> Press any key to exit . . .\r\n";
// std::getchar();
return 0;
}