fork(5) download
#include <immintrin.h>
//#include <fastmath.h>

#include <chrono>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <iomanip>
#include <iostream>
#include <random>
#include <vector>
  9.  
  10. using f64 = double;
  11. using s64 = int64_t;
  12. using u64 = uint64_t;
  13.  
  14. static constexpr u64 cycles = 24;
  15. static constexpr u64 sample_max = 1000000;
  16.  
  17. f64 sse_sqrt(const f64 x) {
  18. __m128d root = _mm_sqrt_pd(_mm_load_pd(&x));
  19. return *(reinterpret_cast<f64*>(&root));
  20. }
  21.  
  22. f64 carmack_sqrt(const f64 x) {
  23. union {
  24. f64 x;
  25. s64 i;
  26. } u = {};
  27. u.x = x;
  28. u.i = 0x5FE6EB50C7B537AAl - (u.i >> 1);
  29. f64 xhalf = 0.5 * x;
  30. u.x = u.x * (1.5 - xhalf * u.x * u.x);
  31. return u.x * x;
  32. }
  33.  
  34. int main(int /* argc */, char ** /*argv*/) {
  35.  
  36.  
  37. std::random_device r;
  38. std::default_random_engine e1(r());
  39. std::uniform_int_distribution<u64> uniform_dist(1, sample_max);
  40.  
  41. std::vector<u64> input_samples(sample_max);
  42. for (auto& s: input_samples)
  43. s = uniform_dist(e1);
  44.  
  45. // std::sqrt
  46. {
  47. std::cout << "> Measuring std::sqrt.\r\n> Please wait . . .\r\n";
  48. f64 acc = 0;
  49. auto t1 = std::chrono::high_resolution_clock::now();
  50. for (u64 cycle = 0; cycle < cycles; ++cycle) {
  51. for (u64 sample : input_samples) {
  52. acc += std::sqrt(static_cast<f64>(sample));
  53. }
  54. }
  55. auto t2 = std::chrono::high_resolution_clock::now();
  56. std::chrono::nanoseconds total = t2 - t1;
  57. std::cout << "Accumulated result: " << std::setprecision(19) << acc << "\n";
  58. std::cout << "> Total execution time: " <<
  59. std::chrono::duration_cast<std::chrono::milliseconds>(total).count() << " ms.\r\n";
  60. }
  61.  
  62. // SEE optimized SQRT
  63. {
  64. std::cout << "> Measuring SSE optimized SQRT.\r\n> Please wait . . .\r\n";
  65. f64 acc = 0;
  66. auto t1 = std::chrono::high_resolution_clock::now();
  67. for (u64 cycle = 0; cycle < cycles; ++cycle) {
  68. for (u64 sample : input_samples) {
  69. acc += sse_sqrt(static_cast<f64>(sample));
  70. }
  71. }
  72. auto t2 = std::chrono::high_resolution_clock::now();
  73. std::chrono::nanoseconds total = t2 - t1;
  74. std::cout << "Accumulated result: " << std::setprecision(19) << acc << "\n";
  75. std::cout << "> Total execution time: " << std::chrono::duration_cast<std::chrono::milliseconds>(total).count() << " ms.\r\n";
  76. }
  77.  
  78. // Carmack optimized SQRT
  79. {
  80. std::cout << "> Measuring Carmack optimized SQRT.\r\n> Please wait . . .\r\n";
  81. f64 acc = 0;
  82. auto t1 = std::chrono::high_resolution_clock::now();
  83. for (u64 cycle = 0; cycle < cycles; ++cycle) {
  84. for (u64 sample : input_samples) {
  85. acc += carmack_sqrt(static_cast<f64>(sample));
  86. }
  87. }
  88. auto t2 = std::chrono::high_resolution_clock::now();
  89. std::chrono::nanoseconds total = t2 - t1;
  90. std::cout << "Accumulated result: " << std::setprecision(19) << acc << "\n";
  91. std::cout << "> Total execution time: " << std::chrono::duration_cast<std::chrono::milliseconds>(total).count() << " ms.\r\n";
  92. }
  93.  
  94. // Control: dummy run, not calling SQRT
  95. {
  96. std::cout << "> Dummy run without SQRT.\r\n> Please wait . . .\r\n";
  97. f64 acc = 0;
  98. auto t1 = std::chrono::high_resolution_clock::now();
  99. for (u64 cycle = 0; cycle < cycles; ++cycle) {
  100. for (u64 sample : input_samples) {
  101. acc += sample;
  102. }
  103. }
  104. auto t2 = std::chrono::high_resolution_clock::now();
  105. std::chrono::nanoseconds total = t2 - t1;
  106. std::cout << "Accumulated result: " << std::setprecision(19) << acc << "\n";
  107. std::cout << "> Total execution time: " << std::chrono::duration_cast<std::chrono::milliseconds>(total).count() << " ms.\r\n";
  108. }
  109.  
  110. // std::cout << "> Press any key to exit . . .\r\n";
  111. // std::getchar();
  112.  
  113. return 0;
  114. }
  115.  
Success #stdin #stdout 0.52s 10640KB
stdin
Standard input is empty
stdout
> Measuring std::sqrt.
> Please wait . . .
Accumulated result: 15992462920.66438866
> Total execution time: 132 ms.
> Measuring SSE optimized SQRT.
> Please wait . . .
Accumulated result: 15992462920.66438866
> Total execution time: 263 ms.
> Measuring Carmack optimized SQRT.
> Please wait . . .
Accumulated result: 15977405972.00270462
> Total execution time: 54 ms.
> Dummy run without SQRT.
> Please wait . . .
Accumulated result: 11990908370952
> Total execution time: 20 ms.