fork download
  1. enum {
  2. kMaxSize = 80000000,
  3. kMinSize = 1000000,
  4. kRadix = 1500,
  5. kRadix3 = 325,
  6. kRadix3Sq = kRadix3 * kRadix3,
  7. kMaxCounters = (kMaxSize + kRadix - 1) / kRadix,
  8. };
  9.  
  10. struct IndexValue
  11. {
  12. unsigned index;
  13. unsigned value;
  14. };
  15.  
  16. unsigned g_input[kMaxSize];
  17. unsigned g_index[kMaxSize];
  18. struct IndexValue g_sort[kMaxSize];
  19. struct IndexValue g_sort2[kMaxSize];
  20. unsigned g_counters[kMaxCounters];
  21. unsigned g_output[kMaxSize];
  22.  
  23. void reorder3(const unsigned size)
  24. {
  25. const unsigned min_bucket1 = size / kRadix3;
  26. const unsigned large_buckets1 = size % kRadix3;
  27. g_counters[0] = 0;
  28.  
  29. for (unsigned i = 1; i <= large_buckets1; ++i)
  30. g_counters[i] = g_counters[i - 1] + min_bucket1 + 1;
  31.  
  32. for (unsigned i = large_buckets1 + 1; i < kRadix3; ++i)
  33. g_counters[i] = g_counters[i - 1] + min_bucket1;
  34.  
  35. for (unsigned i = 0; i < size; ++i)
  36. {
  37. const unsigned dst = g_counters[g_index[i] % kRadix3]++;
  38. g_sort[dst].index = g_index[i] / kRadix3;
  39. g_sort[dst].value = g_input[i];
  40. __builtin_prefetch(&g_sort[dst + 4].value, 1);
  41. }
  42.  
  43. const unsigned min_bucket2 = size / kRadix3Sq * kRadix3;
  44. const unsigned large_buckets2 = size / kRadix3 % kRadix3;
  45. g_counters[0] = 0;
  46.  
  47. for (unsigned i = 1; i <= large_buckets2; ++i)
  48. g_counters[i] = g_counters[i - 1] + min_bucket2 + kRadix3;
  49.  
  50. g_counters[large_buckets2 + 1] = g_counters[large_buckets2] +
  51. min_bucket2 + large_buckets1;
  52.  
  53. for (unsigned i = large_buckets2 + 2; i < kRadix3; ++i)
  54. g_counters[i] = g_counters[i - 1] + min_bucket2;
  55.  
  56. for (unsigned i = 0; i < size; ++i)
  57. {
  58. const unsigned dst = g_counters[g_sort[i].index % kRadix3]++;
  59. g_sort2[dst].index = g_sort[i].index / kRadix3;
  60. g_sort2[dst].value = g_sort[i].value;
  61. __builtin_prefetch(&g_sort2[dst + 4].value, 1);
  62. }
  63.  
  64. g_counters[0] = 0;
  65.  
  66. for (unsigned i = 1; i < (size + kRadix3Sq - 1) / kRadix3Sq; ++i)
  67. g_counters[i] = g_counters[i - 1] + kRadix3Sq;
  68.  
  69. for (unsigned i = 0; i < size; ++i)
  70. {
  71. const unsigned dst = g_counters[g_sort2[i].index]++;
  72. g_output[dst] = g_sort2[i].value;
  73. __builtin_prefetch(&g_output[dst + 4], 1);
  74. }
  75. }
  76.  
  77. void reorder2(const unsigned size)
  78. {
  79. const unsigned min_bucket = size / kRadix;
  80. const unsigned large_buckets = size % kRadix;
  81. g_counters[0] = 0;
  82.  
  83. for (unsigned i = 1; i <= large_buckets; ++i)
  84. g_counters[i] = g_counters[i - 1] + min_bucket + 1;
  85.  
  86. for (unsigned i = large_buckets + 1; i < kRadix; ++i)
  87. g_counters[i] = g_counters[i - 1] + min_bucket;
  88.  
  89. for (unsigned i = 0; i < size; ++i)
  90. {
  91. const unsigned dst = g_counters[g_index[i] % kRadix]++;
  92. g_sort[dst].index = g_index[i] / kRadix;
  93. g_sort[dst].value = g_input[i];
  94. __builtin_prefetch(&g_sort[dst + 1].value, 1);
  95. }
  96.  
  97. g_counters[0] = 0;
  98.  
  99. for (unsigned i = 1; i < (size + kRadix - 1) / kRadix; ++i)
  100. g_counters[i] = g_counters[i - 1] + kRadix;
  101.  
  102. for (unsigned i = 0; i < size; ++i)
  103. {
  104. const unsigned dst = g_counters[g_sort[i].index]++;
  105. g_output[dst] = g_sort[i].value;
  106. __builtin_prefetch(&g_output[dst + 1], 1);
  107. }
  108. }
  109.  
  110. void reorder1(const unsigned size)
  111. {
  112. for (unsigned i = 0; i < size; ++i)
  113. g_output[g_index[i]] = g_input[i];
  114. }
  115.  
  116. #include <iostream>
  117. #include <random>
  118. #include <chrono>
  119. #include <algorithm>
  120. #include <numeric>
  121. using namespace std;
  122.  
  123. int main()
  124. {
  125. mt19937_64 rng(random_device{}());
  126.  
  127. for (unsigned size = kMaxSize; size > kMinSize; size -= size / 4)
  128. //for (unsigned size = 64*1024*kRadix; size > kMinSize; size -= size / 2)
  129. {
  130. iota(begin(g_index), begin(g_index) + size, 0);
  131. shuffle(begin(g_index), begin(g_index) + size, rng);
  132. copy(begin(g_index), begin(g_index) + size, begin(g_input));
  133.  
  134. const auto start = chrono::steady_clock::now();
  135. reorder3(size);
  136. const chrono::nanoseconds time = chrono::steady_clock::now() - start;
  137.  
  138. const auto b = begin(g_output);
  139. const auto e = begin(g_output) + size;
  140.  
  141. bool ok = adjacent_find(b, e, [&](auto l, auto r){return l + 1 != r;})
  142. == e;
  143.  
  144. cout << size << ' ' << time.count() / double(size)
  145. << ' ' << boolalpha << ok << '\n';
  146. }
  147. }
Runtime error #stdin #stdout 0s 100KB
stdin
Standard input is empty
stdout
Standard output is empty