fork(1) download
  1. #include <stdio.h>
  2. #include <stdint.h>
  3. #include <string.h>
  4.  
  5. #include <chrono>
  6.  
  7. #include <xmmintrin.h>
  8. #include <emmintrin.h>
  9. #include <immintrin.h>
  10.  
  11. struct audio_repack;
  12.  
  13. typedef int (*audio_repack_func_t)(struct audio_repack *,
  14. const uint8_t *, uint32_t);
  15.  
  16. struct audio_repack {
  17. uint8_t *packet_buffer;
  18. uint32_t packet_size;
  19.  
  20. uint32_t base_src_size;
  21. uint32_t base_dst_size;
  22.  
  23. audio_repack_func_t repack_func;
  24. };
  25.  
  26.  
  27. #ifdef AVX
  28. int repack_8ch_swap23_avx2(struct audio_repack *repack,
  29. const uint8_t *bsrc, uint32_t frame_count)
  30. {
  31. const uint32_t size = frame_count * repack->base_src_size;
  32.  
  33. const uint32_t half_frame_count = frame_count / 2;
  34. const __m256i *src = (__m256i *)bsrc;
  35. const __m256i *esrc = src + half_frame_count;
  36. __m256i *dst = (__m256i *)repack->packet_buffer;
  37. for (; src != esrc; ++src, ++dst) {
  38. __m256i target = _mm256_load_si256(src);
  39. __m256i buf = _mm256_shufflelo_epi16(target, 0xB4);
  40. _mm256_store_si256(dst, buf);
  41. }
  42. if (frame_count % 2 == 1) {
  43. memcpy(dst, src, 16);
  44. dst[2] = src[3];
  45. dst[3] = src[2];
  46. }
  47.  
  48. return 0;
  49. }
  50. #endif
  51.  
  52. int repack_8ch_swap23_sse2(struct audio_repack *repack,
  53. const uint8_t *bsrc, uint32_t frame_count)
  54. {
  55. const uint32_t size = frame_count * repack->base_src_size;
  56.  
  57. const __m128i *src = (__m128i *)bsrc;
  58. const __m128i *esrc = src + frame_count;
  59. __m128i *dst = (__m128i *)repack->packet_buffer;
  60. for (; src != esrc; ++src, ++dst) {
  61. __m128i target = _mm_load_si128(src);
  62. __m128i buf = _mm_shufflelo_epi16(target, 0xB4);
  63. _mm_store_si128(dst, buf);
  64. }
  65.  
  66. return 0;
  67. }
  68.  
  69. int repack_8ch_swap23_3(struct audio_repack *repack,
  70. const uint8_t *bsrc, uint32_t frame_count)
  71. {
  72. const uint32_t size = frame_count * repack->base_src_size;
  73.  
  74. const uint16_t *esrc = (uint16_t *)(bsrc + size);
  75. uint16_t *dst = (uint16_t *)repack->packet_buffer;
  76. for (const uint16_t *src = (uint16_t *)bsrc; src != esrc; src += 8, dst += 8) {
  77. memcpy(dst, src, 16);
  78. dst[2] = src[3];
  79. dst[3] = src[2];
  80. }
  81.  
  82. return 0;
  83. }
  84.  
  85. int repack_8ch_swap23_2(struct audio_repack *repack,
  86. const uint8_t *bsrc, uint32_t frame_count)
  87. {
  88. const uint32_t size = frame_count * repack->base_src_size;
  89.  
  90. const uint16_t *esrc = (uint16_t *)(bsrc + size);
  91. uint16_t *dst = (uint16_t *)repack->packet_buffer;
  92. for (const uint16_t *src = (uint16_t *)bsrc; src != esrc; src += 8, dst += 8) {
  93. memcpy(dst, src, 4);
  94. dst[2] = src[3];
  95. dst[3] = src[2];
  96. memcpy(dst + 4, src + 4, 8);
  97. }
  98.  
  99. return 0;
  100. }
  101.  
  102. int repack_8ch_swap23(struct audio_repack *repack,
  103. const uint8_t *bsrc, uint32_t frame_count)
  104. {
  105. const uint32_t size = frame_count * repack->base_src_size;
  106.  
  107. memcpy(repack->packet_buffer, bsrc, size);
  108.  
  109. const uint16_t *esrc = (uint16_t *)(bsrc + size);
  110. uint16_t *dst = (uint16_t *)repack->packet_buffer;
  111. for (const uint16_t *src = (uint16_t *)bsrc; src != esrc; src += 8, dst += 8) {
  112. dst[2] = src[3];
  113. dst[3] = src[2];
  114. }
  115.  
  116. return 0;
  117. }
  118.  
  119. int print(const uint8_t *buf, int idx, const char* s) {
  120. const uint16_t *data = (uint16_t *)buf;
  121. printf("%6s %4d: %02X %02X %02X %02X %02X %02X %02X %02X\n",
  122. s, idx,
  123. data[8 * idx + 0], data[8 * idx + 1], data[8 * idx + 2], data[8 * idx + 3],
  124. data[8 * idx + 4], data[8 * idx + 5], data[8 * idx + 6], data[8 * idx + 7]);
  125. }
  126.  
  127. int test(struct audio_repack *repack, const uint8_t *data, audio_repack_func_t func, int frame_count, const char* s) {
  128. auto start = std::chrono::system_clock::now();
  129. (*func)(repack, data, frame_count);
  130. auto end = std::chrono::system_clock::now();
  131.  
  132. print(repack->packet_buffer, 0, s);
  133. print(repack->packet_buffer, 2, s);
  134. print(repack->packet_buffer, 798, s);
  135. print(repack->packet_buffer, 800, s);
  136.  
  137. printf("%6s: %d ns\n", s, std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count());
  138. }
  139.  
  140. int main() {
  141. uint16_t base[8] = { 0, 1, 3, 2, 4, 5, 6, 7 };
  142.  
  143. const int frame = 801;
  144. #ifdef AVX
  145. uint16_t __attribute__((vector_size(32))) input[8 * frame] = { 0 };
  146. #else
  147. uint16_t input[8 * frame] = { 0 };
  148. #endif
  149. for (int i = 0; i < frame; ++i) {
  150. memcpy(input + 8 * i, base, 16);
  151. }
  152. print((uint8_t *)input, 0, "input");
  153. print((uint8_t *)input, 2, "input");
  154. print((uint8_t *)input, 798, "input");
  155. print((uint8_t *)input, 800, "input");
  156.  
  157. struct audio_repack repack = { 0 };
  158. repack.base_src_size = 8 * (16 / 8);
  159. repack.base_dst_size = 8 * (16 / 8);
  160. repack.packet_size = repack.base_dst_size;
  161.  
  162. #ifdef AVX
  163. uint16_t __attribute__((vector_size(32))) output[8 * frame] = { 0 };
  164. #else
  165. uint16_t output[8 * frame] = { 0 };
  166. #endif
  167. repack.packet_buffer = (uint8_t *)output;
  168.  
  169. test(&repack, (uint8_t *)input, repack_8ch_swap23, frame, "test1");
  170. test(&repack, (uint8_t *)input, repack_8ch_swap23_2, frame, "test2");
  171. test(&repack, (uint8_t *)input, repack_8ch_swap23_3, frame, "test3");
  172. test(&repack, (uint8_t *)input, repack_8ch_swap23_sse2, frame, "sse2");
  173. #ifdef AVX
  174. test(&repack, (uint8_t *)input, repack_8ch_swap23_avx2, frame, "avx2");
  175. #endif
  176.  
  177. return 0;
  178. }
Success #stdin #stdout 0s 16048KB
stdin
Standard input is empty
stdout
 input    0: 00 01 03 02 04 05 06 07
 input    2: 00 01 03 02 04 05 06 07
 input  798: 00 01 03 02 04 05 06 07
 input  800: 00 01 03 02 04 05 06 07
 test1    0: 00 01 02 03 04 05 06 07
 test1    2: 00 01 02 03 04 05 06 07
 test1  798: 00 01 02 03 04 05 06 07
 test1  800: 00 01 02 03 04 05 06 07
 test1: 4212 ns
 test2    0: 00 01 02 03 04 05 06 07
 test2    2: 00 01 02 03 04 05 06 07
 test2  798: 00 01 02 03 04 05 06 07
 test2  800: 00 01 02 03 04 05 06 07
 test2: 2277 ns
 test3    0: 00 01 02 03 04 05 06 07
 test3    2: 00 01 02 03 04 05 06 07
 test3  798: 00 01 02 03 04 05 06 07
 test3  800: 00 01 02 03 04 05 06 07
 test3: 2358 ns
  sse2    0: 00 01 02 03 04 05 06 07
  sse2    2: 00 01 02 03 04 05 06 07
  sse2  798: 00 01 02 03 04 05 06 07
  sse2  800: 00 01 02 03 04 05 06 07
  sse2: 1017 ns