fork download
  1. #import<arpa/inet.h>
  2. using u=uint8_t;using U=uint32_t;U i,o,x,b,m,z=65536;U R(u*&p){if(m=i)x=i<3?*p++<<8|*p++:i<4?*p++|*p++<<8:(x=*(U*)p,p+=4,i<6?htonl(x):x);else{for(x=*p++;128>>m&x;)++m;if(m>1)for(x&=127>>m;--m;)x=(*p&192)-128?~0:*p++&63|x<<6;m?x=~0:0;}return x;}U r(u*&p){U y=R(p);x=i&&i<4&x>>10==54?R(p)>>10==55?y*1024+x-56613888:~0:x;b++?:x-65279?x==z-2&i==1?i=3,r(p):x+z+z|i-4||(i=6,r(p)):i&&i%3-1?x=~0:r(p);return x<z*17&x>>11!=27?x:~0;}void w(U x,u*&p){if(o)o<4?x/z?x-=z,w(55296|x>>10,p),w(56320|x&1023,p),0:o<3?*p++=x>>8,*p++=x:(*p++=x,*p++=x>>8):*(*(U*)p=o<6?htonl(x):x,p+=4);else if(x<128)*p++=x;else{for(m=0;~63<<m&x;m+=6);for(*p++=~127>>m/6|x>>m;m;*p++=128|x>>m&63)m-=6;}}U t(u*p,u*&q){for(b=0,x=1;x;)w(r(p),q);return x;}
  3.  
  4. #include <vector>
  5. #include <iostream>
  6.  
  7. std::ostream& operator<<(std::ostream& out, const std::vector<u>& v)
  8. {
  9. out << "{ ";
  10. for (int i: v) out << i << " ";
  11. out << "}";
  12. return out;
  13. }
  14.  
  15. int test_read(int encoding, std::vector<u> input, U expected)
  16. {
  17. b = 0;
  18. i = encoding;
  19. auto d = input.data();
  20. U actual = r(d);
  21. if (actual == expected) return 0;
  22. std::cerr << std::hex << "Decoding " << encoding << "; " << input << " gave " << actual
  23. << " instead of " << expected << std::endl;
  24. return 1;
  25. }
  26.  
  27. int test_write(int encoding, U input, std::vector<u> expected)
  28. {
  29. o = encoding;
  30. u buf[20], *p = buf;
  31. w(input, p);
  32. std::vector<u> actual(buf,p);
  33. if (expected == actual) return 0;
  34. std::cerr << std::hex << "Encoding " << encoding << "; " << input << " gave " << actual
  35. << " instead of " << expected << std::endl;
  36. return 1;
  37. }
  38.  
  39. int test_transcode(int ienc, std::vector<u> input, int oenc, std::vector<u> expected)
  40. {
  41. b = 0;
  42. i = ienc; o = oenc;
  43. u buf[200], *p = buf, *d = input.data();
  44. int result = t(d, p);
  45. std::vector<u> actual(buf,p);
  46. if (result ? expected.empty() : expected == actual) return 0;
  47. std::cerr << std::hex << "Encoding " << ienc << " to " << oenc << "; " << input << " gave " << actual
  48. << " instead of " << expected << std::endl;
  49. return 1;
  50. }
  51.  
  52. static const U FAIL = ~0;
  53. int main() {
  54. int e = 0; // error count
  55. // UTF-8
  56. e += test_read(0, { 128 }, FAIL); // unexpected continuation
  57. e += test_read(0, { 128, 1 }, FAIL);
  58. e += test_read(0, { 128, 128 }, FAIL);
  59. e += test_read(0, { 192, 192 }, FAIL); // start without continuation
  60. e += test_read(0, { 192, 0 }, FAIL);
  61. e += test_read(0, { 224, 0 }, FAIL);
  62. e += test_read(0, { 224, 192 }, FAIL);
  63. e += test_read(0, { 0xf4, 0x90, 128, 128 }, FAIL); // Unicode maximum+1
  64.  
  65. e += test_read(0, { 127 }, 127);
  66. e += test_read(0, { 192, 129 }, 1); // We accept overlong UTF-8
  67. e += test_read(0, { 0xc2, 128 }, 128);
  68. e += test_read(0, { 224, 128, 129 }, 1);
  69. e += test_read(0, { 0xef, 128, 128 }, 0xF000);
  70. e += test_read(0, { 0xef, 191, 191 }, 0xFFFF);
  71. e += test_read(0, { 0xf4, 128, 128, 128 }, 0x100000);
  72. e += test_read(0, { 0xf4, 0x8f, 191, 191 }, 0x10FFFF); // Unicode maximum
  73.  
  74. e += test_read(0, { 0xEF, 0xBB, 0xBF, 127 }, 127); // byte-order mark
  75.  
  76. e += test_write(0, 0, { 0 });
  77. e += test_write(0, 127, { 127 });
  78. e += test_write(0, 128, { 0xc2, 128 });
  79. e += test_write(0, 255, { 0xc3, 191 });
  80. e += test_write(0, 0xFFFF, { 0xef, 191, 191 });
  81. e += test_write(0, 0x10FFFF, { 0xf4, 0x8f, 191, 191 });
  82.  
  83. // UTF-16
  84. e += test_read(1, { 0, 1 }, 1);
  85. e += test_read(1, { 0xd8, 0, 0xdc, 1 }, 0x10001);
  86. e += test_read(1, { 0xdb, 0xff, 0xdf, 0xff }, 0x10ffff);
  87.  
  88. e += test_read(1, { 0xd8, 0, 0xd8, 1 }, FAIL); // mismatched surrogate
  89. e += test_read(1, { 0xd8, 0, 0, 1 }, FAIL); // mismatched surrogate
  90. e += test_read(1, { 0xdc, 0 }, FAIL);
  91.  
  92. e += test_write(1, 1, { 0, 1 });
  93. e += test_write(1, 256, { 1, 0 });
  94. e += test_write(1, 0xffff, { 255, 255 });
  95. e += test_write(1, 0x10001, { 0xd8, 0, 0xdc, 1 });
  96. e += test_write(1, 0x10ffff, { 0xdb, 0xff, 0xdf, 0xff });
  97.  
  98. // UTF-16LE
  99. e += test_write(3, 1, { 1, 0 });
  100. e += test_write(3, 256, { 0, 1 });
  101. e += test_write(3, 0x10001, { 0, 0xd8, 1, 0xdc });
  102. e += test_write(3, 0x10fffe, { 0xff, 0xdb, 0xfe, 0xdf });
  103.  
  104. // UTF-16 byte-order mark
  105. e += test_read(1, { 0xFE, 0xFF, 0x0, 1 }, 1); // byte-order mark
  106. e += test_read(1, { 0xFF, 0xFE, 1, 0x0 }, 1); // reversed byte-order mark
  107. // disallowed byte-order marks
  108. e += test_read(2, { 0xFE, 0xFF }, FAIL);
  109. e += test_read(3, { 0xFF, 0xFE }, FAIL);
  110. // reversed byte-order mark is an unassigned character - to be treated like regular character, according to question
  111. e += test_read(2, { 0xFF, 0xFE }, 0xfffe);
  112. e += test_read(3, { 0xFE, 0xFF }, 0xfffe);
  113.  
  114. // UTF-32
  115. e += test_read(4, { 0, 0, 0, 1 }, 1);
  116. e += test_read(4, { 1, 0, 0, 0 }, FAIL);
  117. e += test_write(4, 1, { 0, 0, 0, 1 });
  118. e += test_write(4, 0x10203, { 0, 1, 2, 3 });
  119.  
  120. // UTF-32LE
  121. e += test_read(6, { 0, 0, 0, 1 }, FAIL);
  122. e += test_read(6, { 1, 0, 0, 0 }, 1);
  123.  
  124. // UTF-32 byte-order mark
  125. e += test_read(4, { 0, 0, 0xFE, 0xFF, 0, 0, 0, 1 }, 1); // byte-order mark
  126. e += test_read(4, { 0xFF, 0xFE, 0, 0, 1, 0, 0, 0 }, 1); // reversed byte-order mark
  127. // disallowed byte-order marks
  128. e += test_read(5, { 0, 0, 0xFE, 0xFF }, FAIL);
  129. e += test_read(5, { 0xFF, 0xFE, 0, 0 }, FAIL);
  130. e += test_read(6, { 0, 0, 0xFE, 0xFF }, FAIL);
  131. e += test_read(6, { 0xFF, 0xFE, 0, 0 }, FAIL);
  132.  
  133. e += test_transcode(1, { 1, 2, 0xFE, 0xFF, 0, 0 }, // That's not a BOM; it's a zwnj when not the first char
  134. 1, { 1, 2, 0xFE, 0xFF, 0, 0 });
  135. e += test_transcode(1, { 0xFF, 0xFE, 1, 2, 0, 0 }, // reversed byte-order mark implies little-endian
  136. 1, { 2, 1, 0, 0 });
  137. e += test_transcode(4, { 0xFF, 0xFE, 0, 0, 1, 2, 0, 0, 0, 0 }, // reversed BOM means little-endian
  138. 4, { 0, 0, 2, 1, 0, 0, 0, 0 });
  139. e += test_transcode(1, { 0xdb, 0xff, 0xdf, 0xff, 0, 0 }, // U+10ffff UTF-16 to UTF-8
  140. 0, { 0xf4, 0x8f, 191, 191, 0 });
  141.  
  142. return e;
  143. }
Success #stdin #stdout 0.01s 5288KB
stdin
Standard input is empty
stdout
Standard output is empty