fork download
  1. // Wheels - various C++ utilities
  2. //
  3. // Written in 2012 by Martinho Fernandes <martinho.fernandes@gmail.com>
  4. //
  5. // To the extent possible under law, the author(s) have dedicated all copyright and related
  6. // and neighboring rights to this software to the public domain worldwide. This software is
  7. // distributed without any warranty.
  8. //
  9. // You should have received a copy of the CC0 Public Domain Dedication along with this software.
  10. // If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
  11.  
  12. // Unicode encoder/decoder
  13.  
  14. // # Codec concept
  15. // Assume C is a Codec type, codec is an instance of C, u is a codepoint, b is
  16. // a byte, [bb, be) is a range of input iterators on bytes, [cb, ce) is a range
  17. // of input iterators on codepoints, bo is an output iterator on bytes, and co
  18. // is an output iterator on codepoints.
  19. // C::is_fixed_width | constexpr bool | true iff is fixed width
  20. // C::max_width | constexpr size_t | maximum width
  21. // C::is_reusable | constexpr bool | true iff the codec can be reused
  22. // C::is_stateless | constexpr bool | true iff each codepoint can be encoded individually
  23. // C codec; | C | creates a new codec
  24. // C() | C | creates a new codec
  25. // codec.encode(cb, ce, bo) | output iterator | encodes codepoints
  26. // codec.decode(bb, be, co) | output iterator | decodes codepoints
  27. // codec.encode_one(c, bo) | void | encodes one codepoint (only available if is_stateless)
  28. // codec.decode_one(bb, be) | codepoint | decodes one codepoint (only available if is_stateless)
  29. //TODO: bytes vs code units?
  30.  
  31. #ifndef OGONEK_CODEC_HPP
  32. #define OGONEK_CODEC_HPP
  33.  
  34. #include "types.h++"
  35.  
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <algorithm>
#include <array>
#include <initializer_list>
#include <iterator>
#include <tuple>
#include <utility>
  41.  
  42. namespace ogonek {
  43. namespace codec {
  44. namespace detail {
  45. template <typename Derived>
  46. struct codec_base {
  47. template <typename InputIterator, typename OutputIterator>
  48. OutputIterator encode(InputIterator first, InputIterator last, OutputIterator out) {
  49. for(; first != last; ++first) {
  50. static_cast<Derived*>(this)->encode_one(*first, out);
  51. }
  52. return out;
  53. }
  54. template <typename InputIterator, typename OutputIterator>
  55. OutputIterator decode(InputIterator first, InputIterator last, OutputIterator out) {
  56. while(first != last) {
  57. *out++ = static_cast<Derived*>(this)->decode_one(first, last);
  58. }
  59. return out;
  60. }
  61. };
  62. } // namespace detail
  63.  
  64. enum class byte_order {
  65. big_endian,
  66. little_endian
  67. };
  68.  
  69. struct utf8 : detail::codec_base<utf8> {
  70. static constexpr bool is_fixed_width = false;
  71. static constexpr std::size_t max_width = 4;
  72. static constexpr bool is_reusable = true;
  73. static constexpr bool is_stateless = true;
  74. template <typename OutputIterator>
  75. void encode_one(codepoint c, OutputIterator& out) {
  76. assert(c < 0x200000); // TODO: invalids are UB?
  77. if(c < 0x80) {
  78. *out++ = c & 0x7F;
  79. } else if(c < 0x800) {
  80. *out++ = 0xC0 | ((c & 0x3C0) >> 6);
  81. *out++ = 0x80 | (c & 0x3F);
  82. } else if(c < 0x10000) {
  83. *out++ = 0xE0 | ((c & 0xF000) >> 12);
  84. *out++ = 0x80 | ((c & 0xFC0) >> 6);
  85. *out++ = 0x80 | (c & 0x3F);
  86. } else {
  87. *out++ = 0xF0 | ((c & 0x1C0000) >> 18);
  88. *out++ = 0x80 | ((c & 0x3F000) >> 12);
  89. *out++ = 0x80 | ((c & 0xFC0) >> 6);
  90. *out++ = 0x80 | (c & 0x3F);
  91. }
  92. }
  93. template <typename InputIterator>
  94. codepoint decode_one(InputIterator& first, InputIterator /*TODO test last*/) {
  95. codepoint u0 = *first++;
  96. if((u0 & 0x80) == 0) {
  97. return u0;
  98. }
  99. codepoint u1 = *first++;
  100. if((u0 & 0xE0) != 0xE0) {
  101. return ((u0 & 0x1F) << 6) |
  102. (u1 & 0x3F);
  103. }
  104. codepoint u2 = *first++;
  105. if((u0 & 0xF0) != 0xF0) {
  106. return ((u0 & 0x0F) << 12) |
  107. ((u1 & 0x3F) << 6) |
  108. (u2 & 0x3F);
  109. }
  110. codepoint u3 = *first++;
  111. return ((u0 & 0x07) << 18) |
  112. ((u1 & 0x3F) << 12) |
  113. ((u2 & 0x3F) << 6) |
  114. (u3 & 0x3F);
  115. }
  116. };
  117. namespace utf16_detail {
  118. template <byte_order ByteOrder>
  119. struct endian;
  120. template <>
  121. struct endian<byte_order::big_endian> {
  122. template <typename OutputIterator>
  123. static void output(OutputIterator& out, byte a, byte b) {
  124. *out++ = a;
  125. *out++ = b;
  126. }
  127. template <typename OutputIterator>
  128. static void output(OutputIterator& out, byte a, byte b, byte c, byte d) {
  129. *out++ = a;
  130. *out++ = b;
  131. *out++ = c;
  132. *out++ = d;
  133. }
  134. template <typename Array>
  135. static codepoint make_code_unit(Array t) {
  136. return (t[0] << 8) | t[1];
  137. }
  138. };
  139. template <>
  140. struct endian<byte_order::little_endian> {
  141. template <typename OutputIterator>
  142. static void output(OutputIterator& out, byte a, byte b) {
  143. *out++ = b;
  144. *out++ = a;
  145. }
  146. template <typename OutputIterator>
  147. static void output(OutputIterator& out, byte a, byte b, byte c, byte d) {
  148. *out++ = b;
  149. *out++ = a;
  150. *out++ = d;
  151. *out++ = c;
  152. }
  153. template <typename Array>
  154. static codepoint make_code_unit(Array t) {
  155. return (t[1] << 8) | t[0];
  156. }
  157. };
  158. } // namespace utf16_detail
  159.  
  160. template <byte_order ByteOrder>
  161. struct utf16 : detail::codec_base<utf16<ByteOrder>> {
  162. static constexpr bool is_fixed_width = false;
  163. static constexpr std::size_t max_width = 4;
  164. static constexpr bool is_reusable = true;
  165. static constexpr bool is_stateless = true;
  166. template <typename OutputIterator>
  167. void encode_one(codepoint c, OutputIterator& out) {
  168. assert(c < 0x200000); // TODO: invalids are UB?
  169. if(c < 0x10000) {
  170. utf16_detail::endian<ByteOrder>::output(out,
  171. (c & 0xFF00) >> 8, c & 0xFF);
  172. } else {
  173. auto normal = c - 0x10000;
  174. auto lead = 0xD800 + ((normal & 0xFFC00) >> 10);
  175. auto trail = 0xDC00 + (normal & 0x3FF);
  176. utf16_detail::endian<ByteOrder>::output(out,
  177. (lead & 0xFF00) >> 8, lead & 0xFF,
  178. (trail & 0xFF00) >> 8, trail & 0xFF);
  179. }
  180. }
  181. template <typename InputIterator>
  182. codepoint decode_one(InputIterator& first, InputIterator /*TODO: use last*/) {
  183. std::array<byte, 2> leads { { *first++, *first++ } };
  184. auto lead = utf16_detail::endian<ByteOrder>::make_code_unit(leads);
  185. if(lead < 0xD800 || lead > 0xDFFF) {
  186. return lead;
  187. } else {
  188. std::array<byte, 2> trails { { *first++, *first++ } };
  189. auto trail = utf16_detail::endian<ByteOrder>::make_code_unit(trails);
  190. auto top = lead - 0xD800;
  191. auto low = trail - 0xDC00;
  192. return 0x10000 + ((top << 10) | low);
  193. }
  194. }
  195. };
  196. using utf16be = utf16<byte_order::big_endian>;
  197. using utf16le = utf16<byte_order::little_endian>;
  198.  
  199. namespace utf32_detail {
  200. template <byte_order ByteOrder>
  201. struct endian;
  202. template <>
  203. struct endian<byte_order::big_endian> {
  204. template <typename OutputIterator>
  205. static void output(OutputIterator& out, byte a, byte b, byte c, byte d) {
  206. *out++ = a;
  207. *out++ = b;
  208. *out++ = c;
  209. *out++ = d;
  210. }
  211. template <typename Array>
  212. static codepoint make_code_unit(Array t) {
  213. codepoint result = 0;
  214. for(int i = 0; i < 4; ++i) {
  215. result <<= 8;
  216. result |= t[i];
  217. }
  218. return result;
  219. }
  220. };
  221. template <>
  222. struct endian<byte_order::little_endian> {
  223. template <typename OutputIterator>
  224. static void output(OutputIterator& out, byte a, byte b, byte c, byte d) {
  225. *out++ = d;
  226. *out++ = c;
  227. *out++ = b;
  228. *out++ = a;
  229. }
  230. template <typename Array>
  231. static codepoint make_code_unit(Array t) {
  232. codepoint result = 0;
  233. for(int i = 0; i < 4; ++i) {
  234. result <<= 8;
  235. result |= t[3-i];
  236. }
  237. return result;
  238. }
  239. };
  240. } // namespace utf32_detail
  241.  
  242. template <byte_order ByteOrder>
  243. struct utf32 : detail::codec_base<utf32<ByteOrder>> {
  244. static constexpr bool is_fixed_width = true;
  245. static constexpr std::size_t max_width = 4;
  246. static constexpr bool is_reusable = true;
  247. static constexpr bool is_stateless = true;
  248. template <typename OutputIterator>
  249. void encode_one(codepoint c, OutputIterator& out) {
  250. assert(c < 0x200000); // TODO: invalids are UB?
  251. return utf32_detail::endian<ByteOrder>::output(out,
  252. 0, (c & 0x1F0000) >> 16, (c & 0xFF00) >> 8, c & 0xFF);
  253. }
  254. template <typename InputIterator>
  255. codepoint decode_one(InputIterator& first, InputIterator /*TODO use last*/) {
  256. std::array<byte, 4> bytes { { *first++, *first++, *first++, *first++ } };
  257. return utf32_detail::endian<ByteOrder>::make_code_unit(bytes);
  258. }
  259. };
  260. using utf32be = utf32<byte_order::big_endian>;
  261. using utf32le = utf32<byte_order::little_endian>;
  262.  
  263. namespace detail {
  264. constexpr char base64[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
  265. unsigned rev_base64(char c) {
  266. return std::find(std::begin(base64), std::end(base64), c) - std::begin(base64);
  267. }
  268. } // namespace detail
  269. struct utf7 {
  270. static constexpr bool is_fixed_width = true;
  271. static constexpr std::size_t max_width = 4;
  272. static constexpr bool is_reusable = false;
  273. static constexpr bool is_stateless = true;
  274. private:
  275. bool in_unicode = false;
  276.  
  277. template <typename OutputIterator>
  278. void enter_ascii(OutputIterator& out) {
  279. if(in_unicode) {
  280. flush(out);
  281. *out++ = '-';
  282. in_unicode = false;
  283. }
  284. }
  285. template <typename OutputIterator>
  286. void enter_unicode(OutputIterator& out) {
  287. if(!in_unicode) {
  288. flush(out);
  289. *out++ = '+';
  290. in_unicode = true;
  291. }
  292. }
  293.  
  294. std::uint16_t state = 0;
  295. int state_bits = 0;
  296. template <typename OutputIterator>
  297. void encode_one(codepoint c, OutputIterator& out) {
  298. if(c == '+') {
  299. enter_ascii(out);
  300. *out++ = '+';
  301. *out++ = '-';
  302. } else if((c >= 0x20 && c <= 0x7E && c != '~' && c != '\\')
  303. || (c == '\t' || c == '\r' || c == '\n')) {
  304. // NOTE: all optional direct characters considered
  305. enter_ascii(out);
  306. *out++ = c;
  307. } else {
  308. enter_unicode(out);
  309. utf16be u16codec;
  310. auto src = { c };
  311. std::array<byte, 4> u16;
  312. auto end = u16codec.encode(src.begin(), src.end(), u16.begin());
  313. for(auto it = u16.begin(); it != end; ++it) {
  314. auto unit = *it;
  315. state_bits += 2;
  316. *out++ = detail::base64[state | (unit >> state_bits)];
  317. state = (unit & (0x3F >> (6-state_bits))) << (6-state_bits);
  318. if(state_bits == 6) {
  319. *out++ = detail::base64[state];
  320. state = 0;
  321. }
  322. state_bits %= 6;
  323. }
  324. }
  325. }
  326. template <typename OutputIterator>
  327. void flush(OutputIterator& out) {
  328. if(state_bits != 0) {
  329. *out++ = detail::base64[state];
  330. state = 0;
  331. state_bits = 0;
  332. }
  333. }
  334. template <typename InputIterator>
  335. codepoint get_unit(byte& c, InputIterator& first, InputIterator /*TODO use last*/) {
  336. state |= c << (10-state_bits);
  337. state_bits += 6;
  338. for(; state_bits < 10; state_bits += 6) {
  339. c = detail::rev_base64(*first++);
  340. state |= c << (10-state_bits);
  341. }
  342. c = detail::rev_base64(*first++);
  343. codepoint result = state | (c >> (state_bits-10));
  344. state = c << (16-(state_bits-10));
  345. state_bits -= 10;
  346. return result;
  347. }
  348. template <typename InputIterator>
  349. codepoint decode_one(InputIterator& first, InputIterator last) {
  350. auto c = *first++;
  351. if(in_unicode && c == '-') {
  352. if(first == last) return -1;
  353. c = *first++;
  354. in_unicode = false;
  355. } else if(!in_unicode && c == '+') {
  356. c = *first++;
  357. if(c == '-') return '+';
  358. in_unicode = true;
  359. state = 0;
  360. state_bits = 0;
  361. }
  362. if(!in_unicode) return c;
  363. c = detail::rev_base64(c);
  364. auto lead = get_unit(c, first, last);
  365. if(lead >= 0xD800 && lead < 0xDC00) {
  366. auto trail = get_unit(c, first, last);
  367. auto units = { byte((lead & 0xFF00) >> 8), byte(lead & 0xFF),
  368. byte((trail & 0xFF00) >> 8), byte(trail & 0xFF) };
  369. utf16be codec;
  370. auto it = units.begin();
  371. return codec.decode_one(it, units.end());
  372. } else {
  373. return lead;
  374. }
  375. }
  376. public:
  377. template <typename InputIterator, typename OutputIterator>
  378. OutputIterator encode(InputIterator first, InputIterator last, OutputIterator out) {
  379. for(; first != last; ++first) {
  380. encode_one(*first, out);
  381. }
  382. flush(out);
  383. if(in_unicode) *out++ = '-';
  384. return out;
  385. }
  386. template <typename InputIterator, typename OutputIterator>
  387. OutputIterator decode(InputIterator first, InputIterator last, OutputIterator out) {
  388. while(first != last) {
  389. auto c = decode_one(first, last);
  390. if(c == -1u) continue;
  391. *out++ = c;
  392. }
  393. return out;
  394. }
  395. };
  396. } // namespace codec
  397. } // namespace ogonek
  398.  
  399. #endif // OGONEK_CODEC_HPP
  400.  
Not running #stdin #stdout 0s 0KB
stdin
Standard input is empty
stdout
Standard output is empty