fork download
  1. #include <iostream>
  2.  
  3. #include <string>
  4. #include <cstdint>
  5. #include <stdexcept>
  6. namespace utfconv
  7. {
  8. using std::string;
  9. using std::u32string;
  10. //typedef uint32_t char32_t;
  11. //typedef std::basic_string<uint32_t> u32string;
  12.  
  13. size_t bitsize(char32_t a)
  14. {
  15. for (int shift = 31; shift > 0; shift--)
  16. {
  17. if ((a >> shift)){ return shift+1; }
  18. }
  19. return 0;
  20. }
  21.  
  22. int u8_seq_size(size_t bitsize)
  23. {
  24. if (bitsize <= 7)
  25. {
  26. return 1;
  27. }
  28. else
  29. {
  30. return ((bitsize - 2) / 5) + 1;
  31. }
  32. }
  33. int u8_seq_bit_size(size_t bitsize)
  34. {
  35. size_t s= u8_seq_size(bitsize);
  36. if (s == 1)
  37. {
  38. return 7;
  39. }
  40. else
  41. {
  42. return s * 5 + 1;
  43. }
  44. }
  45.  
  46.  
  47. char32_t bitrange(char32_t d, size_t s, size_t e)
  48. {
  49. uint32_t mask = ((1 << (s - e)) - 1);
  50. return (d >> e) & mask;
  51. }
  52. string to_utf8(const u32string& data)
  53. {
  54. string result;
  55. result.reserve(data.size()*5);
  56.  
  57. for (auto c : data)
  58. {
  59. size_t bsize = bitsize(c);
  60. size_t seqbsize = u8_seq_bit_size(bsize);
  61. int size = u8_seq_size(seqbsize);
  62. if (size == 1)
  63. {
  64. result.push_back(c);
  65. }
  66. else
  67. {
  68. size_t start = seqbsize;
  69. char head = ~((1 << (8-size))-1);
  70. size_t bitlen = 7 - size;
  71. char h = head + (0xFF & bitrange(c, start, start - bitlen));
  72. start = start - bitlen;
  73. result.push_back(h);
  74. size--;
  75. while (size)
  76. {
  77. size_t bitlen = 6;
  78. char bin = bitrange(c, start, start - bitlen);
  79. result.push_back(0x80 + bin);
  80. start = start - bitlen;
  81. size--;
  82. }
  83. }
  84. }
  85. return result;
  86. }
  87.  
  88. int u8_headder_get_data_count(unsigned char c)
  89. {
  90. if (c <= 0x7f){ return 1; }
  91. if ((c & 0xC0) == 0x80){ throw std::runtime_error("bad utf8 sequence"); }
  92. for (int shift = 6; shift > 0;shift--)
  93. {
  94. if (!((c >> shift)&1)){ return 8 - shift -1;}
  95. }
  96. throw std::runtime_error("bad utf8 sequence");
  97. }
  98. u32string to_ucs4(const string& data)
  99. {
  100. u32string result;
  101. result.reserve(data.size() * 5);
  102. int remain = 0;
  103. char32_t current = 0;
  104. for (auto c : data)
  105. {
  106. int bit = 0;
  107. if (remain == 0)
  108. {
  109. remain = u8_headder_get_data_count(c);
  110. bit = 7 - remain;
  111. }
  112. else
  113. {
  114. if ((c & 0xC0) == 0xC0){ throw std::runtime_error("bad utf8 sequence"); }
  115. bit = 6;
  116. }
  117.  
  118. current = (current << bit) + (c & ((1 << (bit + 1)) - 1));
  119. if (remain == 1)
  120. {
  121. result.push_back(current);
  122. current = 0;
  123. }
  124. remain--;
  125. }
  126. return result;
  127.  
  128. }
  129. }
  130.  
  131.  
  132. int main() {
  133. using namespace utfconv;
  134.  
  135. //test code
  136. const char* data= u8"abcdeあいうえお亜";
  137.  
  138.  
  139. std::cout << "source utf8 string:" << std::endl;
  140. std::cout << data << std::endl;
  141.  
  142.  
  143. u32string u32str = to_ucs4(data);
  144.  
  145. std::cout << "ucs4 sequence:" << std::endl;
  146. for (auto u32code : u32str)
  147. {
  148. std::cout <<std::hex << "0x" << u32code << ",";
  149. }
  150. std::cout << std::endl;
  151.  
  152. string u8str = to_utf8(u32str);
  153.  
  154. std::cout << "utf8 byte sequence:" << std::endl;
  155. for (auto u8byte : u8str)
  156. {
  157. std::cout <<std::hex << "0x" << uint32_t(uint8_t(u8byte)) << ",";
  158. }
  159. std::cout << std::endl;
  160.  
  161.  
  162. return 0;
  163. }
Success #stdin #stdout 0s 3276KB
stdin
Standard input is empty
stdout
source utf8 string:
abcdeあいうえお亜
ucs4 sequence:
0x61,0x62,0x63,0x64,0x65,0x3042,0x3044,0x3046,0x3048,0x304a,0x4e9c,
utf8 byte sequence:
0x61,0x62,0x63,0x64,0x65,0xe3,0x81,0x82,0xe3,0x81,0x84,0xe3,0x81,0x86,0xe3,0x81,0x88,0xe3,0x81,0x8a,0xe4,0xba,0x9c,