fork download
  1. import java.io.ByteArrayOutputStream;
  2. import java.nio.ByteBuffer;
  3. import java.nio.charset.Charset;
  4. import java.util.Scanner;
  5.  
  6. /*
  7. プログラミングのお題スレ Part12
  8. ttps://mevius.5ch.net/test/read.cgi/tech/1538096947/576
  9.  
  10. 576 名前:デフォルトの名無しさん[sage] 投稿日:2018/11/18(日) 22:42:22.16 ID:qz83zcM7
  11. お題
  12. UTF-8のバイト列が与えられるので、Unicodeのコードポイントを求めよ
  13.  
  14. 6F 64 61 69
  15. => U+006F U+0064 U+0061 U+0069
  16.  
  17. E3 81 8A E9 A1 8C
  18. => U+304A U+984C
  19.  
  20. C2 A9 F0 9F 8D 94 E9 A6 99 41
  21. => U+00A9 U+1F354 U+9999 U+0041
  22. */
  23. class Ideone
  24. {
  25. static final int REPLACEMENT_CHARACTER = 0xFFFD;
  26. public static void main(String[] args)
  27. {
  28. System.out.println("言語任せ");
  29. printUnicodeCodePoint(toByteArray("6F 64 61 69"));
  30. printUnicodeCodePoint(toByteArray("E3 81 8A E9 A1 8C"));
  31. printUnicodeCodePoint(toByteArray("C2 A9 F0 9F 8D 94 E9 A6 99 41"));
  32.  
  33. System.out.println("自力デコード");
  34. printUnicodeCodePoint2(toByteArray("6F 64 61 69"));
  35. printUnicodeCodePoint2(toByteArray("E3 81 8A E9 A1 8C"));
  36. printUnicodeCodePoint2(toByteArray("C2 A9 F0 9F 8D 94 E9 A6 99 41"));
  37. }
  38.  
  39. static byte[] toByteArray(String s)
  40. {
  41. Scanner in = new Scanner(s);
  42. while (in.hasNextInt(16))
  43. {
  44. baos.write(in.nextInt(16));
  45. }
  46. in.close();
  47. return baos.toByteArray();
  48. }
  49.  
  50. // 言語任せ
  51. static void printUnicodeCodePoint(byte[] bs)
  52. {
  53. for (byte b : bs)
  54. {
  55. System.out.printf("%02X ", b & 0xFF);
  56. }
  57. System.out.println();
  58.  
  59. System.out.print("=> ");
  60. String str = new String(bs, Charset.forName("UTF-8"));
  61. str.codePoints().forEach(i -> System.out.printf("U+%04X ", i));
  62.  
  63. System.out.println();
  64. System.out.println();
  65. }
  66.  
  67. // 自力
  68. static void printUnicodeCodePoint2(byte[] bs)
  69. {
  70. for (byte b : bs)
  71. {
  72. System.out.printf("%02X ", b & 0xFF);
  73. }
  74. System.out.println();
  75.  
  76. System.out.print("=> ");
  77.  
  78. ByteBuffer buf = ByteBuffer.wrap(bs);
  79. while (buf.hasRemaining())
  80. {
  81. System.out.printf("U+%04X ", utf8decode(buf));
  82. }
  83.  
  84. System.out.println();
  85. System.out.println();
  86. }
  87.  
  88. // ByteBufferから1文字デコードする
  89. static int utf8decode(ByteBuffer buf)
  90. {
  91. byte b1 = buf.get();
  92. if (b1 >= 0) return b1 & 0b01111111;
  93. if (b1 <= -65) return REPLACEMENT_CHARACTER;
  94.  
  95. byte b2 = buf.get();
  96. if (!test(b2))
  97. {
  98. buf.position(buf.position() - 1);
  99. return REPLACEMENT_CHARACTER;
  100. }
  101.  
  102. if (b1 <= -33) return (b1 & 0b00011111) << 6 | (b2 & 0b00111111);
  103.  
  104. byte b3 = buf.get();
  105. if (!test(b3))
  106. {
  107. buf.position(buf.position() - 1);
  108. return REPLACEMENT_CHARACTER;
  109. }
  110.  
  111. if (b1 <= -17) return (b1 & 0b00001111) << 12 | (b2 & 0b00111111) << 6 | (b3 & 0b00111111);
  112.  
  113. byte b4 = buf.get();
  114. if (!test(b4))
  115. {
  116. buf.position(buf.position() - 1);
  117. return REPLACEMENT_CHARACTER;
  118. }
  119.  
  120. if (b1 <= -9) return (b1 & 0b00000111) << 18 | (b2 & 0b00111111) << 12 | (b3 & 0b00111111) << 6 | (b4 & 0b00111111);
  121.  
  122. byte b5 = buf.get();
  123. if (!test(b5))
  124. {
  125. buf.position(buf.position() - 1);
  126. return REPLACEMENT_CHARACTER;
  127. }
  128. if (b1 <= -5) return REPLACEMENT_CHARACTER;
  129.  
  130. byte b6 = buf.get();
  131. if (!test(b6))
  132. {
  133. buf.position(buf.position() - 1);
  134. return REPLACEMENT_CHARACTER;
  135. }
  136.  
  137. return REPLACEMENT_CHARACTER;
  138. }
  139.  
  140. static boolean test(byte b)
  141. {
  142. return (b & 0b11000000) == 0b10000000;
  143. }
  144. }
  145.  
Success #stdin #stdout 0.18s 2184192KB
stdin
Standard input is empty
stdout
言語任せ
6F 64 61 69 
=> U+006F U+0064 U+0061 U+0069 

E3 81 8A E9 A1 8C 
=> U+304A U+984C 

C2 A9 F0 9F 8D 94 E9 A6 99 41 
=> U+00A9 U+1F354 U+9999 U+0041 

自力デコード
6F 64 61 69 
=> U+006F U+0064 U+0061 U+0069 

E3 81 8A E9 A1 8C 
=> U+304A U+984C 

C2 A9 F0 9F 8D 94 E9 A6 99 41 
=> U+00A9 U+1F354 U+9999 U+0041