fork(4) download
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
  3.  
  4. /**
  5.  * Halfwidth and Fullwidth Character Normalization for CJK
  6.  * http://s...content-available-to-author-only...s.asia
  7.  *
  8.  * See the Unicode Standard 6.0 – Halfwidth and Fullwidth Forms
  9.  * http://u...content-available-to-author-only...e.org/charts/PDF/UFF00.pdf
  10.  *
  11.  * For Chinese, Japanese and Korean, some characters have Unicode mappings to
  12.  * both a halfwidth and a fullwidth version. This code normalizes them
  13.  * to halfwidth for latin characters, numbers and punctuation and fullwidth
  14.  * for everything else.
  15.  * Fine for half/full width normalization but not fully equivalent to NFKC
  16.  * normalization
  17.  */
  18. public class Main{
  19.  
  20. private static final Map<Character, Character> charCodeMap;
  21. // Key — Original Character
  22. // Value — Replacement character
  23. static {
  24. charCodeMap = new HashMap<Character, Character>();
  25. // TO HALFWIDTH CHARACTERS
  26. // ASCII variants (Latin Symbols, Punctuation, Numbers, and Alphabet)
  27. for (char key = '\uff01'; key <= '\uff5e'; key++) {
  28. char value = (char) (key - '\ufee0');
  29. charCodeMap.put(key, value);
  30. }
  31. // Brackets
  32. charCodeMap.put('\uff5f', '\u2985'); // left white parenthesis
  33. charCodeMap.put('\uff60', '\u2986'); // right white parenthesis
  34. // Symbol Variants
  35. charCodeMap.put('\uffe0', '\u00a2'); // Cent sign
  36. charCodeMap.put('\uffe1', '\u00a3'); // Pound sign
  37. charCodeMap.put('\uffe2', '\u00ac'); // Not sign
  38. charCodeMap.put('\uffe3', '\u00af'); // Macron
  39. charCodeMap.put('\uffe4', '\u00a6'); // Broken Bar
  40. charCodeMap.put('\uffe5', '\u00a5'); // Yen sign
  41. charCodeMap.put('\uffe6', '\u20a9'); // Won sign
  42. // Space (strictly speaking not listed in Unicode 6.0 Halfwidth and
  43. // Fullwidth forms but including here as the ideographic space can
  44. // cause issues)
  45. charCodeMap.put('\u3000', '\u0020'); // SPACE
  46. // TO FULLWIDTH CHARACTERS
  47. // CJK punctuation
  48. charCodeMap.put('\uff61', '\u3002'); // ideographic full stop
  49. charCodeMap.put('\uff62', '\u300c'); // left corner bracket
  50. charCodeMap.put('\uff63', '\u300d'); // right corner bracket
  51. charCodeMap.put('\uff64', '\u3001'); // ideographic comma
  52. // Katakana variants
  53. charCodeMap.put('\uff65', '\u30fb'); // Middle Dot
  54. charCodeMap.put('\uff66', '\u30f2'); // Wo
  55. charCodeMap.put('\uff67', '\u30a1'); // A small
  56. charCodeMap.put('\uff68', '\u30a3'); // I small
  57. charCodeMap.put('\uff69', '\u30a5'); // U small
  58. charCodeMap.put('\uff6a', '\u30a7'); // E small
  59. charCodeMap.put('\uff6b', '\u30a9'); // O small
  60. charCodeMap.put('\uff6c', '\u30e3'); // Ya small
  61. charCodeMap.put('\uff6d', '\u30e5'); // Yu small
  62. charCodeMap.put('\uff6e', '\u30e7'); // Yo small
  63. charCodeMap.put('\uff6f', '\u30c3'); // Tsu small
  64. charCodeMap.put('\uff70', '\u30fc'); // Prolonged Sound Mark
  65. charCodeMap.put('\uff71', '\u30a2'); // A
  66. charCodeMap.put('\uff72', '\u30a4'); // I
  67. charCodeMap.put('\uff73', '\u30a6'); // U
  68. charCodeMap.put('\uff74', '\u30a8'); // E
  69. charCodeMap.put('\uff75', '\u30aa'); // O
  70. charCodeMap.put('\uff76', '\u30ab'); // Ka
  71. charCodeMap.put('\uff77', '\u30ad'); // Ki
  72. charCodeMap.put('\uff78', '\u30af'); // Ku
  73. charCodeMap.put('\uff79', '\u30b1'); // Ke
  74. charCodeMap.put('\uff7a', '\u30b3'); // Ko
  75. charCodeMap.put('\uff7b', '\u30b5'); // Sa
  76. charCodeMap.put('\uff7c', '\u30b7'); // Shi
  77. charCodeMap.put('\uff7d', '\u30b9'); // Su
  78. charCodeMap.put('\uff7e', '\u30bb'); // Se
  79. charCodeMap.put('\uff7f', '\u30bd'); // So
  80. charCodeMap.put('\uff80', '\u30bf'); // Ta
  81. charCodeMap.put('\uff81', '\u30c1'); // Chi
  82. charCodeMap.put('\uff82', '\u30c4'); // Tsu
  83. charCodeMap.put('\uff83', '\u30c6'); // Te
  84. charCodeMap.put('\uff84', '\u30c8'); // To
  85. charCodeMap.put('\uff85', '\u30ca'); // Na
  86. charCodeMap.put('\uff86', '\u30cb'); // Ni
  87. charCodeMap.put('\uff87', '\u30cc'); // Nu
  88. charCodeMap.put('\uff88', '\u30cd'); // Ne
  89. charCodeMap.put('\uff89', '\u30ce'); // No
  90. charCodeMap.put('\uff8a', '\u30cf'); // Ha
  91. charCodeMap.put('\uff8b', '\u30d2'); // Hi
  92. charCodeMap.put('\uff8c', '\u30d5'); // Hu
  93. charCodeMap.put('\uff8d', '\u30d8'); // He
  94. charCodeMap.put('\uff8e', '\u30db'); // Ho
  95. charCodeMap.put('\uff8f', '\u30de'); // Ma
  96. charCodeMap.put('\uff90', '\u30df'); // Mi
  97. charCodeMap.put('\uff91', '\u30e0'); // Mu
  98. charCodeMap.put('\uff92', '\u30e1'); // Me
  99. charCodeMap.put('\uff93', '\u30e2'); // Mo
  100. charCodeMap.put('\uff94', '\u30e4'); // Ya
  101. charCodeMap.put('\uff95', '\u30e6'); // Yu
  102. charCodeMap.put('\uff96', '\u30e8'); // Yo
  103. charCodeMap.put('\uff97', '\u30e9'); // Ra
  104. charCodeMap.put('\uff98', '\u30ea'); // Ri
  105. charCodeMap.put('\uff99', '\u30eb'); // Ru
  106. charCodeMap.put('\uff9a', '\u30ec'); // Re
  107. charCodeMap.put('\uff9b', '\u30ed'); // Ro
  108. charCodeMap.put('\uff9c', '\u30ef'); // Wa
  109. charCodeMap.put('\uff9d', '\u30f3'); // N
  110. charCodeMap.put('\uff9e', '\u3099'); // Voiced Sound Mark
  111. charCodeMap.put('\uff9f', '\u309a'); // Semi-Voiced Sound Mark
  112. // Hangul variants
  113. charCodeMap.put('\uffa0', '\u3164'); // Hangul Filler
  114. // Hangul First Range
  115. // KIYEOK to HIEUH
  116. for (char key = '\uffa1'; key <= '\uffbe'; key++) {
  117. char value = (char) (key - '\uce70');
  118. charCodeMap.put(key, value);
  119. }
  120. // Hangul Second Range
  121. // A to E
  122. for (char key = '\uffc2'; key <= '\uffc7'; key++) {
  123. char value = (char) (key - '\uce73');
  124. charCodeMap.put(key, value);
  125. }
  126. // Hangul Third Range
  127. // YEO to OE
  128. for (char key = '\uffca'; key <= '\uffcf'; key++) {
  129. char value = (char) (key - '\uce75');
  130. charCodeMap.put(key, value);
  131. }
  132. // Hangul Fourth Range
  133. // YO to YU
  134. for (char key = '\uffd2'; key <= '\uffd7'; key++) {
  135. char value = (char) (key - '\uce77');
  136. charCodeMap.put(key, value);
  137. }
  138. // More Hangul variants
  139. charCodeMap.put('\uffda', '\u3161'); // Hangul EU
  140. charCodeMap.put('\uffdb', '\u3162'); // Hangul YI
  141. charCodeMap.put('\uffdc', '\u3163'); // Hangul I
  142. // Symbol Variants
  143. charCodeMap.put('\uffe8', '\u2502'); // Forms Light Vertical
  144. charCodeMap.put('\uffe9', '\u2190'); // Leftwards Arrow
  145. charCodeMap.put('\uffea', '\u2191'); // Upwards Arrow
  146. charCodeMap.put('\uffeb', '\u2192'); // Rightwards Arrow
  147. charCodeMap.put('\uffec', '\u2193'); // Downwards Arrow
  148. charCodeMap.put('\uffed', '\u25a0'); // Black Square
  149. charCodeMap.put('\uffee', '\u25cb'); // White Circle
  150. }
  151.  
  152. /**
  153. * Takes an unnormalized (Halfwidth/Fullwidth) and outputs a normalized string
  154. */
  155. public static void main(String[] args) {
  156. String[] unnormalized = {"Asia", "アジア", "アジア"};
  157. for(int j=0; j<unnormalized.length; j++){
  158. System.out.println("Unnormalized:\t " + unnormalized[j]);
  159. char[] buffer = unnormalized[j].toCharArray();
  160. int bufferLen = buffer.length;
  161. for (int i = 0; i < bufferLen; i++) {
  162. if (charCodeMap.containsKey(buffer[i])) {
  163. buffer[i] = charCodeMap.get(buffer[i]);
  164. }
  165. }
  166. System.out.println("Normalized:\t " + new String(buffer));
  167. }
  168. }
  169.  
  170. }
Success #stdin #stdout 0.03s 245632KB
stdin
Standard input is empty
stdout
Unnormalized:	 Asia
Normalized:	 Asia
Unnormalized:	 アジア
Normalized:	 アジア
Unnormalized:	 アジア
Normalized:	 アジア