import java.util.Map;
import java.util.HashMap;
/**
* Halfwidth and Fullwidth Character Normalization for CJK
* http://s...content-available-to-author-only...s.asia
*
* See the Unicode Standard 6.0 – Halfwidth and Fullwidth Forms
* http://u...content-available-to-author-only...e.org/charts/PDF/UFF00.pdf
*
* For Chinese, Japanese and Korean, some characters have Unicode mappings to
* both a halfwidth and a fullwidth version. This code normalizes them
* to halfwidth for Latin characters, numbers and punctuation, and to fullwidth
* for everything else.
* This is sufficient for halfwidth/fullwidth normalization, but it is not fully
* equivalent to NFKC normalization.
*/
public class Main {

    /**
     * Lookup table used for normalization.
     * Key   - original (width variant) character.
     * Value - replacement character.
     */
    private static final Map<Character, Character> charCodeMap =
            new HashMap<Character, Character>();

    static {
        // ---------------- TO HALFWIDTH CHARACTERS ----------------

        // ASCII variants (Latin symbols, punctuation, numbers and alphabet):
        // U+FF01..U+FF5E map onto U+0021..U+007E.
        mapRange('\uff01', '\uff5e', '\u0021');

        // Brackets
        charCodeMap.put('\uff5f', '\u2985'); // left white parenthesis
        charCodeMap.put('\uff60', '\u2986'); // right white parenthesis

        // Symbol variants
        charCodeMap.put('\uffe0', '\u00a2'); // Cent sign
        charCodeMap.put('\uffe1', '\u00a3'); // Pound sign
        charCodeMap.put('\uffe2', '\u00ac'); // Not sign
        charCodeMap.put('\uffe3', '\u00af'); // Macron
        charCodeMap.put('\uffe4', '\u00a6'); // Broken bar
        charCodeMap.put('\uffe5', '\u00a5'); // Yen sign
        charCodeMap.put('\uffe6', '\u20a9'); // Won sign

        // Space (strictly speaking not listed in the Unicode 6.0 Halfwidth and
        // Fullwidth Forms block, but included here because the ideographic
        // space can cause issues).
        charCodeMap.put('\u3000', '\u0020'); // SPACE

        // ---------------- TO FULLWIDTH CHARACTERS ----------------

        // CJK punctuation
        charCodeMap.put('\uff61', '\u3002'); // ideographic full stop
        charCodeMap.put('\uff62', '\u300c'); // left corner bracket
        charCodeMap.put('\uff63', '\u300d'); // right corner bracket
        charCodeMap.put('\uff64', '\u3001'); // ideographic comma

        // Katakana variants. The target code points are not contiguous
        // (voiced/semi-voiced forms are interleaved in the Katakana block),
        // so every pair is listed explicitly.
        charCodeMap.put('\uff65', '\u30fb'); // Middle Dot
        charCodeMap.put('\uff66', '\u30f2'); // Wo
        charCodeMap.put('\uff67', '\u30a1'); // A small
        charCodeMap.put('\uff68', '\u30a3'); // I small
        charCodeMap.put('\uff69', '\u30a5'); // U small
        charCodeMap.put('\uff6a', '\u30a7'); // E small
        charCodeMap.put('\uff6b', '\u30a9'); // O small
        charCodeMap.put('\uff6c', '\u30e3'); // Ya small
        charCodeMap.put('\uff6d', '\u30e5'); // Yu small
        charCodeMap.put('\uff6e', '\u30e7'); // Yo small
        charCodeMap.put('\uff6f', '\u30c3'); // Tsu small
        charCodeMap.put('\uff70', '\u30fc'); // Prolonged Sound Mark
        charCodeMap.put('\uff71', '\u30a2'); // A
        charCodeMap.put('\uff72', '\u30a4'); // I
        charCodeMap.put('\uff73', '\u30a6'); // U
        charCodeMap.put('\uff74', '\u30a8'); // E
        charCodeMap.put('\uff75', '\u30aa'); // O
        charCodeMap.put('\uff76', '\u30ab'); // Ka
        charCodeMap.put('\uff77', '\u30ad'); // Ki
        charCodeMap.put('\uff78', '\u30af'); // Ku
        charCodeMap.put('\uff79', '\u30b1'); // Ke
        charCodeMap.put('\uff7a', '\u30b3'); // Ko
        charCodeMap.put('\uff7b', '\u30b5'); // Sa
        charCodeMap.put('\uff7c', '\u30b7'); // Shi
        charCodeMap.put('\uff7d', '\u30b9'); // Su
        charCodeMap.put('\uff7e', '\u30bb'); // Se
        charCodeMap.put('\uff7f', '\u30bd'); // So
        charCodeMap.put('\uff80', '\u30bf'); // Ta
        charCodeMap.put('\uff81', '\u30c1'); // Chi
        charCodeMap.put('\uff82', '\u30c4'); // Tsu
        charCodeMap.put('\uff83', '\u30c6'); // Te
        charCodeMap.put('\uff84', '\u30c8'); // To
        charCodeMap.put('\uff85', '\u30ca'); // Na
        charCodeMap.put('\uff86', '\u30cb'); // Ni
        charCodeMap.put('\uff87', '\u30cc'); // Nu
        charCodeMap.put('\uff88', '\u30cd'); // Ne
        charCodeMap.put('\uff89', '\u30ce'); // No
        charCodeMap.put('\uff8a', '\u30cf'); // Ha
        charCodeMap.put('\uff8b', '\u30d2'); // Hi
        charCodeMap.put('\uff8c', '\u30d5'); // Hu
        charCodeMap.put('\uff8d', '\u30d8'); // He
        charCodeMap.put('\uff8e', '\u30db'); // Ho
        charCodeMap.put('\uff8f', '\u30de'); // Ma
        charCodeMap.put('\uff90', '\u30df'); // Mi
        charCodeMap.put('\uff91', '\u30e0'); // Mu
        charCodeMap.put('\uff92', '\u30e1'); // Me
        charCodeMap.put('\uff93', '\u30e2'); // Mo
        charCodeMap.put('\uff94', '\u30e4'); // Ya
        charCodeMap.put('\uff95', '\u30e6'); // Yu
        charCodeMap.put('\uff96', '\u30e8'); // Yo
        charCodeMap.put('\uff97', '\u30e9'); // Ra
        charCodeMap.put('\uff98', '\u30ea'); // Ri
        charCodeMap.put('\uff99', '\u30eb'); // Ru
        charCodeMap.put('\uff9a', '\u30ec'); // Re
        charCodeMap.put('\uff9b', '\u30ed'); // Ro
        charCodeMap.put('\uff9c', '\u30ef'); // Wa
        charCodeMap.put('\uff9d', '\u30f3'); // N
        charCodeMap.put('\uff9e', '\u3099'); // Voiced Sound Mark
        charCodeMap.put('\uff9f', '\u309a'); // Semi-Voiced Sound Mark

        // Hangul variants
        charCodeMap.put('\uffa0', '\u3164'); // Hangul Filler
        mapRange('\uffa1', '\uffbe', '\u3131'); // First range:  KIYEOK to HIEUH
        mapRange('\uffc2', '\uffc7', '\u314f'); // Second range: A to E
        mapRange('\uffca', '\uffcf', '\u3155'); // Third range:  YEO to OE
        mapRange('\uffd2', '\uffd7', '\u315b'); // Fourth range: YO to YU

        // More Hangul variants
        charCodeMap.put('\uffda', '\u3161'); // Hangul EU
        charCodeMap.put('\uffdb', '\u3162'); // Hangul YI
        charCodeMap.put('\uffdc', '\u3163'); // Hangul I

        // Symbol variants
        charCodeMap.put('\uffe8', '\u2502'); // Forms Light Vertical
        charCodeMap.put('\uffe9', '\u2190'); // Leftwards Arrow
        charCodeMap.put('\uffea', '\u2191'); // Upwards Arrow
        charCodeMap.put('\uffeb', '\u2192'); // Rightwards Arrow
        charCodeMap.put('\uffec', '\u2193'); // Downwards Arrow
        charCodeMap.put('\uffed', '\u25a0'); // Black Square
        charCodeMap.put('\uffee', '\u25cb'); // White Circle
    }

    /**
     * Registers a contiguous run of mappings: {@code firstKey} maps to
     * {@code firstValue}, {@code firstKey + 1} to {@code firstValue + 1}, and
     * so on up to and including {@code lastKey}.
     *
     * @param firstKey   first original character of the run (inclusive)
     * @param lastKey    last original character of the run (inclusive)
     * @param firstValue replacement for {@code firstKey}
     */
    private static void mapRange(char firstKey, char lastKey, char firstValue) {
        int count = lastKey - firstKey;
        // An int offset is used so the loop cannot wrap around the char range.
        for (int offset = 0; offset <= count; offset++) {
            charCodeMap.put((char) (firstKey + offset), (char) (firstValue + offset));
        }
    }

    /**
     * Returns a copy of {@code input} in which every character that has an
     * entry in the mapping table is replaced by its mapped counterpart.
     * Characters without an entry are copied unchanged, and the result always
     * has the same length as the input.
     *
     * @param input the string to normalize; must not be {@code null}
     * @return the normalized string
     * @throws NullPointerException if {@code input} is {@code null}
     */
    public static String normalize(String input) {
        if (input == null) {
            throw new NullPointerException("input");
        }
        char[] buffer = input.toCharArray();
        for (int i = 0; i < buffer.length; i++) {
            // Single lookup: a null result means "no mapping, keep as is".
            Character replacement = charCodeMap.get(buffer[i]);
            if (replacement != null) {
                buffer[i] = replacement;
            }
        }
        return new String(buffer);
    }

    /**
     * Demo entry point: prints each sample string followed by its normalized
     * form.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        // NOTE(review): these samples contain no halfwidth/fullwidth variants
        // as written, so they come out unchanged - confirm whether the
        // original samples were width variants that got altered in transit.
        String[] unnormalized = {"Asia", "アジア", "アジア"};
        for (int j = 0; j < unnormalized.length; j++) {
            System.out.println("Unnormalized:\t " + unnormalized[j]);
            // Previously the converted buffer was computed and then dropped;
            // the normalized result is now actually printed.
            System.out.println("Normalized:\t " + normalize(unnormalized[j]));
        }
    }
}