|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
package jdk.internal.icu.text; |
|
|
|
import jdk.internal.icu.impl.Norm2AllModes; |
|
|
|
import java.text.CharacterIterator; |
|
import java.text.Normalizer; |
|
|
|
/** |
|
* Unicode Normalization |
|
* |
|
* <h2>Unicode normalization API</h2> |
|
* |
|
* <code>normalize</code> transforms Unicode text into an equivalent composed or |
|
* decomposed form, allowing for easier sorting and searching of text. |
|
* <code>normalize</code> supports the standard normalization forms described in |
|
* <a href="http://www.unicode.org/reports/tr15/" target="unicode"> |
|
* Unicode Standard Annex #15 — Unicode Normalization Forms</a>. |
|
* |
|
* Characters with accents or other adornments can be encoded in |
|
* several different ways in Unicode. For example, take the character A-acute. |
|
* In Unicode, this can be encoded as a single character (the |
|
* "composed" form): |
|
* |
|
* <pre> |
|
* 00C1 LATIN CAPITAL LETTER A WITH ACUTE |
|
* </pre> |
|
* |
|
* or as two separate characters (the "decomposed" form): |
|
* |
|
* <pre> |
|
* 0041 LATIN CAPITAL LETTER A |
|
* 0301 COMBINING ACUTE ACCENT |
|
* </pre> |
|
* |
|
* To a user of your program, however, both of these sequences should be |
|
* treated as the same "user-level" character "A with acute accent". When you |
|
* are searching or comparing text, you must ensure that these two sequences are |
|
* treated equivalently. In addition, you must handle characters with more than |
|
* one accent. Sometimes the order of a character's combining accents is |
|
* significant, while in other cases accent sequences in different orders are |
|
* really equivalent. |
|
* |
|
* Similarly, the string "ffi" can be encoded as three separate letters: |
|
* |
|
* <pre> |
|
* 0066 LATIN SMALL LETTER F |
|
* 0066 LATIN SMALL LETTER F |
|
* 0069 LATIN SMALL LETTER I |
|
* </pre> |
|
* |
|
* or as the single character |
|
* |
|
* <pre> |
|
* FB03 LATIN SMALL LIGATURE FFI |
|
* </pre> |
|
* |
|
* The ffi ligature is not a distinct semantic character, and strictly speaking |
|
* it shouldn't be in Unicode at all, but it was included for compatibility |
|
* with existing character sets that already provided it. The Unicode standard |
|
* identifies such characters by giving them "compatibility" decompositions |
|
* into the corresponding semantic characters. When sorting and searching, you |
|
* will often want to use these mappings. |
|
* |
|
* <code>normalize</code> helps solve these problems by transforming text into |
|
* the canonical composed and decomposed forms as shown in the first example |
|
* above. In addition, you can have it perform compatibility decompositions so |
|
* that you can treat compatibility characters the same as their equivalents. |
|
* Finally, <code>normalize</code> rearranges accents into the proper canonical |
|
* order, so that you do not have to worry about accent rearrangement on your |
|
* own. |
|
* |
|
* Form FCD, "Fast C or D", is also designed for collation. |
|
* It allows working on strings that are not necessarily normalized
|
* with an algorithm (like in collation) that works under "canonical closure", |
|
* i.e., it treats precomposed characters and their decomposed equivalents the |
|
* same. |
|
* |
|
* It is not a normalization form because it does not provide for uniqueness of |
|
* representation. Multiple strings may be canonically equivalent (their NFDs |
|
* are identical) and may all conform to FCD without being identical themselves. |
|
* |
|
* The form is defined such that the "raw decomposition", the recursive |
|
* canonical decomposition of each character, results in a string that is |
|
* canonically ordered. This means that precomposed characters are allowed for |
|
* as long as their decompositions do not need canonical reordering. |
|
* |
|
* Its advantage for a process like collation is that all NFD and most NFC texts |
|
* - and many unnormalized texts - already conform to FCD and do not need to be |
|
* normalized (NFD) for such a process. The FCD quick check will return YES for |
|
* most strings in practice. |
|
* |
|
* normalize(FCD) may be implemented with NFD. |
|
* |
|
* For more details on FCD see Unicode Technical Note #5 (Canonical Equivalence in Applications): |
|
* http://www.unicode.org/notes/tn5/#FCD |
|
* |
|
* ICU collation performs either NFD or FCD normalization automatically if |
|
* normalization is turned on for the collator object. Beyond collation and |
|
* string search, normalized strings may be useful for string equivalence |
|
* comparisons, transliteration/transcription, unique representations, etc. |
|
* |
|
* The W3C generally recommends exchanging texts in NFC.
|
* Note also that most legacy character encodings use only precomposed forms and |
|
* often do not encode any combining marks by themselves. For conversion to such |
|
* character encodings the Unicode text needs to be normalized to NFC. |
|
* For more usage examples, see the Unicode Standard Annex. |
|
* |
|
* Note: The Normalizer class also provides API for iterative normalization. |
|
* While the setIndex() and getIndex() refer to indices in the |
|
* underlying Unicode input text, the next() and previous() methods |
|
* iterate through characters in the normalized output. |
|
* This means that there is not necessarily a one-to-one correspondence |
|
* between characters returned by next() and previous() and the indices |
|
* passed to and returned from setIndex() and getIndex(). |
|
* It is for this reason that Normalizer does not implement the CharacterIterator interface. |
|
* |
|
* @stable ICU 2.8 |
|
*/ |
|
|
|
public final class NormalizerBase implements Cloneable { |
|
|
|
|
|
private UCharacterIterator text; |
|
private Normalizer2 norm2; |
|
private Mode mode; |
|
private int options; |
|
|
|
// The normalization buffer is the result of normalization |
|
|
|
private int currentIndex; |
|
private int nextIndex; |
|
|
|
|
|
private StringBuilder buffer; |
|
private int bufferPos; |
|
|
|
|
|
private static final class ModeImpl { |
|
private ModeImpl(Normalizer2 n2) { |
|
normalizer2 = n2; |
|
} |
|
private final Normalizer2 normalizer2; |
|
} |
|
|
|
private static final class NFDModeImpl { |
|
private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFDInstance()); |
|
} |
|
|
|
private static final class NFKDModeImpl { |
|
private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFKDInstance()); |
|
} |
|
|
|
private static final class NFCModeImpl { |
|
private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFCInstance()); |
|
} |
|
|
|
private static final class NFKCModeImpl { |
|
private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFKCInstance()); |
|
} |
|
|
|
private static final class Unicode32 { |
|
private static final UnicodeSet INSTANCE = new UnicodeSet("[:age=3.2:]").freeze(); |
|
} |
|
|
|
private static final class NFD32ModeImpl { |
|
private static final ModeImpl INSTANCE = |
|
new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFDInstance(), |
|
Unicode32.INSTANCE)); |
|
} |
|
|
|
private static final class NFKD32ModeImpl { |
|
private static final ModeImpl INSTANCE = |
|
new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFKDInstance(), |
|
Unicode32.INSTANCE)); |
|
} |
|
|
|
private static final class NFC32ModeImpl { |
|
private static final ModeImpl INSTANCE = |
|
new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFCInstance(), |
|
Unicode32.INSTANCE)); |
|
} |
|
|
|
private static final class NFKC32ModeImpl { |
|
private static final ModeImpl INSTANCE = |
|
new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFKCInstance(), |
|
Unicode32.INSTANCE)); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
public static final int UNICODE_3_2=0x20; |
|
|
|
public static final int UNICODE_3_2_0_ORIGINAL=UNICODE_3_2; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
public static final int UNICODE_LATEST = 0x00; |
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
public static final int DONE = UCharacterIterator.DONE; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
public abstract static class Mode { |
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
@Deprecated |
|
protected Mode() { |
|
} |
|
|
|
|
|
|
|
|
|
*/ |
|
@Deprecated |
|
protected abstract Normalizer2 getNormalizer2(int options); |
|
} |
|
|
|
private static Mode toMode(Normalizer.Form form) { |
|
switch (form) { |
|
case NFC : |
|
return NFC; |
|
case NFD : |
|
return NFD; |
|
case NFKC : |
|
return NFKC; |
|
case NFKD : |
|
return NFKD; |
|
} |
|
|
|
throw new IllegalArgumentException("Unexpected normalization form: " + |
|
form); |
|
} |
|
|
|
private static final class NONEMode extends Mode { |
|
protected Normalizer2 getNormalizer2(int options) { return Norm2AllModes.NOOP_NORMALIZER2; } |
|
} |
|
|
|
private static final class NFDMode extends Mode { |
|
protected Normalizer2 getNormalizer2(int options) { |
|
return (options&UNICODE_3_2) != 0 ? |
|
NFD32ModeImpl.INSTANCE.normalizer2 : |
|
NFDModeImpl.INSTANCE.normalizer2; |
|
} |
|
} |
|
|
|
private static final class NFKDMode extends Mode { |
|
protected Normalizer2 getNormalizer2(int options) { |
|
return (options&UNICODE_3_2) != 0 ? |
|
NFKD32ModeImpl.INSTANCE.normalizer2 : |
|
NFKDModeImpl.INSTANCE.normalizer2; |
|
} |
|
} |
|
|
|
private static final class NFCMode extends Mode { |
|
protected Normalizer2 getNormalizer2(int options) { |
|
return (options&UNICODE_3_2) != 0 ? |
|
NFC32ModeImpl.INSTANCE.normalizer2 : |
|
NFCModeImpl.INSTANCE.normalizer2; |
|
} |
|
} |
|
|
|
private static final class NFKCMode extends Mode { |
|
protected Normalizer2 getNormalizer2(int options) { |
|
return (options&UNICODE_3_2) != 0 ? |
|
NFKC32ModeImpl.INSTANCE.normalizer2 : |
|
NFKCModeImpl.INSTANCE.normalizer2; |
|
} |
|
} |
|
|
|
|
|
|
|
|
|
*/ |
|
public static final Mode NONE = new NONEMode(); |
|
|
|
|
|
|
|
|
|
*/ |
|
public static final Mode NFD = new NFDMode(); |
|
|
|
|
|
|
|
|
|
*/ |
|
public static final Mode NFKD = new NFKDMode(); |
|
|
|
|
|
|
|
|
|
*/ |
|
public static final Mode NFC = new NFCMode(); |
|
|
|
public static final Mode NFKC =new NFKCMode(); |
|
|
|
//------------------------------------------------------------------------- |
|
// Iterator constructors |
|
//------------------------------------------------------------------------- |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Creates an iterator over the normalized form of the given string.
 *
 * @param str  the text to be normalized
 * @param mode the normalization mode
 * @param opt  option bits handed to <code>mode.getNormalizer2</code>,
 *             for example <code>UNICODE_3_2</code>
 */
public NormalizerBase(String str, Mode mode, int opt) {
    text = UCharacterIterator.getInstance(str);
    norm2 = mode.getNormalizer2(opt);
    this.mode = mode;
    options = opt;
    buffer = new StringBuilder();
}
|
|
|
/**
 * Creates an iterator over the normalized form of the given string with
 * no option bits set.
 *
 * @param str  the text to be normalized
 * @param mode the normalization mode
 */
public NormalizerBase(String str, Mode mode) {
    this(str, mode, UNICODE_LATEST);
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
public NormalizerBase(CharacterIterator iter, Mode mode, int opt) { |
|
this.text = UCharacterIterator.getInstance((CharacterIterator)iter.clone()); |
|
this.mode = mode; |
|
this.options = opt; |
|
norm2 = mode.getNormalizer2(opt); |
|
buffer = new StringBuilder(); |
|
} |
|
|
|
/**
 * Creates an iterator over the normalized form of the text supplied by a
 * CharacterIterator with no option bits set.
 *
 * @param iter the text to be normalized
 * @param mode the normalization mode
 */
public NormalizerBase(CharacterIterator iter, Mode mode) {
    this(iter, mode, UNICODE_LATEST);
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
public Object clone() { |
|
try { |
|
NormalizerBase copy = (NormalizerBase) super.clone(); |
|
copy.text = (UCharacterIterator) text.clone(); |
|
copy.mode = mode; |
|
copy.options = options; |
|
copy.norm2 = norm2; |
|
copy.buffer = new StringBuilder(buffer); |
|
copy.bufferPos = bufferPos; |
|
copy.currentIndex = currentIndex; |
|
copy.nextIndex = nextIndex; |
|
return copy; |
|
} |
|
catch (CloneNotSupportedException e) { |
|
throw new InternalError(e.toString(), e); |
|
} |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
public static String normalize(String str, Mode mode, int options) { |
|
return mode.getNormalizer2(options).normalize(str); |
|
} |
|
|
|
public static String normalize(String str, Normalizer.Form form) { |
|
return NormalizerBase.normalize(str, toMode(form), UNICODE_LATEST); |
|
} |
|
|
|
public static String normalize(String str, Normalizer.Form form, int options) { |
|
return NormalizerBase.normalize(str, toMode(form), options); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
public static boolean isNormalized(String str, Mode mode, int options) { |
|
return mode.getNormalizer2(options).isNormalized(str); |
|
} |
|
|
|
public static boolean isNormalized(String str, Normalizer.Form form) { |
|
return NormalizerBase.isNormalized(str, toMode(form), UNICODE_LATEST); |
|
} |
|
|
|
public static boolean isNormalized(String str, Normalizer.Form form, int options) { |
|
return NormalizerBase.isNormalized(str, toMode(form), options); |
|
} |
|
|
|
//------------------------------------------------------------------------- |
|
// Iteration API |
|
//------------------------------------------------------------------------- |
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
public int current() { |
|
if(bufferPos<buffer.length() || nextNormalize()) { |
|
return buffer.codePointAt(bufferPos); |
|
} else { |
|
return DONE; |
|
} |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
public int next() { |
|
if(bufferPos<buffer.length() || nextNormalize()) { |
|
int c=buffer.codePointAt(bufferPos); |
|
bufferPos+=Character.charCount(c); |
|
return c; |
|
} else { |
|
return DONE; |
|
} |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
public int previous() { |
|
if(bufferPos>0 || previousNormalize()) { |
|
int c=buffer.codePointBefore(bufferPos); |
|
bufferPos-=Character.charCount(c); |
|
return c; |
|
} else { |
|
return DONE; |
|
} |
|
} |
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
public void reset() { |
|
text.setIndex(0); |
|
currentIndex=nextIndex=0; |
|
clearBuffer(); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
public void setIndexOnly(int index) { |
|
text.setIndex(index); |
|
currentIndex=nextIndex=index; |
|
clearBuffer(); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
public int setIndex(int index) { |
|
setIndexOnly(index); |
|
return current(); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
@Deprecated |
|
public int getBeginIndex() { |
|
return 0; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
@Deprecated |
|
public int getEndIndex() { |
|
return endIndex(); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
public int getIndex() { |
|
if(bufferPos<buffer.length()) { |
|
return currentIndex; |
|
} else { |
|
return nextIndex; |
|
} |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
public int endIndex() { |
|
return text.getLength(); |
|
} |
|
|
|
//------------------------------------------------------------------------- |
|
// Iterator attributes |
|
//------------------------------------------------------------------------- |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
public void setMode(Mode newMode) { |
|
mode = newMode; |
|
norm2 = mode.getNormalizer2(options); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
public Mode getMode() { |
|
return mode; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
public void setText(String newText) { |
|
UCharacterIterator newIter = UCharacterIterator.getInstance(newText); |
|
if (newIter == null) { |
|
throw new IllegalStateException("Could not create a new UCharacterIterator"); |
|
} |
|
text = newIter; |
|
reset(); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
public void setText(CharacterIterator newText) { |
|
UCharacterIterator newIter = UCharacterIterator.getInstance(newText); |
|
if (newIter == null) { |
|
throw new IllegalStateException("Could not create a new UCharacterIterator"); |
|
} |
|
text = newIter; |
|
currentIndex=nextIndex=0; |
|
clearBuffer(); |
|
} |
|
|
|
private void clearBuffer() { |
|
buffer.setLength(0); |
|
bufferPos=0; |
|
} |
|
|
|
private boolean nextNormalize() { |
|
clearBuffer(); |
|
currentIndex=nextIndex; |
|
text.setIndex(nextIndex); |
|
|
|
int c=text.nextCodePoint(); |
|
if(c<0) { |
|
return false; |
|
} |
|
StringBuilder segment=new StringBuilder().appendCodePoint(c); |
|
while((c=text.nextCodePoint())>=0) { |
|
if(norm2.hasBoundaryBefore(c)) { |
|
text.moveCodePointIndex(-1); |
|
break; |
|
} |
|
segment.appendCodePoint(c); |
|
} |
|
nextIndex=text.getIndex(); |
|
norm2.normalize(segment, buffer); |
|
return buffer.length()!=0; |
|
} |
|
|
|
private boolean previousNormalize() { |
|
clearBuffer(); |
|
nextIndex=currentIndex; |
|
text.setIndex(currentIndex); |
|
StringBuilder segment=new StringBuilder(); |
|
int c; |
|
while((c=text.previousCodePoint())>=0) { |
|
if(c<=0xffff) { |
|
segment.insert(0, (char)c); |
|
} else { |
|
segment.insert(0, Character.toChars(c)); |
|
} |
|
if(norm2.hasBoundaryBefore(c)) { |
|
break; |
|
} |
|
} |
|
currentIndex=text.getIndex(); |
|
norm2.normalize(segment, buffer); |
|
bufferPos=buffer.length(); |
|
return buffer.length()!=0; |
|
} |
|
|
|
} |