/* |
|
* Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved. |
|
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
|
* |
|
* This code is free software; you can redistribute it and/or modify it |
|
* under the terms of the GNU General Public License version 2 only, as |
|
* published by the Free Software Foundation. Oracle designates this |
|
* particular file as subject to the "Classpath" exception as provided |
|
* by Oracle in the LICENSE file that accompanied this code. |
|
* |
|
* This code is distributed in the hope that it will be useful, but WITHOUT |
|
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
|
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
|
* version 2 for more details (a copy is included in the LICENSE file that |
|
* accompanied this code). |
|
* |
|
* You should have received a copy of the GNU General Public License version |
|
* 2 along with this work; if not, write to the Free Software Foundation, |
|
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
|
* |
|
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA |
|
* or visit www.oracle.com if you need additional information or have any |
|
* questions. |
|
*/ |
|
/* |
|
******************************************************************************* |
|
* Copyright (C) 2000-2014, International Business Machines Corporation and |
|
* others. All Rights Reserved. |
|
******************************************************************************* |
|
*/ |
|
package sun.text.normalizer; |
|
import java.text.CharacterIterator; |
|
import java.text.Normalizer; |
|
/** |
|
* Unicode Normalization |
|
* |
|
* <h2>Unicode normalization API</h2> |
|
* |
|
* <code>normalize</code> transforms Unicode text into an equivalent composed or |
|
* decomposed form, allowing for easier sorting and searching of text. |
|
* <code>normalize</code> supports the standard normalization forms described in |
|
* <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode"> |
|
* Unicode Standard Annex #15 — Unicode Normalization Forms</a>. |
|
* |
|
* Characters with accents or other adornments can be encoded in |
|
* several different ways in Unicode. For example, take the character A-acute. |
|
* In Unicode, this can be encoded as a single character (the |
|
* "composed" form): |
|
* |
|
* <pre> |
|
* 00C1 LATIN CAPITAL LETTER A WITH ACUTE |
|
* </pre> |
|
* |
|
* or as two separate characters (the "decomposed" form): |
|
* |
|
* <pre> |
|
* 0041 LATIN CAPITAL LETTER A |
|
* 0301 COMBINING ACUTE ACCENT |
|
* </pre> |
|
* |
|
* To a user of your program, however, both of these sequences should be |
|
* treated as the same "user-level" character "A with acute accent". When you |
|
* are searching or comparing text, you must ensure that these two sequences are |
|
* treated equivalently. In addition, you must handle characters with more than |
|
* one accent. Sometimes the order of a character's combining accents is |
|
* significant, while in other cases accent sequences in different orders are |
|
* really equivalent. |
|
* |
|
* Similarly, the string "ffi" can be encoded as three separate letters: |
|
* |
|
* <pre> |
|
* 0066 LATIN SMALL LETTER F |
|
* 0066 LATIN SMALL LETTER F |
|
* 0069 LATIN SMALL LETTER I |
|
* </pre> |
|
* |
|
* or as the single character |
|
* |
|
* <pre> |
|
* FB03 LATIN SMALL LIGATURE FFI |
|
* </pre> |
|
* |
|
* The ffi ligature is not a distinct semantic character, and strictly speaking |
|
* it shouldn't be in Unicode at all, but it was included for compatibility |
|
* with existing character sets that already provided it. The Unicode standard |
|
* identifies such characters by giving them "compatibility" decompositions |
|
* into the corresponding semantic characters. When sorting and searching, you |
|
* will often want to use these mappings. |
|
* |
|
* <code>normalize</code> helps solve these problems by transforming text into |
|
* the canonical composed and decomposed forms as shown in the first example |
|
* above. In addition, you can have it perform compatibility decompositions so |
|
* that you can treat compatibility characters the same as their equivalents. |
|
* Finally, <code>normalize</code> rearranges accents into the proper canonical |
|
* order, so that you do not have to worry about accent rearrangement on your |
|
* own. |
|
* |
|
 * Form FCD, "Fast C or D", is also designed for collation.
 * It allows an algorithm that works under "canonical closure" (as collation
 * does) to operate on strings that are not necessarily normalized; i.e., such
 * an algorithm treats precomposed characters and their decomposed equivalents
 * the same.
|
* |
|
* It is not a normalization form because it does not provide for uniqueness of |
|
* representation. Multiple strings may be canonically equivalent (their NFDs |
|
* are identical) and may all conform to FCD without being identical themselves. |
|
* |
|
* The form is defined such that the "raw decomposition", the recursive |
|
* canonical decomposition of each character, results in a string that is |
|
* canonically ordered. This means that precomposed characters are allowed for |
|
* as long as their decompositions do not need canonical reordering. |
|
* |
|
* Its advantage for a process like collation is that all NFD and most NFC texts |
|
* - and many unnormalized texts - already conform to FCD and do not need to be |
|
* normalized (NFD) for such a process. The FCD quick check will return YES for |
|
* most strings in practice. |
|
* |
|
* normalize(FCD) may be implemented with NFD. |
|
* |
|
* For more details on FCD see Unicode Technical Note #5 (Canonical Equivalence in Applications): |
|
* http://www.unicode.org/notes/tn5/#FCD |
|
* |
|
* ICU collation performs either NFD or FCD normalization automatically if |
|
* normalization is turned on for the collator object. Beyond collation and |
|
* string search, normalized strings may be useful for string equivalence |
|
* comparisons, transliteration/transcription, unique representations, etc. |
|
* |
|
 * The W3C generally recommends exchanging text in NFC.
|
* Note also that most legacy character encodings use only precomposed forms and |
|
* often do not encode any combining marks by themselves. For conversion to such |
|
* character encodings the Unicode text needs to be normalized to NFC. |
|
* For more usage examples, see the Unicode Standard Annex. |
|
* |
|
* Note: The Normalizer class also provides API for iterative normalization. |
|
* While the setIndex() and getIndex() refer to indices in the |
|
* underlying Unicode input text, the next() and previous() methods |
|
* iterate through characters in the normalized output. |
|
* This means that there is not necessarily a one-to-one correspondence |
|
* between characters returned by next() and previous() and the indices |
|
* passed to and returned from setIndex() and getIndex(). |
|
* It is for this reason that Normalizer does not implement the CharacterIterator interface. |
|
* |
|
* @stable ICU 2.8 |
|
*/ |
|
// Original filename in ICU4J: Normalizer.java |
|
public final class NormalizerBase implements Cloneable { |
|
// The input text and our position in it |
|
private UCharacterIterator text; |
|
private Normalizer2 norm2; |
|
private Mode mode; |
|
private int options; |
|
// The normalization buffer is the result of normalization |
|
// of the source in [currentIndex..nextIndex] . |
|
private int currentIndex; |
|
private int nextIndex; |
|
// A buffer for holding intermediate results |
|
private StringBuilder buffer; |
|
private int bufferPos; |
|
// Helper classes to defer loading of normalization data. |
|
private static final class ModeImpl { |
|
private ModeImpl(Normalizer2 n2) { |
|
normalizer2 = n2; |
|
} |
|
private final Normalizer2 normalizer2; |
|
} |
|
private static final class NFDModeImpl { |
|
private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFDInstance()); |
|
} |
|
private static final class NFKDModeImpl { |
|
private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFKDInstance()); |
|
} |
|
private static final class NFCModeImpl { |
|
private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFCInstance()); |
|
} |
|
private static final class NFKCModeImpl { |
|
private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFKCInstance()); |
|
} |
|
private static final class Unicode32 { |
|
private static final UnicodeSet INSTANCE = new UnicodeSet("[:age=3.2:]").freeze(); |
|
} |
|
private static final class NFD32ModeImpl { |
|
private static final ModeImpl INSTANCE = |
|
new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFDInstance(), |
|
Unicode32.INSTANCE)); |
|
} |
|
private static final class NFKD32ModeImpl { |
|
private static final ModeImpl INSTANCE = |
|
new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFKDInstance(), |
|
Unicode32.INSTANCE)); |
|
} |
|
private static final class NFC32ModeImpl { |
|
private static final ModeImpl INSTANCE = |
|
new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFCInstance(), |
|
Unicode32.INSTANCE)); |
|
} |
|
private static final class NFKC32ModeImpl { |
|
private static final ModeImpl INSTANCE = |
|
new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFKCInstance(), |
|
Unicode32.INSTANCE)); |
|
} |
|
/** |
|
* Options bit set value to select Unicode 3.2 normalization |
|
* (except NormalizationCorrections). |
|
* At most one Unicode version can be selected at a time. |
|
* @stable ICU 2.6 |
|
*/ |
|
public static final int UNICODE_3_2=0x20; |
|
public static final int UNICODE_3_2_0_ORIGINAL=UNICODE_3_2; |
|
/* |
|
* Default option for the latest Unicode normalization. This option is |
|
* provided mainly for testing. |
|
* The value zero means that normalization is done with the fixes for |
|
* - Corrigendum 4 (Five CJK Canonical Mapping Errors) |
|
* - Corrigendum 5 (Normalization Idempotency) |
|
*/ |
|
public static final int UNICODE_LATEST = 0x00; |
|
/** |
|
* Constant indicating that the end of the iteration has been reached. |
|
* This is guaranteed to have the same value as {@link UCharacterIterator#DONE}. |
|
* @stable ICU 2.8 |
|
*/ |
|
public static final int DONE = UCharacterIterator.DONE; |
|
/** |
|
* Constants for normalization modes. |
|
* <p> |
|
* The Mode class is not intended for public subclassing. |
|
* Only the Mode constants provided by the Normalizer class should be used, |
|
* and any fields or methods should not be called or overridden by users. |
|
* @stable ICU 2.8 |
|
*/ |
|
public abstract static class Mode { |
|
/** |
|
* Sole constructor |
|
* @internal |
|
* @deprecated This API is ICU internal only. |
|
*/ |
|
@Deprecated |
|
protected Mode() { |
|
} |
|
/** |
|
* @internal |
|
* @deprecated This API is ICU internal only. |
|
*/ |
|
@Deprecated |
|
protected abstract Normalizer2 getNormalizer2(int options); |
|
} |
|
private static Mode toMode(Normalizer.Form form) { |
|
switch (form) { |
|
case NFC : |
|
return NFC; |
|
case NFD : |
|
return NFD; |
|
case NFKC : |
|
return NFKC; |
|
case NFKD : |
|
return NFKD; |
|
} |
|
throw new IllegalArgumentException("Unexpected normalization form: " + |
|
form); |
|
} |
|
private static final class NONEMode extends Mode { |
|
protected Normalizer2 getNormalizer2(int options) { return Norm2AllModes.NOOP_NORMALIZER2; } |
|
} |
|
private static final class NFDMode extends Mode { |
|
protected Normalizer2 getNormalizer2(int options) { |
|
return (options&UNICODE_3_2) != 0 ? |
|
NFD32ModeImpl.INSTANCE.normalizer2 : |
|
NFDModeImpl.INSTANCE.normalizer2; |
|
} |
|
} |
|
private static final class NFKDMode extends Mode { |
|
protected Normalizer2 getNormalizer2(int options) { |
|
return (options&UNICODE_3_2) != 0 ? |
|
NFKD32ModeImpl.INSTANCE.normalizer2 : |
|
NFKDModeImpl.INSTANCE.normalizer2; |
|
} |
|
} |
|
private static final class NFCMode extends Mode { |
|
protected Normalizer2 getNormalizer2(int options) { |
|
return (options&UNICODE_3_2) != 0 ? |
|
NFC32ModeImpl.INSTANCE.normalizer2 : |
|
NFCModeImpl.INSTANCE.normalizer2; |
|
} |
|
} |
|
private static final class NFKCMode extends Mode { |
|
protected Normalizer2 getNormalizer2(int options) { |
|
return (options&UNICODE_3_2) != 0 ? |
|
NFKC32ModeImpl.INSTANCE.normalizer2 : |
|
NFKCModeImpl.INSTANCE.normalizer2; |
|
} |
|
} |
|
/** |
|
* No decomposition/composition. |
|
* @stable ICU 2.8 |
|
*/ |
|
public static final Mode NONE = new NONEMode(); |
|
/** |
|
* Canonical decomposition. |
|
* @stable ICU 2.8 |
|
*/ |
|
public static final Mode NFD = new NFDMode(); |
|
/** |
|
* Compatibility decomposition. |
|
* @stable ICU 2.8 |
|
*/ |
|
public static final Mode NFKD = new NFKDMode(); |
|
/** |
|
* Canonical decomposition followed by canonical composition. |
|
* @stable ICU 2.8 |
|
*/ |
|
public static final Mode NFC = new NFCMode(); |
|
public static final Mode NFKC =new NFKCMode(); |
|
//------------------------------------------------------------------------- |
|
// Iterator constructors |
|
//------------------------------------------------------------------------- |
|
/** |
|
* Creates a new {@code NormalizerBase} object for iterating over the |
|
* normalized form of a given string. |
|
* <p> |
|
* The {@code options} parameter specifies which optional |
|
* {@code NormalizerBase} features are to be enabled for this object. |
|
* <p> |
|
* @param str The string to be normalized. The normalization |
|
* will start at the beginning of the string. |
|
* |
|
* @param mode The normalization mode. |
|
* |
|
* @param opt Any optional features to be enabled. |
|
* Currently the only available option is {@link #UNICODE_3_2}. |
|
* If you want the default behavior corresponding to one of the |
|
* standard Unicode Normalization Forms, use 0 for this argument. |
|
* @stable ICU 2.6 |
|
*/ |
|
public NormalizerBase(String str, Mode mode, int opt) { |
|
this.text = UCharacterIterator.getInstance(str); |
|
this.mode = mode; |
|
this.options=opt; |
|
norm2 = mode.getNormalizer2(opt); |
|
buffer = new StringBuilder(); |
|
} |
|
public NormalizerBase(String str, Mode mode) { |
|
this(str, mode, 0); |
|
} |
|
/** |
|
* Creates a new {@code NormalizerBase} object for iterating over the |
|
* normalized form of the given text. |
|
* <p> |
|
* @param iter The input text to be normalized. The normalization |
|
* will start at the beginning of the string. |
|
* |
|
* @param mode The normalization mode. |
|
* |
|
* @param opt Any optional features to be enabled. |
|
* Currently the only available option is {@link #UNICODE_3_2}. |
|
* If you want the default behavior corresponding to one of the |
|
* standard Unicode Normalization Forms, use 0 for this argument. |
|
* @stable ICU 2.6 |
|
*/ |
|
public NormalizerBase(CharacterIterator iter, Mode mode, int opt) { |
|
this.text = UCharacterIterator.getInstance((CharacterIterator)iter.clone()); |
|
this.mode = mode; |
|
this.options = opt; |
|
norm2 = mode.getNormalizer2(opt); |
|
buffer = new StringBuilder(); |
|
} |
|
public NormalizerBase(CharacterIterator iter, Mode mode) { |
|
this(iter, mode, 0); |
|
} |
|
/** |
|
* Clones this {@code NormalizerBase} object. All properties of this |
|
* object are duplicated in the new object, including the cloning of any |
|
* {@link CharacterIterator} that was passed in to the constructor |
|
* or to {@link #setText(CharacterIterator) setText}. |
|
* However, the text storage underlying |
|
* the {@code CharacterIterator} is not duplicated unless the |
|
* iterator's {@code clone} method does so. |
|
* @stable ICU 2.8 |
|
*/ |
|
public Object clone() { |
|
try { |
|
NormalizerBase copy = (NormalizerBase) super.clone(); |
|
copy.text = (UCharacterIterator) text.clone(); |
|
copy.mode = mode; |
|
copy.options = options; |
|
copy.norm2 = norm2; |
|
copy.buffer = new StringBuilder(buffer); |
|
copy.bufferPos = bufferPos; |
|
copy.currentIndex = currentIndex; |
|
copy.nextIndex = nextIndex; |
|
return copy; |
|
} |
|
catch (CloneNotSupportedException e) { |
|
throw new InternalError(e.toString(), e); |
|
} |
|
} |
|
/** |
|
* Normalizes a {@code String} using the given normalization operation. |
|
* <p> |
|
* The {@code options} parameter specifies which optional |
|
* {@code NormalizerBase} features are to be enabled for this operation. |
|
* Currently the only available option is {@link #UNICODE_3_2}. |
|
* If you want the default behavior corresponding to one of the standard |
|
* Unicode Normalization Forms, use 0 for this argument. |
|
* <p> |
|
* @param str the input string to be normalized. |
|
* @param mode the normalization mode |
|
* @param options the optional features to be enabled. |
|
* @return String the normalized string |
|
* @stable ICU 2.6 |
|
*/ |
|
public static String normalize(String str, Mode mode, int options) { |
|
return mode.getNormalizer2(options).normalize(str); |
|
} |
|
public static String normalize(String str, Normalizer.Form form) { |
|
return NormalizerBase.normalize(str, toMode(form), UNICODE_LATEST); |
|
} |
|
public static String normalize(String str, Normalizer.Form form, int options) { |
|
return NormalizerBase.normalize(str, toMode(form), options); |
|
} |
|
/** |
|
* Test if a string is in a given normalization form. |
|
* This is semantically equivalent to source.equals(normalize(source, mode)). |
|
* |
|
* Unlike quickCheck(), this function returns a definitive result, |
|
* never a "maybe". |
|
* For NFD, NFKD, and FCD, both functions work exactly the same. |
|
* For NFC and NFKC where quickCheck may return "maybe", this function will |
|
* perform further tests to arrive at a true/false result. |
|
* @param str the input string to be checked to see if it is |
|
* normalized |
|
* @param mode the normalization mode |
|
* @param options Options for use with exclusion set and tailored Normalization |
|
* The only option that is currently recognized is UNICODE_3_2 |
|
* @see #isNormalized |
|
* @stable ICU 2.6 |
|
*/ |
|
public static boolean isNormalized(String str, Mode mode, int options) { |
|
return mode.getNormalizer2(options).isNormalized(str); |
|
} |
|
public static boolean isNormalized(String str, Normalizer.Form form) { |
|
return NormalizerBase.isNormalized(str, toMode(form), UNICODE_LATEST); |
|
} |
|
public static boolean isNormalized(String str, Normalizer.Form form, int options) { |
|
return NormalizerBase.isNormalized(str, toMode(form), options); |
|
} |
|
//------------------------------------------------------------------------- |
|
// Iteration API |
|
//------------------------------------------------------------------------- |
|
/** |
|
* Return the current character in the normalized text. |
|
* @return The codepoint as an int |
|
* @stable ICU 2.8 |
|
*/ |
|
public int current() { |
|
if(bufferPos<buffer.length() || nextNormalize()) { |
|
return buffer.codePointAt(bufferPos); |
|
} else { |
|
return DONE; |
|
} |
|
} |
|
/** |
|
* Return the next character in the normalized text and advance |
|
* the iteration position by one. If the end |
|
* of the text has already been reached, {@link #DONE} is returned. |
|
* @return The codepoint as an int |
|
* @stable ICU 2.8 |
|
*/ |
|
public int next() { |
|
if(bufferPos<buffer.length() || nextNormalize()) { |
|
int c=buffer.codePointAt(bufferPos); |
|
bufferPos+=Character.charCount(c); |
|
return c; |
|
} else { |
|
return DONE; |
|
} |
|
} |
|
/** |
|
* Return the previous character in the normalized text and decrement |
|
* the iteration position by one. If the beginning |
|
* of the text has already been reached, {@link #DONE} is returned. |
|
* @return The codepoint as an int |
|
* @stable ICU 2.8 |
|
*/ |
|
public int previous() { |
|
if(bufferPos>0 || previousNormalize()) { |
|
int c=buffer.codePointBefore(bufferPos); |
|
bufferPos-=Character.charCount(c); |
|
return c; |
|
} else { |
|
return DONE; |
|
} |
|
} |
|
/** |
|
* Reset the index to the beginning of the text. |
|
* This is equivalent to setIndexOnly(startIndex)). |
|
* @stable ICU 2.8 |
|
*/ |
|
public void reset() { |
|
text.setIndex(0); |
|
currentIndex=nextIndex=0; |
|
clearBuffer(); |
|
} |
|
/** |
|
* Set the iteration position in the input text that is being normalized, |
|
* without any immediate normalization. |
|
* After setIndexOnly(), getIndex() will return the same index that is |
|
* specified here. |
|
* |
|
* @param index the desired index in the input text. |
|
* @stable ICU 2.8 |
|
*/ |
|
public void setIndexOnly(int index) { |
|
text.setIndex(index); // validates index |
|
currentIndex=nextIndex=index; |
|
clearBuffer(); |
|
} |
|
/** |
|
* Set the iteration position in the input text that is being normalized |
|
* and return the first normalized character at that position. |
|
* <p> |
|
* <b>Note:</b> This method sets the position in the <em>input</em> text, |
|
* while {@link #next} and {@link #previous} iterate through characters |
|
* in the normalized <em>output</em>. This means that there is not |
|
* necessarily a one-to-one correspondence between characters returned |
|
* by {@code next} and {@code previous} and the indices passed to and |
|
* returned from {@code setIndex} and {@link #getIndex}. |
|
* <p> |
|
* @param index the desired index in the input text. |
|
* |
|
* @return the first normalized character that is the result of iterating |
|
* forward starting at the given index. |
|
* |
|
* @throws IllegalArgumentException if the given index is less than |
|
* {@link #getBeginIndex} or greater than {@link #getEndIndex}. |
|
* deprecated ICU 3.2 |
|
* @obsolete ICU 3.2 |
|
*/ |
|
public int setIndex(int index) { |
|
setIndexOnly(index); |
|
return current(); |
|
} |
|
/** |
|
* Retrieve the index of the start of the input text. This is the begin |
|
* index of the {@code CharacterIterator} or the start (i.e. 0) of the |
|
* {@code String} over which this {@code NormalizerBase} is iterating |
|
* @deprecated ICU 2.2. Use startIndex() instead. |
|
* @return The codepoint as an int |
|
* @see #startIndex |
|
*/ |
|
@Deprecated |
|
public int getBeginIndex() { |
|
return 0; |
|
} |
|
/** |
|
* Retrieve the index of the end of the input text. This is the end index |
|
* of the {@code CharacterIterator} or the length of the {@code String} |
|
* over which this {@code NormalizerBase} is iterating |
|
* @deprecated ICU 2.2. Use endIndex() instead. |
|
* @return The codepoint as an int |
|
* @see #endIndex |
|
*/ |
|
@Deprecated |
|
public int getEndIndex() { |
|
return endIndex(); |
|
} |
|
/** |
|
* Retrieve the current iteration position in the input text that is |
|
* being normalized. This method is useful in applications such as |
|
* searching, where you need to be able to determine the position in |
|
* the input text that corresponds to a given normalized output character. |
|
* <p> |
|
* <b>Note:</b> This method sets the position in the <em>input</em>, while |
|
* {@link #next} and {@link #previous} iterate through characters in the |
|
* <em>output</em>. This means that there is not necessarily a one-to-one |
|
* correspondence between characters returned by {@code next} and |
|
* {@code previous} and the indices passed to and returned from |
|
* {@code setIndex} and {@link #getIndex}. |
|
* @return The current iteration position |
|
* @stable ICU 2.8 |
|
*/ |
|
public int getIndex() { |
|
if(bufferPos<buffer.length()) { |
|
return currentIndex; |
|
} else { |
|
return nextIndex; |
|
} |
|
} |
|
/** |
|
* Retrieve the index of the end of the input text. This is the end index |
|
* of the {@code CharacterIterator} or the length of the {@code String} |
|
* over which this {@code NormalizerBase} is iterating |
|
* @return The current iteration position |
|
* @stable ICU 2.8 |
|
*/ |
|
public int endIndex() { |
|
return text.getLength(); |
|
} |
|
//------------------------------------------------------------------------- |
|
// Iterator attributes |
|
//------------------------------------------------------------------------- |
|
/** |
|
* Set the normalization mode for this object. |
|
* <p> |
|
* <b>Note:</b>If the normalization mode is changed while iterating |
|
* over a string, calls to {@link #next} and {@link #previous} may |
|
* return previously buffers characters in the old normalization mode |
|
* until the iteration is able to re-sync at the next base character. |
|
* It is safest to call {@link #setText setText()}, {@link #first}, |
|
* {@link #last}, etc. after calling {@code setMode}. |
|
* <p> |
|
* @param newMode the new mode for this {@code NormalizerBase}. |
|
* The supported modes are: |
|
* <ul> |
|
* <li>{@link #NFC} - Unicode canonical decompositiion |
|
* followed by canonical composition. |
|
* <li>{@link #NFKC} - Unicode compatibility decompositiion |
|
* follwed by canonical composition. |
|
* <li>{@link #NFD} - Unicode canonical decomposition |
|
* <li>{@link #NFKD} - Unicode compatibility decomposition. |
|
* <li>{@link #NONE} - Do nothing but return characters |
|
* from the underlying input text. |
|
* </ul> |
|
* |
|
* @see #getMode |
|
* @stable ICU 2.8 |
|
*/ |
|
public void setMode(Mode newMode) { |
|
mode = newMode; |
|
norm2 = mode.getNormalizer2(options); |
|
} |
|
/** |
|
* Return the basic operation performed by this {@code NormalizerBase} |
|
* |
|
* @see #setMode |
|
* @stable ICU 2.8 |
|
*/ |
|
public Mode getMode() { |
|
return mode; |
|
} |
|
/** |
|
* Set the input text over which this {@code NormalizerBase} will iterate. |
|
* The iteration position is set to the beginning of the input text. |
|
* @param newText The new string to be normalized. |
|
* @stable ICU 2.8 |
|
*/ |
|
public void setText(String newText) { |
|
UCharacterIterator newIter = UCharacterIterator.getInstance(newText); |
|
if (newIter == null) { |
|
throw new IllegalStateException("Could not create a new UCharacterIterator"); |
|
} |
|
text = newIter; |
|
reset(); |
|
} |
|
/** |
|
* Set the input text over which this {@code NormalizerBase} will iterate. |
|
* The iteration position is set to the beginning of the input text. |
|
* @param newText The new string to be normalized. |
|
* @stable ICU 2.8 |
|
*/ |
|
public void setText(CharacterIterator newText) { |
|
UCharacterIterator newIter = UCharacterIterator.getInstance(newText); |
|
if (newIter == null) { |
|
throw new IllegalStateException("Could not create a new UCharacterIterator"); |
|
} |
|
text = newIter; |
|
currentIndex=nextIndex=0; |
|
clearBuffer(); |
|
} |
|
private void clearBuffer() { |
|
buffer.setLength(0); |
|
bufferPos=0; |
|
} |
|
private boolean nextNormalize() { |
|
clearBuffer(); |
|
currentIndex=nextIndex; |
|
text.setIndex(nextIndex); |
|
// Skip at least one character so we make progress. |
|
int c=text.nextCodePoint(); |
|
if(c<0) { |
|
return false; |
|
} |
|
StringBuilder segment=new StringBuilder().appendCodePoint(c); |
|
while((c=text.nextCodePoint())>=0) { |
|
if(norm2.hasBoundaryBefore(c)) { |
|
text.moveCodePointIndex(-1); |
|
break; |
|
} |
|
segment.appendCodePoint(c); |
|
} |
|
nextIndex=text.getIndex(); |
|
norm2.normalize(segment, buffer); |
|
return buffer.length()!=0; |
|
} |
|
private boolean previousNormalize() { |
|
clearBuffer(); |
|
nextIndex=currentIndex; |
|
text.setIndex(currentIndex); |
|
StringBuilder segment=new StringBuilder(); |
|
int c; |
|
while((c=text.previousCodePoint())>=0) { |
|
if(c<=0xffff) { |
|
segment.insert(0, (char)c); |
|
} else { |
|
segment.insert(0, Character.toChars(c)); |
|
} |
|
if(norm2.hasBoundaryBefore(c)) { |
|
break; |
|
} |
|
} |
|
currentIndex=text.getIndex(); |
|
norm2.normalize(segment, buffer); |
|
bufferPos=buffer.length(); |
|
return buffer.length()!=0; |
|
} |
|
} |