|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
/* |
|
******************************************************************************* |
|
* (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved * |
|
* * |
|
* The original version of this source code and documentation is copyrighted * |
|
* and owned by IBM, These materials are provided under terms of a License * |
|
* Agreement between IBM and Sun. This technology is protected by multiple * |
|
* US and International patents. This notice and attribution to IBM may not * |
|
* to removed. * |
|
******************************************************************************* |
|
*/ |
|
|
|
package sun.text.normalizer; |
|
|
|
import java.io.IOException; |
|
import java.util.MissingResourceException; |
|
|
|
/** |
|
* <p>
* The UCharacter class provides extensions to the
* <a href="https://docs.oracle.com/javase/1.5.0/docs/api/java/lang/Character.html">
* java.lang.Character</a> class. These extensions provide support for
* more Unicode properties and, together with the
* <a href="../text/UTF16.html">UTF16</a> class, provide support for
* supplementary characters (those with code points above U+FFFF).
* Each ICU release supports the latest version of Unicode available at that time.
* </p>
|
* <p> |
|
* Code points are represented in these API using ints. While it would be |
|
* more convenient in Java to have a separate primitive datatype for them, |
|
* ints suffice in the meantime. |
|
* </p> |
|
* <p> |
|
* To use this class please add the jar file name icu4j.jar to the |
|
* class path, since it contains data files which supply the information used |
|
* by this file.<br> |
|
* E.g. In Windows <br> |
|
* <code>set CLASSPATH=%CLASSPATH%;$JAR_FILE_PATH/ucharacter.jar</code>.<br> |
|
* Otherwise, another method would be to copy the files uprops.dat and |
|
* unames.icu from the icu4j source subdirectory |
|
* <i>$ICU4J_SRC/src/com.ibm.icu.impl.data</i> to your class directory |
|
* <i>$ICU4J_CLASS/com.ibm.icu.impl.data</i>. |
|
* </p> |
|
* <p> |
|
* Aside from the additions for UTF-16 support, and the updated Unicode |
|
* properties, the main differences between UCharacter and Character are: |
|
* <ul> |
|
* <li> UCharacter is not designed to be a char wrapper and does not have
* the APIs that involve management of that single char.<br>
|
* These include: |
|
* <ul> |
|
* <li> char charValue(), |
|
* <li> int compareTo(java.lang.Character, java.lang.Character), etc. |
|
* </ul> |
|
* <li> UCharacter does not include Character APIs that are deprecated, nor |
|
* does it include the Java-specific character information, such as |
|
* boolean isJavaIdentifierPart(char ch). |
|
* <li> Character maps characters 'A' - 'Z' and 'a' - 'z' to the numeric |
|
* values '10' - '35'. UCharacter also does this in digit and |
|
* getNumericValue, to adhere to the java semantics of these |
|
* methods. New methods unicodeDigit, and |
|
* getUnicodeNumericValue do not treat the above code points |
|
* as having numeric values. This is a semantic change from ICU4J 1.3.1. |
|
* </ul> |
|
* <p> |
|
* Further detailed differences can be determined from the program
|
* <a href="http://source.icu-project.org/repos/icu/icu4j/trunk/src/com/ibm/icu/dev/test/lang/UCharacterCompare.java"> |
|
* com.ibm.icu.dev.test.lang.UCharacterCompare</a> |
|
* </p> |
|
* <p> |
|
* In addition to Java compatibility functions, which calculate derived properties, |
|
* this API provides low-level access to the Unicode Character Database. |
|
* </p> |
|
* <p> |
|
* Unicode assigns each code point (not just each assigned character) values for
|
* many properties. |
|
* Most of them are simple boolean flags, or constants from a small enumerated list. |
|
* For some properties, values are strings or other relatively more complex types. |
|
* </p> |
|
* <p> |
|
* For more information see |
|
* "About the Unicode Character Database" (http://www.unicode.org/ucd/) |
|
* and the ICU User Guide chapter on Properties (http://www.icu-project.org/userguide/properties.html). |
|
* </p> |
|
* <p> |
|
* There are also functions that provide easy migration from C/POSIX functions |
|
* like isblank(). Their use is generally discouraged because the C/POSIX |
|
* standards do not define their semantics beyond the ASCII range, which means |
|
* that different implementations exhibit very different behavior. |
|
* Instead, Unicode properties should be used directly. |
|
* </p> |
|
* <p> |
|
* There are also only a few, broad C/POSIX character classes, and they tend |
|
* to be used for conflicting purposes. For example, the "isalpha()" class |
|
* is sometimes used to determine word boundaries, while a more sophisticated |
|
* approach would at least distinguish initial letters from continuation |
|
* characters (the latter including combining marks). |
|
* (In ICU, BreakIterator is the most sophisticated API for word boundaries.) |
|
* Another example: There is no "istitle()" class for titlecase characters. |
|
* </p> |
|
* <p> |
|
* ICU 3.4 and later provides API access for all twelve C/POSIX character classes. |
|
* ICU implements them according to the Standard Recommendations in |
|
* Annex C: Compatibility Properties of UTS #18 Unicode Regular Expressions |
|
* (http://www.unicode.org/reports/tr18/#Compatibility_Properties). |
|
* </p> |
|
* <p>
* API access for C/POSIX character classes is as follows:
* </p>
* <ul>
* <li> alpha: {@code isUAlphabetic(c)} or {@code hasBinaryProperty(c, UProperty.ALPHABETIC)}
* <li> lower: {@code isULowercase(c)} or {@code hasBinaryProperty(c, UProperty.LOWERCASE)}
* <li> upper: {@code isUUppercase(c)} or {@code hasBinaryProperty(c, UProperty.UPPERCASE)}
* <li> punct: {@code ((1<<getType(c)) & ((1<<DASH_PUNCTUATION)|(1<<START_PUNCTUATION)|(1<<END_PUNCTUATION)|(1<<CONNECTOR_PUNCTUATION)|(1<<OTHER_PUNCTUATION)|(1<<INITIAL_PUNCTUATION)|(1<<FINAL_PUNCTUATION)))!=0}
* <li> digit: {@code isDigit(c)} or {@code getType(c)==DECIMAL_DIGIT_NUMBER}
* <li> xdigit: {@code hasBinaryProperty(c, UProperty.POSIX_XDIGIT)}
* <li> alnum: {@code hasBinaryProperty(c, UProperty.POSIX_ALNUM)}
* <li> space: {@code isUWhiteSpace(c)} or {@code hasBinaryProperty(c, UProperty.WHITE_SPACE)}
* <li> blank: {@code hasBinaryProperty(c, UProperty.POSIX_BLANK)}
* <li> cntrl: {@code getType(c)==CONTROL}
* <li> graph: {@code hasBinaryProperty(c, UProperty.POSIX_GRAPH)}
* <li> print: {@code hasBinaryProperty(c, UProperty.POSIX_PRINT)}
* </ul>
|
* <p> |
|
* The C/POSIX character classes are also available in UnicodeSet patterns, |
|
* using patterns like [:graph:] or \p{graph}. |
|
* </p> |
|
* <p>
* Note: There are several ICU (and Java) whitespace functions.
* Comparison:
* </p>
* <ul>
* <li> isUWhiteSpace=UCHAR_WHITE_SPACE: Unicode White_Space property;
*      most of general categories "Z" (separators) + most whitespace ISO controls
*      (including no-break spaces, but excluding IS1..IS4 and ZWSP)
* <li> isWhitespace: Java isWhitespace; Z + whitespace ISO controls but excluding no-break spaces
* <li> isSpaceChar: just Z (including no-break spaces)
* </ul>
|
* <p> |
|
* This class is not subclassable |
|
* </p> |
|
* @author Syn Wee Quek |
|
* @stable ICU 2.1 |
|
* @see com.ibm.icu.lang.UCharacterEnums |
|
*/ |
|
|
|
public final class UCharacter |
|
{ |
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
public static interface NumericType |
|
{ |
|
|
|
|
|
*/ |
|
public static final int DECIMAL = 1; |
|
} |
|
|
|
// public data members ----------------------------------------------- |
|
|
|
|
|
|
|
|
|
*/ |
|
public static final int MIN_VALUE = UTF16.CODEPOINT_MIN_VALUE; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
public static final int MAX_VALUE = UTF16.CODEPOINT_MAX_VALUE; |
|
|
|
|
|
|
|
|
|
*/ |
|
public static final int SUPPLEMENTARY_MIN_VALUE = |
|
UTF16.SUPPLEMENTARY_MIN_VALUE; |
|
|
|
// public methods ---------------------------------------------------- |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
public static int digit(int ch, int radix) |
|
{ |
|
|
|
int props = getProperty(ch); |
|
int value; |
|
if (getNumericType(props) == NumericType.DECIMAL) { |
|
value = UCharacterProperty.getUnsignedValue(props); |
|
} else { |
|
value = getEuropeanDigit(ch); |
|
} |
|
return (0 <= value && value < radix) ? value : -1; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
public static int getDirection(int ch) |
|
{ |
|
return gBdp.getClass(ch); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
public static int getCodePoint(char lead, char trail) |
|
{ |
|
if (UTF16.isLeadSurrogate(lead) && UTF16.isTrailSurrogate(trail)) { |
|
return UCharacterProperty.getRawSupplementary(lead, trail); |
|
} |
|
throw new IllegalArgumentException("Illegal surrogate characters"); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
public static VersionInfo getAge(int ch) |
|
{ |
|
if (ch < MIN_VALUE || ch > MAX_VALUE) { |
|
throw new IllegalArgumentException("Codepoint out of bounds"); |
|
} |
|
return PROPERTY_.getAge(ch); |
|
} |
|
|
|
// private variables ------------------------------------------------- |
|
|
|
|
|
|
|
*/ |
|
private static final UCharacterProperty PROPERTY_; |
|
|
|
|
|
*/ |
|
private static final char[] PROPERTY_TRIE_INDEX_; |
|
private static final char[] PROPERTY_TRIE_DATA_; |
|
private static final int PROPERTY_INITIAL_VALUE_; |
|
|
|
private static final UBiDiProps gBdp; |
|
|
|
|
|
static |
|
{ |
|
try |
|
{ |
|
PROPERTY_ = UCharacterProperty.getInstance(); |
|
PROPERTY_TRIE_INDEX_ = PROPERTY_.m_trieIndex_; |
|
PROPERTY_TRIE_DATA_ = PROPERTY_.m_trieData_; |
|
PROPERTY_INITIAL_VALUE_ = PROPERTY_.m_trieInitialValue_; |
|
} |
|
catch (Exception e) |
|
{ |
|
throw new MissingResourceException(e.getMessage(),"",""); |
|
} |
|
|
|
UBiDiProps bdp; |
|
try { |
|
bdp=UBiDiProps.getSingleton(); |
|
} catch(IOException e) { |
|
bdp=UBiDiProps.getDummy(); |
|
} |
|
gBdp=bdp; |
|
} |
|
|
|
|
|
|
|
*/ |
|
private static final int NUMERIC_TYPE_SHIFT_ = 5; |
|
|
|
|
|
*/ |
|
private static final int NUMERIC_TYPE_MASK_ = 0x7 << NUMERIC_TYPE_SHIFT_; |
|
|
|
// private methods --------------------------------------------------- |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
private static int getEuropeanDigit(int ch) { |
|
if ((ch > 0x7a && ch < 0xff21) |
|
|| ch < 0x41 || (ch > 0x5a && ch < 0x61) |
|
|| ch > 0xff5a || (ch > 0xff3a && ch < 0xff41)) { |
|
return -1; |
|
} |
|
if (ch <= 0x7a) { |
|
|
|
return ch + 10 - ((ch <= 0x5a) ? 0x41 : 0x61); |
|
} |
|
|
|
if (ch <= 0xff3a) { |
|
return ch + 10 - 0xff21; |
|
} |
|
|
|
return ch + 10 - 0xff41; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
private static int getNumericType(int props) |
|
{ |
|
return (props & NUMERIC_TYPE_MASK_) >> NUMERIC_TYPE_SHIFT_; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
private static final int getProperty(int ch) |
|
{ |
|
if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE |
|
|| (ch > UTF16.LEAD_SURROGATE_MAX_VALUE |
|
&& ch < UTF16.SUPPLEMENTARY_MIN_VALUE)) { |
|
|
|
try { |
|
return PROPERTY_TRIE_DATA_[ |
|
(PROPERTY_TRIE_INDEX_[ch >> 5] << 2) |
|
+ (ch & 0x1f)]; |
|
} catch (ArrayIndexOutOfBoundsException e) { |
|
return PROPERTY_INITIAL_VALUE_; |
|
} |
|
} |
|
if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) { |
|
|
|
return PROPERTY_TRIE_DATA_[ |
|
(PROPERTY_TRIE_INDEX_[(0x2800 >> 5) + (ch >> 5)] << 2) |
|
+ (ch & 0x1f)]; |
|
} |
|
|
|
if (ch <= UTF16.CODEPOINT_MAX_VALUE) { |
|
// supplementary code point 10000..10FFFF |
|
// look at the construction of supplementary characters |
|
|
|
return PROPERTY_.m_trie_.getSurrogateValue( |
|
UTF16.getLeadSurrogate(ch), |
|
(char)(ch & 0x3ff)); |
|
} |
|
// return m_dataOffset_ if there is an error, in this case we return |
|
// the default value: m_initialValue_ |
|
// we cannot assume that m_initialValue_ is at offset 0 |
|
|
|
return PROPERTY_INITIAL_VALUE_; |
|
} |
|
|
|
} |