|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
/** |
|
******************************************************************************* |
|
* Copyright (C) 1996-2014, International Business Machines Corporation and |
|
* others. All Rights Reserved. |
|
******************************************************************************* |
|
*/ |
|
|
|
package jdk.internal.icu.text; |
|
|
|
import jdk.internal.icu.impl.UCharacterProperty; |
|
|
|
/** |
|
* <p>Standalone utility class providing UTF16 character conversions and |
|
* indexing conversions. |
|
* <p>Code that uses strings alone rarely needs modification. |
|
* By design, UTF-16 does not allow overlap, so searching for strings is a safe |
|
* operation. Similarly, concatenation is always safe. Substringing is safe if |
|
* the start and end are both on UTF-32 boundaries. In normal code, the values |
|
* for start and end are on those boundaries, since they arose from operations |
|
* like searching. If not, the nearest UTF-32 boundaries can be determined |
|
* using <code>bounds()</code>. |
|
* <strong>Examples:</strong> |
|
* <p>The following examples illustrate use of some of these methods. |
|
* <pre>{@code |
|
* // iteration forwards: Original |
|
* for (int i = 0; i < s.length(); ++i) { |
|
* char ch = s.charAt(i); |
|
* doSomethingWith(ch); |
|
* } |
|
* |
|
* // iteration forwards: Changes for UTF-32 |
|
* int ch; |
|
* for (int i = 0; i < s.length(); i += UTF16.getCharCount(ch)) { |
|
* ch = UTF16.charAt(s, i); |
|
* doSomethingWith(ch); |
|
* } |
|
* |
|
* // iteration backwards: Original |
|
* for (int i = s.length() - 1; i >= 0; --i) { |
|
* char ch = s.charAt(i); |
|
* doSomethingWith(ch); |
|
* } |
|
* |
|
* // iteration backwards: Changes for UTF-32 |
|
* int ch; |
|
* for (int i = s.length() - 1; i >= 0; i -= UTF16.getCharCount(ch)) { |
|
* ch = UTF16.charAt(s, i); |
|
* doSomethingWith(ch); |
|
* } |
|
* }</pre> |
|
* <strong>Notes:</strong> |
|
* <ul> |
|
* <li> |
|
* <strong>Naming:</strong> For clarity, High and Low surrogates are called |
|
* <code>Lead</code> and <code>Trail</code> in the API, which gives a better |
|
* sense of their ordering in a string. <code>offset16</code> and |
|
* <code>offset32</code> are used to distinguish offsets to UTF-16 |
|
* boundaries vs offsets to UTF-32 boundaries. <code>int char32</code> is |
|
* used to contain UTF-32 characters, as opposed to <code>char16</code>, |
|
* which is a UTF-16 code unit. |
|
* </li> |
|
* <li> |
|
* <strong>Roundtripping Offsets:</strong> You can always roundtrip from a |
|
* UTF-32 offset to a UTF-16 offset and back. Because of the difference in |
|
* structure, you can roundtrip from a UTF-16 offset to a UTF-32 offset and |
|
* back if and only if <code>bounds(string, offset16) != TRAIL</code>. |
|
* </li> |
|
* <li> |
|
* <strong>Exceptions:</strong> The error checking will throw an exception |
|
* if indices are out of bounds. Other than that, all methods will |
|
* behave reasonably, even if unmatched surrogates or out-of-bounds UTF-32 |
|
* values are present. <code>UCharacter.isLegal()</code> can be used to check |
|
* for validity if desired. |
|
* </li> |
|
* <li> |
|
* <strong>Unmatched Surrogates:</strong> If the string contains unmatched |
|
* surrogates, then these are counted as one UTF-32 value. This matches |
|
* their iteration behavior, which is vital. It also matches common display |
|
* practice as missing glyphs (see the Unicode Standard Section 5.4, 5.5). |
|
* </li> |
|
* <li> |
|
* <strong>Optimization:</strong> The method implementations may need |
|
* optimization if the compiler doesn't fold static final methods. Since |
|
* surrogate pairs will form an exceedingly small percentage of all the text |
|
* in the world, the singleton case should always be optimized for. |
|
* </li> |
|
* </ul> |
|
* @author Mark Davis, with help from Markus Scherer |
|
* @stable ICU 2.1 |
|
*/ |
|
|
|
public final class UTF16 {

    // public variables ---------------------------------------------------

    /**
     * Lowest code point value accepted by the validating methods of this
     * class ({@code 0}).
     */
    public static final int CODEPOINT_MIN_VALUE = 0;

    /**
     * Highest code point value accepted by the validating methods of this
     * class ({@code 0x10ffff}).
     */
    public static final int CODEPOINT_MAX_VALUE = 0x10ffff;

    /**
     * First code point that needs two UTF-16 code units ({@code 0x10000}).
     */
    public static final int SUPPLEMENTARY_MIN_VALUE = 0x10000;

    /**
     * Smallest lead (high) surrogate code unit ({@code 0xD800}).
     */
    public static final int LEAD_SURROGATE_MIN_VALUE = 0xD800;

    /**
     * Smallest trail (low) surrogate code unit ({@code 0xDC00}).
     */
    public static final int TRAIL_SURROGATE_MIN_VALUE = 0xDC00;

    /**
     * Largest lead (high) surrogate code unit ({@code 0xDBFF}).
     */
    public static final int LEAD_SURROGATE_MAX_VALUE = 0xDBFF;

    /**
     * Largest trail (low) surrogate code unit ({@code 0xDFFF}).
     */
    public static final int TRAIL_SURROGATE_MAX_VALUE = 0xDFFF;

    /**
     * Smallest surrogate code unit of either kind; identical to
     * {@link #LEAD_SURROGATE_MIN_VALUE}.
     */
    public static final int SURROGATE_MIN_VALUE = LEAD_SURROGATE_MIN_VALUE;

    /**
     * Mask that keeps the bits shared by every lead surrogate.
     */
    private static final int LEAD_SURROGATE_BITMASK = 0xFFFFFC00;

    /**
     * Mask that keeps the bits shared by every trail surrogate.
     */
    private static final int TRAIL_SURROGATE_BITMASK = 0xFFFFFC00;

    /**
     * Mask that keeps the bits shared by every surrogate (lead or trail).
     */
    private static final int SURROGATE_BITMASK = 0xFFFFF800;

    /**
     * Value of a code unit masked with {@link #LEAD_SURROGATE_BITMASK} when
     * it is a lead surrogate.
     */
    private static final int LEAD_SURROGATE_BITS = 0xD800;

    /**
     * Value of a code unit masked with {@link #TRAIL_SURROGATE_BITMASK} when
     * it is a trail surrogate.
     */
    private static final int TRAIL_SURROGATE_BITS = 0xDC00;

    /**
     * Value of a code unit masked with {@link #SURROGATE_BITMASK} when it is
     * a surrogate of either kind.
     */
    private static final int SURROGATE_BITS = 0xD800;

    // constructor --------------------------------------------------------

    // /CLOVER:OFF
    /**
     * Not instantiable: this class only offers static members.
     */
    private UTF16() {
    }

    // /CLOVER:ON
    // public method ------------------------------------------------------

    /**
     * Returns the code point found at a UTF-16 offset of a string.
     * <p>If the code unit at {@code offset16} is a lead surrogate that is
     * directly followed by a trail surrogate, or a trail surrogate that is
     * directly preceded by a lead surrogate, the supplementary code point
     * formed by that pair is returned. In every other case (including an
     * unmatched surrogate) the code unit itself is returned.
     *
     * @param source the string to read from
     * @param offset16 index of a UTF-16 code unit in {@code source}
     * @return the code point at, or straddling, {@code offset16}
     * @throws IndexOutOfBoundsException if {@code offset16} is not a valid
     *         index of {@code source}
     */
    public static int charAt(String source, int offset16) {
        char single = source.charAt(offset16);
        // Fast path: everything below the surrogate range is a code point.
        if (single < LEAD_SURROGATE_MIN_VALUE) {
            return single;
        }
        return pairedCharAt(source, offset16, single);
    }

    /**
     * Returns the code point found at a UTF-16 offset of a character
     * sequence.
     * <p>If the code unit at {@code offset16} is a lead surrogate that is
     * directly followed by a trail surrogate, or a trail surrogate that is
     * directly preceded by a lead surrogate, the supplementary code point
     * formed by that pair is returned. In every other case (including an
     * unmatched surrogate) the code unit itself is returned.
     *
     * @param source the character sequence to read from
     * @param offset16 index of a UTF-16 code unit in {@code source}
     * @return the code point at, or straddling, {@code offset16}
     * @throws IndexOutOfBoundsException if {@code offset16} is not a valid
     *         index of {@code source}
     */
    public static int charAt(CharSequence source, int offset16) {
        char single = source.charAt(offset16);
        // Fast path: everything below the surrogate range is a code point.
        if (single < LEAD_SURROGATE_MIN_VALUE) {
            return single;
        }
        return pairedCharAt(source, offset16, single);
    }

    /**
     * Slow path shared by both {@code charAt} overloads, entered only when
     * {@code single} is at or above {@link #LEAD_SURROGATE_MIN_VALUE}.
     *
     * @param source the text being read
     * @param offset16 index at which {@code single} was read
     * @param single the code unit found at {@code offset16}
     * @return the supplementary code point if {@code single} has a matching
     *         neighbour, otherwise {@code single}
     */
    private static int pairedCharAt(CharSequence source, int offset16, char single) {
        if (single > TRAIL_SURROGATE_MAX_VALUE) {
            // Above the surrogate range: an ordinary BMP code unit.
            return single;
        }

        // Convert the UTF-16 surrogate pair if necessary.
        // For simplicity in usage, and because the frequency of pairs is
        // low, look both directions.
        if (single <= LEAD_SURROGATE_MAX_VALUE) {
            // Lead surrogate: its partner, if any, is the next code unit.
            int next = offset16 + 1;
            if (source.length() != next) {
                char trail = source.charAt(next);
                if (isTrailSurrogate(trail)) {
                    return Character.toCodePoint(single, trail);
                }
            }
        } else {
            // Trail surrogate: its partner, if any, is the previous code unit.
            int previous = offset16 - 1;
            if (previous >= 0) {
                char lead = source.charAt(previous);
                if (isLeadSurrogate(lead)) {
                    return Character.toCodePoint(lead, single);
                }
            }
        }
        return single; // unmatched surrogate
    }

    /**
     * Returns the code point found at a UTF-16 offset inside a slice of a
     * char array.
     * <p>Only the code units in {@code [start, limit)} are considered: a lead
     * surrogate in the last slot of the slice, or a trail surrogate in the
     * first slot, is returned as is even when the array holds a matching
     * partner just outside the slice.
     *
     * @param source the array holding the text
     * @param start index of the first code unit of the slice
     * @param limit index just past the last code unit of the slice
     * @param offset16 offset of the wanted code unit, relative to
     *        {@code start}
     * @return the code point at, or straddling, the requested position
     * @throws ArrayIndexOutOfBoundsException if {@code start + offset16}
     *         lies outside {@code [start, limit)}; the exception carries the
     *         absolute index
     */
    public static int charAt(char[] source, int start, int limit, int offset16) {
        offset16 += start;
        if (offset16 < start || offset16 >= limit) {
            throw new ArrayIndexOutOfBoundsException(offset16);
        }

        char single = source[offset16];
        if (!isSurrogate(single)) {
            return single;
        }

        // Convert the UTF-16 surrogate pair if necessary.
        // For simplicity in usage, and because the frequency of pairs is
        // low, look both directions.
        if (single <= LEAD_SURROGATE_MAX_VALUE) {
            int next = offset16 + 1;
            if (next < limit) {
                char trail = source[next];
                if (isTrailSurrogate(trail)) {
                    return Character.toCodePoint(single, trail);
                }
            }
        } else if (offset16 != start) {
            char lead = source[offset16 - 1];
            if (isLeadSurrogate(lead)) {
                return Character.toCodePoint(lead, single);
            }
        }
        return single; // unmatched surrogate
    }

    /**
     * Returns how many UTF-16 code units are used to represent a code point.
     * The argument is not validated.
     *
     * @param char32 the code point to measure
     * @return 1 if {@code char32} is below {@link #SUPPLEMENTARY_MIN_VALUE},
     *         2 otherwise
     */
    public static int getCharCount(int char32) {
        return (char32 < SUPPLEMENTARY_MIN_VALUE) ? 1 : 2;
    }

    /**
     * Tells whether a code unit is a surrogate, lead or trail.
     *
     * @param char16 the code unit to test
     * @return {@code true} if {@code char16} lies in
     *         {@code 0xD800..0xDFFF}
     */
    public static boolean isSurrogate(char char16) {
        return (char16 & SURROGATE_BITMASK) == SURROGATE_BITS;
    }

    /**
     * Tells whether a code unit is a trail (low) surrogate.
     *
     * @param char16 the code unit to test
     * @return {@code true} if {@code char16} lies in
     *         {@code 0xDC00..0xDFFF}
     */
    public static boolean isTrailSurrogate(char char16) {
        return (char16 & TRAIL_SURROGATE_BITMASK) == TRAIL_SURROGATE_BITS;
    }

    /**
     * Tells whether a code unit is a lead (high) surrogate.
     *
     * @param char16 the code unit to test
     * @return {@code true} if {@code char16} lies in
     *         {@code 0xD800..0xDBFF}
     */
    public static boolean isLeadSurrogate(char char16) {
        return (char16 & LEAD_SURROGATE_BITMASK) == LEAD_SURROGATE_BITS;
    }

    /**
     * Returns the lead surrogate of a supplementary code point.
     * The upper bound of {@code char32} is not checked.
     *
     * @param char32 the code point to split
     * @return the lead surrogate if {@code char32} is at least
     *         {@link #SUPPLEMENTARY_MIN_VALUE}; 0 otherwise
     */
    public static char getLeadSurrogate(int char32) {
        if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
            return (char) (LEAD_SURROGATE_OFFSET_
                    + (char32 >> LEAD_SURROGATE_SHIFT_));
        }
        return 0;
    }

    /**
     * Returns the trail surrogate of a supplementary code point, or the
     * single code unit of a smaller value.
     * The upper bound of {@code char32} is not checked.
     *
     * @param char32 the code point to split
     * @return the trail surrogate if {@code char32} is at least
     *         {@link #SUPPLEMENTARY_MIN_VALUE}; otherwise {@code char32}
     *         cast to {@code char}
     */
    public static char getTrailSurrogate(int char32) {
        if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
            return (char) (TRAIL_SURROGATE_MIN_VALUE
                    + (char32 & TRAIL_SURROGATE_MASK_));
        }
        return (char) char32;
    }

    /**
     * Converts a code point to its UTF-16 string form.
     *
     * @param char32 the code point to convert
     * @return a string of one code unit, or of a lead/trail surrogate pair
     *         when {@code char32} is supplementary
     * @throws IllegalArgumentException if {@code char32} is outside
     *         {@link #CODEPOINT_MIN_VALUE}..{@link #CODEPOINT_MAX_VALUE}
     */
    public static String valueOf(int char32) {
        if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
            throw new IllegalArgumentException("Illegal codepoint");
        }
        return toString(char32);
    }

    /**
     * Appends the UTF-16 form of a code point to a buffer.
     *
     * @param target the buffer to append to
     * @param char32 the code point to append
     * @return {@code target}, to allow chaining
     * @throws IllegalArgumentException if {@code char32} is outside
     *         {@link #CODEPOINT_MIN_VALUE}..{@link #CODEPOINT_MAX_VALUE}
     */
    public static StringBuffer append(StringBuffer target, int char32) {
        // Check for irregular values
        if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
            throw new IllegalArgumentException(
                    "Illegal codepoint: " + Integer.toHexString(char32));
        }

        // Write the UTF-16 values
        if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
            target.append(getLeadSurrogate(char32));
            target.append(getTrailSurrogate(char32));
        } else {
            target.append((char) char32);
        }
        return target;
    }

    /**
     * Moves a UTF-16 offset forwards or backwards by a number of code points
     * inside the slice {@code [start, limit)} of a char array.
     * <p>A lead surrogate directly followed by a trail surrogate (both inside
     * the slice) is stepped over as one code point; any other code unit,
     * including an unmatched surrogate, counts as one code point by itself.
     *
     * @param source the array holding the text
     * @param start index of the first code unit of the slice
     * @param limit index just past the last code unit of the slice
     * @param offset16 starting offset, relative to {@code start}
     * @param shift32 number of code points to move; positive moves towards
     *        {@code limit}, zero or negative moves towards {@code start}
     * @return the new offset, relative to {@code start}
     * @throws StringIndexOutOfBoundsException if {@code start} is negative
     *         or greater than {@code limit}, if {@code limit} exceeds the
     *         array length, if {@code offset16} is negative or lies past
     *         {@code limit}, or if the slice does not hold enough code
     *         points to complete the move
     */
    public static int moveCodePointOffset(char[] source, int start, int limit,
                                          int offset16, int shift32) {
        int size = source.length;
        int count;
        int result = offset16 + start; // absolute index into source
        if (start < 0 || limit < start) {
            throw new StringIndexOutOfBoundsException(start);
        }
        if (limit > size) {
            throw new StringIndexOutOfBoundsException(limit);
        }
        if (offset16 < 0 || result > limit) {
            throw new StringIndexOutOfBoundsException(offset16);
        }
        if (shift32 > 0) {
            // Cheap early rejection: every code point takes at least one
            // code unit, so the move cannot fit if it would leave the array.
            if (shift32 + result > size) {
                throw new StringIndexOutOfBoundsException(result);
            }
            count = shift32;
            while (result < limit && count > 0) {
                char ch = source[result];
                if (isLeadSurrogate(ch) && (result + 1 < limit)
                        && isTrailSurrogate(source[result + 1])) {
                    result++; // skip the trail half of the pair as well
                }
                count--;
                result++;
            }
        } else {
            if (result + shift32 < start) {
                throw new StringIndexOutOfBoundsException(result);
            }
            for (count = -shift32; count > 0; count--) {
                result--;
                if (result < start) {
                    break; // ran out of text; reported below via count != 0
                }
                char ch = source[result];
                if (isTrailSurrogate(ch) && result > start
                        && isLeadSurrogate(source[result - 1])) {
                    result--; // land on the lead half of the pair
                }
            }
        }
        // A non-zero remainder means the slice ended before the move did.
        if (count != 0) {
            throw new StringIndexOutOfBoundsException(shift32);
        }
        result -= start;
        return result;
    }

    // private data members -------------------------------------------------

    /**
     * Number of bits a code point is shifted right by to obtain the variable
     * part of its lead surrogate.
     */
    private static final int LEAD_SURROGATE_SHIFT_ = 10;

    /**
     * Mask selecting the low 10 bits of a code point, which form the variable
     * part of its trail surrogate.
     */
    private static final int TRAIL_SURROGATE_MASK_ = 0x3FF;

    /**
     * Constant added to {@code char32 >> LEAD_SURROGATE_SHIFT_} to obtain the
     * lead surrogate of a supplementary code point.
     */
    private static final int LEAD_SURROGATE_OFFSET_ =
            LEAD_SURROGATE_MIN_VALUE
            - (SUPPLEMENTARY_MIN_VALUE >> LEAD_SURROGATE_SHIFT_);

    // private methods ------------------------------------------------------

    /**
     * Converts a code point to its UTF-16 string form without validating it;
     * callers are expected to have range-checked {@code ch}.
     *
     * @param ch the code point to convert
     * @return a one-unit string if {@code ch} is below
     *         {@link #SUPPLEMENTARY_MIN_VALUE}, otherwise the two-unit
     *         lead/trail surrogate string
     */
    private static String toString(int ch) {
        if (ch < SUPPLEMENTARY_MIN_VALUE) {
            return String.valueOf((char) ch);
        }
        return new String(new char[] {getLeadSurrogate(ch), getTrailSurrogate(ch)});
    }
}