/* |
|
* Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved. |
|
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
|
* |
|
* This code is free software; you can redistribute it and/or modify it |
|
* under the terms of the GNU General Public License version 2 only, as |
|
* published by the Free Software Foundation. Oracle designates this |
|
* particular file as subject to the "Classpath" exception as provided |
|
* by Oracle in the LICENSE file that accompanied this code. |
|
* |
|
* This code is distributed in the hope that it will be useful, but WITHOUT |
|
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
|
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
|
* version 2 for more details (a copy is included in the LICENSE file that |
|
* accompanied this code). |
|
* |
|
* You should have received a copy of the GNU General Public License version |
|
* 2 along with this work; if not, write to the Free Software Foundation, |
|
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
|
* |
|
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA |
|
* or visit www.oracle.com if you need additional information or have any |
|
* questions. |
|
*/ |
|
/* |
|
******************************************************************************* |
|
* (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved * |
|
* * |
|
* The original version of this source code and documentation is copyrighted * |
|
* and owned by IBM, These materials are provided under terms of a License * |
|
* Agreement between IBM and Sun. This technology is protected by multiple * |
|
* US and International patents. This notice and attribution to IBM may not * |
|
* to removed. * |
|
******************************************************************************* |
|
*/ |
|
package sun.text.normalizer; |
|
import java.io.BufferedInputStream; |
|
import java.io.InputStream; |
|
import java.io.IOException; |
|
import java.util.MissingResourceException; |
|
/** |
|
* <p>Internal class used for Unicode character property database.</p> |
|
* <p>This classes store binary data read from uprops.icu. |
|
* It does not have the capability to parse the data into more high-level |
|
* information. It only returns bytes of information when required.</p> |
|
* <p>Due to the form most commonly used for retrieval, array of char is used |
|
* to store the binary data.</p> |
|
* <p>UCharacterPropertyDB also contains information on accessing indexes to |
|
* significant points in the binary data.</p> |
|
* <p>Responsibility for molding the binary data into more meaning form lies on |
|
* <a href=UCharacter.html>UCharacter</a>.</p> |
|
* @author Syn Wee Quek |
|
* @since release 2.1, february 1st 2002 |
|
*/ |
|
public final class UCharacterProperty |
|
{ |
|
// public data members ----------------------------------------------- |
|
/** |
|
* Trie data |
|
*/ |
|
public CharTrie m_trie_; |
|
/** |
|
* Optimization |
|
* CharTrie index array |
|
*/ |
|
public char[] m_trieIndex_; |
|
/** |
|
* Optimization |
|
* CharTrie data array |
|
*/ |
|
public char[] m_trieData_; |
|
/** |
|
* Optimization |
|
* CharTrie data offset |
|
*/ |
|
public int m_trieInitialValue_; |
|
/** |
|
* Unicode version |
|
*/ |
|
public VersionInfo m_unicodeVersion_; |
|
// uprops.h enum UPropertySource --------------------------------------- *** |
|
/** From uchar.c/uprops.icu properties vectors trie */ |
|
public static final int SRC_PROPSVEC=2; |
|
/** One more than the highest UPropertySource (SRC_) constant. */ |
|
public static final int SRC_COUNT=9; |
|
// public methods ---------------------------------------------------- |
|
/** |
|
* Java friends implementation |
|
*/ |
|
public void setIndexData(CharTrie.FriendAgent friendagent) |
|
{ |
|
m_trieIndex_ = friendagent.getPrivateIndex(); |
|
m_trieData_ = friendagent.getPrivateData(); |
|
m_trieInitialValue_ = friendagent.getPrivateInitialValue(); |
|
} |
|
/** |
|
* Gets the property value at the index. |
|
* This is optimized. |
|
* Note this is alittle different from CharTrie the index m_trieData_ |
|
* is never negative. |
|
* @param ch code point whose property value is to be retrieved |
|
* @return property value of code point |
|
*/ |
|
public final int getProperty(int ch) |
|
{ |
|
if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE |
|
|| (ch > UTF16.LEAD_SURROGATE_MAX_VALUE |
|
&& ch < UTF16.SUPPLEMENTARY_MIN_VALUE)) { |
|
// BMP codepoint 0000..D7FF or DC00..FFFF |
|
// optimized |
|
try { // using try for ch < 0 is faster than using an if statement |
|
return m_trieData_[ |
|
(m_trieIndex_[ch >> Trie.INDEX_STAGE_1_SHIFT_] |
|
<< Trie.INDEX_STAGE_2_SHIFT_) |
|
+ (ch & Trie.INDEX_STAGE_3_MASK_)]; |
|
} catch (ArrayIndexOutOfBoundsException e) { |
|
return m_trieInitialValue_; |
|
} |
|
} |
|
if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) { |
|
// lead surrogate D800..DBFF |
|
return m_trieData_[ |
|
(m_trieIndex_[Trie.LEAD_INDEX_OFFSET_ |
|
+ (ch >> Trie.INDEX_STAGE_1_SHIFT_)] |
|
<< Trie.INDEX_STAGE_2_SHIFT_) |
|
+ (ch & Trie.INDEX_STAGE_3_MASK_)]; |
|
} |
|
if (ch <= UTF16.CODEPOINT_MAX_VALUE) { |
|
// supplementary code point 10000..10FFFF |
|
// look at the construction of supplementary characters |
|
// trail forms the ends of it. |
|
return m_trie_.getSurrogateValue( |
|
UTF16.getLeadSurrogate(ch), |
|
(char)(ch & Trie.SURROGATE_MASK_)); |
|
} |
|
// ch is out of bounds |
|
// return m_dataOffset_ if there is an error, in this case we return |
|
// the default value: m_initialValue_ |
|
// we cannot assume that m_initialValue_ is at offset 0 |
|
// this is for optimization. |
|
return m_trieInitialValue_; |
|
// this all is an inlined form of return m_trie_.getCodePointValue(ch); |
|
} |
|
/** |
|
* Getting the unsigned numeric value of a character embedded in the property |
|
* argument |
|
* @param prop the character |
|
* @return unsigned numberic value |
|
*/ |
|
public static int getUnsignedValue(int prop) |
|
{ |
|
return (prop >> VALUE_SHIFT_) & UNSIGNED_VALUE_MASK_AFTER_SHIFT_; |
|
} |
|
/** |
|
* Gets the unicode additional properties. |
|
* C version getUnicodeProperties. |
|
* @param codepoint codepoint whose additional properties is to be |
|
* retrieved |
|
* @param column |
|
* @return unicode properties |
|
*/ |
|
public int getAdditional(int codepoint, int column) { |
|
if (column == -1) { |
|
return getProperty(codepoint); |
|
} |
|
if (column < 0 || column >= m_additionalColumnsCount_) { |
|
return 0; |
|
} |
|
return m_additionalVectors_[ |
|
m_additionalTrie_.getCodePointValue(codepoint) + column]; |
|
} |
|
/** |
|
* <p>Get the "age" of the code point.</p> |
|
* <p>The "age" is the Unicode version when the code point was first |
|
* designated (as a non-character or for Private Use) or assigned a |
|
* character.</p> |
|
* <p>This can be useful to avoid emitting code points to receiving |
|
* processes that do not accept newer characters.</p> |
|
* <p>The data is from the UCD file DerivedAge.txt.</p> |
|
* <p>This API does not check the validity of the codepoint.</p> |
|
* @param codepoint The code point. |
|
* @return the Unicode version number |
|
*/ |
|
public VersionInfo getAge(int codepoint) |
|
{ |
|
int version = getAdditional(codepoint, 0) >> AGE_SHIFT_; |
|
return VersionInfo.getInstance( |
|
(version >> FIRST_NIBBLE_SHIFT_) & LAST_NIBBLE_MASK_, |
|
version & LAST_NIBBLE_MASK_, 0, 0); |
|
} |
|
/** |
|
* Forms a supplementary code point from the argument character<br> |
|
* Note this is for internal use hence no checks for the validity of the |
|
* surrogate characters are done |
|
* @param lead lead surrogate character |
|
* @param trail trailing surrogate character |
|
* @return code point of the supplementary character |
|
*/ |
|
public static int getRawSupplementary(char lead, char trail) |
|
{ |
|
return (lead << LEAD_SURROGATE_SHIFT_) + trail + SURROGATE_OFFSET_; |
|
} |
|
/** |
|
* Loads the property data and initialize the UCharacterProperty instance. |
|
* @throws MissingResourceException when data is missing or data has been corrupted |
|
*/ |
|
public static UCharacterProperty getInstance() |
|
{ |
|
if(INSTANCE_ == null) { |
|
try { |
|
INSTANCE_ = new UCharacterProperty(); |
|
} |
|
catch (Exception e) { |
|
throw new MissingResourceException(e.getMessage(),"",""); |
|
} |
|
} |
|
return INSTANCE_; |
|
} |
|
/** |
|
* Checks if the argument c is to be treated as a white space in ICU |
|
* rules. Usually ICU rule white spaces are ignored unless quoted. |
|
* Equivalent to test for Pattern_White_Space Unicode property. |
|
* Stable set of characters, won't change. |
|
* See UAX #31 Identifier and Pattern Syntax: http://www.unicode.org/reports/tr31/ |
|
* @param c codepoint to check |
|
* @return true if c is a ICU white space |
|
*/ |
|
public static boolean isRuleWhiteSpace(int c) |
|
{ |
|
/* "white space" in the sense of ICU rule parsers |
|
This is a FIXED LIST that is NOT DEPENDENT ON UNICODE PROPERTIES. |
|
See UAX #31 Identifier and Pattern Syntax: http://www.unicode.org/reports/tr31/ |
|
U+0009..U+000D, U+0020, U+0085, U+200E..U+200F, and U+2028..U+2029 |
|
Equivalent to test for Pattern_White_Space Unicode property. |
|
*/ |
|
return (c >= 0x0009 && c <= 0x2029 && |
|
(c <= 0x000D || c == 0x0020 || c == 0x0085 || |
|
c == 0x200E || c == 0x200F || c >= 0x2028)); |
|
} |
|
// protected variables ----------------------------------------------- |
|
/** |
|
* Extra property trie |
|
*/ |
|
CharTrie m_additionalTrie_; |
|
/** |
|
* Extra property vectors, 1st column for age and second for binary |
|
* properties. |
|
*/ |
|
int m_additionalVectors_[]; |
|
/** |
|
* Number of additional columns |
|
*/ |
|
int m_additionalColumnsCount_; |
|
/** |
|
* Maximum values for block, bits used as in vector word |
|
* 0 |
|
*/ |
|
int m_maxBlockScriptValue_; |
|
/** |
|
* Maximum values for script, bits used as in vector word |
|
* 0 |
|
*/ |
|
int m_maxJTGValue_; |
|
// private variables ------------------------------------------------- |
|
/** |
|
* UnicodeData.txt property object |
|
*/ |
|
private static UCharacterProperty INSTANCE_ = null; |
|
/** |
|
* Default name of the datafile |
|
*/ |
|
private static final String DATA_FILE_NAME_ = "/sun/text/resources/uprops.icu"; |
|
/** |
|
* Default buffer size of datafile |
|
*/ |
|
private static final int DATA_BUFFER_SIZE_ = 25000; |
|
/** |
|
* Numeric value shift |
|
*/ |
|
private static final int VALUE_SHIFT_ = 8; |
|
/** |
|
* Mask to be applied after shifting to obtain an unsigned numeric value |
|
*/ |
|
private static final int UNSIGNED_VALUE_MASK_AFTER_SHIFT_ = 0xFF; |
|
/** |
|
* Shift value for lead surrogate to form a supplementary character. |
|
*/ |
|
private static final int LEAD_SURROGATE_SHIFT_ = 10; |
|
/** |
|
* Offset to add to combined surrogate pair to avoid msking. |
|
*/ |
|
private static final int SURROGATE_OFFSET_ = |
|
UTF16.SUPPLEMENTARY_MIN_VALUE - |
|
(UTF16.SURROGATE_MIN_VALUE << |
|
LEAD_SURROGATE_SHIFT_) - |
|
UTF16.TRAIL_SURROGATE_MIN_VALUE; |
|
// additional properties ---------------------------------------------- |
|
/** |
|
* First nibble shift |
|
*/ |
|
private static final int FIRST_NIBBLE_SHIFT_ = 0x4; |
|
/** |
|
* Second nibble mask |
|
*/ |
|
private static final int LAST_NIBBLE_MASK_ = 0xF; |
|
/** |
|
* Age value shift |
|
*/ |
|
private static final int AGE_SHIFT_ = 24; |
|
// private constructors -------------------------------------------------- |
|
/** |
|
* Constructor |
|
* @exception IOException thrown when data reading fails or data corrupted |
|
*/ |
|
private UCharacterProperty() throws IOException |
|
{ |
|
// jar access |
|
InputStream is = ICUData.getRequiredStream(DATA_FILE_NAME_); |
|
BufferedInputStream b = new BufferedInputStream(is, DATA_BUFFER_SIZE_); |
|
UCharacterPropertyReader reader = new UCharacterPropertyReader(b); |
|
reader.read(this); |
|
b.close(); |
|
m_trie_.putIndexData(this); |
|
} |
|
public void upropsvec_addPropertyStarts(UnicodeSet set) { |
|
/* add the start code point of each same-value range of the properties vectors trie */ |
|
if(m_additionalColumnsCount_>0) { |
|
/* if m_additionalColumnsCount_==0 then the properties vectors trie may not be there at all */ |
|
TrieIterator propsVectorsIter = new TrieIterator(m_additionalTrie_); |
|
RangeValueIterator.Element propsVectorsResult = new RangeValueIterator.Element(); |
|
while(propsVectorsIter.next(propsVectorsResult)){ |
|
set.add(propsVectorsResult.start); |
|
} |
|
} |
|
} |
|
} |