/* | 
|
 * Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved. | 
|
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. | 
|
 * | 
|
 * This code is free software; you can redistribute it and/or modify it | 
|
 * under the terms of the GNU General Public License version 2 only, as | 
|
 * published by the Free Software Foundation.  Oracle designates this | 
|
 * particular file as subject to the "Classpath" exception as provided | 
|
 * by Oracle in the LICENSE file that accompanied this code. | 
|
 * | 
|
 * This code is distributed in the hope that it will be useful, but WITHOUT | 
|
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | 
|
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License | 
|
 * version 2 for more details (a copy is included in the LICENSE file that | 
|
 * accompanied this code). | 
|
 * | 
|
 * You should have received a copy of the GNU General Public License version | 
|
 * 2 along with this work; if not, write to the Free Software Foundation, | 
|
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. | 
|
 * | 
|
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA | 
|
 * or visit www.oracle.com if you need additional information or have any | 
|
 * questions. | 
|
*/  | 
|
/*  | 
|
*******************************************************************************  | 
|
* (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *  | 
|
* *  | 
|
* The original version of this source code and documentation is copyrighted *  | 
|
* and owned by IBM, These materials are provided under terms of a License *  | 
|
* Agreement between IBM and Sun. This technology is protected by multiple *  | 
|
* US and International patents. This notice and attribution to IBM may not *  | 
|
* to removed. *  | 
|
*******************************************************************************  | 
|
*/  | 
|
package sun.text.normalizer;  | 
|
import java.io.BufferedInputStream;  | 
|
import java.io.InputStream;  | 
|
import java.io.IOException;  | 
|
import java.util.MissingResourceException;  | 
|
/**  | 
|
* <p>Internal class used for Unicode character property database.</p>  | 
|
* <p>This classes store binary data read from uprops.icu.  | 
|
* It does not have the capability to parse the data into more high-level  | 
|
* information. It only returns bytes of information when required.</p>  | 
|
* <p>Due to the form most commonly used for retrieval, array of char is used  | 
|
* to store the binary data.</p>  | 
|
* <p>UCharacterPropertyDB also contains information on accessing indexes to  | 
|
* significant points in the binary data.</p>  | 
|
* <p>Responsibility for molding the binary data into more meaning form lies on  | 
|
* <a href=UCharacter.html>UCharacter</a>.</p>  | 
|
* @author Syn Wee Quek  | 
|
* @since release 2.1, february 1st 2002  | 
|
*/  | 
|
public final class UCharacterProperty  | 
|
{ | 
|
// public data members -----------------------------------------------  | 
|
    /** | 
|
    * Trie data | 
|
*/  | 
|
public CharTrie m_trie_;  | 
|
    /** | 
|
     * Optimization | 
|
     * CharTrie index array | 
|
*/  | 
|
public char[] m_trieIndex_;  | 
|
    /** | 
|
     * Optimization | 
|
     * CharTrie data array | 
|
*/  | 
|
public char[] m_trieData_;  | 
|
    /** | 
|
     * Optimization | 
|
     * CharTrie data offset | 
|
*/  | 
|
public int m_trieInitialValue_;  | 
|
    /** | 
|
    * Unicode version | 
|
*/  | 
|
public VersionInfo m_unicodeVersion_;  | 
|
// uprops.h enum UPropertySource --------------------------------------- ***  | 
|
    /** From uchar.c/uprops.icu properties vectors trie */ | 
|
public static final int SRC_PROPSVEC=2;  | 
|
    /** One more than the highest UPropertySource (SRC_) constant. */ | 
|
public static final int SRC_COUNT=9;  | 
|
// public methods ----------------------------------------------------  | 
|
    /** | 
|
     * Java friends implementation | 
|
*/  | 
|
public void setIndexData(CharTrie.FriendAgent friendagent)  | 
|
    { | 
|
m_trieIndex_ = friendagent.getPrivateIndex();  | 
|
m_trieData_ = friendagent.getPrivateData();  | 
|
m_trieInitialValue_ = friendagent.getPrivateInitialValue();  | 
|
}  | 
|
    /** | 
|
    * Gets the property value at the index. | 
|
    * This is optimized. | 
|
    * Note this is alittle different from CharTrie the index m_trieData_ | 
|
    * is never negative. | 
|
    * @param ch code point whose property value is to be retrieved | 
|
    * @return property value of code point | 
|
*/  | 
|
public final int getProperty(int ch)  | 
|
    { | 
|
if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE  | 
|
|| (ch > UTF16.LEAD_SURROGATE_MAX_VALUE  | 
|
&& ch < UTF16.SUPPLEMENTARY_MIN_VALUE)) {  | 
|
// BMP codepoint 0000..D7FF or DC00..FFFF  | 
|
            // optimized | 
|
            try { // using try for ch < 0 is faster than using an if statement | 
|
return m_trieData_[  | 
|
(m_trieIndex_[ch >> Trie.INDEX_STAGE_1_SHIFT_]  | 
|
<< Trie.INDEX_STAGE_2_SHIFT_)  | 
|
+ (ch & Trie.INDEX_STAGE_3_MASK_)];  | 
|
} catch (ArrayIndexOutOfBoundsException e) {  | 
|
return m_trieInitialValue_;  | 
|
}  | 
|
}  | 
|
if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) {  | 
|
            // lead surrogate D800..DBFF | 
|
return m_trieData_[  | 
|
(m_trieIndex_[Trie.LEAD_INDEX_OFFSET_  | 
|
+ (ch >> Trie.INDEX_STAGE_1_SHIFT_)]  | 
|
<< Trie.INDEX_STAGE_2_SHIFT_)  | 
|
+ (ch & Trie.INDEX_STAGE_3_MASK_)];  | 
|
}  | 
|
if (ch <= UTF16.CODEPOINT_MAX_VALUE) {  | 
|
// supplementary code point 10000..10FFFF  | 
|
// look at the construction of supplementary characters  | 
|
            // trail forms the ends of it. | 
|
return m_trie_.getSurrogateValue(  | 
|
UTF16.getLeadSurrogate(ch),  | 
|
(char)(ch & Trie.SURROGATE_MASK_));  | 
|
}  | 
|
// ch is out of bounds  | 
|
// return m_dataOffset_ if there is an error, in this case we return  | 
|
// the default value: m_initialValue_  | 
|
// we cannot assume that m_initialValue_ is at offset 0  | 
|
        // this is for optimization. | 
|
return m_trieInitialValue_;  | 
|
// this all is an inlined form of return m_trie_.getCodePointValue(ch);  | 
|
}  | 
|
    /** | 
|
    * Getting the unsigned numeric value of a character embedded in the property | 
|
    * argument | 
|
    * @param prop the character | 
|
    * @return unsigned numberic value | 
|
*/  | 
|
public static int getUnsignedValue(int prop)  | 
|
    { | 
|
return (prop >> VALUE_SHIFT_) & UNSIGNED_VALUE_MASK_AFTER_SHIFT_;  | 
|
}  | 
|
    /** | 
|
     * Gets the unicode additional properties. | 
|
     * C version getUnicodeProperties. | 
|
     * @param codepoint codepoint whose additional properties is to be | 
|
     *                  retrieved | 
|
     * @param column | 
|
     * @return unicode properties | 
|
*/  | 
|
       public int getAdditional(int codepoint, int column) { | 
|
if (column == -1) {  | 
|
return getProperty(codepoint);  | 
|
}  | 
|
if (column < 0 || column >= m_additionalColumnsCount_) {  | 
|
return 0;  | 
|
}  | 
|
return m_additionalVectors_[  | 
|
m_additionalTrie_.getCodePointValue(codepoint) + column];  | 
|
}  | 
|
       /** | 
|
     * <p>Get the "age" of the code point.</p> | 
|
     * <p>The "age" is the Unicode version when the code point was first | 
|
     * designated (as a non-character or for Private Use) or assigned a | 
|
     * character.</p> | 
|
     * <p>This can be useful to avoid emitting code points to receiving | 
|
     * processes that do not accept newer characters.</p> | 
|
     * <p>The data is from the UCD file DerivedAge.txt.</p> | 
|
     * <p>This API does not check the validity of the codepoint.</p> | 
|
     * @param codepoint The code point. | 
|
     * @return the Unicode version number | 
|
*/  | 
|
public VersionInfo getAge(int codepoint)  | 
|
    { | 
|
int version = getAdditional(codepoint, 0) >> AGE_SHIFT_;  | 
|
return VersionInfo.getInstance(  | 
|
(version >> FIRST_NIBBLE_SHIFT_) & LAST_NIBBLE_MASK_,  | 
|
version & LAST_NIBBLE_MASK_, 0, 0);  | 
|
}  | 
|
    /** | 
|
    * Forms a supplementary code point from the argument character<br> | 
|
    * Note this is for internal use hence no checks for the validity of the | 
|
    * surrogate characters are done | 
|
    * @param lead lead surrogate character | 
|
    * @param trail trailing surrogate character | 
|
    * @return code point of the supplementary character | 
|
*/  | 
|
public static int getRawSupplementary(char lead, char trail)  | 
|
    { | 
|
return (lead << LEAD_SURROGATE_SHIFT_) + trail + SURROGATE_OFFSET_;  | 
|
}  | 
|
    /** | 
|
    * Loads the property data and initialize the UCharacterProperty instance. | 
|
    * @throws MissingResourceException when data is missing or data has been corrupted | 
|
*/  | 
|
public static UCharacterProperty getInstance()  | 
|
    { | 
|
        if(INSTANCE_ == null) { | 
|
            try { | 
|
INSTANCE_ = new UCharacterProperty();  | 
|
}  | 
|
catch (Exception e) {  | 
|
throw new MissingResourceException(e.getMessage(),"","");  | 
|
}  | 
|
}  | 
|
return INSTANCE_;  | 
|
}  | 
|
    /** | 
|
     * Checks if the argument c is to be treated as a white space in ICU | 
|
     * rules. Usually ICU rule white spaces are ignored unless quoted. | 
|
     * Equivalent to test for Pattern_White_Space Unicode property. | 
|
     * Stable set of characters, won't change. | 
|
     * See UAX #31 Identifier and Pattern Syntax: http://www.unicode.org/reports/tr31/ | 
|
     * @param c codepoint to check | 
|
     * @return true if c is a ICU white space | 
|
*/  | 
|
public static boolean isRuleWhiteSpace(int c)  | 
|
    { | 
|
        /* "white space" in the sense of ICU rule parsers | 
|
           This is a FIXED LIST that is NOT DEPENDENT ON UNICODE PROPERTIES. | 
|
           See UAX #31 Identifier and Pattern Syntax: http://www.unicode.org/reports/tr31/ | 
|
           U+0009..U+000D, U+0020, U+0085, U+200E..U+200F, and U+2028..U+2029 | 
|
           Equivalent to test for Pattern_White_Space Unicode property. | 
|
*/  | 
|
return (c >= 0x0009 && c <= 0x2029 &&  | 
|
(c <= 0x000D || c == 0x0020 || c == 0x0085 ||  | 
|
c == 0x200E || c == 0x200F || c >= 0x2028));  | 
|
}  | 
|
// protected variables -----------------------------------------------  | 
|
    /** | 
|
     * Extra property trie | 
|
*/  | 
|
CharTrie m_additionalTrie_;  | 
|
    /** | 
|
     * Extra property vectors, 1st column for age and second for binary | 
|
     * properties. | 
|
*/  | 
|
int m_additionalVectors_[];  | 
|
    /** | 
|
     * Number of additional columns | 
|
*/  | 
|
int m_additionalColumnsCount_;  | 
|
    /** | 
|
     * Maximum values for block, bits used as in vector word | 
|
     * 0 | 
|
*/  | 
|
int m_maxBlockScriptValue_;  | 
|
    /** | 
|
     * Maximum values for script, bits used as in vector word | 
|
     * 0 | 
|
*/  | 
|
int m_maxJTGValue_;  | 
|
// private variables -------------------------------------------------  | 
|
      /** | 
|
     * UnicodeData.txt property object | 
|
*/  | 
|
private static UCharacterProperty INSTANCE_ = null;  | 
|
    /** | 
|
    * Default name of the datafile | 
|
*/  | 
|
private static final String DATA_FILE_NAME_ = "/sun/text/resources/uprops.icu";  | 
|
    /** | 
|
    * Default buffer size of datafile | 
|
*/  | 
|
private static final int DATA_BUFFER_SIZE_ = 25000;  | 
|
    /** | 
|
    * Numeric value shift | 
|
*/  | 
|
private static final int VALUE_SHIFT_ = 8;  | 
|
    /** | 
|
    * Mask to be applied after shifting to obtain an unsigned numeric value | 
|
*/  | 
|
private static final int UNSIGNED_VALUE_MASK_AFTER_SHIFT_ = 0xFF;  | 
|
    /** | 
|
    * Shift value for lead surrogate to form a supplementary character. | 
|
*/  | 
|
private static final int LEAD_SURROGATE_SHIFT_ = 10;  | 
|
    /** | 
|
    * Offset to add to combined surrogate pair to avoid msking. | 
|
*/  | 
|
private static final int SURROGATE_OFFSET_ =  | 
|
UTF16.SUPPLEMENTARY_MIN_VALUE -  | 
|
(UTF16.SURROGATE_MIN_VALUE <<  | 
|
LEAD_SURROGATE_SHIFT_) -  | 
|
UTF16.TRAIL_SURROGATE_MIN_VALUE;  | 
|
// additional properties ----------------------------------------------  | 
|
    /** | 
|
     * First nibble shift | 
|
*/  | 
|
private static final int FIRST_NIBBLE_SHIFT_ = 0x4;  | 
|
    /** | 
|
     * Second nibble mask | 
|
*/  | 
|
private static final int LAST_NIBBLE_MASK_ = 0xF;  | 
|
    /** | 
|
     * Age value shift | 
|
*/  | 
|
private static final int AGE_SHIFT_ = 24;  | 
|
// private constructors --------------------------------------------------  | 
|
    /** | 
|
    * Constructor | 
|
    * @exception IOException thrown when data reading fails or data corrupted | 
|
*/  | 
|
private UCharacterProperty() throws IOException  | 
|
    { | 
|
        // jar access | 
|
InputStream is = ICUData.getRequiredStream(DATA_FILE_NAME_);  | 
|
BufferedInputStream b = new BufferedInputStream(is, DATA_BUFFER_SIZE_);  | 
|
UCharacterPropertyReader reader = new UCharacterPropertyReader(b);  | 
|
reader.read(this);  | 
|
b.close();  | 
|
m_trie_.putIndexData(this);  | 
|
}  | 
|
public void upropsvec_addPropertyStarts(UnicodeSet set) {  | 
|
        /* add the start code point of each same-value range of the properties vectors trie */ | 
|
if(m_additionalColumnsCount_>0) {  | 
|
            /* if m_additionalColumnsCount_==0 then the properties vectors trie may not be there at all */ | 
|
TrieIterator propsVectorsIter = new TrieIterator(m_additionalTrie_);  | 
|
RangeValueIterator.Element propsVectorsResult = new RangeValueIterator.Element();  | 
|
while(propsVectorsIter.next(propsVectorsResult)){  | 
|
set.add(propsVectorsResult.start);  | 
|
}  | 
|
}  | 
|
}  | 
|
}  |