/* | 
|
 * Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved. | 
|
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. | 
|
 * | 
|
 * This code is free software; you can redistribute it and/or modify it | 
|
 * under the terms of the GNU General Public License version 2 only, as | 
|
 * published by the Free Software Foundation.  Oracle designates this | 
|
 * particular file as subject to the "Classpath" exception as provided | 
|
 * by Oracle in the LICENSE file that accompanied this code. | 
|
 * | 
|
 * This code is distributed in the hope that it will be useful, but WITHOUT | 
|
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | 
|
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License | 
|
 * version 2 for more details (a copy is included in the LICENSE file that | 
|
 * accompanied this code). | 
|
 * | 
|
 * You should have received a copy of the GNU General Public License version | 
|
 * 2 along with this work; if not, write to the Free Software Foundation, | 
|
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. | 
|
 * | 
|
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA | 
|
 * or visit www.oracle.com if you need additional information or have any | 
|
 * questions. | 
|
*/  | 
|
/*  | 
|
*******************************************************************************  | 
|
* (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *  | 
|
* *  | 
|
* The original version of this source code and documentation is copyrighted *  | 
|
* and owned by IBM, These materials are provided under terms of a License *  | 
|
* Agreement between IBM and Sun. This technology is protected by multiple *  | 
|
* US and International patents. This notice and attribution to IBM may not *  | 
|
* to removed. *  | 
|
*******************************************************************************  | 
|
*/  | 
|
package sun.text.normalizer;  | 
|
import java.io.DataInputStream;  | 
|
import java.io.InputStream;  | 
|
import java.io.IOException;  | 
|
/** | 
|
 * <p>A trie is a kind of compressed, serializable table of values | 
|
 * associated with Unicode code points (0..0x10ffff).</p> | 
|
 * <p>This class defines the basic structure of a trie and provides methods | 
|
 * to <b>retrieve the offsets to the actual data</b>.</p> | 
|
 * <p>Data will be the form of an array of basic types, char or int.</p> | 
|
 * <p>The actual data format will have to be specified by the user in the | 
|
 * inner static interface com.ibm.icu.impl.Trie.DataManipulate.</p> | 
|
 * <p>This trie implementation is optimized for getting offset while walking | 
|
 * forward through a UTF-16 string. | 
|
 * Therefore, the simplest and fastest access macros are the | 
|
 * fromLead() and fromOffsetTrail() methods. | 
|
 * The fromBMP() method are a little more complicated; they get offsets even | 
|
 * for lead surrogate codepoints, while the fromLead() method get special | 
|
 * "folded" offsets for lead surrogate code units if there is relevant data | 
|
 * associated with them. | 
|
 * From such a folded offsets, an offset needs to be extracted to supply | 
|
 * to the fromOffsetTrail() methods. | 
|
 * To handle such supplementary codepoints, some offset information are kept | 
|
 * in the data.</p> | 
|
 * <p>Methods in com.ibm.icu.impl.Trie.DataManipulate are called to retrieve | 
|
 * that offset from the folded value for the lead surrogate unit.</p> | 
|
 * <p>For examples of use, see com.ibm.icu.impl.CharTrie or | 
|
 * com.ibm.icu.impl.IntTrie.</p> | 
|
 * @author synwee | 
|
 * @see com.ibm.icu.impl.CharTrie | 
|
 * @see com.ibm.icu.impl.IntTrie | 
|
 * @since release 2.1, Jan 01 2002 | 
|
*/  | 
|
public abstract class Trie  | 
|
{ | 
|
// public class declaration ----------------------------------------  | 
|
    /** | 
|
    * Character data in com.ibm.impl.Trie have different user-specified format | 
|
    * for different purposes. | 
|
    * This interface specifies methods to be implemented in order for | 
|
    * com.ibm.impl.Trie, to surrogate offset information encapsulated within | 
|
    * the data. | 
|
*/  | 
|
public static interface DataManipulate  | 
|
    { | 
|
        /** | 
|
        * Called by com.ibm.icu.impl.Trie to extract from a lead surrogate's | 
|
        * data | 
|
        * the index array offset of the indexes for that lead surrogate. | 
|
        * @param value data value for a surrogate from the trie, including the | 
|
        *        folding offset | 
|
        * @return data offset or 0 if there is no data for the lead surrogate | 
|
*/  | 
|
public int getFoldingOffset(int value);  | 
|
}  | 
|
    // default implementation | 
|
    private static class DefaultGetFoldingOffset implements DataManipulate { | 
|
        public int getFoldingOffset(int value) { | 
|
return value;  | 
|
}  | 
|
}  | 
|
// protected constructor -------------------------------------------  | 
|
    /** | 
|
    * Trie constructor for CharTrie use. | 
|
    * @param inputStream ICU data file input stream which contains the | 
|
    *                        trie | 
|
    * @param dataManipulate object containing the information to parse the | 
|
    *                       trie data | 
|
    * @throws IOException thrown when input stream does not have the | 
|
    *                        right header. | 
|
*/  | 
|
protected Trie(InputStream inputStream,  | 
|
DataManipulate dataManipulate) throws IOException  | 
|
    { | 
|
DataInputStream input = new DataInputStream(inputStream);  | 
|
        // Magic number to authenticate the data. | 
|
int signature = input.readInt();  | 
|
m_options_ = input.readInt();  | 
|
if (!checkHeader(signature)) {  | 
|
throw new IllegalArgumentException("ICU data file error: Trie header authentication failed, please check if you have the most updated ICU data file");  | 
|
}  | 
|
        if(dataManipulate != null) { | 
|
m_dataManipulate_ = dataManipulate;  | 
|
        } else { | 
|
m_dataManipulate_ = new DefaultGetFoldingOffset();  | 
|
}  | 
|
m_isLatin1Linear_ = (m_options_ &  | 
|
HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0;  | 
|
m_dataOffset_ = input.readInt();  | 
|
m_dataLength_ = input.readInt();  | 
|
unserialize(inputStream);  | 
|
}  | 
|
    /** | 
|
    * Trie constructor | 
|
    * @param index array to be used for index | 
|
    * @param options used by the trie | 
|
    * @param dataManipulate object containing the information to parse the | 
|
    *                       trie data | 
|
*/  | 
|
protected Trie(char index[], int options, DataManipulate dataManipulate)  | 
|
    { | 
|
m_options_ = options;  | 
|
if(dataManipulate != null) {  | 
|
m_dataManipulate_ = dataManipulate;  | 
|
        } else { | 
|
m_dataManipulate_ = new DefaultGetFoldingOffset();  | 
|
}  | 
|
m_isLatin1Linear_ = (m_options_ &  | 
|
HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0;  | 
|
m_index_ = index;  | 
|
m_dataOffset_ = m_index_.length;  | 
|
}  | 
|
// protected data members ------------------------------------------  | 
|
    /** | 
|
    * Lead surrogate code points' index displacement in the index array. | 
|
    * 0x10000-0xd800=0x2800 | 
|
    * 0x2800 >> INDEX_STAGE_1_SHIFT_ | 
|
*/  | 
|
protected static final int LEAD_INDEX_OFFSET_ = 0x2800 >> 5;  | 
|
    /** | 
|
    * Shift size for shifting right the input index. 1..9 | 
|
*/  | 
|
protected static final int INDEX_STAGE_1_SHIFT_ = 5;  | 
|
    /** | 
|
    * Shift size for shifting left the index array values. | 
|
    * Increases possible data size with 16-bit index values at the cost | 
|
    * of compactability. | 
|
    * This requires blocks of stage 2 data to be aligned by | 
|
    * DATA_GRANULARITY. | 
|
    * 0..INDEX_STAGE_1_SHIFT | 
|
*/  | 
|
protected static final int INDEX_STAGE_2_SHIFT_ = 2;  | 
|
    /** | 
|
     * Number of data values in a stage 2 (data array) block. | 
|
*/  | 
|
protected static final int DATA_BLOCK_LENGTH=1<<INDEX_STAGE_1_SHIFT_;  | 
|
    /** | 
|
    * Mask for getting the lower bits from the input index. | 
|
    * DATA_BLOCK_LENGTH - 1. | 
|
*/  | 
|
protected static final int INDEX_STAGE_3_MASK_ = DATA_BLOCK_LENGTH - 1;  | 
|
    /** Number of bits of a trail surrogate that are used in index table lookups. */ | 
|
protected static final int SURROGATE_BLOCK_BITS=10-INDEX_STAGE_1_SHIFT_;  | 
|
    /** | 
|
     * Number of index (stage 1) entries per lead surrogate. | 
|
     * Same as number of index entries for 1024 trail surrogates, | 
|
     * ==0x400>>INDEX_STAGE_1_SHIFT_ | 
|
*/  | 
|
protected static final int SURROGATE_BLOCK_COUNT=(1<<SURROGATE_BLOCK_BITS);  | 
|
    /** Length of the BMP portion of the index (stage 1) array. */ | 
|
protected static final int BMP_INDEX_LENGTH=0x10000>>INDEX_STAGE_1_SHIFT_;  | 
|
    /** | 
|
    * Surrogate mask to use when shifting offset to retrieve supplementary | 
|
    * values | 
|
*/  | 
|
protected static final int SURROGATE_MASK_ = 0x3FF;  | 
|
    /** | 
|
    * Index or UTF16 characters | 
|
*/  | 
|
protected char m_index_[];  | 
|
    /** | 
|
    * Internal TrieValue which handles the parsing of the data value. | 
|
    * This class is to be implemented by the user | 
|
*/  | 
|
protected DataManipulate m_dataManipulate_;  | 
|
    /** | 
|
    * Start index of the data portion of the trie. CharTrie combines | 
|
    * index and data into a char array, so this is used to indicate the | 
|
    * initial offset to the data portion. | 
|
    * Note this index always points to the initial value. | 
|
*/  | 
|
protected int m_dataOffset_;  | 
|
    /** | 
|
    * Length of the data array | 
|
*/  | 
|
protected int m_dataLength_;  | 
|
// protected methods -----------------------------------------------  | 
|
    /** | 
|
    * Gets the offset to the data which the surrogate pair points to. | 
|
    * @param lead lead surrogate | 
|
    * @param trail trailing surrogate | 
|
    * @return offset to data | 
|
*/  | 
|
protected abstract int getSurrogateOffset(char lead, char trail);  | 
|
    /** | 
|
    * Gets the value at the argument index | 
|
    * @param index value at index will be retrieved | 
|
    * @return 32 bit value | 
|
*/  | 
|
protected abstract int getValue(int index);  | 
|
    /** | 
|
    * Gets the default initial value | 
|
    * @return 32 bit value | 
|
*/  | 
|
protected abstract int getInitialValue();  | 
|
    /** | 
|
    * Gets the offset to the data which the index ch after variable offset | 
|
    * points to. | 
|
    * Note for locating a non-supplementary character data offset, calling | 
|
    * <p> | 
|
    * getRawOffset(0, ch); | 
|
    * </p> | 
|
    * will do. Otherwise if it is a supplementary character formed by | 
|
    * surrogates lead and trail. Then we would have to call getRawOffset() | 
|
    * with getFoldingIndexOffset(). See getSurrogateOffset(). | 
|
    * @param offset index offset which ch is to start from | 
|
    * @param ch index to be used after offset | 
|
    * @return offset to the data | 
|
*/  | 
|
protected final int getRawOffset(int offset, char ch)  | 
|
    { | 
|
return (m_index_[offset + (ch >> INDEX_STAGE_1_SHIFT_)]  | 
|
<< INDEX_STAGE_2_SHIFT_)  | 
|
+ (ch & INDEX_STAGE_3_MASK_);  | 
|
}  | 
|
    /** | 
|
    * Gets the offset to data which the BMP character points to | 
|
    * Treats a lead surrogate as a normal code point. | 
|
    * @param ch BMP character | 
|
    * @return offset to data | 
|
*/  | 
|
protected final int getBMPOffset(char ch)  | 
|
    { | 
|
return (ch >= UTF16.LEAD_SURROGATE_MIN_VALUE  | 
|
&& ch <= UTF16.LEAD_SURROGATE_MAX_VALUE)  | 
|
? getRawOffset(LEAD_INDEX_OFFSET_, ch)  | 
|
: getRawOffset(0, ch);  | 
|
// using a getRawOffset(ch) makes no diff  | 
|
}  | 
|
    /** | 
|
    * Gets the offset to the data which this lead surrogate character points | 
|
    * to. | 
|
    * Data at the returned offset may contain folding offset information for | 
|
    * the next trailing surrogate character. | 
|
    * @param ch lead surrogate character | 
|
    * @return offset to data | 
|
*/  | 
|
protected final int getLeadOffset(char ch)  | 
|
    { | 
|
return getRawOffset(0, ch);  | 
|
}  | 
|
    /** | 
|
    * Internal trie getter from a code point. | 
|
    * Could be faster(?) but longer with | 
|
    *   if((c32)<=0xd7ff) { (result)=_TRIE_GET_RAW(trie, data, 0, c32); } | 
|
    * Gets the offset to data which the codepoint points to | 
|
    * @param ch codepoint | 
|
    * @return offset to data | 
|
*/  | 
|
protected final int getCodePointOffset(int ch)  | 
|
    { | 
|
        // if ((ch >> 16) == 0) slower | 
|
if (ch < 0) {  | 
|
return -1;  | 
|
} else if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE) {  | 
|
            // fastpath for the part of the BMP below surrogates (D800) where getRawOffset() works | 
|
return getRawOffset(0, (char)ch);  | 
|
} else if (ch < UTF16.SUPPLEMENTARY_MIN_VALUE) {  | 
|
            // BMP codepoint | 
|
return getBMPOffset((char)ch);  | 
|
} else if (ch <= UCharacter.MAX_VALUE) {  | 
|
// look at the construction of supplementary characters  | 
|
            // trail forms the ends of it. | 
|
return getSurrogateOffset(UTF16.getLeadSurrogate(ch),  | 
|
(char)(ch & SURROGATE_MASK_));  | 
|
        } else { | 
|
            // return -1 // if there is an error, in this case we return | 
|
return -1;  | 
|
}  | 
|
}  | 
|
    /** | 
|
    * <p>Parses the inputstream and creates the trie index with it.</p> | 
|
    * <p>This is overwritten by the child classes. | 
|
    * @param inputStream input stream containing the trie information | 
|
    * @exception IOException thrown when data reading fails. | 
|
*/  | 
|
protected void unserialize(InputStream inputStream) throws IOException  | 
|
    { | 
|
        //indexLength is a multiple of 1024 >> INDEX_STAGE_2_SHIFT_ | 
|
m_index_ = new char[m_dataOffset_];  | 
|
DataInputStream input = new DataInputStream(inputStream);  | 
|
for (int i = 0; i < m_dataOffset_; i ++) {  | 
|
m_index_[i] = input.readChar();  | 
|
}  | 
|
}  | 
|
    /** | 
|
    * Determines if this is a 32 bit trie | 
|
    * @return true if options specifies this is a 32 bit trie | 
|
*/  | 
|
protected final boolean isIntTrie()  | 
|
    { | 
|
return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) != 0;  | 
|
}  | 
|
    /** | 
|
    * Determines if this is a 16 bit trie | 
|
    * @return true if this is a 16 bit trie | 
|
*/  | 
|
protected final boolean isCharTrie()  | 
|
    { | 
|
return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) == 0;  | 
|
}  | 
|
// private data members --------------------------------------------  | 
|
    /** | 
|
    * Latin 1 option mask | 
|
*/  | 
|
protected static final int HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_ = 0x200;  | 
|
    /** | 
|
    * Constant number to authenticate the byte block | 
|
*/  | 
|
protected static final int HEADER_SIGNATURE_ = 0x54726965;  | 
|
    /** | 
|
    * Header option formatting | 
|
*/  | 
|
private static final int HEADER_OPTIONS_SHIFT_MASK_ = 0xF;  | 
|
protected static final int HEADER_OPTIONS_INDEX_SHIFT_ = 4;  | 
|
protected static final int HEADER_OPTIONS_DATA_IS_32_BIT_ = 0x100;  | 
|
    /** | 
|
    * Flag indicator for Latin quick access data block | 
|
*/  | 
|
private boolean m_isLatin1Linear_;  | 
|
    /** | 
|
    * <p>Trie options field.</p> | 
|
    * <p>options bit field:<br> | 
|
    * 9  1 = Latin-1 data is stored linearly at data + DATA_BLOCK_LENGTH<br> | 
|
    * 8  0 = 16-bit data, 1=32-bit data<br> | 
|
    * 7..4  INDEX_STAGE_1_SHIFT   // 0..INDEX_STAGE_2_SHIFT<br> | 
|
    * 3..0  INDEX_STAGE_2_SHIFT   // 1..9<br> | 
|
*/  | 
|
private int m_options_;  | 
|
// private methods ---------------------------------------------------  | 
|
    /** | 
|
    * Authenticates raw data header. | 
|
    * Checking the header information, signature and options. | 
|
    * @param signature This contains the options and type of a Trie | 
|
    * @return true if the header is authenticated valid | 
|
*/  | 
|
private final boolean checkHeader(int signature)  | 
|
    { | 
|
// check the signature  | 
|
// Trie in big-endian US-ASCII (0x54726965).  | 
|
        // Magic number to authenticate the data. | 
|
if (signature != HEADER_SIGNATURE_) {  | 
|
return false;  | 
|
}  | 
|
if ((m_options_ & HEADER_OPTIONS_SHIFT_MASK_) !=  | 
|
INDEX_STAGE_1_SHIFT_ ||  | 
|
((m_options_ >> HEADER_OPTIONS_INDEX_SHIFT_) &  | 
|
HEADER_OPTIONS_SHIFT_MASK_)  | 
|
!= INDEX_STAGE_2_SHIFT_) {  | 
|
return false;  | 
|
}  | 
|
return true;  | 
|
}  | 
|
}  |