/* |
|
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
|
* |
|
* This code is free software; you can redistribute it and/or modify it |
|
* under the terms of the GNU General Public License version 2 only, as |
|
* published by the Free Software Foundation. Oracle designates this |
|
* particular file as subject to the "Classpath" exception as provided |
|
* by Oracle in the LICENSE file that accompanied this code. |
|
* |
|
* This code is distributed in the hope that it will be useful, but WITHOUT |
|
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
|
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
|
* version 2 for more details (a copy is included in the LICENSE file that |
|
* accompanied this code). |
|
* |
|
* You should have received a copy of the GNU General Public License version |
|
* 2 along with this work; if not, write to the Free Software Foundation, |
|
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
|
* |
|
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA |
|
* or visit www.oracle.com if you need additional information or have any |
|
* questions. |
|
* |
|
*/ |
|
/* |
|
******************************************************************************* |
|
* |
|
* Copyright (C) 1999-2003, International Business Machines |
|
* Corporation and others. All Rights Reserved. |
|
* |
|
******************************************************************************* |
|
*/ |
|
package sun.font; |
|
/** |
|
* <code>ScriptRun</code> is used to find runs of characters in |
|
* the same script, as defined in the <code>Script</code> class. |
|
* It implements a simple iterator over an array of characters. |
|
* The iterator will assign <code>COMMON</code> and <code>INHERITED</code> |
|
* characters to the same script as the preceding characters. If the |
|
* COMMON and INHERITED characters are first, they will be assigned to |
|
* the same script as the following characters. |
|
* |
|
* The iterator will try to match paired punctuation. If it sees an |
|
* opening punctuation character, it will remember the script that |
|
* was assigned to that character, and assign the same script to the |
|
* matching closing punctuation. |
|
* |
|
* No attempt is made to combine related scripts into a single run. In |
|
* particular, Hiragana, Katakana, and Han characters will appear in seperate |
|
* runs. |
|
|
|
* Here is an example of how to iterate over script runs: |
|
* <pre> |
|
* void printScriptRuns(char[] text) |
|
* { |
|
* ScriptRun scriptRun = new ScriptRun(text, 0, text.length); |
|
* |
|
* while (scriptRun.next()) { |
|
* int start = scriptRun.getScriptStart(); |
|
* int limit = scriptRun.getScriptLimit(); |
|
* int script = scriptRun.getScriptCode(); |
|
* |
|
* System.out.println("Script \"" + Script.getName(script) + "\" from " + |
|
* start + " to " + limit + "."); |
|
* } |
|
* } |
|
* </pre> |
|
* |
|
*/ |
|
public final class ScriptRun |
|
{ |
|
private char[] text; // fixed once set by constructor |
|
private int textStart; |
|
private int textLimit; |
|
private int scriptStart; // change during iteration |
|
private int scriptLimit; |
|
private int scriptCode; |
|
private int stack[]; // stack used to handle paired punctuation if encountered |
|
private int parenSP; |
|
public ScriptRun() { |
|
// must call init later or we die. |
|
} |
|
/** |
|
* Construct a <code>ScriptRun</code> object which iterates over a subrange |
|
* of the given characetrs. |
|
* |
|
* @param chars the array of characters over which to iterate. |
|
* @param start the index of the first character over which to iterate |
|
* @param count the number of characters over which to iterate |
|
*/ |
|
public ScriptRun(char[] chars, int start, int count) |
|
{ |
|
init(chars, start, count); |
|
} |
|
public void init(char[] chars, int start, int count) |
|
{ |
|
if (chars == null || start < 0 || count < 0 || count > chars.length - start) { |
|
throw new IllegalArgumentException(); |
|
} |
|
text = chars; |
|
textStart = start; |
|
textLimit = start + count; |
|
scriptStart = textStart; |
|
scriptLimit = textStart; |
|
scriptCode = Script.INVALID_CODE; |
|
parenSP = 0; |
|
} |
|
/** |
|
* Get the starting index of the current script run. |
|
* |
|
* @return the index of the first character in the current script run. |
|
*/ |
|
public final int getScriptStart() { |
|
return scriptStart; |
|
} |
|
/** |
|
* Get the index of the first character after the current script run. |
|
* |
|
* @return the index of the first character after the current script run. |
|
*/ |
|
public final int getScriptLimit() { |
|
return scriptLimit; |
|
} |
|
/** |
|
* Get the script code for the script of the current script run. |
|
* |
|
* @return the script code for the script of the current script run. |
|
* @see #Script |
|
*/ |
|
public final int getScriptCode() { |
|
return scriptCode; |
|
} |
|
/** |
|
* Find the next script run. Returns <code>false</code> if there |
|
* isn't another run, returns <code>true</code> if there is. |
|
* |
|
* @return <code>false</code> if there isn't another run, <code>true</code> if there is. |
|
*/ |
|
public final boolean next() { |
|
int startSP = parenSP; // used to find the first new open character |
|
// if we've fallen off the end of the text, we're done |
|
if (scriptLimit >= textLimit) { |
|
return false; |
|
} |
|
scriptCode = Script.COMMON; |
|
scriptStart = scriptLimit; |
|
int ch; |
|
while ((ch = nextCodePoint()) != DONE) { |
|
int sc = ScriptRunData.getScript(ch); |
|
int pairIndex = sc == Script.COMMON ? getPairIndex(ch) : -1; |
|
// Paired character handling: |
|
// |
|
// if it's an open character, push it onto the stack. |
|
// if it's a close character, find the matching open on the |
|
// stack, and use that script code. Any non-matching open |
|
// characters above it on the stack will be popped. |
|
if (pairIndex >= 0) { |
|
if ((pairIndex & 1) == 0) { |
|
if (stack == null) { |
|
stack = new int[32]; |
|
} else if (parenSP == stack.length) { |
|
int[] newstack = new int[stack.length + 32]; |
|
System.arraycopy(stack, 0, newstack, 0, stack.length); |
|
stack = newstack; |
|
} |
|
stack[parenSP++] = pairIndex; |
|
stack[parenSP++] = scriptCode; |
|
} else if (parenSP > 0) { |
|
int pi = pairIndex & ~1; |
|
while ((parenSP -= 2) >= 0 && stack[parenSP] != pi); |
|
if (parenSP >= 0) { |
|
sc = stack[parenSP+1]; |
|
} else { |
|
parenSP = 0; |
|
} |
|
if (parenSP < startSP) { |
|
startSP = parenSP; |
|
} |
|
} |
|
} |
|
if (sameScript(scriptCode, sc)) { |
|
if (scriptCode <= Script.INHERITED && sc > Script.INHERITED) { |
|
scriptCode = sc; |
|
// now that we have a final script code, fix any open |
|
// characters we pushed before we knew the script code. |
|
while (startSP < parenSP) { |
|
stack[startSP+1] = scriptCode; |
|
startSP += 2; |
|
} |
|
} |
|
// if this character is a close paired character, |
|
// pop it from the stack |
|
if (pairIndex > 0 && (pairIndex & 1) != 0 && parenSP > 0) { |
|
parenSP -= 2; |
|
} |
|
} else { |
|
// We've just seen the first character of |
|
// the next run. Back over it so we'll see |
|
// it again the next time. |
|
pushback(ch); |
|
// we're outta here |
|
break; |
|
} |
|
} |
|
return true; |
|
} |
|
static final int SURROGATE_START = 0x10000; |
|
static final int LEAD_START = 0xd800; |
|
static final int LEAD_LIMIT = 0xdc00; |
|
static final int TAIL_START = 0xdc00; |
|
static final int TAIL_LIMIT = 0xe000; |
|
static final int LEAD_SURROGATE_SHIFT = 10; |
|
static final int SURROGATE_OFFSET = SURROGATE_START - (LEAD_START << LEAD_SURROGATE_SHIFT) - TAIL_START; |
|
static final int DONE = -1; |
|
private final int nextCodePoint() { |
|
if (scriptLimit >= textLimit) { |
|
return DONE; |
|
} |
|
int ch = text[scriptLimit++]; |
|
if (ch >= LEAD_START && ch < LEAD_LIMIT && scriptLimit < textLimit) { |
|
int nch = text[scriptLimit]; |
|
if (nch >= TAIL_START && nch < TAIL_LIMIT) { |
|
++scriptLimit; |
|
ch = (ch << LEAD_SURROGATE_SHIFT) + nch + SURROGATE_OFFSET; |
|
} |
|
} |
|
return ch; |
|
} |
|
private final void pushback(int ch) { |
|
if (ch >= 0) { |
|
if (ch >= 0x10000) { |
|
scriptLimit -= 2; |
|
} else { |
|
scriptLimit -= 1; |
|
} |
|
} |
|
} |
|
/** |
|
* Compare two script codes to see if they are in the same script. If one script is |
|
* a strong script, and the other is INHERITED or COMMON, it will compare equal. |
|
* |
|
* @param scriptOne one of the script codes. |
|
* @param scriptTwo the other script code. |
|
* @return <code>true</code> if the two scripts are the same. |
|
* @see com.ibm.icu.lang.Script |
|
*/ |
|
private static boolean sameScript(int scriptOne, int scriptTwo) { |
|
return scriptOne == scriptTwo || scriptOne <= Script.INHERITED || scriptTwo <= Script.INHERITED; |
|
} |
|
/** |
|
* Find the highest bit that's set in a word. Uses a binary search through |
|
* the bits. |
|
* |
|
* @param n the word in which to find the highest bit that's set. |
|
* @return the bit number (counting from the low order bit) of the highest bit. |
|
*/ |
|
private static final byte highBit(int n) |
|
{ |
|
if (n <= 0) { |
|
return -32; |
|
} |
|
byte bit = 0; |
|
if (n >= 1 << 16) { |
|
n >>= 16; |
|
bit += 16; |
|
} |
|
if (n >= 1 << 8) { |
|
n >>= 8; |
|
bit += 8; |
|
} |
|
if (n >= 1 << 4) { |
|
n >>= 4; |
|
bit += 4; |
|
} |
|
if (n >= 1 << 2) { |
|
n >>= 2; |
|
bit += 2; |
|
} |
|
if (n >= 1 << 1) { |
|
n >>= 1; |
|
bit += 1; |
|
} |
|
return bit; |
|
} |
|
/** |
|
* Search the pairedChars array for the given character. |
|
* |
|
* @param ch the character for which to search. |
|
* @return the index of the character in the table, or -1 if it's not there. |
|
*/ |
|
private static int getPairIndex(int ch) |
|
{ |
|
int probe = pairedCharPower; |
|
int index = 0; |
|
if (ch >= pairedChars[pairedCharExtra]) { |
|
index = pairedCharExtra; |
|
} |
|
while (probe > (1 << 0)) { |
|
probe >>= 1; |
|
if (ch >= pairedChars[index + probe]) { |
|
index += probe; |
|
} |
|
} |
|
if (pairedChars[index] != ch) { |
|
index = -1; |
|
} |
|
return index; |
|
} |
|
// all common |
|
private static int pairedChars[] = { |
|
0x0028, 0x0029, // ascii paired punctuation // common |
|
0x003c, 0x003e, // common |
|
0x005b, 0x005d, // common |
|
0x007b, 0x007d, // common |
|
0x00ab, 0x00bb, // guillemets // common |
|
0x2018, 0x2019, // general punctuation // common |
|
0x201c, 0x201d, // common |
|
0x2039, 0x203a, // common |
|
0x3008, 0x3009, // chinese paired punctuation // common |
|
0x300a, 0x300b, |
|
0x300c, 0x300d, |
|
0x300e, 0x300f, |
|
0x3010, 0x3011, |
|
0x3014, 0x3015, |
|
0x3016, 0x3017, |
|
0x3018, 0x3019, |
|
0x301a, 0x301b |
|
}; |
|
private static final int pairedCharPower = 1 << highBit(pairedChars.length); |
|
private static final int pairedCharExtra = pairedChars.length - pairedCharPower; |
|
} |