|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
|
|
/* |
|
* (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved |
|
* (C) Copyright IBM Corp. 1996, 1997 - All Rights Reserved |
|
* |
|
* The original version of this source code and documentation is copyrighted |
|
* and owned by Taligent, Inc., a wholly-owned subsidiary of IBM. These |
|
* materials are provided under terms of a License Agreement between Taligent |
|
* and Sun. This technology is protected by multiple US and International |
|
* patents. This notice and attribution to Taligent may not be removed. |
|
* Taligent is a registered trademark of Taligent, Inc. |
|
* |
|
*/ |
|
|
|
package java.text; |
|
|
|
import java.util.ArrayList; |
|
|
|
/** |
|
* Utility class for normalizing and merging patterns for collation. |
|
* Patterns are strings of the form <entry>*, where <entry> has the |
|
* form: |
|
* <pattern> := <entry>* |
|
* <entry> := <separator><chars>{"/"<extension>} |
|
* <separator> := "=", ",", ";", "<", "&" |
|
* <chars>, and <extension> are both arbitrary strings. |
|
* unquoted whitespaces are ignored. |
|
* 'xxx' can be used to quote characters |
|
* One difference from Collator is that & is used to reset to a current |
|
* point. Or, in other words, it introduces a new sequence which is to |
|
* be added to the old. |
|
* That is: "a < b < c < d" is the same as "a < b & b < c & c < d" OR |
|
* "a < b < d & b < c" |
|
* XXX: make '' be a single quote. |
|
* @see PatternEntry |
|
* @author Mark Davis, Helena Shih |
|
*/ |
|
|
|
final class MergeCollation { |
|
|
|
|
|
|
|
|
|
/**
 * Creates a merge collation from a pattern.
 * <p>
 * The status bit table is cleared first, then the pattern is installed
 * through {@code setPattern}.
 *
 * @param pattern the rule text to install
 * @throws ParseException if {@code setPattern} rejects the pattern
 */
public MergeCollation(String pattern) throws ParseException {
    // Begin with an all-clear status table before any entry is recorded.
    int slot = statusArray.length;
    while (--slot >= 0) {
        statusArray[slot] = 0;
    }
    setPattern(pattern);
}
|
|
|
|
|
|
|
*/ |
|
public String getPattern() { |
|
return getPattern(true); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
public String getPattern(boolean withWhiteSpace) { |
|
StringBuffer result = new StringBuffer(); |
|
PatternEntry tmp = null; |
|
ArrayList<PatternEntry> extList = null; |
|
int i; |
|
for (i = 0; i < patterns.size(); ++i) { |
|
PatternEntry entry = patterns.get(i); |
|
if (entry.extension.length() != 0) { |
|
if (extList == null) |
|
extList = new ArrayList<>(); |
|
extList.add(entry); |
|
} else { |
|
if (extList != null) { |
|
PatternEntry last = findLastWithNoExtension(i-1); |
|
for (int j = extList.size() - 1; j >= 0 ; j--) { |
|
tmp = extList.get(j); |
|
tmp.addToBuffer(result, false, withWhiteSpace, last); |
|
} |
|
extList = null; |
|
} |
|
entry.addToBuffer(result, false, withWhiteSpace, null); |
|
} |
|
} |
|
if (extList != null) { |
|
PatternEntry last = findLastWithNoExtension(i-1); |
|
for (int j = extList.size() - 1; j >= 0 ; j--) { |
|
tmp = extList.get(j); |
|
tmp.addToBuffer(result, false, withWhiteSpace, last); |
|
} |
|
extList = null; |
|
} |
|
return result.toString(); |
|
} |
|
|
|
private final PatternEntry findLastWithNoExtension(int i) { |
|
for (--i;i >= 0; --i) { |
|
PatternEntry entry = patterns.get(i); |
|
if (entry.extension.length() == 0) { |
|
return entry; |
|
} |
|
} |
|
return null; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
public String emitPattern() { |
|
return emitPattern(true); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
public String emitPattern(boolean withWhiteSpace) { |
|
StringBuffer result = new StringBuffer(); |
|
for (int i = 0; i < patterns.size(); ++i) |
|
{ |
|
PatternEntry entry = patterns.get(i); |
|
if (entry != null) { |
|
entry.addToBuffer(result, true, withWhiteSpace, null); |
|
} |
|
} |
|
return result.toString(); |
|
} |
|
|
|
|
|
|
|
*/ |
|
public void setPattern(String pattern) throws ParseException |
|
{ |
|
patterns.clear(); |
|
addPattern(pattern); |
|
} |
|
|
|
|
|
|
|
|
|
*/ |
|
public void addPattern(String pattern) throws ParseException |
|
{ |
|
if (pattern == null) |
|
return; |
|
|
|
PatternEntry.Parser parser = new PatternEntry.Parser(pattern); |
|
|
|
PatternEntry entry = parser.next(); |
|
while (entry != null) { |
|
fixEntry(entry); |
|
entry = parser.next(); |
|
} |
|
} |
|
|
|
|
|
|
|
|
|
*/ |
|
public int getCount() { |
|
return patterns.size(); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
public PatternEntry getItemAt(int index) { |
|
return patterns.get(index); |
|
} |
|
|
|
//============================================================ |
|
// privates |
|
//============================================================ |
|
ArrayList<PatternEntry> patterns = new ArrayList<>(); |
|
|
|
private transient PatternEntry saveEntry = null; |
|
private transient PatternEntry lastEntry = null; |
|
|
|
// This is really used as a local variable inside fixEntry, but we cache |
|
|
|
private transient StringBuffer excess = new StringBuffer(); |
|
|
|
// |
|
// When building a MergeCollation, we need to do lots of searches to see |
|
// whether a given entry is already in the table. Since we're using an |
|
// array, this would make the algorithm O(N*N). To speed things up, we |
|
// use this bit array to remember whether the array contains any entries |
|
// starting with each Unicode character. If not, we can avoid the search. |
|
// Using BitSet would make this easier, but it's significantly slower. |
|
|
|
private transient byte[] statusArray = new byte[8192]; |
|
private final byte BITARRAYMASK = (byte)0x1; |
|
private final int BYTEPOWER = 3; |
|
private final int BYTEMASK = (1 << BYTEPOWER) - 1; |
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
private final void fixEntry(PatternEntry newEntry) throws ParseException |
|
{ |
|
// check to see whether the new entry has the same characters as the previous |
|
// entry did (this can happen when a pattern declaring a difference between two |
|
// strings that are canonically equivalent is normalized). If so, and the strength |
|
// is anything other than IDENTICAL or RESET, throw an exception (you can't |
|
|
|
if (lastEntry != null && newEntry.chars.equals(lastEntry.chars) |
|
&& newEntry.extension.equals(lastEntry.extension)) { |
|
if (newEntry.strength != Collator.IDENTICAL |
|
&& newEntry.strength != PatternEntry.RESET) { |
|
throw new ParseException("The entries " + lastEntry + " and " |
|
+ newEntry + " are adjacent in the rules, but have conflicting " |
|
+ "strengths: A character can't be unequal to itself.", -1); |
|
} else { |
|
|
|
return; |
|
} |
|
} |
|
|
|
boolean changeLastEntry = true; |
|
if (newEntry.strength != PatternEntry.RESET) { |
|
int oldIndex = -1; |
|
|
|
if ((newEntry.chars.length() == 1)) { |
|
|
|
char c = newEntry.chars.charAt(0); |
|
int statusIndex = c >> BYTEPOWER; |
|
byte bitClump = statusArray[statusIndex]; |
|
byte setBit = (byte)(BITARRAYMASK << (c & BYTEMASK)); |
|
|
|
if (bitClump != 0 && (bitClump & setBit) != 0) { |
|
oldIndex = patterns.lastIndexOf(newEntry); |
|
} else { |
|
// We're going to add an element that starts with this |
|
|
|
statusArray[statusIndex] = (byte)(bitClump | setBit); |
|
} |
|
} else { |
|
oldIndex = patterns.lastIndexOf(newEntry); |
|
} |
|
if (oldIndex != -1) { |
|
patterns.remove(oldIndex); |
|
} |
|
|
|
excess.setLength(0); |
|
int lastIndex = findLastEntry(lastEntry, excess); |
|
|
|
if (excess.length() != 0) { |
|
newEntry.extension = excess + newEntry.extension; |
|
if (lastIndex != patterns.size()) { |
|
lastEntry = saveEntry; |
|
changeLastEntry = false; |
|
} |
|
} |
|
if (lastIndex == patterns.size()) { |
|
patterns.add(newEntry); |
|
saveEntry = newEntry; |
|
} else { |
|
patterns.add(lastIndex, newEntry); |
|
} |
|
} |
|
if (changeLastEntry) { |
|
lastEntry = newEntry; |
|
} |
|
} |
|
|
|
private final int findLastEntry(PatternEntry entry, |
|
StringBuffer excessChars) throws ParseException |
|
{ |
|
if (entry == null) |
|
return 0; |
|
|
|
if (entry.strength != PatternEntry.RESET) { |
|
// Search backwards for string that contains this one; |
|
// most likely entry is last one |
|
|
|
int oldIndex = -1; |
|
if ((entry.chars.length() == 1)) { |
|
int index = entry.chars.charAt(0) >> BYTEPOWER; |
|
if ((statusArray[index] & |
|
(BITARRAYMASK << (entry.chars.charAt(0) & BYTEMASK))) != 0) { |
|
oldIndex = patterns.lastIndexOf(entry); |
|
} |
|
} else { |
|
oldIndex = patterns.lastIndexOf(entry); |
|
} |
|
if ((oldIndex == -1)) |
|
throw new ParseException("couldn't find last entry: " |
|
+ entry, oldIndex); |
|
return oldIndex + 1; |
|
} else { |
|
int i; |
|
for (i = patterns.size() - 1; i >= 0; --i) { |
|
PatternEntry e = patterns.get(i); |
|
if (e.chars.regionMatches(0,entry.chars,0, |
|
e.chars.length())) { |
|
excessChars.append(entry.chars, e.chars.length(), |
|
entry.chars.length()); |
|
break; |
|
} |
|
} |
|
if (i == -1) |
|
throw new ParseException("couldn't find: " + entry, i); |
|
return i + 1; |
|
} |
|
} |
|
} |