|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
|
|
/* |
|
****************************************************************************** |
|
* |
|
* Copyright (C) 2009-2014, International Business Machines |
|
* Corporation and others. All Rights Reserved. |
|
* |
|
****************************************************************************** |
|
*/ |
|
|
|
package jdk.internal.icu.impl; |
|
|
|
import java.util.ArrayList; |
|
|
|
import jdk.internal.icu.text.UTF16; |
|
import jdk.internal.icu.text.UnicodeSet; |
|
import jdk.internal.icu.text.UnicodeSet.SpanCondition; |
|
import jdk.internal.icu.util.OutputInt; |
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
public class UnicodeSetStringSpan { |
|
|
|
/* |
|
* Which span() variant will be used? The object is either built for one variant and used once, |
|
* or built for all and may be used many times. |
|
*/ |
|
public static final int WITH_COUNT = 0x40; |
|
public static final int FWD = 0x20; |
|
public static final int BACK = 0x10; |
|
|
|
public static final int CONTAINED = 2; |
|
public static final int NOT_CONTAINED = 1; |
|
|
|
public static final int ALL = 0x7f; |
|
|
|
public static final int FWD_UTF16_CONTAINED = FWD | CONTAINED; |
|
public static final int FWD_UTF16_NOT_CONTAINED = FWD | NOT_CONTAINED; |
|
public static final int BACK_UTF16_CONTAINED = BACK | CONTAINED; |
|
public static final int BACK_UTF16_NOT_CONTAINED = BACK | NOT_CONTAINED; |
|
|
|
|
|
|
|
|
|
*/ |
|
static final short ALL_CP_CONTAINED = 0xff; |
|
|
|
|
|
static final short LONG_SPAN = ALL_CP_CONTAINED - 1; |
|
|
|
|
|
private UnicodeSet spanSet; |
|
|
|
|
|
|
|
|
|
*/ |
|
private UnicodeSet spanNotSet; |
|
|
|
|
|
private ArrayList<String> strings; |
|
|
|
|
|
private short[] spanLengths; |
|
|
|
|
|
private int maxLength16; |
|
|
|
|
|
private boolean someRelevant; |
|
|
|
|
|
private boolean all; |
|
|
|
|
|
private OffsetList offsets; |
|
|
|
|
|
|
|
|
|
*/ |
|
public UnicodeSetStringSpan(final UnicodeSet set, final ArrayList<String> setStrings, int which) { |
|
spanSet = new UnicodeSet(0, 0x10ffff); |
|
// TODO: With Java 6, just take the parent set's strings as is, |
|
// as a NavigableSet<String>, rather than as an ArrayList copy of the set of strings. |
|
// Then iterate via the first() and higher() methods. |
|
// (We do not want to create multiple Iterator objects in each span().) |
|
|
|
strings = setStrings; |
|
all = (which == ALL); |
|
spanSet.retainAll(set); |
|
if (0 != (which & NOT_CONTAINED)) { |
|
// Default to the same sets. |
|
|
|
spanNotSet = spanSet; |
|
} |
|
offsets = new OffsetList(); |
|
|
|
// Determine if the strings even need to be taken into account at all for span() etc. |
|
// If any string is relevant, then all strings need to be used for |
|
// span(longest match) but only the relevant ones for span(while contained). |
|
// TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH |
|
// and do not store UTF-8 strings if !thisRelevant and CONTAINED. |
|
// (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.) |
|
|
|
int stringsLength = strings.size(); |
|
|
|
int i, spanLength; |
|
someRelevant = false; |
|
for (i = 0; i < stringsLength; ++i) { |
|
String string = strings.get(i); |
|
int length16 = string.length(); |
|
spanLength = spanSet.span(string, SpanCondition.CONTAINED); |
|
if (spanLength < length16) { |
|
someRelevant = true; |
|
} |
|
if ( length16 > maxLength16) { |
|
maxLength16 = length16; |
|
} |
|
} |
|
if (!someRelevant && (which & WITH_COUNT) == 0) { |
|
return; |
|
} |
|
|
|
// Freeze after checking for the need to use strings at all because freezing |
|
|
|
if (all) { |
|
spanSet.freeze(); |
|
} |
|
|
|
int spanBackLengthsOffset; |
|
|
|
|
|
int allocSize; |
|
if (all) { |
|
|
|
allocSize = stringsLength * (2); |
|
} else { |
|
allocSize = stringsLength; |
|
} |
|
spanLengths = new short[allocSize]; |
|
|
|
if (all) { |
|
|
|
spanBackLengthsOffset = stringsLength; |
|
} else { |
|
|
|
spanBackLengthsOffset = 0; |
|
} |
|
|
|
// Set the meta data and spanNotSet and write the UTF-8 strings. |
|
|
|
for (i = 0; i < stringsLength; ++i) { |
|
String string = strings.get(i); |
|
int length16 = string.length(); |
|
spanLength = spanSet.span(string, SpanCondition.CONTAINED); |
|
if (spanLength < length16) { |
|
if (true ) { |
|
if (0 != (which & CONTAINED)) { |
|
if (0 != (which & FWD)) { |
|
spanLengths[i] = makeSpanLengthByte(spanLength); |
|
} |
|
if (0 != (which & BACK)) { |
|
spanLength = length16 |
|
- spanSet.spanBack(string, length16, SpanCondition.CONTAINED); |
|
spanLengths[spanBackLengthsOffset + i] = makeSpanLengthByte(spanLength); |
|
} |
|
} else { |
|
spanLengths[i] = spanLengths[spanBackLengthsOffset + i] = 0; |
|
// flag. |
|
} |
|
} |
|
if (0 != (which & NOT_CONTAINED)) { |
|
// Add string start and end code points to the spanNotSet so that |
|
|
|
int c; |
|
if (0 != (which & FWD)) { |
|
c = string.codePointAt(0); |
|
addToSpanNotSet(c); |
|
} |
|
if (0 != (which & BACK)) { |
|
c = string.codePointBefore(length16); |
|
addToSpanNotSet(c); |
|
} |
|
} |
|
} else { |
|
if (all) { |
|
spanLengths[i] = spanLengths[spanBackLengthsOffset + i] = ALL_CP_CONTAINED; |
|
} else { |
|
|
|
spanLengths[i] = ALL_CP_CONTAINED; |
|
} |
|
} |
|
} |
|
|
|
|
|
if (all) { |
|
spanNotSet.freeze(); |
|
} |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
public boolean needsStringSpanUTF16() { |
|
return someRelevant; |
|
} |
|
|
|
|
|
public boolean contains(int c) { |
|
return spanSet.contains(c); |
|
} |
|
|
|
|
|
|
|
|
|
*/ |
|
private void addToSpanNotSet(int c) { |
|
if (spanNotSet == null || spanNotSet == spanSet) { |
|
if (spanSet.contains(c)) { |
|
return; |
|
} |
|
spanNotSet = spanSet.cloneAsThawed(); |
|
} |
|
spanNotSet.add(c); |
|
} |
|
|
|
/* |
|
* Note: In span() when spanLength==0 |
|
* (after a string match, or at the beginning after an empty code point span) |
|
* and in spanNot() and spanNotUTF8(), |
|
* string matching could use a binary search because all string matches are done |
|
* from the same start index. |
|
* |
|
* For UTF-8, this would require a comparison function that returns UTF-16 order. |
|
* |
|
* This optimization should not be necessary for normal UnicodeSets because most sets have no strings, and most sets |
|
* with strings have very few very short strings. For cases with many strings, it might be better to use a different |
|
* API and implementation with a DFA (state machine). |
|
*/ |
|
|
|
/* |
|
* Algorithm for span(SpanCondition.CONTAINED) |
|
* |
|
* Theoretical algorithm: |
|
* - Iterate through the string, and at each code point boundary: |
|
* + If the code point there is in the set, then remember to continue after it. |
|
* + If a set string matches at the current position, then remember to continue after it. |
|
* + Either recursively span for each code point or string match, or recursively span |
|
* for all but the shortest one and iteratively continue the span with the shortest local match. |
|
* + Remember the longest recursive span (the farthest end point). |
|
* + If there is no match at the current position, |
|
* neither for the code point there nor for any set string, |
|
* then stop and return the longest recursive span length. |
|
* |
|
* Optimized implementation: |
|
* |
|
* (We assume that most sets will have very few very short strings. |
|
* A span using a string-less set is extremely fast.) |
|
* |
|
* Create and cache a spanSet which contains all of the single code points of the original set |
|
* but none of its strings. |
|
* |
|
* - Start with spanLength=spanSet.span(SpanCondition.CONTAINED). |
|
* - Loop: |
|
* + Try to match each set string at the end of the spanLength. |
|
* ~ Set strings that start with set-contained code points |
|
* must be matched with a partial overlap |
|
* because the recursive algorithm would have tried to match them at every position. |
|
* ~ Set strings that entirely consist of set-contained code points |
|
* are irrelevant for span(SpanCondition.CONTAINED) |
|
* because the recursive algorithm would continue after them anyway and |
|
* find the longest recursive match from their end. |
|
* ~ Rather than recursing, note each end point of a set string match. |
|
* + If no set string matched after spanSet.span(), |
|
* then return with where the spanSet.span() ended. |
|
* + If at least one set string matched after spanSet.span(), |
|
* then pop the shortest string match end point and continue the loop, |
|
* trying to match all set strings from there. |
|
* + If at least one more set string matched after a previous string match, then test if the |
|
* code point after the previous string match is also contained in the set. |
|
* Continue the loop with the shortest end point of |
|
* either this code point or a matching set string. |
|
* + If no more set string matched after a previous string match, |
|
* then try another spanLength=spanSet.span(SpanCondition.CONTAINED). |
|
* Stop if spanLength==0, otherwise continue the loop. |
|
* |
|
* By noting each end point of a set string match, the function visits each string position at most once and |
|
* finishes in linear time. |
|
* |
|
* The recursive algorithm may visit the same string position many times |
|
* if multiple paths lead to it and finishes in exponential time. |
|
*/ |
|
|
|
/* |
|
* Algorithm for span(SIMPLE) |
|
* |
|
* Theoretical algorithm: |
|
* - Iterate through the string, and at each code point boundary: |
|
* + If the code point there is in the set, then remember to continue after it. |
|
* + If a set string matches at the current position, then remember to continue after it. |
|
* + Continue from the farthest match position and ignore all others. |
|
* + If there is no match at the current position, then stop and return the current position. |
|
* |
|
* Optimized implementation: |
|
* |
|
* (Same assumption and spanSet as above.) |
|
* |
|
* - Start with spanLength=spanSet.span(SpanCondition.CONTAINED). |
|
* - Loop: |
|
* + Try to match each set string at the end of the spanLength. |
|
* ~ Set strings that start with set-contained code points |
|
* must be matched with a partial overlap |
|
* because the standard algorithm would have tried to match them earlier. |
|
* ~ Set strings that entirely consist of set-contained code points |
|
* must be matched with a full overlap because the longest-match algorithm |
|
* would hide set string matches that end earlier. |
|
* Such set strings need not be matched earlier inside the code point span |
|
* because the standard algorithm would then have |
|
* continued after the set string match anyway. |
|
* ~ Remember the longest set string match (farthest end point) |
|
* from the earliest starting point. |
|
* + If no set string matched after spanSet.span(), |
|
* then return with where the spanSet.span() ended. |
|
* + If at least one set string matched, |
|
* then continue the loop after the longest match from the earliest position. |
|
* + If no more set string matched after a previous string match, |
|
* then try another spanLength=spanSet.span(SpanCondition.CONTAINED). |
|
* Stop if spanLength==0, otherwise continue the loop. |
|
*/ |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
public int span(CharSequence s, int start, SpanCondition spanCondition) { |
|
if (spanCondition == SpanCondition.NOT_CONTAINED) { |
|
return spanNot(s, start, null); |
|
} |
|
int spanLimit = spanSet.span(s, start, SpanCondition.CONTAINED); |
|
if (spanLimit == s.length()) { |
|
return spanLimit; |
|
} |
|
return spanWithStrings(s, start, spanLimit, spanCondition); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
private synchronized int spanWithStrings(CharSequence s, int start, int spanLimit, |
|
SpanCondition spanCondition) { |
|
|
|
int initSize = 0; |
|
if (spanCondition == SpanCondition.CONTAINED) { |
|
|
|
initSize = maxLength16; |
|
} |
|
offsets.setMaxLength(initSize); |
|
int length = s.length(); |
|
int pos = spanLimit, rest = length - spanLimit; |
|
int spanLength = spanLimit - start; |
|
int i, stringsLength = strings.size(); |
|
for (;;) { |
|
if (spanCondition == SpanCondition.CONTAINED) { |
|
for (i = 0; i < stringsLength; ++i) { |
|
int overlap = spanLengths[i]; |
|
if (overlap == ALL_CP_CONTAINED) { |
|
continue; |
|
} |
|
String string = strings.get(i); |
|
|
|
int length16 = string.length(); |
|
|
|
|
|
if (overlap >= LONG_SPAN) { |
|
overlap = length16; |
|
// While contained: No point matching fully inside the code point span. |
|
overlap = string.offsetByCodePoints(overlap, -1); |
|
// point. |
|
} |
|
if (overlap > spanLength) { |
|
overlap = spanLength; |
|
} |
|
int inc = length16 - overlap; |
|
for (;;) { |
|
if (inc > rest) { |
|
break; |
|
} |
|
|
|
if (!offsets.containsOffset(inc) && matches16CPB(s, pos - overlap, length, string, length16)) { |
|
if (inc == rest) { |
|
return length; |
|
} |
|
offsets.addOffset(inc); |
|
} |
|
if (overlap == 0) { |
|
break; |
|
} |
|
--overlap; |
|
++inc; |
|
} |
|
} |
|
} else { |
|
int maxInc = 0, maxOverlap = 0; |
|
for (i = 0; i < stringsLength; ++i) { |
|
int overlap = spanLengths[i]; |
|
// For longest match, we do need to try to match even an all-contained string |
|
// to find the match from the earliest start. |
|
|
|
String string = strings.get(i); |
|
|
|
int length16 = string.length(); |
|
|
|
|
|
if (overlap >= LONG_SPAN) { |
|
overlap = length16; |
|
// Longest match: Need to match fully inside the code point span |
|
// to find the match from the earliest start. |
|
} |
|
if (overlap > spanLength) { |
|
overlap = spanLength; |
|
} |
|
int inc = length16 - overlap; |
|
for (;;) { |
|
if (inc > rest || overlap < maxOverlap) { |
|
break; |
|
} |
|
|
|
if ((overlap > maxOverlap || inc > maxInc) |
|
&& matches16CPB(s, pos - overlap, length, string, length16)) { |
|
maxInc = inc; |
|
maxOverlap = overlap; |
|
break; |
|
} |
|
--overlap; |
|
++inc; |
|
} |
|
} |
|
|
|
if (maxInc != 0 || maxOverlap != 0) { |
|
// Longest-match algorithm, and there was a string match. |
|
|
|
pos += maxInc; |
|
rest -= maxInc; |
|
if (rest == 0) { |
|
return length; |
|
} |
|
spanLength = 0; |
|
continue; |
|
} |
|
} |
|
// Finished trying to match all strings at pos. |
|
|
|
if (spanLength != 0 || pos == 0) { |
|
// The position is after an unlimited code point span (spanLength!=0), |
|
// not after a string match. |
|
// The only position where spanLength==0 after a span is pos==0. |
|
// Otherwise, an unlimited code point span is only tried again when no |
|
|
|
if (offsets.isEmpty()) { |
|
return pos; |
|
} |
|
// Match strings from after the next string match. |
|
} else { |
|
|
|
if (offsets.isEmpty()) { |
|
// No more strings matched after a previous string match. |
|
|
|
spanLimit = spanSet.span(s, pos, SpanCondition.CONTAINED); |
|
spanLength = spanLimit - pos; |
|
if (spanLength == rest || |
|
spanLength == 0 |
|
) { |
|
return spanLimit; |
|
} |
|
pos += spanLength; |
|
rest -= spanLength; |
|
continue; |
|
} else { |
|
// Try to match only one code point from after a string match if some |
|
// string matched beyond it, so that we try all possible positions |
|
|
|
spanLength = spanOne(spanSet, s, pos, rest); |
|
if (spanLength > 0) { |
|
if (spanLength == rest) { |
|
return length; |
|
} |
|
// Match strings after this code point. |
|
// There cannot be any increments below it because UnicodeSet strings |
|
|
|
pos += spanLength; |
|
rest -= spanLength; |
|
offsets.shift(spanLength); |
|
spanLength = 0; |
|
continue; |
|
} |
|
// Match strings from after the next string match. |
|
} |
|
} |
|
int minOffset = offsets.popMinimum(null); |
|
pos += minOffset; |
|
rest -= minOffset; |
|
spanLength = 0; |
|
} |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
public int spanAndCount(CharSequence s, int start, SpanCondition spanCondition, |
|
OutputInt outCount) { |
|
if (spanCondition == SpanCondition.NOT_CONTAINED) { |
|
return spanNot(s, start, outCount); |
|
} |
|
// Consider strings; they may overlap with the span, |
|
|
|
if (spanCondition == SpanCondition.CONTAINED) { |
|
return spanContainedAndCount(s, start, outCount); |
|
} |
|
|
|
int stringsLength = strings.size(); |
|
int length = s.length(); |
|
int pos = start; |
|
int rest = length - start; |
|
int count = 0; |
|
while (rest != 0) { |
|
|
|
int cpLength = spanOne(spanSet, s, pos, rest); |
|
int maxInc = (cpLength > 0) ? cpLength : 0; |
|
|
|
for (int i = 0; i < stringsLength; ++i) { |
|
String string = strings.get(i); |
|
int length16 = string.length(); |
|
if (maxInc < length16 && length16 <= rest && |
|
matches16CPB(s, pos, length, string, length16)) { |
|
maxInc = length16; |
|
} |
|
} |
|
|
|
if (maxInc == 0) { |
|
outCount.value = count; |
|
return pos; |
|
} |
|
|
|
++count; |
|
pos += maxInc; |
|
rest -= maxInc; |
|
} |
|
outCount.value = count; |
|
return pos; |
|
} |
|
|
|
private synchronized int spanContainedAndCount(CharSequence s, int start, OutputInt outCount) { |
|
|
|
offsets.setMaxLength(maxLength16); |
|
int stringsLength = strings.size(); |
|
int length = s.length(); |
|
int pos = start; |
|
int rest = length - start; |
|
int count = 0; |
|
while (rest != 0) { |
|
|
|
int cpLength = spanOne(spanSet, s, pos, rest); |
|
if (cpLength > 0) { |
|
offsets.addOffsetAndCount(cpLength, count + 1); |
|
} |
|
|
|
for (int i = 0; i < stringsLength; ++i) { |
|
String string = strings.get(i); |
|
int length16 = string.length(); |
|
// Note: If the strings were sorted by length, then we could also |
|
|
|
if (length16 <= rest && !offsets.hasCountAtOffset(length16, count + 1) && |
|
matches16CPB(s, pos, length, string, length16)) { |
|
offsets.addOffsetAndCount(length16, count + 1); |
|
} |
|
} |
|
|
|
if (offsets.isEmpty()) { |
|
outCount.value = count; |
|
return pos; |
|
} |
|
|
|
int minOffset = offsets.popMinimum(outCount); |
|
count = outCount.value; |
|
pos += minOffset; |
|
rest -= minOffset; |
|
} |
|
outCount.value = count; |
|
return pos; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
public synchronized int spanBack(CharSequence s, int length, SpanCondition spanCondition) { |
|
if (spanCondition == SpanCondition.NOT_CONTAINED) { |
|
return spanNotBack(s, length); |
|
} |
|
int pos = spanSet.spanBack(s, length, SpanCondition.CONTAINED); |
|
if (pos == 0) { |
|
return 0; |
|
} |
|
int spanLength = length - pos; |
|
|
|
|
|
int initSize = 0; |
|
if (spanCondition == SpanCondition.CONTAINED) { |
|
|
|
initSize = maxLength16; |
|
} |
|
offsets.setMaxLength(initSize); |
|
int i, stringsLength = strings.size(); |
|
int spanBackLengthsOffset = 0; |
|
if (all) { |
|
spanBackLengthsOffset = stringsLength; |
|
} |
|
for (;;) { |
|
if (spanCondition == SpanCondition.CONTAINED) { |
|
for (i = 0; i < stringsLength; ++i) { |
|
int overlap = spanLengths[spanBackLengthsOffset + i]; |
|
if (overlap == ALL_CP_CONTAINED) { |
|
continue; |
|
} |
|
String string = strings.get(i); |
|
|
|
int length16 = string.length(); |
|
|
|
|
|
if (overlap >= LONG_SPAN) { |
|
overlap = length16; |
|
|
|
int len1 = 0; |
|
len1 = string.offsetByCodePoints(0, 1); |
|
overlap -= len1; |
|
} |
|
if (overlap > spanLength) { |
|
overlap = spanLength; |
|
} |
|
int dec = length16 - overlap; |
|
for (;;) { |
|
if (dec > pos) { |
|
break; |
|
} |
|
|
|
if (!offsets.containsOffset(dec) && matches16CPB(s, pos - dec, length, string, length16)) { |
|
if (dec == pos) { |
|
return 0; |
|
} |
|
offsets.addOffset(dec); |
|
} |
|
if (overlap == 0) { |
|
break; |
|
} |
|
--overlap; |
|
++dec; |
|
} |
|
} |
|
} else { |
|
int maxDec = 0, maxOverlap = 0; |
|
for (i = 0; i < stringsLength; ++i) { |
|
int overlap = spanLengths[spanBackLengthsOffset + i]; |
|
// For longest match, we do need to try to match even an all-contained string |
|
// to find the match from the latest end. |
|
|
|
String string = strings.get(i); |
|
|
|
int length16 = string.length(); |
|
|
|
|
|
if (overlap >= LONG_SPAN) { |
|
overlap = length16; |
|
// Longest match: Need to match fully inside the code point span |
|
// to find the match from the latest end. |
|
} |
|
if (overlap > spanLength) { |
|
overlap = spanLength; |
|
} |
|
int dec = length16 - overlap; |
|
for (;;) { |
|
if (dec > pos || overlap < maxOverlap) { |
|
break; |
|
} |
|
|
|
if ((overlap > maxOverlap || dec > maxDec) |
|
&& matches16CPB(s, pos - dec, length, string, length16)) { |
|
maxDec = dec; |
|
maxOverlap = overlap; |
|
break; |
|
} |
|
--overlap; |
|
++dec; |
|
} |
|
} |
|
|
|
if (maxDec != 0 || maxOverlap != 0) { |
|
// Longest-match algorithm, and there was a string match. |
|
|
|
pos -= maxDec; |
|
if (pos == 0) { |
|
return 0; |
|
} |
|
spanLength = 0; |
|
continue; |
|
} |
|
} |
|
// Finished trying to match all strings at pos. |
|
|
|
if (spanLength != 0 || pos == length) { |
|
// The position is before an unlimited code point span (spanLength!=0), |
|
// not before a string match. |
|
// The only position where spanLength==0 before a span is pos==length. |
|
// Otherwise, an unlimited code point span is only tried again when no |
|
|
|
if (offsets.isEmpty()) { |
|
return pos; |
|
} |
|
// Match strings from before the next string match. |
|
} else { |
|
|
|
if (offsets.isEmpty()) { |
|
// No more strings matched before a previous string match. |
|
|
|
int oldPos = pos; |
|
pos = spanSet.spanBack(s, oldPos, SpanCondition.CONTAINED); |
|
spanLength = oldPos - pos; |
|
if (pos == 0 || |
|
spanLength == 0 |
|
) { |
|
return pos; |
|
} |
|
continue; |
|
} else { |
|
// Try to match only one code point from before a string match if some |
|
// string matched beyond it, so that we try all possible positions |
|
|
|
spanLength = spanOneBack(spanSet, s, pos); |
|
if (spanLength > 0) { |
|
if (spanLength == pos) { |
|
return 0; |
|
} |
|
// Match strings before this code point. |
|
// There cannot be any decrements below it because UnicodeSet strings |
|
|
|
pos -= spanLength; |
|
offsets.shift(spanLength); |
|
spanLength = 0; |
|
continue; |
|
} |
|
// Match strings from before the next string match. |
|
} |
|
} |
|
pos -= offsets.popMinimum(null); |
|
spanLength = 0; |
|
} |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
private int spanNot(CharSequence s, int start, OutputInt outCount) { |
|
int length = s.length(); |
|
int pos = start, rest = length - start; |
|
int stringsLength = strings.size(); |
|
int count = 0; |
|
do { |
|
// Span until we find a code point from the set, |
|
|
|
int spanLimit; |
|
if (outCount == null) { |
|
spanLimit = spanNotSet.span(s, pos, SpanCondition.NOT_CONTAINED); |
|
} else { |
|
spanLimit = spanNotSet.spanAndCount(s, pos, SpanCondition.NOT_CONTAINED, outCount); |
|
outCount.value = count = count + outCount.value; |
|
} |
|
if (spanLimit == length) { |
|
return length; |
|
} |
|
pos = spanLimit; |
|
rest = length - spanLimit; |
|
|
|
// Check whether the current code point is in the original set, |
|
|
|
int cpLength = spanOne(spanSet, s, pos, rest); |
|
if (cpLength > 0) { |
|
return pos; |
|
} |
|
|
|
|
|
for (int i = 0; i < stringsLength; ++i) { |
|
if (spanLengths[i] == ALL_CP_CONTAINED) { |
|
continue; |
|
} |
|
String string = strings.get(i); |
|
|
|
int length16 = string.length(); |
|
if (length16 <= rest && matches16CPB(s, pos, length, string, length16)) { |
|
return pos; |
|
} |
|
} |
|
|
|
// The span(while not contained) ended on a string start/end which is |
|
// not in the original set. Skip this code point and continue. |
|
|
|
pos -= cpLength; |
|
rest += cpLength; |
|
++count; |
|
} while (rest != 0); |
|
if (outCount != null) { |
|
outCount.value = count; |
|
} |
|
return length; |
|
} |
|
|
|
private int spanNotBack(CharSequence s, int length) { |
|
int pos = length; |
|
int i, stringsLength = strings.size(); |
|
do { |
|
// Span until we find a code point from the set, |
|
|
|
pos = spanNotSet.spanBack(s, pos, SpanCondition.NOT_CONTAINED); |
|
if (pos == 0) { |
|
return 0; |
|
} |
|
|
|
// Check whether the current code point is in the original set, |
|
|
|
int cpLength = spanOneBack(spanSet, s, pos); |
|
if (cpLength > 0) { |
|
return pos; |
|
} |
|
|
|
|
|
for (i = 0; i < stringsLength; ++i) { |
|
// Use spanLengths rather than a spanLengths pointer because |
|
// it is easier and we only need to know whether the string is irrelevant |
|
|
|
if (spanLengths[i] == ALL_CP_CONTAINED) { |
|
continue; |
|
} |
|
String string = strings.get(i); |
|
|
|
int length16 = string.length(); |
|
if (length16 <= pos && matches16CPB(s, pos - length16, length, string, length16)) { |
|
return pos; |
|
} |
|
} |
|
|
|
// The span(while not contained) ended on a string start/end which is |
|
// not in the original set. Skip this code point and continue. |
|
|
|
pos += cpLength; |
|
} while (pos != 0); |
|
return 0; |
|
} |
|
|
|
static short makeSpanLengthByte(int spanLength) { |
|
|
|
return spanLength < LONG_SPAN ? (short) spanLength : LONG_SPAN; |
|
} |
|
|
|
|
|
private static boolean matches16(CharSequence s, int start, final String t, int length) { |
|
int end = start + length; |
|
while (length-- > 0) { |
|
if (s.charAt(--end) != t.charAt(length)) { |
|
return false; |
|
} |
|
} |
|
return true; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
static boolean matches16CPB(CharSequence s, int start, int limit, final String t, int tlength) { |
|
return matches16(s, start, t, tlength) |
|
&& !(0 < start && Character.isHighSurrogate(s.charAt(start - 1)) && |
|
Character.isLowSurrogate(s.charAt(start))) |
|
&& !((start + tlength) < limit && Character.isHighSurrogate(s.charAt(start + tlength - 1)) && |
|
Character.isLowSurrogate(s.charAt(start + tlength))); |
|
} |
|
|
|
|
|
|
|
|
|
*/ |
|
static int spanOne(final UnicodeSet set, CharSequence s, int start, int length) { |
|
char c = s.charAt(start); |
|
if (c >= 0xd800 && c <= 0xdbff && length >= 2) { |
|
char c2 = s.charAt(start + 1); |
|
if (UTF16.isTrailSurrogate(c2)) { |
|
int supplementary = UCharacterProperty.getRawSupplementary(c, c2); |
|
return set.contains(supplementary) ? 2 : -2; |
|
} |
|
} |
|
return set.contains(c) ? 1 : -1; |
|
} |
|
|
|
static int spanOneBack(final UnicodeSet set, CharSequence s, int length) { |
|
char c = s.charAt(length - 1); |
|
if (c >= 0xdc00 && c <= 0xdfff && length >= 2) { |
|
char c2 = s.charAt(length - 2); |
|
if (UTF16.isLeadSurrogate(c2)) { |
|
int supplementary = UCharacterProperty.getRawSupplementary(c2, c); |
|
return set.contains(supplementary) ? 2 : -2; |
|
} |
|
} |
|
return set.contains(c) ? 1 : -1; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
private static final class OffsetList { |
|
private int[] list; |
|
private int length; |
|
private int start; |
|
|
|
public OffsetList() { |
|
list = new int[16]; |
|
} |
|
|
|
public void setMaxLength(int maxLength) { |
|
if (maxLength > list.length) { |
|
list = new int[maxLength]; |
|
} |
|
clear(); |
|
} |
|
|
|
public void clear() { |
|
for (int i = list.length; i-- > 0;) { |
|
list[i] = 0; |
|
} |
|
start = length = 0; |
|
} |
|
|
|
public boolean isEmpty() { |
|
return (length == 0); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
public void shift(int delta) { |
|
int i = start + delta; |
|
if (i >= list.length) { |
|
i -= list.length; |
|
} |
|
if (list[i] != 0) { |
|
list[i] = 0; |
|
--length; |
|
} |
|
start = i; |
|
} |
|
|
|
|
|
|
|
|
|
*/ |
|
public void addOffset(int offset) { |
|
int i = start + offset; |
|
if (i >= list.length) { |
|
i -= list.length; |
|
} |
|
assert list[i] == 0; |
|
list[i] = 1; |
|
++length; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
public void addOffsetAndCount(int offset, int count) { |
|
assert count > 0; |
|
int i = start + offset; |
|
if (i >= list.length) { |
|
i -= list.length; |
|
} |
|
if (list[i] == 0) { |
|
list[i] = count; |
|
++length; |
|
} else if (count < list[i]) { |
|
list[i] = count; |
|
} |
|
} |
|
|
|
|
|
|
|
*/ |
|
public boolean containsOffset(int offset) { |
|
int i = start + offset; |
|
if (i >= list.length) { |
|
i -= list.length; |
|
} |
|
return list[i] != 0; |
|
} |
|
|
|
|
|
|
|
*/ |
|
public boolean hasCountAtOffset(int offset, int count) { |
|
int i = start + offset; |
|
if (i >= list.length) { |
|
i -= list.length; |
|
} |
|
int oldCount = list[i]; |
|
return oldCount != 0 && oldCount <= count; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
public int popMinimum(OutputInt outCount) { |
|
|
|
int i = start, result; |
|
while (++i < list.length) { |
|
int count = list[i]; |
|
if (count != 0) { |
|
list[i] = 0; |
|
--length; |
|
result = i - start; |
|
start = i; |
|
if (outCount != null) { outCount.value = count; } |
|
return result; |
|
} |
|
} |
|
// i==list.length |
|
|
|
// Wrap around and look for the next offset in list[0..start]. |
|
|
|
result = list.length - start; |
|
i = 0; |
|
int count; |
|
while ((count = list[i]) == 0) { |
|
++i; |
|
} |
|
list[i] = 0; |
|
--length; |
|
start = i; |
|
if (outCount != null) { outCount.value = count; } |
|
return result + i; |
|
} |
|
} |
|
} |