// Back to index...

	/*
	* Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
	* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
	*
	* This code is free software; you can redistribute it and/or modify it
	* under the terms of the GNU General Public License version 2 only, as
	* published by the Free Software Foundation. Oracle designates this
	* particular file as subject to the "Classpath" exception as provided
	* by Oracle in the LICENSE file that accompanied this code.
	*
	* This code is distributed in the hope that it will be useful, but WITHOUT
	* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
	* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
	* version 2 for more details (a copy is included in the LICENSE file that
	* accompanied this code).
	*
	* You should have received a copy of the GNU General Public License version
	* 2 along with this work; if not, write to the Free Software Foundation,
	* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
	*
	* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
	* or visit www.oracle.com if you need additional information or have any
	* questions.
	*/

	/*
	******************************************************************************
	*
	* Copyright (C) 2009-2014, International Business Machines
	* Corporation and others. All Rights Reserved.
	*
	******************************************************************************
	*/

	package sun.text.normalizer;

	import java.util.ArrayList;

	import sun.text.normalizer.UnicodeSet.SpanCondition;

	/*
	* Implement span() etc. for a set with strings.
	* Avoid recursion because of its exponential complexity.
	* Instead, try multiple paths at once and track them with an IndexList.
	*/
	class UnicodeSetStringSpan {

	/*
	* Which span() variant will be used? The object is either built for one variant and used once,
	* or built for all and may be used many times.
	*/
	public static final int WITH_COUNT = 0x40; // spanAndCount() may be called
	public static final int FWD = 0x20;
	public static final int BACK = 0x10;
	// public static final int UTF16 = 8;
	public static final int CONTAINED = 2;
	public static final int NOT_CONTAINED = 1;

	public static final int ALL = 0x7f;

	public static final int FWD_UTF16_CONTAINED = FWD \| /* UTF16 \| */ CONTAINED;
	public static final int FWD_UTF16_NOT_CONTAINED = FWD \| /* UTF16 \| */NOT_CONTAINED;
	public static final int BACK_UTF16_CONTAINED = BACK \| /* UTF16 \| */ CONTAINED;
	public static final int BACK_UTF16_NOT_CONTAINED = BACK \| /* UTF16 \| */NOT_CONTAINED;

	/**
	 * Special spanLength value (kept in a short because Java has no unsigned byte type):
	 * all code points in the string are contained in the parent set.
	 */
	static final short ALL_CP_CONTAINED = 0xff;

	/** Special spanLength value: the real span length is >=0xfe. */
	static final short LONG_SPAN = ALL_CP_CONTAINED - 1;

	/** Set for span(). Same as parent but without strings. */
	private UnicodeSet spanSet;

	/**
	 * Set for span(not contained).
	 * Same as spanSet, plus characters that start or end strings.
	 * The constructor first points this at the spanSet object itself; addToSpanNotSet()
	 * replaces it with a thawed clone the first time a code point outside spanSet is added.
	 * Stays null when the constructor was called without the NOT_CONTAINED flag.
	 */
	private UnicodeSet spanNotSet;

	/** The strings of the parent set (the list passed to the constructor, not a copy). */
	private ArrayList<String> strings;

	/**
	 * The lengths of span(), spanBack() etc. for each string.
	 * When built for ALL, the forward lengths occupy the first half and the backward
	 * lengths the second half; otherwise there is one entry per string.
	 * Not allocated when the constructor returns early (no relevant string and no WITH_COUNT).
	 */
	private short[] spanLengths;

	/** Maximum UTF-16 length over all of the set strings, relevant or not. */
	private int maxLength16;

	/** Are there strings that are not fully contained in the code point set? */
	private boolean someRelevant;

	/** Set up for all variants of span()? True only if the constructor's which argument is ALL. */
	private boolean all;

	/** Span helper: list of pending match offsets used by the synchronized span methods. */
	private OffsetList offsets;

	/**
	 * Builds the span helper either for every span() variant ({@code which == ALL})
	 * or only for the variants selected by the flag bits in {@code which}.
	 * Initializes as little as possible, for single use.
	 *
	 * @param set the parent set; its code points are intersected into spanSet
	 * @param setStrings the strings of the parent set; kept by reference
	 * @param which combination of WITH_COUNT, FWD, BACK, CONTAINED and NOT_CONTAINED, or ALL
	 */
	public UnicodeSetStringSpan(final UnicodeSet set, final ArrayList<String> setStrings, int which) {
	    // Decode the request once instead of re-testing the bits inside the loops.
	    final boolean wantContained = (which & CONTAINED) != 0;
	    final boolean wantNotContained = (which & NOT_CONTAINED) != 0;
	    final boolean wantForward = (which & FWD) != 0;
	    final boolean wantBackward = (which & BACK) != 0;

	    spanSet = new UnicodeSet(0, 0x10ffff);
	    // TODO: With Java 6, just take the parent set's strings as is,
	    // as a NavigableSet<String>, rather than as an ArrayList copy of the set of strings.
	    // Then iterate via the first() and higher() methods.
	    // (We do not want to create multiple Iterator objects in each span().)
	    // See ICU ticket #7454.
	    strings = setStrings;
	    all = (which == ALL);
	    spanSet.retainAll(set);
	    if (wantNotContained) {
	        // Start out sharing the same set object;
	        // addToSpanNotSet() makes a separate one only if it has to.
	        spanNotSet = spanSet;
	    }
	    offsets = new OffsetList();

	    // First pass: find out whether any string matters at all, and record the
	    // longest string. A string is "relevant" if the code point set alone
	    // cannot span all of it.
	    final int stringCount = strings.size();
	    someRelevant = false;
	    for (int k = 0; k < stringCount; ++k) {
	        final String candidate = strings.get(k);
	        final int unitLength = candidate.length();
	        if (spanSet.span(candidate, SpanCondition.CONTAINED) < unitLength) {
	            someRelevant = true;
	        }
	        maxLength16 = Math.max(maxLength16, unitLength);
	    }
	    final boolean stringsNeeded = someRelevant || (which & WITH_COUNT) != 0;
	    if (!stringsNeeded) {
	        return;
	    }

	    // Freeze only now: freezing costs time and memory that would be wasted
	    // if the early return above had been taken.
	    if (all) {
	        spanSet.freeze();
	    }

	    // One run of span lengths, or two (forward, then backward) when built for ALL.
	    // With a single run the backward entries alias the forward ones (base 0).
	    final int backBase = all ? stringCount : 0;
	    spanLengths = new short[stringCount + backBase];

	    // Second pass: fill in the per-string meta data and grow spanNotSet.
	    for (int k = 0; k < stringCount; ++k) {
	        final String candidate = strings.get(k);
	        final int unitLength = candidate.length();
	        final int forwardSpan = spanSet.span(candidate, SpanCondition.CONTAINED);

	        if (forwardSpan >= unitLength) {
	            // Irrelevant string: every code point is already in the set.
	            spanLengths[k] = spanLengths[backBase + k] = ALL_CP_CONTAINED;
	            continue;
	        }

	        // Relevant string.
	        if (wantContained) {
	            if (wantForward) {
	                spanLengths[k] = makeSpanLengthByte(forwardSpan);
	            }
	            if (wantBackward) {
	                final int backwardSpan = unitLength
	                        - spanSet.spanBack(candidate, unitLength, SpanCondition.CONTAINED);
	                spanLengths[backBase + k] = makeSpanLengthByte(backwardSpan);
	            }
	        } else {
	            // Not CONTAINED, not all, but NOT_CONTAINED:
	            // only a relevant/irrelevant flag is stored.
	            spanLengths[k] = spanLengths[backBase + k] = 0;
	        }
	        if (wantNotContained) {
	            // Add string start and end code points to the spanNotSet so that
	            // a span(while not contained) stops before any string.
	            if (wantForward) {
	                addToSpanNotSet(candidate.codePointAt(0));
	            }
	            if (wantBackward) {
	                addToSpanNotSet(candidate.codePointBefore(unitLength));
	            }
	        }
	    }

	    // Finish.
	    if (all) {
	        spanNotSet.freeze();
	    }
	}

	/**
	 * Do the strings need to be checked in span() etc.?
	 *
	 * @return the someRelevant flag computed by the constructor: true if at least one
	 * set string is not fully covered by the code point set, so strings need to be
	 * checked (call span() here); false if not (use a BMPSet for best performance).
	 */
	public boolean needsStringSpanUTF16() {
	return someRelevant;
	}

	/**
	 * For fast UnicodeSet::contains(c).
	 * Delegates to spanSet, i.e. tests the code points of the parent set only, not its strings.
	 *
	 * @param c the code point to test
	 * @return true if spanSet contains c
	 */
	public boolean contains(int c) {
	return spanSet.contains(c);
	}

	/**
	* Adds a starting or ending string character to the spanNotSet
	* so that a character span ends before any string.
	*/
	private void addToSpanNotSet(int c) {
	if (spanNotSet == null \|\| spanNotSet == spanSet) {
	if (spanSet.contains(c)) {
	return; // Nothing to do.
	}
	spanNotSet = spanSet.cloneAsThawed();
	}
	spanNotSet.add(c);
	}

	/*
	* Note: In span() when spanLength==0
	* (after a string match, or at the beginning after an empty code point span)
	* and in spanNot() and spanNotUTF8(),
	* string matching could use a binary search because all string matches are done
	* from the same start index.
	*
	* For UTF-8, this would require a comparison function that returns UTF-16 order.
	*
	* This optimization should not be necessary for normal UnicodeSets because most sets have no strings, and most sets
	* with strings have very few very short strings. For cases with many strings, it might be better to use a different
	* API and implementation with a DFA (state machine).
	*/

	/*
	* Algorithm for span(SpanCondition.CONTAINED)
	*
	* Theoretical algorithm:
	* - Iterate through the string, and at each code point boundary:
	* + If the code point there is in the set, then remember to continue after it.
	* + If a set string matches at the current position, then remember to continue after it.
	* + Either recursively span for each code point or string match, or recursively span
	* for all but the shortest one and iteratively continue the span with the shortest local match.
	* + Remember the longest recursive span (the farthest end point).
	* + If there is no match at the current position,
	* neither for the code point there nor for any set string,
	* then stop and return the longest recursive span length.
	*
	* Optimized implementation:
	*
	* (We assume that most sets will have very few very short strings.
	* A span using a string-less set is extremely fast.)
	*
	* Create and cache a spanSet which contains all of the single code points of the original set
	* but none of its strings.
	*
	* - Start with spanLength=spanSet.span(SpanCondition.CONTAINED).
	* - Loop:
	* + Try to match each set string at the end of the spanLength.
	* ~ Set strings that start with set-contained code points
	* must be matched with a partial overlap
	* because the recursive algorithm would have tried to match them at every position.
	* ~ Set strings that entirely consist of set-contained code points
	* are irrelevant for span(SpanCondition.CONTAINED)
	* because the recursive algorithm would continue after them anyway and
	* find the longest recursive match from their end.
	* ~ Rather than recursing, note each end point of a set string match.
	* + If no set string matched after spanSet.span(),
	* then return with where the spanSet.span() ended.
	* + If at least one set string matched after spanSet.span(),
	* then pop the shortest string match end point and continue the loop,
	* trying to match all set strings from there.
	* + If at least one more set string matched after a previous string match, then test if the
	* code point after the previous string match is also contained in the set.
	* Continue the loop with the shortest end point of
	* either this code point or a matching set string.
	* + If no more set string matched after a previous string match,
	* then try another spanLength=spanSet.span(SpanCondition.CONTAINED).
	* Stop if spanLength==0, otherwise continue the loop.
	*
	* By noting each end point of a set string match, the function visits each string position at most once and
	* finishes in linear time.
	*
	* The recursive algorithm may visit the same string position many times
	* if multiple paths lead to it and finishes in exponential time.
	*/

	/*
	* Algorithm for span(SIMPLE)
	*
	* Theoretical algorithm:
	* - Iterate through the string, and at each code point boundary:
	* + If the code point there is in the set, then remember to continue after it.
	* + If a set string matches at the current position, then remember to continue after it.
	* + Continue from the farthest match position and ignore all others.
	* + If there is no match at the current position, then stop and return the current position.
	*
	* Optimized implementation:
	*
	* (Same assumption and spanSet as above.)
	*
	* - Start with spanLength=spanSet.span(SpanCondition.CONTAINED).
	* - Loop:
	* + Try to match each set string at the end of the spanLength.
	* ~ Set strings that start with set-contained code points
	* must be matched with a partial overlap
	* because the standard algorithm would have tried to match them earlier.
	* ~ Set strings that entirely consist of set-contained code points
	* must be matched with a full overlap because the longest-match algorithm
	* would hide set string matches that end earlier.
	* Such set strings need not be matched earlier inside the code point span
	* because the standard algorithm would then have
	* continued after the set string match anyway.
	* ~ Remember the longest set string match (farthest end point)
	* from the earliest starting point.
	* + If no set string matched after spanSet.span(),
	* then return with where the spanSet.span() ended.
	* + If at least one set string matched,
	* then continue the loop after the longest match from the earliest position.
	* + If no more set string matched after a previous string match,
	* then try another spanLength=spanSet.span(SpanCondition.CONTAINED).
	* Stop if spanLength==0, otherwise continue the loop.
	*/
	/**
	* Spans a string.
	*
	* @param s The string to be spanned
	* @param start The start index that the span begins
	* @param spanCondition The span condition
	* @return the limit (exclusive end) of the span
	*/
	public int span(CharSequence s, int start, SpanCondition spanCondition) {
	    if (spanCondition != SpanCondition.NOT_CONTAINED) {
	        // Fast path first: a plain code point span needs no synchronization.
	        final int codePointSpanEnd = spanSet.span(s, start, SpanCondition.CONTAINED);
	        if (codePointSpanEnd != s.length()) {
	            // The span stopped early; strings may extend it.
	            return spanWithStrings(s, start, codePointSpanEnd, spanCondition);
	        }
	        return codePointSpanEnd;
	    }
	    return spanNot(s, start, null);
	}

	/**
	* Synchronized method for complicated spans using the offsets.
	* Avoids synchronization for simple cases.
	*
	* @param spanLimit = spanSet.span(s, start, CONTAINED)
	*/
	private synchronized int spanWithStrings(CharSequence s, int start, int spanLimit,
	SpanCondition spanCondition) {
	// Consider strings; they may overlap with the span.
	int initSize = 0;
	if (spanCondition == SpanCondition.CONTAINED) {
	// Use offset list to try all possibilities.
	initSize = maxLength16;
	}
	offsets.setMaxLength(initSize);
	int length = s.length();
	int pos = spanLimit, rest = length - spanLimit;
	int spanLength = spanLimit - start;
	int i, stringsLength = strings.size();
	for (;;) {
	if (spanCondition == SpanCondition.CONTAINED) {
	for (i = 0; i < stringsLength; ++i) {
	int overlap = spanLengths[i];
	if (overlap == ALL_CP_CONTAINED) {
	continue; // Irrelevant string.
	}
	String string = strings.get(i);

	int length16 = string.length();

	// Try to match this string at pos-overlap..pos.
	if (overlap >= LONG_SPAN) {
	overlap = length16;
	// While contained: No point matching fully inside the code point span.
	overlap = string.offsetByCodePoints(overlap, -1); // Length of the string minus the last code
	// point.
	}
	if (overlap > spanLength) {
	overlap = spanLength;
	}
	int inc = length16 - overlap; // Keep overlap+inc==length16.
	for (;;) {
	if (inc > rest) {
	break;
	}
	// Try to match if the increment is not listed already.
	if (!offsets.containsOffset(inc) && matches16CPB(s, pos - overlap, length, string, length16)) {
	if (inc == rest) {
	return length; // Reached the end of the string.
	}
	offsets.addOffset(inc);
	}
	if (overlap == 0) {
	break;
	}
	--overlap;
	++inc;
	}
	}
	} else /* SIMPLE */{
	int maxInc = 0, maxOverlap = 0;
	for (i = 0; i < stringsLength; ++i) {
	int overlap = spanLengths[i];
	// For longest match, we do need to try to match even an all-contained string
	// to find the match from the earliest start.

	String string = strings.get(i);

	int length16 = string.length();

	// Try to match this string at pos-overlap..pos.
	if (overlap >= LONG_SPAN) {
	overlap = length16;
	// Longest match: Need to match fully inside the code point span
	// to find the match from the earliest start.
	}
	if (overlap > spanLength) {
	overlap = spanLength;
	}
	int inc = length16 - overlap; // Keep overlap+inc==length16.
	for (;;) {
	if (inc > rest \|\| overlap < maxOverlap) {
	break;
	}
	// Try to match if the string is longer or starts earlier.
	if ((overlap > maxOverlap \|\| /* redundant overlap==maxOverlap && */inc > maxInc)
	&& matches16CPB(s, pos - overlap, length, string, length16)) {
	maxInc = inc; // Longest match from earliest start.
	maxOverlap = overlap;
	break;
	}
	--overlap;
	++inc;
	}
	}

	if (maxInc != 0 \|\| maxOverlap != 0) {
	// Longest-match algorithm, and there was a string match.
	// Simply continue after it.
	pos += maxInc;
	rest -= maxInc;
	if (rest == 0) {
	return length; // Reached the end of the string.
	}
	spanLength = 0; // Match strings from after a string match.
	continue;
	}
	}
	// Finished trying to match all strings at pos.

	if (spanLength != 0 \|\| pos == 0) {
	// The position is after an unlimited code point span (spanLength!=0),
	// not after a string match.
	// The only position where spanLength==0 after a span is pos==0.
	// Otherwise, an unlimited code point span is only tried again when no
	// strings match, and if such a non-initial span fails we stop.
	if (offsets.isEmpty()) {
	return pos; // No strings matched after a span.
	}
	// Match strings from after the next string match.
	} else {
	// The position is after a string match (or a single code point).
	if (offsets.isEmpty()) {
	// No more strings matched after a previous string match.
	// Try another code point span from after the last string match.
	spanLimit = spanSet.span(s, pos, SpanCondition.CONTAINED);
	spanLength = spanLimit - pos;
	if (spanLength == rest \|\| // Reached the end of the string, or
	spanLength == 0 // neither strings nor span progressed.
	) {
	return spanLimit;
	}
	pos += spanLength;
	rest -= spanLength;
	continue; // spanLength>0: Match strings from after a span.
	} else {
	// Try to match only one code point from after a string match if some
	// string matched beyond it, so that we try all possible positions
	// and don't overshoot.
	spanLength = spanOne(spanSet, s, pos, rest);
	if (spanLength > 0) {
	if (spanLength == rest) {
	return length; // Reached the end of the string.
	}
	// Match strings after this code point.
	// There cannot be any increments below it because UnicodeSet strings
	// contain multiple code points.
	pos += spanLength;
	rest -= spanLength;
	offsets.shift(spanLength);
	spanLength = 0;
	continue; // Match strings from after a single code point.
	}
	// Match strings from after the next string match.
	}
	}
	int minOffset = offsets.popMinimum(null);
	pos += minOffset;
	rest -= minOffset;
	spanLength = 0; // Match strings from after a string match.
	}
	}

	/**
	* Spans a string and counts the smallest number of set elements on any path across the span.
	*
	* <p>For proper counting, we cannot ignore strings that are fully contained in code point spans.
	*
	* <p>If the set does not have any fully-contained strings, then we could optimize this
	* like span(), but such sets are likely rare, and this is at least still linear.
	*
	* @param s The string to be spanned
	* @param start The start index that the span begins
	* @param spanCondition The span condition
	* @param outCount The count
	* @return the limit (exclusive end) of the span
	*/
	public int spanAndCount(CharSequence s, int start, SpanCondition spanCondition,
	        OutputInt outCount) {
	    if (spanCondition == SpanCondition.NOT_CONTAINED) {
	        return spanNot(s, start, outCount);
	    }
	    // Consider strings; they may overlap with the span,
	    // and they may result in a smaller count than with just code points.
	    if (spanCondition == SpanCondition.CONTAINED) {
	        return spanContainedAndCount(s, start, outCount);
	    }

	    // SIMPLE (not synchronized, does not use offsets):
	    // at each position take the longest single element and move on.
	    final int stringCount = strings.size();
	    final int total = s.length();
	    int cursor = start;
	    int elements = 0;
	    for (int remaining = total - start; remaining != 0; ) {
	        // Start with the next code point, if the set has it ...
	        int longest = Math.max(spanOne(spanSet, s, cursor, remaining), 0);
	        // ... and let any longer string that fits and matches replace it.
	        for (int k = 0; k < stringCount; ++k) {
	            final String candidate = strings.get(k);
	            final int candidateLength = candidate.length();
	            if (candidateLength > longest && candidateLength <= remaining
	                    && matches16CPB(s, cursor, total, candidate, candidateLength)) {
	                longest = candidateLength;
	            }
	        }
	        if (longest == 0) {
	            break; // Nothing matches at this position: the span ends here.
	        }
	        // Continue from the longest match.
	        ++elements;
	        cursor += longest;
	        remaining -= longest;
	    }
	    outCount.value = elements;
	    return cursor;
	}

	/**
	 * CONTAINED variant of spanAndCount(): follows every code point and string match
	 * via the offsets list and keeps, per end position, the count handed back by
	 * {@code offsets.popMinimum(outCount)}.
	 *
	 * @param s The string to be spanned
	 * @param start The start index that the span begins
	 * @param outCount Receives the count
	 * @return the limit (exclusive end) of the span
	 */
	private synchronized int spanContainedAndCount(CharSequence s, int start, OutputInt outCount) {
	    // Use offset list to try all possibilities.
	    offsets.setMaxLength(maxLength16);
	    final int stringCount = strings.size();
	    final int total = s.length();
	    int cursor = start;
	    int remaining = total - start;
	    int elements = 0;
	    while (remaining != 0) {
	        // Any element matched here extends the current path by one.
	        final int nextCount = elements + 1;

	        // The next code point, if it is in the set.
	        final int codePointLength = spanOne(spanSet, s, cursor, remaining);
	        if (codePointLength > 0) {
	            offsets.addOffsetAndCount(codePointLength, nextCount);
	        }

	        // All of the strings.
	        // Note: If the strings were sorted by length, then we could also
	        // avoid trying to match if there is already a match of the same length.
	        for (int k = 0; k < stringCount; ++k) {
	            final String candidate = strings.get(k);
	            final int candidateLength = candidate.length();
	            if (candidateLength > remaining) {
	                continue; // Does not fit any more.
	            }
	            if (offsets.hasCountAtOffset(candidateLength, nextCount)) {
	                continue; // This end point is already recorded with this count.
	            }
	            if (matches16CPB(s, cursor, total, candidate, candidateLength)) {
	                offsets.addOffsetAndCount(candidateLength, nextCount);
	            }
	        }

	        if (offsets.isEmpty()) {
	            break; // No match beyond the current position.
	        }

	        // Continue from the nearest match.
	        final int step = offsets.popMinimum(outCount);
	        elements = outCount.value;
	        cursor += step;
	        remaining -= step;
	    }
	    outCount.value = elements;
	    return cursor;
	}

	/**
	* Span a string backwards.
	*
	* @param s The string to be spanned
	* @param spanCondition The span condition
	* @return The string index which starts the span (i.e. inclusive).
	*/
	public synchronized int spanBack(CharSequence s, int length, SpanCondition spanCondition) {
	if (spanCondition == SpanCondition.NOT_CONTAINED) {
	return spanNotBack(s, length);
	}
	int pos = spanSet.spanBack(s, length, SpanCondition.CONTAINED);
	if (pos == 0) {
	return 0;
	}
	int spanLength = length - pos;

	// Consider strings; they may overlap with the span.
	int initSize = 0;
	if (spanCondition == SpanCondition.CONTAINED) {
	// Use offset list to try all possibilities.
	initSize = maxLength16;
	}
	offsets.setMaxLength(initSize);
	int i, stringsLength = strings.size();
	int spanBackLengthsOffset = 0;
	if (all) {
	spanBackLengthsOffset = stringsLength;
	}
	for (;;) {
	if (spanCondition == SpanCondition.CONTAINED) {
	for (i = 0; i < stringsLength; ++i) {
	int overlap = spanLengths[spanBackLengthsOffset + i];
	if (overlap == ALL_CP_CONTAINED) {
	continue; // Irrelevant string.
	}
	String string = strings.get(i);

	int length16 = string.length();

	// Try to match this string at pos-(length16-overlap)..pos-length16.
	if (overlap >= LONG_SPAN) {
	overlap = length16;
	// While contained: No point matching fully inside the code point span.
	int len1 = 0;
	len1 = string.offsetByCodePoints(0, 1);
	overlap -= len1; // Length of the string minus the first code point.
	}
	if (overlap > spanLength) {
	overlap = spanLength;
	}
	int dec = length16 - overlap; // Keep dec+overlap==length16.
	for (;;) {
	if (dec > pos) {
	break;
	}
	// Try to match if the decrement is not listed already.
	if (!offsets.containsOffset(dec) && matches16CPB(s, pos - dec, length, string, length16)) {
	if (dec == pos) {
	return 0; // Reached the start of the string.
	}
	offsets.addOffset(dec);
	}
	if (overlap == 0) {
	break;
	}
	--overlap;
	++dec;
	}
	}
	} else /* SIMPLE */{
	int maxDec = 0, maxOverlap = 0;
	for (i = 0; i < stringsLength; ++i) {
	int overlap = spanLengths[spanBackLengthsOffset + i];
	// For longest match, we do need to try to match even an all-contained string
	// to find the match from the latest end.

	String string = strings.get(i);

	int length16 = string.length();

	// Try to match this string at pos-(length16-overlap)..pos-length16.
	if (overlap >= LONG_SPAN) {
	overlap = length16;
	// Longest match: Need to match fully inside the code point span
	// to find the match from the latest end.
	}
	if (overlap > spanLength) {
	overlap = spanLength;
	}
	int dec = length16 - overlap; // Keep dec+overlap==length16.
	for (;;) {
	if (dec > pos \|\| overlap < maxOverlap) {
	break;
	}
	// Try to match if the string is longer or ends later.
	if ((overlap > maxOverlap \|\| /* redundant overlap==maxOverlap && */dec > maxDec)
	&& matches16CPB(s, pos - dec, length, string, length16)) {
	maxDec = dec; // Longest match from latest end.
	maxOverlap = overlap;
	break;
	}
	--overlap;
	++dec;
	}
	}

	if (maxDec != 0 \|\| maxOverlap != 0) {
	// Longest-match algorithm, and there was a string match.
	// Simply continue before it.
	pos -= maxDec;
	if (pos == 0) {
	return 0; // Reached the start of the string.
	}
	spanLength = 0; // Match strings from before a string match.
	continue;
	}
	}
	// Finished trying to match all strings at pos.

	if (spanLength != 0 \|\| pos == length) {
	// The position is before an unlimited code point span (spanLength!=0),
	// not before a string match.
	// The only position where spanLength==0 before a span is pos==length.
	// Otherwise, an unlimited code point span is only tried again when no
	// strings match, and if such a non-initial span fails we stop.
	if (offsets.isEmpty()) {
	return pos; // No strings matched before a span.
	}
	// Match strings from before the next string match.
	} else {
	// The position is before a string match (or a single code point).
	if (offsets.isEmpty()) {
	// No more strings matched before a previous string match.
	// Try another code point span from before the last string match.
	int oldPos = pos;
	pos = spanSet.spanBack(s, oldPos, SpanCondition.CONTAINED);
	spanLength = oldPos - pos;
	if (pos == 0 \|\| // Reached the start of the string, or
	spanLength == 0 // neither strings nor span progressed.
	) {
	return pos;
	}
	continue; // spanLength>0: Match strings from before a span.
	} else {
	// Try to match only one code point from before a string match if some
	// string matched beyond it, so that we try all possible positions
	// and don't overshoot.
	spanLength = spanOneBack(spanSet, s, pos);
	if (spanLength > 0) {
	if (spanLength == pos) {
	return 0; // Reached the start of the string.
	}
	// Match strings before this code point.
	// There cannot be any decrements below it because UnicodeSet strings
	// contain multiple code points.
	pos -= spanLength;
	offsets.shift(spanLength);
	spanLength = 0;
	continue; // Match strings from before a single code point.
	}
	// Match strings from before the next string match.
	}
	}
	pos -= offsets.popMinimum(null);
	spanLength = 0; // Match strings from before a string match.
	}
	}

	/**
	* Algorithm for spanNot()==span(SpanCondition.NOT_CONTAINED)
	*
	* Theoretical algorithm:
	* - Iterate through the string, and at each code point boundary:
	* + If the code point there is in the set, then return with the current position.
	* + If a set string matches at the current position, then return with the current position.
	*
	* Optimized implementation:
	*
	* (Same assumption as for span() above.)
	*
	* Create and cache a spanNotSet which contains
	* all of the single code points of the original set but none of its strings.
	* For each set string add its initial code point to the spanNotSet.
	* (Also add its final code point for spanNotBack().)
	*
	* - Loop:
	* + Do spanLength=spanNotSet.span(SpanCondition.NOT_CONTAINED).
	* + If the current code point is in the original set, then return the current position.
	* + If any set string matches at the current position, then return the current position.
	* + If there is no match at the current position, neither for the code point
	* there nor for any set string, then skip this code point and continue the loop.
	* This happens for set-string-initial code points that were added to spanNotSet
	* when there is not actually a match for such a set string.
	*
	* @param s The string to be spanned
	* @param start The start index that the span begins
	* @param outCount If not null: Receives the number of code points across the span.
	* @return the limit (exclusive end) of the span
	*/
	private int spanNot(CharSequence s, int start, OutputInt outCount) {
	    final int total = s.length();
	    final int stringCount = strings.size();
	    final boolean counting = (outCount != null);
	    int cursor = start;
	    int remaining;
	    int skipped = 0; // Running total reported through outCount.
	    do {
	        // Span until we find a code point from the set,
	        // or a code point that starts or ends some string.
	        final int stop;
	        if (counting) {
	            stop = spanNotSet.spanAndCount(s, cursor, SpanCondition.NOT_CONTAINED, outCount);
	            skipped += outCount.value;
	            outCount.value = skipped;
	        } else {
	            stop = spanNotSet.span(s, cursor, SpanCondition.NOT_CONTAINED);
	        }
	        if (stop == total) {
	            return total; // Reached the end of the string.
	        }
	        cursor = stop;
	        remaining = total - stop;

	        // Is the code point here in the original set
	        // (that is, without the added string starts and ends)?
	        final int codePointLength = spanOne(spanSet, s, cursor, remaining);
	        if (codePointLength > 0) {
	            return cursor; // There is a set element at this position.
	        }

	        // Does any relevant string match here?
	        for (int k = 0; k < stringCount; ++k) {
	            if (spanLengths[k] != ALL_CP_CONTAINED) {
	                final String candidate = strings.get(k);
	                final int candidateLength = candidate.length();
	                if (candidateLength <= remaining
	                        && matches16CPB(s, cursor, total, candidate, candidateLength)) {
	                    return cursor; // There is a set element at this position.
	                }
	            }
	        }

	        // The span(while not contained) ended on a string start/end which is
	        // not in the original set. Skip this code point and continue.
	        // codePointLength is negative here, so subtracting it moves forward.
	        cursor -= codePointLength;
	        remaining += codePointLength;
	        ++skipped;
	    } while (remaining != 0);
	    if (counting) {
	        outCount.value = skipped;
	    }
	    return total; // Reached the end of the string.
	}

	/**
	 * Backward counterpart of spanNot(): spans backwards from {@code length} while
	 * neither a set code point nor a set string ends at the current position.
	 *
	 * @param s The string to be spanned
	 * @param length The index in s from which to span backwards (exclusive limit)
	 * @return The string index which starts the span (i.e. inclusive).
	 */
	private int spanNotBack(CharSequence s, int length) {
	    int cursor = length;
	    final int stringCount = strings.size();
	    do {
	        // Span until we find a code point from the set,
	        // or a code point that starts or ends some string.
	        cursor = spanNotSet.spanBack(s, cursor, SpanCondition.NOT_CONTAINED);
	        if (cursor == 0) {
	            return 0; // Reached the start of the string.
	        }

	        // Is the code point before the cursor in the original set
	        // (that is, without the added string starts and ends)?
	        final int codePointLength = spanOneBack(spanSet, s, cursor);
	        if (codePointLength > 0) {
	            return cursor; // There is a set element ending here.
	        }

	        // Does any relevant string end here?
	        // The forward part of spanLengths is good enough: only the
	        // relevant/irrelevant distinction matters, and that is the same
	        // in either part of the array.
	        for (int k = 0; k < stringCount; ++k) {
	            if (spanLengths[k] != ALL_CP_CONTAINED) {
	                final String candidate = strings.get(k);
	                final int candidateLength = candidate.length();
	                if (candidateLength <= cursor
	                        && matches16CPB(s, cursor - candidateLength, length, candidate, candidateLength)) {
	                    return cursor; // There is a set element ending here.
	                }
	            }
	        }

	        // The span(while not contained) ended on a string start/end which is
	        // not in the original set. Skip this code point and continue.
	        // codePointLength is negative here, so adding it moves backwards.
	        cursor += codePointLength;
	    } while (cursor != 0);
	    return 0; // Reached the start of the string.
	}

	/**
	 * Converts a span length into the value stored in spanLengths:
	 * lengths below LONG_SPAN (0xfe) are stored as they are, anything longer
	 * is stored as LONG_SPAN.
	 *
	 * @param spanLength the span length to encode
	 * @return spanLength as a short, capped at LONG_SPAN
	 */
	static short makeSpanLengthByte(int spanLength) {
	    if (spanLength >= LONG_SPAN) {
	        return LONG_SPAN;
	    }
	    return (short) spanLength;
	}

	/**
	 * Checks whether the first {@code length} chars of {@code t} occur in {@code s}
	 * beginning at {@code start}. No argument checks are performed; callers are
	 * expected to pass {@code length > 0}.
	 *
	 * <p>The comparison runs from the last char of the window towards the first.
	 *
	 * @param s      the text to look in
	 * @param start  index in {@code s} where the compared window begins
	 * @param t      the string whose leading {@code length} chars are compared
	 * @param length number of chars to compare
	 * @return {@code true} if all compared chars are equal
	 */
	private static boolean matches16(CharSequence s, int start, final String t, int length) {
	    for (int k = length - 1; k >= 0; --k) {
	        if (s.charAt(start + k) != t.charAt(k)) {
	            return false;
	        }
	    }
	    return true;
	}

	/**
	 * Matches a 16-bit Unicode string (possibly malformed UTF-16) inside another one,
	 * accepting the match only when it lies on code point boundaries:
	 * neither edge of the match may fall in the middle of a surrogate pair.
	 *
	 * @param s The string to match in.
	 * @param start The start index of s.
	 * @param limit The limit of the subsequence of s being spanned.
	 * @param t The substring to be matched in s.
	 * @param tlength The length of t.
	 * @return true if t occurs in s at start and both edges are code point boundaries
	 */
	static boolean matches16CPB(CharSequence s, int start, int limit, final String t, int tlength) {
	    if (!matches16(s, start, t, tlength)) {
	        return false;
	    }

	    // Leading edge: reject if the match begins on the trail half of a surrogate pair.
	    if (start > 0
	            && Character.isHighSurrogate(s.charAt(start - 1))
	            && Character.isLowSurrogate(s.charAt(start))) {
	        return false;
	    }

	    // Trailing edge: reject if the match ends on the lead half of a surrogate pair.
	    final int end = start + tlength;
	    final boolean splitsTrailingPair = end < limit
	            && Character.isHighSurrogate(s.charAt(end - 1))
	            && Character.isLowSurrogate(s.charAt(end));
	    return !splitsTrailingPair;
	}

	/**
	 * Tests whether the set contains the code point that begins at {@code start}.
	 *
	 * <p>A lead surrogate followed by a trail surrogate is treated as one supplementary
	 * code point, but only when {@code length} is at least 2; in every other case the
	 * single char at {@code start} is tested on its own.
	 *
	 * @param set    the set to test against
	 * @param s      the text
	 * @param start  index of the first char of the code point
	 * @param length must be at least 2 for the char after {@code start} to be examined
	 * @return the length of the code point (1 or 2) if the set contains it,
	 *         otherwise the negative of that length
	 */
	static int spanOne(final UnicodeSet set, CharSequence s, int start, int length) {
	    final char lead = s.charAt(start);
	    final boolean isLeadSurrogate = 0xd800 <= lead && lead <= 0xdbff;
	    if (isLeadSurrogate && length >= 2) {
	        final char trail = s.charAt(start + 1);
	        if (UTF16.isTrailSurrogate(trail)) {
	            final int supplementary = UCharacterProperty.getRawSupplementary(lead, trail);
	            if (set.contains(supplementary)) {
	                return 2;
	            }
	            return -2;
	        }
	    }
	    if (set.contains(lead)) {
	        return 1;
	    }
	    return -1;
	}

	/**
	 * Tests whether the set contains the code point that ends just before index
	 * {@code length}, i.e. whose last char is {@code s.charAt(length - 1)}.
	 *
	 * <p>A trail surrogate preceded by a lead surrogate is treated as one supplementary
	 * code point, but only when {@code length} is at least 2; in every other case the
	 * single char at {@code length - 1} is tested on its own.
	 *
	 * @param set    the set to test against
	 * @param s      the text
	 * @param length index just past the last char of the code point
	 * @return the length of the code point (1 or 2) if the set contains it,
	 *         otherwise the negative of that length
	 */
	static int spanOneBack(final UnicodeSet set, CharSequence s, int length) {
	    final char trail = s.charAt(length - 1);
	    final boolean isTrailSurrogate = 0xdc00 <= trail && trail <= 0xdfff;
	    if (isTrailSurrogate && length >= 2) {
	        final char lead = s.charAt(length - 2);
	        if (UTF16.isLeadSurrogate(lead)) {
	            final int supplementary = UCharacterProperty.getRawSupplementary(lead, trail);
	            if (set.contains(supplementary)) {
	                return 2;
	            }
	            return -2;
	        }
	    }
	    if (set.contains(trail)) {
	        return 1;
	    }
	    return -1;
	}

	/**
	 * Helper class for UnicodeSetStringSpan.
	 *
	 * <p>Holds a list of offsets, relative to the current position, from which matching
	 * of a code point or a string should be tried.
	 * Offsets are stored instead of absolute indexes; this keeps the code simple and lets
	 * the same list serve both increments (in span()) and decrements (in spanBack()).
	 *
	 * <p>Assumption: The maximum offset is limited, and the offsets stored at any one time
	 * are relatively dense, that is,
	 * there are normally no gaps of hundreds or thousands of offset values.
	 *
	 * <p>Optionally, the class also tracks the minimum non-negative count for each position,
	 * intended to count the smallest number of elements of any path leading to that position.
	 *
	 * <p>The implementation is a circular buffer of count integers.
	 * Each slot tells whether the corresponding offset is in the list,
	 * and carries its path element count.
	 * This avoids inserting into a sorted list of offsets (or absolute indexes)
	 * and physically moving part of the list.
	 *
	 * <p>Note: In principle, the caller should setMaxLength() to
	 * the maximum of the max string length and U16_LENGTH/U8_LENGTH
	 * to account for "long" single code points.
	 *
	 * <p>Note: An earlier version did not track counts and stored only byte flags.
	 * With boolean flags, if maxLength were guaranteed to be no more than 32 or 64,
	 * the list could be stored as bit flags in a single integer.
	 * Rather than handling a circular buffer with a start list index,
	 * the integer would simply be shifted when lower offsets are removed.
	 * UnicodeSet does not have a limit on the lengths of strings.
	 */
	private static final class OffsetList {
	    /** Circular buffer: a non-zero slot marks a stored offset and holds its count. */
	    private int[] list;
	    /** Number of offsets currently stored. */
	    private int length;
	    /** Buffer index relative to which all offsets are measured. */
	    private int start;

	    public OffsetList() {
	        list = new int[16]; // default size
	    }

	    /**
	     * Makes sure the buffer has at least maxLength slots and empties the list.
	     * The buffer is only ever grown, never shrunk.
	     */
	    public void setMaxLength(int maxLength) {
	        if (list.length < maxLength) {
	            list = new int[maxLength];
	        }
	        clear();
	    }

	    /** Removes all offsets and resets the buffer origin. */
	    public void clear() {
	        java.util.Arrays.fill(list, 0);
	        length = 0;
	        start = 0;
	    }

	    public boolean isEmpty() {
	        return length == 0;
	    }

	    /**
	     * Maps an offset to its slot in the circular buffer,
	     * wrapping around the end of the array at most once.
	     *
	     * @param offset [1..maxLength]
	     */
	    private int slotFor(int offset) {
	        final int slot = start + offset;
	        return slot < list.length ? slot : slot - list.length;
	    }

	    /**
	     * Reduces all stored offsets by delta, used when the current position moves by delta.
	     * There must not be any offsets lower than delta.
	     * If there is an offset equal to delta, it is removed.
	     *
	     * @param delta [1..maxLength]
	     */
	    public void shift(int delta) {
	        final int slot = slotFor(delta);
	        if (list[slot] != 0) {
	            list[slot] = 0;
	            --length;
	        }
	        // Moving the origin implicitly lowers every remaining offset by delta.
	        start = slot;
	    }

	    /**
	     * Adds an offset. The list must not contain it yet.
	     * @param offset [1..maxLength]
	     */
	    public void addOffset(int offset) {
	        final int slot = slotFor(offset);
	        assert list[slot] == 0;
	        list[slot] = 1;
	        ++length;
	    }

	    /**
	     * Adds an offset and updates its count.
	     * The list may already contain the offset; in that case the smaller count wins.
	     * @param offset [1..maxLength]
	     */
	    public void addOffsetAndCount(int offset, int count) {
	        assert count > 0;
	        final int slot = slotFor(offset);
	        final int existing = list[slot];
	        if (existing == 0) {
	            list[slot] = count;
	            ++length;
	        } else if (count < existing) {
	            list[slot] = count;
	        }
	    }

	    /**
	     * @param offset [1..maxLength]
	     */
	    public boolean containsOffset(int offset) {
	        return list[slotFor(offset)] != 0;
	    }

	    /**
	     * Tells whether the offset is stored with a count no greater than the given one.
	     * @param offset [1..maxLength]
	     */
	    public boolean hasCountAtOffset(int offset, int count) {
	        final int stored = list[slotFor(offset)];
	        return stored != 0 && stored <= count;
	    }

	    /**
	     * Finds the lowest stored offset from a non-empty list, removes it,
	     * and reduces all other offsets by this minimum.
	     * If outCount is not null, it receives the count that was stored with that offset.
	     * @return min=[1..maxLength]
	     */
	    public int popMinimum(OutputInt outCount) {
	        final int capacity = list.length;

	        // First look in list[start+1..capacity-1].
	        int slot = start + 1;
	        while (slot < capacity && list[slot] == 0) {
	            ++slot;
	        }

	        final int minimum;
	        if (slot < capacity) {
	            minimum = slot - start;
	        } else {
	            // Wrap around and look in list[0..start].
	            // Since the list is not empty, there will be one.
	            slot = 0;
	            while (list[slot] == 0) {
	                ++slot;
	            }
	            minimum = capacity - start + slot;
	        }

	        final int count = list[slot];
	        list[slot] = 0;
	        --length;
	        start = slot;
	        if (outCount != null) {
	            outCount.value = count;
	        }
	        return minimum;
	    }
	}
	}

Back to index...