/*
 * Copyright (c) 2019, 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

// (c) 2018 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
// created: 2018may10 Markus W. Scherer
package jdk.internal.icu.util;

import java.util.Iterator;
import java.util.NoSuchElementException;
/**
 * Abstract map from Unicode code points (U+0000..U+10FFFF) to integer values.
 * This does not implement java.util.Map.
 *
 * <p>Subclasses provide {@link #get(int)} and
 * {@link #getRange(int, ValueFilter, Range)}; range iteration with surrogate
 * handling, the {@link Iterable} implementation and string iteration are
 * built on top of those two methods.
 *
 * @stable ICU 63
 */
public abstract class CodePointMap implements Iterable<CodePointMap.Range> {
    /**
     * Selectors for how getRange() should report value ranges overlapping with surrogates.
     * Most users should use NORMAL.
     *
     * @see #getRange
     * @stable ICU 63
     */
    public enum RangeOption {
        /**
         * getRange() enumerates all same-value ranges as stored in the map.
         * Most users should use this option.
         *
         * @stable ICU 63
         */
        NORMAL,
        /**
         * getRange() enumerates all same-value ranges as stored in the map,
         * except that lead surrogates (U+D800..U+DBFF) are treated as having the
         * surrogateValue, which is passed to getRange() as a separate parameter.
         * The surrogateValue is not transformed via filter().
         * See {@link Character#isHighSurrogate}.
         *
         * <p>Most users should use NORMAL instead.
         *
         * <p>This option is useful for maps that map surrogate code *units* to
         * special values optimized for UTF-16 string processing
         * or for special error behavior for unpaired surrogates,
         * but those values are not to be associated with the lead surrogate code *points*.
         *
         * @stable ICU 63
         */
        FIXED_LEAD_SURROGATES,
        /**
         * getRange() enumerates all same-value ranges as stored in the map,
         * except that all surrogates (U+D800..U+DFFF) are treated as having the
         * surrogateValue, which is passed to getRange() as a separate parameter.
         * The surrogateValue is not transformed via filter().
         * See {@link Character#isSurrogate}.
         *
         * <p>Most users should use NORMAL instead.
         *
         * <p>This option is useful for maps that map surrogate code *units* to
         * special values optimized for UTF-16 string processing
         * or for special error behavior for unpaired surrogates,
         * but those values are not to be associated with the surrogate code *points*.
         *
         * @stable ICU 63
         */
        FIXED_ALL_SURROGATES
    }

    /**
     * Callback function interface: Modifies a map value.
     * Optionally called by getRange().
     * The modified value will be returned by the getRange() function.
     *
     * <p>Can be used to ignore some of the value bits,
     * make a filter for one of several values,
     * return a value index computed from the map value, etc.
     *
     * <p>This interface has a single abstract method, so a lambda expression
     * or method reference can be used wherever a filter is expected.
     *
     * @see #getRange
     * @see #iterator
     * @stable ICU 63
     */
    @FunctionalInterface
    public interface ValueFilter {
        /**
         * Modifies the map value.
         *
         * @param value map value
         * @return modified value
         * @stable ICU 63
         */
        public int apply(int value);
    }

    /**
     * Range iteration result data.
     * Code points from start to end map to the same value.
     * The value may have been modified by {@link ValueFilter#apply(int)},
     * or it may be the surrogateValue if a RangeOption other than "normal" was used.
     *
     * <p>Instances are mutable: getRange() and the range iterator overwrite
     * the same object on every call.
     *
     * @see #getRange
     * @see #iterator
     * @stable ICU 63
     */
    public static final class Range {
        private int start;
        private int end;
        private int value;

        /**
         * Constructor. Sets start and end to -1 and value to 0.
         *
         * @stable ICU 63
         */
        public Range() {
            start = end = -1;
            value = 0;
        }

        /**
         * @return the start code point
         * @stable ICU 63
         */
        public int getStart() { return start; }

        /**
         * @return the (inclusive) end code point
         * @stable ICU 63
         */
        public int getEnd() { return end; }

        /**
         * @return the range value
         * @stable ICU 63
         */
        public int getValue() { return value; }

        /**
         * Sets the range. When using {@link #iterator()},
         * iteration will resume after the newly set end.
         *
         * @param start new start code point
         * @param end new end code point
         * @param value new value
         * @stable ICU 63
         */
        public void set(int start, int end, int value) {
            this.start = start;
            this.end = end;
            this.value = value;
        }
    }

    /**
     * Iterator over all same-value ranges of the enclosing map, without filtering.
     * A single {@link Range} object is reused: each call to {@link #next()}
     * overwrites it with the range that starts right after its current end.
     */
    private final class RangeIterator implements Iterator<Range> {
        /** The one Range object handed out by every call to next(); starts with end == -1. */
        private final Range range = new Range();

        /**
         * @return true while the current range end lies in -1..U+10FFFE,
         *         that is, while there is a code point after it to start from
         */
        @Override
        public boolean hasNext() {
            return -1 <= range.end && range.end < 0x10ffff;
        }

        /**
         * Fetches the range beginning right after the current range end.
         *
         * @return the shared Range object, updated in place
         * @throws NoSuchElementException if getRange() reports that
         *         the next start is not a valid code point
         */
        @Override
        public Range next() {
            if (getRange(range.end + 1, null, range)) {
                return range;
            } else {
                throw new NoSuchElementException();
            }
        }

        /**
         * Not supported.
         *
         * @throws UnsupportedOperationException always
         */
        @Override
        public final void remove() {
            throw new UnsupportedOperationException();
        }
    }

    /**
     * Iterates over code points of a string and fetches map values.
     * This does not implement java.util.Iterator.
     *
     * <pre>
     * void onString(CodePointMap map, CharSequence s, int start) {
     *     CodePointMap.StringIterator iter = map.stringIterator(s, start);
     *     while (iter.next()) {
     *         int end = iter.getIndex();  // code point from between start and end
     *         useValue(s, start, end, iter.getCodePoint(), iter.getValue());
     *         start = end;
     *     }
     * }
     * </pre>
     *
     * <p>This class is not intended for public subclassing.
     *
     * @stable ICU 63
     */
    public class StringIterator {
        /**
         * @internal
         * @deprecated This API is ICU internal only.
         */
        @Deprecated
        protected CharSequence s;
        /**
         * @internal
         * @deprecated This API is ICU internal only.
         */
        @Deprecated
        protected int sIndex;
        /**
         * @internal
         * @deprecated This API is ICU internal only.
         */
        @Deprecated
        protected int c;
        /**
         * @internal
         * @deprecated This API is ICU internal only.
         */
        @Deprecated
        protected int value;

        /**
         * @internal
         * @deprecated This API is ICU internal only.
         */
        @Deprecated
        protected StringIterator(CharSequence s, int sIndex) {
            this.s = s;
            this.sIndex = sIndex;
            c = -1;
            value = 0;
        }

        /**
         * Resets the iterator to a new string and/or a new string index.
         * The code point is reset to -1 and the value to 0.
         *
         * @param s string to iterate over
         * @param sIndex string index where the iteration will start
         * @stable ICU 63
         */
        public void reset(CharSequence s, int sIndex) {
            this.s = s;
            this.sIndex = sIndex;
            c = -1;
            value = 0;
        }

        /**
         * Reads the next code point, post-increments the string index,
         * and gets a value from the map.
         * Sets an implementation-defined error value if the code point is an unpaired surrogate.
         *
         * @return true if the string index was not yet at the end of the string;
         *         otherwise the iterator did not advance
         * @stable ICU 63
         */
        public boolean next() {
            if (sIndex >= s.length()) {
                return false;
            }
            c = Character.codePointAt(s, sIndex);
            sIndex += Character.charCount(c);
            value = get(c);
            return true;
        }

        /**
         * Reads the previous code point, pre-decrements the string index,
         * and gets a value from the map.
         * Sets an implementation-defined error value if the code point is an unpaired surrogate.
         *
         * @return true if the string index was not yet at the start of the string;
         *         otherwise the iterator did not advance
         * @stable ICU 63
         */
        public boolean previous() {
            if (sIndex <= 0) {
                return false;
            }
            c = Character.codePointBefore(s, sIndex);
            sIndex -= Character.charCount(c);
            value = get(c);
            return true;
        }

        /**
         * @return the string index
         * @stable ICU 63
         */
        public final int getIndex() { return sIndex; }

        /**
         * @return the code point
         * @stable ICU 63
         */
        public final int getCodePoint() { return c; }

        /**
         * @return the map value,
         *         or an implementation-defined error value if
         *         the code point is an unpaired surrogate
         * @stable ICU 63
         */
        public final int getValue() { return value; }
    }

    /**
     * Protected no-args constructor.
     *
     * @stable ICU 63
     */
    protected CodePointMap() {
    }

    /**
     * Returns the value for a code point as stored in the map, with range checking.
     * Returns an implementation-defined error value if c is not in the range 0..U+10FFFF.
     *
     * @param c the code point
     * @return the map value,
     *         or an implementation-defined error value if
     *         the code point is not in the range 0..U+10FFFF
     * @stable ICU 63
     */
    public abstract int get(int c);

    /**
     * Sets the range object to a range of code points beginning with the start parameter.
     * The range start is the same as the start input parameter
     * (even if there are preceding code points that have the same value).
     * The range end is the last code point such that
     * all those from start to there have the same value.
     * Returns false if start is not 0..U+10FFFF.
     * Can be used to efficiently iterate over all same-value ranges in a map.
     * (This is normally faster than iterating over code points and get()ting each value,
     * but may be much slower than a data structure that stores ranges directly.)
     *
     * <p>If the {@link ValueFilter} parameter is not null, then
     * the value to be delivered is passed through that filter, and the return value is the end
     * of the range where all values are modified to the same actual value.
     * The value is unchanged if that parameter is null.
     *
     * <p>Example:
     * <pre>
     * int start = 0;
     * CodePointMap.Range range = new CodePointMap.Range();
     * while (map.getRange(start, null, range)) {
     *     int end = range.getEnd();
     *     int value = range.getValue();
     *     // Work with the range start..end and its value.
     *     start = end + 1;
     * }
     * </pre>
     *
     * @param start range start
     * @param filter an object that may modify the map data value,
     *     or null if the values from the map are to be used unmodified
     * @param range the range object that will be set to the code point range and value
     * @return true if start is 0..U+10FFFF; otherwise no new range is fetched
     * @stable ICU 63
     */
    public abstract boolean getRange(int start, ValueFilter filter, Range range);

    /**
     * Sets the range object to a range of code points beginning with the start parameter.
     * The range start is the same as the start input parameter
     * (even if there are preceding code points that have the same value).
     * The range end is the last code point such that
     * all those from start to there have the same value.
     * Returns false if start is not 0..U+10FFFF.
     *
     * <p>Same as the simpler {@link #getRange(int, ValueFilter, Range)} but optionally
     * modifies the range if it overlaps with surrogate code points.
     *
     * @param start range start
     * @param option defines whether surrogates are treated normally,
     *               or as having the surrogateValue; usually {@link RangeOption#NORMAL}.
     *               Must not be null; this is checked only by an assertion.
     * @param surrogateValue value for surrogates; ignored if option=={@link RangeOption#NORMAL}
     * @param filter an object that may modify the map data value,
     *     or null if the values from the map are to be used unmodified
     * @param range the range object that will be set to the code point range and value
     * @return true if start is 0..U+10FFFF; otherwise no new range is fetched
     * @stable ICU 63
     */
    public boolean getRange(int start, RangeOption option, int surrogateValue,
            ValueFilter filter, Range range) {
        assert option != null;
        if (!getRange(start, filter, range)) {
            return false;
        }
        if (option == RangeOption.NORMAL) {
            return true;
        }
        // Last code point that is forced to the surrogateValue under this option.
        int surrEnd = option == RangeOption.FIXED_ALL_SURROGATES ? 0xdfff : 0xdbff;
        int end = range.end;
        if (end < 0xd7ff || start > surrEnd) {
            // The range neither touches the fixed surrogates nor ends right before them.
            return true;
        }
        // The range overlaps with surrogates, or ends just before the first one.
        if (range.value == surrogateValue) {
            if (end >= surrEnd) {
                // Surrogates followed by a non-surrValue range,
                // or surrogates are part of a larger surrValue range.
                return true;
            }
        } else {
            if (start <= 0xd7ff) {
                range.end = 0xd7ff;  // Non-surrValue range ends before surrValue surrogates.
                return true;
            }
            // Start is a surrogate with a non-surrValue code *unit* value.
            // Return a surrValue code *point* range.
            range.value = surrogateValue;
            if (end > surrEnd) {
                range.end = surrEnd;  // Surrogate range ends before non-surrValue rest of range.
                return true;
            }
        }
        // See if the surrValue surrogate range can be merged with
        // an immediately following range.
        if (getRange(surrEnd + 1, filter, range) && range.value == surrogateValue) {
            range.start = start;
            return true;
        }
        // No merge: the lookup above overwrote the range, so restore it to
        // start..surrEnd with the surrogateValue.
        range.start = start;
        range.end = surrEnd;
        range.value = surrogateValue;
        return true;
    }

    /**
     * Convenience iterator over same-map-value code point ranges.
     * Same as looping over all ranges with {@link #getRange(int, ValueFilter, Range)}
     * without filtering.
     * Adjacent ranges have different map values.
     *
     * <p>The iterator always returns the same Range object.
     *
     * @return a Range iterator
     * @stable ICU 63
     */
    @Override
    public Iterator<Range> iterator() {
        return new RangeIterator();
    }

    /**
     * Returns an iterator (not a java.util.Iterator) over code points of a string
     * for fetching map values.
     *
     * @param s string to iterate over
     * @param sIndex string index where the iteration will start
     * @return the iterator
     * @stable ICU 63
     */
    public StringIterator stringIterator(CharSequence s, int sIndex) {
        return new StringIterator(s, sIndex);
    }
}