|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
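/* NOTE(review): the header comment that originally ended here (presumably the license notice) is missing from this copy; restore it from the upstream source. */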
|
|
|
/* |
|
* (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved |
|
* (C) Copyright IBM Corp. 1996 - 1998 - All Rights Reserved |
|
* |
|
* The original version of this source code and documentation |
|
* is copyrighted and owned by Taligent, Inc., a wholly-owned |
|
* subsidiary of IBM. These materials are provided under terms |
|
* of a License Agreement between Taligent and Sun. This technology |
|
* is protected by multiple US and International patents. |
|
* |
|
* This notice and attribution to Taligent may not be removed. |
|
* Taligent is a registered trademark of Taligent, Inc. |
|
* |
|
*/ |
|
|
|
package java.text; |
|
|
|
import java.lang.ref.SoftReference; |
|
import java.text.spi.BreakIteratorProvider; |
|
import java.util.Locale; |
|
import sun.util.locale.provider.LocaleProviderAdapter; |
|
import sun.util.locale.provider.LocaleServiceProviderPool; |
|
|
|
|
|
/** |
|
* The <code>BreakIterator</code> class implements methods for finding |
|
* the location of boundaries in text. Instances of <code>BreakIterator</code> |
|
* maintain a current position and scan over text |
|
* returning the index of characters where boundaries occur. |
|
* Internally, <code>BreakIterator</code> scans text using a |
|
* <code>CharacterIterator</code>, and is thus able to scan text held |
|
* by any object implementing that protocol. A <code>StringCharacterIterator</code> |
|
* is used to scan <code>String</code> objects passed to <code>setText</code>. |
|
* |
|
* <p> |
|
* You use the factory methods provided by this class to create |
|
* instances of various types of break iterators. In particular, |
|
* use <code>getWordInstance</code>, <code>getLineInstance</code>, |
|
* <code>getSentenceInstance</code>, and <code>getCharacterInstance</code> |
|
* to create <code>BreakIterator</code>s that perform |
|
* word, line, sentence, and character boundary analysis respectively. |
|
* A single <code>BreakIterator</code> can work only on one unit |
|
* (word, line, sentence, and so on). You must use a different iterator |
|
* for each unit boundary analysis you wish to perform. |
|
* |
|
* <p><a id="line"></a> |
|
* Line boundary analysis determines where a text string can be |
|
* broken when line-wrapping. The mechanism correctly handles |
|
* punctuation and hyphenated words. Actual line breaking needs |
|
* to also consider the available line width and is handled by |
|
* higher-level software. |
|
* |
|
* <p><a id="sentence"></a> |
|
* Sentence boundary analysis allows selection with correct interpretation |
|
* of periods within numbers and abbreviations, and trailing punctuation |
|
* marks such as quotation marks and parentheses. |
|
* |
|
* <p><a id="word"></a> |
|
* Word boundary analysis is used by search and replace functions, as |
|
* well as within text editing applications that allow the user to |
|
* select words with a double click. Word selection provides correct |
|
* interpretation of punctuation marks within and following |
|
* words. Characters that are not part of a word, such as symbols |
|
* or punctuation marks, have word-breaks on both sides. |
|
* |
|
* <p><a id="character"></a> |
|
* Character boundary analysis allows users to interact with characters |
|
* as they expect to, for example, when moving the cursor through a text |
|
* string. Character boundary analysis provides correct navigation |
|
* through character strings, regardless of how the character is stored. |
|
* The boundaries returned may be those of supplementary characters, |
|
* combining character sequences, or ligature clusters. |
|
* For example, an accented character might be stored as a base character |
|
* and a diacritical mark. What users consider to be a character can |
|
* differ between languages. |
|
* |
|
* <p> |
|
* The <code>BreakIterator</code> instances returned by the factory methods |
|
* of this class are intended for use with natural languages only, not for |
|
* programming language text. It is however possible to define subclasses |
|
* that tokenize a programming language. |
|
* |
|
* <P> |
|
* <strong>Examples</strong>:<P> |
|
* Creating and using text boundaries: |
|
* <blockquote> |
|
* <pre> |
|
* public static void main(String args[]) { |
|
* if (args.length == 1) { |
|
* String stringToExamine = args[0]; |
|
* //print each word in order |
|
* BreakIterator boundary = BreakIterator.getWordInstance(); |
|
* boundary.setText(stringToExamine); |
|
* printEachForward(boundary, stringToExamine); |
|
* //print each sentence in reverse order |
|
* boundary = BreakIterator.getSentenceInstance(Locale.US); |
|
* boundary.setText(stringToExamine); |
|
* printEachBackward(boundary, stringToExamine); |
|
* printFirst(boundary, stringToExamine); |
|
* printLast(boundary, stringToExamine); |
|
* } |
|
* } |
|
* </pre> |
|
* </blockquote> |
|
* |
|
* Print each element in order: |
|
* <blockquote> |
|
* <pre> |
|
* public static void printEachForward(BreakIterator boundary, String source) { |
|
* int start = boundary.first(); |
|
* for (int end = boundary.next(); |
|
* end != BreakIterator.DONE; |
|
* start = end, end = boundary.next()) { |
|
* System.out.println(source.substring(start,end)); |
|
* } |
|
* } |
|
* </pre> |
|
* </blockquote> |
|
* |
|
* Print each element in reverse order: |
|
* <blockquote> |
|
* <pre> |
|
* public static void printEachBackward(BreakIterator boundary, String source) { |
|
* int end = boundary.last(); |
|
* for (int start = boundary.previous(); |
|
* start != BreakIterator.DONE; |
|
* end = start, start = boundary.previous()) { |
|
* System.out.println(source.substring(start,end)); |
|
* } |
|
* } |
|
* </pre> |
|
* </blockquote> |
|
* |
|
* Print first element: |
|
* <blockquote> |
|
* <pre> |
|
* public static void printFirst(BreakIterator boundary, String source) { |
|
* int start = boundary.first(); |
|
* int end = boundary.next(); |
|
* System.out.println(source.substring(start,end)); |
|
* } |
|
* </pre> |
|
* </blockquote> |
|
* |
|
* Print last element: |
|
* <blockquote> |
|
* <pre> |
|
* public static void printLast(BreakIterator boundary, String source) { |
|
* int end = boundary.last(); |
|
* int start = boundary.previous(); |
|
* System.out.println(source.substring(start,end)); |
|
* } |
|
* </pre> |
|
* </blockquote> |
|
* |
|
* Print the element at a specified position: |
|
* <blockquote> |
|
* <pre> |
|
* public static void printAt(BreakIterator boundary, int pos, String source) { |
|
* int end = boundary.following(pos); |
|
* int start = boundary.previous(); |
|
* System.out.println(source.substring(start,end)); |
|
* } |
|
* </pre> |
|
* </blockquote> |
|
* |
|
* Find the next word: |
|
* <blockquote> |
|
* <pre>{@code |
|
* public static int nextWordStartAfter(int pos, String text) { |
|
* BreakIterator wb = BreakIterator.getWordInstance(); |
|
* wb.setText(text); |
|
* int last = wb.following(pos); |
|
* int current = wb.next(); |
|
* while (current != BreakIterator.DONE) { |
|
* for (int p = last; p < current; p++) { |
|
* if (Character.isLetter(text.codePointAt(p))) |
|
* return last; |
|
* } |
|
* last = current; |
|
* current = wb.next(); |
|
* } |
|
* return BreakIterator.DONE; |
|
* } |
|
* }</pre> |
|
* (The iterator returned by BreakIterator.getWordInstance() is unique in that |
|
* the break positions it returns don't represent both the start and end of the |
|
* thing being iterated over. That is, a sentence-break iterator returns breaks |
|
* that each represent the end of one sentence and the beginning of the next. |
|
* With the word-break iterator, the characters between two boundaries might be a |
|
* word, or they might be the punctuation or whitespace between two words. The |
|
* above code uses a simple heuristic to determine which boundary is the beginning |
|
* of a word: If the characters between this boundary and the next boundary |
|
* include at least one letter (this can be an alphabetical letter, a CJK ideograph, |
|
* a Hangul syllable, a Kana character, etc.), then the text between this boundary |
|
* and the next is a word; otherwise, it's the material between words.) |
|
* </blockquote> |
|
* |
|
* @since 1.1 |
|
* @see CharacterIterator |
|
* |
|
*/ |
|
|
|
public abstract class BreakIterator implements Cloneable |
|
{ |
|
|
|
|
|
*/ |
|
protected BreakIterator() |
|
{ |
|
} |
|
|
|
|
|
|
|
|
|
*/ |
|
@Override |
|
public Object clone() |
|
{ |
|
try { |
|
return super.clone(); |
|
} |
|
catch (CloneNotSupportedException e) { |
|
throw new InternalError(e); |
|
} |
|
} |
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
public static final int DONE = -1; |
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
public abstract int first(); |
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
public abstract int last(); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
public abstract int next(int n); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
public abstract int next(); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
public abstract int previous(); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
public abstract int following(int offset); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
public int preceding(int offset) { |
|
// NOTE: This implementation is here solely because we can't add new |
|
// abstract methods to an existing class. There is almost ALWAYS a |
|
|
|
int pos = following(offset); |
|
while (pos >= offset && pos != DONE) { |
|
pos = previous(); |
|
} |
|
return pos; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
public boolean isBoundary(int offset) { |
|
// NOTE: This implementation probably is wrong for most situations |
|
// because it fails to take into account the possibility that a |
|
// CharacterIterator passed to setText() may not have a begin offset |
|
// of 0. But since the abstract BreakIterator doesn't have that |
|
// knowledge, it assumes the begin offset is 0. If you subclass |
|
// BreakIterator, copy the SimpleTextBoundary implementation of this |
|
// function into your subclass. [This should have been abstract at |
|
|
|
if (offset == 0) { |
|
return true; |
|
} |
|
int boundary = following(offset - 1); |
|
if (boundary == DONE) { |
|
throw new IllegalArgumentException(); |
|
} |
|
return boundary == offset; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
public abstract int current(); |
|
|
|
|
|
|
|
|
|
*/ |
|
public abstract CharacterIterator getText(); |
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
public void setText(String newText) |
|
{ |
|
setText(new StringCharacterIterator(newText)); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
public abstract void setText(CharacterIterator newText); |
|
|
|
private static final int CHARACTER_INDEX = 0; |
|
private static final int WORD_INDEX = 1; |
|
private static final int LINE_INDEX = 2; |
|
private static final int SENTENCE_INDEX = 3; |
|
|
|
@SuppressWarnings("unchecked") |
|
private static final SoftReference<BreakIteratorCache>[] iterCache = (SoftReference<BreakIteratorCache>[]) new SoftReference<?>[4]; |
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
public static BreakIterator getWordInstance() |
|
{ |
|
return getWordInstance(Locale.getDefault()); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
public static BreakIterator getWordInstance(Locale locale) |
|
{ |
|
return getBreakInstance(locale, WORD_INDEX); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
public static BreakIterator getLineInstance() |
|
{ |
|
return getLineInstance(Locale.getDefault()); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
public static BreakIterator getLineInstance(Locale locale) |
|
{ |
|
return getBreakInstance(locale, LINE_INDEX); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
public static BreakIterator getCharacterInstance() |
|
{ |
|
return getCharacterInstance(Locale.getDefault()); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
public static BreakIterator getCharacterInstance(Locale locale) |
|
{ |
|
return getBreakInstance(locale, CHARACTER_INDEX); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
public static BreakIterator getSentenceInstance() |
|
{ |
|
return getSentenceInstance(Locale.getDefault()); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
public static BreakIterator getSentenceInstance(Locale locale) |
|
{ |
|
return getBreakInstance(locale, SENTENCE_INDEX); |
|
} |
|
|
|
private static BreakIterator getBreakInstance(Locale locale, int type) { |
|
if (iterCache[type] != null) { |
|
BreakIteratorCache cache = iterCache[type].get(); |
|
if (cache != null) { |
|
if (cache.getLocale().equals(locale)) { |
|
return cache.createBreakInstance(); |
|
} |
|
} |
|
} |
|
|
|
BreakIterator result = createBreakInstance(locale, type); |
|
BreakIteratorCache cache = new BreakIteratorCache(locale, result); |
|
iterCache[type] = new SoftReference<>(cache); |
|
return result; |
|
} |
|
|
|
private static BreakIterator createBreakInstance(Locale locale, |
|
int type) { |
|
LocaleProviderAdapter adapter = LocaleProviderAdapter.getAdapter(BreakIteratorProvider.class, locale); |
|
BreakIterator iterator = createBreakInstance(adapter, locale, type); |
|
if (iterator == null) { |
|
iterator = createBreakInstance(LocaleProviderAdapter.forJRE(), locale, type); |
|
} |
|
return iterator; |
|
} |
|
|
|
private static BreakIterator createBreakInstance(LocaleProviderAdapter adapter, Locale locale, int type) { |
|
BreakIteratorProvider breakIteratorProvider = adapter.getBreakIteratorProvider(); |
|
BreakIterator iterator = null; |
|
switch (type) { |
|
case CHARACTER_INDEX: |
|
iterator = breakIteratorProvider.getCharacterInstance(locale); |
|
break; |
|
case WORD_INDEX: |
|
iterator = breakIteratorProvider.getWordInstance(locale); |
|
break; |
|
case LINE_INDEX: |
|
iterator = breakIteratorProvider.getLineInstance(locale); |
|
break; |
|
case SENTENCE_INDEX: |
|
iterator = breakIteratorProvider.getSentenceInstance(locale); |
|
break; |
|
} |
|
return iterator; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
public static synchronized Locale[] getAvailableLocales() |
|
{ |
|
LocaleServiceProviderPool pool = |
|
LocaleServiceProviderPool.getPool(BreakIteratorProvider.class); |
|
return pool.getAvailableLocales(); |
|
} |
|
|
|
private static final class BreakIteratorCache { |
|
|
|
private BreakIterator iter; |
|
private Locale locale; |
|
|
|
BreakIteratorCache(Locale locale, BreakIterator iter) { |
|
this.locale = locale; |
|
this.iter = (BreakIterator) iter.clone(); |
|
} |
|
|
|
Locale getLocale() { |
|
return locale; |
|
} |
|
|
|
BreakIterator createBreakInstance() { |
|
return (BreakIterator) iter.clone(); |
|
} |
|
} |
|
} |