| 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 */  | 
 | 
 | 
 | 
/*  | 
 | 
 *  | 
 | 
 * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved  | 
 | 
 * (C) Copyright IBM Corp. 1996 - 2002 - All Rights Reserved  | 
 | 
 *  | 
 | 
 * The original version of this source code and documentation  | 
 | 
 * is copyrighted and owned by Taligent, Inc., a wholly-owned  | 
 | 
 * subsidiary of IBM. These materials are provided under terms  | 
 | 
 * of a License Agreement between Taligent and Sun. This technology  | 
 | 
 * is protected by multiple US and International patents.  | 
 | 
 *  | 
 | 
 * This notice and attribution to Taligent may not be removed.  | 
 | 
 * Taligent is a registered trademark of Taligent, Inc.  | 
 | 
 */  | 
 | 
 | 
 | 
package sun.util.locale.provider;  | 
 | 
 | 
 | 
import java.io.IOException;  | 
 | 
import java.text.CharacterIterator;  | 
 | 
import java.util.ArrayList;  | 
 | 
import java.util.List;  | 
 | 
import java.util.Stack;  | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 */  | 
 | 
class DictionaryBasedBreakIterator extends RuleBasedBreakIterator { | 
 | 
 | 
 | 
      | 
 | 
 | 
 | 
 | 
 | 
     */  | 
 | 
    private BreakDictionary dictionary;  | 
 | 
 | 
 | 
      | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
     */  | 
 | 
    private boolean[] categoryFlags;  | 
 | 
 | 
 | 
      | 
 | 
 | 
 | 
 | 
 | 
     */  | 
 | 
    private int dictionaryCharCount;  | 
 | 
 | 
 | 
      | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
     */  | 
 | 
    private int[] cachedBreakPositions;  | 
 | 
 | 
 | 
      | 
 | 
 | 
 | 
 | 
 | 
     */  | 
 | 
    private int positionInCache;  | 
 | 
 | 
 | 
      | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
     */  | 
 | 
    DictionaryBasedBreakIterator(String dataFile, String dictionaryFile)  | 
 | 
                                        throws IOException { | 
 | 
        super(dataFile);  | 
 | 
        byte[] tmp = super.getAdditionalData();  | 
 | 
        if (tmp != null) { | 
 | 
            prepareCategoryFlags(tmp);  | 
 | 
            super.setAdditionalData(null);  | 
 | 
        }  | 
 | 
        dictionary = new BreakDictionary(dictionaryFile);  | 
 | 
    }  | 
 | 
 | 
 | 
    private void prepareCategoryFlags(byte[] data) { | 
 | 
        categoryFlags = new boolean[data.length];  | 
 | 
        for (int i = 0; i < data.length; i++) { | 
 | 
            categoryFlags[i] = (data[i] == (byte)1) ? true : false;  | 
 | 
        }  | 
 | 
    }  | 
 | 
 | 
 | 
    @Override  | 
 | 
    public void setText(CharacterIterator newText) { | 
 | 
        super.setText(newText);  | 
 | 
        cachedBreakPositions = null;  | 
 | 
        dictionaryCharCount = 0;  | 
 | 
        positionInCache = 0;  | 
 | 
    }  | 
 | 
 | 
 | 
      | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
     */  | 
 | 
    @Override  | 
 | 
    public int first() { | 
 | 
        cachedBreakPositions = null;  | 
 | 
        dictionaryCharCount = 0;  | 
 | 
        positionInCache = 0;  | 
 | 
        return super.first();  | 
 | 
    }  | 
 | 
 | 
 | 
      | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
     */  | 
 | 
    @Override  | 
 | 
    public int last() { | 
 | 
        cachedBreakPositions = null;  | 
 | 
        dictionaryCharCount = 0;  | 
 | 
        positionInCache = 0;  | 
 | 
        return super.last();  | 
 | 
    }  | 
 | 
 | 
 | 
      | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
     */  | 
 | 
    @Override  | 
 | 
    public int previous() { | 
 | 
        CharacterIterator text = getText();  | 
 | 
 | 
 | 
        // if we have cached break positions and we're still in the range  | 
 | 
          | 
 | 
        if (cachedBreakPositions != null && positionInCache > 0) { | 
 | 
            --positionInCache;  | 
 | 
            text.setIndex(cachedBreakPositions[positionInCache]);  | 
 | 
            return cachedBreakPositions[positionInCache];  | 
 | 
        }  | 
 | 
 | 
 | 
        // otherwise, dump the cache and use the inherited previous() method to move  | 
 | 
        // backward.  This may fill up the cache with new break positions, in which  | 
 | 
          | 
 | 
        else { | 
 | 
            cachedBreakPositions = null;  | 
 | 
            int result = super.previous();  | 
 | 
            if (cachedBreakPositions != null) { | 
 | 
                positionInCache = cachedBreakPositions.length - 2;  | 
 | 
            }  | 
 | 
            return result;  | 
 | 
        }  | 
 | 
    }  | 
 | 
 | 
 | 
      | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
     */  | 
 | 
    @Override  | 
 | 
    public int preceding(int offset) { | 
 | 
        CharacterIterator text = getText();  | 
 | 
        checkOffset(offset, text);  | 
 | 
 | 
 | 
        // if we have no cached break positions, or "offset" is outside the  | 
 | 
        // range covered by the cache, we can just call the inherited routine  | 
 | 
        // (which will eventually call other routines in this class that may  | 
 | 
          | 
 | 
        if (cachedBreakPositions == null || offset <= cachedBreakPositions[0] ||  | 
 | 
                offset > cachedBreakPositions[cachedBreakPositions.length - 1]) { | 
 | 
            cachedBreakPositions = null;  | 
 | 
            return super.preceding(offset);  | 
 | 
        }  | 
 | 
 | 
 | 
        // on the other hand, if "offset" is within the range covered by the cache,  | 
 | 
        // then all we have to do is search the cache for the last break position  | 
 | 
          | 
 | 
        else { | 
 | 
            positionInCache = 0;  | 
 | 
            while (positionInCache < cachedBreakPositions.length  | 
 | 
                   && offset > cachedBreakPositions[positionInCache]) { | 
 | 
                ++positionInCache;  | 
 | 
            }  | 
 | 
            --positionInCache;  | 
 | 
            text.setIndex(cachedBreakPositions[positionInCache]);  | 
 | 
            return text.getIndex();  | 
 | 
        }  | 
 | 
    }  | 
 | 
 | 
 | 
      | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
     */  | 
 | 
    @Override  | 
 | 
    public int following(int offset) { | 
 | 
        CharacterIterator text = getText();  | 
 | 
        checkOffset(offset, text);  | 
 | 
 | 
 | 
        // if we have no cached break positions, or if "offset" is outside the  | 
 | 
        // range covered by the cache, then dump the cache and call our  | 
 | 
        // inherited following() method.  This will call other methods in this  | 
 | 
          | 
 | 
        if (cachedBreakPositions == null || offset < cachedBreakPositions[0] ||  | 
 | 
                offset >= cachedBreakPositions[cachedBreakPositions.length - 1]) { | 
 | 
            cachedBreakPositions = null;  | 
 | 
            return super.following(offset);  | 
 | 
        }  | 
 | 
 | 
 | 
        // on the other hand, if "offset" is within the range covered by the  | 
 | 
        // cache, then just search the cache for the first break position  | 
 | 
          | 
 | 
        else { | 
 | 
            positionInCache = 0;  | 
 | 
            while (positionInCache < cachedBreakPositions.length  | 
 | 
                   && offset >= cachedBreakPositions[positionInCache]) { | 
 | 
                ++positionInCache;  | 
 | 
            }  | 
 | 
            text.setIndex(cachedBreakPositions[positionInCache]);  | 
 | 
            return text.getIndex();  | 
 | 
        }  | 
 | 
    }  | 
 | 
 | 
 | 
      | 
 | 
 | 
 | 
     */  | 
 | 
    @Override  | 
 | 
    protected int handleNext() { | 
 | 
        CharacterIterator text = getText();  | 
 | 
 | 
 | 
        // if there are no cached break positions, or if we've just moved  | 
 | 
        // off the end of the range covered by the cache, we have to dump  | 
 | 
          | 
 | 
        if (cachedBreakPositions == null ||  | 
 | 
            positionInCache == cachedBreakPositions.length - 1) { | 
 | 
 | 
 | 
            // start by using the inherited handleNext() to find a tentative return  | 
 | 
            // value.   dictionaryCharCount tells us how many dictionary characters  | 
 | 
              | 
 | 
            int startPos = text.getIndex();  | 
 | 
            dictionaryCharCount = 0;  | 
 | 
            int result = super.handleNext();  | 
 | 
 | 
 | 
            // if we passed over more than one dictionary character, then we use  | 
 | 
            // divideUpDictionaryRange() to regenerate the cached break positions  | 
 | 
              | 
 | 
            if (dictionaryCharCount > 1 && result - startPos > 1) { | 
 | 
                divideUpDictionaryRange(startPos, result);  | 
 | 
            }  | 
 | 
 | 
 | 
            // otherwise, the value we got back from the inherited fuction  | 
 | 
              | 
 | 
            else { | 
 | 
                cachedBreakPositions = null;  | 
 | 
                return result;  | 
 | 
            }  | 
 | 
        }  | 
 | 
 | 
 | 
        // if the cache of break positions has been regenerated (or existed all  | 
 | 
        // along), then just advance to the next break position in the cache  | 
 | 
          | 
 | 
        if (cachedBreakPositions != null) { | 
 | 
            ++positionInCache;  | 
 | 
            text.setIndex(cachedBreakPositions[positionInCache]);  | 
 | 
            return cachedBreakPositions[positionInCache];  | 
 | 
        }  | 
 | 
        return -9999;     | 
 | 
    }  | 
 | 
 | 
 | 
      | 
 | 
 | 
 | 
     */  | 
 | 
    @Override  | 
 | 
    protected int lookupCategory(int c) { | 
 | 
        // this override of lookupCategory() exists only to keep track of whether we've  | 
 | 
        // passed over any dictionary characters.  It calls the inherited lookupCategory()  | 
 | 
        // to do the real work, and then checks whether its return value is one of the  | 
 | 
        // categories represented in the dictionary.  If it is, bump the dictionary-  | 
 | 
          | 
 | 
        int result = super.lookupCategory(c);  | 
 | 
        if (result != RuleBasedBreakIterator.IGNORE && categoryFlags[result]) { | 
 | 
            ++dictionaryCharCount;  | 
 | 
        }  | 
 | 
        return result;  | 
 | 
    }  | 
 | 
 | 
 | 
      | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
 | 
     */  | 
 | 
    @SuppressWarnings("unchecked") | 
 | 
    private void divideUpDictionaryRange(int startPos, int endPos) { | 
 | 
        CharacterIterator text = getText();  | 
 | 
 | 
 | 
        // the range we're dividing may begin or end with non-dictionary characters  | 
 | 
        // (i.e., for line breaking, we may have leading or trailing punctuation  | 
 | 
        // that needs to be kept with the word).  Seek from the beginning of the  | 
 | 
          | 
 | 
        text.setIndex(startPos);  | 
 | 
        int c = getCurrent();  | 
 | 
        int category = lookupCategory(c);  | 
 | 
        while (category == IGNORE || !categoryFlags[category]) { | 
 | 
            c = getNext();  | 
 | 
            category = lookupCategory(c);  | 
 | 
        }  | 
 | 
 | 
 | 
        // initialize.  We maintain two stacks: currentBreakPositions contains  | 
 | 
        // the list of break positions that will be returned if we successfully  | 
 | 
        // finish traversing the whole range now.  possibleBreakPositions lists  | 
 | 
        // all other possible word ends we've passed along the way.  (Whenever  | 
 | 
        // we reach an error [a sequence of characters that can't begin any word  | 
 | 
        // in the dictionary], we back up, possibly delete some breaks from  | 
 | 
        // currentBreakPositions, move a break from possibleBreakPositions  | 
 | 
        // to currentBreakPositions, and start over from there.  This process  | 
 | 
        // continues in this way until we either successfully make it all the way  | 
 | 
        // across the range, or exhaust all of our combinations of break  | 
 | 
          | 
 | 
        Stack<Integer> currentBreakPositions = new Stack<>();  | 
 | 
        Stack<Integer> possibleBreakPositions = new Stack<>();  | 
 | 
        List<Integer> wrongBreakPositions = new ArrayList<>();  | 
 | 
 | 
 | 
        // the dictionary is implemented as a trie, which is treated as a state  | 
 | 
        // machine.  -1 represents the end of a legal word.  Every word in the  | 
 | 
        // dictionary is represented by a path from the root node to -1.  A path  | 
 | 
          | 
 | 
        int state = 0;  | 
 | 
 | 
 | 
        // these two variables are used for error handling.  We keep track of the  | 
 | 
        // farthest we've gotten through the range being divided, and the combination  | 
 | 
        // of breaks that got us that far.  If we use up all possible break  | 
 | 
        // combinations, the text contains an error or a word that's not in the  | 
 | 
        // dictionary.  In this case, we "bless" the break positions that got us the  | 
 | 
        // farthest as real break positions, and then start over from scratch with  | 
 | 
          | 
 | 
        int farthestEndPoint = text.getIndex();  | 
 | 
        Stack<Integer> bestBreakPositions = null;  | 
 | 
 | 
 | 
          | 
 | 
        c = getCurrent();  | 
 | 
        while (true) { | 
 | 
 | 
 | 
            // if we can transition to state "-1" from our current state, we're  | 
 | 
            // on the last character of a legal word.  Push that position onto  | 
 | 
              | 
 | 
            if (dictionary.getNextState(state, 0) == -1) { | 
 | 
                possibleBreakPositions.push(text.getIndex());  | 
 | 
            }  | 
 | 
 | 
 | 
              | 
 | 
            state = dictionary.getNextStateFromCharacter(state, c);  | 
 | 
 | 
 | 
            // if the character we're sitting on causes us to transition to  | 
 | 
            // the "end of word" state, then it was a non-dictionary character  | 
 | 
            // and we've successfully traversed the whole range.  Drop out  | 
 | 
              | 
 | 
            if (state == -1) { | 
 | 
                currentBreakPositions.push(text.getIndex());  | 
 | 
                break;  | 
 | 
            }  | 
 | 
 | 
 | 
            // if the character we're sitting on causes us to transition to  | 
 | 
            // the error state, or if we've gone off the end of the range  | 
 | 
            // without transitioning to the "end of word" state, we've hit  | 
 | 
              | 
 | 
            else if (state == 0 || text.getIndex() >= endPos) { | 
 | 
 | 
 | 
                // if this is the farthest we've gotten, take note of it in  | 
 | 
                  | 
 | 
                if (text.getIndex() > farthestEndPoint) { | 
 | 
                    farthestEndPoint = text.getIndex();  | 
 | 
 | 
 | 
                    @SuppressWarnings("unchecked") | 
 | 
                    Stack<Integer> currentBreakPositionsCopy = (Stack<Integer>) currentBreakPositions.clone();  | 
 | 
 | 
 | 
                    bestBreakPositions = currentBreakPositionsCopy;  | 
 | 
                }  | 
 | 
 | 
 | 
                // wrongBreakPositions is a list of all break positions  | 
 | 
                // we've tried starting that didn't allow us to traverse  | 
 | 
                // all the way through the text.  Every time we pop a  | 
 | 
                // break position off of currentBreakPositions, we put it  | 
 | 
                // into wrongBreakPositions to avoid trying it again later.  | 
 | 
                // If we make it to this spot, we're either going to back  | 
 | 
                // up to a break in possibleBreakPositions and try starting  | 
 | 
                // over from there, or we've exhausted all possible break  | 
 | 
                // positions and are going to do the fallback procedure.  | 
 | 
                // This loop prevents us from messing with anything in  | 
 | 
                // possibleBreakPositions that didn't work as a starting  | 
 | 
                // point the last time we tried it (this is to prevent a bunch of  | 
 | 
                  | 
 | 
                while (!possibleBreakPositions.isEmpty()  | 
 | 
                        && wrongBreakPositions.contains(possibleBreakPositions.peek())) { | 
 | 
                    possibleBreakPositions.pop();  | 
 | 
                }  | 
 | 
 | 
 | 
                // if we've used up all possible break-position combinations, there's  | 
 | 
                // an error or an unknown word in the text.  In this case, we start  | 
 | 
                // over, treating the farthest character we've reached as the beginning  | 
 | 
                // of the range, and "blessing" the break positions that got us that  | 
 | 
                  | 
 | 
                if (possibleBreakPositions.isEmpty()) { | 
 | 
                    if (bestBreakPositions != null) { | 
 | 
                        currentBreakPositions = bestBreakPositions;  | 
 | 
                        if (farthestEndPoint < endPos) { | 
 | 
                            text.setIndex(farthestEndPoint + 1);  | 
 | 
                        }  | 
 | 
                        else { | 
 | 
                            break;  | 
 | 
                        }  | 
 | 
                    }  | 
 | 
                    else { | 
 | 
                        if ((currentBreakPositions.size() == 0 ||  | 
 | 
                             currentBreakPositions.peek().intValue() != text.getIndex())  | 
 | 
                            && text.getIndex() != startPos) { | 
 | 
                            currentBreakPositions.push(new Integer(text.getIndex()));  | 
 | 
                        }  | 
 | 
                        getNext();  | 
 | 
                        currentBreakPositions.push(new Integer(text.getIndex()));  | 
 | 
                    }  | 
 | 
                }  | 
 | 
 | 
 | 
                // if we still have more break positions we can try, then promote the  | 
 | 
                // last break in possibleBreakPositions into currentBreakPositions,  | 
 | 
                // and get rid of all entries in currentBreakPositions that come after  | 
 | 
                // it.  Then back up to that position and start over from there (i.e.,  | 
 | 
                  | 
 | 
                else { | 
 | 
                    Integer temp = possibleBreakPositions.pop();  | 
 | 
                    Integer temp2 = null;  | 
 | 
                    while (!currentBreakPositions.isEmpty() && temp.intValue() <  | 
 | 
                           currentBreakPositions.peek().intValue()) { | 
 | 
                        temp2 = currentBreakPositions.pop();  | 
 | 
                        wrongBreakPositions.add(temp2);  | 
 | 
                    }  | 
 | 
                    currentBreakPositions.push(temp);  | 
 | 
                    text.setIndex(currentBreakPositions.peek().intValue());  | 
 | 
                }  | 
 | 
 | 
 | 
                // re-sync "c" for the next go-round, and drop out of the loop if  | 
 | 
                  | 
 | 
                c = getCurrent();  | 
 | 
                if (text.getIndex() >= endPos) { | 
 | 
                    break;  | 
 | 
                }  | 
 | 
            }  | 
 | 
 | 
 | 
            // if we didn't hit any exceptional conditions on this last iteration,  | 
 | 
              | 
 | 
            else { | 
 | 
                c = getNext();  | 
 | 
            }  | 
 | 
        }  | 
 | 
 | 
 | 
        // dump the last break position in the list, and replace it with the actual  | 
 | 
        // end of the range (which may be the same character, or may be further on  | 
 | 
        // because the range actually ended with non-dictionary characters we want to  | 
 | 
          | 
 | 
        if (!currentBreakPositions.isEmpty()) { | 
 | 
            currentBreakPositions.pop();  | 
 | 
        }  | 
 | 
        currentBreakPositions.push(endPos);  | 
 | 
 | 
 | 
        // create a regular array to hold the break positions and copy  | 
 | 
        // the break positions from the stack to the array (in addition,  | 
 | 
        // our starting position goes into this array as a break position).  | 
 | 
        // This array becomes the cache of break positions used by next()  | 
 | 
          | 
 | 
        cachedBreakPositions = new int[currentBreakPositions.size() + 1];  | 
 | 
        cachedBreakPositions[0] = startPos;  | 
 | 
 | 
 | 
        for (int i = 0; i < currentBreakPositions.size(); i++) { | 
 | 
            cachedBreakPositions[i + 1] = currentBreakPositions.elementAt(i).intValue();  | 
 | 
        }  | 
 | 
        positionInCache = 0;  | 
 | 
    }  | 
 | 
}  |