|
|
|
|
|
/* NOTE(review): the leading header comment is missing from this copy; only its terminator survived. */
|
/* |
|
* Licensed to the Apache Software Foundation (ASF) under one or more |
|
* contributor license agreements. See the NOTICE file distributed with |
|
* this work for additional information regarding copyright ownership. |
|
* The ASF licenses this file to You under the Apache License, Version 2.0 |
|
* (the "License"); you may not use this file except in compliance with |
|
* the License. You may obtain a copy of the License at |
|
* |
|
* http://www.apache.org/licenses/LICENSE-2.0 |
|
* |
|
* Unless required by applicable law or agreed to in writing, software |
|
* distributed under the License is distributed on an "AS IS" BASIS, |
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
* See the License for the specific language governing permissions and |
|
* limitations under the License. |
|
*/ |
|
|
|
package com.sun.org.apache.xerces.internal.impl.xpath.regex; |
|
|
|
import java.io.IOException; |
|
import java.io.ObjectInputStream; |
|
import java.io.ObjectOutputStream; |
|
import java.io.ObjectStreamField; |
|
import java.util.ArrayList; |
|
import java.util.Collections; |
|
import java.util.HashMap; |
|
import java.util.HashSet; |
|
import java.util.List; |
|
import java.util.Map; |
|
import java.util.Set; |
|
import java.util.Vector; |
|
|
|
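/**
 * A node of a parsed regular-expression tree. The {@code type} field selects
 * the node kind; nested subclasses supply children, strings, and other data.
 */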
|
class Token implements java.io.Serializable { |
|
|
|
private static final long serialVersionUID = 8484976002585487481L; |
|
|
|
static final boolean COUNTTOKENS = true; |
|
static int tokens = 0; |
|
|
|
static final int CHAR = 0; |
|
static final int DOT = 11; |
|
static final int CONCAT = 1; |
|
static final int UNION = 2; |
|
static final int CLOSURE = 3; |
|
static final int RANGE = 4; |
|
static final int NRANGE = 5; |
|
static final int PAREN = 6; |
|
static final int EMPTY = 7; |
|
static final int ANCHOR = 8; |
|
static final int NONGREEDYCLOSURE = 9; |
|
static final int STRING = 10; |
|
static final int BACKREFERENCE = 12; |
|
static final int LOOKAHEAD = 20; |
|
static final int NEGATIVELOOKAHEAD = 21; |
|
static final int LOOKBEHIND = 22; |
|
static final int NEGATIVELOOKBEHIND = 23; |
|
static final int INDEPENDENT = 24; |
|
static final int MODIFIERGROUP = 25; |
|
static final int CONDITION = 26; |
|
|
|
static final int UTF16_MAX = 0x10ffff; |
|
|
|
final int type; |
|
|
|
static Token token_dot; |
|
static Token token_0to9; |
|
static Token token_wordchars; |
|
static Token token_not_0to9; |
|
static Token token_not_wordchars; |
|
static Token token_spaces; |
|
static Token token_not_spaces; |
|
static Token token_empty; |
|
static Token token_linebeginning; |
|
static Token token_linebeginning2; |
|
static Token token_lineend; |
|
static Token token_stringbeginning; |
|
static Token token_stringend; |
|
static Token token_stringend2; |
|
static Token token_wordedge; |
|
static Token token_not_wordedge; |
|
static Token token_wordbeginning; |
|
static Token token_wordend; |
|
static { |
|
Token.token_empty = new Token(Token.EMPTY); |
|
|
|
Token.token_linebeginning = Token.createAnchor('^'); |
|
Token.token_linebeginning2 = Token.createAnchor('@'); |
|
Token.token_lineend = Token.createAnchor('$'); |
|
Token.token_stringbeginning = Token.createAnchor('A'); |
|
Token.token_stringend = Token.createAnchor('z'); |
|
Token.token_stringend2 = Token.createAnchor('Z'); |
|
Token.token_wordedge = Token.createAnchor('b'); |
|
Token.token_not_wordedge = Token.createAnchor('B'); |
|
Token.token_wordbeginning = Token.createAnchor('<'); |
|
Token.token_wordend = Token.createAnchor('>'); |
|
|
|
Token.token_dot = new Token(Token.DOT); |
|
|
|
Token.token_0to9 = Token.createRange(); |
|
Token.token_0to9.addRange('0', '9'); |
|
Token.token_wordchars = Token.createRange(); |
|
Token.token_wordchars.addRange('0', '9'); |
|
Token.token_wordchars.addRange('A', 'Z'); |
|
Token.token_wordchars.addRange('_', '_'); |
|
Token.token_wordchars.addRange('a', 'z'); |
|
Token.token_spaces = Token.createRange(); |
|
Token.token_spaces.addRange('\t', '\t'); |
|
Token.token_spaces.addRange('\n', '\n'); |
|
Token.token_spaces.addRange('\f', '\f'); |
|
Token.token_spaces.addRange('\r', '\r'); |
|
Token.token_spaces.addRange(' ', ' '); |
|
|
|
Token.token_not_0to9 = Token.complementRanges(Token.token_0to9); |
|
Token.token_not_wordchars = Token.complementRanges(Token.token_wordchars); |
|
Token.token_not_spaces = Token.complementRanges(Token.token_spaces); |
|
} |
|
|
|
static Token.ParenToken createLook(int type, Token child) { |
|
if (COUNTTOKENS) Token.tokens ++; |
|
return new Token.ParenToken(type, child, 0); |
|
} |
|
static Token.ParenToken createParen(Token child, int pnumber) { |
|
if (COUNTTOKENS) Token.tokens ++; |
|
return new Token.ParenToken(Token.PAREN, child, pnumber); |
|
} |
|
static Token.ClosureToken createClosure(Token tok) { |
|
if (COUNTTOKENS) Token.tokens ++; |
|
return new Token.ClosureToken(Token.CLOSURE, tok); |
|
} |
|
static Token.ClosureToken createNGClosure(Token tok) { |
|
if (COUNTTOKENS) Token.tokens ++; |
|
return new Token.ClosureToken(Token.NONGREEDYCLOSURE, tok); |
|
} |
|
static Token.ConcatToken createConcat(Token tok1, Token tok2) { |
|
if (COUNTTOKENS) Token.tokens ++; |
|
return new Token.ConcatToken(tok1, tok2); |
|
} |
|
static Token.UnionToken createConcat() { |
|
if (COUNTTOKENS) Token.tokens ++; |
|
return new Token.UnionToken(Token.CONCAT); |
|
} |
|
static Token.UnionToken createUnion() { |
|
if (COUNTTOKENS) Token.tokens ++; |
|
return new Token.UnionToken(Token.UNION); |
|
} |
|
static Token createEmpty() { |
|
return Token.token_empty; |
|
} |
|
static RangeToken createRange() { |
|
if (COUNTTOKENS) Token.tokens ++; |
|
return new RangeToken(Token.RANGE); |
|
} |
|
static RangeToken createNRange() { |
|
if (COUNTTOKENS) Token.tokens ++; |
|
return new RangeToken(Token.NRANGE); |
|
} |
|
static Token.CharToken createChar(int ch) { |
|
if (COUNTTOKENS) Token.tokens ++; |
|
return new Token.CharToken(Token.CHAR, ch); |
|
} |
|
static private Token.CharToken createAnchor(int ch) { |
|
if (COUNTTOKENS) Token.tokens ++; |
|
return new Token.CharToken(Token.ANCHOR, ch); |
|
} |
|
static Token.StringToken createBackReference(int refno) { |
|
if (COUNTTOKENS) Token.tokens ++; |
|
return new Token.StringToken(Token.BACKREFERENCE, null, refno); |
|
} |
|
static Token.StringToken createString(String str) { |
|
if (COUNTTOKENS) Token.tokens ++; |
|
return new Token.StringToken(Token.STRING, str, 0); |
|
} |
|
static Token.ModifierToken createModifierGroup(Token child, int add, int mask) { |
|
if (COUNTTOKENS) Token.tokens ++; |
|
return new Token.ModifierToken(child, add, mask); |
|
} |
|
static Token.ConditionToken createCondition(int refno, Token condition, |
|
Token yespat, Token nopat) { |
|
if (COUNTTOKENS) Token.tokens ++; |
|
return new Token.ConditionToken(refno, condition, yespat, nopat); |
|
} |
|
|
|
/**
 * Initializes the token with its type code. Does not touch the token counter;
 * counting is done by the factory methods.
 */
protected Token(int type) {
    this.type = type;
}
|
/*
 * Child access. The base implementations below describe a token without children.
 */
|
int size() { |
|
return 0; |
|
} |
|
Token getChild(int index) { |
|
return null; |
|
} |
|
void addChild(Token tok) { |
|
throw new RuntimeException("Not supported."); |
|
} |
|
|
|
|
|
protected void addRange(int start, int end) { |
|
throw new RuntimeException("Not supported."); |
|
} |
|
protected void sortRanges() { |
|
throw new RuntimeException("Not supported."); |
|
} |
|
protected void compactRanges() { |
|
throw new RuntimeException("Not supported."); |
|
} |
|
protected void mergeRanges(Token tok) { |
|
throw new RuntimeException("Not supported."); |
|
} |
|
protected void subtractRanges(Token tok) { |
|
throw new RuntimeException("Not supported."); |
|
} |
|
protected void intersectRanges(Token tok) { |
|
throw new RuntimeException("Not supported."); |
|
} |
|
static Token complementRanges(Token tok) { |
|
return RangeToken.complementRanges(tok); |
|
} |
|
|
|
|
|
void setMin(int min) { // for CLOSURE |
|
} |
|
void setMax(int max) { // for CLOSURE |
|
} |
|
int getMin() { |
|
return -1; |
|
} |
|
int getMax() { |
|
return -1; |
|
} |
|
int getReferenceNumber() { |
|
return 0; |
|
} |
|
String getString() { |
|
return null; |
|
} |
|
|
|
int getParenNumber() { |
|
return 0; |
|
} |
|
int getChar() { |
|
return -1; |
|
} |
|
|
|
public String toString() { |
|
return this.toString(0); |
|
} |
|
public String toString(int options) { |
|
return this.type == Token.DOT ? "." : ""; |
|
} |
|
/*
 * Length analysis: lower and upper bounds on the length a token can match.
 */
|
final int getMinLength() { |
|
switch (this.type) { |
|
case CONCAT: |
|
int sum = 0; |
|
for (int i = 0; i < this.size(); i ++) |
|
sum += this.getChild(i).getMinLength(); |
|
return sum; |
|
|
|
case CONDITION: |
|
case UNION: |
|
if (this.size() == 0) |
|
return 0; |
|
int ret = this.getChild(0).getMinLength(); |
|
for (int i = 1; i < this.size(); i ++) { |
|
int min = this.getChild(i).getMinLength(); |
|
if (min < ret) ret = min; |
|
} |
|
return ret; |
|
|
|
case CLOSURE: |
|
case NONGREEDYCLOSURE: |
|
if (this.getMin() >= 0) |
|
return this.getMin() * this.getChild(0).getMinLength(); |
|
return 0; |
|
|
|
case EMPTY: |
|
case ANCHOR: |
|
return 0; |
|
|
|
case DOT: |
|
case CHAR: |
|
case RANGE: |
|
case NRANGE: |
|
return 1; |
|
|
|
case INDEPENDENT: |
|
case PAREN: |
|
case MODIFIERGROUP: |
|
return this.getChild(0).getMinLength(); |
|
|
|
case BACKREFERENCE: |
|
return 0; |
|
|
|
case STRING: |
|
return this.getString().length(); |
|
|
|
case LOOKAHEAD: |
|
case NEGATIVELOOKAHEAD: |
|
case LOOKBEHIND: |
|
case NEGATIVELOOKBEHIND: |
|
return 0; |
|
|
|
default: |
|
throw new RuntimeException("Token#getMinLength(): Invalid Type: "+this.type); |
|
} |
|
} |
|
|
|
final int getMaxLength() { |
|
switch (this.type) { |
|
case CONCAT: |
|
int sum = 0; |
|
for (int i = 0; i < this.size(); i ++) { |
|
int d = this.getChild(i).getMaxLength(); |
|
if (d < 0) return -1; |
|
sum += d; |
|
} |
|
return sum; |
|
|
|
case CONDITION: |
|
case UNION: |
|
if (this.size() == 0) |
|
return 0; |
|
int ret = this.getChild(0).getMaxLength(); |
|
for (int i = 1; ret >= 0 && i < this.size(); i ++) { |
|
int max = this.getChild(i).getMaxLength(); |
|
if (max < 0) { |
|
ret = -1; |
|
break; |
|
} |
|
if (max > ret) ret = max; |
|
} |
|
return ret; |
|
|
|
case CLOSURE: |
|
case NONGREEDYCLOSURE: |
|
if (this.getMax() >= 0) |
|
// When this.child.getMaxLength() < 0, |
|
|
|
return this.getMax() * this.getChild(0).getMaxLength(); |
|
return -1; |
|
|
|
case EMPTY: |
|
case ANCHOR: |
|
return 0; |
|
|
|
case CHAR: |
|
return 1; |
|
case DOT: |
|
case RANGE: |
|
case NRANGE: |
|
return 2; |
|
|
|
case INDEPENDENT: |
|
case PAREN: |
|
case MODIFIERGROUP: |
|
return this.getChild(0).getMaxLength(); |
|
|
|
case BACKREFERENCE: |
|
return -1; |
|
|
|
case STRING: |
|
return this.getString().length(); |
|
|
|
case LOOKAHEAD: |
|
case NEGATIVELOOKAHEAD: |
|
case LOOKBEHIND: |
|
case NEGATIVELOOKBEHIND: |
|
return 0; |
|
|
|
default: |
|
throw new RuntimeException("Token#getMaxLength(): Invalid Type: "+this.type); |
|
} |
|
} |
|
|
|
static final int FC_CONTINUE = 0; |
|
static final int FC_TERMINAL = 1; |
|
static final int FC_ANY = 2; |
|
private static final boolean isSet(int options, int flag) { |
|
return (options & flag) == flag; |
|
} |
|
final int analyzeFirstCharacter(RangeToken result, int options) { |
|
switch (this.type) { |
|
case CONCAT: |
|
int ret = FC_CONTINUE; |
|
for (int i = 0; i < this.size(); i ++) |
|
if ((ret = this.getChild(i).analyzeFirstCharacter(result, options)) != FC_CONTINUE) |
|
break; |
|
return ret; |
|
|
|
case UNION: |
|
if (this.size() == 0) |
|
return FC_CONTINUE; |
|
|
|
|
|
|
|
|
|
*/ |
|
int ret2 = FC_CONTINUE; |
|
boolean hasEmpty = false; |
|
for (int i = 0; i < this.size(); i ++) { |
|
ret2 = this.getChild(i).analyzeFirstCharacter(result, options); |
|
if (ret2 == FC_ANY) |
|
break; |
|
else if (ret2 == FC_CONTINUE) |
|
hasEmpty = true; |
|
} |
|
return hasEmpty ? FC_CONTINUE : ret2; |
|
|
|
case CONDITION: |
|
int ret3 = this.getChild(0).analyzeFirstCharacter(result, options); |
|
if (this.size() == 1) return FC_CONTINUE; |
|
if (ret3 == FC_ANY) return ret3; |
|
int ret4 = this.getChild(1).analyzeFirstCharacter(result, options); |
|
if (ret4 == FC_ANY) return ret4; |
|
return ret3 == FC_CONTINUE || ret4 == FC_CONTINUE ? FC_CONTINUE : FC_TERMINAL; |
|
|
|
case CLOSURE: |
|
case NONGREEDYCLOSURE: |
|
this.getChild(0).analyzeFirstCharacter(result, options); |
|
return FC_CONTINUE; |
|
|
|
case EMPTY: |
|
case ANCHOR: |
|
return FC_CONTINUE; |
|
|
|
case CHAR: |
|
int ch = this.getChar(); |
|
result.addRange(ch, ch); |
|
if (ch < 0x10000 && isSet(options, RegularExpression.IGNORE_CASE)) { |
|
ch = Character.toUpperCase((char)ch); |
|
result.addRange(ch, ch); |
|
ch = Character.toLowerCase((char)ch); |
|
result.addRange(ch, ch); |
|
} |
|
return FC_TERMINAL; |
|
|
|
case DOT: |
|
return FC_ANY; |
|
|
|
case RANGE: |
|
result.mergeRanges(this); |
|
return FC_TERMINAL; |
|
|
|
case NRANGE: |
|
result.mergeRanges(Token.complementRanges(this)); |
|
return FC_TERMINAL; |
|
|
|
case INDEPENDENT: |
|
case PAREN: |
|
return this.getChild(0).analyzeFirstCharacter(result, options); |
|
|
|
case MODIFIERGROUP: |
|
options |= ((ModifierToken)this).getOptions(); |
|
options &= ~((ModifierToken)this).getOptionsMask(); |
|
return this.getChild(0).analyzeFirstCharacter(result, options); |
|
|
|
case BACKREFERENCE: |
|
result.addRange(0, UTF16_MAX); |
|
return FC_ANY; |
|
|
|
case STRING: |
|
int cha = this.getString().charAt(0); |
|
int ch2; |
|
if (REUtil.isHighSurrogate(cha) |
|
&& this.getString().length() >= 2 |
|
&& REUtil.isLowSurrogate((ch2 = this.getString().charAt(1)))) |
|
cha = REUtil.composeFromSurrogates(cha, ch2); |
|
result.addRange(cha, cha); |
|
if (cha < 0x10000 && isSet(options, RegularExpression.IGNORE_CASE)) { |
|
cha = Character.toUpperCase((char)cha); |
|
result.addRange(cha, cha); |
|
cha = Character.toLowerCase((char)cha); |
|
result.addRange(cha, cha); |
|
} |
|
return FC_TERMINAL; |
|
|
|
case LOOKAHEAD: |
|
case NEGATIVELOOKAHEAD: |
|
case LOOKBEHIND: |
|
case NEGATIVELOOKBEHIND: |
|
return FC_CONTINUE; |
|
|
|
default: |
|
throw new RuntimeException("Token#analyzeHeadCharacter(): Invalid Type: "+this.type); |
|
} |
|
} |
|
|
|
private final boolean isShorterThan(Token tok) { |
|
if (tok == null) return false; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
int mylength; |
|
if (this.type == STRING) mylength = this.getString().length(); |
|
else throw new RuntimeException("Internal Error: Illegal type: "+this.type); |
|
int otherlength; |
|
if (tok.type == STRING) otherlength = tok.getString().length(); |
|
else throw new RuntimeException("Internal Error: Illegal type: "+tok.type); |
|
return mylength < otherlength; |
|
} |
|
|
|
static class FixedStringContainer { |
|
Token token = null; |
|
int options = 0; |
|
FixedStringContainer() { |
|
} |
|
} |
|
|
|
final void findFixedString(FixedStringContainer container, int options) { |
|
switch (this.type) { |
|
case CONCAT: |
|
Token prevToken = null; |
|
int prevOptions = 0; |
|
for (int i = 0; i < this.size(); i ++) { |
|
this.getChild(i).findFixedString(container, options); |
|
if (prevToken == null || prevToken.isShorterThan(container.token)) { |
|
prevToken = container.token; |
|
prevOptions = container.options; |
|
} |
|
} |
|
container.token = prevToken; |
|
container.options = prevOptions; |
|
return; |
|
|
|
case UNION: |
|
case CLOSURE: |
|
case NONGREEDYCLOSURE: |
|
case EMPTY: |
|
case ANCHOR: |
|
case RANGE: |
|
case DOT: |
|
case NRANGE: |
|
case BACKREFERENCE: |
|
case LOOKAHEAD: |
|
case NEGATIVELOOKAHEAD: |
|
case LOOKBEHIND: |
|
case NEGATIVELOOKBEHIND: |
|
case CONDITION: |
|
container.token = null; |
|
return; |
|
|
|
case CHAR: |
|
container.token = null; |
|
return; |
|
|
|
case STRING: |
|
container.token = this; |
|
container.options = options; |
|
return; |
|
|
|
case INDEPENDENT: |
|
case PAREN: |
|
this.getChild(0).findFixedString(container, options); |
|
return; |
|
|
|
case MODIFIERGROUP: |
|
options |= ((ModifierToken)this).getOptions(); |
|
options &= ~((ModifierToken)this).getOptionsMask(); |
|
this.getChild(0).findFixedString(container, options); |
|
return; |
|
|
|
default: |
|
throw new RuntimeException("Token#findFixedString(): Invalid Type: "+this.type); |
|
} |
|
} |
|
|
|
boolean match(int ch) { |
|
throw new RuntimeException("NFAArrow#match(): Internal error: "+this.type); |
|
} |
|
|
|
|
|
private static volatile Map<String, Token> categories = null; |
|
private static volatile Map<String, Token> categories2 = null; |
|
private static final Object lock = new Object(); |
|
private static final String[] categoryNames = { |
|
"Cn", "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me", "Mc", "Nd", |
|
"Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", null, "Co", "Cs", |
|
"Pd", "Ps", "Pe", "Pc", "Po", "Sm", "Sc", "Sk", "So", |
|
"Pi", "Pf", |
|
"L", "M", "N", "Z", "C", "P", "S", |
|
}; |
|
|
|
// Schema Rec. {Datatypes} - Punctuation |
|
static final int CHAR_INIT_QUOTE = 29; |
|
static final int CHAR_FINAL_QUOTE = 30; |
|
static final int CHAR_LETTER = 31; |
|
static final int CHAR_MARK = 32; |
|
static final int CHAR_NUMBER = 33; |
|
static final int CHAR_SEPARATOR = 34; |
|
static final int CHAR_OTHER = 35; |
|
static final int CHAR_PUNCTUATION = 36; |
|
static final int CHAR_SYMBOL = 37; |
|
|
|
|
|
private static final String[] blockNames = { |
|
"Basic Latin", |
|
"Latin-1 Supplement", |
|
"Latin Extended-A", |
|
"Latin Extended-B", |
|
"IPA Extensions", |
|
"Spacing Modifier Letters", |
|
"Combining Diacritical Marks", |
|
"Greek", |
|
"Cyrillic", |
|
"Armenian", |
|
"Hebrew", |
|
"Arabic", |
|
"Syriac", |
|
"Thaana", |
|
"Devanagari", |
|
"Bengali", |
|
"Gurmukhi", |
|
"Gujarati", |
|
"Oriya", |
|
"Tamil", |
|
"Telugu", |
|
"Kannada", |
|
"Malayalam", |
|
"Sinhala", |
|
"Thai", |
|
"Lao", |
|
"Tibetan", |
|
"Myanmar", |
|
"Georgian", |
|
"Hangul Jamo", |
|
"Ethiopic", |
|
"Cherokee", |
|
"Unified Canadian Aboriginal Syllabics", |
|
"Ogham", |
|
"Runic", |
|
"Khmer", |
|
"Mongolian", |
|
"Latin Extended Additional", |
|
"Greek Extended", |
|
"General Punctuation", |
|
"Superscripts and Subscripts", |
|
"Currency Symbols", |
|
"Combining Marks for Symbols", |
|
"Letterlike Symbols", |
|
"Number Forms", |
|
"Arrows", |
|
"Mathematical Operators", |
|
"Miscellaneous Technical", |
|
"Control Pictures", |
|
"Optical Character Recognition", |
|
"Enclosed Alphanumerics", |
|
"Box Drawing", |
|
"Block Elements", |
|
"Geometric Shapes", |
|
"Miscellaneous Symbols", |
|
"Dingbats", |
|
"Braille Patterns", |
|
"CJK Radicals Supplement", |
|
"Kangxi Radicals", |
|
"Ideographic Description Characters", |
|
"CJK Symbols and Punctuation", |
|
"Hiragana", |
|
"Katakana", |
|
"Bopomofo", |
|
"Hangul Compatibility Jamo", |
|
"Kanbun", |
|
"Bopomofo Extended", |
|
"Enclosed CJK Letters and Months", |
|
"CJK Compatibility", |
|
"CJK Unified Ideographs Extension A", |
|
"CJK Unified Ideographs", |
|
"Yi Syllables", |
|
"Yi Radicals", |
|
"Hangul Syllables", |
|
"Private Use", |
|
"CJK Compatibility Ideographs", |
|
"Alphabetic Presentation Forms", |
|
"Arabic Presentation Forms-A", |
|
"Combining Half Marks", |
|
"CJK Compatibility Forms", |
|
"Small Form Variants", |
|
"Arabic Presentation Forms-B", |
|
"Specials", |
|
"Halfwidth and Fullwidth Forms", |
|
//missing Specials add manually |
|
/*10300..1032F;*/ "Old Italic", |
|
"Gothic", |
|
"Deseret", |
|
"Byzantine Musical Symbols", |
|
"Musical Symbols", |
|
"Mathematical Alphanumeric Symbols", |
|
"CJK Unified Ideographs Extension B", |
|
"CJK Compatibility Ideographs Supplement", |
|
"Tags", |
|
//missing 2 private use add manually |
|
|
|
}; |
|
//ADD THOSE MANUALLY |
|
//F0000..FFFFD; "Private Use", |
|
//100000..10FFFD; "Private Use" |
|
|
|
static final String blockRanges = |
|
"\u0000\u007F\u0080\u00FF\u0100\u017F\u0180\u024F\u0250\u02AF\u02B0\u02FF\u0300\u036F" |
|
+"\u0370\u03FF\u0400\u04FF\u0530\u058F\u0590\u05FF\u0600\u06FF\u0700\u074F\u0780\u07BF" |
|
+"\u0900\u097F\u0980\u09FF\u0A00\u0A7F\u0A80\u0AFF\u0B00\u0B7F\u0B80\u0BFF\u0C00\u0C7F\u0C80\u0CFF" |
|
+"\u0D00\u0D7F\u0D80\u0DFF\u0E00\u0E7F\u0E80\u0EFF\u0F00\u0FFF\u1000\u109F\u10A0\u10FF\u1100\u11FF" |
|
+"\u1200\u137F\u13A0\u13FF\u1400\u167F\u1680\u169F\u16A0\u16FF\u1780\u17FF\u1800\u18AF\u1E00\u1EFF" |
|
+"\u1F00\u1FFF\u2000\u206F\u2070\u209F\u20A0\u20CF\u20D0\u20FF\u2100\u214F\u2150\u218F\u2190\u21FF\u2200\u22FF" |
|
+"\u2300\u23FF\u2400\u243F\u2440\u245F\u2460\u24FF\u2500\u257F\u2580\u259F\u25A0\u25FF\u2600\u26FF\u2700\u27BF" |
|
+"\u2800\u28FF\u2E80\u2EFF\u2F00\u2FDF\u2FF0\u2FFF\u3000\u303F\u3040\u309F\u30A0\u30FF\u3100\u312F\u3130\u318F" |
|
+"\u3190\u319F\u31A0\u31BF\u3200\u32FF\u3300\u33FF\u3400\u4DB5\u4E00\u9FFF\uA000\uA48F\uA490\uA4CF" |
|
+"\uAC00\uD7A3\uE000\uF8FF\uF900\uFAFF\uFB00\uFB4F\uFB50\uFDFF" |
|
+"\uFE20\uFE2F\uFE30\uFE4F\uFE50\uFE6F\uFE70\uFEFE\uFEFF\uFEFF\uFF00\uFFEF"; |
|
static final int[] nonBMPBlockRanges = { |
|
0x10300, 0x1032F, |
|
0x10330, 0x1034F, |
|
0x10400, 0x1044F, |
|
0x1D000, 0x1D0FF, |
|
0x1D100, 0x1D1FF, |
|
0x1D400, 0x1D7FF, |
|
0x20000, 0x2A6D6, |
|
0x2F800, 0x2FA1F, |
|
0xE0000, 0xE007F |
|
}; |
|
private static final int NONBMP_BLOCK_START = 84; |
|
|
|
static protected RangeToken getRange(String name, boolean positive) { |
|
|
|
Map<String, Token> localCat = Token.categories; |
|
if (localCat == null) { |
|
synchronized (lock) { |
|
localCat = Token.categories; |
|
if (localCat == null) { |
|
Map<String, Token> tmpCat = new HashMap<>(); |
|
Map<String, Token> tmpCat2 = new HashMap<>(); |
|
|
|
Token[] ranges = new Token[Token.categoryNames.length]; |
|
for (int i = 0; i < ranges.length; i ++) { |
|
ranges[i] = Token.createRange(); |
|
} |
|
int type; |
|
for (int i = 0; i < 0x10000; i ++) { |
|
type = Character.getType((char)i); |
|
if (type == Character.START_PUNCTUATION || |
|
type == Character.END_PUNCTUATION) { |
|
|
|
if (i == 0x00AB || i == 0x2018 || i == 0x201B || i == 0x201C || |
|
i == 0x201F || i == 0x2039) { |
|
type = CHAR_INIT_QUOTE; |
|
} |
|
|
|
if (i == 0x00BB || i == 0x2019 || i == 0x201D || i == 0x203A ) { |
|
type = CHAR_FINAL_QUOTE; |
|
} |
|
} |
|
ranges[type].addRange(i, i); |
|
switch (type) { |
|
case Character.UPPERCASE_LETTER: |
|
case Character.LOWERCASE_LETTER: |
|
case Character.TITLECASE_LETTER: |
|
case Character.MODIFIER_LETTER: |
|
case Character.OTHER_LETTER: |
|
type = CHAR_LETTER; |
|
break; |
|
case Character.NON_SPACING_MARK: |
|
case Character.COMBINING_SPACING_MARK: |
|
case Character.ENCLOSING_MARK: |
|
type = CHAR_MARK; |
|
break; |
|
case Character.DECIMAL_DIGIT_NUMBER: |
|
case Character.LETTER_NUMBER: |
|
case Character.OTHER_NUMBER: |
|
type = CHAR_NUMBER; |
|
break; |
|
case Character.SPACE_SEPARATOR: |
|
case Character.LINE_SEPARATOR: |
|
case Character.PARAGRAPH_SEPARATOR: |
|
type = CHAR_SEPARATOR; |
|
break; |
|
case Character.CONTROL: |
|
case Character.FORMAT: |
|
case Character.SURROGATE: |
|
case Character.PRIVATE_USE: |
|
case Character.UNASSIGNED: |
|
type = CHAR_OTHER; |
|
break; |
|
case Character.CONNECTOR_PUNCTUATION: |
|
case Character.DASH_PUNCTUATION: |
|
case Character.START_PUNCTUATION: |
|
case Character.END_PUNCTUATION: |
|
case CHAR_INIT_QUOTE: |
|
case CHAR_FINAL_QUOTE: |
|
case Character.OTHER_PUNCTUATION: |
|
type = CHAR_PUNCTUATION; |
|
break; |
|
case Character.MATH_SYMBOL: |
|
case Character.CURRENCY_SYMBOL: |
|
case Character.MODIFIER_SYMBOL: |
|
case Character.OTHER_SYMBOL: |
|
type = CHAR_SYMBOL; |
|
break; |
|
default: |
|
throw new RuntimeException("org.apache.xerces.utils.regex.Token#getRange(): Unknown Unicode category: "+type); |
|
} |
|
ranges[type].addRange(i, i); |
|
} |
|
ranges[Character.UNASSIGNED].addRange(0x10000, Token.UTF16_MAX); |
|
|
|
for (int i = 0; i < ranges.length; i ++) { |
|
if (Token.categoryNames[i] != null) { |
|
if (i == Character.UNASSIGNED) { |
|
ranges[i].addRange(0x10000, Token.UTF16_MAX); |
|
} |
|
tmpCat.put(Token.categoryNames[i], ranges[i]); |
|
tmpCat2.put(Token.categoryNames[i], |
|
Token.complementRanges(ranges[i])); |
|
} |
|
} |
|
//REVISIT: do we really need to support block names as in Unicode 3.1 |
|
// or we can just create all the names in IsBLOCKNAME format (XML Schema REC)? |
|
|
|
StringBuilder buffer = new StringBuilder(50); |
|
for (int i = 0; i < Token.blockNames.length; i ++) { |
|
Token r1 = Token.createRange(); |
|
int location; |
|
if (i < NONBMP_BLOCK_START) { |
|
location = i*2; |
|
int rstart = Token.blockRanges.charAt(location); |
|
int rend = Token.blockRanges.charAt(location+1); |
|
//DEBUGING |
|
//System.out.println(n+" " +Integer.toHexString(rstart) |
|
|
|
r1.addRange(rstart, rend); |
|
} else { |
|
location = (i - NONBMP_BLOCK_START) * 2; |
|
r1.addRange(Token.nonBMPBlockRanges[location], |
|
Token.nonBMPBlockRanges[location + 1]); |
|
} |
|
String n = Token.blockNames[i]; |
|
if (n.equals("Specials")) |
|
r1.addRange(0xfff0, 0xfffd); |
|
if (n.equals("Private Use")) { |
|
r1.addRange(0xF0000,0xFFFFD); |
|
r1.addRange(0x100000,0x10FFFD); |
|
} |
|
tmpCat.put(n, r1); |
|
tmpCat2.put(n, Token.complementRanges(r1)); |
|
buffer.setLength(0); |
|
buffer.append("Is"); |
|
if (n.indexOf(' ') >= 0) { |
|
for (int ci = 0; ci < n.length(); ci ++) |
|
if (n.charAt(ci) != ' ') buffer.append(n.charAt(ci)); |
|
} |
|
else { |
|
buffer.append(n); |
|
} |
|
Token.setAlias(tmpCat, tmpCat2, buffer.toString(), n, true); |
|
} |
|
|
|
|
|
Token.setAlias(tmpCat, tmpCat2, "ASSIGNED", "Cn", false); |
|
Token.setAlias(tmpCat, tmpCat2, "UNASSIGNED", "Cn", true); |
|
Token all = Token.createRange(); |
|
all.addRange(0, Token.UTF16_MAX); |
|
tmpCat.put("ALL", all); |
|
tmpCat2.put("ALL", Token.complementRanges(all)); |
|
Token.registerNonXS("ASSIGNED"); |
|
Token.registerNonXS("UNASSIGNED"); |
|
Token.registerNonXS("ALL"); |
|
|
|
Token isalpha = Token.createRange(); |
|
isalpha.mergeRanges(ranges[Character.UPPERCASE_LETTER]); |
|
isalpha.mergeRanges(ranges[Character.LOWERCASE_LETTER]); |
|
isalpha.mergeRanges(ranges[Character.OTHER_LETTER]); |
|
tmpCat.put("IsAlpha", isalpha); |
|
tmpCat2.put("IsAlpha", Token.complementRanges(isalpha)); |
|
Token.registerNonXS("IsAlpha"); |
|
|
|
Token isalnum = Token.createRange(); |
|
isalnum.mergeRanges(isalpha); |
|
isalnum.mergeRanges(ranges[Character.DECIMAL_DIGIT_NUMBER]); |
|
tmpCat.put("IsAlnum", isalnum); |
|
tmpCat2.put("IsAlnum", Token.complementRanges(isalnum)); |
|
Token.registerNonXS("IsAlnum"); |
|
|
|
Token isspace = Token.createRange(); |
|
isspace.mergeRanges(Token.token_spaces); |
|
isspace.mergeRanges(ranges[CHAR_SEPARATOR]); |
|
tmpCat.put("IsSpace", isspace); |
|
tmpCat2.put("IsSpace", Token.complementRanges(isspace)); |
|
Token.registerNonXS("IsSpace"); |
|
|
|
Token isword = Token.createRange(); |
|
isword.mergeRanges(isalnum); |
|
isword.addRange('_', '_'); |
|
tmpCat.put("IsWord", isword); |
|
tmpCat2.put("IsWord", Token.complementRanges(isword)); |
|
Token.registerNonXS("IsWord"); |
|
|
|
Token isascii = Token.createRange(); |
|
isascii.addRange(0, 127); |
|
tmpCat.put("IsASCII", isascii); |
|
tmpCat2.put("IsASCII", Token.complementRanges(isascii)); |
|
Token.registerNonXS("IsASCII"); |
|
|
|
Token isnotgraph = Token.createRange(); |
|
isnotgraph.mergeRanges(ranges[CHAR_OTHER]); |
|
isnotgraph.addRange(' ', ' '); |
|
tmpCat.put("IsGraph", Token.complementRanges(isnotgraph)); |
|
tmpCat2.put("IsGraph", isnotgraph); |
|
Token.registerNonXS("IsGraph"); |
|
|
|
Token isxdigit = Token.createRange(); |
|
isxdigit.addRange('0', '9'); |
|
isxdigit.addRange('A', 'F'); |
|
isxdigit.addRange('a', 'f'); |
|
tmpCat.put("IsXDigit", Token.complementRanges(isxdigit)); |
|
tmpCat2.put("IsXDigit", isxdigit); |
|
Token.registerNonXS("IsXDigit"); |
|
|
|
Token.setAlias(tmpCat, tmpCat2, "IsDigit", "Nd", true); |
|
Token.setAlias(tmpCat, tmpCat2, "IsUpper", "Lu", true); |
|
Token.setAlias(tmpCat, tmpCat2, "IsLower", "Ll", true); |
|
Token.setAlias(tmpCat, tmpCat2, "IsCntrl", "C", true); |
|
Token.setAlias(tmpCat, tmpCat2, "IsPrint", "C", false); |
|
Token.setAlias(tmpCat, tmpCat2, "IsPunct", "P", true); |
|
Token.registerNonXS("IsDigit"); |
|
Token.registerNonXS("IsUpper"); |
|
Token.registerNonXS("IsLower"); |
|
Token.registerNonXS("IsCntrl"); |
|
Token.registerNonXS("IsPrint"); |
|
Token.registerNonXS("IsPunct"); |
|
|
|
Token.setAlias(tmpCat, tmpCat2, "alpha", "IsAlpha", true); |
|
Token.setAlias(tmpCat, tmpCat2, "alnum", "IsAlnum", true); |
|
Token.setAlias(tmpCat, tmpCat2, "ascii", "IsASCII", true); |
|
Token.setAlias(tmpCat, tmpCat2, "cntrl", "IsCntrl", true); |
|
Token.setAlias(tmpCat, tmpCat2, "digit", "IsDigit", true); |
|
Token.setAlias(tmpCat, tmpCat2, "graph", "IsGraph", true); |
|
Token.setAlias(tmpCat, tmpCat2, "lower", "IsLower", true); |
|
Token.setAlias(tmpCat, tmpCat2, "print", "IsPrint", true); |
|
Token.setAlias(tmpCat, tmpCat2, "punct", "IsPunct", true); |
|
Token.setAlias(tmpCat, tmpCat2, "space", "IsSpace", true); |
|
Token.setAlias(tmpCat, tmpCat2, "upper", "IsUpper", true); |
|
Token.setAlias(tmpCat, tmpCat2, "word", "IsWord", true); |
|
Token.setAlias(tmpCat, tmpCat2, "xdigit", "IsXDigit", true); |
|
Token.registerNonXS("alpha"); |
|
Token.registerNonXS("alnum"); |
|
Token.registerNonXS("ascii"); |
|
Token.registerNonXS("cntrl"); |
|
Token.registerNonXS("digit"); |
|
Token.registerNonXS("graph"); |
|
Token.registerNonXS("lower"); |
|
Token.registerNonXS("print"); |
|
Token.registerNonXS("punct"); |
|
Token.registerNonXS("space"); |
|
Token.registerNonXS("upper"); |
|
Token.registerNonXS("word"); |
|
Token.registerNonXS("xdigit"); |
|
Token.categories = localCat = Collections.unmodifiableMap(tmpCat); |
|
Token.categories2 = Collections.unmodifiableMap(tmpCat2); |
|
} // localCat == null |
|
} // synchronized |
|
} |
|
return positive ? (RangeToken)localCat.get(name) |
|
: (RangeToken)Token.categories2.get(name); |
|
} |
|
static protected RangeToken getRange(String name, boolean positive, boolean xs) { |
|
RangeToken range = Token.getRange(name, positive); |
|
if (xs && range != null && Token.isRegisterNonXS(name)) |
|
range = null; |
|
return range; |
|
} |
|
|
|
static final Set<String> nonxs = Collections.synchronizedSet(new HashSet<>()); |
|
/*
 * Maintenance of the set of names that getRange() hides when called with xs == true.
 */
|
static protected void registerNonXS(String name) { |
|
Token.nonxs.add(name); |
|
} |
|
|
|
static protected boolean isRegisterNonXS(String name) { |
|
return Token.nonxs.contains(name); |
|
} |
|
|
|
private static void setAlias(Map<String, Token> tmpCat, Map<String, Token> tmpCat2, |
|
String newName, String name, boolean positive) { |
|
Token t1 = tmpCat.get(name); |
|
Token t2 = tmpCat2.get(name); |
|
if (positive) { |
|
tmpCat.put(newName, t1); |
|
tmpCat2.put(newName, t2); |
|
} else { |
|
tmpCat2.put(newName, t1); |
|
tmpCat.put(newName, t2); |
|
} |
|
} |
|
|
|
// ------------------------------------------------------ |
|
|
|
static final String viramaString = |
|
"\u094D" |
|
+"\u09CD" |
|
+"\u0A4D" |
|
+"\u0ACD" |
|
+"\u0B4D" |
|
+"\u0BCD" |
|
+"\u0C4D" |
|
+"\u0CCD" |
|
+"\u0D4D" |
|
+"\u0E3A" |
|
+"\u0F84"; |
|
|
|
static private Token token_grapheme = null; |
|
static synchronized Token getGraphemePattern() { |
|
if (Token.token_grapheme != null) |
|
return Token.token_grapheme; |
|
|
|
Token base_char = Token.createRange(); |
|
base_char.mergeRanges(Token.getRange("ASSIGNED", true)); |
|
base_char.subtractRanges(Token.getRange("M", true)); |
|
base_char.subtractRanges(Token.getRange("C", true)); |
|
|
|
Token virama = Token.createRange(); |
|
for (int i = 0; i < Token.viramaString.length(); i++) { |
|
virama.addRange(i, i); |
|
} |
|
|
|
Token combiner_wo_virama = Token.createRange(); |
|
combiner_wo_virama.mergeRanges(Token.getRange("M", true)); |
|
combiner_wo_virama.addRange(0x1160, 0x11ff); |
|
combiner_wo_virama.addRange(0xff9e, 0xff9f); |
|
|
|
Token left = Token.createUnion(); |
|
left.addChild(base_char); |
|
left.addChild(Token.token_empty); |
|
|
|
Token foo = Token.createUnion(); |
|
foo.addChild(Token.createConcat(virama, Token.getRange("L", true))); |
|
foo.addChild(combiner_wo_virama); |
|
|
|
foo = Token.createClosure(foo); |
|
|
|
foo = Token.createConcat(left, foo); |
|
|
|
Token.token_grapheme = foo; |
|
return Token.token_grapheme; |
|
} |
|
/*
 * Combining character sequence: a token built once and then cached.
 */
|
static private Token token_ccs = null; |
|
static synchronized Token getCombiningCharacterSequence() { |
|
if (Token.token_ccs != null) |
|
return Token.token_ccs; |
|
|
|
Token foo = Token.createClosure(Token.getRange("M", true)); |
|
foo = Token.createConcat(Token.getRange("M", false), foo); |
|
Token.token_ccs = foo; |
|
return Token.token_ccs; |
|
} |
|
|
|
// ------------------------------------------------------ |
|
|
|
// ------------------------------------------------------ |
|
|
|
|
|
*/ |
|
static class StringToken extends Token implements java.io.Serializable { |
|
|
|
private static final long serialVersionUID = -4614366944218504172L; |
|
|
|
String string; |
|
final int refNumber; |
|
|
|
StringToken(int type, String str, int n) { |
|
super(type); |
|
this.string = str; |
|
this.refNumber = n; |
|
} |
|
|
|
int getReferenceNumber() { |
|
return this.refNumber; |
|
} |
|
String getString() { |
|
return this.string; |
|
} |
|
|
|
public String toString(int options) { |
|
if (this.type == BACKREFERENCE) |
|
return "\\"+this.refNumber; |
|
else |
|
return REUtil.quoteMeta(this.string); |
|
} |
|
} |
|
|
|
|
|
|
|
*/ |
|
static class ConcatToken extends Token implements java.io.Serializable { |
|
|
|
private static final long serialVersionUID = 8717321425541346381L; |
|
|
|
final Token child; |
|
final Token child2; |
|
|
|
ConcatToken(Token t1, Token t2) { |
|
super(Token.CONCAT); |
|
this.child = t1; |
|
this.child2 = t2; |
|
} |
|
|
|
int size() { |
|
return 2; |
|
} |
|
Token getChild(int index) { |
|
return index == 0 ? this.child : this.child2; |
|
} |
|
|
|
public String toString(int options) { |
|
String ret; |
|
if (this.child2.type == CLOSURE && this.child2.getChild(0) == this.child) { |
|
ret = this.child.toString(options)+"+"; |
|
} else if (this.child2.type == NONGREEDYCLOSURE && this.child2.getChild(0) == this.child) { |
|
ret = this.child.toString(options)+"+?"; |
|
} else |
|
ret = this.child.toString(options)+this.child2.toString(options); |
|
return ret; |
|
} |
|
} |
|
|
|
|
|
|
|
*/ |
|
static class CharToken extends Token implements java.io.Serializable { |
|
|
|
private static final long serialVersionUID = -4394272816279496989L; |
|
|
|
final int chardata; |
|
|
|
CharToken(int type, int ch) { |
|
super(type); |
|
this.chardata = ch; |
|
} |
|
|
|
int getChar() { |
|
return this.chardata; |
|
} |
|
|
|
public String toString(int options) { |
|
String ret; |
|
switch (this.type) { |
|
case CHAR: |
|
switch (this.chardata) { |
|
case '|': case '*': case '+': case '?': |
|
case '(': case ')': case '.': case '[': |
|
case '{': case '\\': |
|
ret = "\\"+(char)this.chardata; |
|
break; |
|
case '\f': ret = "\\f"; break; |
|
case '\n': ret = "\\n"; break; |
|
case '\r': ret = "\\r"; break; |
|
case '\t': ret = "\\t"; break; |
|
case 0x1b: ret = "\\e"; break; |
|
|
|
default: |
|
if (this.chardata >= 0x10000) { |
|
String pre = "0"+Integer.toHexString(this.chardata); |
|
ret = "\\v"+pre.substring(pre.length()-6, pre.length()); |
|
} else |
|
ret = ""+(char)this.chardata; |
|
} |
|
break; |
|
|
|
case ANCHOR: |
|
if (this == Token.token_linebeginning || this == Token.token_lineend) |
|
ret = ""+(char)this.chardata; |
|
else |
|
ret = "\\"+(char)this.chardata; |
|
break; |
|
|
|
default: |
|
ret = null; |
|
} |
|
return ret; |
|
} |
|
|
|
boolean match(int ch) { |
|
if (this.type == CHAR) { |
|
return ch == this.chardata; |
|
} else |
|
throw new RuntimeException("NFAArrow#match(): Internal error: "+this.type); |
|
} |
|
} |
|
|
|
|
|
|
|
*/ |
|
static class ClosureToken extends Token implements java.io.Serializable { |
|
|
|
private static final long serialVersionUID = 1308971930673997452L; |
|
|
|
int min; |
|
int max; |
|
final Token child; |
|
|
|
ClosureToken(int type, Token tok) { |
|
super(type); |
|
this.child = tok; |
|
this.setMin(-1); |
|
this.setMax(-1); |
|
} |
|
|
|
int size() { |
|
return 1; |
|
} |
|
Token getChild(int index) { |
|
return this.child; |
|
} |
|
|
|
final void setMin(int min) { |
|
this.min = min; |
|
} |
|
final void setMax(int max) { |
|
this.max = max; |
|
} |
|
final int getMin() { |
|
return this.min; |
|
} |
|
final int getMax() { |
|
return this.max; |
|
} |
|
|
|
public String toString(int options) { |
|
String ret; |
|
if (this.type == CLOSURE) { |
|
if (this.getMin() < 0 && this.getMax() < 0) { |
|
ret = this.child.toString(options)+"*"; |
|
} else if (this.getMin() == this.getMax()) { |
|
ret = this.child.toString(options)+"{"+this.getMin()+"}"; |
|
} else if (this.getMin() >= 0 && this.getMax() >= 0) { |
|
ret = this.child.toString(options)+"{"+this.getMin()+","+this.getMax()+"}"; |
|
} else if (this.getMin() >= 0 && this.getMax() < 0) { |
|
ret = this.child.toString(options)+"{"+this.getMin()+",}"; |
|
} else |
|
throw new RuntimeException("Token#toString(): CLOSURE " |
|
+this.getMin()+", "+this.getMax()); |
|
} else { |
|
if (this.getMin() < 0 && this.getMax() < 0) { |
|
ret = this.child.toString(options)+"*?"; |
|
} else if (this.getMin() == this.getMax()) { |
|
ret = this.child.toString(options)+"{"+this.getMin()+"}?"; |
|
} else if (this.getMin() >= 0 && this.getMax() >= 0) { |
|
ret = this.child.toString(options)+"{"+this.getMin()+","+this.getMax()+"}?"; |
|
} else if (this.getMin() >= 0 && this.getMax() < 0) { |
|
ret = this.child.toString(options)+"{"+this.getMin()+",}?"; |
|
} else |
|
throw new RuntimeException("Token#toString(): NONGREEDYCLOSURE " |
|
+this.getMin()+", "+this.getMax()); |
|
} |
|
return ret; |
|
} |
|
} |
|
|
|
|
|
|
|
*/ |
|
static class ParenToken extends Token implements java.io.Serializable { |
|
|
|
private static final long serialVersionUID = -5938014719827987704L; |
|
|
|
final Token child; |
|
final int parennumber; |
|
|
|
ParenToken(int type, Token tok, int paren) { |
|
super(type); |
|
this.child = tok; |
|
this.parennumber = paren; |
|
} |
|
|
|
int size() { |
|
return 1; |
|
} |
|
Token getChild(int index) { |
|
return this.child; |
|
} |
|
|
|
int getParenNumber() { |
|
return this.parennumber; |
|
} |
|
|
|
public String toString(int options) { |
|
String ret = null; |
|
switch (this.type) { |
|
case PAREN: |
|
if (this.parennumber == 0) { |
|
ret = "(?:"+this.child.toString(options)+")"; |
|
} else { |
|
ret = "("+this.child.toString(options)+")"; |
|
} |
|
break; |
|
|
|
case LOOKAHEAD: |
|
ret = "(?="+this.child.toString(options)+")"; |
|
break; |
|
case NEGATIVELOOKAHEAD: |
|
ret = "(?!"+this.child.toString(options)+")"; |
|
break; |
|
case LOOKBEHIND: |
|
ret = "(?<="+this.child.toString(options)+")"; |
|
break; |
|
case NEGATIVELOOKBEHIND: |
|
ret = "(?<!"+this.child.toString(options)+")"; |
|
break; |
|
case INDEPENDENT: |
|
ret = "(?>"+this.child.toString(options)+")"; |
|
break; |
|
} |
|
return ret; |
|
} |
|
} |
|
|
|
|
|
|
|
*/ |
|
static class ConditionToken extends Token implements java.io.Serializable { |
|
|
|
private static final long serialVersionUID = 4353765277910594411L; |
|
|
|
final int refNumber; |
|
final Token condition; |
|
final Token yes; |
|
final Token no; |
|
ConditionToken(int refno, Token cond, Token yespat, Token nopat) { |
|
super(Token.CONDITION); |
|
this.refNumber = refno; |
|
this.condition = cond; |
|
this.yes = yespat; |
|
this.no = nopat; |
|
} |
|
int size() { |
|
return this.no == null ? 1 : 2; |
|
} |
|
Token getChild(int index) { |
|
if (index == 0) return this.yes; |
|
if (index == 1) return this.no; |
|
throw new RuntimeException("Internal Error: "+index); |
|
} |
|
|
|
public String toString(int options) { |
|
String ret; |
|
if (refNumber > 0) { |
|
ret = "(?("+refNumber+")"; |
|
} else if (this.condition.type == Token.ANCHOR) { |
|
ret = "(?("+this.condition+")"; |
|
} else { |
|
ret = "(?"+this.condition; |
|
} |
|
|
|
if (this.no == null) { |
|
ret += this.yes+")"; |
|
} else { |
|
ret += this.yes+"|"+this.no+")"; |
|
} |
|
return ret; |
|
} |
|
} |
|
|
|
|
|
|
|
*/ |
|
static class ModifierToken extends Token implements java.io.Serializable { |
|
|
|
private static final long serialVersionUID = -9114536559696480356L; |
|
|
|
final Token child; |
|
final int add; |
|
final int mask; |
|
|
|
ModifierToken(Token tok, int add, int mask) { |
|
super(Token.MODIFIERGROUP); |
|
this.child = tok; |
|
this.add = add; |
|
this.mask = mask; |
|
} |
|
|
|
int size() { |
|
return 1; |
|
} |
|
Token getChild(int index) { |
|
return this.child; |
|
} |
|
|
|
int getOptions() { |
|
return this.add; |
|
} |
|
int getOptionsMask() { |
|
return this.mask; |
|
} |
|
|
|
public String toString(int options) { |
|
return "(?" |
|
+(this.add == 0 ? "" : REUtil.createOptionString(this.add)) |
|
+(this.mask == 0 ? "" : REUtil.createOptionString(this.mask)) |
|
+":" |
|
+this.child.toString(options) |
|
+")"; |
|
} |
|
} |
|
|
|
|
|
|
|
|
|
*/ |
|
static class UnionToken extends Token implements java.io.Serializable { |
|
|
|
private static final long serialVersionUID = -2568843945989489861L; |
|
|
|
List<Token> children; |
|
|
|
|
|
|
|
*/ |
|
private static final ObjectStreamField[] serialPersistentFields = |
|
new ObjectStreamField[] { |
|
new ObjectStreamField("children", Vector.class), |
|
}; |
|
|
|
UnionToken(int type) { |
|
super(type); |
|
} |
|
|
|
@Override |
|
void addChild(Token tok) { |
|
if (tok == null) return; |
|
if (this.children == null) this.children = new ArrayList<>(); |
|
if (this.type == UNION) { |
|
this.children.add(tok); |
|
return; |
|
} |
|
|
|
if (tok.type == CONCAT) { |
|
for (int i = 0; i < tok.size(); i ++) |
|
this.addChild(tok.getChild(i)); |
|
return; |
|
} |
|
int size = this.children.size(); |
|
if (size == 0) { |
|
this.children.add(tok); |
|
return; |
|
} |
|
Token previous = this.children.get(size - 1); |
|
if (!((previous.type == CHAR || previous.type == STRING) |
|
&& (tok.type == CHAR || tok.type == STRING))) { |
|
this.children.add(tok); |
|
return; |
|
} |
|
|
|
//System.err.println("Merge '"+previous+"' and '"+tok+"'."); |
|
|
|
StringBuilder buffer; |
|
int nextMaxLength = (tok.type == CHAR ? 2 : tok.getString().length()); |
|
if (previous.type == CHAR) { |
|
buffer = new StringBuilder(2 + nextMaxLength); |
|
int ch = previous.getChar(); |
|
if (ch >= 0x10000) |
|
buffer.append(REUtil.decomposeToSurrogates(ch)); |
|
else |
|
buffer.append((char)ch); |
|
previous = Token.createString(null); |
|
this.children.set(size - 1, previous); |
|
} else { |
|
buffer = new StringBuilder(previous.getString().length() + nextMaxLength); |
|
buffer.append(previous.getString()); |
|
} |
|
|
|
if (tok.type == CHAR) { |
|
int ch = tok.getChar(); |
|
if (ch >= 0x10000) |
|
buffer.append(REUtil.decomposeToSurrogates(ch)); |
|
else |
|
buffer.append((char)ch); |
|
} else { |
|
buffer.append(tok.getString()); |
|
} |
|
|
|
((StringToken)previous).string = new String(buffer); |
|
} |
|
|
|
@Override |
|
int size() { |
|
return this.children == null ? 0 : this.children.size(); |
|
} |
|
@Override |
|
Token getChild(int index) { |
|
return this.children.get(index); |
|
} |
|
|
|
@Override |
|
public String toString(int options) { |
|
String ret; |
|
if (this.type == CONCAT) { |
|
if (this.children.size() == 2) { |
|
Token ch = this.getChild(0); |
|
Token ch2 = this.getChild(1); |
|
if (ch2.type == CLOSURE && ch2.getChild(0) == ch) { |
|
ret = ch.toString(options)+"+"; |
|
} else if (ch2.type == NONGREEDYCLOSURE && ch2.getChild(0) == ch) { |
|
ret = ch.toString(options)+"+?"; |
|
} else |
|
ret = ch.toString(options)+ch2.toString(options); |
|
} else { |
|
StringBuilder sb = new StringBuilder(); |
|
this.children.stream().forEach((children1) -> { |
|
sb.append((children1).toString(options)); |
|
}); |
|
ret = sb.toString(); |
|
} |
|
return ret; |
|
} |
|
if (this.children.size() == 2 && this.getChild(1).type == EMPTY) { |
|
ret = this.getChild(0).toString(options)+"?"; |
|
} else if (this.children.size() == 2 |
|
&& this.getChild(0).type == EMPTY) { |
|
ret = this.getChild(1).toString(options)+"??"; |
|
} else { |
|
StringBuilder sb = new StringBuilder(); |
|
sb.append((this.children.get(0)).toString(options)); |
|
for (int i = 1; i < this.children.size(); i ++) { |
|
sb.append('|'); |
|
sb.append((this.children.get(i)).toString(options)); |
|
} |
|
ret = sb.toString(); |
|
} |
|
return ret; |
|
} |
|
|
|
|
|
|
|
*/ |
|
private void writeObject(ObjectOutputStream out) throws IOException { |
|
|
|
Vector<Token> vChildren = (children == null)? null : new Vector<>(children); |
|
|
|
|
|
ObjectOutputStream.PutField pf = out.putFields(); |
|
pf.put("children", vChildren); |
|
out.writeFields(); |
|
} |
|
|
|
@SuppressWarnings("unchecked") |
|
private void readObject(ObjectInputStream in) |
|
throws IOException, ClassNotFoundException { |
|
|
|
ObjectInputStream.GetField gf = in.readFields(); |
|
Vector<Token> vChildren = (Vector<Token>)gf.get("children", null); |
|
|
|
|
|
if (vChildren != null) children = new ArrayList<>(vChildren); |
|
} |
|
} |
|
} |