|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
/* |
|
******************************************************************************* |
|
* (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved * |
|
* * |
|
* The original version of this source code and documentation is copyrighted * |
|
* and owned by IBM, These materials are provided under terms of a License * |
|
* Agreement between IBM and Sun. This technology is protected by multiple * |
|
* US and International patents. This notice and attribution to IBM may not * |
|
* to removed. * |
|
******************************************************************************* |
|
*/ |
|
|
|
package sun.text.normalizer; |
|
|
|
import java.io.BufferedInputStream; |
|
import java.io.ByteArrayInputStream; |
|
import java.io.IOException; |
|
import java.io.BufferedInputStream; |
|
import java.io.InputStream; |
|
|
|
|
|
|
|
*/ |
|
public final class NormalizerImpl { |
|
|
|
/** Singleton instance; constructing it loads the normalization data. */
static final NormalizerImpl IMPL;

static {
    try {
        IMPL = new NormalizerImpl();
    } catch (Exception e) {
        // Keep the same message as before, but chain the original exception
        // so the root cause of a failed data load (and its stack trace) is
        // not lost.
        throw new RuntimeException(e.getMessage(), e);
    }
}
|
|
|
static final int UNSIGNED_BYTE_MASK =0xFF; |
|
static final long UNSIGNED_INT_MASK = 0xffffffffL; |
|
|
|
|
|
|
|
|
|
*/ |
|
private static final String DATA_FILE_NAME = "/sun/text/resources/unorm.icu"; |
|
|
|
// norm32 value constants |
|
|
|
|
|
public static final int QC_NFC=0x11; |
|
public static final int QC_NFKC=0x22; |
|
public static final int QC_NFD=4; |
|
public static final int QC_NFKD=8; /* no */ |
|
|
|
public static final int QC_ANY_NO=0xf; |
|
|
|
|
|
|
|
*/ |
|
public static final int QC_MAYBE=0x10; |
|
public static final int QC_ANY_MAYBE=0x30; |
|
|
|
public static final int QC_MASK=0x3f; |
|
|
|
private static final int COMBINES_FWD=0x40; |
|
private static final int COMBINES_BACK=0x80; |
|
public static final int COMBINES_ANY=0xc0; |
|
|
|
private static final int CC_SHIFT=8; |
|
public static final int CC_MASK=0xff00; |
|
|
|
private static final int EXTRA_SHIFT=16; |
|
|
|
|
|
private static final long MIN_SPECIAL = 0xfc000000 & UNSIGNED_INT_MASK; |
|
private static final long SURROGATES_TOP = 0xfff00000 & UNSIGNED_INT_MASK; |
|
private static final long MIN_HANGUL = 0xfff00000 & UNSIGNED_INT_MASK; |
|
|
|
private static final long JAMO_V_TOP = 0xfff30000 & UNSIGNED_INT_MASK; |
|
|
|
|
|
/* indexes[] value names */ |
|
|
|
static final int INDEX_TRIE_SIZE = 0; |
|
|
|
static final int INDEX_CHAR_COUNT = 1; |
|
|
|
static final int INDEX_COMBINE_DATA_COUNT = 2; |
|
|
|
public static final int INDEX_MIN_NFC_NO_MAYBE = 6; |
|
|
|
public static final int INDEX_MIN_NFKC_NO_MAYBE = 7; |
|
|
|
public static final int INDEX_MIN_NFD_NO_MAYBE = 8; |
|
|
|
public static final int INDEX_MIN_NFKD_NO_MAYBE = 9; |
|
|
|
static final int INDEX_FCD_TRIE_SIZE = 10; |
|
|
|
static final int INDEX_AUX_TRIE_SIZE = 11; |
|
|
|
static final int INDEX_TOP = 32; |
|
|
|
|
|
/* AUX constants */ |
|
|
|
private static final int AUX_UNSAFE_SHIFT = 11; |
|
private static final int AUX_COMP_EX_SHIFT = 10; |
|
private static final int AUX_NFC_SKIPPABLE_F_SHIFT = 12; |
|
|
|
private static final int AUX_MAX_FNC = 1<<AUX_COMP_EX_SHIFT; |
|
private static final int AUX_UNSAFE_MASK = (int)((1<<AUX_UNSAFE_SHIFT) & UNSIGNED_INT_MASK); |
|
private static final int AUX_FNC_MASK = (int)((AUX_MAX_FNC-1) & UNSIGNED_INT_MASK); |
|
private static final int AUX_COMP_EX_MASK = (int)((1<<AUX_COMP_EX_SHIFT) & UNSIGNED_INT_MASK); |
|
private static final long AUX_NFC_SKIP_F_MASK = ((UNSIGNED_INT_MASK&1)<<AUX_NFC_SKIPPABLE_F_SHIFT); |
|
|
|
private static final int MAX_BUFFER_SIZE = 20; |
|
|
|
/*******************************/ |
|
|
|
|
|
static final class NormTrieImpl implements Trie.DataManipulate{ |
|
static IntTrie normTrie= null; |
|
/** |
|
* Called by com.ibm.icu.util.Trie to extract from a lead surrogate's |
|
* data the index array offset of the indexes for that lead surrogate. |
|
* @param property data value for a surrogate from the trie, including |
|
* the folding offset |
|
* @return data offset or 0 if there is no data for the lead surrogate |
|
*/ |
|
|
|
public int getFoldingOffset(int value){ |
|
return BMP_INDEX_LENGTH+ |
|
((value>>(EXTRA_SHIFT-SURROGATE_BLOCK_BITS))& |
|
(0x3ff<<SURROGATE_BLOCK_BITS)); |
|
} |
|
|
|
} |
|
static final class FCDTrieImpl implements Trie.DataManipulate{ |
|
static CharTrie fcdTrie=null; |
|
/** |
|
* Called by com.ibm.icu.util.Trie to extract from a lead surrogate's |
|
* data the index array offset of the indexes for that lead surrogate. |
|
* @param property data value for a surrogate from the trie, including |
|
* the folding offset |
|
* @return data offset or 0 if there is no data for the lead surrogate |
|
*/ |
|
|
|
public int getFoldingOffset(int value){ |
|
return value; |
|
} |
|
} |
|
|
|
static final class AuxTrieImpl implements Trie.DataManipulate{ |
|
static CharTrie auxTrie = null; |
|
/** |
|
* Called by com.ibm.icu.util.Trie to extract from a lead surrogate's |
|
* data the index array offset of the indexes for that lead surrogate. |
|
* @param property data value for a surrogate from the trie, including |
|
* the folding offset |
|
* @return data offset or 0 if there is no data for the lead surrogate |
|
*/ |
|
|
|
public int getFoldingOffset(int value){ |
|
return (value &AUX_FNC_MASK)<<SURROGATE_BLOCK_BITS; |
|
} |
|
} |
|
|
|
/****************************************************/ |
|
|
|
|
|
private static FCDTrieImpl fcdTrieImpl; |
|
private static NormTrieImpl normTrieImpl; |
|
private static AuxTrieImpl auxTrieImpl; |
|
private static int[] indexes; |
|
private static char[] combiningTable; |
|
private static char[] extraData; |
|
|
|
private static boolean isDataLoaded; |
|
private static boolean isFormatVersion_2_1; |
|
private static boolean isFormatVersion_2_2; |
|
private static byte[] unicodeVersion; |
|
|
|
|
|
|
|
*/ |
|
private static final int DATA_BUFFER_SIZE = 25000; |
|
|
|
|
|
|
|
|
|
*/ |
|
public static final int MIN_WITH_LEAD_CC=0x300; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
private static final int DECOMP_FLAG_LENGTH_HAS_CC=0x80; |
|
|
|
|
|
*/ |
|
private static final int DECOMP_LENGTH_MASK=0x7f; |
|
|
|
|
|
private static final int BMP_INDEX_LENGTH=0x10000>>Trie.INDEX_STAGE_1_SHIFT_; |
|
|
|
|
|
*/ |
|
private static final int SURROGATE_BLOCK_BITS=10-Trie.INDEX_STAGE_1_SHIFT_; |
|
|
|
|
|
|
|
public static int getFromIndexesArr(int index){ |
|
return indexes[index]; |
|
} |
|
|
|
// private constructor ----------------------------------------------- |
|
|
|
|
|
|
|
|
|
*/ |
|
private NormalizerImpl() throws IOException { |
|
|
|
if(!isDataLoaded){ |
|
|
|
|
|
InputStream i = ICUData.getRequiredStream(DATA_FILE_NAME); |
|
BufferedInputStream b = new BufferedInputStream(i,DATA_BUFFER_SIZE); |
|
NormalizerDataReader reader = new NormalizerDataReader(b); |
|
|
|
|
|
indexes = reader.readIndexes(NormalizerImpl.INDEX_TOP); |
|
|
|
byte[] normBytes = new byte[indexes[NormalizerImpl.INDEX_TRIE_SIZE]]; |
|
|
|
int combiningTableTop = indexes[NormalizerImpl.INDEX_COMBINE_DATA_COUNT]; |
|
combiningTable = new char[combiningTableTop]; |
|
|
|
int extraDataTop = indexes[NormalizerImpl.INDEX_CHAR_COUNT]; |
|
extraData = new char[extraDataTop]; |
|
|
|
byte[] fcdBytes = new byte[indexes[NormalizerImpl.INDEX_FCD_TRIE_SIZE]]; |
|
byte[] auxBytes = new byte[indexes[NormalizerImpl.INDEX_AUX_TRIE_SIZE]]; |
|
|
|
fcdTrieImpl = new FCDTrieImpl(); |
|
normTrieImpl = new NormTrieImpl(); |
|
auxTrieImpl = new AuxTrieImpl(); |
|
|
|
|
|
reader.read(normBytes, fcdBytes,auxBytes, extraData, combiningTable); |
|
|
|
NormTrieImpl.normTrie = new IntTrie( new ByteArrayInputStream(normBytes),normTrieImpl ); |
|
FCDTrieImpl.fcdTrie = new CharTrie( new ByteArrayInputStream(fcdBytes),fcdTrieImpl ); |
|
AuxTrieImpl.auxTrie = new CharTrie( new ByteArrayInputStream(auxBytes),auxTrieImpl ); |
|
|
|
// we reached here without any exceptions so the data is fully |
|
|
|
isDataLoaded = true; |
|
|
|
|
|
byte[] formatVersion = reader.getDataFormatVersion(); |
|
|
|
isFormatVersion_2_1 =( formatVersion[0]>2 |
|
|| |
|
(formatVersion[0]==2 && formatVersion[1]>=1) |
|
); |
|
isFormatVersion_2_2 =( formatVersion[0]>2 |
|
|| |
|
(formatVersion[0]==2 && formatVersion[1]>=2) |
|
); |
|
unicodeVersion = reader.getUnicodeVersion(); |
|
b.close(); |
|
} |
|
} |
|
|
|
/* ---------------------------------------------------------------------- */ |
|
|
|
/* Korean Hangul and Jamo constants */ |
|
|
|
public static final int JAMO_L_BASE=0x1100; |
|
public static final int JAMO_V_BASE=0x1161; |
|
public static final int JAMO_T_BASE=0x11a7; /* "trail" jamo */ |
|
|
|
public static final int HANGUL_BASE=0xac00; |
|
|
|
public static final int JAMO_L_COUNT=19; |
|
public static final int JAMO_V_COUNT=21; |
|
public static final int JAMO_T_COUNT=28; |
|
public static final int HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT; |
|
|
|
private static boolean isHangulWithoutJamoT(char c) { |
|
c-=HANGUL_BASE; |
|
return c<HANGUL_COUNT && c%JAMO_T_COUNT==0; |
|
} |
|
|
|
/* norm32 helpers */ |
|
|
|
|
|
private static boolean isNorm32Regular(long norm32) { |
|
return norm32<MIN_SPECIAL; |
|
} |
|
|
|
|
|
private static boolean isNorm32LeadSurrogate(long norm32) { |
|
return MIN_SPECIAL<=norm32 && norm32<SURROGATES_TOP; |
|
} |
|
|
|
|
|
private static boolean isNorm32HangulOrJamo(long norm32) { |
|
return norm32>=MIN_HANGUL; |
|
} |
|
|
|
|
|
|
|
|
|
*/ |
|
private static boolean isJamoVTNorm32JamoV(long norm32) { |
|
return norm32<JAMO_V_TOP; |
|
} |
|
|
|
/* data access primitives ----------------------------------------------- */ |
|
|
|
public static long getNorm32(char c) { |
|
return ((UNSIGNED_INT_MASK) & (NormTrieImpl.normTrie.getLeadValue(c))); |
|
} |
|
|
|
public static long getNorm32FromSurrogatePair(long norm32, |
|
char c2) { |
|
|
|
|
|
|
|
*/ |
|
return ((UNSIGNED_INT_MASK) & |
|
NormTrieImpl.normTrie.getTrailValue((int)norm32, c2)); |
|
} |
|
|
|
private static long getNorm32(int c){ |
|
return (UNSIGNED_INT_MASK&(NormTrieImpl.normTrie.getCodePointValue(c))); |
|
} |
|
|
|
|
|
|
|
|
|
*/ |
|
private static long getNorm32(char[] p,int start, |
|
int mask) { |
|
long norm32= getNorm32(p[start]); |
|
if(((norm32&mask)>0) && isNorm32LeadSurrogate(norm32)) { |
|
|
|
norm32=getNorm32FromSurrogatePair(norm32, p[start+1]); |
|
} |
|
return norm32; |
|
} |
|
|
|
|
|
public static VersionInfo getUnicodeVersion(){ |
|
return VersionInfo.getInstance(unicodeVersion[0], unicodeVersion[1], |
|
unicodeVersion[2], unicodeVersion[3]); |
|
} |
|
|
|
public static char getFCD16(char c) { |
|
return FCDTrieImpl.fcdTrie.getLeadValue(c); |
|
} |
|
|
|
public static char getFCD16FromSurrogatePair(char fcd16, char c2) { |
|
|
|
|
|
* */ |
|
return FCDTrieImpl.fcdTrie.getTrailValue(fcd16, c2); |
|
} |
|
public static int getFCD16(int c) { |
|
return FCDTrieImpl.fcdTrie.getCodePointValue(c); |
|
} |
|
|
|
private static int getExtraDataIndex(long norm32) { |
|
return (int)(norm32>>EXTRA_SHIFT); |
|
} |
|
|
|
private static final class DecomposeArgs{ |
|
int cc; |
|
int trailCC; |
|
int length; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
private static int decompose(long norm32, |
|
int qcMask, |
|
DecomposeArgs args) { |
|
int p= getExtraDataIndex(norm32); |
|
args.length=extraData[p++]; |
|
|
|
if((norm32&qcMask&QC_NFKD)!=0 && args.length>=0x100) { |
|
|
|
p+=((args.length>>7)&1)+(args.length&DECOMP_LENGTH_MASK); |
|
args.length>>=8; |
|
} |
|
|
|
if((args.length&DECOMP_FLAG_LENGTH_HAS_CC)>0) { |
|
|
|
char bothCCs=extraData[p++]; |
|
args.cc=(UNSIGNED_BYTE_MASK) & (bothCCs>>8); |
|
args.trailCC=(UNSIGNED_BYTE_MASK) & bothCCs; |
|
} else { |
|
|
|
args.cc=args.trailCC=0; |
|
} |
|
|
|
args.length&=DECOMP_LENGTH_MASK; |
|
return p; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
private static int decompose(long norm32, |
|
DecomposeArgs args) { |
|
|
|
int p= getExtraDataIndex(norm32); |
|
args.length=extraData[p++]; |
|
|
|
if((args.length&DECOMP_FLAG_LENGTH_HAS_CC)>0) { |
|
|
|
char bothCCs=extraData[p++]; |
|
args.cc=(UNSIGNED_BYTE_MASK) & (bothCCs>>8); |
|
args.trailCC=(UNSIGNED_BYTE_MASK) & bothCCs; |
|
} else { |
|
|
|
args.cc=args.trailCC=0; |
|
} |
|
|
|
args.length&=DECOMP_LENGTH_MASK; |
|
return p; |
|
} |
|
|
|
|
|
private static final class NextCCArgs{ |
|
char[] source; |
|
int next; |
|
int limit; |
|
char c; |
|
char c2; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
private static int getNextCC(NextCCArgs args) { |
|
long norm32; |
|
|
|
args.c=args.source[args.next++]; |
|
|
|
norm32= getNorm32(args.c); |
|
if((norm32 & CC_MASK)==0) { |
|
args.c2=0; |
|
return 0; |
|
} else { |
|
if(!isNorm32LeadSurrogate(norm32)) { |
|
args.c2=0; |
|
} else { |
|
|
|
if(args.next!=args.limit && |
|
UTF16.isTrailSurrogate(args.c2=args.source[args.next])){ |
|
++args.next; |
|
norm32=getNorm32FromSurrogatePair(norm32, args.c2); |
|
} else { |
|
args.c2=0; |
|
return 0; |
|
} |
|
} |
|
|
|
return (int)((UNSIGNED_BYTE_MASK) & (norm32>>CC_SHIFT)); |
|
} |
|
} |
|
|
|
private static final class PrevArgs{ |
|
char[] src; |
|
int start; |
|
int current; |
|
char c; |
|
char c2; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
private static long getPrevNorm32(PrevArgs args, |
|
int minC, |
|
int mask) { |
|
long norm32; |
|
|
|
args.c=args.src[--args.current]; |
|
args.c2=0; |
|
|
|
|
|
|
|
*/ |
|
if(args.c<minC) { |
|
return 0; |
|
} else if(!UTF16.isSurrogate(args.c)) { |
|
return getNorm32(args.c); |
|
} else if(UTF16.isLeadSurrogate(args.c)) { |
|
|
|
return 0; |
|
} else if(args.current!=args.start && |
|
UTF16.isLeadSurrogate(args.c2=args.src[args.current-1])) { |
|
--args.current; |
|
norm32=getNorm32(args.c2); |
|
|
|
if((norm32&mask)==0) { |
|
|
|
|
|
*/ |
|
return 0; |
|
} else { |
|
|
|
return getNorm32FromSurrogatePair(norm32, args.c); |
|
} |
|
} else { |
|
|
|
args.c2=0; |
|
return 0; |
|
} |
|
} |
|
|
|
|
|
|
|
|
|
*/ |
|
private static int getPrevCC(PrevArgs args) { |
|
|
|
return (int)((UNSIGNED_BYTE_MASK)&(getPrevNorm32(args, MIN_WITH_LEAD_CC, |
|
CC_MASK)>>CC_SHIFT)); |
|
} |
|
|
|
|
|
|
|
|
|
*/ |
|
public static boolean isNFDSafe(long norm32, |
|
intccOrQCMask, |
|
int decompQCMask) { |
|
if((norm32&ccOrQCMask)==0) { |
|
return true; /* cc==0 and no decomposition: this is NF*D safe */ |
|
} |
|
|
|
|
|
if(isNorm32Regular(norm32) && (norm32&decompQCMask)!=0) { |
|
DecomposeArgs args=new DecomposeArgs(); |
|
|
|
decompose(norm32, decompQCMask, args); |
|
return args.cc==0; |
|
} else { |
|
|
|
return (norm32&CC_MASK)==0; |
|
} |
|
} |
|
|
|
|
|
|
|
|
|
*/ |
|
public static boolean isTrueStarter(long norm32, |
|
int ccOrQCMask, |
|
int decompQCMask) { |
|
if((norm32&ccOrQCMask)==0) { |
|
return true; /* this is a true starter (could be Hangul or Jamo L)*/ |
|
} |
|
|
|
|
|
if((norm32&decompQCMask)!=0) { |
|
int p; |
|
DecomposeArgs args=new DecomposeArgs(); |
|
|
|
p=decompose(norm32, decompQCMask, args); |
|
|
|
if(args.cc==0) { |
|
int qcMask=ccOrQCMask&QC_MASK; |
|
|
|
|
|
if((getNorm32(extraData,p, qcMask)&qcMask)==0) { |
|
|
|
return true; |
|
} |
|
} |
|
} |
|
return false; |
|
} |
|
|
|
/* reorder UTF-16 in-place ---------------------------------------------- */ |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
private static int insertOrdered(char[] source, |
|
int start, |
|
int current, int p, |
|
char c, char c2, |
|
int cc) { |
|
int back, preBack; |
|
int r; |
|
int prevCC, trailCC=cc; |
|
|
|
if(start<current && cc!=0) { |
|
|
|
preBack=back=current; |
|
PrevArgs prevArgs = new PrevArgs(); |
|
prevArgs.current = current; |
|
prevArgs.start = start; |
|
prevArgs.src = source; |
|
|
|
prevCC=getPrevCC(prevArgs); |
|
preBack = prevArgs.current; |
|
|
|
if(cc<prevCC) { |
|
|
|
trailCC=prevCC; |
|
back=preBack; |
|
while(start<preBack) { |
|
prevCC=getPrevCC(prevArgs); |
|
preBack=prevArgs.current; |
|
if(cc>=prevCC) { |
|
break; |
|
} |
|
back=preBack; |
|
} |
|
|
|
|
|
// this is where we are right now with all these indicies: |
|
// [start]..[pPreBack] 0..? code points that we can ignore |
|
// [pPreBack]..[pBack] 0..1 code points with prevCC<=cc |
|
// [pBack]..[current] 0..n code points with >cc, move up to insert (c, c2) |
|
// [current]..[p] 1 code point (c, c2) with cc |
|
|
|
|
|
r=p; |
|
do { |
|
source[--r]=source[--current]; |
|
} while(back!=current); |
|
} |
|
} |
|
|
|
|
|
source[current]=c; |
|
if(c2!=0) { |
|
source[(current+1)]=c2; |
|
} |
|
|
|
|
|
return trailCC; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
private static int mergeOrdered(char[] source, |
|
int start, |
|
int current, |
|
char[] data, |
|
int next, |
|
int limit, |
|
boolean isOrdered) { |
|
int r; |
|
int cc, trailCC=0; |
|
boolean adjacent; |
|
|
|
adjacent= current==next; |
|
NextCCArgs ncArgs = new NextCCArgs(); |
|
ncArgs.source = data; |
|
ncArgs.next = next; |
|
ncArgs.limit = limit; |
|
|
|
if(start!=current || !isOrdered) { |
|
|
|
while(ncArgs.next<ncArgs.limit) { |
|
cc=getNextCC(ncArgs); |
|
if(cc==0) { |
|
|
|
trailCC=0; |
|
if(adjacent) { |
|
current=ncArgs.next; |
|
} else { |
|
data[current++]=ncArgs.c; |
|
if(ncArgs.c2!=0) { |
|
data[current++]=ncArgs.c2; |
|
} |
|
} |
|
if(isOrdered) { |
|
break; |
|
} else { |
|
start=current; |
|
} |
|
} else { |
|
r=current+(ncArgs.c2==0 ? 1 : 2); |
|
trailCC=insertOrdered(source,start, current, r, |
|
ncArgs.c, ncArgs.c2, cc); |
|
current=r; |
|
} |
|
} |
|
} |
|
|
|
if(ncArgs.next==ncArgs.limit) { |
|
|
|
return trailCC; |
|
} else { |
|
if(!adjacent) { |
|
|
|
do { |
|
source[current++]=data[ncArgs.next++]; |
|
} while(ncArgs.next!=ncArgs.limit); |
|
ncArgs.limit=current; |
|
} |
|
PrevArgs prevArgs = new PrevArgs(); |
|
prevArgs.src = data; |
|
prevArgs.start = start; |
|
prevArgs.current = ncArgs.limit; |
|
return getPrevCC(prevArgs); |
|
} |
|
|
|
} |
|
private static int mergeOrdered(char[] source, |
|
int start, |
|
int current, |
|
char[] data, |
|
final int next, |
|
final int limit) { |
|
return mergeOrdered(source,start,current,data,next,limit,true); |
|
} |
|
|
|
public static NormalizerBase.QuickCheckResult quickCheck(char[] src, |
|
int srcStart, |
|
int srcLimit, |
|
int minNoMaybe, |
|
int qcMask, |
|
int options, |
|
boolean allowMaybe, |
|
UnicodeSet nx){ |
|
|
|
int ccOrQCMask; |
|
long norm32; |
|
char c, c2; |
|
char cc, prevCC; |
|
long qcNorm32; |
|
NormalizerBase.QuickCheckResult result; |
|
ComposePartArgs args = new ComposePartArgs(); |
|
char[] buffer ; |
|
int start = srcStart; |
|
|
|
if(!isDataLoaded) { |
|
return NormalizerBase.MAYBE; |
|
} |
|
|
|
ccOrQCMask=CC_MASK|qcMask; |
|
result=NormalizerBase.YES; |
|
prevCC=0; |
|
|
|
for(;;) { |
|
for(;;) { |
|
if(srcStart==srcLimit) { |
|
return result; |
|
} else if((c=src[srcStart++])>=minNoMaybe && |
|
(( norm32=getNorm32(c)) & ccOrQCMask)!=0) { |
|
break; |
|
} |
|
prevCC=0; |
|
} |
|
|
|
|
|
|
|
if(isNorm32LeadSurrogate(norm32)) { |
|
|
|
if(srcStart!=srcLimit&& UTF16.isTrailSurrogate(c2=src[srcStart])) { |
|
++srcStart; |
|
norm32=getNorm32FromSurrogatePair(norm32,c2); |
|
} else { |
|
norm32=0; |
|
c2=0; |
|
} |
|
}else{ |
|
c2=0; |
|
} |
|
if(nx_contains(nx, c, c2)) { |
|
|
|
norm32=0; |
|
} |
|
|
|
|
|
cc=(char)((norm32>>CC_SHIFT)&0xFF); |
|
if(cc!=0 && cc<prevCC) { |
|
return NormalizerBase.NO; |
|
} |
|
prevCC=cc; |
|
|
|
|
|
qcNorm32 = norm32 & qcMask; |
|
if((qcNorm32& QC_ANY_NO)>=1) { |
|
result= NormalizerBase.NO; |
|
break; |
|
} else if(qcNorm32!=0) { |
|
|
|
if(allowMaybe){ |
|
result=NormalizerBase.MAYBE; |
|
}else{ |
|
// normalize a section around here to see if it is really |
|
|
|
int prevStarter; |
|
int decompQCMask; |
|
|
|
decompQCMask=(qcMask<<2)&0xf; |
|
|
|
// find the previous starter |
|
|
|
|
|
prevStarter=srcStart-1; |
|
if(UTF16.isTrailSurrogate(src[prevStarter])) { |
|
// safe because unpaired surrogates do not result |
|
|
|
--prevStarter; |
|
} |
|
prevStarter=findPreviousStarter(src, start, prevStarter, |
|
ccOrQCMask, decompQCMask, |
|
(char)minNoMaybe); |
|
|
|
// find the next true starter in [src..limit[ - modifies |
|
|
|
srcStart=findNextStarter(src,srcStart, srcLimit, qcMask, |
|
decompQCMask,(char) minNoMaybe); |
|
|
|
|
|
args.prevCC = prevCC; |
|
|
|
|
|
buffer = composePart(args,prevStarter,src,srcStart,srcLimit,options,nx); |
|
|
|
|
|
if(0!=strCompare(buffer,0,args.length,src,prevStarter,srcStart, false)) { |
|
result=NormalizerBase.NO; |
|
break; |
|
} |
|
|
|
// continue after the next starter |
|
} |
|
} |
|
} |
|
return result; |
|
} |
|
|
|
|
|
//------------------------------------------------------ |
|
// make NFD & NFKD |
|
//------------------------------------------------------ |
|
|
|
public static int decompose(char[] src,int srcStart,int srcLimit, |
|
char[] dest,int destStart,int destLimit, |
|
boolean compat,int[] outTrailCC, |
|
UnicodeSet nx) { |
|
|
|
char[] buffer = new char[3]; |
|
int prevSrc; |
|
long norm32; |
|
int ccOrQCMask, qcMask; |
|
int reorderStartIndex, length; |
|
char c, c2, minNoMaybe; |
|
int cc, prevCC, trailCC; |
|
char[] p; |
|
int pStart; |
|
int destIndex = destStart; |
|
int srcIndex = srcStart; |
|
if(!compat) { |
|
minNoMaybe=(char)indexes[INDEX_MIN_NFD_NO_MAYBE]; |
|
qcMask=QC_NFD; |
|
} else { |
|
minNoMaybe=(char)indexes[INDEX_MIN_NFKD_NO_MAYBE]; |
|
qcMask=QC_NFKD; |
|
} |
|
|
|
|
|
ccOrQCMask=CC_MASK|qcMask; |
|
reorderStartIndex=0; |
|
prevCC=0; |
|
norm32=0; |
|
c=0; |
|
pStart=0; |
|
|
|
cc=trailCC=-1; |
|
|
|
for(;;) { |
|
|
|
|
|
*/ |
|
prevSrc=srcIndex; |
|
|
|
while(srcIndex!=srcLimit &&((c=src[srcIndex])<minNoMaybe || |
|
((norm32=getNorm32(c))&ccOrQCMask)==0)){ |
|
prevCC=0; |
|
++srcIndex; |
|
} |
|
|
|
|
|
if(srcIndex!=prevSrc) { |
|
length=srcIndex-prevSrc; |
|
if((destIndex+length)<=destLimit) { |
|
System.arraycopy(src,prevSrc,dest,destIndex,length); |
|
} |
|
|
|
destIndex+=length; |
|
reorderStartIndex=destIndex; |
|
} |
|
|
|
|
|
if(srcIndex==srcLimit) { |
|
break; |
|
} |
|
|
|
|
|
++srcIndex; |
|
|
|
/* check one above-minimum, relevant code unit */ |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
if(isNorm32HangulOrJamo(norm32)) { |
|
if(nx_contains(nx, c)) { |
|
c2=0; |
|
p=null; |
|
length=1; |
|
} else { |
|
|
|
p=buffer; |
|
pStart=0; |
|
cc=trailCC=0; |
|
|
|
c-=HANGUL_BASE; |
|
|
|
c2=(char)(c%JAMO_T_COUNT); |
|
c/=JAMO_T_COUNT; |
|
if(c2>0) { |
|
buffer[2]=(char)(JAMO_T_BASE+c2); |
|
length=3; |
|
} else { |
|
length=2; |
|
} |
|
|
|
buffer[1]=(char)(JAMO_V_BASE+c%JAMO_V_COUNT); |
|
buffer[0]=(char)(JAMO_L_BASE+c/JAMO_V_COUNT); |
|
} |
|
} else { |
|
if(isNorm32Regular(norm32)) { |
|
c2=0; |
|
length=1; |
|
} else { |
|
|
|
if(srcIndex!=srcLimit && |
|
UTF16.isTrailSurrogate(c2=src[srcIndex])) { |
|
++srcIndex; |
|
length=2; |
|
norm32=getNorm32FromSurrogatePair(norm32, c2); |
|
} else { |
|
c2=0; |
|
length=1; |
|
norm32=0; |
|
} |
|
} |
|
|
|
|
|
if(nx_contains(nx, c, c2)) { |
|
|
|
cc=trailCC=0; |
|
p=null; |
|
} else if((norm32&qcMask)==0) { |
|
|
|
cc=trailCC=(int)((UNSIGNED_BYTE_MASK) & (norm32>>CC_SHIFT)); |
|
p=null; |
|
pStart=-1; |
|
} else { |
|
DecomposeArgs arg = new DecomposeArgs(); |
|
|
|
|
|
*/ |
|
pStart=decompose(norm32, qcMask, arg); |
|
p=extraData; |
|
length=arg.length; |
|
cc=arg.cc; |
|
trailCC=arg.trailCC; |
|
if(length==1) { |
|
|
|
c=p[pStart]; |
|
c2=0; |
|
p=null; |
|
pStart=-1; |
|
} |
|
} |
|
} |
|
|
|
|
|
|
|
*/ |
|
if((destIndex+length)<=destLimit) { |
|
int reorderSplit=destIndex; |
|
if(p==null) { |
|
|
|
if(cc!=0 && cc<prevCC) { |
|
|
|
|
|
*/ |
|
destIndex+=length; |
|
trailCC=insertOrdered(dest,reorderStartIndex, |
|
reorderSplit, destIndex, c, c2, cc); |
|
} else { |
|
|
|
dest[destIndex++]=c; |
|
if(c2!=0) { |
|
dest[destIndex++]=c2; |
|
} |
|
} |
|
} else { |
|
|
|
|
|
*/ |
|
if(cc!=0 && cc<prevCC) { |
|
|
|
|
|
*/ |
|
destIndex+=length; |
|
trailCC=mergeOrdered(dest,reorderStartIndex, |
|
reorderSplit,p, pStart,pStart+length); |
|
} else { |
|
|
|
do { |
|
dest[destIndex++]=p[pStart++]; |
|
} while(--length>0); |
|
} |
|
} |
|
} else { |
|
/* buffer overflow */ |
|
|
|
destIndex+=length; |
|
} |
|
|
|
prevCC=trailCC; |
|
if(prevCC==0) { |
|
reorderStartIndex=destIndex; |
|
} |
|
} |
|
|
|
outTrailCC[0]=prevCC; |
|
|
|
return destIndex - destStart; |
|
} |
|
|
|
|
|
private static final class NextCombiningArgs{ |
|
char[] source; |
|
int start; |
|
|
|
char c; |
|
char c2; |
|
int combiningIndex; |
|
char cc; |
|
} |
|
|
|
|
|
private static int getNextCombining(NextCombiningArgs args, |
|
int limit, |
|
UnicodeSet nx) { |
|
long norm32; |
|
int combineFlags; |
|
|
|
args.c=args.source[args.start++]; |
|
norm32=getNorm32(args.c); |
|
|
|
|
|
args.c2=0; |
|
args.combiningIndex=0; |
|
args.cc=0; |
|
|
|
if((norm32&(CC_MASK|COMBINES_ANY))==0) { |
|
return 0; |
|
} else { |
|
if(isNorm32Regular(norm32)) { |
|
/* set cc etc. below */ |
|
} else if(isNorm32HangulOrJamo(norm32)) { |
|
|
|
args.combiningIndex=(int)((UNSIGNED_INT_MASK)&(0xfff0| |
|
(norm32>>EXTRA_SHIFT))); |
|
return (int)(norm32&COMBINES_ANY); |
|
} else { |
|
|
|
if(args.start!=limit && UTF16.isTrailSurrogate(args.c2= |
|
args.source[args.start])) { |
|
++args.start; |
|
norm32=getNorm32FromSurrogatePair(norm32, args.c2); |
|
} else { |
|
args.c2=0; |
|
return 0; |
|
} |
|
} |
|
|
|
if(nx_contains(nx, args.c, args.c2)) { |
|
return 0; /* excluded: norm32==0 */ |
|
} |
|
|
|
args.cc= (char)((norm32>>CC_SHIFT)&0xff); |
|
|
|
combineFlags=(int)(norm32&COMBINES_ANY); |
|
if(combineFlags!=0) { |
|
int index = getExtraDataIndex(norm32); |
|
args.combiningIndex=index>0 ? extraData[(index-1)] :0; |
|
} |
|
|
|
return combineFlags; |
|
} |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
private static int getCombiningIndexFromStarter(char c,char c2){ |
|
long norm32; |
|
|
|
norm32=getNorm32(c); |
|
if(c2!=0) { |
|
norm32=getNorm32FromSurrogatePair(norm32, c2); |
|
} |
|
return extraData[(getExtraDataIndex(norm32)-1)]; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
private static int combine(char[]table,int tableStart, |
|
int combineBackIndex, |
|
int[] outValues) { |
|
int key; |
|
int value,value2; |
|
|
|
if(outValues.length<2){ |
|
throw new IllegalArgumentException(); |
|
} |
|
|
|
|
|
for(;;) { |
|
key=table[tableStart++]; |
|
if(key>=combineBackIndex) { |
|
break; |
|
} |
|
tableStart+= ((table[tableStart]&0x8000) != 0)? 2 : 1; |
|
} |
|
|
|
|
|
if((key&0x7fff)==combineBackIndex) { |
|
|
|
value=table[tableStart]; |
|
|
|
|
|
key=(int)((UNSIGNED_INT_MASK)&((value&0x2000)+1)); |
|
|
|
|
|
|
|
*/ |
|
if((value&0x8000) != 0) { |
|
if((value&0x4000) != 0) { |
|
|
|
value=(int)((UNSIGNED_INT_MASK)&((value&0x3ff)|0xd800)); |
|
value2=table[tableStart+1]; |
|
} else { |
|
|
|
value=table[tableStart+1]; |
|
value2=0; |
|
} |
|
} else { |
|
|
|
value&=0x1fff; |
|
value2=0; |
|
} |
|
outValues[0]=value; |
|
outValues[1]=value2; |
|
return key; |
|
} else { |
|
|
|
return 0; |
|
} |
|
} |
|
|
|
|
|
private static final class RecomposeArgs{ |
|
char[] source; |
|
int start; |
|
int limit; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
private static char recompose(RecomposeArgs args, int options, UnicodeSet nx) { |
|
int remove, q, r; |
|
int combineFlags; |
|
int combineFwdIndex, combineBackIndex; |
|
int result, value=0, value2=0; |
|
int prevCC; |
|
boolean starterIsSupplementary; |
|
int starter; |
|
int[] outValues = new int[2]; |
|
starter=-1; |
|
combineFwdIndex=0; |
|
starterIsSupplementary=false; |
|
prevCC=0; |
|
|
|
NextCombiningArgs ncArg = new NextCombiningArgs(); |
|
ncArg.source = args.source; |
|
|
|
ncArg.cc =0; |
|
ncArg.c2 =0; |
|
|
|
for(;;) { |
|
ncArg.start = args.start; |
|
combineFlags=getNextCombining(ncArg,args.limit,nx); |
|
combineBackIndex=ncArg.combiningIndex; |
|
args.start = ncArg.start; |
|
|
|
if(((combineFlags&COMBINES_BACK)!=0) && starter!=-1) { |
|
if((combineBackIndex&0x8000)!=0) { |
|
/* c is a Jamo V/T, see if we can compose it with the |
|
* previous character |
|
*/ |
|
|
|
if((options&BEFORE_PRI_29)!=0 || prevCC==0) { |
|
remove=-1; |
|
combineFlags=0; |
|
ncArg.c2=args.source[starter]; |
|
if(combineBackIndex==0xfff2) { |
|
|
|
|
|
*/ |
|
ncArg.c2=(char)(ncArg.c2-JAMO_L_BASE); |
|
if(ncArg.c2<JAMO_L_COUNT) { |
|
remove=args.start-1; |
|
ncArg.c=(char)(HANGUL_BASE+(ncArg.c2*JAMO_V_COUNT+ |
|
(ncArg.c-JAMO_V_BASE))*JAMO_T_COUNT); |
|
if(args.start!=args.limit && |
|
(ncArg.c2=(char)(args.source[args.start] |
|
-JAMO_T_BASE))<JAMO_T_COUNT) { |
|
++args.start; |
|
ncArg.c+=ncArg.c2; |
|
} else { |
|
|
|
combineFlags=COMBINES_FWD; |
|
} |
|
if(!nx_contains(nx, ncArg.c)) { |
|
args.source[starter]=ncArg.c; |
|
} else { |
|
|
|
if(!isHangulWithoutJamoT(ncArg.c)) { |
|
--args.start; /* undo the ++args.start from reading the Jamo T */ |
|
} |
|
|
|
remove=args.start; |
|
} |
|
} |
|
|
|
/* |
|
* Normally, the following can not occur: |
|
* Since the input is in NFD, there are no Hangul LV syllables that |
|
* a Jamo T could combine with. |
|
* All Jamo Ts are combined above when handling Jamo Vs. |
|
* |
|
* However, before the PRI #29 fix, this can occur due to |
|
* an intervening combining mark between the Hangul LV and the Jamo T. |
|
*/ |
|
} else { |
|
|
|
if(isHangulWithoutJamoT(ncArg.c2)) { |
|
ncArg.c2+=ncArg.c-JAMO_T_BASE; |
|
if(!nx_contains(nx, ncArg.c2)) { |
|
remove=args.start-1; |
|
args.source[starter]=ncArg.c2; |
|
} |
|
} |
|
} |
|
|
|
if(remove!=-1) { |
|
|
|
q=remove; |
|
r=args.start; |
|
while(r<args.limit) { |
|
args.source[q++]=args.source[r++]; |
|
} |
|
args.start=remove; |
|
args.limit=q; |
|
} |
|
|
|
ncArg.c2=0; /* c2 held *starter temporarily */ |
|
|
|
if(combineFlags!=0) { |
|
/* |
|
* not starter=NULL because the composition is a Hangul LV syllable |
|
* and might combine once more (but only before the PRI #29 fix) |
|
*/ |
|
|
|
|
|
if(args.start==args.limit) { |
|
return (char)prevCC; |
|
} |
|
|
|
|
|
combineFwdIndex=0xfff0; |
|
|
|
|
|
continue; |
|
} |
|
} |
|
|
|
/* |
|
* now: cc==0 and the combining index does not include |
|
* "forward" -> the rest of the loop body will reset starter |
|
* to NULL; technically, a composed Hangul syllable is a |
|
* starter, but it does not combine forward now that we have |
|
* consumed all eligible Jamos; for Jamo V/T, combineFlags |
|
* does not contain _NORM_COMBINES_FWD |
|
*/ |
|
|
|
} else if( |
|
|
|
!((combineFwdIndex&0x8000)!=0) && |
|
|
|
((options&BEFORE_PRI_29)!=0 ? |
|
(prevCC!=ncArg.cc || prevCC==0) : |
|
(prevCC<ncArg.cc || prevCC==0)) && |
|
|
|
0!=(result=combine(combiningTable,combineFwdIndex, |
|
combineBackIndex, outValues)) && |
|
|
|
!nx_contains(nx, (char)value, (char)value2) |
|
) { |
|
value=outValues[0]; |
|
value2=outValues[1]; |
|
|
|
|
|
*/ |
|
remove= ncArg.c2==0 ? args.start-1 : args.start-2; /* index to the combining mark */ |
|
|
|
|
|
args.source[starter]=(char)value; |
|
if(starterIsSupplementary) { |
|
if(value2!=0) { |
|
|
|
args.source[starter+1]=(char)value2; |
|
} else { |
|
|
|
* move the intermediate characters forward one */ |
|
starterIsSupplementary=false; |
|
q=starter+1; |
|
r=q+1; |
|
while(r<remove) { |
|
args.source[q++]=args.source[r++]; |
|
} |
|
--remove; |
|
} |
|
} else if(value2!=0) { |
|
starterIsSupplementary=true; |
|
args.source[starter+1]=(char)value2; |
|
/* } else { both are on the BMP, nothing more to do */ |
|
} |
|
|
|
|
|
* over it */ |
|
if(remove<args.start) { |
|
q=remove; |
|
r=args.start; |
|
while(r<args.limit) { |
|
args.source[q++]=args.source[r++]; |
|
} |
|
args.start=remove; |
|
args.limit=q; |
|
} |
|
|
|
/* keep prevCC because we removed the combining mark */ |
|
|
|
|
|
if(args.start==args.limit) { |
|
return (char)prevCC; |
|
} |
|
|
|
|
|
if(result>1) { |
|
combineFwdIndex=getCombiningIndexFromStarter((char)value, |
|
(char)value2); |
|
} else { |
|
starter=-1; |
|
} |
|
|
|
|
|
continue; |
|
} |
|
} |
|
|
|
|
|
prevCC=ncArg.cc; |
|
if(args.start==args.limit) { |
|
return (char)prevCC; |
|
} |
|
|
|
|
|
if(ncArg.cc==0) { |
|
|
|
if((combineFlags&COMBINES_FWD)!=0) { |
|
|
|
if(ncArg.c2==0) { |
|
starterIsSupplementary=false; |
|
starter=args.start-1; |
|
} else { |
|
starterIsSupplementary=false; |
|
starter=args.start-2; |
|
} |
|
combineFwdIndex=combineBackIndex; |
|
} else { |
|
|
|
starter=-1; |
|
} |
|
} else if((options&OPTIONS_COMPOSE_CONTIGUOUS)!=0) { |
|
|
|
starter=-1; |
|
} |
|
} |
|
} |
|
|
|
// find the last true starter between src[start]....src[current] going |
|
|
|
private static int findPreviousStarter(char[]src, int srcStart, int current, |
|
int ccOrQCMask, |
|
int decompQCMask, |
|
char minNoMaybe) { |
|
long norm32; |
|
PrevArgs args = new PrevArgs(); |
|
args.src = src; |
|
args.start = srcStart; |
|
args.current = current; |
|
|
|
while(args.start<args.current) { |
|
norm32= getPrevNorm32(args, minNoMaybe, ccOrQCMask|decompQCMask); |
|
if(isTrueStarter(norm32, ccOrQCMask, decompQCMask)) { |
|
break; |
|
} |
|
} |
|
return args.current; |
|
} |
|
|
|
|
|
|
|
*/ |
|
private static int findNextStarter(char[] src,int start,int limit, |
|
int qcMask, |
|
int decompQCMask, |
|
char minNoMaybe) { |
|
int p; |
|
long norm32; |
|
int ccOrQCMask; |
|
char c, c2; |
|
|
|
ccOrQCMask=CC_MASK|qcMask; |
|
|
|
DecomposeArgs decompArgs = new DecomposeArgs(); |
|
|
|
for(;;) { |
|
if(start==limit) { |
|
break; /* end of string */ |
|
} |
|
c=src[start]; |
|
if(c<minNoMaybe) { |
|
break; /* catches NUL terminater, too */ |
|
} |
|
|
|
norm32=getNorm32(c); |
|
if((norm32&ccOrQCMask)==0) { |
|
break; /* true starter */ |
|
} |
|
|
|
if(isNorm32LeadSurrogate(norm32)) { |
|
|
|
if((start+1)==limit || |
|
!UTF16.isTrailSurrogate(c2=(src[start+1]))){ |
|
|
|
break; |
|
} |
|
norm32=getNorm32FromSurrogatePair(norm32, c2); |
|
|
|
if((norm32&ccOrQCMask)==0) { |
|
break; /* true starter */ |
|
} |
|
} else { |
|
c2=0; |
|
} |
|
|
|
|
|
if((norm32&decompQCMask)!=0) { |
|
|
|
* extra data */ |
|
p=decompose(norm32, decompQCMask, decompArgs); |
|
|
|
|
|
* starter */ |
|
if(decompArgs.cc==0 && (getNorm32(extraData,p, qcMask)&qcMask)==0) { |
|
break; /* true starter */ |
|
} |
|
} |
|
|
|
start+= c2==0 ? 1 : 2; /* not a true starter, continue */ |
|
} |
|
|
|
return start; |
|
} |
|
|
|
|
|
/*
 * In/out parameter holder for composePart(): compose() fills both fields
 * before the call and reads both back afterwards.
 */
private static final class ComposePartArgs{
    /* composePart() overwrites this with the return value of recompose()
     * when recomposition runs */
    int prevCC;
    /* on return from composePart(): number of chars that are valid in the
     * returned buffer */
    int length;   /* length of decomposed part */
}
|
|
|
|
|
private static char[] composePart(ComposePartArgs args, |
|
int prevStarter, |
|
char[] src, int start, int limit, |
|
int options, |
|
UnicodeSet nx) { |
|
int recomposeLimit; |
|
boolean compat =((options&OPTIONS_COMPAT)!=0); |
|
|
|
|
|
int[] outTrailCC = new int[1]; |
|
char[] buffer = new char[(limit-prevStarter)*MAX_BUFFER_SIZE]; |
|
|
|
for(;;){ |
|
args.length=decompose(src,prevStarter,(start), |
|
buffer,0,buffer.length, |
|
compat,outTrailCC,nx); |
|
if(args.length<=buffer.length){ |
|
break; |
|
}else{ |
|
buffer = new char[args.length]; |
|
} |
|
} |
|
|
|
|
|
recomposeLimit=args.length; |
|
|
|
if(args.length>=2) { |
|
RecomposeArgs rcArgs = new RecomposeArgs(); |
|
rcArgs.source = buffer; |
|
rcArgs.start = 0; |
|
rcArgs.limit = recomposeLimit; |
|
args.prevCC=recompose(rcArgs, options, nx); |
|
recomposeLimit = rcArgs.limit; |
|
} |
|
|
|
|
|
args.length=recomposeLimit; |
|
return buffer; |
|
} |
|
|
|
private static boolean composeHangul(char prev, char c, |
|
long norm32, |
|
char[] src,int[] srcIndex, int limit, |
|
boolean compat, |
|
char[] dest,int destIndex, |
|
UnicodeSet nx) { |
|
int start=srcIndex[0]; |
|
if(isJamoVTNorm32JamoV(norm32)) { |
|
|
|
* following Jamo T */ |
|
prev=(char)(prev-JAMO_L_BASE); |
|
if(prev<JAMO_L_COUNT) { |
|
c=(char)(HANGUL_BASE+(prev*JAMO_V_COUNT+ |
|
(c-JAMO_V_BASE))*JAMO_T_COUNT); |
|
|
|
|
|
* compatibility) */ |
|
if(start!=limit) { |
|
char next, t; |
|
|
|
next=src[start]; |
|
if((t=(char)(next-JAMO_T_BASE))<JAMO_T_COUNT) { |
|
|
|
++start; |
|
c+=t; |
|
} else if(compat) { |
|
|
|
* (BMP only) */ |
|
norm32=getNorm32(next); |
|
if(isNorm32Regular(norm32) && ((norm32&QC_NFKD)!=0)) { |
|
int p /*index into extra data array*/; |
|
DecomposeArgs dcArgs = new DecomposeArgs(); |
|
p=decompose(norm32, QC_NFKD, dcArgs); |
|
if(dcArgs.length==1 && |
|
(t=(char)(extraData[p]-JAMO_T_BASE)) |
|
<JAMO_T_COUNT) { |
|
|
|
++start; |
|
c+=t; |
|
} |
|
} |
|
} |
|
} |
|
if(nx_contains(nx, c)) { |
|
if(!isHangulWithoutJamoT(c)) { |
|
--start; /* undo ++start from reading the Jamo T */ |
|
} |
|
return false; |
|
} |
|
dest[destIndex]=c; |
|
srcIndex[0]=start; |
|
return true; |
|
} |
|
} else if(isHangulWithoutJamoT(prev)) { |
|
|
|
* contain a Jamo T */ |
|
c=(char)(prev+(c-JAMO_T_BASE)); |
|
if(nx_contains(nx, c)) { |
|
return false; |
|
} |
|
dest[destIndex]=c; |
|
srcIndex[0]=start; |
|
return true; |
|
} |
|
return false; |
|
} |
|
/* |
|
public static int compose(char[] src, char[] dest,boolean compat, UnicodeSet nx){ |
|
return compose(src,0,src.length,dest,0,dest.length,compat, nx); |
|
} |
|
*/ |
|
|
|
public static int compose(char[] src, int srcStart, int srcLimit, |
|
char[] dest,int destStart,int destLimit, |
|
int options,UnicodeSet nx) { |
|
|
|
int prevSrc, prevStarter; |
|
long norm32; |
|
int ccOrQCMask, qcMask; |
|
int reorderStartIndex, length; |
|
char c, c2, minNoMaybe; |
|
int cc, prevCC; |
|
int[] ioIndex = new int[1]; |
|
int destIndex = destStart; |
|
int srcIndex = srcStart; |
|
|
|
if((options&OPTIONS_COMPAT)!=0) { |
|
minNoMaybe=(char)indexes[INDEX_MIN_NFKC_NO_MAYBE]; |
|
qcMask=QC_NFKC; |
|
} else { |
|
minNoMaybe=(char)indexes[INDEX_MIN_NFC_NO_MAYBE]; |
|
qcMask=QC_NFC; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
prevStarter=srcIndex; |
|
|
|
ccOrQCMask=CC_MASK|qcMask; |
|
reorderStartIndex=0; |
|
prevCC=0; |
|
|
|
|
|
norm32=0; |
|
c=0; |
|
|
|
for(;;) { |
|
|
|
* the quick check */ |
|
prevSrc=srcIndex; |
|
|
|
while(srcIndex!=srcLimit && ((c=src[srcIndex])<minNoMaybe || |
|
((norm32=getNorm32(c))&ccOrQCMask)==0)) { |
|
prevCC=0; |
|
++srcIndex; |
|
} |
|
|
|
|
|
|
|
if(srcIndex!=prevSrc) { |
|
length=srcIndex-prevSrc; |
|
if((destIndex+length)<=destLimit) { |
|
System.arraycopy(src,prevSrc,dest,destIndex,length); |
|
} |
|
destIndex+=length; |
|
reorderStartIndex=destIndex; |
|
|
|
|
|
* loop */ |
|
prevStarter=srcIndex-1; |
|
if(UTF16.isTrailSurrogate(src[prevStarter]) && |
|
prevSrc<prevStarter && |
|
UTF16.isLeadSurrogate(src[(prevStarter-1)])) { |
|
--prevStarter; |
|
} |
|
|
|
prevSrc=srcIndex; |
|
} |
|
|
|
|
|
if(srcIndex==srcLimit) { |
|
break; |
|
} |
|
|
|
|
|
++srcIndex; |
|
|
|
/* |
|
* source buffer pointers: |
|
* |
|
* all done quick check current char not yet |
|
* "yes" but (c, c2) processed |
|
* may combine |
|
* forward |
|
* [-------------[-------------[-------------[-------------[ |
|
* | | | | | |
|
* start prevStarter prevSrc src limit |
|
* |
|
* |
|
* destination buffer pointers and indexes: |
|
* |
|
* all done might take not filled yet |
|
* characters for |
|
* reordering |
|
* [-------------[-------------[-------------[ |
|
* | | | | |
|
* dest reorderStartIndex destIndex destCapacity |
|
*/ |
|
|
|
/* check one above-minimum, relevant code unit */ |
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
if(isNorm32HangulOrJamo(norm32)) { |
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
prevCC=cc=0; |
|
reorderStartIndex=destIndex; |
|
ioIndex[0]=srcIndex; |
|
if( |
|
destIndex>0 && |
|
composeHangul(src[(prevSrc-1)], c, norm32,src, ioIndex, |
|
srcLimit, (options&OPTIONS_COMPAT)!=0, dest, |
|
destIndex<=destLimit ? destIndex-1: 0, |
|
nx) |
|
) { |
|
srcIndex=ioIndex[0]; |
|
prevStarter=srcIndex; |
|
continue; |
|
} |
|
|
|
srcIndex = ioIndex[0]; |
|
|
|
|
|
* append to dest */ |
|
c2=0; |
|
length=1; |
|
prevStarter=prevSrc; |
|
} else { |
|
if(isNorm32Regular(norm32)) { |
|
c2=0; |
|
length=1; |
|
} else { |
|
|
|
if(srcIndex!=srcLimit && |
|
UTF16.isTrailSurrogate(c2=src[srcIndex])) { |
|
++srcIndex; |
|
length=2; |
|
norm32=getNorm32FromSurrogatePair(norm32, c2); |
|
} else { |
|
|
|
c2=0; |
|
length=1; |
|
norm32=0; |
|
} |
|
} |
|
ComposePartArgs args =new ComposePartArgs(); |
|
|
|
|
|
if(nx_contains(nx, c, c2)) { |
|
|
|
cc=0; |
|
} else if((norm32&qcMask)==0) { |
|
cc=(int)((UNSIGNED_BYTE_MASK)&(norm32>>CC_SHIFT)); |
|
} else { |
|
char[] p; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
int decompQCMask=(qcMask<<2)&0xf; /* decomposition quick check mask */ |
|
|
|
|
|
|
|
|
|
*/ |
|
if(isTrueStarter(norm32, CC_MASK|qcMask, decompQCMask)) { |
|
prevStarter=prevSrc; |
|
} else { |
|
|
|
destIndex-=prevSrc-prevStarter; |
|
} |
|
|
|
|
|
srcIndex=findNextStarter(src, srcIndex,srcLimit, qcMask, |
|
decompQCMask, minNoMaybe); |
|
|
|
args.prevCC = prevCC; |
|
|
|
args.length = length; |
|
p=composePart(args,prevStarter,src,srcIndex,srcLimit,options,nx); |
|
|
|
if(p==null) { |
|
|
|
break; |
|
} |
|
|
|
prevCC = args.prevCC; |
|
length = args.length; |
|
|
|
|
|
* buffer */ |
|
if((destIndex+args.length)<=destLimit) { |
|
int i=0; |
|
while(i<args.length) { |
|
dest[destIndex++]=p[i++]; |
|
--length; |
|
} |
|
} else { |
|
/* buffer overflow */ |
|
|
|
destIndex+=length; |
|
} |
|
|
|
prevStarter=srcIndex; |
|
continue; |
|
} |
|
} |
|
|
|
|
|
if((destIndex+length)<=destLimit) { |
|
if(cc!=0 && cc<prevCC) { |
|
|
|
* text */ |
|
int reorderSplit= destIndex; |
|
destIndex+=length; |
|
prevCC=insertOrdered(dest,reorderStartIndex, reorderSplit, |
|
destIndex, c, c2, cc); |
|
} else { |
|
|
|
dest[destIndex++]=c; |
|
if(c2!=0) { |
|
dest[destIndex++]=c2; |
|
} |
|
prevCC=cc; |
|
} |
|
} else { |
|
/* buffer overflow */ |
|
|
|
destIndex+=length; |
|
prevCC=cc; |
|
} |
|
} |
|
|
|
return destIndex - destStart; |
|
} |
|
|
|
public static int getCombiningClass(int c) { |
|
long norm32; |
|
norm32=getNorm32(c); |
|
return (int)((norm32>>CC_SHIFT)&0xFF); |
|
} |
|
|
|
public static boolean isFullCompositionExclusion(int c) { |
|
if(isFormatVersion_2_1) { |
|
int aux =AuxTrieImpl.auxTrie.getCodePointValue(c); |
|
return (aux & AUX_COMP_EX_MASK)!=0; |
|
} else { |
|
return false; |
|
} |
|
} |
|
|
|
public static boolean isCanonSafeStart(int c) { |
|
if(isFormatVersion_2_1) { |
|
int aux = AuxTrieImpl.auxTrie.getCodePointValue(c); |
|
return (aux & AUX_UNSAFE_MASK)==0; |
|
} else { |
|
return false; |
|
} |
|
} |
|
|
|
|
|
public static boolean isNFSkippable(int c, NormalizerBase.Mode mode, long mask) { |
|
long norm32; |
|
mask = mask & UNSIGNED_INT_MASK; |
|
char aux; |
|
|
|
|
|
norm32 = getNorm32(c); |
|
|
|
if((norm32&mask)!=0) { |
|
return false; /* fails (a)..(e), not skippable */ |
|
} |
|
|
|
if(mode == NormalizerBase.NFD || mode == NormalizerBase.NFKD || mode == NormalizerBase.NONE){ |
|
return true; /* NF*D, passed (a)..(c), is skippable */ |
|
} |
|
/* check conditions (a)..(e), see unormimp.h */ |
|
|
|
|
|
if((norm32& QC_NFD)==0) { |
|
return true; /* no canonical decomposition, is skippable */ |
|
} |
|
|
|
|
|
if(isNorm32HangulOrJamo(norm32)) { |
|
|
|
return !isHangulWithoutJamoT((char)c); /* LVT are skippable, LV are not */ |
|
} |
|
|
|
/* if(mode<=UNORM_NFKC) { -- enable when implementing FCC */ |
|
|
|
if(!isFormatVersion_2_2) { |
|
return false; /* no (f) data, say not skippable to be safe */ |
|
} |
|
|
|
|
|
aux = AuxTrieImpl.auxTrie.getCodePointValue(c); |
|
return (aux&AUX_NFC_SKIP_F_MASK)==0; /* TRUE=skippable if the (f) flag is not set */ |
|
|
|
/* } else { FCC, test fcd<=1 instead of the above } */ |
|
} |
|
|
|
public static UnicodeSet addPropertyStarts(UnicodeSet set) { |
|
int c; |
|
|
|
/* add the start code point of each same-value range of each trie */ |
|
|
|
TrieIterator normIter = new TrieIterator(NormTrieImpl.normTrie); |
|
RangeValueIterator.Element normResult = new RangeValueIterator.Element(); |
|
|
|
while(normIter.next(normResult)){ |
|
set.add(normResult.start); |
|
} |
|
|
|
|
|
TrieIterator fcdIter = new TrieIterator(FCDTrieImpl.fcdTrie); |
|
RangeValueIterator.Element fcdResult = new RangeValueIterator.Element(); |
|
|
|
while(fcdIter.next(fcdResult)){ |
|
set.add(fcdResult.start); |
|
} |
|
|
|
if(isFormatVersion_2_1){ |
|
|
|
TrieIterator auxIter = new TrieIterator(AuxTrieImpl.auxTrie); |
|
RangeValueIterator.Element auxResult = new RangeValueIterator.Element(); |
|
while(auxIter.next(auxResult)){ |
|
set.add(auxResult.start); |
|
} |
|
} |
|
|
|
for(c=HANGUL_BASE; c<HANGUL_BASE+HANGUL_COUNT; c+=JAMO_T_COUNT) { |
|
set.add(c); |
|
set.add(c+1); |
|
} |
|
set.add(HANGUL_BASE+HANGUL_COUNT); /* add Hangul+1 to continue with other properties */ |
|
return set; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
public static final int quickCheck(int c, int modeValue) { |
|
final int qcMask[/*UNORM_MODE_COUNT*/]={ |
|
0, 0, QC_NFD, QC_NFKD, QC_NFC, QC_NFKC |
|
}; |
|
|
|
int norm32=(int)getNorm32(c)&qcMask[modeValue]; |
|
|
|
if(norm32==0) { |
|
return 1; |
|
} else if((norm32&QC_ANY_NO)!=0) { |
|
return 0; |
|
} else { |
|
return 2; |
|
} |
|
} |
|
|
|
private static int strCompare(char[] s1, int s1Start, int s1Limit, |
|
char[] s2, int s2Start, int s2Limit, |
|
boolean codePointOrder) { |
|
|
|
int start1, start2, limit1, limit2; |
|
|
|
char c1, c2; |
|
|
|
|
|
start1=s1Start; |
|
start2=s2Start; |
|
|
|
int length1, length2; |
|
|
|
length1 = s1Limit - s1Start; |
|
length2 = s2Limit - s2Start; |
|
|
|
int lengthResult; |
|
|
|
if(length1<length2) { |
|
lengthResult=-1; |
|
limit1=start1+length1; |
|
} else if(length1==length2) { |
|
lengthResult=0; |
|
limit1=start1+length1; |
|
} else { |
|
lengthResult=1; |
|
limit1=start1+length2; |
|
} |
|
|
|
if(s1==s2) { |
|
return lengthResult; |
|
} |
|
|
|
for(;;) { |
|
|
|
if(s1Start==limit1) { |
|
return lengthResult; |
|
} |
|
|
|
c1=s1[s1Start]; |
|
c2=s2[s2Start]; |
|
if(c1!=c2) { |
|
break; |
|
} |
|
++s1Start; |
|
++s2Start; |
|
} |
|
|
|
|
|
limit1=start1+length1; |
|
limit2=start2+length2; |
|
|
|
|
|
|
|
if(c1>=0xd800 && c2>=0xd800 && codePointOrder) { |
|
|
|
* supplementary ones */ |
|
if( |
|
( c1<=0xdbff && (s1Start+1)!=limit1 && |
|
UTF16.isTrailSurrogate(s1[(s1Start+1)]) |
|
) || |
|
( UTF16.isTrailSurrogate(c1) && start1!=s1Start && |
|
UTF16.isLeadSurrogate(s1[(s1Start-1)]) |
|
) |
|
) { |
|
/* part of a surrogate pair, leave >=d800 */ |
|
} else { |
|
|
|
c1-=0x2800; |
|
} |
|
|
|
if( |
|
( c2<=0xdbff && (s2Start+1)!=limit2 && |
|
UTF16.isTrailSurrogate(s2[(s2Start+1)]) |
|
) || |
|
( UTF16.isTrailSurrogate(c2) && start2!=s2Start && |
|
UTF16.isLeadSurrogate(s2[(s2Start-1)]) |
|
) |
|
) { |
|
/* part of a surrogate pair, leave >=d800 */ |
|
} else { |
|
|
|
c2-=0x2800; |
|
} |
|
} |
|
|
|
|
|
return (int)c1-(int)c2; |
|
} |
|
|
|
|
|
/* |
|
* Status of tailored normalization |
|
* |
|
* This was done initially for investigation on Unicode public review issue 7 |
|
* (http://www.unicode.org/review/). See Jitterbug 2481. |
|
* While the UTC at meeting #94 (2003mar) did not take up the issue, this is |
|
* a permanent feature in ICU 2.6 in support of IDNA which requires true |
|
* Unicode 3.2 normalization. |
|
* (NormalizationCorrections are rolled into IDNA mapping tables.) |
|
* |
|
* Tailored normalization as implemented here allows to "normalize less" |
|
* than full Unicode normalization would. |
|
* Based internally on a UnicodeSet of code points that are |
|
* "excluded from normalization", the normalization functions leave those |
|
* code points alone ("inert"). This means that tailored normalization |
|
* still transforms text into a canonically equivalent form. |
|
* It does not add decompositions to code points that do not have any or |
|
* change decomposition results. |
|
* |
|
* Any function that searches for a safe boundary has not been touched, |
|
* which means that these functions will be over-pessimistic when |
|
* exclusions are applied. |
|
* This should not matter because subsequent checks and normalizations |
|
* do apply the exclusions; only a little more of the text may be processed |
|
* than necessary under exclusions. |
|
* |
|
* Normalization exclusions have the following effect on excluded code points c: |
|
* - c is not decomposed |
|
* - c is not a composition target |
|
* - c does not combine forward or backward for composition |
|
* except that this is not implemented for Jamo |
|
* - c is treated as having a combining class of 0 |
|
*/ |
|
|
|
|
|
|
|
|
|
|
|
|
|
/*
 * Masks for the low byte of the options word, which selects the
 * normalization exclusion sets:
 *  - OPTIONS_NX_MASK      bits 0..4; NX_HANGUL and NX_CJK_COMPAT lie in here
 *  - OPTIONS_UNICODE_MASK bits 5..7; internalGetNXUnicode() switches on
 *                         these bits to pick a Unicode version
 *  - OPTIONS_SETS_MASK    both fields together
 */
private static final int OPTIONS_NX_MASK=0x1f;
private static final int OPTIONS_UNICODE_MASK=0xe0;
public static final int OPTIONS_SETS_MASK=0xff;

/* cache of exclusion sets, indexed by (options & OPTIONS_SETS_MASK);
 * entries are created on demand by the internalGetNX*() methods */
private static final UnicodeSet[] nxCache = new UnicodeSet[OPTIONS_SETS_MASK+1];
|
|
|
/* Constants for options flags for normalization.*/ |
|
|
|
|
|
|
|
|
|
/*
 * Option bit (and nxCache index) for the Hangul exclusion set, which
 * internalGetNXHangul() builds as the range U+AC00..U+D7A3.
 */
private static final int NX_HANGUL = 1;

/*
 * Option bit (and nxCache index) for the CJK exclusion set, which
 * internalGetNXCJKCompat() builds from the [:Ideographic:] code points
 * whose norm32 value has the QC_NFD bit set.
 */
private static final int NX_CJK_COMPAT=2;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/*
 * Option bit read by recompose(): when set, a combining mark is allowed
 * to combine if prevCC differs from its combining class (or prevCC is 0);
 * when clear, prevCC must be lower than its combining class (or 0).
 */
public static final int BEFORE_PRI_29=0x100;
|
|
|
/* |
|
* The following options are used only in some composition functions. |
|
* They use bits 12 and up to preserve lower bits for the available options |
|
* space in unorm_compare() - |
|
* see documentation for UNORM_COMPARE_NORM_OPTIONS_SHIFT. |
|
*/ |
|
|
|
|
|
/* Option bit: compose() and composePart() use the compatibility (NFKC/NFKD)
 * data when this is set, and the canonical data otherwise. */
public static final int OPTIONS_COMPAT=0x1000;

/* Option bit read by recompose(): when set, a character with a non-zero
 * combining class that did not combine makes recompose() drop the current
 * starter. */
public static final int OPTIONS_COMPOSE_CONTIGUOUS=0x2000;
|
|
|
/* normalization exclusion sets --------------------------------------------- */ |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
private static final synchronized UnicodeSet internalGetNXHangul() { |
|
/* internal function, does not check for incoming U_FAILURE */ |
|
|
|
if(nxCache[NX_HANGUL]==null) { |
|
nxCache[NX_HANGUL]=new UnicodeSet(0xac00, 0xd7a3); |
|
} |
|
return nxCache[NX_HANGUL]; |
|
} |
|
|
|
private static final synchronized UnicodeSet internalGetNXCJKCompat() { |
|
/* internal function, does not check for incoming U_FAILURE */ |
|
|
|
if(nxCache[NX_CJK_COMPAT]==null) { |
|
|
|
|
|
UnicodeSet set, hasDecomp; |
|
|
|
set=new UnicodeSet("[:Ideographic:]"); |
|
|
|
|
|
hasDecomp=new UnicodeSet(); |
|
|
|
|
|
UnicodeSetIterator it = new UnicodeSetIterator(set); |
|
int start, end; |
|
long norm32; |
|
|
|
while(it.nextRange() && (it.codepoint != UnicodeSetIterator.IS_STRING)) { |
|
start=it.codepoint; |
|
end=it.codepointEnd; |
|
while(start<=end) { |
|
norm32 = getNorm32(start); |
|
if((norm32 & QC_NFD)>0) { |
|
hasDecomp.add(start); |
|
} |
|
++start; |
|
} |
|
} |
|
|
|
|
|
nxCache[NX_CJK_COMPAT]=hasDecomp; |
|
|
|
} |
|
|
|
return nxCache[NX_CJK_COMPAT]; |
|
} |
|
|
|
private static final synchronized UnicodeSet internalGetNXUnicode(int options) { |
|
options &= OPTIONS_UNICODE_MASK; |
|
if(options==0) { |
|
return null; |
|
} |
|
|
|
if(nxCache[options]==null) { |
|
|
|
UnicodeSet set = new UnicodeSet(); |
|
|
|
switch(options) { |
|
case NormalizerBase.UNICODE_3_2: |
|
set.applyPattern("[:^Age=3.2:]"); |
|
break; |
|
default: |
|
return null; |
|
} |
|
|
|
nxCache[options]=set; |
|
} |
|
|
|
return nxCache[options]; |
|
} |
|
|
|
|
|
private static final synchronized UnicodeSet internalGetNX(int options) { |
|
options&=OPTIONS_SETS_MASK; |
|
|
|
if(nxCache[options]==null) { |
|
|
|
if(options==NX_HANGUL) { |
|
return internalGetNXHangul(); |
|
} |
|
if(options==NX_CJK_COMPAT) { |
|
return internalGetNXCJKCompat(); |
|
} |
|
if((options & OPTIONS_UNICODE_MASK)!=0 && (options & OPTIONS_NX_MASK)==0) { |
|
return internalGetNXUnicode(options); |
|
} |
|
|
|
|
|
UnicodeSet set; |
|
UnicodeSet other; |
|
|
|
set=new UnicodeSet(); |
|
|
|
|
|
if((options & NX_HANGUL)!=0 && null!=(other=internalGetNXHangul())) { |
|
set.addAll(other); |
|
} |
|
if((options&NX_CJK_COMPAT)!=0 && null!=(other=internalGetNXCJKCompat())) { |
|
set.addAll(other); |
|
} |
|
if((options&OPTIONS_UNICODE_MASK)!=0 && null!=(other=internalGetNXUnicode(options))) { |
|
set.addAll(other); |
|
} |
|
|
|
nxCache[options]=set; |
|
} |
|
return nxCache[options]; |
|
} |
|
|
|
public static final UnicodeSet getNX(int options) { |
|
if((options&=OPTIONS_SETS_MASK)==0) { |
|
|
|
return null; |
|
} else { |
|
return internalGetNX(options); |
|
} |
|
} |
|
|
|
private static final boolean nx_contains(UnicodeSet nx, int c) { |
|
return nx!=null && nx.contains(c); |
|
} |
|
|
|
private static final boolean nx_contains(UnicodeSet nx, char c, char c2) { |
|
return nx!=null && nx.contains(c2==0 ? c : UCharacterProperty.getRawSupplementary(c, c2)); |
|
} |
|
|
|
/*****************************************************************************/ |
|
|
|
/** |
|
* Get the canonical decomposition |
|
* sherman for ComposedCharIter |
|
*/ |
|
|
|
public static int getDecompose(int chars[], String decomps[]) { |
|
DecomposeArgs args = new DecomposeArgs(); |
|
int length=0; |
|
long norm32 = 0; |
|
int ch = -1; |
|
int index = 0; |
|
int i = 0; |
|
|
|
while (++ch < 0x2fa1e) { //no cannoical above 0x3ffff |
|
//TBD !!!! the hack code heres save us about 50ms for startup |
|
|
|
if (ch == 0x30ff) |
|
ch = 0xf900; |
|
else if (ch == 0x10000) |
|
ch = 0x1d15e; |
|
else if (ch == 0x1d1c1) |
|
ch = 0x2f800; |
|
|
|
norm32 = NormalizerImpl.getNorm32(ch); |
|
if((norm32 & QC_NFD)!=0 && i < chars.length) { |
|
chars[i] = ch; |
|
index = decompose(norm32, args); |
|
decomps[i++] = new String(extraData,index, args.length); |
|
} |
|
} |
|
return i; |
|
} |
|
|
|
//------------------------------------------------------ |
|
// special method for Collation |
|
|
|
private static boolean needSingleQuotation(char c) { |
|
return (c >= 0x0009 && c <= 0x000D) || |
|
(c >= 0x0020 && c <= 0x002F) || |
|
(c >= 0x003A && c <= 0x0040) || |
|
(c >= 0x005B && c <= 0x0060) || |
|
(c >= 0x007B && c <= 0x007E); |
|
} |
|
|
|
public static String canonicalDecomposeWithSingleQuotation(String string) { |
|
char[] src = string.toCharArray(); |
|
int srcIndex = 0; |
|
int srcLimit = src.length; |
|
char[] dest = new char[src.length * 3]; |
|
int destIndex = 0; |
|
int destLimit = dest.length; |
|
|
|
char[] buffer = new char[3]; |
|
int prevSrc; |
|
long norm32; |
|
int ccOrQCMask; |
|
int qcMask = QC_NFD; |
|
int reorderStartIndex, length; |
|
char c, c2; |
|
char minNoMaybe = (char)indexes[INDEX_MIN_NFD_NO_MAYBE]; |
|
int cc, prevCC, trailCC; |
|
char[] p; |
|
int pStart; |
|
|
|
|
|
|
|
ccOrQCMask = CC_MASK | qcMask; |
|
reorderStartIndex = 0; |
|
prevCC = 0; |
|
norm32 = 0; |
|
c = 0; |
|
pStart = 0; |
|
|
|
cc = trailCC = -1; |
|
for(;;) { |
|
prevSrc=srcIndex; |
|
|
|
while (srcIndex != srcLimit && |
|
(( c = src[srcIndex]) < minNoMaybe || |
|
((norm32 = getNorm32(c)) & ccOrQCMask) == 0 || |
|
( c >= '\uac00' && c <= '\ud7a3'))){ |
|
|
|
prevCC = 0; |
|
++srcIndex; |
|
} |
|
|
|
|
|
if (srcIndex != prevSrc) { |
|
length = srcIndex - prevSrc; |
|
if ((destIndex + length) <= destLimit) { |
|
System.arraycopy(src,prevSrc,dest,destIndex,length); |
|
} |
|
|
|
destIndex += length; |
|
reorderStartIndex = destIndex; |
|
} |
|
|
|
|
|
if(srcIndex == srcLimit) { |
|
break; |
|
} |
|
|
|
++srcIndex; |
|
|
|
if(isNorm32Regular(norm32)) { |
|
c2 = 0; |
|
length = 1; |
|
} else { |
|
|
|
if(srcIndex != srcLimit && |
|
Character.isLowSurrogate(c2 = src[srcIndex])) { |
|
++srcIndex; |
|
length = 2; |
|
norm32 = getNorm32FromSurrogatePair(norm32, c2); |
|
} else { |
|
c2 = 0; |
|
length = 1; |
|
norm32 = 0; |
|
} |
|
} |
|
|
|
|
|
if((norm32 & qcMask) == 0) { |
|
|
|
cc = trailCC = (int)((UNSIGNED_BYTE_MASK) & (norm32 >> CC_SHIFT)); |
|
p = null; |
|
pStart = -1; |
|
} else { |
|
DecomposeArgs arg = new DecomposeArgs(); |
|
// c decomposes, get everything from the variable-length |
|
|
|
pStart = decompose(norm32, qcMask, arg); |
|
p = extraData; |
|
length = arg.length; |
|
cc = arg.cc; |
|
trailCC = arg.trailCC; |
|
if(length == 1) { |
|
|
|
c = p[pStart]; |
|
c2 = 0; |
|
p = null; |
|
pStart = -1; |
|
} |
|
} |
|
|
|
if((destIndex + length * 3) >= destLimit) { // 2 SingleQuotations |
|
|
|
char[] tmpBuf = new char[destLimit * 2]; |
|
System.arraycopy(dest, 0, tmpBuf, 0, destIndex); |
|
dest = tmpBuf; |
|
destLimit = dest.length; |
|
} |
|
|
|
{ |
|
int reorderSplit = destIndex; |
|
if(p == null) { |
|
|
|
if (needSingleQuotation(c)) { |
|
//if we need single quotation, no need to consider "prevCC" |
|
|
|
dest[destIndex++] = '\''; |
|
dest[destIndex++] = c; |
|
dest[destIndex++] = '\''; |
|
trailCC = 0; |
|
} else if(cc != 0 && cc < prevCC) { |
|
// (c, c2) is out of order with respect to the preceding |
|
|
|
destIndex += length; |
|
trailCC = insertOrdered(dest,reorderStartIndex, |
|
reorderSplit, destIndex, c, c2, cc); |
|
} else { |
|
|
|
dest[destIndex++] = c; |
|
if(c2 != 0) { |
|
dest[destIndex++] = c2; |
|
} |
|
} |
|
} else { |
|
// general: multiple code points (ordered by themselves) |
|
|
|
if (needSingleQuotation(p[pStart])) { |
|
dest[destIndex++] = '\''; |
|
dest[destIndex++] = p[pStart++]; |
|
dest[destIndex++] = '\''; |
|
length--; |
|
do { |
|
dest[destIndex++] = p[pStart++]; |
|
} while(--length > 0); |
|
} else |
|
if(cc != 0 && cc < prevCC) { |
|
destIndex += length; |
|
trailCC = mergeOrdered(dest,reorderStartIndex, |
|
reorderSplit,p, pStart,pStart+length); |
|
} else { |
|
|
|
do { |
|
dest[destIndex++] = p[pStart++]; |
|
} while(--length > 0); |
|
} |
|
} |
|
} |
|
prevCC = trailCC; |
|
if(prevCC == 0) { |
|
reorderStartIndex = destIndex; |
|
} |
|
} |
|
return new String(dest, 0, destIndex); |
|
} |
|
|
|
//------------------------------------------------------ |
|
// mapping method for IDNA/StringPrep |
|
//------------------------------------------------------ |
|
|
|
/* |
|
* Normalization using NormalizerBase.UNICODE_3_2 option supports Unicode |
|
* 3.2 normalization with Corrigendum 4 corrections. However, normalization |
|
* without the corrections is necessary for IDNA/StringPrep support. |
|
* This method is called when NormalizerBase.UNICODE_3_2_0_ORIGINAL option |
|
* (= sun.text.Normalizer.UNICODE_3_2) is used and normalizes five |
|
* characters in Corrigendum 4 before normalization in order to avoid |
|
* incorrect normalization. |
|
* For the Corrigendum 4 issue, refer |
|
* http://www.unicode.org/versions/corrigendum4.html |
|
*/ |
|
|
|
|
|
|
|
/*
 * Option bit for normalization without the Corrigendum 4 corrections
 * (see the comment block above).
 */
public static final int WITHOUT_CORRIGENDUM4_CORRECTIONS=0x40000;

/*
 * Replacement text for the five code points that convert() rewrites.
 * Row i is appended for the i-th case label in convert(), as noted on
 * each row.
 */
private static final char[][] corrigendum4MappingTable = {
    {'\uD844', '\uDF6A'},   /* replaces U+2F868 */
    {'\u5F33'},             /* replaces U+2F874 */
    {'\u43AB'},             /* replaces U+2F91F */
    {'\u7AAE'},             /* replaces U+2F95F */
    {'\u4D57'}};            /* replaces U+2F9BF */
|
|
|
|
|
|
|
|
|
*/ |
|
public static String convert(String str) { |
|
if (str == null) { |
|
return null; |
|
} |
|
|
|
int ch = UCharacterIterator.DONE; |
|
StringBuffer dest = new StringBuffer(); |
|
UCharacterIterator iter = UCharacterIterator.getInstance(str); |
|
|
|
while ((ch=iter.nextCodePoint())!= UCharacterIterator.DONE){ |
|
switch (ch) { |
|
case 0x2F868: |
|
dest.append(corrigendum4MappingTable[0]); |
|
break; |
|
case 0x2F874: |
|
dest.append(corrigendum4MappingTable[1]); |
|
break; |
|
case 0x2F91F: |
|
dest.append(corrigendum4MappingTable[2]); |
|
break; |
|
case 0x2F95F: |
|
dest.append(corrigendum4MappingTable[3]); |
|
break; |
|
case 0x2F9BF: |
|
dest.append(corrigendum4MappingTable[4]); |
|
break; |
|
default: |
|
UTF16.append(dest,ch); |
|
break; |
|
} |
|
} |
|
|
|
return dest.toString(); |
|
} |
|
} |