| /* | |
|  * Copyright (c) 2005, 2013, Oracle and/or its affiliates. All rights reserved. | |
|  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. | |
|  * | |
|  * This code is free software; you can redistribute it and/or modify it | |
|  * under the terms of the GNU General Public License version 2 only, as | |
|  * published by the Free Software Foundation.  Oracle designates this | |
|  * particular file as subject to the "Classpath" exception as provided | |
|  * by Oracle in the LICENSE file that accompanied this code. | |
|  * | |
|  * This code is distributed in the hope that it will be useful, but WITHOUT | |
|  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
|  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License | |
|  * version 2 for more details (a copy is included in the LICENSE file that | |
|  * accompanied this code). | |
|  * | |
|  * You should have received a copy of the GNU General Public License version | |
|  * 2 along with this work; if not, write to the Free Software Foundation, | |
|  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. | |
|  * | |
|  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA | |
|  * or visit www.oracle.com if you need additional information or have any | |
|  * questions. | |
| */ | |
| package java.net; | |
| import java.io.InputStream; | |
| import java.io.IOException; | |
| import java.security.AccessController; | |
| import java.security.PrivilegedAction; | |
| import sun.net.idn.StringPrep; | |
| import sun.net.idn.Punycode; | |
| import sun.text.normalizer.UCharacterIterator; | |
| /** | |
|  * Provides methods to convert internationalized domain names (IDNs) between | |
|  * a normal Unicode representation and an ASCII Compatible Encoding (ACE) representation. | |
|  * Internationalized domain names can use characters from the entire range of | |
|  * Unicode, while traditional domain names are restricted to ASCII characters. | |
|  * ACE is an encoding of Unicode strings that uses only ASCII characters and | |
|  * can be used with software (such as the Domain Name System) that only | |
|  * understands traditional domain names. | |
|  * | |
|  * <p>Internationalized domain names are defined in <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>. | |
|  * RFC 3490 defines two operations: ToASCII and ToUnicode. These two operations employ the | |
|  * <a href="http://www.ietf.org/rfc/rfc3491.txt">Nameprep</a> algorithm, which is a | |
|  * profile of <a href="http://www.ietf.org/rfc/rfc3454.txt">Stringprep</a>, and the | |
|  * <a href="http://www.ietf.org/rfc/rfc3492.txt">Punycode</a> algorithm to convert | |
|  * domain name strings back and forth. | |
|  * | |
|  * <p>The behavior of the aforementioned conversion process can be adjusted by various flags: | |
|  *   <ul> | |
|  *     <li>If the ALLOW_UNASSIGNED flag is used, the domain name string to be converted | |
|  *         can contain code points that are unassigned in Unicode 3.2, which is the | |
|  *         Unicode version on which IDN conversion is based. If the flag is not used, | |
|  *         the presence of such unassigned code points is treated as an error. | |
|  *     <li>If the USE_STD3_ASCII_RULES flag is used, ASCII strings are checked against <a href="http://www.ietf.org/rfc/rfc1122.txt">RFC 1122</a> and <a href="http://www.ietf.org/rfc/rfc1123.txt">RFC 1123</a>. | |
|  *         It is an error if they don't meet the requirements. | |
|  *   </ul> | |
|  * These flags can be logically OR'ed together. | |
|  * | |
|  * <p>Security considerations are important with respect to internationalized | |
|  * domain name support. For example, English domain names may be <i>homographed</i> | |
|  * - maliciously misspelled by substitution of non-Latin letters. | |
|  * <a href="http://www.unicode.org/reports/tr36/">Unicode Technical Report #36</a> | |
|  * discusses security issues of IDN support as well as possible solutions. | |
|  * Applications are responsible for taking adequate security measures when using | |
|  * internationalized domain names. | |
|  * | |
|  * @author Edward Wang | |
|  * @since 1.6 | |
|  * | |
| */ | |
| public final class IDN { | |
|     /** | |
|      * Flag to allow processing of unassigned code points | |
| */ | |
| public static final int ALLOW_UNASSIGNED = 0x01; | |
|     /** | |
|      * Flag to turn on the check against STD-3 ASCII rules | |
| */ | |
| public static final int USE_STD3_ASCII_RULES = 0x02; | |
|     /** | |
|      * Translates a string from Unicode to ASCII Compatible Encoding (ACE), | |
|      * as defined by the ToASCII operation of <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>. | |
|      * | |
|      * <p>ToASCII operation can fail. ToASCII fails if any step of it fails. | |
|      * If ToASCII operation fails, an IllegalArgumentException will be thrown. | |
|      * In this case, the input string should not be used in an internationalized domain name. | |
|      * | |
|      * <p> A label is an individual part of a domain name. The original ToASCII operation, | |
|      * as defined in RFC 3490, only operates on a single label. This method can handle | |
|      * both label and entire domain name, by assuming that labels in a domain name are | |
|      * always separated by dots. The following characters are recognized as dots: | |
|      * \u002E (full stop), \u3002 (ideographic full stop), \uFF0E (fullwidth full stop), | |
|      * and \uFF61 (halfwidth ideographic full stop). If dots are | |
|      * used as label separators, this method also changes all of them to \u002E (full stop) | |
|      * in output translated string. | |
|      * | |
|      * @param input     the string to be processed | |
|      * @param flag      process flag; can be 0 or any logical OR of possible flags | |
|      * | |
|      * @return          the translated {@code String} | |
|      * | |
|      * @throws IllegalArgumentException   if the input string doesn't conform to RFC 3490 specification | |
| */ | |
| public static String toASCII(String input, int flag) | |
|     { | |
| int p = 0, q = 0; | |
| StringBuilder out = new StringBuilder(); | |
| if (isRootLabel(input)) { | |
|             return "."; | |
| } | |
| while (p < input.length()) { | |
| q = searchDots(input, p); | |
| out.append(toASCIIInternal(input.substring(p, q), flag)); | |
| if (q != (input.length())) { | |
|                // has more labels, or keep the trailing dot as at present | |
| out.append('.'); | |
| } | |
| p = q + 1; | |
| } | |
| return out.toString(); | |
| } | |
|     /** | |
|      * Translates a string from Unicode to ASCII Compatible Encoding (ACE), | |
|      * as defined by the ToASCII operation of <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>. | |
|      * | |
|      * <p> This convenience method works as if by invoking the | |
|      * two-argument counterpart as follows: | |
|      * <blockquote> | |
|      * {@link #toASCII(String, int) toASCII}(input, 0); | |
|      * </blockquote> | |
|      * | |
|      * @param input     the string to be processed | |
|      * | |
|      * @return          the translated {@code String} | |
|      * | |
|      * @throws IllegalArgumentException   if the input string doesn't conform to RFC 3490 specification | |
| */ | |
| public static String toASCII(String input) { | |
| return toASCII(input, 0); | |
| } | |
|     /** | |
|      * Translates a string from ASCII Compatible Encoding (ACE) to Unicode, | |
|      * as defined by the ToUnicode operation of <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>. | |
|      * | |
|      * <p>ToUnicode never fails. In case of any error, the input string is returned unmodified. | |
|      * | |
|      * <p> A label is an individual part of a domain name. The original ToUnicode operation, | |
|      * as defined in RFC 3490, only operates on a single label. This method can handle | |
|      * both label and entire domain name, by assuming that labels in a domain name are | |
|      * always separated by dots. The following characters are recognized as dots: | |
|      * \u002E (full stop), \u3002 (ideographic full stop), \uFF0E (fullwidth full stop), | |
|      * and \uFF61 (halfwidth ideographic full stop). | |
|      * | |
|      * @param input     the string to be processed | |
|      * @param flag      process flag; can be 0 or any logical OR of possible flags | |
|      * | |
|      * @return          the translated {@code String} | |
| */ | |
| public static String toUnicode(String input, int flag) { | |
| int p = 0, q = 0; | |
| StringBuilder out = new StringBuilder(); | |
| if (isRootLabel(input)) { | |
|             return "."; | |
| } | |
| while (p < input.length()) { | |
| q = searchDots(input, p); | |
| out.append(toUnicodeInternal(input.substring(p, q), flag)); | |
| if (q != (input.length())) { | |
|                // has more labels, or keep the trailing dot as at present | |
| out.append('.'); | |
| } | |
| p = q + 1; | |
| } | |
| return out.toString(); | |
| } | |
|     /** | |
|      * Translates a string from ASCII Compatible Encoding (ACE) to Unicode, | |
|      * as defined by the ToUnicode operation of <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>. | |
|      * | |
|      * <p> This convenience method works as if by invoking the | |
|      * two-argument counterpart as follows: | |
|      * <blockquote> | |
|      * {@link #toUnicode(String, int) toUnicode}(input, 0); | |
|      * </blockquote> | |
|      * | |
|      * @param input     the string to be processed | |
|      * | |
|      * @return          the translated {@code String} | |
| */ | |
| public static String toUnicode(String input) { | |
| return toUnicode(input, 0); | |
| } | |
| /* ---------------- Private members -------------- */ | |
// Prefix that marks a label as ACE-encoded.
private static final String ACE_PREFIX = "xn--";
private static final int ACE_PREFIX_LENGTH = ACE_PREFIX.length();

// Upper bound on the length of a label produced by toASCIIInternal.
private static final int MAX_LABEL_LENGTH = 63;

// Single shared nameprep instance, built once from the "uidna.spp" profile
// when this class is initialized.
private static StringPrep namePrep = null;

static {
    final String IDN_PROFILE = "uidna.spp";

    // try-with-resources guarantees the profile stream is closed even when
    // the StringPrep constructor throws; previously the stream was closed
    // only on the success path and leaked on failure.
    try (InputStream stream = (System.getSecurityManager() != null)
            // With a security manager installed, read the resource in a
            // privileged block.
            ? AccessController.doPrivileged(new PrivilegedAction<InputStream>() {
                  @Override
                  public InputStream run() {
                      return StringPrep.class.getResourceAsStream(IDN_PROFILE);
                  }
              })
            : StringPrep.class.getResourceAsStream(IDN_PROFILE)) {
        namePrep = new StringPrep(stream);
    } catch (IOException e) {
        // should never reach here; keep the cause visible when assertions
        // are enabled
        assert false : e;
    }
}
| /* ---------------- Private operations -------------- */ | |
//
// Not instantiable: declaring a private constructor suppresses the default
// zero-argument constructor.
//
private IDN() {
}
| // | |
| // toASCII operation; should only apply to a single label | |
|     // | |
| private static String toASCIIInternal(String label, int flag) | |
|     { | |
| // step 1 | |
|         // Check if the string contains code points outside the ASCII range 0..0x7c. | |
| boolean isASCII = isAllASCII(label); | |
| StringBuffer dest; | |
| // step 2 | |
|         // perform the nameprep operation; flag ALLOW_UNASSIGNED is used here | |
| if (!isASCII) { | |
| UCharacterIterator iter = UCharacterIterator.getInstance(label); | |
|             try { | |
| dest = namePrep.prepare(iter, flag); | |
|             } catch (java.text.ParseException e) { | |
| throw new IllegalArgumentException(e); | |
| } | |
|         } else { | |
| dest = new StringBuffer(label); | |
| } | |
| // step 8, move forward to check the smallest number of the code points | |
|         // the length must be inside 1..63 | |
| if (dest.length() == 0) { | |
| throw new IllegalArgumentException( | |
|                         "Empty label is not a legal name"); | |
| } | |
| // step 3 | |
| // Verify the absence of non-LDH ASCII code points | |
| // 0..0x2c, 0x2e..0x2f, 0x3a..0x40, 0x5b..0x60, 0x7b..0x7f | |
|         // Verify the absence of leading and trailing hyphen | |
| boolean useSTD3ASCIIRules = ((flag & USE_STD3_ASCII_RULES) != 0); | |
| if (useSTD3ASCIIRules) { | |
| for (int i = 0; i < dest.length(); i++) { | |
| int c = dest.charAt(i); | |
| if (isNonLDHAsciiCodePoint(c)) { | |
| throw new IllegalArgumentException( | |
|                         "Contains non-LDH ASCII characters"); | |
| } | |
| } | |
| if (dest.charAt(0) == '-' || | |
| dest.charAt(dest.length() - 1) == '-') { | |
| throw new IllegalArgumentException( | |
|                         "Has leading or trailing hyphen"); | |
| } | |
| } | |
| if (!isASCII) { | |
| // step 4 | |
|             // If all code points are inside 0..0x7f, skip to step 8 | |
| if (!isAllASCII(dest.toString())) { | |
| // step 5 | |
|                 // verify the sequence does not begin with ACE prefix | |
| if(!startsWithACEPrefix(dest)){ | |
| // step 6 | |
|                     // encode the sequence with punycode | |
|                     try { | |
| dest = Punycode.encode(dest, null); | |
|                     } catch (java.text.ParseException e) { | |
| throw new IllegalArgumentException(e); | |
| } | |
| dest = toASCIILower(dest); | |
| // step 7 | |
|                     // prepend the ACE prefix | |
| dest.insert(0, ACE_PREFIX); | |
|                 } else { | |
| throw new IllegalArgumentException("The input starts with the ACE Prefix"); | |
| } | |
| } | |
| } | |
| // step 8 | |
|         // the length must be inside 1..63 | |
| if (dest.length() > MAX_LABEL_LENGTH) { | |
| throw new IllegalArgumentException("The label in the input is too long"); | |
| } | |
| return dest.toString(); | |
| } | |
| // | |
| // toUnicode operation; should only apply to a single label | |
|     // | |
| private static String toUnicodeInternal(String label, int flag) { | |
| boolean[] caseFlags = null; | |
| StringBuffer dest; | |
| // step 1 | |
|         // find out if all the codepoints in input are ASCII | |
| boolean isASCII = isAllASCII(label); | |
| if(!isASCII){ | |
| // step 2 | |
|             // perform the nameprep operation; flag ALLOW_UNASSIGNED is used here | |
|             try { | |
| UCharacterIterator iter = UCharacterIterator.getInstance(label); | |
| dest = namePrep.prepare(iter, flag); | |
| } catch (Exception e) { | |
|                 // toUnicode never fails; if any step fails, return the input string | |
| return label; | |
| } | |
|         } else { | |
| dest = new StringBuffer(label); | |
| } | |
| // step 3 | |
|         // verify ACE Prefix | |
| if(startsWithACEPrefix(dest)) { | |
| // step 4 | |
|             // Remove the ACE Prefix | |
| String temp = dest.substring(ACE_PREFIX_LENGTH, dest.length()); | |
|             try { | |
| // step 5 | |
|                 // Decode using punycode | |
| StringBuffer decodeOut = Punycode.decode(new StringBuffer(temp), null); | |
| // step 6 | |
|                 // Apply toASCII | |
| String toASCIIOut = toASCII(decodeOut.toString(), flag); | |
| // step 7 | |
|                 // verify | |
| if (toASCIIOut.equalsIgnoreCase(dest.toString())) { | |
| // step 8 | |
|                     // return output of step 5 | |
| return decodeOut.toString(); | |
| } | |
| } catch (Exception ignored) { | |
| // no-op | |
| } | |
| } | |
|         // just return the input | |
| return label; | |
| } | |
| // | |
| // LDH stands for "letter/digit/hyphen", with characters restricted to the | |
| // 26-letter Latin alphabet <A-Z a-z>, the digits <0-9>, and the hyphen | |
| // <->. | |
| // Non LDH refers to characters in the ASCII range, but which are not | |
| // letters, digits or the hypen. | |
| // | |
| // non-LDH = 0..0x2C, 0x2E..0x2F, 0x3A..0x40, 0x5B..0x60, 0x7B..0x7F | |
|     // | |
|     private static boolean isNonLDHAsciiCodePoint(int ch){ | |
| return (0x0000 <= ch && ch <= 0x002C) || | |
| (0x002E <= ch && ch <= 0x002F) || | |
| (0x003A <= ch && ch <= 0x0040) || | |
| (0x005B <= ch && ch <= 0x0060) || | |
| (0x007B <= ch && ch <= 0x007F); | |
| } | |
| // | |
| // search dots in a string and return the index of that character; | |
| // or if there is no dots, return the length of input string | |
| // dots might be: \u002E (full stop), \u3002 (ideographic full stop), \uFF0E (fullwidth full stop), | |
| // and \uFF61 (halfwidth ideographic full stop). | |
|     // | |
| private static int searchDots(String s, int start) { | |
| int i; | |
| for (i = start; i < s.length(); i++) { | |
| if (isLabelSeparator(s.charAt(i))) { | |
| break; | |
| } | |
| } | |
| return i; | |
| } | |
| // | |
| // to check if a string is a root label, ".". | |
|     // | |
| private static boolean isRootLabel(String s) { | |
| return (s.length() == 1 && isLabelSeparator(s.charAt(0))); | |
| } | |
| // | |
| // to check if a character is a label separator, i.e. a dot character. | |
|     // | |
|     private static boolean isLabelSeparator(char c) { | |
| return (c == '.' || c == '\u3002' || c == '\uFF0E' || c == '\uFF61'); | |
| } | |
| // | |
| // to check if a string only contains US-ASCII code point | |
|     // | |
| private static boolean isAllASCII(String input) { | |
| boolean isASCII = true; | |
| for (int i = 0; i < input.length(); i++) { | |
| int c = input.charAt(i); | |
| if (c > 0x7F) { | |
| isASCII = false; | |
| break; | |
| } | |
| } | |
| return isASCII; | |
| } | |
| // | |
| // to check if a string starts with ACE-prefix | |
|     // | |
| private static boolean startsWithACEPrefix(StringBuffer input){ | |
| boolean startsWithPrefix = true; | |
| if(input.length() < ACE_PREFIX_LENGTH){ | |
| return false; | |
| } | |
| for(int i = 0; i < ACE_PREFIX_LENGTH; i++){ | |
| if(toASCIILower(input.charAt(i)) != ACE_PREFIX.charAt(i)){ | |
| startsWithPrefix = false; | |
| } | |
| } | |
| return startsWithPrefix; | |
| } | |
|     private static char toASCIILower(char ch){ | |
|         if('A' <= ch && ch <= 'Z'){ | |
| return (char)(ch + 'a' - 'A'); | |
| } | |
| return ch; | |
| } | |
| private static StringBuffer toASCIILower(StringBuffer input){ | |
| StringBuffer dest = new StringBuffer(); | |
| for(int i = 0; i < input.length();i++){ | |
| dest.append(toASCIILower(input.charAt(i))); | |
| } | |
| return dest; | |
| } | |
| } |