/*
 * Copyright (c) 1998, 2013, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
|
package javax.swing.text.html.parser;  | 
|
import javax.swing.text.SimpleAttributeSet;  | 
|
import javax.swing.text.html.HTMLEditorKit;  | 
|
import javax.swing.text.html.HTML;  | 
|
import javax.swing.text.ChangedCharSetException;  | 
|
import java.util.*;  | 
|
import java.io.*;  | 
|
import java.net.*;  | 
|
/** | 
|
 * A Parser for HTML Documents (actually, you can specify a DTD, but | 
|
 * you should really only use this class with the html dtd in swing). | 
|
 * Reads an InputStream of HTML and | 
|
 * invokes the appropriate methods in the ParserCallback class. This | 
|
 * is the default parser used by HTMLEditorKit to parse HTML url's. | 
|
 * <p>This will message the callback for all valid tags, as well as | 
|
 * tags that are implied but not explicitly specified. For example, the | 
|
 * html string (<p>blah) only has a p tag defined. The callback | 
|
 * will see the following methods: | 
|
 * <ol><li><i>handleStartTag(html, ...)</i></li> | 
|
 *     <li><i>handleStartTag(head, ...)</i></li> | 
|
 *     <li><i>handleEndTag(head)</i></li> | 
|
 *     <li><i>handleStartTag(body, ...)</i></li> | 
|
 *     <li><i>handleStartTag(p, ...)</i></li> | 
|
 *     <li><i>handleText(...)</i></li> | 
|
 *     <li><i>handleEndTag(p)</i></li> | 
|
 *     <li><i>handleEndTag(body)</i></li> | 
|
 *     <li><i>handleEndTag(html)</i></li> | 
|
 * </ol> | 
|
 * The items in <i>italic</i> are implied, that is, although they were not | 
|
 * explicitly specified, to be correct html they should have been present | 
|
 * (head isn't necessary, but it is still generated). For tags that | 
|
 * are implied, the AttributeSet argument will have a value of | 
|
 * <code>Boolean.TRUE</code> for the key | 
|
 * <code>HTMLEditorKit.ParserCallback.IMPLIED</code>. | 
|
 * <p>HTML.Attributes defines a type safe enumeration of html attributes. | 
|
 * If an attribute key of a tag is defined in HTML.Attribute, the | 
|
 * HTML.Attribute will be used as the key, otherwise a String will be used. | 
|
 * For example <p foo=bar class=neat> has two attributes. foo is | 
|
 * not defined in HTML.Attribute, where as class is, therefore the | 
|
 * AttributeSet will have two values in it, HTML.Attribute.CLASS with | 
|
 * a String value of 'neat' and the String key 'foo' with a String value of | 
|
 * 'bar'. | 
|
 * <p>The position argument will indicate the start of the tag, comment | 
|
 * or text. Similar to arrays, the first character in the stream has a | 
|
 * position of 0. For tags that are | 
|
 * implied the position will indicate | 
|
 * the location of the next encountered tag. In the first example, | 
|
 * the implied start body and html tags will have the same position as the | 
|
 * p tag, and the implied end p, html and body tags will all have the same | 
|
 * position. | 
|
 * <p>As html skips whitespace the position for text will be the position | 
|
 * of the first valid character, eg in the string '\n\n\nblah' | 
|
 * the text 'blah' will have a position of 3, the newlines are skipped. | 
|
 * <p> | 
|
 * For attributes that do not have a value, eg in the html | 
|
 * string <code><foo blah></code> the attribute <code>blah</code> | 
|
 * does not have a value, there are two possible values that will be | 
|
 * placed in the AttributeSet's value: | 
|
 * <ul> | 
|
 * <li>If the DTD does not contain an definition for the element, or the | 
|
 *     definition does not have an explicit value then the value in the | 
|
 *     AttributeSet will be <code>HTML.NULL_ATTRIBUTE_VALUE</code>. | 
|
 * <li>If the DTD contains an explicit value, as in: | 
|
 *     <code><!ATTLIST OPTION selected (selected) #IMPLIED></code> | 
|
 *     this value from the dtd (in this case selected) will be used. | 
|
 * </ul> | 
|
 * <p> | 
|
 * Once the stream has been parsed, the callback is notified of the most | 
|
 * likely end of line string. The end of line string will be one of | 
|
 * \n, \r or \r\n, which ever is encountered the most in parsing the | 
|
 * stream. | 
|
 * | 
|
 * @author      Sunita Mani | 
|
*/  | 
|
public class DocumentParser extends javax.swing.text.html.parser.Parser {  | 
|
private int inbody;  | 
|
private int intitle;  | 
|
private int inhead;  | 
|
private int instyle;  | 
|
private int inscript;  | 
|
private boolean seentitle;  | 
|
private HTMLEditorKit.ParserCallback callback = null;  | 
|
private boolean ignoreCharSet = false;  | 
|
private static final boolean debugFlag = false;  | 
|
public DocumentParser(DTD dtd) {  | 
|
super(dtd);  | 
|
}  | 
|
public void parse(Reader in, HTMLEditorKit.ParserCallback callback, boolean ignoreCharSet) throws IOException {  | 
|
this.ignoreCharSet = ignoreCharSet;  | 
|
this.callback = callback;  | 
|
parse(in);  | 
|
        // end of line | 
|
callback.handleEndOfLineString(getEndOfLineString());  | 
|
}  | 
|
    /** | 
|
     * Handle Start Tag. | 
|
*/  | 
|
protected void handleStartTag(TagElement tag) {  | 
|
Element elem = tag.getElement();  | 
|
if (elem == dtd.body) {  | 
|
inbody++;  | 
|
} else if (elem == dtd.html) {  | 
|
} else if (elem == dtd.head) {  | 
|
inhead++;  | 
|
} else if (elem == dtd.title) {  | 
|
intitle++;  | 
|
} else if (elem == dtd.style) {  | 
|
instyle++;  | 
|
} else if (elem == dtd.script) {  | 
|
inscript++;  | 
|
}  | 
|
if (debugFlag) {  | 
|
if (tag.fictional()) {  | 
|
debug("Start Tag: " + tag.getHTMLTag() + " pos: " + getCurrentPos());  | 
|
            } else { | 
|
debug("Start Tag: " + tag.getHTMLTag() + " attributes: " +  | 
|
getAttributes() + " pos: " + getCurrentPos());  | 
|
}  | 
|
}  | 
|
if (tag.fictional()) {  | 
|
SimpleAttributeSet attrs = new SimpleAttributeSet();  | 
|
attrs.addAttribute(HTMLEditorKit.ParserCallback.IMPLIED,  | 
|
Boolean.TRUE);  | 
|
callback.handleStartTag(tag.getHTMLTag(), attrs,  | 
|
getBlockStartPosition());  | 
|
        } else { | 
|
callback.handleStartTag(tag.getHTMLTag(), getAttributes(),  | 
|
getBlockStartPosition());  | 
|
flushAttributes();  | 
|
}  | 
|
}  | 
|
    protected void handleComment(char text[]) { | 
|
if (debugFlag) {  | 
|
debug("comment: ->" + new String(text) + "<-"  | 
|
+ " pos: " + getCurrentPos());  | 
|
}  | 
|
callback.handleComment(text, getBlockStartPosition());  | 
|
}  | 
|
    /** | 
|
     * Handle Empty Tag. | 
|
*/  | 
|
protected void handleEmptyTag(TagElement tag) throws ChangedCharSetException {  | 
|
Element elem = tag.getElement();  | 
|
if (elem == dtd.meta && !ignoreCharSet) {  | 
|
SimpleAttributeSet atts = getAttributes();  | 
|
if (atts != null) {  | 
|
String content = (String)atts.getAttribute(HTML.Attribute.CONTENT);  | 
|
if (content != null) {  | 
|
if ("content-type".equalsIgnoreCase((String)atts.getAttribute(HTML.Attribute.HTTPEQUIV))) {  | 
|
if (!content.equalsIgnoreCase("text/html") &&  | 
|
!content.equalsIgnoreCase("text/plain")) {  | 
|
throw new ChangedCharSetException(content, false);  | 
|
}  | 
|
} else if ("charset" .equalsIgnoreCase((String)atts.getAttribute(HTML.Attribute.HTTPEQUIV))) {  | 
|
throw new ChangedCharSetException(content, true);  | 
|
}  | 
|
}  | 
|
}  | 
|
}  | 
|
if (inbody != 0 || elem == dtd.meta || elem == dtd.base || elem == dtd.isindex || elem == dtd.style || elem == dtd.link) {  | 
|
if (debugFlag) {  | 
|
if (tag.fictional()) {  | 
|
debug("Empty Tag: " + tag.getHTMLTag() + " pos: " + getCurrentPos());  | 
|
                } else { | 
|
debug("Empty Tag: " + tag.getHTMLTag() + " attributes: "  | 
|
+ getAttributes() + " pos: " + getCurrentPos());  | 
|
}  | 
|
}  | 
|
if (tag.fictional()) {  | 
|
SimpleAttributeSet attrs = new SimpleAttributeSet();  | 
|
attrs.addAttribute(HTMLEditorKit.ParserCallback.IMPLIED,  | 
|
Boolean.TRUE);  | 
|
callback.handleSimpleTag(tag.getHTMLTag(), attrs,  | 
|
getBlockStartPosition());  | 
|
            } else { | 
|
callback.handleSimpleTag(tag.getHTMLTag(), getAttributes(),  | 
|
getBlockStartPosition());  | 
|
flushAttributes();  | 
|
}  | 
|
}  | 
|
}  | 
|
    /** | 
|
     * Handle End Tag. | 
|
*/  | 
|
protected void handleEndTag(TagElement tag) {  | 
|
Element elem = tag.getElement();  | 
|
if (elem == dtd.body) {  | 
|
inbody--;  | 
|
} else if (elem == dtd.title) {  | 
|
intitle--;  | 
|
seentitle = true;  | 
|
} else if (elem == dtd.head) {  | 
|
inhead--;  | 
|
} else if (elem == dtd.style) {  | 
|
instyle--;  | 
|
} else if (elem == dtd.script) {  | 
|
inscript--;  | 
|
}  | 
|
if (debugFlag) {  | 
|
debug("End Tag: " + tag.getHTMLTag() + " pos: " + getCurrentPos());  | 
|
}  | 
|
callback.handleEndTag(tag.getHTMLTag(), getBlockStartPosition());  | 
|
}  | 
|
    /** | 
|
     * Handle Text. | 
|
*/  | 
|
    protected void handleText(char data[]) { | 
|
if (data != null) {  | 
|
if (inscript != 0) {  | 
|
callback.handleComment(data, getBlockStartPosition());  | 
|
return;  | 
|
}  | 
|
if (inbody != 0 || ((instyle != 0) ||  | 
|
((intitle != 0) && !seentitle))) {  | 
|
if (debugFlag) {  | 
|
debug("text: ->" + new String(data) + "<-" + " pos: " + getCurrentPos());  | 
|
}  | 
|
callback.handleText(data, getBlockStartPosition());  | 
|
}  | 
|
}  | 
|
}  | 
|
    /* | 
|
     * Error handling. | 
|
*/  | 
|
protected void handleError(int ln, String errorMsg) {  | 
|
if (debugFlag) {  | 
|
debug("Error: ->" + errorMsg + "<-" + " pos: " + getCurrentPos());  | 
|
}  | 
|
        /* PENDING: need to improve the error string. */ | 
|
callback.handleError(errorMsg, getCurrentPos());  | 
|
}  | 
|
    /* | 
|
     * debug messages | 
|
*/  | 
|
private void debug(String msg) {  | 
|
System.out.println(msg);  | 
|
}  | 
|
}  |