|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
|
|
package javax.swing.text.html.parser; |
|
|
|
import javax.swing.text.SimpleAttributeSet; |
|
import javax.swing.text.html.HTML; |
|
import javax.swing.text.ChangedCharSetException; |
|
import java.io.*; |
|
import java.util.Hashtable; |
|
import java.util.Properties; |
|
import java.util.Vector; |
|
import java.util.Enumeration; |
|
import java.net.URL; |
|
|
|
import sun.misc.MessageUtils; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
public |
|
class Parser implements DTDConstants { |
|
|
|
// Buffer accumulating document text until it is passed to handleText/handleTitle.
private char text[] = new char[1024];
// Number of valid characters currently held in text.
private int textpos = 0;
// Most recent tag recorded by handleText(TagElement) / startTag.
private TagElement last;
// Pending-whitespace flag; handleText(TagElement) turns it into at most one ' '.
private boolean space;

// Scratch buffer for identifiers, attribute values and comment text (see addString).
private char str[] = new char[128];
// Number of valid characters currently held in str.
private int strpos = 0;

// DTD consulted for element and entity definitions.
protected DTD dtd = null;

// Current lookahead character; -1 is treated as end of input.
private int ch;
// Line counter, incremented whenever a line break is consumed.
private int ln;
// Reader the document is read from.
private Reader in;

// Element of the tag on top of the stack (maintained by startTag/endTag).
private Element recent;
// Stack of currently open tags.
private TagStack stack;
// Receives the result of ignoreElement inside legalElementContext.
private boolean skipTag = false;
// NOTE(review): not referenced in the visible part of this file.
private TagElement lastFormSent = null;
// Attributes collected for the tag currently being parsed.
private SimpleAttributeSet attributes = new SimpleAttributeSet();
|
|
|
// State for <html>, <head> and <body>.  Since people like to slap
// together HTML documents without thinking, occasionally they
// have multiple instances of these tags.  These booleans track
// the first sightings of these tags so they can be safely ignored
// when seen again.
|
|
|
private boolean seenHtml = false; |
|
private boolean seenHead = false; |
|
private boolean seenBody = false; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
private boolean ignoreSpace; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
protected boolean strict = false; |
|
|
|
|
|
|
|
// Number of "\r\n" line breaks consumed so far.
private int crlfCount;

// Number of '\r' line breaks consumed that were not followed by '\n'.
private int crCount;

// Number of '\n' line breaks consumed ('\n' directly after '\r' is counted in crlfCount instead).
private int lfCount;
|
|
|
// |
|
// To correctly identify the start of a tag/comment/text we need two |
|
// ivars. Two are needed as handleText isn't invoked until the tag |
|
// after the text has been parsed, that is the parser parses the text, |
|
// then a tag, then invokes handleText followed by handleStart. |
|
// |
|
|
|
|
|
/* Start position of the current block, i.e. the current comment,
 * tag, text. Use getBlockStartPosition to access this. */
|
private int currentBlockStartPos; |
|
|
|
private int lastBlockStartPos; |
|
|
|
|
|
|
|
|
|
*/ |
|
// Replacement characters for numeric character references 130..159,
// indexed by (code - 130); consulted by mapNumericReference.
// Entries 141-144 and 157-158 map to their own code.
private static final char[] cp1252Map = {
    8218,  // &#130;
    402,   // &#131;
    8222,  // &#132;
    8230,  // &#133;
    8224,  // &#134;
    8225,  // &#135;
    710,   // &#136;
    8240,  // &#137;
    352,   // &#138;
    8249,  // &#139;
    338,   // &#140;
    141,   // &#141;
    142,   // &#142;
    143,   // &#143;
    144,   // &#144;
    8216,  // &#145;
    8217,  // &#146;
    8220,  // &#147;
    8221,  // &#148;
    8226,  // &#149;
    8211,  // &#150;
    8212,  // &#151;
    732,   // &#152;
    8482,  // &#153;
    353,   // &#154;
    8250,  // &#155;
    339,   // &#156;
    157,   // &#157;
    158,   // &#158;
    376    // &#159;
};
|
|
|
/**
 * Creates a parser that resolves elements and entities against the given DTD.
 *
 * @param dtd the DTD stored in {@link #dtd}
 */
public Parser(final DTD dtd) {
    this.dtd = dtd;
}
|
|
|
|
|
|
|
|
|
*/ |
|
protected int getCurrentLine() { |
|
return ln; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
int getBlockStartPosition() { |
|
return Math.max(0, lastBlockStartPos - 1); |
|
} |
|
|
|
|
|
|
|
*/ |
|
protected TagElement makeTag(Element elem, boolean fictional) { |
|
return new TagElement(elem, fictional); |
|
} |
|
|
|
protected TagElement makeTag(Element elem) { |
|
return makeTag(elem, false); |
|
} |
|
|
|
protected SimpleAttributeSet getAttributes() { |
|
return attributes; |
|
} |
|
|
|
protected void flushAttributes() { |
|
attributes.removeAttributes(attributes); |
|
} |
|
|
|
|
|
|
|
*/ |
|
protected void handleText(char text[]) { |
|
} |
|
|
|
|
|
|
|
*/ |
|
protected void handleTitle(char text[]) { |
|
// default behavior is to call handleText. Subclasses |
|
|
|
handleText(text); |
|
} |
|
|
|
|
|
|
|
*/ |
|
protected void handleComment(char text[]) { |
|
} |
|
|
|
protected void handleEOFInComment() { |
|
// We've reached EOF. Our recovery strategy is to |
|
// see if we have more than one line in the comment; |
|
// if so, we pretend that the comment was an unterminated |
|
// single line comment, and reparse the lines after the |
|
// first line as normal HTML content. |
|
|
|
int commentEndPos = strIndexOf('\n'); |
|
if (commentEndPos >= 0) { |
|
handleComment(getChars(0, commentEndPos)); |
|
try { |
|
in.close(); |
|
in = new CharArrayReader(getChars(commentEndPos + 1)); |
|
ch = '>'; |
|
} catch (IOException e) { |
|
error("ioexception"); |
|
} |
|
|
|
resetStrBuffer(); |
|
} else { |
|
|
|
error("eof.comment"); |
|
} |
|
} |
|
|
|
|
|
|
|
*/ |
|
protected void handleEmptyTag(TagElement tag) throws ChangedCharSetException { |
|
} |
|
|
|
|
|
|
|
*/ |
|
protected void handleStartTag(TagElement tag) { |
|
} |
|
|
|
|
|
|
|
*/ |
|
protected void handleEndTag(TagElement tag) { |
|
} |
|
|
|
|
|
|
|
*/ |
|
protected void handleError(int ln, String msg) { |
|
/* |
|
Thread.dumpStack(); |
|
System.out.println("**** " + stack); |
|
System.out.println("line " + ln + ": error: " + msg); |
|
System.out.println(); |
|
*/ |
|
} |
|
|
|
|
|
|
|
*/ |
|
void handleText(TagElement tag) { |
|
if (tag.breaksFlow()) { |
|
space = false; |
|
if (!strict) { |
|
ignoreSpace = true; |
|
} |
|
} |
|
if (textpos == 0) { |
|
if ((!space) || (stack == null) || last.breaksFlow() || |
|
!stack.advance(dtd.pcdata)) { |
|
last = tag; |
|
space = false; |
|
lastBlockStartPos = currentBlockStartPos; |
|
return; |
|
} |
|
} |
|
if (space) { |
|
if (!ignoreSpace) { |
|
|
|
if (textpos + 1 > text.length) { |
|
char newtext[] = new char[text.length + 200]; |
|
System.arraycopy(text, 0, newtext, 0, text.length); |
|
text = newtext; |
|
} |
|
|
|
|
|
text[textpos++] = ' '; |
|
if (!strict && !tag.getElement().isEmpty()) { |
|
ignoreSpace = true; |
|
} |
|
} |
|
space = false; |
|
} |
|
char newtext[] = new char[textpos]; |
|
System.arraycopy(text, 0, newtext, 0, textpos); |
|
// Handles cases of bad html where the title tag |
|
|
|
if (tag.getElement().getName().equals("title")) { |
|
handleTitle(newtext); |
|
} else { |
|
handleText(newtext); |
|
} |
|
lastBlockStartPos = currentBlockStartPos; |
|
textpos = 0; |
|
last = tag; |
|
space = false; |
|
} |
|
|
|
|
|
|
|
*/ |
|
protected void error(String err, String arg1, String arg2, |
|
String arg3) { |
|
handleError(ln, err + " " + arg1 + " " + arg2 + " " + arg3); |
|
} |
|
|
|
protected void error(String err, String arg1, String arg2) { |
|
error(err, arg1, arg2, "?"); |
|
} |
|
protected void error(String err, String arg1) { |
|
error(err, arg1, "?", "?"); |
|
} |
|
protected void error(String err) { |
|
error(err, "?", "?", "?"); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
protected void startTag(TagElement tag) throws ChangedCharSetException { |
|
Element elem = tag.getElement(); |
|
|
|
// If the tag is an empty tag and texpos != 0 |
|
// this implies that there is text before the |
|
// start tag that needs to be processed before |
|
// handling the tag. |
|
|
|
if (!elem.isEmpty() || |
|
((last != null) && !last.breaksFlow()) || |
|
(textpos != 0)) { |
|
handleText(tag); |
|
} else { |
|
// this variable gets updated in handleText(). |
|
// Since in this case we do not call handleText() |
|
// we need to update it here. |
|
|
|
last = tag; |
|
// Note that we should really check last.breakFlows before |
|
|
|
space = false; |
|
} |
|
lastBlockStartPos = currentBlockStartPos; |
|
|
|
|
|
for (AttributeList a = elem.atts ; a != null ; a = a.next) { |
|
if ((a.modifier == REQUIRED) && |
|
((attributes.isEmpty()) || |
|
((!attributes.isDefined(a.name)) && |
|
(!attributes.isDefined(HTML.getAttributeKey(a.name)))))) { |
|
error("req.att ", a.getName(), elem.getName()); |
|
} |
|
} |
|
|
|
if (elem.isEmpty()) { |
|
handleEmptyTag(tag); |
|
/* |
|
} else if (elem.getName().equals("form")) { |
|
handleStartTag(tag); |
|
*/ |
|
} else { |
|
recent = elem; |
|
stack = new TagStack(tag, stack); |
|
handleStartTag(tag); |
|
} |
|
} |
|
|
|
|
|
|
|
|
|
*/ |
|
protected void endTag(boolean omitted) { |
|
handleText(stack.tag); |
|
|
|
if (omitted && !stack.elem.omitEnd()) { |
|
error("end.missing", stack.elem.getName()); |
|
} else if (!stack.terminate()) { |
|
error("end.unexpected", stack.elem.getName()); |
|
} |
|
|
|
|
|
handleEndTag(stack.tag); |
|
stack = stack.next; |
|
recent = (stack != null) ? stack.elem : null; |
|
} |
|
|
|
|
|
boolean ignoreElement(Element elem) { |
|
|
|
String stackElement = stack.elem.getName(); |
|
String elemName = elem.getName(); |
|
|
|
|
|
|
|
|
|
|
|
been seen. **/ |
|
if ((elemName.equals("html") && seenHtml) || |
|
(elemName.equals("head") && seenHead) || |
|
(elemName.equals("body") && seenBody)) { |
|
return true; |
|
} |
|
if (elemName.equals("dt") || elemName.equals("dd")) { |
|
TagStack s = stack; |
|
while (s != null && !s.elem.getName().equals("dl")) { |
|
s = s.next; |
|
} |
|
if (s == null) { |
|
return true; |
|
} |
|
} |
|
|
|
if (((stackElement.equals("table")) && |
|
(!elemName.equals("#pcdata")) && (!elemName.equals("input"))) || |
|
((elemName.equals("font")) && |
|
(stackElement.equals("ul") || stackElement.equals("ol"))) || |
|
(elemName.equals("meta") && stack != null) || |
|
(elemName.equals("style") && seenBody) || |
|
(stackElement.equals("table") && elemName.equals("a"))) { |
|
return true; |
|
} |
|
return false; |
|
} |
|
|
|
|
|
/** |
|
* Marks the first time a tag has been seen in a document |
|
*/ |
|
|
|
protected void markFirstTime(Element elem) { |
|
String elemName = elem.getName(); |
|
if (elemName.equals("html")) { |
|
seenHtml = true; |
|
} else if (elemName.equals("head")) { |
|
seenHead = true; |
|
} else if (elemName.equals("body")) { |
|
if (buf.length == 1) { |
|
|
|
char[] newBuf = new char[256]; |
|
|
|
newBuf[0] = buf[0]; |
|
buf = newBuf; |
|
} |
|
seenBody = true; |
|
} |
|
} |
|
|
|
|
|
|
|
*/ |
|
boolean legalElementContext(Element elem) throws ChangedCharSetException { |
|
|
|
// System.out.println("-- legalContext -- " + elem); |
|
|
|
|
|
if (stack == null) { |
|
|
|
if (elem != dtd.html) { |
|
|
|
startTag(makeTag(dtd.html, true)); |
|
return legalElementContext(elem); |
|
} |
|
return true; |
|
} |
|
|
|
|
|
if (stack.advance(elem)) { |
|
|
|
markFirstTime(elem); |
|
return true; |
|
} |
|
boolean insertTag = false; |
|
|
|
// The use of all error recovery strategies are contingent |
|
// on the value of the strict property. |
|
// |
|
// These are commonly occurring errors. if insertTag is true, |
|
// then we want to adopt an error recovery strategy that |
|
// involves attempting to insert an additional tag to |
|
// legalize the context. The two errors addressed here |
|
// are: |
|
// 1) when a <td> or <th> is seen soon after a <table> tag. |
|
// In this case we insert a <tr>. |
|
// 2) when any other tag apart from a <tr> is seen |
|
// in the context of a <tr>. In this case we would |
|
// like to add a <td>. If a <tr> is seen within a |
|
// <tr> context, then we will close out the current |
|
// <tr>. |
|
// |
|
// This insertion strategy is handled later in the method. |
|
// The reason for checking this now, is that in other cases |
|
// we would like to apply other error recovery strategies for example |
|
// ignoring tags. |
|
// |
|
// In certain cases it is better to ignore a tag than try to |
|
// fix the situation. So the first test is to see if this |
|
// is what we need to do. |
|
|
|
String stackElemName = stack.elem.getName(); |
|
String elemName = elem.getName(); |
|
|
|
|
|
if (!strict && |
|
((stackElemName.equals("table") && elemName.equals("td")) || |
|
(stackElemName.equals("table") && elemName.equals("th")) || |
|
(stackElemName.equals("tr") && !elemName.equals("tr")))){ |
|
insertTag = true; |
|
} |
|
|
|
|
|
if (!strict && !insertTag && (stack.elem.getName() != elem.getName() || |
|
elem.getName().equals("body"))) { |
|
if (skipTag = ignoreElement(elem)) { |
|
error("tag.ignore", elem.getName()); |
|
return skipTag; |
|
} |
|
} |
|
|
|
// Check for anything after the start of the table besides tr, td, th |
|
// or caption, and if those aren't there, insert the <tr> and call |
|
|
|
if (!strict && stackElemName.equals("table") && |
|
!elemName.equals("tr") && !elemName.equals("td") && |
|
!elemName.equals("th") && !elemName.equals("caption")) { |
|
Element e = dtd.getElement("tr"); |
|
TagElement t = makeTag(e, true); |
|
legalTagContext(t); |
|
startTag(t); |
|
error("start.missing", elem.getName()); |
|
return legalElementContext(elem); |
|
} |
|
|
|
// They try to find a legal context by checking if the current |
|
// tag is valid in an enclosing context. If so |
|
// close out the tags by outputing end tags and then |
|
// insert the current tag. If the tags that are |
|
// being closed out do not have an optional end tag |
|
// specification in the DTD then an html error is |
|
// reported. |
|
|
|
if (!insertTag && stack.terminate() && (!strict || stack.elem.omitEnd())) { |
|
for (TagStack s = stack.next ; s != null ; s = s.next) { |
|
if (s.advance(elem)) { |
|
while (stack != s) { |
|
endTag(true); |
|
} |
|
return true; |
|
} |
|
if (!s.terminate() || (strict && !s.elem.omitEnd())) { |
|
break; |
|
} |
|
} |
|
} |
|
|
|
// Check if we know what tag is expected next. |
|
// If so insert the tag. Report an error if the |
|
// tag does not have its start tag spec in the DTD as optional. |
|
|
|
Element next = stack.first(); |
|
if (next != null && (!strict || next.omitStart()) && |
|
!(next==dtd.head && elem==dtd.pcdata) ) { |
|
|
|
TagElement t = makeTag(next, true); |
|
legalTagContext(t); |
|
startTag(t); |
|
if (!next.omitStart()) { |
|
error("start.missing", elem.getName()); |
|
} |
|
return legalElementContext(elem); |
|
} |
|
|
|
|
|
// Traverse the list of expected elements and determine if adding |
|
// any of these elements would make for a legal context. |
|
// |
|
|
|
if (!strict) { |
|
ContentModel content = stack.contentModel(); |
|
Vector<Element> elemVec = new Vector<Element>(); |
|
if (content != null) { |
|
content.getElements(elemVec); |
|
for (Element e : elemVec) { |
|
// Ensure that this element has not been included as |
|
// part of the exclusions in the DTD. |
|
|
|
if (stack.excluded(e.getIndex())) { |
|
continue; |
|
} |
|
|
|
boolean reqAtts = false; |
|
|
|
for (AttributeList a = e.getAttributes(); a != null ; a = a.next) { |
|
if (a.modifier == REQUIRED) { |
|
reqAtts = true; |
|
break; |
|
} |
|
} |
|
// Ensure that no tag that has required attributes |
|
// gets inserted. |
|
|
|
if (reqAtts) { |
|
continue; |
|
} |
|
|
|
ContentModel m = e.getContent(); |
|
if (m != null && m.first(elem)) { |
|
|
|
TagElement t = makeTag(e, true); |
|
legalTagContext(t); |
|
startTag(t); |
|
error("start.missing", e.getName()); |
|
return legalElementContext(elem); |
|
} |
|
} |
|
} |
|
} |
|
|
|
// Check if the stack can be terminated. If so add the appropriate |
|
// end tag. Report an error if the tag being ended does not have its |
|
// end tag spec in the DTD as optional. |
|
|
|
if (stack.terminate() && (stack.elem != dtd.body) && (!strict || stack.elem.omitEnd())) { |
|
|
|
if (!stack.elem.omitEnd()) { |
|
error("end.missing", elem.getName()); |
|
} |
|
|
|
endTag(true); |
|
return legalElementContext(elem); |
|
} |
|
|
|
|
|
return false; |
|
} |
|
|
|
|
|
|
|
*/ |
|
void legalTagContext(TagElement tag) throws ChangedCharSetException { |
|
if (legalElementContext(tag.getElement())) { |
|
markFirstTime(tag.getElement()); |
|
return; |
|
} |
|
|
|
|
|
if (tag.breaksFlow() && (stack != null) && !stack.tag.breaksFlow()) { |
|
endTag(true); |
|
legalTagContext(tag); |
|
return; |
|
} |
|
|
|
|
|
for (TagStack s = stack ; s != null ; s = s.next) { |
|
if (s.tag.getElement() == dtd.head) { |
|
while (stack != s) { |
|
endTag(true); |
|
} |
|
endTag(true); |
|
legalTagContext(tag); |
|
return; |
|
} |
|
} |
|
|
|
|
|
error("tag.unexpected", tag.getElement().getName()); |
|
} |
|
|
|
|
|
|
|
|
|
*/ |
|
void errorContext() throws ChangedCharSetException { |
|
for (; (stack != null) && (stack.tag.getElement() != dtd.body) ; stack = stack.next) { |
|
handleEndTag(stack.tag); |
|
} |
|
if (stack == null) { |
|
legalElementContext(dtd.body); |
|
startTag(makeTag(dtd.body, true)); |
|
} |
|
} |
|
|
|
|
|
|
|
*/ |
|
void addString(int c) { |
|
if (strpos == str.length) { |
|
char newstr[] = new char[str.length + 128]; |
|
System.arraycopy(str, 0, newstr, 0, str.length); |
|
str = newstr; |
|
} |
|
str[strpos++] = (char)c; |
|
} |
|
|
|
|
|
|
|
*/ |
|
String getString(int pos) { |
|
char newStr[] = new char[strpos - pos]; |
|
System.arraycopy(str, pos, newStr, 0, strpos - pos); |
|
strpos = pos; |
|
return new String(newStr); |
|
} |
|
|
|
char[] getChars(int pos) { |
|
char newStr[] = new char[strpos - pos]; |
|
System.arraycopy(str, pos, newStr, 0, strpos - pos); |
|
strpos = pos; |
|
return newStr; |
|
} |
|
|
|
char[] getChars(int pos, int endPos) { |
|
char newStr[] = new char[endPos - pos]; |
|
System.arraycopy(str, pos, newStr, 0, endPos - pos); |
|
// REMIND: it's not clear whether this version should set strpos or not |
|
|
|
return newStr; |
|
} |
|
|
|
void resetStrBuffer() { |
|
strpos = 0; |
|
} |
|
|
|
int strIndexOf(char target) { |
|
for (int i = 0; i < strpos; i++) { |
|
if (str[i] == target) { |
|
return i; |
|
} |
|
} |
|
|
|
return -1; |
|
} |
|
|
|
|
|
|
|
|
|
*/ |
|
void skipSpace() throws IOException { |
|
while (true) { |
|
switch (ch) { |
|
case '\n': |
|
ln++; |
|
ch = readCh(); |
|
lfCount++; |
|
break; |
|
|
|
case '\r': |
|
ln++; |
|
if ((ch = readCh()) == '\n') { |
|
ch = readCh(); |
|
crlfCount++; |
|
} |
|
else { |
|
crCount++; |
|
} |
|
break; |
|
case ' ': |
|
case '\t': |
|
ch = readCh(); |
|
break; |
|
|
|
default: |
|
return; |
|
} |
|
} |
|
} |
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
boolean parseIdentifier(boolean lower) throws IOException { |
|
switch (ch) { |
|
case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': |
|
case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': |
|
case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': |
|
case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': |
|
case 'Y': case 'Z': |
|
if (lower) { |
|
ch = 'a' + (ch - 'A'); |
|
} |
|
|
|
case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': |
|
case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': |
|
case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': |
|
case 's': case 't': case 'u': case 'v': case 'w': case 'x': |
|
case 'y': case 'z': |
|
break; |
|
|
|
default: |
|
return false; |
|
} |
|
|
|
while (true) { |
|
addString(ch); |
|
|
|
switch (ch = readCh()) { |
|
case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': |
|
case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': |
|
case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': |
|
case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': |
|
case 'Y': case 'Z': |
|
if (lower) { |
|
ch = 'a' + (ch - 'A'); |
|
} |
|
|
|
case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': |
|
case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': |
|
case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': |
|
case 's': case 't': case 'u': case 'v': case 'w': case 'x': |
|
case 'y': case 'z': |
|
|
|
case '0': case '1': case '2': case '3': case '4': |
|
case '5': case '6': case '7': case '8': case '9': |
|
|
|
case '.': case '-': |
|
|
|
case '_': |
|
break; |
|
|
|
default: |
|
return true; |
|
} |
|
} |
|
} |
|
|
|
|
|
|
|
*/ |
|
/**
 * Parses an entity reference; the lookahead character is the '&' that
 * introduces it. Handles decimal ("&#nnn") and hexadecimal ("&#xhh")
 * character references as well as named entities looked up in the DTD.
 * One terminating ';' or line break is consumed if present.
 *
 * @return the replacement characters; for an unknown entity name the
 *         original reference text is returned so that it can be emitted
 *         literally
 * @throws IOException if reading the next character fails
 */
private char[] parseEntityReference() throws IOException {
    // Remember where the name starts so that the buffer can be rolled back.
    int pos = strpos;

    if ((ch = readCh()) == '#') {
        int n = 0;
        ch = readCh();
        if ((ch >= '0') && (ch <= '9') ||
                ch == 'x' || ch == 'X') {

            if ((ch >= '0') && (ch <= '9')) {
                // parse decimal reference
                // NOTE(review): n is a plain int, so very long digit runs overflow silently.
                while ((ch >= '0') && (ch <= '9')) {
                    n = (n * 10) + ch - '0';
                    ch = readCh();
                }
            } else {
                // parse hexadecimal reference; skip the 'x' / 'X' first
                ch = readCh();
                char lch = (char) Character.toLowerCase(ch);
                while ((lch >= '0') && (lch <= '9') ||
                        (lch >= 'a') && (lch <= 'f')) {
                    if (lch >= '0' && lch <= '9') {
                        n = (n * 16) + lch - '0';
                    } else {
                        n = (n * 16) + lch - 'a' + 10;
                    }
                    ch = readCh();
                    lch = (char) Character.toLowerCase(ch);
                }
            }
            // Swallow one terminator: a line break or ';'.
            switch (ch) {
                case '\n':
                    ln++;
                    ch = readCh();
                    lfCount++;
                    break;

                case '\r':
                    ln++;
                    if ((ch = readCh()) == '\n') {
                        ch = readCh();
                        crlfCount++;
                    }
                    else {
                        crCount++;
                    }
                    break;

                case ';':
                    ch = readCh();
                    break;
            }
            char data[] = mapNumericReference(n);
            return data;
        }
        // "&#" followed by something that is neither a digit nor 'x':
        // try to read it as a name that starts with '#'.
        addString('#');
        if (!parseIdentifier(false)) {
            error("ident.expected");
            strpos = pos;
            char data[] = {'&', '#'};
            return data;
        }
    } else if (!parseIdentifier(false)) {
        // A lone '&' that does not start a reference is passed through.
        char data[] = {'&'};
        return data;
    }

    boolean semicolon = false;

    // Swallow one terminator after the name, remembering whether it was ';'.
    switch (ch) {
      case '\n':
        ln++;
        ch = readCh();
        lfCount++;
        break;

      case '\r':
        ln++;
        if ((ch = readCh()) == '\n') {
            ch = readCh();
            crlfCount++;
        }
        else {
            crCount++;
        }
        break;

      case ';':
        semicolon = true;

        ch = readCh();
        break;
    }

    String nm = getString(pos);
    Entity ent = dtd.getEntity(nm);

    // entities are case sensitive - however if strict
    // is false then we will try to make a match by
    // converting the string to all lowercase.
    //
    if (!strict && (ent == null)) {
        ent = dtd.getEntity(nm.toLowerCase());
    }
    if ((ent == null) || !ent.isGeneral()) {

        if (nm.length() == 0) {
            error("invalid.entref", nm);
            return new char[0];
        }
        // No match: give back the reference exactly as it was written.
        String str = "&" + nm + (semicolon ? ";" : "");

        char b[] = new char[str.length()];
        str.getChars(0, b.length, b, 0);
        return b;
    }
    return ent.getData();
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
private char[] mapNumericReference(int c) { |
|
char[] data; |
|
if (c >= 0xffff) { |
|
try { |
|
data = Character.toChars(c); |
|
} catch (IllegalArgumentException e) { |
|
data = new char[0]; |
|
} |
|
} else { |
|
data = new char[1]; |
|
data[0] = (c < 130 || c > 159) ? (char) c : cp1252Map[c - 130]; |
|
} |
|
return data; |
|
} |
|
|
|
|
|
|
|
*/ |
|
/**
 * Reads the body of a comment into the scratch buffer and returns once
 * the terminator has been seen. End of input is delegated to
 * {@link #handleEOFInComment()}.
 *
 * @throws IOException if reading the next character fails
 */
void parseComment() throws IOException {

    while (true) {
        int c = ch;
        switch (c) {
          case '-':
              // Non-strict mode: a '-' that directly follows a buffered '-'
              // ends the comment when it is followed by ">" or "!>", i.e.
              // the comment runs until "-->" or "--!>". Anything else is
              // kept as comment text.
            if (!strict && (strpos != 0) && (str[strpos - 1] == '-')) {
                if ((ch = readCh()) == '>') {
                    return;
                }
                if (ch == '!') {
                    if ((ch = readCh()) == '>') {
                        return;
                    } else {
                        // Not a terminator: put back the characters already read.
                        addString('-');
                        addString('!');
                        continue;
                    }
                }
                break;
            }

            // First '-' of a possible "--" pair.
            if ((ch = readCh()) == '-') {
                ch = readCh();
                // In strict mode "--" alone ends the comment.
                if (strict || ch == '>') {
                    return;
                }
                if (ch == '!') {
                    if ((ch = readCh()) == '>') {
                        return;
                    } else {
                        // Not a terminator: put back the characters already read.
                        addString('-');
                        addString('!');
                        continue;
                    }
                }
                // Account for the second '-' that was read ahead; the
                // first one is added after the switch.
                addString('-');
            }
            break;

          case -1:
              handleEOFInComment();
              return;

          case '\n':
            ln++;
            ch = readCh();
            lfCount++;
            break;

          case '>':
            // A '>' without a preceding "--" is ordinary comment text.
            ch = readCh();
            break;

          case '\r':
            ln++;
            if ((ch = readCh()) == '\n') {
                ch = readCh();
                crlfCount++;
            }
            else {
                crCount++;
            }
            // "\r" and "\r\n" are both stored as a single '\n'.
            c = '\n';
            break;
          default:
            ch = readCh();
            break;
        }

        addString(c);
    }
}
|
|
|
|
|
|
|
*/ |
|
/**
 * Reads literal element content into the text buffer until the end tag of
 * the element on top of the tag stack is found, or until end of input.
 * Entity references are expanded and line breaks are stored as '\n'.
 *
 * @param replace not referenced by this method body
 * @throws IOException if reading the next character fails
 */
void parseLiteral(boolean replace) throws IOException {
    while (true) {
        int c = ch;
        switch (c) {
          case -1:
            error("eof.literal", stack.elem.getName());
            endTag(true);
            return;

          case '>':
            ch = readCh();
            // i: where "</name" would have to start in the text buffer.
            int i = textpos - (stack.elem.name.length() + 2), j = 0;

            // match end tag (case-insensitively against the element name)
            if ((i >= 0) && (text[i++] == '<') && (text[i] == '/')) {
                while ((++i < textpos) &&
                       (Character.toLowerCase(text[i]) == stack.elem.name.charAt(j++)));
                if (i == textpos) {
                    // Drop the "</name" that was buffered as text, plus
                    // one line break immediately before it.
                    textpos -= (stack.elem.name.length() + 2);
                    if ((textpos > 0) && (text[textpos-1] == '\n')) {
                        textpos--;
                    }
                    endTag(false);
                    return;
                }
            }
            break;

          case '&':
            char data[] = parseEntityReference();
            if (textpos + data.length > text.length) {
                char newtext[] = new char[Math.max(textpos + data.length + 128, text.length * 2)];
                System.arraycopy(text, 0, newtext, 0, text.length);
                text = newtext;
            }
            System.arraycopy(data, 0, text, textpos, data.length);
            textpos += data.length;
            // The replacement is already stored; skip the append below.
            continue;

          case '\n':
            ln++;
            ch = readCh();
            lfCount++;
            break;

          case '\r':
            ln++;
            if ((ch = readCh()) == '\n') {
                ch = readCh();
                crlfCount++;
            }
            else {
                crCount++;
            }
            // "\r" and "\r\n" are both stored as a single '\n'.
            c = '\n';
            break;
          default:
            ch = readCh();
            break;
        }

        // output character, growing the buffer if it is full
        if (textpos == text.length) {
            char newtext[] = new char[text.length + 128];
            System.arraycopy(text, 0, newtext, 0, text.length);
            text = newtext;
        }
        text[textpos++] = (char)c;
    }
}
|
|
|
|
|
|
|
*/ |
|
/**
 * Parses an attribute value, either delimited by single or double quotes
 * or unquoted. An unquoted value ends at whitespace, a line break,
 * '&lt;' or '&gt;'. The value is collected in the scratch buffer, which
 * is emptied on return.
 *
 * @param lower if {@code true}, upper-case ASCII letters are stored as lower case
 * @return the attribute value
 * @throws IOException if reading the next character fails
 */
String parseAttributeValue(boolean lower) throws IOException {
    // -1 means "no opening quote"; otherwise the quote character that closes the value.
    int delim = -1;

    // Check for a delimiter
    switch(ch) {
      case '\'':
      case '"':
        delim = ch;
        ch = readCh();
        break;
    }

    // Parse the rest of the value
    while (true) {
        int c = ch;

        switch (c) {
          case '\n':
            ln++;
            ch = readCh();
            lfCount++;
            // A line break ends an unquoted value.
            if (delim < 0) {
                return getString(0);
            }
            break;

          case '\r':
            ln++;

            if ((ch = readCh()) == '\n') {
                ch = readCh();
                crlfCount++;
            }
            else {
                crCount++;
            }
            if (delim < 0) {
                return getString(0);
            }
            break;

          case '\t':
            if (delim < 0)
                c = ' ';
            // falls through to the blank handling
          case ' ':
            ch = readCh();
            // Whitespace ends an unquoted value; inside quotes it is kept.
            if (delim < 0) {
                return getString(0);
            }
            break;

          case '>':
          case '<':
            // Tag delimiters end an unquoted value and are left unconsumed.
            if (delim < 0) {
                return getString(0);
            }
            ch = readCh();
            break;

          case '\'':
          case '"':
            ch = readCh();
            if (c == delim) {
                // Matching closing quote.
                return getString(0);
            } else if (delim == -1) {
                // Quote in the middle of an unquoted value.
                error("attvalerr");
                if (strict || ch == ' ') {
                    return getString(0);
                } else {
                    // Non-strict: drop the stray quote and keep reading.
                    continue;
                }
            }
            break;

          case '=':
            if (delim < 0) {
                // An '=' inside an unquoted value is reported as an error.
                error("attvalerr");
                // In strict mode return what has been read so far;
                // otherwise accept the '=' as part of the value and carry on.
                if (strict) {
                    return getString(0);
                }
            }
            ch = readCh();
            break;

          case '&':
            // Strict mode keeps '&' literally in unquoted values.
            if (strict && delim < 0) {
                ch = readCh();
                break;
            }

            char data[] = parseEntityReference();
            for (int i = 0 ; i < data.length ; i++) {
                c = data[i];
                addString((lower && (c >= 'A') && (c <= 'Z')) ? 'a' + c - 'A' : c);
            }
            // The replacement is already stored; skip the append below.
            continue;

          case -1:
            return getString(0);

          default:
            if (lower && (c >= 'A') && (c <= 'Z')) {
                c = 'a' + c - 'A';
            }
            ch = readCh();
            break;
        }
        addString(c);
    }
}
|
|
|
|
|
|
|
|
|
*/ |
|
/**
 * Parses the attribute specifications of a tag and stores them in the
 * parser's attribute set. Returns when '/', '&gt;', '&lt;' or end of
 * input is reached. Non-strict mode additionally tolerates several
 * malformed constructs (see the branch comments).
 *
 * @param elem the element whose attributes are being parsed
 * @throws IOException if reading the next character fails
 */
void parseAttributeSpecificationList(Element elem) throws IOException {

    while (true) {
        skipSpace();

        switch (ch) {
          case '/':
          case '>':
          case '<':
          case -1:
            return;

          case '-':
            // "--" starts a comment inside the tag; its text is discarded.
            if ((ch = readCh()) == '-') {
                ch = readCh();
                parseComment();
                strpos = 0;
            } else {
                error("invalid.tagchar", "-", elem.getName());
                ch = readCh();
            }
            continue;
        }

        AttributeList att;
        String attname;
        String attvalue;

        if (parseIdentifier(true)) {
            attname = getString(0);
            skipSpace();
            if (ch == '=') {
                ch = readCh();
                skipSpace();
                att = elem.getAttribute(attname);
                // Bug ID 4102750
                // Load the NAME of an Attribute Case Sensitive
                // The case of the NAME must be intact
                // Values are lower-cased only for declared attributes whose
                // type is not CDATA, NOTATION or NAME.
                attvalue = parseAttributeValue((att != null) && (att.type != CDATA) && (att.type != NOTATION) && (att.type != NAME));
            } else {
                // No '=': the identifier is either a value that identifies
                // its attribute, or an attribute name without a value.
                attvalue = attname;
                att = elem.getAttributeByValue(attvalue);
                if (att == null) {
                    att = elem.getAttribute(attname);
                    if (att != null) {
                        attvalue = att.getValue();
                    }
                    else {
                        // Make it null so that NULL_ATTRIBUTE_VALUE is
                        // used
                        attvalue = null;
                    }
                }
            }
        } else if (!strict && ch == ',') {
            // Non-strict: a comma between attributes is skipped.
            ch = readCh();
            continue;
        } else if (!strict && ch == '"') {
            // Non-strict: tolerate a quoted attribute name.
            ch = readCh();
            skipSpace();
            if (parseIdentifier(true)) {
                attname = getString(0);
                if (ch == '"') {
                    ch = readCh();
                }
                skipSpace();
                if (ch == '=') {
                    ch = readCh();
                    skipSpace();
                    att = elem.getAttribute(attname);
                    attvalue = parseAttributeValue((att != null) &&
                                            (att.type != CDATA) &&
                                            (att.type != NOTATION));
                } else {
                    attvalue = attname;
                    att = elem.getAttributeByValue(attvalue);
                    if (att == null) {
                        att = elem.getAttribute(attname);
                        if (att != null) {
                            attvalue = att.getValue();
                        }
                    }
                }
            } else {
                char str[] = {(char)ch};
                error("invalid.tagchar", new String(str), elem.getName());
                ch = readCh();
                continue;
            }
        } else if (!strict && (attributes.isEmpty()) && (ch == '=')) {
            // Non-strict: '=' before any attribute was read; the element
            // name itself is used as the attribute name.
            ch = readCh();
            skipSpace();
            attname = elem.getName();
            att = elem.getAttribute(attname);
            attvalue = parseAttributeValue((att != null) &&
                                           (att.type != CDATA) &&
                                           (att.type != NOTATION));
        } else if (!strict && (ch == '=')) {
            // Non-strict: stray '=' after other attributes; its value is
            // consumed and dropped, and parsing of the list stops.
            ch = readCh();
            skipSpace();
            attvalue = parseAttributeValue(true);
            error("attvalerr");
            return;
        } else {
            char str[] = {(char)ch};
            error("invalid.tagchar", new String(str), elem.getName());
            if (!strict) {
                ch = readCh();
                continue;
            } else {
                return;
            }
        }

        if (att != null) {
            // Use the attribute's declared name.
            attname = att.getName();
        } else {
            error("invalid.tagatt", attname, elem.getName());
        }

        // Check out the value
        if (attributes.isDefined(attname)) {
            error("multi.tagatt", attname, elem.getName());
        }
        if (attvalue == null) {
            attvalue = ((att != null) && (att.value != null)) ? att.value :
                HTML.NULL_ATTRIBUTE_VALUE;
        } else if ((att != null) && (att.values != null) && !att.values.contains(attvalue)) {
            error("invalid.tagattval", attname, elem.getName());
        }
        // Store under the HTML.Attribute key when one exists, else under the plain name.
        HTML.Attribute attkey = HTML.getAttributeKey(attname);
        if (attkey == null) {
            attributes.addAttribute(attname, attvalue);
        } else {
            attributes.addAttribute(attkey, attvalue);
        }
    }
}
|
|
|
|
|
|
|
|
|
*/ |
|
public String parseDTDMarkup() throws IOException { |
|
|
|
StringBuilder strBuff = new StringBuilder(); |
|
ch = readCh(); |
|
while(true) { |
|
switch (ch) { |
|
case '>': |
|
ch = readCh(); |
|
return strBuff.toString(); |
|
case -1: |
|
error("invalid.markup"); |
|
return strBuff.toString(); |
|
case '\n': |
|
ln++; |
|
ch = readCh(); |
|
lfCount++; |
|
break; |
|
case '"': |
|
ch = readCh(); |
|
break; |
|
case '\r': |
|
ln++; |
|
if ((ch = readCh()) == '\n') { |
|
ch = readCh(); |
|
crlfCount++; |
|
} |
|
else { |
|
crCount++; |
|
} |
|
break; |
|
default: |
|
strBuff.append((char)(ch & 0xFF)); |
|
ch = readCh(); |
|
break; |
|
} |
|
} |
|
} |
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
protected boolean parseMarkupDeclarations(StringBuffer strBuff) throws IOException { |
|
|
|
|
|
if ((strBuff.length() == "DOCTYPE".length()) && |
|
(strBuff.toString().toUpperCase().equals("DOCTYPE"))) { |
|
parseDTDMarkup(); |
|
return true; |
|
} |
|
return false; |
|
} |
|
|
|
|
|
|
|
*/ |
|
void parseInvalidTag() throws IOException { |
|
|
|
while (true) { |
|
skipSpace(); |
|
switch (ch) { |
|
case '>': |
|
case -1: |
|
ch = readCh(); |
|
return; |
|
case '<': |
|
return; |
|
default: |
|
ch = readCh(); |
|
|
|
} |
|
} |
|
} |
|
|
|
|
|
|
|
*/ |
|
void parseTag() throws IOException { |
|
Element elem; |
|
boolean net = false; |
|
boolean warned = false; |
|
boolean unknown = false; |
|
|
|
switch (ch = readCh()) { |
|
case '!': |
|
switch (ch = readCh()) { |
|
case '-': |
|
|
|
while (true) { |
|
if (ch == '-') { |
|
if (!strict || ((ch = readCh()) == '-')) { |
|
ch = readCh(); |
|
if (!strict && ch == '-') { |
|
ch = readCh(); |
|
} |
|
// send over any text you might see |
|
// before parsing and sending the |
|
|
|
if (textpos != 0) { |
|
char newtext[] = new char[textpos]; |
|
System.arraycopy(text, 0, newtext, 0, textpos); |
|
handleText(newtext); |
|
lastBlockStartPos = currentBlockStartPos; |
|
textpos = 0; |
|
} |
|
parseComment(); |
|
last = makeTag(dtd.getElement("comment"), true); |
|
handleComment(getChars(0)); |
|
continue; |
|
} else if (!warned) { |
|
warned = true; |
|
error("invalid.commentchar", "-"); |
|
} |
|
} |
|
skipSpace(); |
|
switch (ch) { |
|
case '-': |
|
continue; |
|
case '>': |
|
ch = readCh(); |
|
case -1: |
|
return; |
|
default: |
|
ch = readCh(); |
|
if (!warned) { |
|
warned = true; |
|
error("invalid.commentchar", |
|
String.valueOf((char)ch)); |
|
} |
|
break; |
|
} |
|
} |
|
|
|
default: |
|
|
|
StringBuffer strBuff = new StringBuffer(); |
|
while (true) { |
|
strBuff.append((char)ch); |
|
if (parseMarkupDeclarations(strBuff)) { |
|
return; |
|
} |
|
switch(ch) { |
|
case '>': |
|
ch = readCh(); |
|
case -1: |
|
error("invalid.markup"); |
|
return; |
|
case '\n': |
|
ln++; |
|
ch = readCh(); |
|
lfCount++; |
|
break; |
|
case '\r': |
|
ln++; |
|
if ((ch = readCh()) == '\n') { |
|
ch = readCh(); |
|
crlfCount++; |
|
} |
|
else { |
|
crCount++; |
|
} |
|
break; |
|
|
|
default: |
|
ch = readCh(); |
|
break; |
|
} |
|
} |
|
} |
|
|
|
case '/': |
|
|
|
switch (ch = readCh()) { |
|
case '>': |
|
ch = readCh(); |
|
case '<': |
|
|
|
if (recent == null) { |
|
error("invalid.shortend"); |
|
return; |
|
} |
|
elem = recent; |
|
break; |
|
|
|
default: |
|
if (!parseIdentifier(true)) { |
|
error("expected.endtagname"); |
|
return; |
|
} |
|
skipSpace(); |
|
switch (ch) { |
|
case '>': |
|
ch = readCh(); |
|
case '<': |
|
break; |
|
|
|
default: |
|
error("expected", "'>'"); |
|
while ((ch != -1) && (ch != '\n') && (ch != '>')) { |
|
ch = readCh(); |
|
} |
|
if (ch == '>') { |
|
ch = readCh(); |
|
} |
|
break; |
|
} |
|
String elemStr = getString(0); |
|
if (!dtd.elementExists(elemStr)) { |
|
error("end.unrecognized", elemStr); |
|
|
|
if ((textpos > 0) && (text[textpos-1] == '\n')) { |
|
textpos--; |
|
} |
|
elem = dtd.getElement("unknown"); |
|
elem.name = elemStr; |
|
unknown = true; |
|
} else { |
|
elem = dtd.getElement(elemStr); |
|
} |
|
break; |
|
} |
|
|
|
|
|
// If the stack is null, we're seeing end tags without any begin |
|
// tags. Ignore them. |
|
|
|
if (stack == null) { |
|
error("end.extra.tag", elem.getName()); |
|
return; |
|
} |
|
|
|
|
|
if ((textpos > 0) && (text[textpos-1] == '\n')) { |
|
// In a pre tag, if there are blank lines |
|
// we do not want to remove the newline |
|
// before the end tag. Hence this code. |
|
|
|
if (stack.pre) { |
|
if ((textpos > 1) && (text[textpos-2] != '\n')) { |
|
textpos--; |
|
} |
|
} else { |
|
textpos--; |
|
} |
|
} |
|
|
|
// If the end tag is a form, since we did not put it |
|
// on the tag stack, there is no corresponding start |
|
// start tag to find. Hence do not touch the tag stack. |
|
// |
|
|
|
/* |
|
if (!strict && elem.getName().equals("form")) { |
|
if (lastFormSent != null) { |
|
handleEndTag(lastFormSent); |
|
return; |
|
} else { |
|
// do nothing. |
|
return; |
|
} |
|
} |
|
*/ |
|
|
|
if (unknown) { |
|
// we will not see a corresponding start tag |
|
// on the the stack. If we are seeing an |
|
// end tag, lets send this on as an empty |
|
// tag with the end tag attribute set to |
|
|
|
TagElement t = makeTag(elem); |
|
handleText(t); |
|
attributes.addAttribute(HTML.Attribute.ENDTAG, "true"); |
|
handleEmptyTag(makeTag(elem)); |
|
unknown = false; |
|
return; |
|
} |
|
|
|
// find the corresponding start tag |
|
|
|
// A commonly occurring error appears to be the insertion |
|
// of extra end tags in a table. The intent here is ignore |
|
// such extra end tags. |
|
|
|
if (!strict) { |
|
String stackElem = stack.elem.getName(); |
|
|
|
if (stackElem.equals("table")) { |
|
// If it is not a valid end tag ignore it and return |
|
|
|
if (!elem.getName().equals(stackElem)) { |
|
error("tag.ignore", elem.getName()); |
|
return; |
|
} |
|
} |
|
|
|
|
|
|
|
if (stackElem.equals("tr") || |
|
stackElem.equals("td")) { |
|
if ((!elem.getName().equals("table")) && |
|
(!elem.getName().equals(stackElem))) { |
|
error("tag.ignore", elem.getName()); |
|
return; |
|
} |
|
} |
|
} |
|
TagStack sp = stack; |
|
|
|
while ((sp != null) && (elem != sp.elem)) { |
|
sp = sp.next; |
|
} |
|
if (sp == null) { |
|
error("unmatched.endtag", elem.getName()); |
|
return; |
|
} |
|
|
|
// People put font ending tags in the darndest places. |
|
// Don't close other contexts based on them being between |
|
// a font tag and the corresponding end tag. Instead, |
|
// ignore the end tag like it doesn't exist and allow the end |
|
|
|
String elemName = elem.getName(); |
|
if (stack != sp && |
|
(elemName.equals("font") || |
|
elemName.equals("center"))) { |
|
|
|
// Since closing out a center tag can have real wierd |
|
// effects on the formatting, make sure that tags |
|
// for which omitting an end tag is legimitate |
|
// get closed out. |
|
|
|
if (elemName.equals("center")) { |
|
while(stack.elem.omitEnd() && stack != sp) { |
|
endTag(true); |
|
} |
|
if (stack.elem == elem) { |
|
endTag(false); |
|
} |
|
} |
|
return; |
|
} |
|
// People do the same thing with center tags. In this |
|
// case we would like to close off the center tag but |
|
// not necessarily all enclosing tags. |
|
|
|
|
|
|
|
|
|
while (stack != sp) { |
|
endTag(true); |
|
} |
|
|
|
endTag(false); |
|
return; |
|
|
|
case -1: |
|
error("eof"); |
|
return; |
|
} |
|
|
|
|
|
if (!parseIdentifier(true)) { |
|
elem = recent; |
|
if ((ch != '>') || (elem == null)) { |
|
error("expected.tagname"); |
|
return; |
|
} |
|
} else { |
|
String elemStr = getString(0); |
|
|
|
if (elemStr.equals("image")) { |
|
elemStr = "img"; |
|
} |
|
|
|
/* determine if this element is part of the dtd. */ |
|
|
|
if (!dtd.elementExists(elemStr)) { |
|
|
|
error("tag.unrecognized ", elemStr); |
|
elem = dtd.getElement("unknown"); |
|
elem.name = elemStr; |
|
unknown = true; |
|
} else { |
|
elem = dtd.getElement(elemStr); |
|
} |
|
} |
|
|
|
|
|
parseAttributeSpecificationList(elem); |
|
|
|
switch (ch) { |
|
case '/': |
|
net = true; |
|
case '>': |
|
ch = readCh(); |
|
if (ch == '>' && net) { |
|
ch = readCh(); |
|
} |
|
case '<': |
|
break; |
|
|
|
default: |
|
error("expected", "'>'"); |
|
break; |
|
} |
|
|
|
if (!strict) { |
|
if (elem.getName().equals("script")) { |
|
error("javascript.unsupported"); |
|
} |
|
} |
|
|
|
// ignore RE after start tag |
|
|
|
if (!elem.isEmpty()) { |
|
if (ch == '\n') { |
|
ln++; |
|
lfCount++; |
|
ch = readCh(); |
|
} else if (ch == '\r') { |
|
ln++; |
|
if ((ch = readCh()) == '\n') { |
|
ch = readCh(); |
|
crlfCount++; |
|
} |
|
else { |
|
crCount++; |
|
} |
|
} |
|
} |
|
|
|
|
|
TagElement tag = makeTag(elem, false); |
|
|
|
|
|
/** In dealing with forms, we have decided to treat |
|
them as legal in any context. Also, even though |
|
they do have a start and an end tag, we will |
|
not put this tag on the stack. This is to deal |
|
several pages in the web oasis that choose to |
|
start and end forms in any possible location. **/ |
|
|
|
/* |
|
if (!strict && elem.getName().equals("form")) { |
|
if (lastFormSent == null) { |
|
lastFormSent = tag; |
|
} else { |
|
handleEndTag(lastFormSent); |
|
lastFormSent = tag; |
|
} |
|
} else { |
|
*/ |
|
// Smlly, if a tag is unknown, we will apply |
|
// no legalTagContext logic to it. |
|
|
|
if (!unknown) { |
|
legalTagContext(tag); |
|
|
|
// If skip tag is true, this implies that |
|
// the tag was illegal and that the error |
|
// recovery strategy adopted is to ignore |
|
|
|
if (!strict && skipTag) { |
|
skipTag = false; |
|
return; |
|
} |
|
} |
|
/* |
|
} |
|
*/ |
|
|
|
startTag(tag); |
|
|
|
if (!elem.isEmpty()) { |
|
switch (elem.getType()) { |
|
case CDATA: |
|
parseLiteral(false); |
|
break; |
|
case RCDATA: |
|
parseLiteral(true); |
|
break; |
|
default: |
|
if (stack != null) { |
|
stack.net = net; |
|
} |
|
break; |
|
} |
|
} |
|
} |
|
|
|
private static final String START_COMMENT = "<!--"; |
|
private static final String END_COMMENT = "-->"; |
|
private static final char[] SCRIPT_END_TAG = "</script>".toCharArray(); |
|
private static final char[] SCRIPT_END_TAG_UPPER_CASE = |
|
"</SCRIPT>".toCharArray(); |
|
|
|
void parseScript() throws IOException { |
|
char[] charsToAdd = new char[SCRIPT_END_TAG.length]; |
|
boolean insideComment = false; |
|
|
|
|
|
while (true) { |
|
int i = 0; |
|
while (!insideComment && i < SCRIPT_END_TAG.length |
|
&& (SCRIPT_END_TAG[i] == ch |
|
|| SCRIPT_END_TAG_UPPER_CASE[i] == ch)) { |
|
charsToAdd[i] = (char) ch; |
|
ch = readCh(); |
|
i++; |
|
} |
|
if (i == SCRIPT_END_TAG.length) { |
|
return; |
|
} |
|
|
|
if (!insideComment && i == 1 && charsToAdd[0] == START_COMMENT.charAt(0)) { |
|
|
|
while (i < START_COMMENT.length() |
|
&& START_COMMENT.charAt(i) == ch) { |
|
charsToAdd[i] = (char) ch; |
|
ch = readCh(); |
|
i++; |
|
} |
|
if (i == START_COMMENT.length()) { |
|
insideComment = true; |
|
} |
|
} |
|
if (insideComment) { |
|
while (i < END_COMMENT.length() |
|
&& END_COMMENT.charAt(i) == ch) { |
|
charsToAdd[i] = (char) ch; |
|
ch = readCh(); |
|
i++; |
|
} |
|
if (i == END_COMMENT.length()) { |
|
insideComment = false; |
|
} |
|
} |
|
|
|
|
|
if (i > 0) { |
|
for (int j = 0; j < i; j++) { |
|
addString(charsToAdd[j]); |
|
} |
|
continue; |
|
} |
|
switch (ch) { |
|
case -1: |
|
error("eof.script"); |
|
return; |
|
case '\n': |
|
ln++; |
|
ch = readCh(); |
|
lfCount++; |
|
addString('\n'); |
|
break; |
|
case '\r': |
|
ln++; |
|
if ((ch = readCh()) == '\n') { |
|
ch = readCh(); |
|
crlfCount++; |
|
} else { |
|
crCount++; |
|
} |
|
addString('\n'); |
|
break; |
|
default: |
|
addString(ch); |
|
ch = readCh(); |
|
break; |
|
} // switch |
|
} // while |
|
} |
|
|
|
|
|
|
|
*/ |
|
void parseContent() throws IOException { |
|
Thread curThread = Thread.currentThread(); |
|
|
|
for (;;) { |
|
if (curThread.isInterrupted()) { |
|
curThread.interrupt(); |
|
break; |
|
} |
|
|
|
int c = ch; |
|
currentBlockStartPos = currentPosition; |
|
|
|
if (recent == dtd.script) { // means: if after starting <script> tag |
|
|
|
|
|
parseScript(); |
|
last = makeTag(dtd.getElement("comment"), true); |
|
|
|
|
|
String str = new String(getChars(0)).trim(); |
|
int minLength = START_COMMENT.length() + END_COMMENT.length(); |
|
if (str.startsWith(START_COMMENT) && str.endsWith(END_COMMENT) |
|
&& str.length() >= (minLength)) { |
|
str = str.substring(START_COMMENT.length(), |
|
str.length() - END_COMMENT.length()); |
|
} |
|
|
|
|
|
handleComment(str.toCharArray()); |
|
endTag(false); |
|
lastBlockStartPos = currentPosition; |
|
|
|
continue; |
|
} else { |
|
switch (c) { |
|
case '<': |
|
parseTag(); |
|
lastBlockStartPos = currentPosition; |
|
continue; |
|
|
|
case '/': |
|
ch = readCh(); |
|
if ((stack != null) && stack.net) { |
|
|
|
endTag(false); |
|
continue; |
|
} else if (textpos == 0) { |
|
if (!legalElementContext(dtd.pcdata)) { |
|
error("unexpected.pcdata"); |
|
} |
|
if (last.breaksFlow()) { |
|
space = false; |
|
} |
|
} |
|
break; |
|
|
|
case -1: |
|
return; |
|
|
|
case '&': |
|
if (textpos == 0) { |
|
if (!legalElementContext(dtd.pcdata)) { |
|
error("unexpected.pcdata"); |
|
} |
|
if (last.breaksFlow()) { |
|
space = false; |
|
} |
|
} |
|
char data[] = parseEntityReference(); |
|
if (textpos + data.length + 1 > text.length) { |
|
char newtext[] = new char[Math.max(textpos + data.length + 128, text.length * 2)]; |
|
System.arraycopy(text, 0, newtext, 0, text.length); |
|
text = newtext; |
|
} |
|
if (space) { |
|
space = false; |
|
text[textpos++] = ' '; |
|
} |
|
System.arraycopy(data, 0, text, textpos, data.length); |
|
textpos += data.length; |
|
ignoreSpace = false; |
|
continue; |
|
|
|
case '\n': |
|
ln++; |
|
lfCount++; |
|
ch = readCh(); |
|
if ((stack != null) && stack.pre) { |
|
break; |
|
} |
|
if (textpos == 0) { |
|
lastBlockStartPos = currentPosition; |
|
} |
|
if (!ignoreSpace) { |
|
space = true; |
|
} |
|
continue; |
|
|
|
case '\r': |
|
ln++; |
|
c = '\n'; |
|
if ((ch = readCh()) == '\n') { |
|
ch = readCh(); |
|
crlfCount++; |
|
} |
|
else { |
|
crCount++; |
|
} |
|
if ((stack != null) && stack.pre) { |
|
break; |
|
} |
|
if (textpos == 0) { |
|
lastBlockStartPos = currentPosition; |
|
} |
|
if (!ignoreSpace) { |
|
space = true; |
|
} |
|
continue; |
|
|
|
|
|
case '\t': |
|
case ' ': |
|
ch = readCh(); |
|
if ((stack != null) && stack.pre) { |
|
break; |
|
} |
|
if (textpos == 0) { |
|
lastBlockStartPos = currentPosition; |
|
} |
|
if (!ignoreSpace) { |
|
space = true; |
|
} |
|
continue; |
|
|
|
default: |
|
if (textpos == 0) { |
|
if (!legalElementContext(dtd.pcdata)) { |
|
error("unexpected.pcdata"); |
|
} |
|
if (last.breaksFlow()) { |
|
space = false; |
|
} |
|
} |
|
ch = readCh(); |
|
break; |
|
} |
|
} |
|
|
|
|
|
if (textpos + 2 > text.length) { |
|
char newtext[] = new char[text.length + 128]; |
|
System.arraycopy(text, 0, newtext, 0, text.length); |
|
text = newtext; |
|
} |
|
|
|
|
|
if (space) { |
|
if (textpos == 0) { |
|
lastBlockStartPos--; |
|
} |
|
text[textpos++] = ' '; |
|
space = false; |
|
} |
|
text[textpos++] = (char)c; |
|
ignoreSpace = false; |
|
} |
|
} |
|
|
|
|
|
|
|
|
|
*/ |
|
String getEndOfLineString() { |
|
if (crlfCount >= crCount) { |
|
if (lfCount >= crlfCount) { |
|
return "\n"; |
|
} |
|
else { |
|
return "\r\n"; |
|
} |
|
} |
|
else { |
|
if (crCount > lfCount) { |
|
return "\r"; |
|
} |
|
else { |
|
return "\n"; |
|
} |
|
} |
|
} |
|
|
|
|
|
|
|
*/ |
|
public synchronized void parse(Reader in) throws IOException { |
|
this.in = in; |
|
|
|
this.ln = 1; |
|
|
|
seenHtml = false; |
|
seenHead = false; |
|
seenBody = false; |
|
|
|
crCount = lfCount = crlfCount = 0; |
|
|
|
try { |
|
ch = readCh(); |
|
text = new char[1024]; |
|
str = new char[128]; |
|
|
|
parseContent(); |
|
// NOTE: interruption may have occurred. Control flows out |
|
|
|
while (stack != null) { |
|
endTag(true); |
|
} |
|
in.close(); |
|
} catch (IOException e) { |
|
errorContext(); |
|
error("ioexception"); |
|
throw e; |
|
} catch (Exception e) { |
|
errorContext(); |
|
error("exception", e.getClass().getName(), e.getMessage()); |
|
e.printStackTrace(); |
|
} catch (ThreadDeath e) { |
|
errorContext(); |
|
error("terminated"); |
|
e.printStackTrace(); |
|
throw e; |
|
} finally { |
|
for (; stack != null ; stack = stack.next) { |
|
handleEndTag(stack.tag); |
|
} |
|
|
|
text = null; |
|
str = null; |
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*/ |
|
private char buf[] = new char[1]; |
|
private int pos; |
|
private int len; |
|
|
|
|
|
|
|
*/ |
|
private int currentPosition; |
|
|
|
|
|
private final int readCh() throws IOException { |
|
|
|
if (pos >= len) { |
|
|
|
// This loop allows us to ignore interrupts if the flag |
|
|
|
for (;;) { |
|
try { |
|
len = in.read(buf); |
|
break; |
|
} catch (InterruptedIOException ex) { |
|
throw ex; |
|
} |
|
} |
|
|
|
if (len <= 0) { |
|
return -1; |
|
} |
|
pos = 0; |
|
} |
|
++currentPosition; |
|
|
|
return buf[pos++]; |
|
} |
|
|
|
|
|
protected int getCurrentPos() { |
|
return currentPosition; |
|
} |
|
} |