X-Git-Url: http://secure.phpeclipse.com diff --git a/net.sourceforge.phpeclipse/src/net/sourceforge/phpdt/tidy/ParserImpl.java b/net.sourceforge.phpeclipse/src/net/sourceforge/phpdt/tidy/ParserImpl.java new file mode 100644 index 0000000..8ac6ecb --- /dev/null +++ b/net.sourceforge.phpeclipse/src/net/sourceforge/phpdt/tidy/ParserImpl.java @@ -0,0 +1,3205 @@ +/* + * @(#)ParserImpl.java 1.11 2000/08/16 + * + */ + +package net.sourceforge.phpdt.tidy; + +/** + * + * HTML Parser implementation + * + * (c) 1998-2000 (W3C) MIT, INRIA, Keio University + * See Tidy.java for the copyright notice. + * Derived from + * HTML Tidy Release 4 Aug 2000 + * + * @author Dave Raggett + * @author Andy Quick (translation to Java) + * @version 1.0, 1999/05/22 + * @version 1.0.1, 1999/05/29 + * @version 1.1, 1999/06/18 Java Bean + * @version 1.2, 1999/07/10 Tidy Release 7 Jul 1999 + * @version 1.3, 1999/07/30 Tidy Release 26 Jul 1999 + * @version 1.4, 1999/09/04 DOM support + * @version 1.5, 1999/10/23 Tidy Release 27 Sep 1999 + * @version 1.6, 1999/11/01 Tidy Release 22 Oct 1999 + * @version 1.7, 1999/12/06 Tidy Release 30 Nov 1999 + * @version 1.8, 2000/01/22 Tidy Release 13 Jan 2000 + * @version 1.9, 2000/06/03 Tidy Release 30 Apr 2000 + * @version 1.10, 2000/07/22 Tidy Release 8 Jul 2000 + * @version 1.11, 2000/08/16 Tidy Release 4 Aug 2000 + */ + +public class ParserImpl { + + //private static int SeenBodyEndTag; /* AQ: moved into lexer structure */ + + private static void parseTag(Lexer lexer, Node node, short mode) + { + // Local fix by GLP 2000-12-21. Need to reset insertspace if this + // is both a non-inline and empty tag (base, link, meta, isindex, hr, area). + // Remove this code once the fix is made in Tidy. + +/****** (Original code follows) + if ((node.tag.model & Dict.CM_EMPTY) != 0) + { + lexer.waswhite = false; + return; + } + else if (!((node.tag.model & Dict.CM_INLINE) != 0)) + lexer.insertspace = false; +*******/ + + if (!((node.tag.model & Dict.CM_INLINE) != 0)) + lexer.insertspace = false; + + if ((node.tag.model & Dict.CM_EMPTY) != 0) + { + lexer.waswhite = false; + return; + } + + if (node.tag.parser == null || node.type == Node.StartEndTag) + return; + + node.tag.parser.parse(lexer, node, mode); + } + + private static void moveToHead(Lexer lexer, Node element, Node node) + { + Node head; + TagTable tt = lexer.configuration.tt; + + + if (node.type == Node.StartTag || node.type == Node.StartEndTag) + { + Report.warning(lexer, element, node, Report.TAG_NOT_ALLOWED_IN); + + while (element.tag != tt.tagHtml) + element = element.parent; + + for (head = element.content; head != null; head = head.next) + { + if (head.tag == tt.tagHead) + { + Node.insertNodeAtEnd(head, node); + break; + } + } + + if (node.tag.parser != null) + parseTag(lexer, node, Lexer.IgnoreWhitespace); + } + else + { + Report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED); + } + } + + public static class ParseHTML implements Parser { + + public void parse( Lexer lexer, Node html, short mode ) + { + Node node, head; + Node frameset = null; + Node noframes = null; + + lexer.configuration.XmlTags = false; + lexer.seenBodyEndTag = 0; + TagTable tt = lexer.configuration.tt; + + for (;;) + { + node = lexer.getToken(Lexer.IgnoreWhitespace); + + if (node == null) + { + node = lexer.inferredTag("head"); + break; + } + + if (node.tag == tt.tagHead) + break; + + if (node.tag == html.tag && node.type == Node.EndTag) + { + Report.warning(lexer, html, node, Report.DISCARDING_UNEXPECTED); + continue; + } + + /* deal with comments etc. */ + if (Node.insertMisc(html, node)) + continue; + + lexer.ungetToken(); + node = lexer.inferredTag("head"); + break; + } + + head = node; + Node.insertNodeAtEnd(html, head); + getParseHead().parse(lexer, head, mode); + + for (;;) + { + node = lexer.getToken(Lexer.IgnoreWhitespace); + + if (node == null) + { + if (frameset == null) /* create an empty body */ + node = lexer.inferredTag("body"); + + return; + } + + /* robustly handle html tags */ + if (node.tag == html.tag) + { + if (node.type != Node.StartTag && frameset == null) + Report.warning(lexer, html, node, Report.DISCARDING_UNEXPECTED); + + continue; + } + + /* deal with comments etc. */ + if (Node.insertMisc(html, node)) + continue; + + /* if frameset document coerce to */ + if (node.tag == tt.tagBody) + { + if (node.type != Node.StartTag) + { + Report.warning(lexer, html, node, Report.DISCARDING_UNEXPECTED); + continue; + } + + if (frameset != null) + { + lexer.ungetToken(); + + if (noframes == null) + { + noframes = lexer.inferredTag("noframes"); + Node.insertNodeAtEnd(frameset, noframes); + Report.warning(lexer, html, noframes, Report.INSERTING_TAG); + } + + parseTag(lexer, noframes, mode); + continue; + } + + break; /* to parse body */ + } + + /* flag an error if we see more than one frameset */ + if (node.tag == tt.tagFrameset) + { + if (node.type != Node.StartTag) + { + Report.warning(lexer, html, node, Report.DISCARDING_UNEXPECTED); + continue; + } + + if (frameset != null) + Report.error(lexer, html, node, Report.DUPLICATE_FRAMESET); + else + frameset = node; + + Node.insertNodeAtEnd(html, node); + parseTag(lexer, node, mode); + + /* + see if it includes a noframes element so + that we can merge subsequent noframes elements + */ + + for (node = frameset.content; node != null; node = node.next) + { + if (node.tag == tt.tagNoframes) + noframes = node; + } + continue; + } + + /* if not a frameset document coerce <noframes> to <body> */ + if (node.tag == tt.tagNoframes) + { + if (node.type != Node.StartTag) + { + Report.warning(lexer, html, node, Report.DISCARDING_UNEXPECTED); + continue; + } + + if (frameset == null) + { + Report.warning(lexer, html, node, Report.DISCARDING_UNEXPECTED); + node = lexer.inferredTag("body"); + break; + } + + if (noframes == null) + { + noframes = node; + Node.insertNodeAtEnd(frameset, noframes); + } + + parseTag(lexer, noframes, mode); + continue; + } + + if (node.type == Node.StartTag || node.type == Node.StartEndTag) + { + if (node.tag != null && (node.tag.model & Dict.CM_HEAD) != 0) + { + moveToHead(lexer, html, node); + continue; + } + } + + lexer.ungetToken(); + + /* insert other content into noframes element */ + + if (frameset != null) + { + if (noframes == null) + { + noframes = lexer.inferredTag("noframes"); + Node.insertNodeAtEnd(frameset, noframes); + } + else + Report.warning(lexer, html, node, Report.NOFRAMES_CONTENT); + + parseTag(lexer, noframes, mode); + continue; + } + + node = lexer.inferredTag("body"); + break; + } + + /* node must be body */ + + Node.insertNodeAtEnd(html, node); + parseTag(lexer, node, mode); + } + + }; + + public static class ParseHead implements Parser { + + public void parse( Lexer lexer, Node head, short mode ) + { + Node node; + int HasTitle = 0; + int HasBase = 0; + TagTable tt = lexer.configuration.tt; + + while (true) + { + node = lexer.getToken(Lexer.IgnoreWhitespace); + if (node == null) break; + if (node.tag == head.tag && node.type == Node.EndTag) + { + head.closed = true; + break; + } + + if (node.type == Node.TextNode) + { + lexer.ungetToken(); + break; + } + + /* deal with comments etc. */ + if (Node.insertMisc(head, node)) + continue; + + if (node.type == Node.DocTypeTag) + { + Node.insertDocType(lexer, head, node); + continue; + } + + /* discard unknown tags */ + if (node.tag == null) + { + Report.warning(lexer, head, node, Report.DISCARDING_UNEXPECTED); + continue; + } + + if (!((node.tag.model & Dict.CM_HEAD) != 0)) + { + lexer.ungetToken(); + break; + } + + if (node.type == Node.StartTag || node.type == Node.StartEndTag) + { + if (node.tag == tt.tagTitle) + { + ++HasTitle; + + if (HasTitle > 1) + Report.warning(lexer, head, node, Report.TOO_MANY_ELEMENTS); + } + else if (node.tag == tt.tagBase) + { + ++HasBase; + + if (HasBase > 1) + Report.warning(lexer, head, node, Report.TOO_MANY_ELEMENTS); + } + else if (node.tag == tt.tagNoscript) + Report.warning(lexer, head, node, Report.TAG_NOT_ALLOWED_IN); + + Node.insertNodeAtEnd(head, node); + parseTag(lexer, node, Lexer.IgnoreWhitespace); + continue; + } + + /* discard unexpected text nodes and end tags */ + Report.warning(lexer, head, node, Report.DISCARDING_UNEXPECTED); + } + + if (HasTitle == 0) + { + Report.warning(lexer, head, null, Report.MISSING_TITLE_ELEMENT); + Node.insertNodeAtEnd(head, lexer.inferredTag( "title")); + } + } + + }; + + public static class ParseTitle implements Parser { + + public void parse( Lexer lexer, Node title, short mode ) + { + Node node; + + while (true) + { + node = lexer.getToken(Lexer.MixedContent); + if (node == null) break; + if (node.tag == title.tag && node.type == Node.EndTag) + { + title.closed = true; + Node.trimSpaces(lexer, title); + return; + } + + if (node.type == Node.TextNode) + { + /* only called for 1st child */ + if (title.content == null) + Node.trimInitialSpace(lexer, title, node); + + if (node.start >= node.end) + { + continue; + } + + Node.insertNodeAtEnd(title, node); + continue; + } + + /* deal with comments etc. */ + if (Node.insertMisc(title, node)) + continue; + + /* discard unknown tags */ + if (node.tag == null) + { + Report.warning(lexer, title, node, Report.DISCARDING_UNEXPECTED); + continue; + } + + /* pushback unexpected tokens */ + Report.warning(lexer, title, node, Report.MISSING_ENDTAG_BEFORE); + lexer.ungetToken(); + Node.trimSpaces(lexer, title); + return; + } + + Report.warning(lexer, title, node, Report.MISSING_ENDTAG_FOR); + } + + }; + + public static class ParseScript implements Parser { + + public void parse( Lexer lexer, Node script, short mode ) + { + /* + This isn't quite right for CDATA content as it recognises + tags within the content and parses them accordingly. + This will unfortunately screw up scripts which include + < + letter, < + !, < + ? or < + / + letter + */ + + Node node; + + node = lexer.getCDATA( script); + + if (node != null) + Node.insertNodeAtEnd(script, node); + } + + }; + + public static class ParseBody implements Parser { + + public void parse( Lexer lexer, Node body, short mode ) + { + Node node; + boolean checkstack, iswhitenode; + + mode = Lexer.IgnoreWhitespace; + checkstack = true; + TagTable tt = lexer.configuration.tt; + + while (true) + { + node = lexer.getToken(mode); + if (node == null) break; + if (node.tag == body.tag && node.type == Node.EndTag) + { + body.closed = true; + Node.trimSpaces(lexer, body); + lexer.seenBodyEndTag = 1; + mode = Lexer.IgnoreWhitespace; + + if (body.parent.tag == tt.tagNoframes) + break; + + continue; + } + + if (node.tag == tt.tagNoframes) + { + if (node.type == Node.StartTag) + { + Node.insertNodeAtEnd(body, node); + getParseBlock().parse(lexer, node, mode); + continue; + } + + if (node.type == Node.EndTag && + body.parent.tag == tt.tagNoframes) + { + Node.trimSpaces(lexer, body); + lexer.ungetToken(); + break; + } + } + + if ((node.tag == tt.tagFrame || node.tag == tt.tagFrameset) + && body.parent.tag == tt.tagNoframes) + { + Node.trimSpaces(lexer, body); + lexer.ungetToken(); + break; + } + + if (node.tag == tt.tagHtml) + { + if (node.type == Node.StartTag || node.type == Node.StartEndTag) + Report.warning(lexer, body, node, Report.DISCARDING_UNEXPECTED); + + continue; + } + + iswhitenode = false; + + if (node.type == Node.TextNode && + node.end <= node.start + 1 && + node.textarray[node.start] == (byte)' ') + iswhitenode = true; + + /* deal with comments etc. */ + if (Node.insertMisc(body, node)) + continue; + + if (lexer.seenBodyEndTag == 1 && !iswhitenode) + { + ++lexer.seenBodyEndTag; + Report.warning(lexer, body, node, Report.CONTENT_AFTER_BODY); + } + + /* mixed content model permits text */ + if (node.type == Node.TextNode) + { + if (iswhitenode && mode == Lexer.IgnoreWhitespace) + { + continue; + } + + if (lexer.configuration.EncloseBodyText && !iswhitenode) + { + Node para; + + lexer.ungetToken(); + para = lexer.inferredTag("p"); + Node.insertNodeAtEnd(body, para); + parseTag(lexer, para, mode); + mode = Lexer.MixedContent; + continue; + } + else /* strict doesn't allow text here */ + lexer.versions &= ~(Dict.VERS_HTML40_STRICT | Dict.VERS_HTML20); + + if (checkstack) + { + checkstack = false; + + if (lexer.inlineDup( node) > 0) + continue; + } + + Node.insertNodeAtEnd(body, node); + mode = Lexer.MixedContent; + continue; + } + + if (node.type == Node.DocTypeTag) + { + Node.insertDocType(lexer, body, node); + continue; + } + /* discard unknown and PARAM tags */ + if (node.tag == null || node.tag == tt.tagParam) + { + Report.warning(lexer, body, node, Report.DISCARDING_UNEXPECTED); + continue; + } + + /* + Netscape allows LI and DD directly in BODY + We infer UL or DL respectively and use this + boolean to exclude block-level elements so as + to match Netscape's observed behaviour. + */ + lexer.excludeBlocks = false; + + if (!((node.tag.model & Dict.CM_BLOCK) != 0) && + !((node.tag.model & Dict.CM_INLINE) != 0)) + { + /* avoid this error message being issued twice */ + if (!((node.tag.model & Dict.CM_HEAD) != 0)) + Report.warning(lexer, body, node, Report.TAG_NOT_ALLOWED_IN); + + if ((node.tag.model & Dict.CM_HTML) != 0) + { + /* copy body attributes if current body was inferred */ + if (node.tag == tt.tagBody && body.implicit + && body.attributes == null) + { + body.attributes = node.attributes; + node.attributes = null; + } + + continue; + } + + if ((node.tag.model & Dict.CM_HEAD) != 0) + { + moveToHead(lexer, body, node); + continue; + } + + if ((node.tag.model & Dict.CM_LIST) != 0) + { + lexer.ungetToken(); + node = lexer.inferredTag( "ul"); + Node.addClass(node, "noindent"); + lexer.excludeBlocks = true; + } + else if ((node.tag.model & Dict.CM_DEFLIST) != 0) + { + lexer.ungetToken(); + node = lexer.inferredTag( "dl"); + lexer.excludeBlocks = true; + } + else if ((node.tag.model & (Dict.CM_TABLE | Dict.CM_ROWGRP | Dict.CM_ROW)) != 0) + { + lexer.ungetToken(); + node = lexer.inferredTag( "table"); + lexer.excludeBlocks = true; + } + else + { + /* AQ: The following line is from the official C + version of tidy. It doesn't make sense to me + because the '!' operator has higher precedence + than the '&' operator. It seems to me that the + expression always evaluates to 0. + + if (!node->tag->model & (CM_ROW | CM_FIELD)) + + AQ: 13Jan2000 fixed in C tidy + */ + if (!((node.tag.model & (Dict.CM_ROW | Dict.CM_FIELD)) != 0)) + { + lexer.ungetToken(); + return; + } + + /* ignore </td> </th> <option> etc. */ + continue; + } + } + + if (node.type == Node.EndTag) + { + if (node.tag == tt.tagBr) + node.type = Node.StartTag; + else if (node.tag == tt.tagP) + { + Node.coerceNode(lexer, node, tt.tagBr); + Node.insertNodeAtEnd(body, node); + node = lexer.inferredTag("br"); + } + else if ((node.tag.model & Dict.CM_INLINE) != 0) + lexer.popInline(node); + } + + if (node.type == Node.StartTag || node.type == Node.StartEndTag) + { + if (((node.tag.model & Dict.CM_INLINE) != 0) && !((node.tag.model & Dict.CM_MIXED) != 0)) + { + /* HTML4 strict doesn't allow inline content here */ + /* but HTML2 does allow img elements as children of body */ + if (node.tag == tt.tagImg) + lexer.versions &= ~Dict.VERS_HTML40_STRICT; + else + lexer.versions &= ~(Dict.VERS_HTML40_STRICT | Dict.VERS_HTML20); + + if (checkstack && !node.implicit) + { + checkstack = false; + + if (lexer.inlineDup( node) > 0) + continue; + } + + mode = Lexer.MixedContent; + } + else + { + checkstack = true; + mode = Lexer.IgnoreWhitespace; + } + + if (node.implicit) + Report.warning(lexer, body, node, Report.INSERTING_TAG); + + Node.insertNodeAtEnd(body, node); + parseTag(lexer, node, mode); + continue; + } + + /* discard unexpected tags */ + Report.warning(lexer, body, node, Report.DISCARDING_UNEXPECTED); + } + } + + }; + + public static class ParseFrameSet implements Parser { + + public void parse( Lexer lexer, Node frameset, short mode ) + { + Node node; + TagTable tt = lexer.configuration.tt; + + lexer.badAccess |= Report.USING_FRAMES; + + while (true) + { + node = lexer.getToken(Lexer.IgnoreWhitespace); + if (node == null) break; + if (node.tag == frameset.tag && node.type == Node.EndTag) + { + frameset.closed = true; + Node.trimSpaces(lexer, frameset); + return; + } + + /* deal with comments etc. */ + if (Node.insertMisc(frameset, node)) + continue; + + if (node.tag == null) + { + Report.warning(lexer, frameset, node, Report.DISCARDING_UNEXPECTED); + continue; + } + + if (node.type == Node.StartTag || node.type == Node.StartEndTag) + { + if (node.tag != null && (node.tag.model & Dict.CM_HEAD) != 0) + { + moveToHead(lexer, frameset, node); + continue; + } + } + + if (node.tag == tt.tagBody) + { + lexer.ungetToken(); + node = lexer.inferredTag("noframes"); + Report.warning(lexer, frameset, node, Report.INSERTING_TAG); + } + + if (node.type == Node.StartTag && (node.tag.model & Dict.CM_FRAMES) != 0) + { + Node.insertNodeAtEnd(frameset, node); + lexer.excludeBlocks = false; + parseTag(lexer, node, Lexer.MixedContent); + continue; + } + else if (node.type == Node.StartEndTag && (node.tag.model & Dict.CM_FRAMES) != 0) + { + Node.insertNodeAtEnd(frameset, node); + continue; + } + + /* discard unexpected tags */ + Report.warning(lexer, frameset, node, Report.DISCARDING_UNEXPECTED); + } + + Report.warning(lexer, frameset, node, Report.MISSING_ENDTAG_FOR); + } + + }; + + public static class ParseInline implements Parser { + + public void parse( Lexer lexer, Node element, short mode ) + { + Node node, parent; + TagTable tt = lexer.configuration.tt; + + if ((element.tag.model & Dict.CM_EMPTY) != 0) + return; + + if (element.tag == tt.tagA) + { + if (element.attributes == null) + { + Report.warning(lexer, element.parent, element, Report.DISCARDING_UNEXPECTED); + Node.discardElement(element); + return; + } + } + + /* + ParseInline is used for some block level elements like H1 to H6 + For such elements we need to insert inline emphasis tags currently + on the inline stack. For Inline elements, we normally push them + onto the inline stack provided they aren't implicit or OBJECT/APPLET. + This test is carried out in PushInline and PopInline, see istack.c + We don't push A or SPAN to replicate current browser behavior + */ + if (((element.tag.model & Dict.CM_BLOCK) != 0) || (element.tag == tt.tagDt)) + lexer.inlineDup( null); + else if ((element.tag.model & Dict.CM_INLINE) != 0 && + element.tag != tt.tagA && element.tag != tt.tagSpan) + lexer.pushInline( element); + + if (element.tag == tt.tagNobr) + lexer.badLayout |= Report.USING_NOBR; + else if (element.tag == tt.tagFont) + lexer.badLayout |= Report.USING_FONT; + + /* Inline elements may or may not be within a preformatted element */ + if (mode != Lexer.Preformatted) + mode = Lexer.MixedContent; + + while (true) + { + node = lexer.getToken(mode); + if (node == null) break; + /* end tag for current element */ + if (node.tag == element.tag && node.type == Node.EndTag) + { + if ((element.tag.model & Dict.CM_INLINE) != 0 && + element.tag != tt.tagA) + lexer.popInline( node); + + if (!((mode & Lexer.Preformatted) != 0)) + Node.trimSpaces(lexer, element); + /* + if a font element wraps an anchor and nothing else + then move the font element inside the anchor since + otherwise it won't alter the anchor text color + */ + if (element.tag == tt.tagFont && + element.content != null && + element.content == element.last) + { + Node child = element.content; + + if (child.tag == tt.tagA) + { + child.parent = element.parent; + child.next = element.next; + child.prev = element.prev; + + if (child.prev != null) + child.prev.next = child; + else + child.parent.content = child; + + if (child.next != null) + child.next.prev = child; + else + child.parent.last = child; + + element.next = null; + element.prev = null; + element.parent = child; + element.content = child.content; + element.last = child.last; + child.content = element; + child.last = element; + for (child = element.content; child != null; child = child.next) + child.parent = element; + } + } + element.closed = true; + Node.trimSpaces(lexer, element); + Node.trimEmptyElement(lexer, element); + return; + } + + /* <u>...<u> map 2nd <u> to </u> if 1st is explicit */ + /* otherwise emphasis nesting is probably unintentional */ + /* big and small have cumulative effect to leave them alone */ + if (node.type == Node.StartTag + && node.tag == element.tag + && lexer.isPushed(node) + && !node.implicit + && !element.implicit + && node.tag != null && ((node.tag.model & Dict.CM_INLINE) != 0) + && node.tag != tt.tagA + && node.tag != tt.tagFont + && node.tag != tt.tagBig + && node.tag != tt.tagSmall) + { + if (element.content != null && node.attributes == null) + { + Report.warning(lexer, element, node, Report.COERCE_TO_ENDTAG); + node.type = Node.EndTag; + lexer.ungetToken(); + continue; + } + + Report.warning(lexer, element, node, Report.NESTED_EMPHASIS); + } + + if (node.type == Node.TextNode) + { + /* only called for 1st child */ + if (element.content == null && + !((mode & Lexer.Preformatted) != 0)) + Node.trimSpaces(lexer, element); + + if (node.start >= node.end) + { + continue; + } + + Node.insertNodeAtEnd(element, node); + continue; + } + + /* mixed content model so allow text */ + if (Node.insertMisc(element, node)) + continue; + + /* deal with HTML tags */ + if (node.tag == tt.tagHtml) + { + if (node.type == Node.StartTag || node.type == Node.StartEndTag) + { + Report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED); + continue; + } + + /* otherwise infer end of inline element */ + lexer.ungetToken(); + if (!((mode & Lexer.Preformatted) != 0)) + Node.trimSpaces(lexer, element); + Node.trimEmptyElement(lexer, element); + return; + } + + /* within <dt> or <pre> map <p> to <br> */ + if (node.tag == tt.tagP && + node.type == Node.StartTag && + ((mode & Lexer.Preformatted) != 0 || + element.tag == tt.tagDt || + element.isDescendantOf(tt.tagDt))) + { + node.tag = tt.tagBr; + node.element = "br"; + Node.trimSpaces(lexer, element); + Node.insertNodeAtEnd(element, node); + continue; + } + + /* ignore unknown and PARAM tags */ + if (node.tag == null || node.tag == tt.tagParam) + { + Report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED); + continue; + } + + if (node.tag == tt.tagBr && node.type == Node.EndTag) + node.type = Node.StartTag; + + if (node.type == Node.EndTag) + { + /* coerce </br> to <br> */ + if (node.tag == tt.tagBr) + node.type = Node.StartTag; + else if (node.tag == tt.tagP) + { + /* coerce unmatched </p> to <br><br> */ + if (!element.isDescendantOf(tt.tagP)) + { + Node.coerceNode(lexer, node, tt.tagBr); + Node.trimSpaces(lexer, element); + Node.insertNodeAtEnd(element, node); + node = lexer.inferredTag("br"); + continue; + } + } + else if ((node.tag.model & Dict.CM_INLINE) != 0 + && node.tag != tt.tagA + && !((node.tag.model & Dict.CM_OBJECT) != 0) + && (element.tag.model & Dict.CM_INLINE) != 0) + { + /* allow any inline end tag to end current element */ + lexer.popInline( element); + + if (element.tag != tt.tagA) + { + if (node.tag == tt.tagA && node.tag != element.tag) + { + Report.warning(lexer, element, node, Report.MISSING_ENDTAG_BEFORE); + lexer.ungetToken(); + } + else + { + Report.warning(lexer, element, node, Report.NON_MATCHING_ENDTAG); + } + + if (!((mode & Lexer.Preformatted) != 0)) + Node.trimSpaces(lexer, element); + Node.trimEmptyElement(lexer, element); + return; + } + + /* if parent is <a> then discard unexpected inline end tag */ + Report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED); + continue; + } /* special case </tr> etc. for stuff moved in front of table */ + else if (lexer.exiled + && node.tag.model != 0 + && (node.tag.model & Dict.CM_TABLE) != 0) + { + lexer.ungetToken(); + Node.trimSpaces(lexer, element); + Node.trimEmptyElement(lexer, element); + return; + } + } + + /* allow any header tag to end current header */ + if ((node.tag.model & Dict.CM_HEADING) != 0 && (element.tag.model & Dict.CM_HEADING) != 0) + { + if (node.tag == element.tag) + { + Report.warning(lexer, element, node, Report.NON_MATCHING_ENDTAG); + } + else + { + Report.warning(lexer, element, node, Report.MISSING_ENDTAG_BEFORE); + lexer.ungetToken(); + } + if (!((mode & Lexer.Preformatted) != 0)) + Node.trimSpaces(lexer, element); + Node.trimEmptyElement(lexer, element); + return; + } + + /* + an <A> tag to ends any open <A> element + but <A href=...> is mapped to </A><A href=...> + */ + if (node.tag == tt.tagA && !node.implicit && lexer.isPushed(node)) + { + /* coerce <a> to </a> unless it has some attributes */ + if (node.attributes == null) + { + node.type = Node.EndTag; + Report.warning(lexer, element, node, Report.COERCE_TO_ENDTAG); + lexer.popInline( node); + lexer.ungetToken(); + continue; + } + + lexer.ungetToken(); + Report.warning(lexer, element, node, Report.MISSING_ENDTAG_BEFORE); + lexer.popInline( element); + if (!((mode & Lexer.Preformatted) != 0)) + Node.trimSpaces(lexer, element); + Node.trimEmptyElement(lexer, element); + return; + } + + if ((element.tag.model & Dict.CM_HEADING) != 0) + { + if (node.tag == tt.tagCenter || + node.tag == tt.tagDiv) + { + if (node.type != Node.StartTag && + node.type != Node.StartEndTag) + { + Report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED); + continue; + } + + Report.warning(lexer, element, node, Report.TAG_NOT_ALLOWED_IN); + + /* insert center as parent if heading is empty */ + if (element.content == null) + { + Node.insertNodeAsParent(element, node); + continue; + } + + /* split heading and make center parent of 2nd part */ + Node.insertNodeAfterElement(element, node); + + if (!((mode & Lexer.Preformatted) != 0)) + Node.trimSpaces(lexer, element); + + element = lexer.cloneNode(element); + element.start = lexer.lexsize; + element.end = lexer.lexsize; + Node.insertNodeAtEnd(node, element); + continue; + } + + if (node.tag == tt.tagHr) + { + if (node.type != Node.StartTag && + node.type != Node.StartEndTag) + { + Report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED); + continue; + } + + Report.warning(lexer, element, node, Report.TAG_NOT_ALLOWED_IN); + + /* insert hr before heading if heading is empty */ + if (element.content == null) + { + Node.insertNodeBeforeElement(element, node); + continue; + } + + /* split heading and insert hr before 2nd part */ + Node.insertNodeAfterElement(element, node); + + if (!((mode & Lexer.Preformatted) != 0)) + Node.trimSpaces(lexer, element); + + element = lexer.cloneNode(element); + element.start = lexer.lexsize; + element.end = lexer.lexsize; + Node.insertNodeAfterElement(node, element); + continue; + } + } + + if (element.tag == tt.tagDt) + { + if (node.tag == tt.tagHr) + { + Node dd; + + if (node.type != Node.StartTag && + node.type != Node.StartEndTag) + { + Report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED); + continue; + } + + Report.warning(lexer, element, node, Report.TAG_NOT_ALLOWED_IN); + dd = lexer.inferredTag("dd"); + + /* insert hr within dd before dt if dt is empty */ + if (element.content == null) + { + Node.insertNodeBeforeElement(element, dd); + Node.insertNodeAtEnd(dd, node); + continue; + } + + /* split dt and insert hr within dd before 2nd part */ + Node.insertNodeAfterElement(element, dd); + Node.insertNodeAtEnd(dd, node); + + if (!((mode & Lexer.Preformatted) != 0)) + Node.trimSpaces(lexer, element); + + element = lexer.cloneNode(element); + element.start = lexer.lexsize; + element.end = lexer.lexsize; + Node.insertNodeAfterElement(dd, element); + continue; + } + } + + + /* + if this is the end tag for an ancestor element + then infer end tag for this element + */ + if (node.type == Node.EndTag) + { + for (parent = element.parent; + parent != null; parent = parent.parent) + { + if (node.tag == parent.tag) + { + if (!((element.tag.model & Dict.CM_OPT) != 0) && + !element.implicit) + Report.warning(lexer, element, node, Report.MISSING_ENDTAG_BEFORE); + + if (element.tag == tt.tagA) + lexer.popInline(element); + + lexer.ungetToken(); + + if (!((mode & Lexer.Preformatted) != 0)) + Node.trimSpaces(lexer, element); + + Node.trimEmptyElement(lexer, element); + return; + } + } + } + + /* block level tags end this element */ + if (!((node.tag.model & Dict.CM_INLINE) != 0)) + { + if (node.type != Node.StartTag) + { + Report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED); + continue; + } + + if (!((element.tag.model & Dict.CM_OPT) != 0)) + Report.warning(lexer, element, node, Report.MISSING_ENDTAG_BEFORE); + + if ((node.tag.model & Dict.CM_HEAD) != 0 && + !((node.tag.model & Dict.CM_BLOCK) != 0)) + { + moveToHead(lexer, element, node); + continue; + } + + /* + prevent anchors from propagating into block tags + except for headings h1 to h6 + */ + if (element.tag == tt.tagA) + { + if (node.tag != null && + !((node.tag.model & Dict.CM_HEADING) != 0)) + lexer.popInline(element); + else if (!(element.content != null)) + { + Node.discardElement(element); + lexer.ungetToken(); + return; + } + } + + lexer.ungetToken(); + + if (!((mode & Lexer.Preformatted) != 0)) + Node.trimSpaces(lexer, element); + + Node.trimEmptyElement(lexer, element); + return; + } + + /* parse inline element */ + if (node.type == Node.StartTag || node.type == Node.StartEndTag) + { + if (node.implicit) + Report.warning(lexer, element, node, Report.INSERTING_TAG); + + /* trim white space before <br> */ + if (node.tag == tt.tagBr) + Node.trimSpaces(lexer, element); + + Node.insertNodeAtEnd(element, node); + parseTag(lexer, node, mode); + continue; + } + + /* discard unexpected tags */ + Report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED); + } + + if (!((element.tag.model & Dict.CM_OPT) != 0)) + Report.warning(lexer, element, node, Report.MISSING_ENDTAG_FOR); + + Node.trimEmptyElement(lexer, element); + } + }; + + public static class ParseList implements Parser { + + public void parse( Lexer lexer, Node list, short mode ) + { + Node node; + Node parent; + TagTable tt = lexer.configuration.tt; + + if ((list.tag.model & Dict.CM_EMPTY) != 0) + return; + + lexer.insert = -1; /* defer implicit inline start tags */ + + while (true) + { + node = lexer.getToken(Lexer.IgnoreWhitespace); + if (node == null) break; + + if (node.tag == list.tag && node.type == Node.EndTag) + { + if ((list.tag.model & Dict.CM_OBSOLETE) != 0) + Node.coerceNode(lexer, list, tt.tagUl); + + list.closed = true; + Node.trimEmptyElement(lexer, list); + return; + } + + /* deal with comments etc. */ + if (Node.insertMisc(list, node)) + continue; + + if (node.type != Node.TextNode && node.tag == null) + { + Report.warning(lexer, list, node, Report.DISCARDING_UNEXPECTED); + continue; + } + + /* + if this is the end tag for an ancestor element + then infer end tag for this element + */ + if (node.type == Node.EndTag) + { + if (node.tag == tt.tagForm) + { + lexer.badForm = 1; + Report.warning(lexer, list, node, Report.DISCARDING_UNEXPECTED); + continue; + } + + if (node.tag != null && (node.tag.model & Dict.CM_INLINE) != 0) + { + Report.warning(lexer, list, node, Report.DISCARDING_UNEXPECTED); + lexer.popInline(node); + continue; + } + + for (parent = list.parent; + parent != null; parent = parent.parent) + { + if (node.tag == parent.tag) + { + Report.warning(lexer, list, node, Report.MISSING_ENDTAG_BEFORE); + lexer.ungetToken(); + + if ((list.tag.model & Dict.CM_OBSOLETE) != 0) + Node.coerceNode(lexer, list, tt.tagUl); + + Node.trimEmptyElement(lexer, list); + return; + } + } + + Report.warning(lexer, list, node, Report.DISCARDING_UNEXPECTED); + continue; + } + + if (node.tag != tt.tagLi) + { + lexer.ungetToken(); + + if (node.tag != null && + (node.tag.model & Dict.CM_BLOCK) != 0 && + lexer.excludeBlocks) + { + Report.warning(lexer, list, node, Report.MISSING_ENDTAG_BEFORE); + Node.trimEmptyElement(lexer, list); + return; + } + + node = lexer.inferredTag("li"); + node.addAttribute("style", "list-style: none"); + Report.warning(lexer, list, node, Report.MISSING_STARTTAG); + } + + /* node should be <LI> */ + Node.insertNodeAtEnd(list, node); + parseTag(lexer, node, Lexer.IgnoreWhitespace); + } + + if ((list.tag.model & Dict.CM_OBSOLETE) != 0) + Node.coerceNode(lexer, list, tt.tagUl); + + Report.warning(lexer, list, node, Report.MISSING_ENDTAG_FOR); + Node.trimEmptyElement(lexer, list); + } + + }; + + public static class ParseDefList implements Parser { + + public void parse( Lexer lexer, Node list, short mode ) + { + Node node, parent; + TagTable tt = lexer.configuration.tt; + + if ((list.tag.model & Dict.CM_EMPTY) != 0) + return; + + lexer.insert = -1; /* defer implicit inline start tags */ + + while (true) + { + node = lexer.getToken(Lexer.IgnoreWhitespace); + if (node == null) break; + if (node.tag == list.tag && node.type == Node.EndTag) + { + list.closed = true; + Node.trimEmptyElement(lexer, list); + return; + } + + /* deal with comments etc. */ + if (Node.insertMisc(list, node)) + continue; + + if (node.type == Node.TextNode) + { + lexer.ungetToken(); + node = lexer.inferredTag( "dt"); + Report.warning(lexer, list, node, Report.MISSING_STARTTAG); + } + + if (node.tag == null) + { + Report.warning(lexer, list, node, Report.DISCARDING_UNEXPECTED); + continue; + } + + /* + if this is the end tag for an ancestor element + then infer end tag for this element + */ + if (node.type == Node.EndTag) + { + if (node.tag == tt.tagForm) + { + lexer.badForm = 1; + Report.warning(lexer, list, node, Report.DISCARDING_UNEXPECTED); + continue; + } + + for (parent = list.parent; + parent != null; parent = parent.parent) + { + if (node.tag == parent.tag) + { + Report.warning(lexer, list, node, Report.MISSING_ENDTAG_BEFORE); + + lexer.ungetToken(); + Node.trimEmptyElement(lexer, list); + return; + } + } + } + + /* center in a dt or a dl breaks the dl list in two */ + if (node.tag == tt.tagCenter) + { + if (list.content != null) + Node.insertNodeAfterElement(list, node); + else /* trim empty dl list */ + { + Node.insertNodeBeforeElement(list, node); + Node.discardElement(list); + } + + /* and parse contents of center */ + parseTag(lexer, node, mode); + + /* now create a new dl element */ + list = lexer.inferredTag("dl"); + Node.insertNodeAfterElement(node, list); + continue; + } + + if (!(node.tag == tt.tagDt || node.tag == tt.tagDd)) + { + lexer.ungetToken(); + + if (!((node.tag.model & (Dict.CM_BLOCK | Dict.CM_INLINE)) != 0)) + { + Report.warning(lexer, list, node, Report.TAG_NOT_ALLOWED_IN); + Node.trimEmptyElement(lexer, list); + return; + } + + /* if DD appeared directly in BODY then exclude blocks */ + if (!((node.tag.model & Dict.CM_INLINE) != 0) && lexer.excludeBlocks) + { + Node.trimEmptyElement(lexer, list); + return; + } + + node = lexer.inferredTag( "dd"); + Report.warning(lexer, list, node, Report.MISSING_STARTTAG); + } + + if (node.type == Node.EndTag) + { + Report.warning(lexer, list, node, Report.DISCARDING_UNEXPECTED); + continue; + } + + /* node should be <DT> or <DD>*/ + Node.insertNodeAtEnd(list, node); + parseTag(lexer, node, Lexer.IgnoreWhitespace); + } + + Report.warning(lexer, list, node, Report.MISSING_ENDTAG_FOR); + Node.trimEmptyElement(lexer, list); + } + + }; + + public static class ParsePre implements Parser { + + public void parse( Lexer lexer, Node pre, short mode ) + { + Node node, parent; + TagTable tt = lexer.configuration.tt; + + if ((pre.tag.model & Dict.CM_EMPTY) != 0) + return; + + if ((pre.tag.model & Dict.CM_OBSOLETE) != 0) + Node.coerceNode(lexer, pre, tt.tagPre); + + lexer.inlineDup( null); /* tell lexer to insert inlines if needed */ + + while (true) + { + node = lexer.getToken(Lexer.Preformatted); + if (node == null) break; + if (node.tag == pre.tag && node.type == Node.EndTag) + { + Node.trimSpaces(lexer, pre); + pre.closed = true; + Node.trimEmptyElement(lexer, pre); + return; + } + + if (node.tag == tt.tagHtml) + { + if (node.type == Node.StartTag || node.type == Node.StartEndTag) + Report.warning(lexer, pre, node, Report.DISCARDING_UNEXPECTED); + + continue; + } + + if (node.type == Node.TextNode) + { + /* if first check for inital newline */ + if (pre.content == null) + { + if (node.textarray[node.start] == (byte)'\n') + ++node.start; + + if (node.start >= node.end) + { + continue; + } + } + + Node.insertNodeAtEnd(pre, node); + continue; + } + + /* deal with comments etc. */ + if (Node.insertMisc(pre, node)) + continue; + + /* discard unknown and PARAM tags */ + if (node.tag == null || node.tag == tt.tagParam) + { + Report.warning(lexer, pre, node, Report.DISCARDING_UNEXPECTED); + continue; + } + + if (node.tag == tt.tagP) + { + if (node.type == Node.StartTag) + { + Report.warning(lexer, pre, node, Report.USING_BR_INPLACE_OF); + + /* trim white space before <p> in <pre>*/ + Node.trimSpaces(lexer, pre); + + /* coerce both <p> and </p> to <br> */ + Node.coerceNode(lexer, node, tt.tagBr); + Node.insertNodeAtEnd(pre, node); + } + else + { + Report.warning(lexer, pre, node, Report.DISCARDING_UNEXPECTED); + } + continue; + } + + if ((node.tag.model & Dict.CM_HEAD) != 0 && !((node.tag.model & Dict.CM_BLOCK) != 0)) + { + moveToHead(lexer, pre, node); + continue; + } + + /* + if this is the end tag for an ancestor element + then infer end tag for this element + */ + if (node.type == Node.EndTag) + { + if (node.tag == tt.tagForm) + { + lexer.badForm = 1; + Report.warning(lexer, pre, node, Report.DISCARDING_UNEXPECTED); + continue; + } + + for (parent = pre.parent; + parent != null; parent = parent.parent) + { + if (node.tag == parent.tag) + { + Report.warning(lexer, pre, node, Report.MISSING_ENDTAG_BEFORE); + + lexer.ungetToken(); + Node.trimSpaces(lexer, pre); + Node.trimEmptyElement(lexer, pre); + return; + } + } + } + + /* what about head content, HEAD, BODY tags etc? */ + if (!((node.tag.model & Dict.CM_INLINE) != 0)) + { + if (node.type != Node.StartTag) + { + Report.warning(lexer, pre, node, Report.DISCARDING_UNEXPECTED); + continue; + } + + Report.warning(lexer, pre, node, Report.MISSING_ENDTAG_BEFORE); + lexer.excludeBlocks = true; + + /* check if we need to infer a container */ + if ((node.tag.model & Dict.CM_LIST) != 0) + { + lexer.ungetToken(); + node = lexer.inferredTag( "ul"); + Node.addClass(node, "noindent"); + } + else if ((node.tag.model & Dict.CM_DEFLIST) != 0) + { + lexer.ungetToken(); + node = lexer.inferredTag( "dl"); + } + else if ((node.tag.model & Dict.CM_TABLE) != 0) + { + lexer.ungetToken(); + node = lexer.inferredTag( "table"); + } + + Node.insertNodeAfterElement(pre, node); + pre = lexer.inferredTag( "pre"); + Node.insertNodeAfterElement(node, pre); + parseTag(lexer, node, Lexer.IgnoreWhitespace); + lexer.excludeBlocks = false; + continue; + } + /* + if (!((node.tag.model & Dict.CM_INLINE) != 0)) + { + Report.warning(lexer, pre, node, Report.MISSING_ENDTAG_BEFORE); + lexer.ungetToken(); + return; + } + */ + if (node.type == Node.StartTag || node.type == Node.StartEndTag) + { + /* trim white space before <br> */ + if (node.tag == tt.tagBr) + Node.trimSpaces(lexer, pre); + + Node.insertNodeAtEnd(pre, node); + parseTag(lexer, node, Lexer.Preformatted); + continue; + } + + /* discard unexpected tags */ + Report.warning(lexer, pre, node, Report.DISCARDING_UNEXPECTED); + } + + Report.warning(lexer, pre, node, Report.MISSING_ENDTAG_FOR); + Node.trimEmptyElement(lexer, pre); + } + + }; + + public static class ParseBlock implements Parser { + + public void parse( Lexer lexer, Node element, short mode ) + /* + element is node created by the lexer + upon seeing the start tag, or by the + parser when the start tag is inferred + */ + { + Node node, parent; + boolean checkstack; + int istackbase = 0; + TagTable tt = lexer.configuration.tt; + + checkstack = true; + + if ((element.tag.model & Dict.CM_EMPTY) != 0) + return; + + if (element.tag == tt.tagForm && element.isDescendantOf(tt.tagForm)) + Report.warning(lexer, element, null, Report.ILLEGAL_NESTING); + + /* + InlineDup() asks the lexer to insert inline emphasis tags + currently pushed on the istack, but take care to avoid + propagating inline emphasis inside OBJECT or APPLET. + For these elements a fresh inline stack context is created + and disposed of upon reaching the end of the element. + They thus behave like table cells in this respect. + */ + if ((element.tag.model & Dict.CM_OBJECT) != 0) + { + istackbase = lexer.istackbase; + lexer.istackbase = lexer.istack.size(); + } + + if (!((element.tag.model & Dict.CM_MIXED) != 0)) + lexer.inlineDup( null); + + mode = Lexer.IgnoreWhitespace; + + while (true) + { + node = lexer.getToken(mode /*Lexer.MixedContent*/); + if (node == null) break; + /* end tag for this element */ + if (node.type == Node.EndTag && node.tag != null && + (node.tag == element.tag || element.was == node.tag)) + { + + if ((element.tag.model & Dict.CM_OBJECT) != 0) + { + /* pop inline stack */ + while (lexer.istack.size() > lexer.istackbase) + lexer.popInline( null); + lexer.istackbase = istackbase; + } + + element.closed = true; + Node.trimSpaces(lexer, element); + Node.trimEmptyElement(lexer, element); + return; + } + + if (node.tag == tt.tagHtml || + node.tag == tt.tagHead || + node.tag == tt.tagBody) + { + if (node.type == Node.StartTag || node.type == Node.StartEndTag) + Report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED); + + continue; + } + + if (node.type == Node.EndTag) + { + if (node.tag == null) + { + Report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED); + + continue; + } + else if (node.tag == tt.tagBr) + node.type = Node.StartTag; + else if (node.tag == tt.tagP) + { + Node.coerceNode(lexer, node, tt.tagBr); + Node.insertNodeAtEnd(element, node); + node = lexer.inferredTag("br"); + } + else + { + /* + if this is the end tag for an ancestor element + then infer end tag for this element + */ + for (parent = element.parent; + parent != null; parent = parent.parent) + { + if (node.tag == parent.tag) + { + if (!((element.tag.model & Dict.CM_OPT) != 0)) + Report.warning(lexer, element, node, Report.MISSING_ENDTAG_BEFORE); + + lexer.ungetToken(); + + if ((element.tag.model & Dict.CM_OBJECT) != 0) + { + /* pop inline stack */ + while (lexer.istack.size() > lexer.istackbase) + lexer.popInline( null); + lexer.istackbase = istackbase; + } + + Node.trimSpaces(lexer, element); + Node.trimEmptyElement(lexer, element); + return; + } + } + /* special case </tr> etc. for stuff moved in front of table */ + if (lexer.exiled + && node.tag.model != 0 + && (node.tag.model & Dict.CM_TABLE) != 0) + { + lexer.ungetToken(); + Node.trimSpaces(lexer, element); + Node.trimEmptyElement(lexer, element); + return; + } + } + } + + /* mixed content model permits text */ + if (node.type == Node.TextNode) + { + boolean iswhitenode = false; + + if (node.type == Node.TextNode && + node.end <= node.start + 1 && + lexer.lexbuf[node.start] == (byte)' ') + iswhitenode = true; + + if (lexer.configuration.EncloseBlockText && !iswhitenode) + { + lexer.ungetToken(); + node = lexer.inferredTag("p"); + Node.insertNodeAtEnd(element, node); + parseTag(lexer, node, Lexer.MixedContent); + continue; + } + + if (checkstack) + { + checkstack = false; + + if (!((element.tag.model & Dict.CM_MIXED) != 0)) + { + if (lexer.inlineDup( node) > 0) + continue; + } + } + + Node.insertNodeAtEnd(element, node); + mode = Lexer.MixedContent; + /* + HTML4 strict doesn't allow mixed content for + elements with %block; as their content model + */ + lexer.versions &= ~Dict.VERS_HTML40_STRICT; + continue; + } + + if (Node.insertMisc(element, node)) + continue; + + /* allow PARAM elements? */ + if (node.tag == tt.tagParam) + { + if (((element.tag.model & Dict.CM_PARAM) != 0) && + (node.type == Node.StartTag || node.type == Node.StartEndTag)) + { + Node.insertNodeAtEnd(element, node); + continue; + } + + /* otherwise discard it */ + Report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED); + continue; + } + + /* allow AREA elements? */ + if (node.tag == tt.tagArea) + { + if ((element.tag == tt.tagMap) && + (node.type == Node.StartTag || node.type == Node.StartEndTag)) + { + Node.insertNodeAtEnd(element, node); + continue; + } + + /* otherwise discard it */ + Report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED); + continue; + } + + /* ignore unknown start/end tags */ + if (node.tag == null) + { + Report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED); + continue; + } + + /* + Allow Dict.CM_INLINE elements here. + + Allow Dict.CM_BLOCK elements here unless + lexer.excludeBlocks is yes. + + LI and DD are special cased. + + Otherwise infer end tag for this element. + */ + + if (!((node.tag.model & Dict.CM_INLINE) != 0)) + { + if (node.type != Node.StartTag && node.type != Node.StartEndTag) + { + Report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED); + continue; + } + + if (element.tag == tt.tagTd || element.tag == tt.tagTh) + { + /* if parent is a table cell, avoid inferring the end of the cell */ + + if ((node.tag.model & Dict.CM_HEAD) != 0) + { + moveToHead(lexer, element, node); + continue; + } + + if ((node.tag.model & Dict.CM_LIST) != 0) + { + lexer.ungetToken(); + node = lexer.inferredTag( "ul"); + Node.addClass(node, "noindent"); + lexer.excludeBlocks = true; + } + else if ((node.tag.model & Dict.CM_DEFLIST) != 0) + { + lexer.ungetToken(); + node = lexer.inferredTag( "dl"); + lexer.excludeBlocks = true; + } + + /* infer end of current table cell */ + if (!((node.tag.model & Dict.CM_BLOCK) != 0)) + { + lexer.ungetToken(); + Node.trimSpaces(lexer, element); + Node.trimEmptyElement(lexer, element); + return; + } + } + else if ((node.tag.model & Dict.CM_BLOCK) != 0) + { + if (lexer.excludeBlocks) + { + if (!((element.tag.model & Dict.CM_OPT) != 0)) + Report.warning(lexer, element, node, Report.MISSING_ENDTAG_BEFORE); + + lexer.ungetToken(); + + if ((element.tag.model & Dict.CM_OBJECT) != 0) + lexer.istackbase = istackbase; + + Node.trimSpaces(lexer, element); + Node.trimEmptyElement(lexer, element); + return; + } + } + else /* things like list items */ + { + if (!((element.tag.model & Dict.CM_OPT) != 0) && !element.implicit) + Report.warning(lexer, element, node, Report.MISSING_ENDTAG_BEFORE); + + if ((node.tag.model & Dict.CM_HEAD) != 0) + { + moveToHead(lexer, element, node); + continue; + } + + lexer.ungetToken(); + + if ((node.tag.model & Dict.CM_LIST) != 0) + { + if (element.parent != null && element.parent.tag != null && + element.parent.tag.parser == getParseList()) + { + Node.trimSpaces(lexer, element); + Node.trimEmptyElement(lexer, element); + return; + } + + node = lexer.inferredTag("ul"); + Node.addClass(node, "noindent"); + } + else if ((node.tag.model & Dict.CM_DEFLIST) != 0) + { + if (element.parent.tag == tt.tagDl) + { + Node.trimSpaces(lexer, element); + Node.trimEmptyElement(lexer, element); + return; + } + + node = lexer.inferredTag("dl"); + } + else if ((node.tag.model & Dict.CM_TABLE) != 0 || + (node.tag.model & Dict.CM_ROW) != 0) + { + node = lexer.inferredTag("table"); + } + else if ((element.tag.model & Dict.CM_OBJECT) != 0) + { + /* pop inline stack */ + while (lexer.istack.size() > lexer.istackbase) + lexer.popInline( null); + lexer.istackbase = istackbase; + Node.trimSpaces(lexer, element); + Node.trimEmptyElement(lexer, element); + return; + + } + else + { + Node.trimSpaces(lexer, element); + Node.trimEmptyElement(lexer, element); + return; + } + } + } + + /* parse known element */ + if (node.type == Node.StartTag || node.type == Node.StartEndTag) + { + if ((node.tag.model & Dict.CM_INLINE) != 0) + { + if (checkstack && !node.implicit) + { + checkstack = false; + + if (lexer.inlineDup( node) > 0) + continue; + } + + mode = Lexer.MixedContent; + } + else + { + checkstack = true; + mode = Lexer.IgnoreWhitespace; + } + + /* trim white space before <br> */ + if (node.tag == tt.tagBr) + Node.trimSpaces(lexer, element); + + Node.insertNodeAtEnd(element, node); + + if (node.implicit) + Report.warning(lexer, element, node, Report.INSERTING_TAG); + + parseTag(lexer, node, Lexer.IgnoreWhitespace /*Lexer.MixedContent*/); + continue; + } + + /* discard unexpected tags */ + if (node.type == Node.EndTag) + lexer.popInline( node); /* if inline end tag */ + + Report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED); + } + + if (!((element.tag.model & Dict.CM_OPT) != 0)) + Report.warning(lexer, element, node, Report.MISSING_ENDTAG_FOR); + + if ((element.tag.model & Dict.CM_OBJECT) != 0) + { + /* pop inline stack */ + while (lexer.istack.size() > lexer.istackbase) + lexer.popInline( null); + lexer.istackbase = istackbase; + } + + Node.trimSpaces(lexer, element); + Node.trimEmptyElement(lexer, element); + } + + }; + + public static class ParseTableTag implements Parser { + + public void parse( Lexer lexer, Node table, short mode ) + { + Node node, parent; + int istackbase; + TagTable tt = lexer.configuration.tt; + + lexer.deferDup(); + istackbase = lexer.istackbase; + lexer.istackbase = lexer.istack.size(); + + while (true) + { + node = lexer.getToken(Lexer.IgnoreWhitespace); + if (node == null) break; + if (node.tag == table.tag && node.type == Node.EndTag) + { + lexer.istackbase = istackbase; + table.closed = true; + Node.trimEmptyElement(lexer, table); + return; + } + + /* deal with comments etc. */ + if (Node.insertMisc(table, node)) + continue; + + /* discard unknown tags */ + if (node.tag == null && node.type != Node.TextNode) + { + Report.warning(lexer, table, node, Report.DISCARDING_UNEXPECTED); + continue; + } + + /* if TD or TH or text or inline or block then infer <TR> */ + + if (node.type != Node.EndTag) + { + if (node.tag == tt.tagTd || + node.tag == tt.tagTh || + node.tag == tt.tagTable) + { + lexer.ungetToken(); + node = lexer.inferredTag( "tr"); + Report.warning(lexer, table, node, Report.MISSING_STARTTAG); + } + else if (node.type == Node.TextNode + || (node.tag.model & (Dict.CM_BLOCK | Dict.CM_INLINE)) != 0) + { + Node.insertNodeBeforeElement(table, node); + Report.warning(lexer, table, node, Report.TAG_NOT_ALLOWED_IN); + lexer.exiled = true; + + /* AQ: TODO + Line 2040 of parser.c (13 Jan 2000) reads as follows: + if (!node->type == TextNode) + This will always evaluate to false. + This has been reported to Dave Raggett <dsr@w3.org> + */ + //Should be?: if (!(node.type == Node.TextNode)) + if (false) + parseTag(lexer, node, Lexer.IgnoreWhitespace); + + lexer.exiled = false; + continue; + } + else if ((node.tag.model & Dict.CM_HEAD) != 0) + { + moveToHead(lexer, table, node); + continue; + } + } + + /* + if this is the end tag for an ancestor element + then infer end tag for this element + */ + if (node.type == Node.EndTag) + { + if (node.tag == tt.tagForm) + { + lexer.badForm = 1; + Report.warning(lexer, table, node, Report.DISCARDING_UNEXPECTED); + continue; + } + + if (node.tag != null && (node.tag.model & (Dict.CM_TABLE|Dict.CM_ROW)) != 0) + { + Report.warning(lexer, table, node, Report.DISCARDING_UNEXPECTED); + continue; + } + + for (parent = table.parent; + parent != null; parent = parent.parent) + { + if (node.tag == parent.tag) + { + Report.warning(lexer, table, node, Report.MISSING_ENDTAG_BEFORE); + lexer.ungetToken(); + lexer.istackbase = istackbase; + Node.trimEmptyElement(lexer, table); + return; + } + } + } + + if (!((node.tag.model & Dict.CM_TABLE) != 0)) + { + lexer.ungetToken(); + Report.warning(lexer, table, node, Report.TAG_NOT_ALLOWED_IN); + lexer.istackbase = istackbase; + Node.trimEmptyElement(lexer, table); + return; + } + + if (node.type == Node.StartTag || node.type == Node.StartEndTag) + { + Node.insertNodeAtEnd(table, node);; + parseTag(lexer, node, Lexer.IgnoreWhitespace); + continue; + } + + /* discard unexpected text nodes and end tags */ + Report.warning(lexer, table, node, Report.DISCARDING_UNEXPECTED); + } + + Report.warning(lexer, table, node, Report.MISSING_ENDTAG_FOR); + Node.trimEmptyElement(lexer, table); + lexer.istackbase = istackbase; + } + + }; + + public static class ParseColGroup implements Parser { + + public void parse( Lexer lexer, Node colgroup, short mode ) + { + Node node, parent; + TagTable tt = lexer.configuration.tt; + + if ((colgroup.tag.model & Dict.CM_EMPTY) != 0) + return; + + while (true) + { + node = lexer.getToken(Lexer.IgnoreWhitespace); + if (node == null) break; + if (node.tag == colgroup.tag && node.type == Node.EndTag) + { + colgroup.closed = true; + return; + } + + /* + if this is the end tag for an ancestor element + then infer end tag for this element + */ + if (node.type == Node.EndTag) + { + if (node.tag == tt.tagForm) + { + lexer.badForm = 1; + Report.warning(lexer, colgroup, node, Report.DISCARDING_UNEXPECTED); + continue; + } + + for (parent = colgroup.parent; + parent != null; parent = parent.parent) + { + + if (node.tag == parent.tag) + { + lexer.ungetToken(); + return; + } + } + } + + if (node.type == Node.TextNode) + { + lexer.ungetToken(); + return; + } + + /* deal with comments etc. */ + if (Node.insertMisc(colgroup, node)) + continue; + + /* discard unknown tags */ + if (node.tag == null) + { + Report.warning(lexer, colgroup, node, Report.DISCARDING_UNEXPECTED); + continue; + } + + if (node.tag != tt.tagCol) + { + lexer.ungetToken(); + return; + } + + if (node.type == Node.EndTag) + { + Report.warning(lexer, colgroup, node, Report.DISCARDING_UNEXPECTED); + continue; + } + + /* node should be <COL> */ + Node.insertNodeAtEnd(colgroup, node); + parseTag(lexer, node, Lexer.IgnoreWhitespace); + } + } + + }; + + public static class ParseRowGroup implements Parser { + + public void parse( Lexer lexer, Node rowgroup, short mode ) + { + Node node, parent; + TagTable tt = lexer.configuration.tt; + + if ((rowgroup.tag.model & Dict.CM_EMPTY) != 0) + return; + + while (true) + { + node = lexer.getToken(Lexer.IgnoreWhitespace); + if (node == null) break; + if (node.tag == rowgroup.tag) + { + if (node.type == Node.EndTag) + { + rowgroup.closed = true; + Node.trimEmptyElement(lexer, rowgroup); + return; + } + + lexer.ungetToken(); + return; + } + + /* if </table> infer end tag */ + if (node.tag == tt.tagTable && node.type == Node.EndTag) + { + lexer.ungetToken(); + Node.trimEmptyElement(lexer, rowgroup); + return; + } + + /* deal with comments etc. */ + if (Node.insertMisc(rowgroup, node)) + continue; + + /* discard unknown tags */ + if (node.tag == null && node.type != Node.TextNode) + { + Report.warning(lexer, rowgroup, node, Report.DISCARDING_UNEXPECTED); + continue; + } + + /* + if TD or TH then infer <TR> + if text or inline or block move before table + if head content move to head + */ + + if (node.type != Node.EndTag) + { + if (node.tag == tt.tagTd || node.tag == tt.tagTh) + { + lexer.ungetToken(); + node = lexer.inferredTag("tr"); + Report.warning(lexer, rowgroup, node, Report.MISSING_STARTTAG); + } + else if (node.type == Node.TextNode + || (node.tag.model & (Dict.CM_BLOCK | Dict.CM_INLINE)) != 0) + { + Node.moveBeforeTable(rowgroup, node, tt); + Report.warning(lexer, rowgroup, node, Report.TAG_NOT_ALLOWED_IN); + lexer.exiled = true; + + if (node.type != Node.TextNode) + parseTag(lexer, node, Lexer.IgnoreWhitespace); + + lexer.exiled = false; + continue; + } + else if ((node.tag.model & Dict.CM_HEAD) != 0) + { + Report.warning(lexer, rowgroup, node, Report.TAG_NOT_ALLOWED_IN); + moveToHead(lexer, rowgroup, node); + continue; + } + } + + /* + if this is the end tag for ancestor element + then infer end tag for this element + */ + if (node.type == Node.EndTag) + { + if (node.tag == tt.tagForm) + { + lexer.badForm = 1; + Report.warning(lexer, rowgroup, node, Report.DISCARDING_UNEXPECTED); + continue; + } + + if (node.tag == tt.tagTr || node.tag == tt.tagTd || node.tag == tt.tagTh) + { + Report.warning(lexer, rowgroup, node, Report.DISCARDING_UNEXPECTED); + continue; + } + + for (parent = rowgroup.parent; + parent != null; parent = parent.parent) + { + if (node.tag == parent.tag) + { + lexer.ungetToken(); + Node.trimEmptyElement(lexer, rowgroup); + return; + } + } + } + + /* + if THEAD, TFOOT or TBODY then implied end tag + + */ + if ((node.tag.model & Dict.CM_ROWGRP) != 0) + { + if (node.type != Node.EndTag) + lexer.ungetToken(); + + Node.trimEmptyElement(lexer, rowgroup); + return; + } + + if (node.type == Node.EndTag) + { + Report.warning(lexer, rowgroup, node, Report.DISCARDING_UNEXPECTED); + continue; + } + + if (!(node.tag == tt.tagTr)) + { + node = lexer.inferredTag( "tr"); + Report.warning(lexer, rowgroup, node, Report.MISSING_STARTTAG); + lexer.ungetToken(); + } + + /* node should be <TR> */ + Node.insertNodeAtEnd(rowgroup, node); + parseTag(lexer, node, Lexer.IgnoreWhitespace); + } + + Node.trimEmptyElement(lexer, rowgroup); + } + + }; + + public static class ParseRow implements Parser { + + public void parse( Lexer lexer, Node row, short mode ) + { + Node node, parent; + boolean exclude_state; + TagTable tt = lexer.configuration.tt; + + if ((row.tag.model & Dict.CM_EMPTY) != 0) + return; + + while (true) + { + node = lexer.getToken(Lexer.IgnoreWhitespace); + if (node == null) break; + if (node.tag == row.tag) + { + if (node.type == Node.EndTag) + { + row.closed = true; + Node.fixEmptyRow(lexer, row); + return; + } + + lexer.ungetToken(); + Node.fixEmptyRow(lexer, row); + return; + } + + /* + if this is the end tag for an ancestor element + then infer end tag for this element + */ + if (node.type == Node.EndTag) + { + if (node.tag == tt.tagForm) + { + lexer.badForm = 1; + Report.warning(lexer, row, node, Report.DISCARDING_UNEXPECTED); + continue; + } + + if (node.tag == tt.tagTd || node.tag == tt.tagTh) + { + Report.warning(lexer, row, node, Report.DISCARDING_UNEXPECTED); + continue; + } + + for (parent = row.parent; + parent != null; parent = parent.parent) + { + if (node.tag == parent.tag) + { + lexer.ungetToken(); + Node.trimEmptyElement(lexer, row); + return; + } + } + } + + /* deal with comments etc. */ + if (Node.insertMisc(row, node)) + continue; + + /* discard unknown tags */ + if (node.tag == null && node.type != Node.TextNode) + { + Report.warning(lexer, row, node, Report.DISCARDING_UNEXPECTED); + continue; + } + + /* discard unexpected <table> element */ + if (node.tag == tt.tagTable) + { + Report.warning(lexer, row, node, Report.DISCARDING_UNEXPECTED); + continue; + } + + /* THEAD, TFOOT or TBODY */ + if (node.tag != null && (node.tag.model & Dict.CM_ROWGRP) != 0) + { + lexer.ungetToken(); + Node.trimEmptyElement(lexer, row); + return; + } + + if (node.type == Node.EndTag) + { + Report.warning(lexer, row, node, Report.DISCARDING_UNEXPECTED); + continue; + } + + /* + if text or inline or block move before table + if head content move to head + */ + + if (node.type != Node.EndTag) + { + if (node.tag == tt.tagForm) + { + lexer.ungetToken(); + node = lexer.inferredTag("td"); + Report.warning(lexer, row, node, Report.MISSING_STARTTAG); + } + else if (node.type == Node.TextNode + || (node.tag.model & (Dict.CM_BLOCK | Dict.CM_INLINE)) != 0) + { + Node.moveBeforeTable(row, node, tt); + Report.warning(lexer, row, node, Report.TAG_NOT_ALLOWED_IN); + lexer.exiled = true; + + if (node.type != Node.TextNode) + parseTag(lexer, node, Lexer.IgnoreWhitespace); + + lexer.exiled = false; + continue; + } + else if ((node.tag.model & Dict.CM_HEAD) != 0) + { + Report.warning(lexer, row, node, Report.TAG_NOT_ALLOWED_IN); + moveToHead(lexer, row, node); + continue; + } + } + + if (!(node.tag == tt.tagTd || node.tag == tt.tagTh)) + { + Report.warning(lexer, row, node, Report.TAG_NOT_ALLOWED_IN); + continue; + } + + /* node should be <TD> or <TH> */ + Node.insertNodeAtEnd(row, node); + exclude_state = lexer.excludeBlocks; + lexer.excludeBlocks = false; + parseTag(lexer, node, Lexer.IgnoreWhitespace); + lexer.excludeBlocks = exclude_state; + + /* pop inline stack */ + + while (lexer.istack.size() > lexer.istackbase) + lexer.popInline( null); + } + + Node.trimEmptyElement(lexer, row); + } + + }; + + public static class ParseNoFrames implements Parser { + + public void parse( Lexer lexer, Node noframes, short mode ) + { + Node node; + boolean checkstack; + TagTable tt = lexer.configuration.tt; + + lexer.badAccess |= Report.USING_NOFRAMES; + mode = Lexer.IgnoreWhitespace; + checkstack = true; + + while (true) + { + node = lexer.getToken(mode); + if (node == null) break; + if (node.tag == noframes.tag && node.type == Node.EndTag) + { + noframes.closed = true; + Node.trimSpaces(lexer, noframes); + return; + } + + if ((node.tag == tt.tagFrame || node.tag == tt.tagFrameset)) + { + Report.warning(lexer, noframes, node, Report.MISSING_ENDTAG_BEFORE); + Node.trimSpaces(lexer, noframes); + lexer.ungetToken(); + return; + } + + if (node.tag == tt.tagHtml) + { + if (node.type == Node.StartTag || node.type == Node.StartEndTag) + Report.warning(lexer, noframes, node, Report.DISCARDING_UNEXPECTED); + + continue; + } + + /* deal with comments etc. */ + if (Node.insertMisc(noframes, node)) + continue; + + if (node.tag == tt.tagBody && node.type == Node.StartTag) + { + Node.insertNodeAtEnd(noframes, node); + parseTag(lexer, node, Lexer.IgnoreWhitespace /*MixedContent*/); + continue; + } + + /* implicit body element inferred */ + if (node.type == Node.TextNode || node.tag != null) + { + lexer.ungetToken(); + node = lexer.inferredTag("body"); + if (lexer.configuration.XmlOut) + Report.warning(lexer, noframes, node, Report.INSERTING_TAG); + Node.insertNodeAtEnd(noframes, node); + parseTag(lexer, node, Lexer.IgnoreWhitespace /*MixedContent*/); + continue; + } + /* discard unexpected end tags */ + Report.warning(lexer, noframes, node, Report.DISCARDING_UNEXPECTED); + } + + Report.warning(lexer, noframes, node, Report.MISSING_ENDTAG_FOR); + } + + }; + + public static class ParseSelect implements Parser { + + public void parse( Lexer lexer, Node field, short mode ) + { + Node node; + TagTable tt = lexer.configuration.tt; + + lexer.insert = -1; /* defer implicit inline start tags */ + + while (true) + { + node = lexer.getToken(Lexer.IgnoreWhitespace); + if (node == null) break; + if (node.tag == field.tag && node.type == Node.EndTag) + { + field.closed = true; + Node.trimSpaces(lexer, field); + return; + } + + /* deal with comments etc. */ + if (Node.insertMisc(field, node)) + continue; + + if (node.type == Node.StartTag && + (node.tag == tt.tagOption || + node.tag == tt.tagOptgroup || + node.tag == tt.tagScript)) + { + Node.insertNodeAtEnd(field, node); + parseTag(lexer, node, Lexer.IgnoreWhitespace); + continue; + } + + /* discard unexpected tags */ + Report.warning(lexer, field, node, Report.DISCARDING_UNEXPECTED); + } + + Report.warning(lexer, field, node, Report.MISSING_ENDTAG_FOR); + } + + }; + + public static class ParseText implements Parser { + + public void parse( Lexer lexer, Node field, short mode ) + { + Node node; + TagTable tt = lexer.configuration.tt; + + lexer.insert = -1; /* defer implicit inline start tags */ + + if (field.tag == tt.tagTextarea) + mode = Lexer.Preformatted; + + while (true) + { + node = lexer.getToken(mode); + if (node == null) break; + if (node.tag == field.tag && node.type == Node.EndTag) + { + field.closed = true; + Node.trimSpaces(lexer, field); + return; + } + + /* deal with comments etc. */ + if (Node.insertMisc(field, node)) + continue; + + if (node.type == Node.TextNode) + { + /* only called for 1st child */ + if (field.content == null && !((mode & Lexer.Preformatted) != 0)) + Node.trimSpaces(lexer, field); + + if (node.start >= node.end) + { + continue; + } + + Node.insertNodeAtEnd(field, node); + continue; + } + + if (node.tag == tt.tagFont) + { + Report.warning(lexer, field, node, Report.DISCARDING_UNEXPECTED); + continue; + } + + /* terminate element on other tags */ + if (!((field.tag.model & Dict.CM_OPT) != 0)) + Report.warning(lexer, field, node, Report.MISSING_ENDTAG_BEFORE); + + lexer.ungetToken(); + Node.trimSpaces(lexer, field); + return; + } + + if (!((field.tag.model & Dict.CM_OPT) != 0)) + Report.warning(lexer, field, node, Report.MISSING_ENDTAG_FOR); + } + + }; + + public static class ParseOptGroup implements Parser { + + public void parse( Lexer lexer, Node field, short mode ) + { + Node node; + TagTable tt = lexer.configuration.tt; + + lexer.insert = -1; /* defer implicit inline start tags */ + + while (true) + { + node = lexer.getToken(Lexer.IgnoreWhitespace); + if (node == null) break; + if (node.tag == field.tag && node.type == Node.EndTag) + { + field.closed = true; + Node.trimSpaces(lexer, field); + return; + } + + /* deal with comments etc. */ + if (Node.insertMisc(field, node)) + continue; + + if (node.type == Node.StartTag && + (node.tag == tt.tagOption || node.tag == tt.tagOptgroup)) + { + if (node.tag == tt.tagOptgroup) + Report.warning(lexer, field, node, Report.CANT_BE_NESTED); + + Node.insertNodeAtEnd(field, node); + parseTag(lexer, node, Lexer.MixedContent); + continue; + } + + /* discard unexpected tags */ + Report.warning(lexer, field, node, Report.DISCARDING_UNEXPECTED); + } + } + + }; + + public static Parser getParseHTML() + { + return _parseHTML; + } + + public static Parser getParseHead() + { + return _parseHead; + } + + public static Parser getParseTitle() + { + return _parseTitle; + } + + public static Parser getParseScript() + { + return _parseScript; + } + + public static Parser getParseBody() + { + return _parseBody; + } + + public static Parser getParseFrameSet() + { + return _parseFrameSet; + } + + public static Parser getParseInline() + { + return _parseInline; + } + + public static Parser getParseList() + { + return _parseList; + } + + public static Parser getParseDefList() + { + return _parseDefList; + } + + public static Parser getParsePre() + { + return _parsePre; + } + + public static Parser getParseBlock() + { + return _parseBlock; + } + + public static Parser getParseTableTag() + { + return _parseTableTag; + } + + public static Parser getParseColGroup() + { + return _parseColGroup; + } + + public static Parser getParseRowGroup() + { + return _parseRowGroup; + } + + public static Parser getParseRow() + { + return _parseRow; + } + + public static Parser getParseNoFrames() + { + return _parseNoFrames; + } + + public static Parser getParseSelect() + { + return _parseSelect; + } + + public static Parser getParseText() + { + return _parseText; + } + + public static Parser getParseOptGroup() + { + return _parseOptGroup; + } + + + private static Parser _parseHTML = new ParseHTML(); + private static Parser _parseHead = new ParseHead(); + private static Parser _parseTitle = new ParseTitle(); + private static Parser _parseScript = new ParseScript(); + private static Parser _parseBody = new ParseBody(); + private static Parser _parseFrameSet = new ParseFrameSet(); + private static Parser _parseInline = new ParseInline(); + private static Parser _parseList = new ParseList(); + private static Parser _parseDefList = new ParseDefList(); + private static Parser _parsePre = new ParsePre(); + private static Parser _parseBlock = new ParseBlock(); + private static Parser _parseTableTag = new ParseTableTag(); + private static Parser _parseColGroup = new ParseColGroup(); + private static Parser _parseRowGroup = new ParseRowGroup(); + private static Parser _parseRow = new ParseRow(); + private static Parser _parseNoFrames = new ParseNoFrames(); + private static Parser _parseSelect = new ParseSelect(); + private static Parser _parseText = new ParseText(); + private static Parser _parseOptGroup = new ParseOptGroup(); + + /* + HTML is the top level element + */ + public static Node parseDocument(Lexer lexer) + { + Node node, document, html; + Node doctype = null; + TagTable tt = lexer.configuration.tt; + + document = lexer.newNode(); + document.type = Node.RootNode; + + while (true) + { + node = lexer.getToken(Lexer.IgnoreWhitespace); + if (node == null) break; + + /* deal with comments etc. */ + if (Node.insertMisc(document, node)) + continue; + + if (node.type == Node.DocTypeTag) + { + if (doctype == null) + { + Node.insertNodeAtEnd(document, node); + doctype = node; + } + else + Report.warning(lexer, document, node, Report.DISCARDING_UNEXPECTED); + continue; + } + + if (node.type == Node.EndTag) + { + Report.warning(lexer, document, node, Report.DISCARDING_UNEXPECTED); //TODO? + continue; + } + + if (node.type != Node.StartTag || node.tag != tt.tagHtml) + { + lexer.ungetToken(); + html = lexer.inferredTag("html"); + } + else + html = node; + + Node.insertNodeAtEnd(document, html); + getParseHTML().parse(lexer, html, (short)0); // TODO? + break; + } + + return document; + } + + /** + * Indicates whether or not whitespace should be preserved for this element. + * If an <code>xml:space</code> attribute is found, then if the attribute value is + * <code>preserve</code>, returns <code>true</code>. For any other value, returns + * <code>false</code>. If an <code>xml:space</code> attribute was <em>not</em> + * found, then the following element names result in a return value of <code>true: + * pre, script, style,</code> and <code>xsl:text</code>. Finally, if a + * <code>TagTable</code> was passed in and the element appears as the "pre" element + * in the <code>TagTable</code>, then <code>true</code> will be returned. + * Otherwise, <code>false</code> is returned. + * @param element The <code>Node</code> to test to see if whitespace should be + * preserved. + * @param tt The <code>TagTable</code> to test for the <code>getNodePre()</code> + * function. This may be <code>null</code>, in which case this test + * is bypassed. + * @return <code>true</code> or <code>false</code>, as explained above. + */ + + public static boolean XMLPreserveWhiteSpace(Node element, TagTable tt) + { + AttVal attribute; + + /* search attributes for xml:space */ + for (attribute = element.attributes; attribute != null; attribute = attribute.next) + { + if (attribute.attribute.equals("xml:space")) + { + if (attribute.value.equals("preserve")) + return true; + + return false; + } + } + + /* kludge for html docs without explicit xml:space attribute */ + if (Lexer.wstrcasecmp(element.element, "pre") == 0 + || Lexer.wstrcasecmp(element.element, "script") == 0 + || Lexer.wstrcasecmp(element.element, "style") == 0) + return true; + + if ( (tt != null) && (tt.findParser(element) == getParsePre()) ) + return true; + + /* kludge for XSL docs */ + if (Lexer.wstrcasecmp(element.element, "xsl:text") == 0) + return true; + + return false; + } + + /* + XML documents + */ + public static void parseXMLElement(Lexer lexer, Node element, short mode) + { + Node node; + + /* Jeff Young's kludge for XSL docs */ + + if (Lexer.wstrcasecmp(element.element, "xsl:text") == 0) + return; + + /* if node is pre or has xml:space="preserve" then do so */ + + if (XMLPreserveWhiteSpace(element, lexer.configuration.tt)) + mode = Lexer.Preformatted; + + while (true) + { + node = lexer.getToken(mode); + if (node == null) break; + if (node.type == Node.EndTag && node.element.equals(element.element)) + { + element.closed = true; + break; + } + + /* discard unexpected end tags */ + if (node.type == Node.EndTag) + { + Report.error(lexer, element, node, Report.UNEXPECTED_ENDTAG); + continue; + } + + /* parse content on seeing start tag */ + if (node.type == Node.StartTag) + parseXMLElement(lexer, node, mode); + + Node.insertNodeAtEnd(element, node); + } + + /* + if first child is text then trim initial space and + delete text node if it is empty. + */ + + node = element.content; + + if (node != null && node.type == Node.TextNode && mode != Lexer.Preformatted) + { + if (node.textarray[node.start] == (byte)' ') + { + node.start++; + + if (node.start >= node.end) + Node.discardElement(node); + } + } + + /* + if last child is text then trim final space and + delete the text node if it is empty + */ + + node = element.last; + + if (node != null && node.type == Node.TextNode && mode != Lexer.Preformatted) + { + if (node.textarray[node.end - 1] == (byte)' ') + { + node.end--; + + if (node.start >= node.end) + Node.discardElement(node); + } + } + } + + public static Node parseXMLDocument(Lexer lexer) + { + Node node, document, doctype; + + document = lexer.newNode(); + document.type = Node.RootNode; + doctype = null; + lexer.configuration.XmlTags = true; + + while (true) + { + node = lexer.getToken(Lexer.IgnoreWhitespace); + if (node == null) break; + /* discard unexpected end tags */ + if (node.type == Node.EndTag) + { + Report.warning(lexer, null, node, Report.UNEXPECTED_ENDTAG); + continue; + } + + /* deal with comments etc. */ + if (Node.insertMisc(document, node)) + continue; + + if (node.type == Node.DocTypeTag) + { + if (doctype == null) + { + Node.insertNodeAtEnd(document, node); + doctype = node; + } + else + Report.warning(lexer, document, node, Report.DISCARDING_UNEXPECTED); // TODO + continue; + } + + /* if start tag then parse element's content */ + if (node.type == Node.StartTag) + { + Node.insertNodeAtEnd(document, node); + parseXMLElement(lexer, node, Lexer.IgnoreWhitespace); + } + + } + +if (false) { //#if 0 + /* discard the document type */ + node = document.findDocType(); + + if (node != null) + Node.discardElement(node); +} // #endif + + if (doctype != null && !lexer.checkDocTypeKeyWords(doctype)) + Report.warning(lexer, doctype, null, Report.DTYPE_NOT_UPPER_CASE); + + /* ensure presence of initial <?XML version="1.0"?> */ + if (lexer.configuration.XmlPi) + lexer.fixXMLPI(document); + + return document; + } + + public static boolean isJavaScript(Node node) + { + boolean result = false; + AttVal attr; + + if (node.attributes == null) + return true; + + for (attr = node.attributes; attr != null; attr = attr.next) + { + if ( (Lexer.wstrcasecmp(attr.attribute, "language") == 0 + || Lexer.wstrcasecmp(attr.attribute, "type") == 0) + && Lexer.wsubstr(attr.value, "javascript")) + result = true; + } + + return result; + } + +}