X-Git-Url: http://secure.phpeclipse.com diff --git a/net.sourceforge.phpeclipse/src/net/sourceforge/phpdt/tidy/Lexer.java b/net.sourceforge.phpeclipse/src/net/sourceforge/phpdt/tidy/Lexer.java new file mode 100644 index 0000000..f5f5548 --- /dev/null +++ b/net.sourceforge.phpeclipse/src/net/sourceforge/phpdt/tidy/Lexer.java @@ -0,0 +1,3134 @@ +/* + * @(#)Lexer.java 1.11 2000/08/16 + * + */ + +package net.sourceforge.phpdt.tidy; + +/** + * + * Lexer for html parser + * + * (c) 1998-2000 (W3C) MIT, INRIA, Keio University + * See Tidy.java for the copyright notice. + * Derived from + * HTML Tidy Release 4 Aug 2000 + * + * @author Dave Raggett + * @author Andy Quick (translation to Java) + * @version 1.0, 1999/05/22 + * @version 1.0.1, 1999/05/29 + * @version 1.1, 1999/06/18 Java Bean + * @version 1.2, 1999/07/10 Tidy Release 7 Jul 1999 + * @version 1.3, 1999/07/30 Tidy Release 26 Jul 1999 + * @version 1.4, 1999/09/04 DOM support + * @version 1.5, 1999/10/23 Tidy Release 27 Sep 1999 + * @version 1.6, 1999/11/01 Tidy Release 22 Oct 1999 + * @version 1.7, 1999/12/06 Tidy Release 30 Nov 1999 + * @version 1.8, 2000/01/22 Tidy Release 13 Jan 2000 + * @version 1.9, 2000/06/03 Tidy Release 30 Apr 2000 + * @version 1.10, 2000/07/22 Tidy Release 8 Jul 2000 + * @version 1.11, 2000/08/16 Tidy Release 4 Aug 2000 + */ + +/* + Given a file stream fp it returns a sequence of tokens. + + GetToken(fp) gets the next token + UngetToken(fp) provides one level undo + + The tags include an attribute list: + + - linked list of attribute/value nodes + - each node has 2 null-terminated strings. + - entities are replaced in attribute values + + white space is compacted if not in preformatted mode + If not in preformatted mode then leading white space + is discarded and subsequent white space sequences + compacted to single space chars. + + If XmlTags is no then Tag names are folded to upper + case and attribute names to lower case. 
+ + Not yet done: + - Doctype subset and marked sections +*/ + +import java.io.PrintWriter; +import java.util.Stack; +import java.util.Vector; + +import org.eclipse.core.resources.IFile; +import sun.security.krb5.internal.av; + +public class Lexer { + + private IFile iFile; + public StreamIn in; /* file stream */ + public PrintWriter errout; /* error output stream */ + public short badAccess; /* for accessibility errors */ + public short badLayout; /* for bad style errors */ + public short badChars; /* for bad char encodings */ + public short badForm; /* for mismatched/mispositioned form tags */ + public short warnings; /* count of warnings in this document */ + public short errors; /* count of errors */ + public int lines; /* lines seen */ + public int columns; /* at start of current token */ + public boolean waswhite; /* used to collapse contiguous white space */ + public boolean pushed; /* true after token has been pushed back */ + public boolean insertspace; /* when space is moved after end tag */ + public boolean excludeBlocks; /* Netscape compatibility */ + public boolean exiled; /* true if moved out of table */ + public boolean isvoyager; /* true if xmlns attribute on html element */ + public short versions; /* bit vector of HTML versions */ + public int doctype; /* version as given by doctype (if any) */ + public boolean badDoctype; /* e.g. if html or PUBLIC is missing */ + public int txtstart; /* start of current node */ + public int txtend; /* end of current node */ + public short state; /* state of lexer's finite state machine */ + public Node token; + + /* + lexer character buffer + + parse tree nodes span onto this buffer + which contains the concatenated text + contents of all of the elements. + + lexsize must be reset for each file. 
*/
public byte[] lexbuf;   /* byte buffer of UTF-8 chars */
public int lexlength;   /* allocated */
public int lexsize;     /* used */

/* Inline stack for compatibility with Mosaic */
public Node inode;        /* for deferring text node */
public int insert;        /* for inferring inline tags */
public Stack istack;
public int istackbase;    /* start of frame */

public Style styles;      /* used for cleaning up presentation markup */

public Configuration configuration;
protected int seenBodyEndTag; /* used by parser */
private Vector nodeList;  /* every node handed out by this lexer, see updateNodeTextArrays */

/*
  Creates a lexer reading from "in". Line/column counters start
  at 1, the state machine starts in LEX_CONTENT, the character
  buffer is left unallocated (addByte allocates it on first use)
  and no inline element is pending (insert == -1).
*/
public Lexer(IFile iFile, StreamIn in, Configuration configuration)
{
    this.iFile = iFile;
    this.in = in;
    this.lines = 1;
    this.columns = 1;
    this.state = LEX_CONTENT;
    this.badAccess = 0;
    this.badLayout = 0;
    this.badChars = 0;
    this.badForm = 0;
    this.warnings = 0;
    this.errors = 0;
    this.waswhite = false;
    this.pushed = false;
    this.insertspace = false;
    this.exiled = false;
    this.isvoyager = false;
    this.versions = Dict.VERS_EVERYTHING;
    this.doctype = Dict.VERS_UNKNOWN;
    this.badDoctype = false;
    this.txtstart = 0;
    this.txtend = 0;
    this.token = null;
    this.lexbuf = null;
    this.lexlength = 0;
    this.lexsize = 0;
    this.inode = null;
    this.insert = -1;
    this.istack = new Stack();
    this.istackbase = 0;
    this.styles = null;
    this.configuration = configuration;
    this.seenBodyEndTag = 0;
    this.nodeList = new Vector();
}

/* returns the workspace file passed to the constructor */
public IFile getIFile() {
    return iFile;
}

/*
  The newNode/cloneNode factories below record every node they
  create in nodeList so that updateNodeTextArrays can re-point
  the nodes when the lexer buffer is reallocated.
*/
public Node newNode()
{
    Node node = new Node();
    nodeList.addElement(node);
    return node;
}

public Node newNode(short type, byte[] textarray, int start, int end)
{
    Node node = new Node(type, textarray, start, end);
    nodeList.addElement(node);
    return node;
}

public Node newNode(short type, byte[] textarray, int start, int end, String element)
{
    Node node = new Node(type, textarray, start, end, element, configuration.tt);
    nodeList.addElement(node);
    return node;
}

/*
  Clones a node and registers the clone, plus any asp/php nodes
  hanging off the clone's attributes, in nodeList.
*/
public Node cloneNode(Node node)
{
    Node cnode = (Node)node.clone();
    nodeList.addElement(cnode);
    for (AttVal att = cnode.attributes; att != null; att = att.next) {
        if (att.asp != null)
            nodeList.addElement(att.asp);
        if (att.php != null)
            nodeList.addElement(att.php);
    }
    return cnode;
}

/*
  Clones an attribute list and registers the asp/php nodes of the
  copy in nodeList. An empty (null) list clones to null instead of
  failing with a NullPointerException.
*/
public AttVal cloneAttributes(AttVal attrs)
{
    if (attrs == null)
        return null;

    AttVal cattrs = (AttVal)attrs.clone();
    for (AttVal att = cattrs; att != null; att = att.next) {
        if (att.asp != null)
            nodeList.addElement(att.asp);
        if (att.php != null)
            nodeList.addElement(att.php);
    }
    return cattrs;
}

/*
  Called after the lexer buffer has been reallocated: every
  registered node that still refers to the old array (compared by
  identity) is switched to the new one.
*/
protected void updateNodeTextArrays(byte[] oldtextarray, byte[] newtextarray)
{
    Node node;
    for (int i = 0; i < nodeList.size(); i++) {
        node = (Node)(nodeList.elementAt(i));
        if (node.textarray == oldtextarray)
            node.textarray = newtextarray;
    }
}

/* used for creating preformatted text from Word2000 */
public Node newLineNode()
{
    Node node = newNode();

    node.textarray = this.lexbuf;
    node.start = this.lexsize;
    addCharToLexer((int)'\n');
    node.end = this.lexsize;
    return node;
}

// Should always be able to convert to/from UTF-8, so encoding exceptions are
// converted to an Error to avoid adding throws declarations in
// lots of methods.
+ + public static byte[] getBytes(String str) { + try { + return str.getBytes("UTF8"); + } catch (java.io.UnsupportedEncodingException e) { + throw new Error("string to UTF-8 conversion failed: " + e.getMessage()); + } + } + + public static String getString(byte[] bytes, int offset, int length) { + try { + return new String(bytes, offset, length, "UTF8"); + } catch (java.io.UnsupportedEncodingException e) { + throw new Error("UTF-8 to string conversion failed: " + e.getMessage()); + } + } + + public boolean endOfInput() + { + return this.in.isEndOfStream(); + } + + public void addByte(int c) + { + if (this.lexsize + 1 >= this.lexlength) + { + while (this.lexsize + 1 >= this.lexlength) + { + if (this.lexlength == 0) + this.lexlength = 8192; + else + this.lexlength = this.lexlength * 2; + } + + byte[] temp = this.lexbuf; + this.lexbuf = new byte[ this.lexlength ]; + if (temp != null) + { + System.arraycopy( temp, 0, this.lexbuf, 0, temp.length ); + updateNodeTextArrays(temp, this.lexbuf); + } + } + + this.lexbuf[this.lexsize++] = (byte)c; + this.lexbuf[this.lexsize] = (byte)'\0'; /* debug */ + } + + public void changeChar(byte c) + { + if (this.lexsize > 0) + { + this.lexbuf[this.lexsize-1] = c; + } + } + + /* store char c as UTF-8 encoded byte stream */ + public void addCharToLexer(int c) + { + if (c < 128) + addByte(c); + else if (c <= 0x7FF) + { + addByte(0xC0 | (c >> 6)); + addByte(0x80 | (c & 0x3F)); + } + else if (c <= 0xFFFF) + { + addByte(0xE0 | (c >> 12)); + addByte(0x80 | ((c >> 6) & 0x3F)); + addByte(0x80 | (c & 0x3F)); + } + else if (c <= 0x1FFFFF) + { + addByte(0xF0 | (c >> 18)); + addByte(0x80 | ((c >> 12) & 0x3F)); + addByte(0x80 | ((c >> 6) & 0x3F)); + addByte(0x80 | (c & 0x3F)); + } + else + { + addByte(0xF8 | (c >> 24)); + addByte(0x80 | ((c >> 18) & 0x3F)); + addByte(0x80 | ((c >> 12) & 0x3F)); + addByte(0x80 | ((c >> 6) & 0x3F)); + addByte(0x80 | (c & 0x3F)); + } + } + + public void addStringToLexer(String str) + { + for ( int i = 0; i < 
str.length(); i++ ) { + addCharToLexer( (int)str.charAt(i) ); + } + } + + /* + No longer attempts to insert missing ';' for unknown + enitities unless one was present already, since this + gives unexpected results. + + For example: + was tidied to: + rather than: + + My thanks for Maurice Buxton for spotting this. + */ + public void parseEntity(short mode) + { + short map; + int start; + boolean first = true; + boolean semicolon = false; + boolean numeric = false; + int c, ch, startcol; + String str; + + start = this.lexsize - 1; /* to start at "&" */ + startcol = this.in.curcol - 1; + + while (true) + { + c = this.in.readChar(); + if (c == StreamIn.EndOfStream) break; + if (c == ';') + { + semicolon = true; + break; + } + + if (first && c == '#') + { + addCharToLexer(c); + first = false; + numeric = true; + continue; + } + + first = false; + map = MAP((char)c); + + /* AQ: Added flag for numeric entities so that numeric entities + with missing semi-colons are recognized. + Eg. "rep..." is recognized as "rep" + */ + if (numeric && ((c == 'x') || ((map & DIGIT) != 0))) + { + addCharToLexer(c); + continue; + } + if (!numeric && ((map & NAMECHAR) != 0)) + { + addCharToLexer(c); + continue; + } + + /* otherwise put it back */ + + this.in.ungetChar(c); + break; + } + + str = getString( this.lexbuf, start, this.lexsize - start ); + ch = EntityTable.getDefaultEntityTable().entityCode( str ); + + /* deal with unrecognized entities */ + if (ch <= 0) + { + /* set error position just before offending chararcter */ + this.lines = this.in.curline; + this.columns = startcol; + + if (this.lexsize > start +1 ) + { + Report.entityError(this, Report.UNKNOWN_ENTITY, str, ch); + + if (semicolon) + addCharToLexer(';'); + } + else /* naked & */ + { + Report.entityError(this, Report.UNESCAPED_AMPERSAND, str, ch); + } + } + else + { + if (c != ';') /* issue warning if not terminated by ';' */ + { + /* set error position just before offending chararcter */ + this.lines = this.in.curline; + 
this.columns = startcol; + Report.entityError(this, Report.MISSING_SEMICOLON, str, c); + } + + this.lexsize = start; + + if (ch == 160 && (mode & Preformatted) != 0) + ch = ' '; + + addCharToLexer(ch); + + if (ch == '&' && !this.configuration.QuoteAmpersand) + { + addCharToLexer('a'); + addCharToLexer('m'); + addCharToLexer('p'); + addCharToLexer(';'); + } + } + } + + public char parseTagName() + { + short map; + int c; + + /* fold case of first char in buffer */ + + c = this.lexbuf[this.txtstart]; + map = MAP((char)c); + + if (!this.configuration.XmlTags && (map & UPPERCASE) != 0) + { + c += (int)((int)'a' - (int)'A'); + this.lexbuf[this.txtstart] = (byte)c; + } + + while (true) + { + c = this.in.readChar(); + if (c == StreamIn.EndOfStream) break; + map = MAP((char)c); + + if ((map & NAMECHAR) == 0) + break; + + /* fold case of subsequent chars */ + + if (!this.configuration.XmlTags && (map & UPPERCASE) != 0) + c += (int)((int)'a' - (int)'A'); + + addCharToLexer(c); + } + + this.txtend = this.lexsize; + return (char)c; + } + + public void addStringLiteral(String str) + { + for ( int i = 0; i < str.length(); i++ ) { + addCharToLexer( (int)str.charAt(i) ); + } + } + + /* choose what version to use for new doctype */ + public short HTMLVersion() + { + short versions; + + versions = this.versions; + + if ((versions & Dict.VERS_HTML20) != 0) + return Dict.VERS_HTML20; + + if ((versions & Dict.VERS_HTML32) != 0) + return Dict.VERS_HTML32; + + if ((versions & Dict.VERS_HTML40_STRICT) != 0) + return Dict.VERS_HTML40_STRICT; + + if ((versions & Dict.VERS_HTML40_LOOSE) != 0) + return Dict.VERS_HTML40_LOOSE; + + if ((versions & Dict.VERS_FRAMES) != 0) + return Dict.VERS_FRAMES; + + return Dict.VERS_UNKNOWN; + } + + public String HTMLVersionName() + { + short guessed; + int j; + + guessed = apparentVersion(); + + for (j = 0; j < W3CVersion.length; ++j) + { + if (guessed == W3CVersion[j].code) + { + if (this.isvoyager) + return W3CVersion[j].voyagerName; + + return 
W3CVersion[j].name; + } + } + + return null; + } + + /* add meta element for Tidy */ + public boolean addGenerator(Node root) + { + AttVal attval; + Node node; + Node head = root.findHEAD(configuration.tt); + + if (head != null) + { + for (node = head.content; node != null; node = node.next) + { + if (node.tag == configuration.tt.tagMeta) + { + attval = node.getAttrByName("name"); + + if (attval != null && attval.value != null && + Lexer.wstrcasecmp(attval.value, "generator") == 0) + { + attval = node.getAttrByName("content"); + + if (attval != null && attval.value != null && + attval.value.length() >= 9 && + Lexer.wstrcasecmp(attval.value.substring(0, 9), "HTML Tidy") == 0) + { + return false; + } + } + } + } + + node = this.inferredTag("meta"); + node.addAttribute("content", "HTML Tidy, see www.w3.org"); + node.addAttribute("name", "generator"); + Node.insertNodeAtStart(head, node); + return true; + } + + return false; + } + + /* return true if substring s is in p and isn't all in upper case */ + /* this is used to check the case of SYSTEM, PUBLIC, DTD and EN */ + /* len is how many chars to check in p */ + private static boolean findBadSubString(String s, String p, int len) + { + int n = s.length(); + int i = 0; + String ps; + + while (n < len) + { + ps = p.substring(i, i + n); + if (wstrcasecmp(s, ps) == 0) + return (!ps.equals(s.substring(0, n))); + + ++i; + --len; + } + + return false; + } + + public boolean checkDocTypeKeyWords(Node doctype) + { + int len = doctype.end - doctype.start; + String s = getString(this.lexbuf, doctype.start, len); + + return !( + findBadSubString("SYSTEM", s, len) || + findBadSubString("PUBLIC", s, len) || + findBadSubString("//DTD", s, len) || + findBadSubString("//W3C", s, len) || + findBadSubString("//EN", s, len) + ); + } + + /* examine to identify version */ + public short findGivenVersion(Node doctype) + { + String p, s; + int i, j; + int len; + String str1; + String str2; + + /* if root tag for doctype isn't html give up 
now */ + str1 = getString(this.lexbuf, doctype.start, 5); + if (wstrcasecmp(str1, "html ") != 0) + return 0; + + if (!checkDocTypeKeyWords(doctype)) + Report.warning(this, doctype, null, Report.DTYPE_NOT_UPPER_CASE); + + /* give up if all we are given is the system id for the doctype */ + str1 = getString(this.lexbuf, doctype.start + 5, 7); + if (wstrcasecmp(str1, "SYSTEM ") == 0) + { + /* but at least ensure the case is correct */ + if (!str1.substring(0, 6).equals("SYSTEM")) + System.arraycopy( getBytes("SYSTEM"), 0, + this.lexbuf, doctype.start + 5, 6 ); + return 0; /* unrecognized */ + } + + if (wstrcasecmp(str1, "PUBLIC ") == 0) + { + if (!str1.substring(0, 6).equals("PUBLIC")) + System.arraycopy( getBytes("PUBLIC "), 0, + this.lexbuf, doctype.start + 5, 6 ); + } + else + this.badDoctype = true; + + for (i = doctype.start; i < doctype.end; ++i) + { + if (this.lexbuf[i] == (byte)'"') + { + str1 = getString( this.lexbuf, i + 1, 12 ); + str2 = getString( this.lexbuf, i + 1, 13 ); + if (str1.equals("-//W3C//DTD ")) + { + /* compute length of identifier e.g. "HTML 4.0 Transitional" */ + for (j = i + 13; j < doctype.end && this.lexbuf[j] != (byte)'/'; ++j); + len = j - i - 13; + p = getString( this.lexbuf, i + 13, len ); + + for (j = 1; j < W3CVersion.length; ++j) + { + s = W3CVersion[j].name; + if (len == s.length() && s.equals(p)) + return W3CVersion[j].code; + } + + /* else unrecognized version */ + } + else if (str2.equals("-//IETF//DTD ")) + { + /* compute length of identifier e.g. 
"HTML 2.0" */ + for (j = i + 14; j < doctype.end && this.lexbuf[j] != (byte)'/'; ++j); + len = j - i - 14; + + p = getString( this.lexbuf, i + 14, len ); + s = W3CVersion[0].name; + if (len == s.length() && s.equals(p)) + return W3CVersion[0].code; + + /* else unrecognized version */ + } + break; + } + } + + return 0; + } + + public void fixHTMLNameSpace(Node root, String profile) + { + Node node; + AttVal prev, attr; + + for (node = root.content; + node != null && node.tag != configuration.tt.tagHtml; node = node.next); + + if (node != null) + { + prev = null; + + for (attr = node.attributes; attr != null; attr = attr.next) + { + if (attr.attribute.equals("xmlns")) + break; + + prev = attr; + } + + if (attr != null) + { + if (!attr.value.equals(profile)) + { + Report.warning(this, node, null, Report.INCONSISTENT_NAMESPACE); + attr.value = profile; + } + } + else + { + attr = new AttVal( node.attributes, null, (int)'"', + "xmlns", profile ); + attr.dict = + AttributeTable.getDefaultAttributeTable().findAttribute( attr ); + node.attributes = attr; + } + } + } + + public boolean setXHTMLDocType(Node root) + { + String fpi = " "; + String sysid = ""; + String namespace = XHTML_NAMESPACE; + Node doctype; + + doctype = root.findDocType(); + + if (configuration.docTypeMode == Configuration.DOCTYPE_OMIT) + { + if (doctype != null) + Node.discardElement(doctype); + return true; + } + + if (configuration.docTypeMode == Configuration.DOCTYPE_AUTO) + { + /* see what flavor of XHTML this document matches */ + if ((this.versions & Dict.VERS_HTML40_STRICT) != 0) + { /* use XHTML strict */ + fpi = "-//W3C//DTD XHTML 1.0 Strict//EN"; + sysid = voyager_strict; + } + else if ((this.versions & Dict.VERS_LOOSE) != 0) + { + fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN"; + sysid = voyager_loose; + } + else if ((this.versions & Dict.VERS_FRAMES) != 0) + { /* use XHTML frames */ + fpi = "-//W3C//DTD XHTML 1.0 Frameset//EN"; + sysid = voyager_frameset; + } + else /* lets assume XHTML 
transitional */ + { + fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN"; + sysid = voyager_loose; + } + } + else if (configuration.docTypeMode == Configuration.DOCTYPE_STRICT) + { + fpi = "-//W3C//DTD XHTML 1.0 Strict//EN"; + sysid = voyager_strict; + } + else if (configuration.docTypeMode == Configuration.DOCTYPE_LOOSE) + { + fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN"; + sysid = voyager_loose; + } + + fixHTMLNameSpace(root, namespace); + + if (doctype == null) + { + doctype = newNode(Node.DocTypeTag, this.lexbuf, 0, 0); + doctype.next = root.content; + doctype.parent = root; + doctype.prev = null; + root.content = doctype; + } + + if (configuration.docTypeMode == Configuration.DOCTYPE_USER && + configuration.docTypeStr != null) + { + fpi = configuration.docTypeStr; + sysid = ""; + } + + this.txtstart = this.lexsize; + this.txtend = this.lexsize; + + /* add public identifier */ + addStringLiteral("html PUBLIC "); + + /* check if the fpi is quoted or not */ + if (fpi.charAt(0) == '"') + addStringLiteral(fpi); + else + { + addStringLiteral("\""); + addStringLiteral(fpi); + addStringLiteral("\""); + } + + if (sysid.length() + 6 >= this.configuration.wraplen) + addStringLiteral("\n\""); + else + addStringLiteral("\n \""); + + /* add system identifier */ + addStringLiteral(sysid); + addStringLiteral("\""); + + this.txtend = this.lexsize; + + doctype.start = this.txtstart; + doctype.end = this.txtend; + + return false; + } + + public short apparentVersion() + { + switch (this.doctype) + { + case Dict.VERS_UNKNOWN: + return HTMLVersion(); + + case Dict.VERS_HTML20: + if ((this.versions & Dict.VERS_HTML20) != 0) + return Dict.VERS_HTML20; + + break; + + case Dict.VERS_HTML32: + if ((this.versions & Dict.VERS_HTML32) != 0) + return Dict.VERS_HTML32; + + break; /* to replace old version by new */ + + case Dict.VERS_HTML40_STRICT: + if ((this.versions & Dict.VERS_HTML40_STRICT) != 0) + return Dict.VERS_HTML40_STRICT; + + break; + + case Dict.VERS_HTML40_LOOSE: + if 
((this.versions & Dict.VERS_HTML40_LOOSE) != 0) + return Dict.VERS_HTML40_LOOSE; + + break; /* to replace old version by new */ + + case Dict.VERS_FRAMES: + if ((this.versions & Dict.VERS_FRAMES) != 0) + return Dict.VERS_FRAMES; + + break; + } + + Report.warning(this, null, null, Report.INCONSISTENT_VERSION); + return this.HTMLVersion(); + } + + /* fixup doctype if missing */ + public boolean fixDocType(Node root) + { + Node doctype; + int guessed = Dict.VERS_HTML40_STRICT, i; + + if (this.badDoctype) + Report.warning(this, null, null, Report.MALFORMED_DOCTYPE); + + if (configuration.XmlOut) + return true; + + doctype = root.findDocType(); + + if (configuration.docTypeMode == Configuration.DOCTYPE_OMIT) + { + if (doctype != null) + Node.discardElement(doctype); + return true; + } + + if (configuration.docTypeMode == Configuration.DOCTYPE_STRICT) + { + Node.discardElement(doctype); + doctype = null; + guessed = Dict.VERS_HTML40_STRICT; + } + else if (configuration.docTypeMode == Configuration.DOCTYPE_LOOSE) + { + Node.discardElement(doctype); + doctype = null; + guessed = Dict.VERS_HTML40_LOOSE; + } + else if (configuration.docTypeMode == Configuration.DOCTYPE_AUTO) + { + if (doctype != null) + { + if (this.doctype == Dict.VERS_UNKNOWN) + return false; + + switch (this.doctype) + { + case Dict.VERS_UNKNOWN: + return false; + + case Dict.VERS_HTML20: + if ((this.versions & Dict.VERS_HTML20) != 0) + return true; + + break; /* to replace old version by new */ + + case Dict.VERS_HTML32: + if ((this.versions & Dict.VERS_HTML32) != 0) + return true; + + break; /* to replace old version by new */ + + case Dict.VERS_HTML40_STRICT: + if ((this.versions & Dict.VERS_HTML40_STRICT) != 0) + return true; + + break; /* to replace old version by new */ + + case Dict.VERS_HTML40_LOOSE: + if ((this.versions & Dict.VERS_HTML40_LOOSE) != 0) + return true; + + break; /* to replace old version by new */ + + case Dict.VERS_FRAMES: + if ((this.versions & Dict.VERS_FRAMES) != 0) + return 
true; + + break; /* to replace old version by new */ + } + + /* INCONSISTENT_VERSION warning is now issued by ApparentVersion() */ + } + + /* choose new doctype */ + guessed = HTMLVersion(); + } + + if (guessed == Dict.VERS_UNKNOWN) + return false; + + /* for XML use the Voyager system identifier */ + if (this.configuration.XmlOut || this.configuration.XmlTags || this.isvoyager) + { + if (doctype != null) + Node.discardElement(doctype); + + for (i = 0; i < W3CVersion.length; ++i) + { + if (guessed == W3CVersion[i].code) + { + fixHTMLNameSpace(root, W3CVersion[i].profile); + break; + } + } + + return true; + } + + if (doctype == null) + { + doctype = newNode(Node.DocTypeTag, this.lexbuf, 0, 0); + doctype.next = root.content; + doctype.parent = root; + doctype.prev = null; + root.content = doctype; + } + + this.txtstart = this.lexsize; + this.txtend = this.lexsize; + + /* use the appropriate public identifier */ + addStringLiteral("html PUBLIC "); + + if (configuration.docTypeMode == Configuration.DOCTYPE_USER && + configuration.docTypeStr != null) + addStringLiteral(configuration.docTypeStr); + else if (guessed == Dict.VERS_HTML20) + addStringLiteral("\"-//IETF//DTD HTML 2.0//EN\""); + else + { + addStringLiteral("\"-//W3C//DTD "); + + for (i = 0; i < W3CVersion.length; ++i) + { + if (guessed == W3CVersion[i].code) + { + addStringLiteral(W3CVersion[i].name); + break; + } + } + + addStringLiteral("//EN\""); + } + + this.txtend = this.lexsize; + + doctype.start = this.txtstart; + doctype.end = this.txtend; + + return true; + } + + /* ensure XML document starts with */ + public boolean fixXMLPI(Node root) + { + Node xml; + int s; + + if( root.content != null && root.content.type == Node.ProcInsTag) + { + s = root.content.start; + + if (this.lexbuf[s] == (byte)'x' && + this.lexbuf[s+1] == (byte)'m' && + this.lexbuf[s+2] == (byte)'l') + return true; + } + + xml = newNode(Node.ProcInsTag, this.lexbuf, 0, 0); + xml.next = root.content; + + if (root.content != null) + { + 
root.content.prev = xml; + xml.next = root.content; + } + + root.content = xml; + + this.txtstart = this.lexsize; + this.txtend = this.lexsize; + addStringLiteral("xml version=\"1.0\""); + if (this.configuration.CharEncoding == Configuration.LATIN1) + addStringLiteral(" encoding=\"ISO-8859-1\""); + this.txtend = this.lexsize; + + xml.start = this.txtstart; + xml.end = this.txtend; + return false; + } + + public Node inferredTag(String name) + { + Node node; + + node = newNode(Node.StartTag, + this.lexbuf, + this.txtstart, + this.txtend, + name); + node.implicit = true; + return node; + } + + public static boolean expectsContent(Node node) + { + if (node.type != Node.StartTag) + return false; + + /* unknown element? */ + if (node.tag == null) + return true; + + if ((node.tag.model & Dict.CM_EMPTY) != 0) + return false; + + return true; + } + + /* + create a text node for the contents of + a CDATA element like style or script + which ends with for some foo. + */ + public Node getCDATA(Node container) + { + int c, lastc, start, len, i; + String str; + boolean endtag = false; + + this.lines = this.in.curline; + this.columns = this.in.curcol; + this.waswhite = false; + this.txtstart = this.lexsize; + this.txtend = this.lexsize; + + lastc = (int)'\0'; + start = -1; + + while (true) + { + c = this.in.readChar(); + if (c == StreamIn.EndOfStream) break; + /* treat \r\n as \n and \r as \n */ + + if (c == (int)'/' && lastc == (int)'<') + { + if (endtag) + { + this.lines = this.in.curline; + this.columns = this.in.curcol - 3; + + Report.warning(this, null, null, Report.BAD_CDATA_CONTENT); + } + + start = this.lexsize + 1; /* to first letter */ + endtag = true; + } + else if (c == (int)'>' && start >= 0) + { + len = this.lexsize - start; + if (len == container.element.length()) + { + str = getString( this.lexbuf, start, len ); + if (Lexer.wstrcasecmp(str, container.element) == 0) + { + this.txtend = start - 2; + break; + } + } + + this.lines = this.in.curline; + this.columns = 
this.in.curcol - 3; + + Report.warning(this, null, null, Report.BAD_CDATA_CONTENT); + + /* if javascript insert backslash before / */ + + if (ParserImpl.isJavaScript(container)) + { + for (i = this.lexsize; i > start-1; --i) + this.lexbuf[i] = this.lexbuf[i-1]; + + this.lexbuf[start-1] = (byte)'\\'; + this.lexsize++; + } + + start = -1; + } + else if (c == (int)'\r') + { + c = this.in.readChar(); + + if (c != (int)'\n') + this.in.ungetChar(c); + + c = (int)'\n'; + } + + addCharToLexer((int)c); + this.txtend = this.lexsize; + lastc = c; + } + + if (c == StreamIn.EndOfStream) + Report.warning(this, container, null, Report.MISSING_ENDTAG_FOR); + + if (this.txtend > this.txtstart) + { + this.token = newNode(Node.TextNode, + this.lexbuf, + this.txtstart, + this.txtend); + return this.token; + } + + return null; + } + + public void ungetToken() + { + this.pushed = true; + } + + public static final short IgnoreWhitespace = 0; + public static final short MixedContent = 1; + public static final short Preformatted = 2; + public static final short IgnoreMarkup = 3; + + /* + modes for GetToken() + + MixedContent -- for elements which don't accept PCDATA + Preformatted -- white space preserved as is + IgnoreMarkup -- for CDATA elements such as script, style + */ + + public Node getToken(short mode) + { + short map; + int c = 0; + int lastc; + int badcomment = 0; + MutableBoolean isempty = new MutableBoolean(); + AttVal attributes; + + if (this.pushed) + { + /* duplicate inlines in preference to pushed text nodes when appropriate */ + if (this.token.type != Node.TextNode || + (this.insert == -1 && this.inode == null)) + { + this.pushed = false; + return this.token; + } + } + + /* at start of block elements, unclosed inline + elements are inserted into the token stream */ + + if (this.insert != -1 || this.inode != null) + return insertedToken(); + + this.lines = this.in.curline; + this.columns = this.in.curcol; + this.waswhite = false; + + this.txtstart = this.lexsize; + 
this.txtend = this.lexsize; + + while (true) + { + c = this.in.readChar(); + if (c == StreamIn.EndOfStream) break; + if (this.insertspace && mode != IgnoreWhitespace) + { + addCharToLexer(' '); + this.waswhite = true; + this.insertspace = false; + } + + /* treat \r\n as \n and \r as \n */ + + if (c == '\r') + { + c = this.in.readChar(); + + if (c != '\n') + this.in.ungetChar(c); + + c = '\n'; + } + + addCharToLexer(c); + + switch (this.state) + { + case LEX_CONTENT: /* element content */ + map = MAP((char)c); + + /* + Discard white space if appropriate. Its cheaper + to do this here rather than in parser methods + for elements that don't have mixed content. + */ + if (((map & WHITE) != 0) && (mode == IgnoreWhitespace) + && this.lexsize == this.txtstart + 1) + { + --this.lexsize; + this.waswhite = false; + this.lines = this.in.curline; + this.columns = this.in.curcol; + continue; + } + + if (c == '<') + { + this.state = LEX_GT; + continue; + } + + if ((map & WHITE) != 0) + { + /* was previous char white? 
*/ + if (this.waswhite) + { + if (mode != Preformatted && mode != IgnoreMarkup) + { + --this.lexsize; + this.lines = this.in.curline; + this.columns = this.in.curcol; + } + } + else /* prev char wasn't white */ + { + this.waswhite = true; + lastc = c; + + if (mode != Preformatted && mode != IgnoreMarkup && c != ' ') + changeChar((byte)' '); + } + + continue; + } + else if (c == '&' && mode != IgnoreMarkup) + parseEntity(mode); + + /* this is needed to avoid trimming trailing whitespace */ + if (mode == IgnoreWhitespace) + mode = MixedContent; + + this.waswhite = false; + continue; + + case LEX_GT: /* < */ + + /* check for endtag */ + if (c == '/') + { + c = this.in.readChar(); + if (c == StreamIn.EndOfStream) + { + this.in.ungetChar(c); + continue; + } + + addCharToLexer(c); + map = MAP((char)c); + + if ((map & LETTER) != 0) + { + this.lexsize -= 3; + this.txtend = this.lexsize; + this.in.ungetChar(c); + this.state = LEX_ENDTAG; + this.lexbuf[this.lexsize] = (byte)'\0'; /* debug */ + this.in.curcol -= 2; + + /* if some text before the this.txtstart) + { + /* trim space char before end tag */ + if (mode == IgnoreWhitespace && this.lexbuf[this.lexsize - 1] == (byte)' ') + { + this.lexsize -= 1; + this.txtend = this.lexsize; + } + + this.token = newNode(Node.TextNode, + this.lexbuf, + this.txtstart, + this.txtend); + return this.token; + } + + continue; /* no text so keep going */ + } + + /* otherwise treat as CDATA */ + this.waswhite = false; + this.state = LEX_CONTENT; + continue; + } + + if (mode == IgnoreMarkup) + { + /* otherwise treat as CDATA */ + this.waswhite = false; + this.state = LEX_CONTENT; + continue; + } + + /* + look out for comments, doctype or marked sections + this isn't quite right, but its getting there ... 
+ */ + if (c == '!') + { + c = this.in.readChar(); + + if (c == '-') + { + c = this.in.readChar(); + + if (c == '-') + { + this.state = LEX_COMMENT; /* comment */ + this.lexsize -= 2; + this.txtend = this.lexsize; + + /* if some text before < return it now */ + if (this.txtend > this.txtstart) + { + this.token = newNode(Node.TextNode, + this.lexbuf, + this.txtstart, + this.txtend); + return this.token; + } + + this.txtstart = this.lexsize; + continue; + } + + Report.warning(this, null, null, Report.MALFORMED_COMMENT); + } + else if (c == 'd' || c == 'D') + { + this.state = LEX_DOCTYPE; /* doctype */ + this.lexsize -= 2; + this.txtend = this.lexsize; + mode = IgnoreWhitespace; + + /* skip until white space or '>' */ + + for (;;) + { + c = this.in.readChar(); + + if (c == StreamIn.EndOfStream || c == '>') + { + this.in.ungetChar(c); + break; + } + + map = MAP((char)c); + + if ((map & WHITE) == 0) + continue; + + /* and skip to end of whitespace */ + + for (;;) + { + c = this.in.readChar(); + + if (c == StreamIn.EndOfStream || c == '>') + { + this.in.ungetChar(c); + break; + } + + map = MAP((char)c); + + if ((map & WHITE) != 0) + continue; + + this.in.ungetChar(c); + break; + } + + break; + } + + /* if some text before < return it now */ + if (this.txtend > this.txtstart) + { + this.token = newNode(Node.TextNode, + this.lexbuf, + this.txtstart, + this.txtend); + return this.token; + } + + this.txtstart = this.lexsize; + continue; + } + else if (c == '[') + { + /* Word 2000 embeds ... 
sequences */ + this.lexsize -= 2; + this.state = LEX_SECTION; + this.txtend = this.lexsize; + + /* if some text before < return it now */ + if (this.txtend > this.txtstart) + { + this.token = newNode(Node.TextNode, + this.lexbuf, + this.txtstart, + this.txtend); + return this.token; + } + + this.txtstart = this.lexsize; + continue; + } + + /* otherwise swallow chars up to and including next '>' */ + while (true) + { + c = this.in.readChar(); + if (c == '>') break; + if (c == -1) + { + this.in.ungetChar(c); + break; + } + } + + this.lexsize -= 2; + this.lexbuf[this.lexsize] = (byte)'\0'; + this.state = LEX_CONTENT; + continue; + } + + /* + processing instructions + */ + + if (c == '?') + { + this.lexsize -= 2; + this.state = LEX_PROCINSTR; + this.txtend = this.lexsize; + + /* if some text before < return it now */ + if (this.txtend > this.txtstart) + { + this.token = newNode(Node.TextNode, + this.lexbuf, + this.txtstart, + this.txtend); + return this.token; + } + + this.txtstart = this.lexsize; + continue; + } + + /* Microsoft ASP's e.g. <% ... server-code ... %> */ + if (c == '%') + { + this.lexsize -= 2; + this.state = LEX_ASP; + this.txtend = this.lexsize; + + /* if some text before < return it now */ + if (this.txtend > this.txtstart) + { + this.token = newNode(Node.TextNode, + this.lexbuf, + this.txtstart, + this.txtend); + return this.token; + } + + this.txtstart = this.lexsize; + continue; + } + + /* Netscapes JSTE e.g. <# ... server-code ... 
#> */ + if (c == '#') + { + this.lexsize -= 2; + this.state = LEX_JSTE; + this.txtend = this.lexsize; + + /* if some text before < return it now */ + if (this.txtend > this.txtstart) + { + this.token = newNode(Node.TextNode, + this.lexbuf, + this.txtstart, + this.txtend); + return this.token; + } + + this.txtstart = this.lexsize; + continue; + } + + map = MAP((char)c); + + /* check for start tag */ + if ((map & LETTER) != 0) + { + this.in.ungetChar(c); /* push back letter */ + this.lexsize -= 2; /* discard "<" + letter */ + this.txtend = this.lexsize; + this.state = LEX_STARTTAG; /* ready to read tag name */ + + /* if some text before < return it now */ + if (this.txtend > this.txtstart) + { + this.token = newNode(Node.TextNode, + this.lexbuf, + this.txtstart, + this.txtend); + return this.token; + } + + continue; /* no text so keep going */ + } + + /* otherwise treat as CDATA */ + this.state = LEX_CONTENT; + this.waswhite = false; + continue; + + case LEX_ENDTAG: /* ' */ + while (c != '>') + { + c = this.in.readChar(); + + if (c == StreamIn.EndOfStream) + break; + } + + if (c == StreamIn.EndOfStream) + { + this.in.ungetChar(c); + continue; + } + + this.state = LEX_CONTENT; + this.waswhite = false; + return this.token; /* the endtag token */ + + case LEX_STARTTAG: /* first letter of tagname */ + this.txtstart = this.lexsize - 1; /* set txtstart to first letter */ + c = parseTagName(); + isempty.value = false; + attributes = null; + this.token = newNode((isempty.value ? 
Node.StartEndTag : Node.StartTag), + this.lexbuf, + this.txtstart, + this.txtend, + getString(this.lexbuf, + this.txtstart, + this.txtend - this.txtstart)); + + /* parse attributes, consuming closing ">" */ + if (c != '>') + { + if (c == '/') + this.in.ungetChar(c); + + attributes = parseAttrs(isempty); + } + + if (isempty.value) + this.token.type = Node.StartEndTag; + + this.token.attributes = attributes; + this.lexsize = this.txtstart; + this.txtend = this.txtstart; + + /* swallow newline following start tag */ + /* special check needed for CRLF sequence */ + /* this doesn't apply to empty elements */ + + if (expectsContent(this.token) || + this.token.tag == configuration.tt.tagBr) + { + + c = this.in.readChar(); + + if (c == '\r') + { + c = this.in.readChar(); + + if (c != '\n') + this.in.ungetChar(c); + } + else if (c != '\n' && c != '\f') + this.in.ungetChar(c); + + this.waswhite = true; /* to swallow leading whitespace */ + } + else + this.waswhite = false; + + this.state = LEX_CONTENT; + + if (this.token.tag == null) + Report.error(this, null, this.token, Report.UNKNOWN_ELEMENT); + else if (!this.configuration.XmlTags) + { + this.versions &= this.token.tag.versions; + + if ((this.token.tag.versions & Dict.VERS_PROPRIETARY) != 0) + { + if (!this.configuration.MakeClean && (this.token.tag == configuration.tt.tagNobr || + this.token.tag == configuration.tt.tagWbr)) + Report.warning(this, null, this.token, Report.PROPRIETARY_ELEMENT); + } + + if (this.token.tag.chkattrs != null) + { + this.token.checkUniqueAttributes(this); + this.token.tag.chkattrs.check(this, this.token); + } + else + this.token.checkAttributes(this); + } + + return this.token; /* return start tag */ + + case LEX_COMMENT: /* seen */ + + if (c != '-') + continue; + + c = this.in.readChar(); + addCharToLexer(c); + + if (c != '-') + continue; + + end_comment: while (true) { + c = this.in.readChar(); + + if (c == '>') + { + if (badcomment != 0) + Report.warning(this, null, null, 
Report.MALFORMED_COMMENT); + + this.txtend = this.lexsize - 2; // AQ 8Jul2000 + this.lexbuf[this.lexsize] = (byte)'\0'; + this.state = LEX_CONTENT; + this.waswhite = false; + this.token = newNode(Node.CommentTag, + this.lexbuf, + this.txtstart, + this.txtend); + + /* now look for a line break */ + + c = this.in.readChar(); + + if (c == '\r') + { + c = this.in.readChar(); + + if (c != '\n') + this.token.linebreak = true; + } + + if (c == '\n') + this.token.linebreak = true; + else + this.in.ungetChar(c); + + return this.token; + } + + /* note position of first such error in the comment */ + if (badcomment == 0) + { + this.lines = this.in.curline; + this.columns = this.in.curcol - 3; + } + + badcomment++; + if (this.configuration.FixComments) + this.lexbuf[this.lexsize - 2] = (byte)'='; + + addCharToLexer(c); + + /* if '-' then look for '>' to end the comment */ + if (c != '-') + break end_comment; + + } + /* otherwise continue to look for --> */ + this.lexbuf[this.lexsize - 2] = (byte)'='; + continue; + + case LEX_DOCTYPE: /* seen ' munging whitespace */ + map = MAP((char)c); + + if ((map & WHITE) != 0) + { + if (this.waswhite) + this.lexsize -= 1; + + this.waswhite = true; + } + else + this.waswhite = false; + + if (c != '>') + continue; + + this.lexsize -= 1; + this.txtend = this.lexsize; + this.lexbuf[this.lexsize] = (byte)'\0'; + this.state = LEX_CONTENT; + this.waswhite = false; + this.token = newNode(Node.DocTypeTag, + this.lexbuf, + this.txtstart, + this.txtend); + /* make a note of the version named by the doctype */ + this.doctype = findGivenVersion(this.token); + return this.token; + + case LEX_PROCINSTR: /* seen ' */ + /* check for PHP preprocessor instructions */ + + if (this.lexsize - this.txtstart == 3) + { + if ((getString(this.lexbuf, this.txtstart, 3)).equals("php")) + { + this.state = LEX_PHP; + continue; + } + } + + if (this.configuration.XmlPIs) /* insist on ?> as terminator */ + { + if (c != '?') + continue; + + /* now look for '>' */ + c = 
this.in.readChar(); + + if (c == StreamIn.EndOfStream) + { + Report.warning(this, null, null, Report.UNEXPECTED_END_OF_FILE); + this.in.ungetChar(c); + continue; + } + + addCharToLexer(c); + } + + if (c != '>') + continue; + + this.lexsize -= 1; + this.txtend = this.lexsize; + this.lexbuf[this.lexsize] = (byte)'\0'; + this.state = LEX_CONTENT; + this.waswhite = false; + this.token = newNode(Node.ProcInsTag, + this.lexbuf, + this.txtstart, + this.txtend); + return this.token; + + case LEX_ASP: /* seen <% so look for "%>" */ + if (c != '%') + continue; + + /* now look for '>' */ + c = this.in.readChar(); + + + if (c != '>') + { + this.in.ungetChar(c); + continue; + } + + this.lexsize -= 1; + this.txtend = this.lexsize; + this.lexbuf[this.lexsize] = (byte)'\0'; + this.state = LEX_CONTENT; + this.waswhite = false; + this.token = newNode(Node.AspTag, + this.lexbuf, + this.txtstart, + this.txtend); + return this.token; + + case LEX_JSTE: /* seen <# so look for "#>" */ + if (c != '#') + continue; + + /* now look for '>' */ + c = this.in.readChar(); + + + if (c != '>') + { + this.in.ungetChar(c); + continue; + } + + this.lexsize -= 1; + this.txtend = this.lexsize; + this.lexbuf[this.lexsize] = (byte)'\0'; + this.state = LEX_CONTENT; + this.waswhite = false; + this.token = newNode(Node.JsteTag, + this.lexbuf, + this.txtstart, + this.txtend); + return this.token; + + case LEX_PHP: /* seen "" */ + if (c != '?') + continue; + + /* now look for '>' */ + c = this.in.readChar(); + + if (c != '>') + { + this.in.ungetChar(c); + continue; + } + + this.lexsize -= 1; + this.txtend = this.lexsize; + this.lexbuf[this.lexsize] = (byte)'\0'; + this.state = LEX_CONTENT; + this.waswhite = false; + this.token = newNode(Node.PhpTag, + this.lexbuf, + this.txtstart, + this.txtend); + return this.token; + + case LEX_SECTION: /* seen "" */ + if (c == '[') + { + if (this.lexsize == (this.txtstart + 6) && + (getString(this.lexbuf, this.txtstart, 6)).equals("CDATA[")) + { + this.state = LEX_CDATA; + 
this.lexsize -= 6; + continue; + } + } + + if (c != ']') + continue; + + /* now look for '>' */ + c = this.in.readChar(); + + if (c != '>') + { + this.in.ungetChar(c); + continue; + } + + this.lexsize -= 1; + this.txtend = this.lexsize; + this.lexbuf[this.lexsize] = (byte)'\0'; + this.state = LEX_CONTENT; + this.waswhite = false; + this.token = newNode(Node.SectionTag, + this.lexbuf, + this.txtstart, + this.txtend); + return this.token; + + case LEX_CDATA: /* seen "" */ + if (c != ']') + continue; + + /* now look for ']' */ + c = this.in.readChar(); + + if (c != ']') + { + this.in.ungetChar(c); + continue; + } + + /* now look for '>' */ + c = this.in.readChar(); + + if (c != '>') + { + this.in.ungetChar(c); + continue; + } + + this.lexsize -= 1; + this.txtend = this.lexsize; + this.lexbuf[this.lexsize] = (byte)'\0'; + this.state = LEX_CONTENT; + this.waswhite = false; + this.token = newNode(Node.CDATATag, + this.lexbuf, + this.txtstart, + this.txtend); + return this.token; + } + } + + if (this.state == LEX_CONTENT) /* text string */ + { + this.txtend = this.lexsize; + + if (this.txtend > this.txtstart) + { + this.in.ungetChar(c); + + if (this.lexbuf[this.lexsize - 1] == (byte)' ') + { + this.lexsize -= 1; + this.txtend = this.lexsize; + } + + this.token = newNode(Node.TextNode, + this.lexbuf, + this.txtstart, + this.txtend); + return this.token; + } + } + else if (this.state == LEX_COMMENT) /* comment */ + { + if (c == StreamIn.EndOfStream) + Report.warning(this, null, null, Report.MALFORMED_COMMENT); + + this.txtend = this.lexsize; + this.lexbuf[this.lexsize] = (byte)'\0'; + this.state = LEX_CONTENT; + this.waswhite = false; + this.token = newNode(Node.CommentTag, + this.lexbuf, + this.txtstart, + this.txtend); + return this.token; + } + + return null; + } + + /* + parser for ASP within start tags + + Some people use ASP for to customize attributes + Tidy isn't really well suited to dealing with ASP + This is a workaround for attributes, but won't + deal with the 
case where the ASP is used to tailor + the attribute value. Here is an example of a work + around for using ASP in attribute values: + + href="<%=rsSchool.Fields("ID").Value%>" + + where the ASP that generates the attribute value + is masked from Tidy by the quotemarks. + + */ + + public Node parseAsp() + { + int c; + Node asp = null; + + this.txtstart = this.lexsize; + + for (;;) + { + c = this.in.readChar(); + addCharToLexer(c); + + + if (c != '%') + continue; + + c = this.in.readChar(); + addCharToLexer(c); + + if (c == '>') + break; + } + + this.lexsize -= 2; + this.txtend = this.lexsize; + + if (this.txtend > this.txtstart) + asp = newNode(Node.AspTag, + this.lexbuf, + this.txtstart, + this.txtend); + + this.txtstart = this.txtend; + return asp; + } + + /* + PHP is like ASP but is based upon XML + processing instructions, e.g. + */ + public Node parsePhp() + { + int c; + Node php = null; + + this.txtstart = this.lexsize; + + for (;;) + { + c = this.in.readChar(); + addCharToLexer(c); + + + if (c != '?') + continue; + + c = this.in.readChar(); + addCharToLexer(c); + + if (c == '>') + break; + } + + this.lexsize -= 2; + this.txtend = this.lexsize; + + if (this.txtend > this.txtstart) + php = newNode(Node.PhpTag, + this.lexbuf, + this.txtstart, + this.txtend); + + this.txtstart = this.txtend; + return php; + } + + /* consumes the '>' terminating start tags */ + public String parseAttribute(MutableBoolean isempty, MutableObject asp, + MutableObject php) + { + int start = 0; + // int len = 0; Removed by BUGFIX for 126265 + short map; + String attr; + int c = 0; + + asp.setObject(null); /* clear asp pointer */ + php.setObject(null); /* clear php pointer */ + /* skip white space before the attribute */ + + for (;;) + { + c = this.in.readChar(); + + if (c == '/') + { + c = this.in.readChar(); + + if (c == '>') + { + isempty.value = true; + return null; + } + + this.in.ungetChar(c); + c = '/'; + break; + } + + if (c == '>') + return null; + + if (c =='<') + { + c = 
this.in.readChar(); + + if (c == '%') + { + asp.setObject(parseAsp()); + return null; + } + else if (c == '?') + { + php.setObject(parsePhp()); + return null; + } + + this.in.ungetChar(c); + Report.attrError(this, this.token, null, Report.UNEXPECTED_GT); + return null; + } + + if (c == '"' || c == '\'') + { + Report.attrError(this, this.token, null, Report.UNEXPECTED_QUOTEMARK); + continue; + } + + if (c == StreamIn.EndOfStream) + { + Report.attrError(this, this.token, null, Report.UNEXPECTED_END_OF_FILE); + this.in.ungetChar(c); + return null; + } + + map = MAP((char)c); + + if ((map & WHITE) == 0) + break; + } + + start = this.lexsize; + + for (;;) + { + /* but push back '=' for parseValue() */ + if (c == '=' || c == '>') + { + this.in.ungetChar(c); + break; + } + + if (c == '<' || c == StreamIn.EndOfStream) + { + this.in.ungetChar(c); + break; + } + + map = MAP((char)c); + + if ((map & WHITE) != 0) + break; + + /* what should be done about non-namechar characters? */ + /* currently these are incorporated into the attr name */ + + if (!this.configuration.XmlTags && (map & UPPERCASE) != 0) + c += (int)('a' - 'A'); + + // ++len; Removed by BUGFIX for 126265 + addCharToLexer(c); + + c = this.in.readChar(); + } + + // Following line added by GLP to fix BUG 126265. This is a temporary comment + // and should be removed when Tidy is fixed. + int len = this.lexsize - start; + attr = (len > 0 ? getString(this.lexbuf, start, len) : null); + this.lexsize = start; + + return attr; + } + + /* + invoked when < is seen in place of attribute value + but terminates on whitespace if not ASP, PHP or Tango + this routine recognizes ' and " quoted strings + */ + public int parseServerInstruction() + { + int c, map, delim = '"'; + boolean isrule = false; + + c = this.in.readChar(); + addCharToLexer(c); + + /* check for ASP, PHP or Tango */ + if (c == '%' || c == '?' 
|| c == '@') + isrule = true; + + for (;;) + { + c = this.in.readChar(); + + if (c == StreamIn.EndOfStream) + break; + + if (c == '>') + { + if (isrule) + addCharToLexer(c); + else + this.in.ungetChar(c); + + break; + } + + /* if not recognized as ASP, PHP or Tango */ + /* then also finish value on whitespace */ + if (!isrule) + { + map = MAP((char)c); + + if ((map & WHITE) != 0) + break; + } + + addCharToLexer(c); + + if (c == '"') + { + do + { + c = this.in.readChar(); + addCharToLexer(c); + } + while (c != '"'); + delim = '\''; + continue; + } + + if (c == '\'') + { + do + { + c = this.in.readChar(); + addCharToLexer(c); + } + while (c != '\''); + } + } + + return delim; + } + + /* values start with "=" or " = " etc. */ + /* doesn't consume the ">" at end of start tag */ + + public String parseValue(String name, boolean foldCase, + MutableBoolean isempty, MutableInteger pdelim) + { + int len = 0; + int start; + short map; + boolean seen_gt = false; + boolean munge = true; + int c = 0; + int lastc, delim, quotewarning; + String value; + + delim = 0; + pdelim.value = (int)'"'; + + /* + Henry Zrepa reports that some folk are using the + embed element with script attributes where newlines + are significant and must be preserved + */ + if (configuration.LiteralAttribs) + munge = false; + + /* skip white space before the '=' */ + + for (;;) + { + c = this.in.readChar(); + + if (c == StreamIn.EndOfStream) + { + this.in.ungetChar(c); + break; + } + + map = MAP((char)c); + + if ((map & WHITE) == 0) + break; + } + + /* + c should be '=' if there is a value + other legal possibilities are white + space, '/' and '>' + */ + + if (c != '=') + { + this.in.ungetChar(c); + return null; + } + + /* skip white space after '=' */ + + for (;;) + { + c = this.in.readChar(); + + if (c == StreamIn.EndOfStream) + { + this.in.ungetChar(c); + break; + } + + map = MAP((char)c); + + if ((map & WHITE) == 0) + break; + } + + /* check for quote marks */ + + if (c == '"' || c == '\'') + delim = 
c; + else if (c == '<') + { + start = this.lexsize; + addCharToLexer(c); + pdelim.value = parseServerInstruction(); + len = this.lexsize - start; + this.lexsize = start; + return (len > 0 ? getString(this.lexbuf, start, len) : null); + } + else + this.in.ungetChar(c); + + /* + and read the value string + check for quote mark if needed + */ + + quotewarning = 0; + start = this.lexsize; + c = '\0'; + + for (;;) + { + lastc = c; /* track last character */ + c = this.in.readChar(); + + if (c == StreamIn.EndOfStream) + { + Report.attrError(this, this.token, null, Report.UNEXPECTED_END_OF_FILE); + this.in.ungetChar(c); + break; + } + + if (delim == (char)0) + { + if (c == '>') + { + this.in.ungetChar(c); + break; + } + + if (c == '"' || c == '\'') + { + Report.attrError(this, this.token, null, Report.UNEXPECTED_QUOTEMARK); + break; + } + + if (c == '<') + { + /* this.in.ungetChar(c); */ + Report.attrError(this, this.token, null, Report.UNEXPECTED_GT); + /* break; */ + } + + /* + For cases like
need to avoid treating /> as + part of the attribute value, however care is needed to avoid + so treating
in this way, which + would map the tag to + */ + if (c == '/') + { + /* peek ahead in case of /> */ + c = this.in.readChar(); + + if (c == '>' && + !AttributeTable.getDefaultAttributeTable().isUrl(name)) + { + isempty.value = true; + this.in.ungetChar(c); + break; + } + + /* unget peeked char */ + this.in.ungetChar(c); + c = '/'; + } + } + else /* delim is '\'' or '"' */ + { + if (c == delim) + break; + + /* treat CRLF, CR and LF as single line break */ + + if (c == '\r') + { + c = this.in.readChar(); + if (c != '\n') + this.in.ungetChar(c); + + c = '\n'; + } + + if (c == '\n' || c == '<' || c == '>') + ++quotewarning; + + if (c == '>') + seen_gt = true; + } + + if (c == '&') + { + addCharToLexer(c); + parseEntity((short)0); + continue; + } + + /* + kludge for JavaScript attribute values + with line continuations in string literals + */ + if (c == '\\') + { + c = this.in.readChar(); + + if (c != '\n') + { + this.in.ungetChar(c); + c = '\\'; + } + } + + map = MAP((char)c); + + if ((map & WHITE) != 0) + { + if (delim == (char)0) + break; + + if (munge) + { + c = ' '; + + if (lastc == ' ') + continue; + } + } + else if (foldCase && (map & UPPERCASE) != 0) + c += (int)('a' - 'A'); + + addCharToLexer(c); + } + + if (quotewarning > 10 && seen_gt && munge) + { + /* + there is almost certainly a missing trailling quote mark + as we have see too many newlines, < or > characters. 
+ + an exception is made for Javascript attributes and the + javascript URL scheme which may legitimately include < and > + */ + if (!AttributeTable.getDefaultAttributeTable().isScript(name) && + !(AttributeTable.getDefaultAttributeTable().isUrl(name) && + (getString(this.lexbuf, start, 11)).equals("javascript:"))) + Report.error(this, null, null, Report.SUSPECTED_MISSING_QUOTE); + } + + len = this.lexsize - start; + this.lexsize = start; + + if (len > 0 || delim != 0) + value = getString(this.lexbuf, start, len); + else + value = null; + + /* note delimiter if given */ + if (delim != 0) + pdelim.value = delim; + else + pdelim.value = (int)'"'; + + return value; + } + + /* attr must be non-null */ + public static boolean isValidAttrName(String attr) + { + short map; + char c; + int i; + + /* first character should be a letter */ + c = attr.charAt(0); + map = MAP(c); + + if (!((map & LETTER) != 0)) + return false; + + /* remaining characters should be namechars */ + for( i = 1; i < attr.length(); i++) + { + c = attr.charAt(i); + map = MAP(c); + + if((map & NAMECHAR) != 0) + continue; + + return false; + } + + return true; + } + + /* swallows closing '>' */ + + public AttVal parseAttrs(MutableBoolean isempty) + { + AttVal av, list; + String attribute, value; + MutableInteger delim = new MutableInteger(); + MutableObject asp = new MutableObject(); + MutableObject php = new MutableObject(); + + list = null; + + for (; !endOfInput();) + { + attribute = parseAttribute(isempty, asp, php); + + if (attribute == null) + { + /* check if attributes are created by ASP markup */ + if (asp.getObject() != null) + { + av = new AttVal(list, null, (Node)asp.getObject(), null, + '\0', null, null ); + list = av; + continue; + } + + /* check if attributes are created by PHP markup */ + if (php.getObject() != null) + { + av = new AttVal(list, null, null, (Node)php.getObject(), + '\0', null, null ); + list = av; + continue; + } + + break; + } + + value = parseValue(attribute, false, 
isempty, delim); + + if (attribute != null && isValidAttrName(attribute)) + { + av = new AttVal( list, null, null, null, + delim.value, attribute, value ); + av.dict = + AttributeTable.getDefaultAttributeTable().findAttribute(av); + list = av; + } + else + { + av = new AttVal( null, null, null, null, + 0, attribute, value ); + Report.attrError(this, this.token, value, Report.BAD_ATTRIBUTE_VALUE); + } + } + + return list; + } + + /* + push a copy of an inline node onto stack + but don't push if implicit or OBJECT or APPLET + (implicit tags are ones generated from the istack) + + One issue arises with pushing inlines when + the tag is already pushed. For instance: + +

          <p><em>text
          <p><em>more text

      Shouldn't be mapped to

          <p><em>text</em></p>
more text + */ + public void pushInline( Node node ) + { + IStack is; + + if (node.implicit) + return; + + if (node.tag == null) + return; + + if ((node.tag.model & Dict.CM_INLINE) == 0 ) + return; + + if ((node.tag.model & Dict.CM_OBJECT) != 0) + return; + + if (node.tag != configuration.tt.tagFont && isPushed(node)) + return; + + // make sure there is enough space for the stack + is = new IStack(); + is.tag = node.tag; + is.element = node.element; + if (node.attributes != null) + is.attributes = cloneAttributes(node.attributes); + this.istack.push( is ); + } + + /* pop inline stack */ + public void popInline( Node node ) + { + AttVal av; + IStack is; + + if (node != null) { + + if (node.tag == null) + return; + + if ((node.tag.model & Dict.CM_INLINE) == 0) + return; + + if ((node.tag.model & Dict.CM_OBJECT) != 0) + return; + + // if node is then pop until we find an + if (node.tag == configuration.tt.tagA) { + + while (this.istack.size() > 0) { + is = (IStack)this.istack.pop(); + if (is.tag == configuration.tt.tagA) { + break; + } + } + + if (this.insert >= this.istack.size()) + this.insert = -1; + return; + } + } + + if (this.istack.size() > 0) { + is = (IStack)this.istack.pop(); + if (this.insert >= this.istack.size()) + this.insert = -1; + } + } + + public boolean isPushed( Node node ) + { + int i; + IStack is; + + for (i = this.istack.size() - 1; i >= 0; --i) { + is = (IStack)this.istack.elementAt(i); + if (is.tag == node.tag) + return true; + } + + return false; + } + + /* + This has the effect of inserting "missing" inline + elements around the contents of blocklevel elements + such as P, TD, TH, DIV, PRE etc. This procedure is + called at the start of ParseBlock. when the inline + stack is not empty, as will be the case in: + +

          <i><h1>italic heading</h1></i>

      which is then treated as equivalent to

          <h1><i>italic heading</i></h1>

+ + This is implemented by setting the lexer into a mode + where it gets tokens from the inline stack rather than + from the input stream. + */ + public int inlineDup( Node node ) + { + int n; + + n = this.istack.size() - this.istackbase; + if ( n > 0 ) { + this.insert = this.istackbase; + this.inode = node; + } + + return n; + } + + public Node insertedToken() + { + Node node; + IStack is; + int n; + + // this will only be null if inode != null + if (this.insert == -1) { + node = this.inode; + this.inode = null; + return node; + } + + // is this is the "latest" node then update + // the position, otherwise use current values + + if (this.inode == null) { + this.lines = this.in.curline; + this.columns = this.in.curcol; + } + + node = newNode(Node.StartTag, + this.lexbuf, + this.txtstart, + this.txtend); // GLP: Bugfix 126261. Remove when this change + // is fixed in istack.c in the original Tidy + node.implicit = true; + is = (IStack)this.istack.elementAt( this.insert ); + node.element = is.element; + node.tag = is.tag; + if (is.attributes != null) + node.attributes = cloneAttributes(is.attributes); + + // advance lexer to next item on the stack + n = this.insert; + + // and recover state if we have reached the end + if (++n < this.istack.size() ) { + this.insert = n; + } else { + this.insert = -1; + } + + return node; + } + + /* AQ: Try this for speed optimization */ + public static int wstrcasecmp(String s1, String s2) + { + return (s1.equalsIgnoreCase(s2) ? 0 : 1); + } + + public static int wstrcaselexcmp(String s1, String s2) + { + char c; + int i = 0; + + while ( i < s1.length() && i < s2.length() ) { + c = s1.charAt(i); + if ( toLower(c) != toLower( s2.charAt(i) ) ) { + break; + } + i += 1; + } + if ( i == s1.length() && i == s2.length() ) { + return 0; + } else if ( i == s1.length() ) { + return -1; + } else if ( i == s2.length() ) { + return 1; + } else { + return ( s1.charAt(i) > s2.charAt(i) ? 
1 : -1 ); + } + } + + public static boolean wsubstr(String s1, String s2) + { + int i; + int len1 = s1.length(); + int len2 = s2.length(); + + for (i = 0; i <= len1 - len2; ++i) + { + if (s2.equalsIgnoreCase(s1.substring(i))) + return true; + } + + return false; + } + + public boolean canPrune(Node element) + { + if (element.type == Node.TextNode) + return true; + + if (element.content != null) + return false; + + if (element.tag == configuration.tt.tagA && element.attributes != null) + return false; + + if (element.tag == configuration.tt.tagP && !this.configuration.DropEmptyParas) + return false; + + if (element.tag == null) + return false; + + if ((element.tag.model & Dict.CM_ROW) != 0) + return false; + + if (element.tag == configuration.tt.tagApplet) + return false; + + if (element.tag == configuration.tt.tagObject) + return false; + + if (element.attributes != null && + (element.getAttrByName("id") != null || + element.getAttrByName("name") != null) ) + return false; + + return true; + } + + /* duplicate name attribute as an id */ + public void fixId(Node node) + { + AttVal name = node.getAttrByName("name"); + AttVal id = node.getAttrByName("id"); + + if (name != null) + { + if (id != null) + { + if (!id.value.equals(name.value)) + Report.attrError(this, node, "name", Report.ID_NAME_MISMATCH); + } + else if (this.configuration.XmlOut) + node.addAttribute("id", name.value); + } + } + + /* + defer duplicates when entering a table or other + element where the inlines shouldn't be duplicated + */ + public void deferDup() + { + this.insert = -1; + this.inode = null; + } + + /* Private methods and fields */ + + /* lexer char types */ + private static final short DIGIT = 1; + private static final short LETTER = 2; + private static final short NAMECHAR = 4; + private static final short WHITE = 8; + private static final short NEWLINE = 16; + private static final short LOWERCASE = 32; + private static final short UPPERCASE = 64; + + /* lexer GetToken states */ + + 
private static final short LEX_CONTENT = 0; + private static final short LEX_GT = 1; + private static final short LEX_ENDTAG = 2; + private static final short LEX_STARTTAG = 3; + private static final short LEX_COMMENT = 4; + private static final short LEX_DOCTYPE = 5; + private static final short LEX_PROCINSTR = 6; + private static final short LEX_ENDCOMMENT = 7; + private static final short LEX_CDATA = 8; + private static final short LEX_SECTION = 9; + private static final short LEX_ASP = 10; + private static final short LEX_JSTE = 11; + private static final short LEX_PHP = 12; + + /* used to classify chars for lexical purposes */ + private static short[] lexmap = new short[128]; + + private static void mapStr(String str, short code) + { + int j; + + for ( int i = 0; i < str.length(); i++ ) { + j = (int)str.charAt(i); + lexmap[j] |= code; + } + } + + static { + mapStr("\r\n\f", (short)(NEWLINE|WHITE)); + mapStr(" \t", WHITE); + mapStr("-.:_", NAMECHAR); + mapStr("0123456789", (short)(DIGIT|NAMECHAR)); + mapStr("abcdefghijklmnopqrstuvwxyz", (short)(LOWERCASE|LETTER|NAMECHAR)); + mapStr("ABCDEFGHIJKLMNOPQRSTUVWXYZ", (short)(UPPERCASE|LETTER|NAMECHAR)); + } + + private static short MAP( char c ) + { + return ((int)c < 128 ? 
lexmap[(int)c] : 0); + } + + private static boolean isWhite(char c) + { + short m = MAP(c); + + return (m & WHITE) != 0; + } + + private static boolean isDigit(char c) + { + short m; + + m = MAP(c); + + return (m & DIGIT) != 0; + } + + private static boolean isLetter(char c) + { + short m; + + m = MAP(c); + + return (m & LETTER) != 0; + } + + private static char toLower(char c) + { + short m = MAP(c); + + if ((m & UPPERCASE) != 0) + c = (char)( (int)c + (int)'a' - (int)'A' ); + + return c; + } + + private static char toUpper(char c) + { + short m = MAP(c); + + if ((m & LOWERCASE) != 0) + c = (char)( (int)c + (int)'A' - (int)'a' ); + + return c; + } + + public static char foldCase(char c, boolean tocaps, boolean xmlTags) + { + short m; + + if (!xmlTags) + { + m = MAP(c); + + if (tocaps) + { + if ((m & LOWERCASE) != 0) + c = (char)( (int)c + (int)'A' - (int)'a' ); + } + else /* force to lower case */ + { + if ((m & UPPERCASE) != 0) + c = (char)( (int)c + (int)'a' - (int)'A' ); + } + } + + return c; + } + + + private static class W3CVersionInfo + { + String name; + String voyagerName; + String profile; + short code; + + public W3CVersionInfo( String name, + String voyagerName, + String profile, + short code ) + { + this.name = name; + this.voyagerName = voyagerName; + this.profile = profile; + this.code = code; + } + } + + /* the 3 URIs for the XHTML 1.0 DTDs */ + private static final String voyager_loose = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"; + private static final String voyager_strict = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"; + private static final String voyager_frameset = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd"; + + private static final String XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"; + + private static Lexer.W3CVersionInfo[] W3CVersion = + { + new W3CVersionInfo("HTML 4.01", + "XHTML 1.0 Strict", + voyager_strict, + Dict.VERS_HTML40_STRICT), + new W3CVersionInfo("HTML 4.01 Transitional", + "XHTML 1.0 
Transitional", + voyager_loose, + Dict.VERS_HTML40_LOOSE), + new W3CVersionInfo("HTML 4.01 Frameset", + "XHTML 1.0 Frameset", + voyager_frameset, + Dict.VERS_FRAMES), + new W3CVersionInfo("HTML 4.0", + "XHTML 1.0 Strict", + voyager_strict, + Dict.VERS_HTML40_STRICT), + new W3CVersionInfo("HTML 4.0 Transitional", + "XHTML 1.0 Transitional", + voyager_loose, + Dict.VERS_HTML40_LOOSE), + new W3CVersionInfo("HTML 4.0 Frameset", + "XHTML 1.0 Frameset", + voyager_frameset, + Dict.VERS_FRAMES), + new W3CVersionInfo("HTML 3.2", + "XHTML 1.0 Transitional", + voyager_loose, + Dict.VERS_HTML32), + new W3CVersionInfo("HTML 2.0", + "XHTML 1.0 Strict", + voyager_strict, + Dict.VERS_HTML20) + }; + +}