X-Git-Url: http://secure.phpeclipse.com diff --git a/net.sourceforge.phpeclipse/src/net/sourceforge/phpdt/tidy/Lexer.java b/net.sourceforge.phpeclipse/src/net/sourceforge/phpdt/tidy/Lexer.java deleted file mode 100644 index f5f5548..0000000 --- a/net.sourceforge.phpeclipse/src/net/sourceforge/phpdt/tidy/Lexer.java +++ /dev/null @@ -1,3134 +0,0 @@ -/* - * @(#)Lexer.java 1.11 2000/08/16 - * - */ - -package net.sourceforge.phpdt.tidy; - -/** - * - * Lexer for html parser - * - * (c) 1998-2000 (W3C) MIT, INRIA, Keio University - * See Tidy.java for the copyright notice. - * Derived from - * HTML Tidy Release 4 Aug 2000 - * - * @author Dave Raggett - * @author Andy Quick (translation to Java) - * @version 1.0, 1999/05/22 - * @version 1.0.1, 1999/05/29 - * @version 1.1, 1999/06/18 Java Bean - * @version 1.2, 1999/07/10 Tidy Release 7 Jul 1999 - * @version 1.3, 1999/07/30 Tidy Release 26 Jul 1999 - * @version 1.4, 1999/09/04 DOM support - * @version 1.5, 1999/10/23 Tidy Release 27 Sep 1999 - * @version 1.6, 1999/11/01 Tidy Release 22 Oct 1999 - * @version 1.7, 1999/12/06 Tidy Release 30 Nov 1999 - * @version 1.8, 2000/01/22 Tidy Release 13 Jan 2000 - * @version 1.9, 2000/06/03 Tidy Release 30 Apr 2000 - * @version 1.10, 2000/07/22 Tidy Release 8 Jul 2000 - * @version 1.11, 2000/08/16 Tidy Release 4 Aug 2000 - */ - -/* - Given a file stream fp it returns a sequence of tokens. - - GetToken(fp) gets the next token - UngetToken(fp) provides one level undo - - The tags include an attribute list: - - - linked list of attribute/value nodes - - each node has 2 null-terminated strings. - - entities are replaced in attribute values - - white space is compacted if not in preformatted mode - If not in preformatted mode then leading white space - is discarded and subsequent white space sequences - compacted to single space chars. - - If XmlTags is no then Tag names are folded to upper - case and attribute names to lower case. 
- - Not yet done: - - Doctype subset and marked sections -*/ - -import java.io.PrintWriter; -import java.util.Stack; -import java.util.Vector; - -import org.eclipse.core.resources.IFile; -import sun.security.krb5.internal.av; - -public class Lexer { - - private IFile iFile; - public StreamIn in; /* file stream */ - public PrintWriter errout; /* error output stream */ - public short badAccess; /* for accessibility errors */ - public short badLayout; /* for bad style errors */ - public short badChars; /* for bad char encodings */ - public short badForm; /* for mismatched/mispositioned form tags */ - public short warnings; /* count of warnings in this document */ - public short errors; /* count of errors */ - public int lines; /* lines seen */ - public int columns; /* at start of current token */ - public boolean waswhite; /* used to collapse contiguous white space */ - public boolean pushed; /* true after token has been pushed back */ - public boolean insertspace; /* when space is moved after end tag */ - public boolean excludeBlocks; /* Netscape compatibility */ - public boolean exiled; /* true if moved out of table */ - public boolean isvoyager; /* true if xmlns attribute on html element */ - public short versions; /* bit vector of HTML versions */ - public int doctype; /* version as given by doctype (if any) */ - public boolean badDoctype; /* e.g. if html or PUBLIC is missing */ - public int txtstart; /* start of current node */ - public int txtend; /* end of current node */ - public short state; /* state of lexer's finite state machine */ - public Node token; - - /* - lexer character buffer - - parse tree nodes span onto this buffer - which contains the concatenated text - contents of all of the elements. - - lexsize must be reset for each file. 
*/
    public byte[] lexbuf;   /* byte buffer of UTF-8 chars */
    public int lexlength;   /* allocated */
    public int lexsize;     /* used */

    /* Inline stack for compatibility with Mosaic */
    public Node inode;      /* for deferring text node */
    public int insert;      /* for inferring inline tags */
    public Stack istack;
    public int istackbase;  /* start of frame */

    public Style styles;    /* used for cleaning up presentation markup */

    public Configuration configuration;
    protected int seenBodyEndTag; /* used by parser */

    /*
      Every Node handed out by this lexer is remembered here so that
      updateNodeTextArrays() can re-point it when lexbuf is reallocated.
    */
    private Vector nodeList;

    /**
     * Creates a lexer reading from <code>in</code>.
     *
     * The position starts at line 1, column 1, the state machine starts in
     * LEX_CONTENT, all counters are zero and the character buffer is left
     * unallocated (it is created lazily by addByte()).
     *
     * @param iFile          the workspace file being lexed; only stored and
     *                       returned by getIFile()
     * @param in             the input stream to read characters from
     * @param configuration  the Tidy configuration consulted while lexing
     */
    public Lexer(IFile iFile, StreamIn in, Configuration configuration)
    {
        this.iFile = iFile;
        this.in = in;
        this.configuration = configuration;

        /* position and state machine */
        this.lines = 1;
        this.columns = 1;
        this.state = LEX_CONTENT;

        /* diagnostics counters */
        this.badAccess = 0;
        this.badLayout = 0;
        this.badChars = 0;
        this.badForm = 0;
        this.warnings = 0;
        this.errors = 0;

        /* token flags */
        this.waswhite = false;
        this.pushed = false;
        this.insertspace = false;
        this.exiled = false;
        this.isvoyager = false;

        /* document version tracking */
        this.versions = Dict.VERS_EVERYTHING;
        this.doctype = Dict.VERS_UNKNOWN;
        this.badDoctype = false;

        /* current token and character buffer */
        this.txtstart = 0;
        this.txtend = 0;
        this.token = null;
        this.lexbuf = null;
        this.lexlength = 0;
        this.lexsize = 0;

        /* inline stack */
        this.inode = null;
        this.insert = -1;
        this.istack = new Stack();
        this.istackbase = 0;

        this.styles = null;
        this.seenBodyEndTag = 0;
        this.nodeList = new Vector();
    }

    /**
     * @return the file passed to the constructor
     */
    public IFile getIFile()
    {
        return iFile;
    }

    /**
     * Creates an empty node and registers it with this lexer.
     *
     * @return the new node
     */
    public Node newNode()
    {
        Node node = new Node();
        nodeList.addElement(node);
        return node;
    }

    /**
     * Creates a node of the given type spanning <code>textarray[start..end)</code>
     * and registers it with this lexer.
     *
     * @param type       node type constant (for example Node.TextNode)
     * @param textarray  buffer holding the node text
     * @param start      offset of the first byte of the node text
     * @param end        offset just past the last byte of the node text
     * @return the new node
     */
    public Node newNode(short type, byte[] textarray, int start, int end)
    {
        Node node = new Node(type, textarray, start, end);
        nodeList.addElement(node);
        return node;
    }

    /**
     * Creates an element node with the given element name, passing the
     * configured tag table to the node, and registers it with this lexer.
     *
     * @param type       node type constant (for example Node.StartTag)
     * @param textarray  buffer holding the node text
     * @param start      offset of the first byte of the node text
     * @param end        offset just past the last byte of the node text
     * @param element    element name
     * @return the new node
     */
    public Node newNode(short type, byte[] textarray, int start, int end, String element)
    {
        Node node = new Node(type, textarray, start, end, element, configuration.tt);
        nodeList.addElement(node);
        return node;
    }

    /**
     * Clones a node and registers the clone, together with the asp/php nodes
     * hanging off its attributes, so they follow buffer reallocations too.
     *
     * @param node  the node to clone
     * @return the clone
     */
    public Node cloneNode(Node node)
    {
        Node cnode = (Node)node.clone();
        nodeList.addElement(cnode);
        registerEmbeddedNodes(cnode.attributes);
        return cnode;
    }

    /**
     * Clones an attribute list and registers the asp/php nodes of the copy.
     *
     * @param attrs  head of the attribute list to clone; may be null
     * @return the cloned list, or null when <code>attrs</code> is null
     */
    public AttVal cloneAttributes(AttVal attrs)
    {
        /* an element without attributes has nothing to clone */
        if (attrs == null)
        {
            return null;
        }

        AttVal cattrs = (AttVal)attrs.clone();
        registerEmbeddedNodes(cattrs);
        return cattrs;
    }

    /* registers the asp and php nodes of every attribute in the list */
    private void registerEmbeddedNodes(AttVal first)
    {
        for (AttVal att = first; att != null; att = att.next)
        {
            if (att.asp != null)
            {
                nodeList.addElement(att.asp);
            }
            if (att.php != null)
            {
                nodeList.addElement(att.php);
            }
        }
    }

    /**
     * Re-points every registered node that still references
     * <code>oldtextarray</code> at <code>newtextarray</code>. Called after the
     * lexer buffer has been reallocated.
     *
     * @param oldtextarray  the buffer being replaced (compared by identity)
     * @param newtextarray  the replacement buffer
     */
    protected void updateNodeTextArrays(byte[] oldtextarray, byte[] newtextarray)
    {
        int count = nodeList.size();
        for (int i = 0; i < count; i++)
        {
            Node node = (Node)nodeList.elementAt(i);
            if (node.textarray == oldtextarray)
            {
                node.textarray = newtextarray;
            }
        }
    }

    /* used for creating preformatted text from Word2000 */
    /**
     * Appends a single newline to the lexer buffer and returns a node
     * spanning it.
     *
     * @return the new node
     */
    public Node newLineNode()
    {
        Node node = newNode();

        node.start = this.lexsize;
        addCharToLexer((int)'\n');
        node.end = this.lexsize;

        /*
          Bind the buffer only after the append: the append may be what
          allocates lexbuf in the first place, and addByte() re-points
          registered nodes only when an old buffer existed.
        */
        node.textarray = this.lexbuf;
        return node;
    }

    // Should always be able to convert to/from UTF-8, so encoding exceptions are
    // converted to an Error to avoid adding throws declarations in
    // lots of methods.
- - public static byte[] getBytes(String str) { - try { - return str.getBytes("UTF8"); - } catch (java.io.UnsupportedEncodingException e) { - throw new Error("string to UTF-8 conversion failed: " + e.getMessage()); - } - } - - public static String getString(byte[] bytes, int offset, int length) { - try { - return new String(bytes, offset, length, "UTF8"); - } catch (java.io.UnsupportedEncodingException e) { - throw new Error("UTF-8 to string conversion failed: " + e.getMessage()); - } - } - - public boolean endOfInput() - { - return this.in.isEndOfStream(); - } - - public void addByte(int c) - { - if (this.lexsize + 1 >= this.lexlength) - { - while (this.lexsize + 1 >= this.lexlength) - { - if (this.lexlength == 0) - this.lexlength = 8192; - else - this.lexlength = this.lexlength * 2; - } - - byte[] temp = this.lexbuf; - this.lexbuf = new byte[ this.lexlength ]; - if (temp != null) - { - System.arraycopy( temp, 0, this.lexbuf, 0, temp.length ); - updateNodeTextArrays(temp, this.lexbuf); - } - } - - this.lexbuf[this.lexsize++] = (byte)c; - this.lexbuf[this.lexsize] = (byte)'\0'; /* debug */ - } - - public void changeChar(byte c) - { - if (this.lexsize > 0) - { - this.lexbuf[this.lexsize-1] = c; - } - } - - /* store char c as UTF-8 encoded byte stream */ - public void addCharToLexer(int c) - { - if (c < 128) - addByte(c); - else if (c <= 0x7FF) - { - addByte(0xC0 | (c >> 6)); - addByte(0x80 | (c & 0x3F)); - } - else if (c <= 0xFFFF) - { - addByte(0xE0 | (c >> 12)); - addByte(0x80 | ((c >> 6) & 0x3F)); - addByte(0x80 | (c & 0x3F)); - } - else if (c <= 0x1FFFFF) - { - addByte(0xF0 | (c >> 18)); - addByte(0x80 | ((c >> 12) & 0x3F)); - addByte(0x80 | ((c >> 6) & 0x3F)); - addByte(0x80 | (c & 0x3F)); - } - else - { - addByte(0xF8 | (c >> 24)); - addByte(0x80 | ((c >> 18) & 0x3F)); - addByte(0x80 | ((c >> 12) & 0x3F)); - addByte(0x80 | ((c >> 6) & 0x3F)); - addByte(0x80 | (c & 0x3F)); - } - } - - public void addStringToLexer(String str) - { - for ( int i = 0; i < 
str.length(); i++ ) { - addCharToLexer( (int)str.charAt(i) ); - } - } - - /* - No longer attempts to insert missing ';' for unknown - enitities unless one was present already, since this - gives unexpected results. - - For example: - was tidied to: - rather than: - - My thanks for Maurice Buxton for spotting this. - */ - public void parseEntity(short mode) - { - short map; - int start; - boolean first = true; - boolean semicolon = false; - boolean numeric = false; - int c, ch, startcol; - String str; - - start = this.lexsize - 1; /* to start at "&" */ - startcol = this.in.curcol - 1; - - while (true) - { - c = this.in.readChar(); - if (c == StreamIn.EndOfStream) break; - if (c == ';') - { - semicolon = true; - break; - } - - if (first && c == '#') - { - addCharToLexer(c); - first = false; - numeric = true; - continue; - } - - first = false; - map = MAP((char)c); - - /* AQ: Added flag for numeric entities so that numeric entities - with missing semi-colons are recognized. - Eg. "rep..." is recognized as "rep" - */ - if (numeric && ((c == 'x') || ((map & DIGIT) != 0))) - { - addCharToLexer(c); - continue; - } - if (!numeric && ((map & NAMECHAR) != 0)) - { - addCharToLexer(c); - continue; - } - - /* otherwise put it back */ - - this.in.ungetChar(c); - break; - } - - str = getString( this.lexbuf, start, this.lexsize - start ); - ch = EntityTable.getDefaultEntityTable().entityCode( str ); - - /* deal with unrecognized entities */ - if (ch <= 0) - { - /* set error position just before offending chararcter */ - this.lines = this.in.curline; - this.columns = startcol; - - if (this.lexsize > start +1 ) - { - Report.entityError(this, Report.UNKNOWN_ENTITY, str, ch); - - if (semicolon) - addCharToLexer(';'); - } - else /* naked & */ - { - Report.entityError(this, Report.UNESCAPED_AMPERSAND, str, ch); - } - } - else - { - if (c != ';') /* issue warning if not terminated by ';' */ - { - /* set error position just before offending chararcter */ - this.lines = this.in.curline; - 
this.columns = startcol; - Report.entityError(this, Report.MISSING_SEMICOLON, str, c); - } - - this.lexsize = start; - - if (ch == 160 && (mode & Preformatted) != 0) - ch = ' '; - - addCharToLexer(ch); - - if (ch == '&' && !this.configuration.QuoteAmpersand) - { - addCharToLexer('a'); - addCharToLexer('m'); - addCharToLexer('p'); - addCharToLexer(';'); - } - } - } - - public char parseTagName() - { - short map; - int c; - - /* fold case of first char in buffer */ - - c = this.lexbuf[this.txtstart]; - map = MAP((char)c); - - if (!this.configuration.XmlTags && (map & UPPERCASE) != 0) - { - c += (int)((int)'a' - (int)'A'); - this.lexbuf[this.txtstart] = (byte)c; - } - - while (true) - { - c = this.in.readChar(); - if (c == StreamIn.EndOfStream) break; - map = MAP((char)c); - - if ((map & NAMECHAR) == 0) - break; - - /* fold case of subsequent chars */ - - if (!this.configuration.XmlTags && (map & UPPERCASE) != 0) - c += (int)((int)'a' - (int)'A'); - - addCharToLexer(c); - } - - this.txtend = this.lexsize; - return (char)c; - } - - public void addStringLiteral(String str) - { - for ( int i = 0; i < str.length(); i++ ) { - addCharToLexer( (int)str.charAt(i) ); - } - } - - /* choose what version to use for new doctype */ - public short HTMLVersion() - { - short versions; - - versions = this.versions; - - if ((versions & Dict.VERS_HTML20) != 0) - return Dict.VERS_HTML20; - - if ((versions & Dict.VERS_HTML32) != 0) - return Dict.VERS_HTML32; - - if ((versions & Dict.VERS_HTML40_STRICT) != 0) - return Dict.VERS_HTML40_STRICT; - - if ((versions & Dict.VERS_HTML40_LOOSE) != 0) - return Dict.VERS_HTML40_LOOSE; - - if ((versions & Dict.VERS_FRAMES) != 0) - return Dict.VERS_FRAMES; - - return Dict.VERS_UNKNOWN; - } - - public String HTMLVersionName() - { - short guessed; - int j; - - guessed = apparentVersion(); - - for (j = 0; j < W3CVersion.length; ++j) - { - if (guessed == W3CVersion[j].code) - { - if (this.isvoyager) - return W3CVersion[j].voyagerName; - - return 
W3CVersion[j].name; - } - } - - return null; - } - - /* add meta element for Tidy */ - public boolean addGenerator(Node root) - { - AttVal attval; - Node node; - Node head = root.findHEAD(configuration.tt); - - if (head != null) - { - for (node = head.content; node != null; node = node.next) - { - if (node.tag == configuration.tt.tagMeta) - { - attval = node.getAttrByName("name"); - - if (attval != null && attval.value != null && - Lexer.wstrcasecmp(attval.value, "generator") == 0) - { - attval = node.getAttrByName("content"); - - if (attval != null && attval.value != null && - attval.value.length() >= 9 && - Lexer.wstrcasecmp(attval.value.substring(0, 9), "HTML Tidy") == 0) - { - return false; - } - } - } - } - - node = this.inferredTag("meta"); - node.addAttribute("content", "HTML Tidy, see www.w3.org"); - node.addAttribute("name", "generator"); - Node.insertNodeAtStart(head, node); - return true; - } - - return false; - } - - /* return true if substring s is in p and isn't all in upper case */ - /* this is used to check the case of SYSTEM, PUBLIC, DTD and EN */ - /* len is how many chars to check in p */ - private static boolean findBadSubString(String s, String p, int len) - { - int n = s.length(); - int i = 0; - String ps; - - while (n < len) - { - ps = p.substring(i, i + n); - if (wstrcasecmp(s, ps) == 0) - return (!ps.equals(s.substring(0, n))); - - ++i; - --len; - } - - return false; - } - - public boolean checkDocTypeKeyWords(Node doctype) - { - int len = doctype.end - doctype.start; - String s = getString(this.lexbuf, doctype.start, len); - - return !( - findBadSubString("SYSTEM", s, len) || - findBadSubString("PUBLIC", s, len) || - findBadSubString("//DTD", s, len) || - findBadSubString("//W3C", s, len) || - findBadSubString("//EN", s, len) - ); - } - - /* examine to identify version */ - public short findGivenVersion(Node doctype) - { - String p, s; - int i, j; - int len; - String str1; - String str2; - - /* if root tag for doctype isn't html give up 
now */ - str1 = getString(this.lexbuf, doctype.start, 5); - if (wstrcasecmp(str1, "html ") != 0) - return 0; - - if (!checkDocTypeKeyWords(doctype)) - Report.warning(this, doctype, null, Report.DTYPE_NOT_UPPER_CASE); - - /* give up if all we are given is the system id for the doctype */ - str1 = getString(this.lexbuf, doctype.start + 5, 7); - if (wstrcasecmp(str1, "SYSTEM ") == 0) - { - /* but at least ensure the case is correct */ - if (!str1.substring(0, 6).equals("SYSTEM")) - System.arraycopy( getBytes("SYSTEM"), 0, - this.lexbuf, doctype.start + 5, 6 ); - return 0; /* unrecognized */ - } - - if (wstrcasecmp(str1, "PUBLIC ") == 0) - { - if (!str1.substring(0, 6).equals("PUBLIC")) - System.arraycopy( getBytes("PUBLIC "), 0, - this.lexbuf, doctype.start + 5, 6 ); - } - else - this.badDoctype = true; - - for (i = doctype.start; i < doctype.end; ++i) - { - if (this.lexbuf[i] == (byte)'"') - { - str1 = getString( this.lexbuf, i + 1, 12 ); - str2 = getString( this.lexbuf, i + 1, 13 ); - if (str1.equals("-//W3C//DTD ")) - { - /* compute length of identifier e.g. "HTML 4.0 Transitional" */ - for (j = i + 13; j < doctype.end && this.lexbuf[j] != (byte)'/'; ++j); - len = j - i - 13; - p = getString( this.lexbuf, i + 13, len ); - - for (j = 1; j < W3CVersion.length; ++j) - { - s = W3CVersion[j].name; - if (len == s.length() && s.equals(p)) - return W3CVersion[j].code; - } - - /* else unrecognized version */ - } - else if (str2.equals("-//IETF//DTD ")) - { - /* compute length of identifier e.g. 
"HTML 2.0" */ - for (j = i + 14; j < doctype.end && this.lexbuf[j] != (byte)'/'; ++j); - len = j - i - 14; - - p = getString( this.lexbuf, i + 14, len ); - s = W3CVersion[0].name; - if (len == s.length() && s.equals(p)) - return W3CVersion[0].code; - - /* else unrecognized version */ - } - break; - } - } - - return 0; - } - - public void fixHTMLNameSpace(Node root, String profile) - { - Node node; - AttVal prev, attr; - - for (node = root.content; - node != null && node.tag != configuration.tt.tagHtml; node = node.next); - - if (node != null) - { - prev = null; - - for (attr = node.attributes; attr != null; attr = attr.next) - { - if (attr.attribute.equals("xmlns")) - break; - - prev = attr; - } - - if (attr != null) - { - if (!attr.value.equals(profile)) - { - Report.warning(this, node, null, Report.INCONSISTENT_NAMESPACE); - attr.value = profile; - } - } - else - { - attr = new AttVal( node.attributes, null, (int)'"', - "xmlns", profile ); - attr.dict = - AttributeTable.getDefaultAttributeTable().findAttribute( attr ); - node.attributes = attr; - } - } - } - - public boolean setXHTMLDocType(Node root) - { - String fpi = " "; - String sysid = ""; - String namespace = XHTML_NAMESPACE; - Node doctype; - - doctype = root.findDocType(); - - if (configuration.docTypeMode == Configuration.DOCTYPE_OMIT) - { - if (doctype != null) - Node.discardElement(doctype); - return true; - } - - if (configuration.docTypeMode == Configuration.DOCTYPE_AUTO) - { - /* see what flavor of XHTML this document matches */ - if ((this.versions & Dict.VERS_HTML40_STRICT) != 0) - { /* use XHTML strict */ - fpi = "-//W3C//DTD XHTML 1.0 Strict//EN"; - sysid = voyager_strict; - } - else if ((this.versions & Dict.VERS_LOOSE) != 0) - { - fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN"; - sysid = voyager_loose; - } - else if ((this.versions & Dict.VERS_FRAMES) != 0) - { /* use XHTML frames */ - fpi = "-//W3C//DTD XHTML 1.0 Frameset//EN"; - sysid = voyager_frameset; - } - else /* lets assume XHTML 
transitional */ - { - fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN"; - sysid = voyager_loose; - } - } - else if (configuration.docTypeMode == Configuration.DOCTYPE_STRICT) - { - fpi = "-//W3C//DTD XHTML 1.0 Strict//EN"; - sysid = voyager_strict; - } - else if (configuration.docTypeMode == Configuration.DOCTYPE_LOOSE) - { - fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN"; - sysid = voyager_loose; - } - - fixHTMLNameSpace(root, namespace); - - if (doctype == null) - { - doctype = newNode(Node.DocTypeTag, this.lexbuf, 0, 0); - doctype.next = root.content; - doctype.parent = root; - doctype.prev = null; - root.content = doctype; - } - - if (configuration.docTypeMode == Configuration.DOCTYPE_USER && - configuration.docTypeStr != null) - { - fpi = configuration.docTypeStr; - sysid = ""; - } - - this.txtstart = this.lexsize; - this.txtend = this.lexsize; - - /* add public identifier */ - addStringLiteral("html PUBLIC "); - - /* check if the fpi is quoted or not */ - if (fpi.charAt(0) == '"') - addStringLiteral(fpi); - else - { - addStringLiteral("\""); - addStringLiteral(fpi); - addStringLiteral("\""); - } - - if (sysid.length() + 6 >= this.configuration.wraplen) - addStringLiteral("\n\""); - else - addStringLiteral("\n \""); - - /* add system identifier */ - addStringLiteral(sysid); - addStringLiteral("\""); - - this.txtend = this.lexsize; - - doctype.start = this.txtstart; - doctype.end = this.txtend; - - return false; - } - - public short apparentVersion() - { - switch (this.doctype) - { - case Dict.VERS_UNKNOWN: - return HTMLVersion(); - - case Dict.VERS_HTML20: - if ((this.versions & Dict.VERS_HTML20) != 0) - return Dict.VERS_HTML20; - - break; - - case Dict.VERS_HTML32: - if ((this.versions & Dict.VERS_HTML32) != 0) - return Dict.VERS_HTML32; - - break; /* to replace old version by new */ - - case Dict.VERS_HTML40_STRICT: - if ((this.versions & Dict.VERS_HTML40_STRICT) != 0) - return Dict.VERS_HTML40_STRICT; - - break; - - case Dict.VERS_HTML40_LOOSE: - if 
((this.versions & Dict.VERS_HTML40_LOOSE) != 0) - return Dict.VERS_HTML40_LOOSE; - - break; /* to replace old version by new */ - - case Dict.VERS_FRAMES: - if ((this.versions & Dict.VERS_FRAMES) != 0) - return Dict.VERS_FRAMES; - - break; - } - - Report.warning(this, null, null, Report.INCONSISTENT_VERSION); - return this.HTMLVersion(); - } - - /* fixup doctype if missing */ - public boolean fixDocType(Node root) - { - Node doctype; - int guessed = Dict.VERS_HTML40_STRICT, i; - - if (this.badDoctype) - Report.warning(this, null, null, Report.MALFORMED_DOCTYPE); - - if (configuration.XmlOut) - return true; - - doctype = root.findDocType(); - - if (configuration.docTypeMode == Configuration.DOCTYPE_OMIT) - { - if (doctype != null) - Node.discardElement(doctype); - return true; - } - - if (configuration.docTypeMode == Configuration.DOCTYPE_STRICT) - { - Node.discardElement(doctype); - doctype = null; - guessed = Dict.VERS_HTML40_STRICT; - } - else if (configuration.docTypeMode == Configuration.DOCTYPE_LOOSE) - { - Node.discardElement(doctype); - doctype = null; - guessed = Dict.VERS_HTML40_LOOSE; - } - else if (configuration.docTypeMode == Configuration.DOCTYPE_AUTO) - { - if (doctype != null) - { - if (this.doctype == Dict.VERS_UNKNOWN) - return false; - - switch (this.doctype) - { - case Dict.VERS_UNKNOWN: - return false; - - case Dict.VERS_HTML20: - if ((this.versions & Dict.VERS_HTML20) != 0) - return true; - - break; /* to replace old version by new */ - - case Dict.VERS_HTML32: - if ((this.versions & Dict.VERS_HTML32) != 0) - return true; - - break; /* to replace old version by new */ - - case Dict.VERS_HTML40_STRICT: - if ((this.versions & Dict.VERS_HTML40_STRICT) != 0) - return true; - - break; /* to replace old version by new */ - - case Dict.VERS_HTML40_LOOSE: - if ((this.versions & Dict.VERS_HTML40_LOOSE) != 0) - return true; - - break; /* to replace old version by new */ - - case Dict.VERS_FRAMES: - if ((this.versions & Dict.VERS_FRAMES) != 0) - return 
true; - - break; /* to replace old version by new */ - } - - /* INCONSISTENT_VERSION warning is now issued by ApparentVersion() */ - } - - /* choose new doctype */ - guessed = HTMLVersion(); - } - - if (guessed == Dict.VERS_UNKNOWN) - return false; - - /* for XML use the Voyager system identifier */ - if (this.configuration.XmlOut || this.configuration.XmlTags || this.isvoyager) - { - if (doctype != null) - Node.discardElement(doctype); - - for (i = 0; i < W3CVersion.length; ++i) - { - if (guessed == W3CVersion[i].code) - { - fixHTMLNameSpace(root, W3CVersion[i].profile); - break; - } - } - - return true; - } - - if (doctype == null) - { - doctype = newNode(Node.DocTypeTag, this.lexbuf, 0, 0); - doctype.next = root.content; - doctype.parent = root; - doctype.prev = null; - root.content = doctype; - } - - this.txtstart = this.lexsize; - this.txtend = this.lexsize; - - /* use the appropriate public identifier */ - addStringLiteral("html PUBLIC "); - - if (configuration.docTypeMode == Configuration.DOCTYPE_USER && - configuration.docTypeStr != null) - addStringLiteral(configuration.docTypeStr); - else if (guessed == Dict.VERS_HTML20) - addStringLiteral("\"-//IETF//DTD HTML 2.0//EN\""); - else - { - addStringLiteral("\"-//W3C//DTD "); - - for (i = 0; i < W3CVersion.length; ++i) - { - if (guessed == W3CVersion[i].code) - { - addStringLiteral(W3CVersion[i].name); - break; - } - } - - addStringLiteral("//EN\""); - } - - this.txtend = this.lexsize; - - doctype.start = this.txtstart; - doctype.end = this.txtend; - - return true; - } - - /* ensure XML document starts with */ - public boolean fixXMLPI(Node root) - { - Node xml; - int s; - - if( root.content != null && root.content.type == Node.ProcInsTag) - { - s = root.content.start; - - if (this.lexbuf[s] == (byte)'x' && - this.lexbuf[s+1] == (byte)'m' && - this.lexbuf[s+2] == (byte)'l') - return true; - } - - xml = newNode(Node.ProcInsTag, this.lexbuf, 0, 0); - xml.next = root.content; - - if (root.content != null) - { - 
root.content.prev = xml; - xml.next = root.content; - } - - root.content = xml; - - this.txtstart = this.lexsize; - this.txtend = this.lexsize; - addStringLiteral("xml version=\"1.0\""); - if (this.configuration.CharEncoding == Configuration.LATIN1) - addStringLiteral(" encoding=\"ISO-8859-1\""); - this.txtend = this.lexsize; - - xml.start = this.txtstart; - xml.end = this.txtend; - return false; - } - - public Node inferredTag(String name) - { - Node node; - - node = newNode(Node.StartTag, - this.lexbuf, - this.txtstart, - this.txtend, - name); - node.implicit = true; - return node; - } - - public static boolean expectsContent(Node node) - { - if (node.type != Node.StartTag) - return false; - - /* unknown element? */ - if (node.tag == null) - return true; - - if ((node.tag.model & Dict.CM_EMPTY) != 0) - return false; - - return true; - } - - /* - create a text node for the contents of - a CDATA element like style or script - which ends with for some foo. - */ - public Node getCDATA(Node container) - { - int c, lastc, start, len, i; - String str; - boolean endtag = false; - - this.lines = this.in.curline; - this.columns = this.in.curcol; - this.waswhite = false; - this.txtstart = this.lexsize; - this.txtend = this.lexsize; - - lastc = (int)'\0'; - start = -1; - - while (true) - { - c = this.in.readChar(); - if (c == StreamIn.EndOfStream) break; - /* treat \r\n as \n and \r as \n */ - - if (c == (int)'/' && lastc == (int)'<') - { - if (endtag) - { - this.lines = this.in.curline; - this.columns = this.in.curcol - 3; - - Report.warning(this, null, null, Report.BAD_CDATA_CONTENT); - } - - start = this.lexsize + 1; /* to first letter */ - endtag = true; - } - else if (c == (int)'>' && start >= 0) - { - len = this.lexsize - start; - if (len == container.element.length()) - { - str = getString( this.lexbuf, start, len ); - if (Lexer.wstrcasecmp(str, container.element) == 0) - { - this.txtend = start - 2; - break; - } - } - - this.lines = this.in.curline; - this.columns = 
this.in.curcol - 3; - - Report.warning(this, null, null, Report.BAD_CDATA_CONTENT); - - /* if javascript insert backslash before / */ - - if (ParserImpl.isJavaScript(container)) - { - for (i = this.lexsize; i > start-1; --i) - this.lexbuf[i] = this.lexbuf[i-1]; - - this.lexbuf[start-1] = (byte)'\\'; - this.lexsize++; - } - - start = -1; - } - else if (c == (int)'\r') - { - c = this.in.readChar(); - - if (c != (int)'\n') - this.in.ungetChar(c); - - c = (int)'\n'; - } - - addCharToLexer((int)c); - this.txtend = this.lexsize; - lastc = c; - } - - if (c == StreamIn.EndOfStream) - Report.warning(this, container, null, Report.MISSING_ENDTAG_FOR); - - if (this.txtend > this.txtstart) - { - this.token = newNode(Node.TextNode, - this.lexbuf, - this.txtstart, - this.txtend); - return this.token; - } - - return null; - } - - public void ungetToken() - { - this.pushed = true; - } - - public static final short IgnoreWhitespace = 0; - public static final short MixedContent = 1; - public static final short Preformatted = 2; - public static final short IgnoreMarkup = 3; - - /* - modes for GetToken() - - MixedContent -- for elements which don't accept PCDATA - Preformatted -- white space preserved as is - IgnoreMarkup -- for CDATA elements such as script, style - */ - - public Node getToken(short mode) - { - short map; - int c = 0; - int lastc; - int badcomment = 0; - MutableBoolean isempty = new MutableBoolean(); - AttVal attributes; - - if (this.pushed) - { - /* duplicate inlines in preference to pushed text nodes when appropriate */ - if (this.token.type != Node.TextNode || - (this.insert == -1 && this.inode == null)) - { - this.pushed = false; - return this.token; - } - } - - /* at start of block elements, unclosed inline - elements are inserted into the token stream */ - - if (this.insert != -1 || this.inode != null) - return insertedToken(); - - this.lines = this.in.curline; - this.columns = this.in.curcol; - this.waswhite = false; - - this.txtstart = this.lexsize; - 
this.txtend = this.lexsize; - - while (true) - { - c = this.in.readChar(); - if (c == StreamIn.EndOfStream) break; - if (this.insertspace && mode != IgnoreWhitespace) - { - addCharToLexer(' '); - this.waswhite = true; - this.insertspace = false; - } - - /* treat \r\n as \n and \r as \n */ - - if (c == '\r') - { - c = this.in.readChar(); - - if (c != '\n') - this.in.ungetChar(c); - - c = '\n'; - } - - addCharToLexer(c); - - switch (this.state) - { - case LEX_CONTENT: /* element content */ - map = MAP((char)c); - - /* - Discard white space if appropriate. Its cheaper - to do this here rather than in parser methods - for elements that don't have mixed content. - */ - if (((map & WHITE) != 0) && (mode == IgnoreWhitespace) - && this.lexsize == this.txtstart + 1) - { - --this.lexsize; - this.waswhite = false; - this.lines = this.in.curline; - this.columns = this.in.curcol; - continue; - } - - if (c == '<') - { - this.state = LEX_GT; - continue; - } - - if ((map & WHITE) != 0) - { - /* was previous char white? 
*/ - if (this.waswhite) - { - if (mode != Preformatted && mode != IgnoreMarkup) - { - --this.lexsize; - this.lines = this.in.curline; - this.columns = this.in.curcol; - } - } - else /* prev char wasn't white */ - { - this.waswhite = true; - lastc = c; - - if (mode != Preformatted && mode != IgnoreMarkup && c != ' ') - changeChar((byte)' '); - } - - continue; - } - else if (c == '&' && mode != IgnoreMarkup) - parseEntity(mode); - - /* this is needed to avoid trimming trailing whitespace */ - if (mode == IgnoreWhitespace) - mode = MixedContent; - - this.waswhite = false; - continue; - - case LEX_GT: /* < */ - - /* check for endtag */ - if (c == '/') - { - c = this.in.readChar(); - if (c == StreamIn.EndOfStream) - { - this.in.ungetChar(c); - continue; - } - - addCharToLexer(c); - map = MAP((char)c); - - if ((map & LETTER) != 0) - { - this.lexsize -= 3; - this.txtend = this.lexsize; - this.in.ungetChar(c); - this.state = LEX_ENDTAG; - this.lexbuf[this.lexsize] = (byte)'\0'; /* debug */ - this.in.curcol -= 2; - - /* if some text before the this.txtstart) - { - /* trim space char before end tag */ - if (mode == IgnoreWhitespace && this.lexbuf[this.lexsize - 1] == (byte)' ') - { - this.lexsize -= 1; - this.txtend = this.lexsize; - } - - this.token = newNode(Node.TextNode, - this.lexbuf, - this.txtstart, - this.txtend); - return this.token; - } - - continue; /* no text so keep going */ - } - - /* otherwise treat as CDATA */ - this.waswhite = false; - this.state = LEX_CONTENT; - continue; - } - - if (mode == IgnoreMarkup) - { - /* otherwise treat as CDATA */ - this.waswhite = false; - this.state = LEX_CONTENT; - continue; - } - - /* - look out for comments, doctype or marked sections - this isn't quite right, but its getting there ... 
- */ - if (c == '!') - { - c = this.in.readChar(); - - if (c == '-') - { - c = this.in.readChar(); - - if (c == '-') - { - this.state = LEX_COMMENT; /* comment */ - this.lexsize -= 2; - this.txtend = this.lexsize; - - /* if some text before < return it now */ - if (this.txtend > this.txtstart) - { - this.token = newNode(Node.TextNode, - this.lexbuf, - this.txtstart, - this.txtend); - return this.token; - } - - this.txtstart = this.lexsize; - continue; - } - - Report.warning(this, null, null, Report.MALFORMED_COMMENT); - } - else if (c == 'd' || c == 'D') - { - this.state = LEX_DOCTYPE; /* doctype */ - this.lexsize -= 2; - this.txtend = this.lexsize; - mode = IgnoreWhitespace; - - /* skip until white space or '>' */ - - for (;;) - { - c = this.in.readChar(); - - if (c == StreamIn.EndOfStream || c == '>') - { - this.in.ungetChar(c); - break; - } - - map = MAP((char)c); - - if ((map & WHITE) == 0) - continue; - - /* and skip to end of whitespace */ - - for (;;) - { - c = this.in.readChar(); - - if (c == StreamIn.EndOfStream || c == '>') - { - this.in.ungetChar(c); - break; - } - - map = MAP((char)c); - - if ((map & WHITE) != 0) - continue; - - this.in.ungetChar(c); - break; - } - - break; - } - - /* if some text before < return it now */ - if (this.txtend > this.txtstart) - { - this.token = newNode(Node.TextNode, - this.lexbuf, - this.txtstart, - this.txtend); - return this.token; - } - - this.txtstart = this.lexsize; - continue; - } - else if (c == '[') - { - /* Word 2000 embeds ... 
sequences */ - this.lexsize -= 2; - this.state = LEX_SECTION; - this.txtend = this.lexsize; - - /* if some text before < return it now */ - if (this.txtend > this.txtstart) - { - this.token = newNode(Node.TextNode, - this.lexbuf, - this.txtstart, - this.txtend); - return this.token; - } - - this.txtstart = this.lexsize; - continue; - } - - /* otherwise swallow chars up to and including next '>' */ - while (true) - { - c = this.in.readChar(); - if (c == '>') break; - if (c == -1) - { - this.in.ungetChar(c); - break; - } - } - - this.lexsize -= 2; - this.lexbuf[this.lexsize] = (byte)'\0'; - this.state = LEX_CONTENT; - continue; - } - - /* - processing instructions - */ - - if (c == '?') - { - this.lexsize -= 2; - this.state = LEX_PROCINSTR; - this.txtend = this.lexsize; - - /* if some text before < return it now */ - if (this.txtend > this.txtstart) - { - this.token = newNode(Node.TextNode, - this.lexbuf, - this.txtstart, - this.txtend); - return this.token; - } - - this.txtstart = this.lexsize; - continue; - } - - /* Microsoft ASP's e.g. <% ... server-code ... %> */ - if (c == '%') - { - this.lexsize -= 2; - this.state = LEX_ASP; - this.txtend = this.lexsize; - - /* if some text before < return it now */ - if (this.txtend > this.txtstart) - { - this.token = newNode(Node.TextNode, - this.lexbuf, - this.txtstart, - this.txtend); - return this.token; - } - - this.txtstart = this.lexsize; - continue; - } - - /* Netscapes JSTE e.g. <# ... server-code ... 
#> */ - if (c == '#') - { - this.lexsize -= 2; - this.state = LEX_JSTE; - this.txtend = this.lexsize; - - /* if some text before < return it now */ - if (this.txtend > this.txtstart) - { - this.token = newNode(Node.TextNode, - this.lexbuf, - this.txtstart, - this.txtend); - return this.token; - } - - this.txtstart = this.lexsize; - continue; - } - - map = MAP((char)c); - - /* check for start tag */ - if ((map & LETTER) != 0) - { - this.in.ungetChar(c); /* push back letter */ - this.lexsize -= 2; /* discard "<" + letter */ - this.txtend = this.lexsize; - this.state = LEX_STARTTAG; /* ready to read tag name */ - - /* if some text before < return it now */ - if (this.txtend > this.txtstart) - { - this.token = newNode(Node.TextNode, - this.lexbuf, - this.txtstart, - this.txtend); - return this.token; - } - - continue; /* no text so keep going */ - } - - /* otherwise treat as CDATA */ - this.state = LEX_CONTENT; - this.waswhite = false; - continue; - - case LEX_ENDTAG: /* ' */ - while (c != '>') - { - c = this.in.readChar(); - - if (c == StreamIn.EndOfStream) - break; - } - - if (c == StreamIn.EndOfStream) - { - this.in.ungetChar(c); - continue; - } - - this.state = LEX_CONTENT; - this.waswhite = false; - return this.token; /* the endtag token */ - - case LEX_STARTTAG: /* first letter of tagname */ - this.txtstart = this.lexsize - 1; /* set txtstart to first letter */ - c = parseTagName(); - isempty.value = false; - attributes = null; - this.token = newNode((isempty.value ? 
Node.StartEndTag : Node.StartTag), - this.lexbuf, - this.txtstart, - this.txtend, - getString(this.lexbuf, - this.txtstart, - this.txtend - this.txtstart)); - - /* parse attributes, consuming closing ">" */ - if (c != '>') - { - if (c == '/') - this.in.ungetChar(c); - - attributes = parseAttrs(isempty); - } - - if (isempty.value) - this.token.type = Node.StartEndTag; - - this.token.attributes = attributes; - this.lexsize = this.txtstart; - this.txtend = this.txtstart; - - /* swallow newline following start tag */ - /* special check needed for CRLF sequence */ - /* this doesn't apply to empty elements */ - - if (expectsContent(this.token) || - this.token.tag == configuration.tt.tagBr) - { - - c = this.in.readChar(); - - if (c == '\r') - { - c = this.in.readChar(); - - if (c != '\n') - this.in.ungetChar(c); - } - else if (c != '\n' && c != '\f') - this.in.ungetChar(c); - - this.waswhite = true; /* to swallow leading whitespace */ - } - else - this.waswhite = false; - - this.state = LEX_CONTENT; - - if (this.token.tag == null) - Report.error(this, null, this.token, Report.UNKNOWN_ELEMENT); - else if (!this.configuration.XmlTags) - { - this.versions &= this.token.tag.versions; - - if ((this.token.tag.versions & Dict.VERS_PROPRIETARY) != 0) - { - if (!this.configuration.MakeClean && (this.token.tag == configuration.tt.tagNobr || - this.token.tag == configuration.tt.tagWbr)) - Report.warning(this, null, this.token, Report.PROPRIETARY_ELEMENT); - } - - if (this.token.tag.chkattrs != null) - { - this.token.checkUniqueAttributes(this); - this.token.tag.chkattrs.check(this, this.token); - } - else - this.token.checkAttributes(this); - } - - return this.token; /* return start tag */ - - case LEX_COMMENT: /* seen */ - - if (c != '-') - continue; - - c = this.in.readChar(); - addCharToLexer(c); - - if (c != '-') - continue; - - end_comment: while (true) { - c = this.in.readChar(); - - if (c == '>') - { - if (badcomment != 0) - Report.warning(this, null, null, 
Report.MALFORMED_COMMENT); - - this.txtend = this.lexsize - 2; // AQ 8Jul2000 - this.lexbuf[this.lexsize] = (byte)'\0'; - this.state = LEX_CONTENT; - this.waswhite = false; - this.token = newNode(Node.CommentTag, - this.lexbuf, - this.txtstart, - this.txtend); - - /* now look for a line break */ - - c = this.in.readChar(); - - if (c == '\r') - { - c = this.in.readChar(); - - if (c != '\n') - this.token.linebreak = true; - } - - if (c == '\n') - this.token.linebreak = true; - else - this.in.ungetChar(c); - - return this.token; - } - - /* note position of first such error in the comment */ - if (badcomment == 0) - { - this.lines = this.in.curline; - this.columns = this.in.curcol - 3; - } - - badcomment++; - if (this.configuration.FixComments) - this.lexbuf[this.lexsize - 2] = (byte)'='; - - addCharToLexer(c); - - /* if '-' then look for '>' to end the comment */ - if (c != '-') - break end_comment; - - } - /* otherwise continue to look for --> */ - this.lexbuf[this.lexsize - 2] = (byte)'='; - continue; - - case LEX_DOCTYPE: /* seen ' munging whitespace */ - map = MAP((char)c); - - if ((map & WHITE) != 0) - { - if (this.waswhite) - this.lexsize -= 1; - - this.waswhite = true; - } - else - this.waswhite = false; - - if (c != '>') - continue; - - this.lexsize -= 1; - this.txtend = this.lexsize; - this.lexbuf[this.lexsize] = (byte)'\0'; - this.state = LEX_CONTENT; - this.waswhite = false; - this.token = newNode(Node.DocTypeTag, - this.lexbuf, - this.txtstart, - this.txtend); - /* make a note of the version named by the doctype */ - this.doctype = findGivenVersion(this.token); - return this.token; - - case LEX_PROCINSTR: /* seen ' */ - /* check for PHP preprocessor instructions */ - - if (this.lexsize - this.txtstart == 3) - { - if ((getString(this.lexbuf, this.txtstart, 3)).equals("php")) - { - this.state = LEX_PHP; - continue; - } - } - - if (this.configuration.XmlPIs) /* insist on ?> as terminator */ - { - if (c != '?') - continue; - - /* now look for '>' */ - c = 
this.in.readChar(); - - if (c == StreamIn.EndOfStream) - { - Report.warning(this, null, null, Report.UNEXPECTED_END_OF_FILE); - this.in.ungetChar(c); - continue; - } - - addCharToLexer(c); - } - - if (c != '>') - continue; - - this.lexsize -= 1; - this.txtend = this.lexsize; - this.lexbuf[this.lexsize] = (byte)'\0'; - this.state = LEX_CONTENT; - this.waswhite = false; - this.token = newNode(Node.ProcInsTag, - this.lexbuf, - this.txtstart, - this.txtend); - return this.token; - - case LEX_ASP: /* seen <% so look for "%>" */ - if (c != '%') - continue; - - /* now look for '>' */ - c = this.in.readChar(); - - - if (c != '>') - { - this.in.ungetChar(c); - continue; - } - - this.lexsize -= 1; - this.txtend = this.lexsize; - this.lexbuf[this.lexsize] = (byte)'\0'; - this.state = LEX_CONTENT; - this.waswhite = false; - this.token = newNode(Node.AspTag, - this.lexbuf, - this.txtstart, - this.txtend); - return this.token; - - case LEX_JSTE: /* seen <# so look for "#>" */ - if (c != '#') - continue; - - /* now look for '>' */ - c = this.in.readChar(); - - - if (c != '>') - { - this.in.ungetChar(c); - continue; - } - - this.lexsize -= 1; - this.txtend = this.lexsize; - this.lexbuf[this.lexsize] = (byte)'\0'; - this.state = LEX_CONTENT; - this.waswhite = false; - this.token = newNode(Node.JsteTag, - this.lexbuf, - this.txtstart, - this.txtend); - return this.token; - - case LEX_PHP: /* seen "" */ - if (c != '?') - continue; - - /* now look for '>' */ - c = this.in.readChar(); - - if (c != '>') - { - this.in.ungetChar(c); - continue; - } - - this.lexsize -= 1; - this.txtend = this.lexsize; - this.lexbuf[this.lexsize] = (byte)'\0'; - this.state = LEX_CONTENT; - this.waswhite = false; - this.token = newNode(Node.PhpTag, - this.lexbuf, - this.txtstart, - this.txtend); - return this.token; - - case LEX_SECTION: /* seen "" */ - if (c == '[') - { - if (this.lexsize == (this.txtstart + 6) && - (getString(this.lexbuf, this.txtstart, 6)).equals("CDATA[")) - { - this.state = LEX_CDATA; - 
this.lexsize -= 6; - continue; - } - } - - if (c != ']') - continue; - - /* now look for '>' */ - c = this.in.readChar(); - - if (c != '>') - { - this.in.ungetChar(c); - continue; - } - - this.lexsize -= 1; - this.txtend = this.lexsize; - this.lexbuf[this.lexsize] = (byte)'\0'; - this.state = LEX_CONTENT; - this.waswhite = false; - this.token = newNode(Node.SectionTag, - this.lexbuf, - this.txtstart, - this.txtend); - return this.token; - - case LEX_CDATA: /* seen "" */ - if (c != ']') - continue; - - /* now look for ']' */ - c = this.in.readChar(); - - if (c != ']') - { - this.in.ungetChar(c); - continue; - } - - /* now look for '>' */ - c = this.in.readChar(); - - if (c != '>') - { - this.in.ungetChar(c); - continue; - } - - this.lexsize -= 1; - this.txtend = this.lexsize; - this.lexbuf[this.lexsize] = (byte)'\0'; - this.state = LEX_CONTENT; - this.waswhite = false; - this.token = newNode(Node.CDATATag, - this.lexbuf, - this.txtstart, - this.txtend); - return this.token; - } - } - - if (this.state == LEX_CONTENT) /* text string */ - { - this.txtend = this.lexsize; - - if (this.txtend > this.txtstart) - { - this.in.ungetChar(c); - - if (this.lexbuf[this.lexsize - 1] == (byte)' ') - { - this.lexsize -= 1; - this.txtend = this.lexsize; - } - - this.token = newNode(Node.TextNode, - this.lexbuf, - this.txtstart, - this.txtend); - return this.token; - } - } - else if (this.state == LEX_COMMENT) /* comment */ - { - if (c == StreamIn.EndOfStream) - Report.warning(this, null, null, Report.MALFORMED_COMMENT); - - this.txtend = this.lexsize; - this.lexbuf[this.lexsize] = (byte)'\0'; - this.state = LEX_CONTENT; - this.waswhite = false; - this.token = newNode(Node.CommentTag, - this.lexbuf, - this.txtstart, - this.txtend); - return this.token; - } - - return null; - } - - /* - parser for ASP within start tags - - Some people use ASP for to customize attributes - Tidy isn't really well suited to dealing with ASP - This is a workaround for attributes, but won't - deal with the 
case where the ASP is used to tailor - the attribute value. Here is an example of a work - around for using ASP in attribute values: - - href="<%=rsSchool.Fields("ID").Value%>" - - where the ASP that generates the attribute value - is masked from Tidy by the quotemarks. - - */ - - public Node parseAsp() - { - int c; - Node asp = null; - - this.txtstart = this.lexsize; - - for (;;) - { - c = this.in.readChar(); - addCharToLexer(c); - - - if (c != '%') - continue; - - c = this.in.readChar(); - addCharToLexer(c); - - if (c == '>') - break; - } - - this.lexsize -= 2; - this.txtend = this.lexsize; - - if (this.txtend > this.txtstart) - asp = newNode(Node.AspTag, - this.lexbuf, - this.txtstart, - this.txtend); - - this.txtstart = this.txtend; - return asp; - } - - /* - PHP is like ASP but is based upon XML - processing instructions, e.g. - */ - public Node parsePhp() - { - int c; - Node php = null; - - this.txtstart = this.lexsize; - - for (;;) - { - c = this.in.readChar(); - addCharToLexer(c); - - - if (c != '?') - continue; - - c = this.in.readChar(); - addCharToLexer(c); - - if (c == '>') - break; - } - - this.lexsize -= 2; - this.txtend = this.lexsize; - - if (this.txtend > this.txtstart) - php = newNode(Node.PhpTag, - this.lexbuf, - this.txtstart, - this.txtend); - - this.txtstart = this.txtend; - return php; - } - - /* consumes the '>' terminating start tags */ - public String parseAttribute(MutableBoolean isempty, MutableObject asp, - MutableObject php) - { - int start = 0; - // int len = 0; Removed by BUGFIX for 126265 - short map; - String attr; - int c = 0; - - asp.setObject(null); /* clear asp pointer */ - php.setObject(null); /* clear php pointer */ - /* skip white space before the attribute */ - - for (;;) - { - c = this.in.readChar(); - - if (c == '/') - { - c = this.in.readChar(); - - if (c == '>') - { - isempty.value = true; - return null; - } - - this.in.ungetChar(c); - c = '/'; - break; - } - - if (c == '>') - return null; - - if (c =='<') - { - c = 
this.in.readChar(); - - if (c == '%') - { - asp.setObject(parseAsp()); - return null; - } - else if (c == '?') - { - php.setObject(parsePhp()); - return null; - } - - this.in.ungetChar(c); - Report.attrError(this, this.token, null, Report.UNEXPECTED_GT); - return null; - } - - if (c == '"' || c == '\'') - { - Report.attrError(this, this.token, null, Report.UNEXPECTED_QUOTEMARK); - continue; - } - - if (c == StreamIn.EndOfStream) - { - Report.attrError(this, this.token, null, Report.UNEXPECTED_END_OF_FILE); - this.in.ungetChar(c); - return null; - } - - map = MAP((char)c); - - if ((map & WHITE) == 0) - break; - } - - start = this.lexsize; - - for (;;) - { - /* but push back '=' for parseValue() */ - if (c == '=' || c == '>') - { - this.in.ungetChar(c); - break; - } - - if (c == '<' || c == StreamIn.EndOfStream) - { - this.in.ungetChar(c); - break; - } - - map = MAP((char)c); - - if ((map & WHITE) != 0) - break; - - /* what should be done about non-namechar characters? */ - /* currently these are incorporated into the attr name */ - - if (!this.configuration.XmlTags && (map & UPPERCASE) != 0) - c += (int)('a' - 'A'); - - // ++len; Removed by BUGFIX for 126265 - addCharToLexer(c); - - c = this.in.readChar(); - } - - // Following line added by GLP to fix BUG 126265. This is a temporary comment - // and should be removed when Tidy is fixed. - int len = this.lexsize - start; - attr = (len > 0 ? getString(this.lexbuf, start, len) : null); - this.lexsize = start; - - return attr; - } - - /* - invoked when < is seen in place of attribute value - but terminates on whitespace if not ASP, PHP or Tango - this routine recognizes ' and " quoted strings - */ - public int parseServerInstruction() - { - int c, map, delim = '"'; - boolean isrule = false; - - c = this.in.readChar(); - addCharToLexer(c); - - /* check for ASP, PHP or Tango */ - if (c == '%' || c == '?' 
|| c == '@') - isrule = true; - - for (;;) - { - c = this.in.readChar(); - - if (c == StreamIn.EndOfStream) - break; - - if (c == '>') - { - if (isrule) - addCharToLexer(c); - else - this.in.ungetChar(c); - - break; - } - - /* if not recognized as ASP, PHP or Tango */ - /* then also finish value on whitespace */ - if (!isrule) - { - map = MAP((char)c); - - if ((map & WHITE) != 0) - break; - } - - addCharToLexer(c); - - if (c == '"') - { - do - { - c = this.in.readChar(); - addCharToLexer(c); - } - while (c != '"'); - delim = '\''; - continue; - } - - if (c == '\'') - { - do - { - c = this.in.readChar(); - addCharToLexer(c); - } - while (c != '\''); - } - } - - return delim; - } - - /* values start with "=" or " = " etc. */ - /* doesn't consume the ">" at end of start tag */ - - public String parseValue(String name, boolean foldCase, - MutableBoolean isempty, MutableInteger pdelim) - { - int len = 0; - int start; - short map; - boolean seen_gt = false; - boolean munge = true; - int c = 0; - int lastc, delim, quotewarning; - String value; - - delim = 0; - pdelim.value = (int)'"'; - - /* - Henry Zrepa reports that some folk are using the - embed element with script attributes where newlines - are significant and must be preserved - */ - if (configuration.LiteralAttribs) - munge = false; - - /* skip white space before the '=' */ - - for (;;) - { - c = this.in.readChar(); - - if (c == StreamIn.EndOfStream) - { - this.in.ungetChar(c); - break; - } - - map = MAP((char)c); - - if ((map & WHITE) == 0) - break; - } - - /* - c should be '=' if there is a value - other legal possibilities are white - space, '/' and '>' - */ - - if (c != '=') - { - this.in.ungetChar(c); - return null; - } - - /* skip white space after '=' */ - - for (;;) - { - c = this.in.readChar(); - - if (c == StreamIn.EndOfStream) - { - this.in.ungetChar(c); - break; - } - - map = MAP((char)c); - - if ((map & WHITE) == 0) - break; - } - - /* check for quote marks */ - - if (c == '"' || c == '\'') - delim = 
c; - else if (c == '<') - { - start = this.lexsize; - addCharToLexer(c); - pdelim.value = parseServerInstruction(); - len = this.lexsize - start; - this.lexsize = start; - return (len > 0 ? getString(this.lexbuf, start, len) : null); - } - else - this.in.ungetChar(c); - - /* - and read the value string - check for quote mark if needed - */ - - quotewarning = 0; - start = this.lexsize; - c = '\0'; - - for (;;) - { - lastc = c; /* track last character */ - c = this.in.readChar(); - - if (c == StreamIn.EndOfStream) - { - Report.attrError(this, this.token, null, Report.UNEXPECTED_END_OF_FILE); - this.in.ungetChar(c); - break; - } - - if (delim == (char)0) - { - if (c == '>') - { - this.in.ungetChar(c); - break; - } - - if (c == '"' || c == '\'') - { - Report.attrError(this, this.token, null, Report.UNEXPECTED_QUOTEMARK); - break; - } - - if (c == '<') - { - /* this.in.ungetChar(c); */ - Report.attrError(this, this.token, null, Report.UNEXPECTED_GT); - /* break; */ - } - - /* - For cases like
need to avoid treating /> as - part of the attribute value, however care is needed to avoid - so treating
in this way, which - would map the tag to - */ - if (c == '/') - { - /* peek ahead in case of /> */ - c = this.in.readChar(); - - if (c == '>' && - !AttributeTable.getDefaultAttributeTable().isUrl(name)) - { - isempty.value = true; - this.in.ungetChar(c); - break; - } - - /* unget peeked char */ - this.in.ungetChar(c); - c = '/'; - } - } - else /* delim is '\'' or '"' */ - { - if (c == delim) - break; - - /* treat CRLF, CR and LF as single line break */ - - if (c == '\r') - { - c = this.in.readChar(); - if (c != '\n') - this.in.ungetChar(c); - - c = '\n'; - } - - if (c == '\n' || c == '<' || c == '>') - ++quotewarning; - - if (c == '>') - seen_gt = true; - } - - if (c == '&') - { - addCharToLexer(c); - parseEntity((short)0); - continue; - } - - /* - kludge for JavaScript attribute values - with line continuations in string literals - */ - if (c == '\\') - { - c = this.in.readChar(); - - if (c != '\n') - { - this.in.ungetChar(c); - c = '\\'; - } - } - - map = MAP((char)c); - - if ((map & WHITE) != 0) - { - if (delim == (char)0) - break; - - if (munge) - { - c = ' '; - - if (lastc == ' ') - continue; - } - } - else if (foldCase && (map & UPPERCASE) != 0) - c += (int)('a' - 'A'); - - addCharToLexer(c); - } - - if (quotewarning > 10 && seen_gt && munge) - { - /* - there is almost certainly a missing trailling quote mark - as we have see too many newlines, < or > characters. 
- - an exception is made for Javascript attributes and the - javascript URL scheme which may legitimately include < and > - */ - if (!AttributeTable.getDefaultAttributeTable().isScript(name) && - !(AttributeTable.getDefaultAttributeTable().isUrl(name) && - (getString(this.lexbuf, start, 11)).equals("javascript:"))) - Report.error(this, null, null, Report.SUSPECTED_MISSING_QUOTE); - } - - len = this.lexsize - start; - this.lexsize = start; - - if (len > 0 || delim != 0) - value = getString(this.lexbuf, start, len); - else - value = null; - - /* note delimiter if given */ - if (delim != 0) - pdelim.value = delim; - else - pdelim.value = (int)'"'; - - return value; - } - - /* attr must be non-null */ - public static boolean isValidAttrName(String attr) - { - short map; - char c; - int i; - - /* first character should be a letter */ - c = attr.charAt(0); - map = MAP(c); - - if (!((map & LETTER) != 0)) - return false; - - /* remaining characters should be namechars */ - for( i = 1; i < attr.length(); i++) - { - c = attr.charAt(i); - map = MAP(c); - - if((map & NAMECHAR) != 0) - continue; - - return false; - } - - return true; - } - - /* swallows closing '>' */ - - public AttVal parseAttrs(MutableBoolean isempty) - { - AttVal av, list; - String attribute, value; - MutableInteger delim = new MutableInteger(); - MutableObject asp = new MutableObject(); - MutableObject php = new MutableObject(); - - list = null; - - for (; !endOfInput();) - { - attribute = parseAttribute(isempty, asp, php); - - if (attribute == null) - { - /* check if attributes are created by ASP markup */ - if (asp.getObject() != null) - { - av = new AttVal(list, null, (Node)asp.getObject(), null, - '\0', null, null ); - list = av; - continue; - } - - /* check if attributes are created by PHP markup */ - if (php.getObject() != null) - { - av = new AttVal(list, null, null, (Node)php.getObject(), - '\0', null, null ); - list = av; - continue; - } - - break; - } - - value = parseValue(attribute, false, 
isempty, delim); - - if (attribute != null && isValidAttrName(attribute)) - { - av = new AttVal( list, null, null, null, - delim.value, attribute, value ); - av.dict = - AttributeTable.getDefaultAttributeTable().findAttribute(av); - list = av; - } - else - { - av = new AttVal( null, null, null, null, - 0, attribute, value ); - Report.attrError(this, this.token, value, Report.BAD_ATTRIBUTE_VALUE); - } - } - - return list; - } - - /* - push a copy of an inline node onto stack - but don't push if implicit or OBJECT or APPLET - (implicit tags are ones generated from the istack) - - One issue arises with pushing inlines when - the tag is already pushed. For instance: - -

text -

more text - - Shouldn't be mapped to - -

text

-

more text - */ - public void pushInline( Node node ) - { - IStack is; - - if (node.implicit) - return; - - if (node.tag == null) - return; - - if ((node.tag.model & Dict.CM_INLINE) == 0 ) - return; - - if ((node.tag.model & Dict.CM_OBJECT) != 0) - return; - - if (node.tag != configuration.tt.tagFont && isPushed(node)) - return; - - // make sure there is enough space for the stack - is = new IStack(); - is.tag = node.tag; - is.element = node.element; - if (node.attributes != null) - is.attributes = cloneAttributes(node.attributes); - this.istack.push( is ); - } - - /* pop inline stack */ - public void popInline( Node node ) - { - AttVal av; - IStack is; - - if (node != null) { - - if (node.tag == null) - return; - - if ((node.tag.model & Dict.CM_INLINE) == 0) - return; - - if ((node.tag.model & Dict.CM_OBJECT) != 0) - return; - - // if node is then pop until we find an - if (node.tag == configuration.tt.tagA) { - - while (this.istack.size() > 0) { - is = (IStack)this.istack.pop(); - if (is.tag == configuration.tt.tagA) { - break; - } - } - - if (this.insert >= this.istack.size()) - this.insert = -1; - return; - } - } - - if (this.istack.size() > 0) { - is = (IStack)this.istack.pop(); - if (this.insert >= this.istack.size()) - this.insert = -1; - } - } - - public boolean isPushed( Node node ) - { - int i; - IStack is; - - for (i = this.istack.size() - 1; i >= 0; --i) { - is = (IStack)this.istack.elementAt(i); - if (is.tag == node.tag) - return true; - } - - return false; - } - - /* - This has the effect of inserting "missing" inline - elements around the contents of blocklevel elements - such as P, TD, TH, DIV, PRE etc. This procedure is - called at the start of ParseBlock. when the inline - stack is not empty, as will be the case in: - -

italic heading

- - which is then treated as equivalent to - -

italic heading

- - This is implemented by setting the lexer into a mode - where it gets tokens from the inline stack rather than - from the input stream. - */ - public int inlineDup( Node node ) - { - int n; - - n = this.istack.size() - this.istackbase; - if ( n > 0 ) { - this.insert = this.istackbase; - this.inode = node; - } - - return n; - } - - public Node insertedToken() - { - Node node; - IStack is; - int n; - - // this will only be null if inode != null - if (this.insert == -1) { - node = this.inode; - this.inode = null; - return node; - } - - // is this is the "latest" node then update - // the position, otherwise use current values - - if (this.inode == null) { - this.lines = this.in.curline; - this.columns = this.in.curcol; - } - - node = newNode(Node.StartTag, - this.lexbuf, - this.txtstart, - this.txtend); // GLP: Bugfix 126261. Remove when this change - // is fixed in istack.c in the original Tidy - node.implicit = true; - is = (IStack)this.istack.elementAt( this.insert ); - node.element = is.element; - node.tag = is.tag; - if (is.attributes != null) - node.attributes = cloneAttributes(is.attributes); - - // advance lexer to next item on the stack - n = this.insert; - - // and recover state if we have reached the end - if (++n < this.istack.size() ) { - this.insert = n; - } else { - this.insert = -1; - } - - return node; - } - - /* AQ: Try this for speed optimization */ - public static int wstrcasecmp(String s1, String s2) - { - return (s1.equalsIgnoreCase(s2) ? 0 : 1); - } - - public static int wstrcaselexcmp(String s1, String s2) - { - char c; - int i = 0; - - while ( i < s1.length() && i < s2.length() ) { - c = s1.charAt(i); - if ( toLower(c) != toLower( s2.charAt(i) ) ) { - break; - } - i += 1; - } - if ( i == s1.length() && i == s2.length() ) { - return 0; - } else if ( i == s1.length() ) { - return -1; - } else if ( i == s2.length() ) { - return 1; - } else { - return ( s1.charAt(i) > s2.charAt(i) ? 
1 : -1 ); - } - } - - public static boolean wsubstr(String s1, String s2) - { - int i; - int len1 = s1.length(); - int len2 = s2.length(); - - for (i = 0; i <= len1 - len2; ++i) - { - if (s2.equalsIgnoreCase(s1.substring(i))) - return true; - } - - return false; - } - - public boolean canPrune(Node element) - { - if (element.type == Node.TextNode) - return true; - - if (element.content != null) - return false; - - if (element.tag == configuration.tt.tagA && element.attributes != null) - return false; - - if (element.tag == configuration.tt.tagP && !this.configuration.DropEmptyParas) - return false; - - if (element.tag == null) - return false; - - if ((element.tag.model & Dict.CM_ROW) != 0) - return false; - - if (element.tag == configuration.tt.tagApplet) - return false; - - if (element.tag == configuration.tt.tagObject) - return false; - - if (element.attributes != null && - (element.getAttrByName("id") != null || - element.getAttrByName("name") != null) ) - return false; - - return true; - } - - /* duplicate name attribute as an id */ - public void fixId(Node node) - { - AttVal name = node.getAttrByName("name"); - AttVal id = node.getAttrByName("id"); - - if (name != null) - { - if (id != null) - { - if (!id.value.equals(name.value)) - Report.attrError(this, node, "name", Report.ID_NAME_MISMATCH); - } - else if (this.configuration.XmlOut) - node.addAttribute("id", name.value); - } - } - - /* - defer duplicates when entering a table or other - element where the inlines shouldn't be duplicated - */ - public void deferDup() - { - this.insert = -1; - this.inode = null; - } - - /* Private methods and fields */ - - /* lexer char types */ - private static final short DIGIT = 1; - private static final short LETTER = 2; - private static final short NAMECHAR = 4; - private static final short WHITE = 8; - private static final short NEWLINE = 16; - private static final short LOWERCASE = 32; - private static final short UPPERCASE = 64; - - /* lexer GetToken states */ - - 
private static final short LEX_CONTENT = 0; - private static final short LEX_GT = 1; - private static final short LEX_ENDTAG = 2; - private static final short LEX_STARTTAG = 3; - private static final short LEX_COMMENT = 4; - private static final short LEX_DOCTYPE = 5; - private static final short LEX_PROCINSTR = 6; - private static final short LEX_ENDCOMMENT = 7; - private static final short LEX_CDATA = 8; - private static final short LEX_SECTION = 9; - private static final short LEX_ASP = 10; - private static final short LEX_JSTE = 11; - private static final short LEX_PHP = 12; - - /* used to classify chars for lexical purposes */ - private static short[] lexmap = new short[128]; - - private static void mapStr(String str, short code) - { - int j; - - for ( int i = 0; i < str.length(); i++ ) { - j = (int)str.charAt(i); - lexmap[j] |= code; - } - } - - static { - mapStr("\r\n\f", (short)(NEWLINE|WHITE)); - mapStr(" \t", WHITE); - mapStr("-.:_", NAMECHAR); - mapStr("0123456789", (short)(DIGIT|NAMECHAR)); - mapStr("abcdefghijklmnopqrstuvwxyz", (short)(LOWERCASE|LETTER|NAMECHAR)); - mapStr("ABCDEFGHIJKLMNOPQRSTUVWXYZ", (short)(UPPERCASE|LETTER|NAMECHAR)); - } - - private static short MAP( char c ) - { - return ((int)c < 128 ? 
lexmap[(int)c] : 0); - } - - private static boolean isWhite(char c) - { - short m = MAP(c); - - return (m & WHITE) != 0; - } - - private static boolean isDigit(char c) - { - short m; - - m = MAP(c); - - return (m & DIGIT) != 0; - } - - private static boolean isLetter(char c) - { - short m; - - m = MAP(c); - - return (m & LETTER) != 0; - } - - private static char toLower(char c) - { - short m = MAP(c); - - if ((m & UPPERCASE) != 0) - c = (char)( (int)c + (int)'a' - (int)'A' ); - - return c; - } - - private static char toUpper(char c) - { - short m = MAP(c); - - if ((m & LOWERCASE) != 0) - c = (char)( (int)c + (int)'A' - (int)'a' ); - - return c; - } - - public static char foldCase(char c, boolean tocaps, boolean xmlTags) - { - short m; - - if (!xmlTags) - { - m = MAP(c); - - if (tocaps) - { - if ((m & LOWERCASE) != 0) - c = (char)( (int)c + (int)'A' - (int)'a' ); - } - else /* force to lower case */ - { - if ((m & UPPERCASE) != 0) - c = (char)( (int)c + (int)'a' - (int)'A' ); - } - } - - return c; - } - - - private static class W3CVersionInfo - { - String name; - String voyagerName; - String profile; - short code; - - public W3CVersionInfo( String name, - String voyagerName, - String profile, - short code ) - { - this.name = name; - this.voyagerName = voyagerName; - this.profile = profile; - this.code = code; - } - } - - /* the 3 URIs for the XHTML 1.0 DTDs */ - private static final String voyager_loose = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"; - private static final String voyager_strict = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"; - private static final String voyager_frameset = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd"; - - private static final String XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"; - - private static Lexer.W3CVersionInfo[] W3CVersion = - { - new W3CVersionInfo("HTML 4.01", - "XHTML 1.0 Strict", - voyager_strict, - Dict.VERS_HTML40_STRICT), - new W3CVersionInfo("HTML 4.01 Transitional", - "XHTML 1.0 
Transitional", - voyager_loose, - Dict.VERS_HTML40_LOOSE), - new W3CVersionInfo("HTML 4.01 Frameset", - "XHTML 1.0 Frameset", - voyager_frameset, - Dict.VERS_FRAMES), - new W3CVersionInfo("HTML 4.0", - "XHTML 1.0 Strict", - voyager_strict, - Dict.VERS_HTML40_STRICT), - new W3CVersionInfo("HTML 4.0 Transitional", - "XHTML 1.0 Transitional", - voyager_loose, - Dict.VERS_HTML40_LOOSE), - new W3CVersionInfo("HTML 4.0 Frameset", - "XHTML 1.0 Frameset", - voyager_frameset, - Dict.VERS_FRAMES), - new W3CVersionInfo("HTML 3.2", - "XHTML 1.0 Transitional", - voyager_loose, - Dict.VERS_HTML32), - new W3CVersionInfo("HTML 2.0", - "XHTML 1.0 Strict", - voyager_strict, - Dict.VERS_HTML20) - }; - -}