(translation to Java)
+ * @version 1.0, 1999/05/22
+ * @version 1.0.1, 1999/05/29
+ * @version 1.1, 1999/06/18 Java Bean
+ * @version 1.2, 1999/07/10 Tidy Release 7 Jul 1999
+ * @version 1.3, 1999/07/30 Tidy Release 26 Jul 1999
+ * @version 1.4, 1999/09/04 DOM support
+ * @version 1.5, 1999/10/23 Tidy Release 27 Sep 1999
+ * @version 1.6, 1999/11/01 Tidy Release 22 Oct 1999
+ * @version 1.7, 1999/12/06 Tidy Release 30 Nov 1999
+ * @version 1.8, 2000/01/22 Tidy Release 13 Jan 2000
+ * @version 1.9, 2000/06/03 Tidy Release 30 Apr 2000
+ * @version 1.10, 2000/07/22 Tidy Release 8 Jul 2000
+ * @version 1.11, 2000/08/16 Tidy Release 4 Aug 2000
+ */
+
+/*
+ Given an input stream (StreamIn) it returns a sequence of tokens.
+
+ getToken(mode) gets the next token
+ ungetToken() provides one level undo
+
+ The tags include an attribute list:
+
+ - linked list of attribute/value nodes
+ - each node has 2 null-terminated strings.
+ - entities are replaced in attribute values
+
+ White space is compacted unless in preformatted mode:
+ leading white space is discarded and each subsequent
+ run of white space is collapsed to a single space
+ character.
+
+ If XmlTags is no then tag names are folded to lower
+ case (see parseTagName), as are attribute names.
+
+ Not yet done:
+ - Doctype subset and marked sections
+*/
+
+import java.io.PrintWriter;
+import java.util.Stack;
+import java.util.Vector;
+
+import org.eclipse.core.resources.IFile;
+import sun.security.krb5.internal.av;
+
+public class Lexer {
+
+ private IFile iFile; /* file handle passed to the constructor; see getIFile() */
+ public StreamIn in; /* input stream the characters are read from */
+ public PrintWriter errout; /* error output stream */
+ public short badAccess; /* for accessibility errors */
+ public short badLayout; /* for bad style errors */
+ public short badChars; /* for bad char encodings */
+ public short badForm; /* for mismatched/mispositioned form tags */
+ public short warnings; /* count of warnings in this document */
+ public short errors; /* count of errors */
+ public int lines; /* lines seen */
+ public int columns; /* at start of current token */
+ public boolean waswhite; /* used to collapse contiguous white space */
+ public boolean pushed; /* true after token has been pushed back */
+ public boolean insertspace; /* when space is moved after end tag */
+ public boolean excludeBlocks; /* Netscape compatibility */
+ public boolean exiled; /* true if moved out of table */
+ public boolean isvoyager; /* true if xmlns attribute on html element */
+ public short versions; /* bit vector of HTML versions */
+ public int doctype; /* version as given by doctype (if any) */
+ public boolean badDoctype; /* e.g. if html or PUBLIC is missing */
+ public int txtstart; /* start of current node */
+ public int txtend; /* end of current node */
+ public short state; /* state of lexer's finite state machine */
+ public Node token; /* most recent token; handed out again by getToken() after ungetToken() */
+
+ /*
+ lexer character buffer
+
+ parse tree nodes span onto this buffer
+ which contains the concatenated text
+ contents of all of the elements.
+
+ The buffer is replaced by a larger array when it fills up
+ (see addByte), so nodes must never cache it independently
+ of nodeList below.
+
+ lexsize must be reset for each file.
+ */
+ public byte[] lexbuf; /* byte buffer of UTF-8 chars */
+ public int lexlength; /* allocated */
+ public int lexsize; /* used */
+
+ /* Inline stack for compatibility with Mosaic */
+ public Node inode; /* for deferring text node */
+ public int insert; /* for inferring inline tags; -1 when nothing is pending */
+ public Stack istack;
+ public int istackbase; /* start of frame */
+
+ public Style styles; /* used for cleaning up presentation markup */
+
+ public Configuration configuration; /* settings supplied to the constructor */
+ protected int seenBodyEndTag; /* used by parser */
+ private Vector nodeList; /* nodes registered by newNode()/cloneNode() so updateNodeTextArrays() can retarget them */
+
+    /**
+     * Creates a lexer over the given input. The lexer starts at line 1,
+     * column 1 in the LEX_CONTENT state with an unallocated character
+     * buffer, no pending token, and every diagnostic counter at zero.
+     *
+     * @param iFile         file handle later returned by getIFile()
+     * @param in            stream the characters are read from
+     * @param configuration settings consulted by the lexer
+     */
+    public Lexer(IFile iFile, StreamIn in, Configuration configuration)
+    {
+        /* collaborators supplied by the caller */
+        this.iFile = iFile;
+        this.in = in;
+        this.configuration = configuration;
+
+        /* position and state machine */
+        lines = 1;
+        columns = 1;
+        state = LEX_CONTENT;
+
+        /* diagnostic counters */
+        badAccess = badLayout = badChars = badForm = warnings = errors = 0;
+
+        /* token flags */
+        waswhite = pushed = insertspace = exiled = isvoyager = badDoctype = false;
+
+        /* document version tracking */
+        versions = Dict.VERS_EVERYTHING;
+        doctype = Dict.VERS_UNKNOWN;
+
+        /* character buffer and current token span */
+        lexbuf = null;
+        lexlength = lexsize = 0;
+        txtstart = txtend = 0;
+        token = null;
+
+        /* inline stack */
+        inode = null;
+        insert = -1;
+        istack = new Stack();
+        istackbase = 0;
+
+        styles = null;
+        seenBodyEndTag = 0;
+        nodeList = new Vector();
+    }
+
+ public IFile getIFile() {
+ return iFile;
+ }
+
+    /**
+     * Creates an empty node and registers it in nodeList so that its
+     * text array can be retargeted when the lexer buffer is reallocated.
+     *
+     * @return the newly created node
+     */
+    public Node newNode()
+    {
+        final Node created = new Node();
+        this.nodeList.addElement(created);
+        return created;
+    }
+
+    /**
+     * Creates a node of the given type spanning textarray[start..end) and
+     * registers it in nodeList.
+     *
+     * @param type      node type constant
+     * @param textarray buffer holding the node's text
+     * @param start     offset of the first byte of the node's text
+     * @param end       offset just past the node's text
+     * @return the newly created node
+     */
+    public Node newNode(short type, byte[] textarray, int start, int end)
+    {
+        final Node created = new Node(type, textarray, start, end);
+        this.nodeList.addElement(created);
+        return created;
+    }
+
+    /**
+     * Creates a node for the named element, passing the configured tag
+     * table to the Node constructor, and registers it in nodeList.
+     *
+     * @param type      node type constant
+     * @param textarray buffer holding the node's text
+     * @param start     offset of the first byte of the node's text
+     * @param end       offset just past the node's text
+     * @param element   element name
+     * @return the newly created node
+     */
+    public Node newNode(short type, byte[] textarray, int start, int end, String element)
+    {
+        final Node created =
+            new Node(type, textarray, start, end, element, this.configuration.tt);
+        this.nodeList.addElement(created);
+        return created;
+    }
+
+    /**
+     * Clones a node and registers the copy, together with any asp/php
+     * nodes hanging off the copy's attributes, in nodeList.
+     *
+     * @param node node to clone
+     * @return the registered copy
+     */
+    public Node cloneNode(Node node)
+    {
+        final Node copy = (Node) node.clone();
+        this.nodeList.addElement(copy);
+
+        AttVal attr = copy.attributes;
+        while (attr != null)
+        {
+            if (attr.asp != null)
+            {
+                this.nodeList.addElement(attr.asp);
+            }
+            if (attr.php != null)
+            {
+                this.nodeList.addElement(attr.php);
+            }
+            attr = attr.next;
+        }
+
+        return copy;
+    }
+
+    /**
+     * Clones an attribute list and registers the asp/php nodes of the
+     * copied attributes in nodeList so their text arrays follow the
+     * lexer buffer when it is reallocated.
+     *
+     * @param attrs head of the attribute list; may be null (empty list)
+     * @return head of the cloned list, or null when attrs is null
+     */
+    public AttVal cloneAttributes(AttVal attrs)
+    {
+        /* an empty attribute list is represented by null: nothing to copy */
+        if (attrs == null)
+        {
+            return null;
+        }
+
+        AttVal cattrs = (AttVal) attrs.clone();
+        for (AttVal att = cattrs; att != null; att = att.next)
+        {
+            if (att.asp != null)
+            {
+                nodeList.addElement(att.asp);
+            }
+            if (att.php != null)
+            {
+                nodeList.addElement(att.php);
+            }
+        }
+        return cattrs;
+    }
+
+ protected void updateNodeTextArrays(byte[] oldtextarray, byte[] newtextarray)
+ {
+ Node node;
+ for (int i = 0; i < nodeList.size(); i++) {
+ node = (Node)(nodeList.elementAt(i));
+ if (node.textarray == oldtextarray)
+ node.textarray = newtextarray;
+ }
+ }
+
+    /**
+     * Creates a node whose text is a single newline appended to the lexer
+     * buffer. Used for creating preformatted text from Word2000.
+     *
+     * @return a registered node spanning the appended '\n'
+     */
+    public Node newLineNode()
+    {
+        Node node = newNode();
+
+        node.start = this.lexsize;
+        addCharToLexer((int)'\n');
+        node.end = this.lexsize;
+
+        /*
+         * Bind the buffer only after the append: addByte() may allocate
+         * lexbuf for the first time, and a node that captured a null
+         * buffer beforehand would never be retargeted.
+         */
+        node.textarray = this.lexbuf;
+        return node;
+    }
+
+    /*
+     * Conversion to and from UTF-8 should always be possible, so encoding
+     * exceptions are converted to an Error to avoid adding throws
+     * declarations to lots of methods.
+     */
+
+    /**
+     * Encodes a string as UTF-8.
+     *
+     * @param str string to encode
+     * @return the UTF-8 bytes of str
+     */
+    public static byte[] getBytes(String str)
+    {
+        final byte[] encoded;
+        try
+        {
+            encoded = str.getBytes("UTF8");
+        }
+        catch (java.io.UnsupportedEncodingException e)
+        {
+            throw new Error("string to UTF-8 conversion failed: " + e.getMessage());
+        }
+        return encoded;
+    }
+
+    /**
+     * Decodes a run of UTF-8 bytes into a string. An unsupported-encoding
+     * failure is rethrown as an Error.
+     *
+     * @param bytes  buffer holding UTF-8 data
+     * @param offset index of the first byte to decode
+     * @param length number of bytes to decode
+     * @return the decoded text
+     */
+    public static String getString(byte[] bytes, int offset, int length)
+    {
+        final String decoded;
+        try
+        {
+            decoded = new String(bytes, offset, length, "UTF8");
+        }
+        catch (java.io.UnsupportedEncodingException e)
+        {
+            throw new Error("UTF-8 to string conversion failed: " + e.getMessage());
+        }
+        return decoded;
+    }
+
+    /**
+     * @return whatever the input stream reports from isEndOfStream()
+     */
+    public boolean endOfInput()
+    {
+        final boolean exhausted = this.in.isEndOfStream();
+        return exhausted;
+    }
+
+ public void addByte(int c)
+ {
+ if (this.lexsize + 1 >= this.lexlength)
+ {
+ while (this.lexsize + 1 >= this.lexlength)
+ {
+ if (this.lexlength == 0)
+ this.lexlength = 8192;
+ else
+ this.lexlength = this.lexlength * 2;
+ }
+
+ byte[] temp = this.lexbuf;
+ this.lexbuf = new byte[ this.lexlength ];
+ if (temp != null)
+ {
+ System.arraycopy( temp, 0, this.lexbuf, 0, temp.length );
+ updateNodeTextArrays(temp, this.lexbuf);
+ }
+ }
+
+ this.lexbuf[this.lexsize++] = (byte)c;
+ this.lexbuf[this.lexsize] = (byte)'\0'; /* debug */
+ }
+
+    /**
+     * Overwrites the most recently appended byte of the lexer buffer.
+     * Does nothing when the buffer is empty.
+     *
+     * @param c replacement byte
+     */
+    public void changeChar(byte c)
+    {
+        if (this.lexsize <= 0)
+        {
+            return;
+        }
+
+        final int last = this.lexsize - 1;
+        this.lexbuf[last] = c;
+    }
+
+    /**
+     * Stores char c in the lexer buffer as a UTF-8 style byte sequence:
+     * one byte below 128, otherwise a lead byte followed by one to four
+     * continuation bytes carrying six bits each.
+     *
+     * @param c character code to append
+     */
+    public void addCharToLexer(int c)
+    {
+        if (c < 128)
+        {
+            addByte(c);
+            return;
+        }
+
+        final int leadMarker;   /* high bits of the first byte */
+        final int trailing;     /* number of continuation bytes */
+
+        if (c <= 0x7FF)
+        {
+            leadMarker = 0xC0;
+            trailing = 1;
+        }
+        else if (c <= 0xFFFF)
+        {
+            leadMarker = 0xE0;
+            trailing = 2;
+        }
+        else if (c <= 0x1FFFFF)
+        {
+            leadMarker = 0xF0;
+            trailing = 3;
+        }
+        else
+        {
+            leadMarker = 0xF8;
+            trailing = 4;
+        }
+
+        addByte(leadMarker | (c >> (6 * trailing)));
+
+        for (int shift = 6 * (trailing - 1); shift >= 0; shift -= 6)
+        {
+            addByte(0x80 | ((c >> shift) & 0x3F));
+        }
+    }
+
+ public void addStringToLexer(String str)
+ {
+ for ( int i = 0; i < str.length(); i++ ) {
+ addCharToLexer( (int)str.charAt(i) );
+ }
+ }
+
+ /*
+ Parses an entity reference whose leading '&' has already
+ been appended to lexbuf. The reference name is appended
+ while it is scanned; a recognized entity is then replaced
+ in the buffer by the encoding of its character code, an
+ unrecognized one is reported and left in place.
+
+ No longer attempts to insert missing ';' for unknown
+ entities unless one was present already, since this
+ gives unexpected results.
+
+ For example, an unknown and unterminated reference such
+ as the "&bar" in "?foo&bar" is left without a trailing
+ ';' rather than being rewritten to "&bar;".
+
+ My thanks to Maurice Buxton for spotting this.
+
+ mode is the getToken() mode; it is only consulted to turn
+ a non-breaking space (160) into a plain space when the
+ Preformatted bit is set.
+ */
+ public void parseEntity(short mode)
+ {
+ short map;
+ int start;
+ boolean first = true; /* true until the first char after '&' has been examined */
+ boolean semicolon = false; /* set when the reference was terminated by ';' */
+ boolean numeric = false; /* set when the reference starts with "&#" */
+ int c, ch, startcol;
+ String str;
+
+ start = this.lexsize - 1; /* to start at "&" */
+ startcol = this.in.curcol - 1; /* column reported with any entity error */
+
+ while (true)
+ {
+ c = this.in.readChar();
+ if (c == StreamIn.EndOfStream) break;
+ if (c == ';')
+ {
+ semicolon = true; /* the ';' itself is not appended here */
+ break;
+ }
+
+ if (first && c == '#')
+ {
+ addCharToLexer(c);
+ first = false;
+ numeric = true;
+ continue;
+ }
+
+ first = false;
+ map = MAP((char)c);
+
+ /* AQ: Added flag for numeric entities so that numeric entities
+ with missing semi-colons are recognized.
+ E.g. in "&#123abc" the scan stops after "&#123" and "abc"
+ is left in the input.
+ NOTE(review): 'x' is accepted at any position, and the hex
+ digits a-f only if MAP flags them as DIGIT -- confirm that
+ hexadecimal references are handled as intended.
+ */
+ if (numeric && ((c == 'x') || ((map & DIGIT) != 0)))
+ {
+ addCharToLexer(c);
+ continue;
+ }
+ if (!numeric && ((map & NAMECHAR) != 0))
+ {
+ addCharToLexer(c);
+ continue;
+ }
+
+ /* otherwise put it back */
+
+ this.in.ungetChar(c);
+ break;
+ }
+
+ str = getString( this.lexbuf, start, this.lexsize - start ); /* "&name" without any ';' */
+ ch = EntityTable.getDefaultEntityTable().entityCode( str );
+
+ /* deal with unrecognized entities */
+ if (ch <= 0)
+ {
+ /* set error position just before offending character */
+ this.lines = this.in.curline;
+ this.columns = startcol;
+
+ if (this.lexsize > start +1 )
+ {
+ Report.entityError(this, Report.UNKNOWN_ENTITY, str, ch);
+
+ if (semicolon) /* keep a ';' only if the source had one */
+ addCharToLexer(';');
+ }
+ else /* naked & */
+ {
+ Report.entityError(this, Report.UNESCAPED_AMPERSAND, str, ch);
+ }
+ }
+ else
+ {
+ if (c != ';') /* issue warning if not terminated by ';' */
+ {
+ /* set error position just before offending character */
+ this.lines = this.in.curline;
+ this.columns = startcol;
+ Report.entityError(this, Report.MISSING_SEMICOLON, str, c);
+ }
+
+ this.lexsize = start; /* drop the "&name" text; the decoded char replaces it */
+
+ if (ch == 160 && (mode & Preformatted) != 0) /* non-breaking space in preformatted text */
+ ch = ' ';
+
+ addCharToLexer(ch);
+
+ if (ch == '&' && !this.configuration.QuoteAmpersand) /* store "&amp;" literally when QuoteAmpersand is off */
+ {
+ addCharToLexer('a');
+ addCharToLexer('m');
+ addCharToLexer('p');
+ addCharToLexer(';');
+ }
+ }
+ }
+
+    /**
+     * Reads the rest of a tag name from the input. The first char of the
+     * name is expected at lexbuf[txtstart]; subsequent name chars are
+     * appended to the buffer. Unless XmlTags is set, upper case letters
+     * are folded to lower case. On return txtend marks the end of the
+     * name.
+     *
+     * @return the char that ended the name (the last value read, cast to
+     *         char, which includes the end-of-stream marker)
+     */
+    public char parseTagName()
+    {
+        final int toLower = (int)'a' - (int)'A';
+        short flags;
+        int ch;
+
+        /* fold case of the char already in the buffer */
+        ch = this.lexbuf[this.txtstart];
+        flags = MAP((char)ch);
+        if (!this.configuration.XmlTags && (flags & UPPERCASE) != 0)
+        {
+            this.lexbuf[this.txtstart] = (byte)(ch + toLower);
+        }
+
+        for (;;)
+        {
+            ch = this.in.readChar();
+            if (ch == StreamIn.EndOfStream)
+            {
+                break;
+            }
+
+            flags = MAP((char)ch);
+            if ((flags & NAMECHAR) == 0)
+            {
+                break;
+            }
+
+            /* fold case of subsequent chars */
+            if (!this.configuration.XmlTags && (flags & UPPERCASE) != 0)
+            {
+                ch += toLower;
+            }
+
+            addCharToLexer(ch);
+        }
+
+        this.txtend = this.lexsize;
+        return (char)ch;
+    }
+
+    /**
+     * Appends each char of a literal string to the lexer buffer via
+     * addCharToLexer().
+     *
+     * @param str literal text to append
+     */
+    public void addStringLiteral(String str)
+    {
+        final char[] chars = str.toCharArray();
+        for (int k = 0; k < chars.length; k++)
+        {
+            addCharToLexer( (int)chars[k] );
+        }
+    }
+
+    /**
+     * Chooses what version to use for a new doctype: the first of
+     * HTML 2.0, HTML 3.2, HTML 4.0 strict, HTML 4.0 loose and frameset
+     * whose bit is still set in the versions bit vector.
+     *
+     * @return the chosen version constant, or Dict.VERS_UNKNOWN if none
+     *         of the bits is set
+     */
+    public short HTMLVersion()
+    {
+        /* candidates in order of preference */
+        final short[] preference = {
+            Dict.VERS_HTML20,
+            Dict.VERS_HTML32,
+            Dict.VERS_HTML40_STRICT,
+            Dict.VERS_HTML40_LOOSE,
+            Dict.VERS_FRAMES
+        };
+
+        for (int k = 0; k < preference.length; k++)
+        {
+            if ((this.versions & preference[k]) != 0)
+            {
+                return preference[k];
+            }
+        }
+
+        return Dict.VERS_UNKNOWN;
+    }
+
+    /**
+     * Looks up the display name for the version returned by
+     * apparentVersion().
+     *
+     * @return the voyager name when isvoyager is set, otherwise the plain
+     *         name of the matching W3CVersion entry; null when no entry
+     *         matches
+     */
+    public String HTMLVersionName()
+    {
+        final short version = apparentVersion();
+
+        for (int k = 0; k < W3CVersion.length; ++k)
+        {
+            if (version != W3CVersion[k].code)
+            {
+                continue;
+            }
+
+            return this.isvoyager ? W3CVersion[k].voyagerName
+                                  : W3CVersion[k].name;
+        }
+
+        return null;
+    }
+
+    /**
+     * Adds a meta element naming Tidy as the generator to the start of
+     * the document head, unless the head already holds a generator meta
+     * whose content starts with "HTML Tidy".
+     *
+     * @param root root of the parse tree
+     * @return true when the meta element was inserted; false when there
+     *         is no head or a Tidy generator meta is already present
+     */
+    public boolean addGenerator(Node root)
+    {
+        final Node head = root.findHEAD(configuration.tt);
+        if (head == null)
+        {
+            return false;
+        }
+
+        for (Node child = head.content; child != null; child = child.next)
+        {
+            if (child.tag != configuration.tt.tagMeta)
+            {
+                continue;
+            }
+
+            final AttVal nameAttr = child.getAttrByName("name");
+            if (nameAttr == null || nameAttr.value == null ||
+                Lexer.wstrcasecmp(nameAttr.value, "generator") != 0)
+            {
+                continue;
+            }
+
+            final AttVal contentAttr = child.getAttrByName("content");
+            if (contentAttr != null && contentAttr.value != null &&
+                contentAttr.value.length() >= 9 &&
+                Lexer.wstrcasecmp(contentAttr.value.substring(0, 9), "HTML Tidy") == 0)
+            {
+                return false;
+            }
+        }
+
+        final Node meta = this.inferredTag("meta");
+        meta.addAttribute("content", "HTML Tidy, see www.w3.org");
+        meta.addAttribute("name", "generator");
+        Node.insertNodeAtStart(head, meta);
+        return true;
+    }
+
+ /* return true if substring s is in p and isn't all in upper case */
+ /* this is used to check the case of SYSTEM, PUBLIC, DTD and EN */
+ /* len is how many chars to check in p */
+ private static boolean findBadSubString(String s, String p, int len)
+ {
+ int n = s.length();
+ int i = 0;
+ String ps;
+
+ while (n < len)
+ {
+ ps = p.substring(i, i + n);
+ if (wstrcasecmp(s, ps) == 0)
+ return (!ps.equals(s.substring(0, n)));
+
+ ++i;
+ --len;
+ }
+
+ return false;
+ }
+
+    /**
+     * Checks the spelling of the keywords inside a doctype declaration.
+     *
+     * @param doctype doctype node whose text lies in lexbuf
+     * @return false if any of SYSTEM, PUBLIC, //DTD, //W3C or //EN occurs
+     *         with different letter case; true otherwise
+     */
+    public boolean checkDocTypeKeyWords(Node doctype)
+    {
+        final int length = doctype.end - doctype.start;
+        final String text = getString(this.lexbuf, doctype.start, length);
+        final String[] keywords = { "SYSTEM", "PUBLIC", "//DTD", "//W3C", "//EN" };
+
+        for (int k = 0; k < keywords.length; k++)
+        {
+            if (findBadSubString(keywords[k], text, length))
+            {
+                return false;
+            }
+        }
+
+        return true;
+    }
+
+ /* examine the doctype declaration to identify the HTML version
+
+ Returns the code of the W3CVersion entry whose name matches the
+ identifier in the first quoted string of the doctype, or 0 when
+ the root name is not "html", when only a SYSTEM identifier is
+ given, or when the identifier is not recognized.
+
+ Side effects: a DTYPE_NOT_UPPER_CASE warning if a keyword is in
+ the wrong case, SYSTEM/PUBLIC rewritten to upper case in lexbuf,
+ and badDoctype set when neither keyword follows the root name.
+
+ NOTE(review): the fixed-length getString() calls below (5, 7, 12
+ and 13 bytes) are not bounded by doctype.end, so for a very short
+ doctype they read whatever follows it in lexbuf -- confirm this
+ cannot run past the end of the array.
+ */
+ public short findGivenVersion(Node doctype)
+ {
+ String p, s;
+ int i, j;
+ int len;
+ String str1;
+ String str2;
+
+ /* if root tag for doctype isn't html give up now */
+ str1 = getString(this.lexbuf, doctype.start, 5);
+ if (wstrcasecmp(str1, "html ") != 0)
+ return 0;
+
+ if (!checkDocTypeKeyWords(doctype))
+ Report.warning(this, doctype, null, Report.DTYPE_NOT_UPPER_CASE);
+
+ /* give up if all we are given is the system id for the doctype */
+ str1 = getString(this.lexbuf, doctype.start + 5, 7); /* the 7 bytes after "html " */
+ if (wstrcasecmp(str1, "SYSTEM ") == 0)
+ {
+ /* but at least ensure the case is correct */
+ if (!str1.substring(0, 6).equals("SYSTEM"))
+ System.arraycopy( getBytes("SYSTEM"), 0,
+ this.lexbuf, doctype.start + 5, 6 );
+ return 0; /* unrecognized */
+ }
+
+ if (wstrcasecmp(str1, "PUBLIC ") == 0)
+ {
+ if (!str1.substring(0, 6).equals("PUBLIC")) /* fix the case in place; only the 6 letters are copied */
+ System.arraycopy( getBytes("PUBLIC "), 0,
+ this.lexbuf, doctype.start + 5, 6 );
+ }
+ else
+ this.badDoctype = true;
+
+ for (i = doctype.start; i < doctype.end; ++i) /* look for the opening quote of the public identifier */
+ {
+ if (this.lexbuf[i] == (byte)'"')
+ {
+ str1 = getString( this.lexbuf, i + 1, 12 ); /* room for "-//W3C//DTD " */
+ str2 = getString( this.lexbuf, i + 1, 13 ); /* room for "-//IETF//DTD " */
+ if (str1.equals("-//W3C//DTD "))
+ {
+ /* compute length of identifier e.g. "HTML 4.0 Transitional" */
+ for (j = i + 13; j < doctype.end && this.lexbuf[j] != (byte)'/'; ++j);
+ len = j - i - 13;
+ p = getString( this.lexbuf, i + 13, len );
+
+ for (j = 1; j < W3CVersion.length; ++j) /* entry 0 is only matched in the IETF branch below */
+ {
+ s = W3CVersion[j].name;
+ if (len == s.length() && s.equals(p))
+ return W3CVersion[j].code;
+ }
+
+ /* else unrecognized version */
+ }
+ else if (str2.equals("-//IETF//DTD "))
+ {
+ /* compute length of identifier e.g. "HTML 2.0" */
+ for (j = i + 14; j < doctype.end && this.lexbuf[j] != (byte)'/'; ++j);
+ len = j - i - 14;
+
+ p = getString( this.lexbuf, i + 14, len );
+ s = W3CVersion[0].name;
+ if (len == s.length() && s.equals(p))
+ return W3CVersion[0].code;
+
+ /* else unrecognized version */
+ }
+ break; /* only the first quoted string is examined */
+ }
+ }
+
+ return 0;
+ }
+
+    /**
+     * Ensures the html element carries an xmlns attribute equal to the
+     * given profile. An existing xmlns attribute with a different (or
+     * missing) value is reported as INCONSISTENT_NAMESPACE and
+     * overwritten; if there is no xmlns attribute one is added at the
+     * front of the attribute list. Does nothing when root has no html
+     * child.
+     *
+     * @param root    root of the parse tree
+     * @param profile namespace value the html element should declare
+     */
+    public void fixHTMLNameSpace(Node root, String profile)
+    {
+        Node node;
+        AttVal attr;
+
+        /* locate the html element among the children of root */
+        for (node = root.content;
+             node != null && node.tag != configuration.tt.tagHtml; node = node.next);
+
+        if (node == null)
+        {
+            return;
+        }
+
+        /* constant-first equals(): tolerates an attribute without a name */
+        for (attr = node.attributes; attr != null; attr = attr.next)
+        {
+            if ("xmlns".equals(attr.attribute))
+                break;
+        }
+
+        if (attr != null)
+        {
+            /* null-safe comparison: xmlns may have been given without a value */
+            boolean same = (attr.value == null) ? (profile == null)
+                                                : attr.value.equals(profile);
+            if (!same)
+            {
+                Report.warning(this, node, null, Report.INCONSISTENT_NAMESPACE);
+                attr.value = profile;
+            }
+        }
+        else
+        {
+            attr = new AttVal( node.attributes, null, (int)'"',
+                               "xmlns", profile );
+            attr.dict =
+                AttributeTable.getDefaultAttributeTable().findAttribute( attr );
+            node.attributes = attr;
+        }
+    }
+
+    /**
+     * Writes an XHTML 1.0 doctype for the document according to the
+     * configured doctype mode, creating the doctype node if necessary,
+     * and sets the XHTML namespace on the html element.
+     *
+     * @param root root of the parse tree
+     * @return true when the mode is DOCTYPE_OMIT (any existing doctype is
+     *         discarded); false after the doctype text has been written
+     */
+    public boolean setXHTMLDocType(Node root)
+    {
+        String fpi = " ";
+        String sysid = "";
+        String namespace = XHTML_NAMESPACE;
+        Node doctype;
+
+        doctype = root.findDocType();
+
+        if (configuration.docTypeMode == Configuration.DOCTYPE_OMIT)
+        {
+            if (doctype != null)
+                Node.discardElement(doctype);
+            return true;
+        }
+
+        if (configuration.docTypeMode == Configuration.DOCTYPE_AUTO)
+        {
+            /* see what flavor of XHTML this document matches */
+            if ((this.versions & Dict.VERS_HTML40_STRICT) != 0)
+            {   /* use XHTML strict */
+                fpi = "-//W3C//DTD XHTML 1.0 Strict//EN";
+                sysid = voyager_strict;
+            }
+            else if ((this.versions & Dict.VERS_LOOSE) != 0)
+            {
+                fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
+                sysid = voyager_loose;
+            }
+            else if ((this.versions & Dict.VERS_FRAMES) != 0)
+            {   /* use XHTML frames */
+                fpi = "-//W3C//DTD XHTML 1.0 Frameset//EN";
+                sysid = voyager_frameset;
+            }
+            else /* lets assume XHTML transitional */
+            {
+                fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
+                sysid = voyager_loose;
+            }
+        }
+        else if (configuration.docTypeMode == Configuration.DOCTYPE_STRICT)
+        {
+            fpi = "-//W3C//DTD XHTML 1.0 Strict//EN";
+            sysid = voyager_strict;
+        }
+        else if (configuration.docTypeMode == Configuration.DOCTYPE_LOOSE)
+        {
+            fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
+            sysid = voyager_loose;
+        }
+
+        fixHTMLNameSpace(root, namespace);
+
+        if (doctype == null)
+        {
+            /* insert a fresh doctype node as the first child of root */
+            doctype = newNode(Node.DocTypeTag, this.lexbuf, 0, 0);
+            doctype.next = root.content;
+            doctype.parent = root;
+            doctype.prev = null;
+            /* keep the sibling list doubly linked */
+            if (root.content != null)
+                root.content.prev = doctype;
+            root.content = doctype;
+        }
+
+        if (configuration.docTypeMode == Configuration.DOCTYPE_USER &&
+            configuration.docTypeStr != null)
+        {
+            fpi = configuration.docTypeStr;
+            sysid = "";
+        }
+
+        this.txtstart = this.lexsize;
+        this.txtend = this.lexsize;
+
+        /* add public identifier */
+        addStringLiteral("html PUBLIC ");
+
+        /* check if the fpi is quoted or not; startsWith() copes with an empty string */
+        if (fpi.startsWith("\""))
+            addStringLiteral(fpi);
+        else
+        {
+            addStringLiteral("\"");
+            addStringLiteral(fpi);
+            addStringLiteral("\"");
+        }
+
+        if (sysid.length() + 6 >= this.configuration.wraplen)
+            addStringLiteral("\n\"");
+        else
+            addStringLiteral("\n    \"");
+
+        /* add system identifier */
+        addStringLiteral(sysid);
+        addStringLiteral("\"");
+
+        this.txtend = this.lexsize;
+
+        /*
+         * Rebind the buffer: if lexbuf was still unallocated when the node
+         * was created, the node captured null and was not retargeted by
+         * the first allocation.
+         */
+        doctype.textarray = this.lexbuf;
+        doctype.start = this.txtstart;
+        doctype.end = this.txtend;
+
+        return false;
+    }
+
+    /**
+     * Determines the version the document appears to be. With no doctype
+     * version this is HTMLVersion(). Otherwise the declared version is
+     * returned if its bit is still set in versions; if not, or if the
+     * declared version is none of the known ones, an
+     * INCONSISTENT_VERSION warning is issued and HTMLVersion() is used.
+     *
+     * @return the apparent version constant
+     */
+    public short apparentVersion()
+    {
+        if (this.doctype == Dict.VERS_UNKNOWN)
+        {
+            return HTMLVersion();
+        }
+
+        final short[] declarable = {
+            Dict.VERS_HTML20,
+            Dict.VERS_HTML32,
+            Dict.VERS_HTML40_STRICT,
+            Dict.VERS_HTML40_LOOSE,
+            Dict.VERS_FRAMES
+        };
+
+        for (int k = 0; k < declarable.length; k++)
+        {
+            if (this.doctype != declarable[k])
+            {
+                continue;
+            }
+
+            if ((this.versions & declarable[k]) != 0)
+            {
+                return declarable[k];
+            }
+
+            break; /* declared version is contradicted by the content */
+        }
+
+        Report.warning(this, null, null, Report.INCONSISTENT_VERSION);
+        return this.HTMLVersion();
+    }
+
+    /**
+     * Fixes up the doctype if it is missing or inconsistent with the
+     * document, according to the configured doctype mode.
+     *
+     * @param root root of the parse tree
+     * @return false when no version can be determined (an unrecognized
+     *         existing doctype in auto mode, or no version bit left);
+     *         true otherwise
+     */
+    public boolean fixDocType(Node root)
+    {
+        Node doctype;
+        int guessed = Dict.VERS_HTML40_STRICT, i;
+
+        if (this.badDoctype)
+            Report.warning(this, null, null, Report.MALFORMED_DOCTYPE);
+
+        if (configuration.XmlOut)
+            return true;
+
+        doctype = root.findDocType();
+
+        if (configuration.docTypeMode == Configuration.DOCTYPE_OMIT)
+        {
+            if (doctype != null)
+                Node.discardElement(doctype);
+            return true;
+        }
+
+        if (configuration.docTypeMode == Configuration.DOCTYPE_STRICT)
+        {
+            /* null check matches the other discardElement() call sites */
+            if (doctype != null)
+                Node.discardElement(doctype);
+            doctype = null;
+            guessed = Dict.VERS_HTML40_STRICT;
+        }
+        else if (configuration.docTypeMode == Configuration.DOCTYPE_LOOSE)
+        {
+            if (doctype != null)
+                Node.discardElement(doctype);
+            doctype = null;
+            guessed = Dict.VERS_HTML40_LOOSE;
+        }
+        else if (configuration.docTypeMode == Configuration.DOCTYPE_AUTO)
+        {
+            if (doctype != null)
+            {
+                if (this.doctype == Dict.VERS_UNKNOWN)
+                    return false;
+
+                /* keep the existing doctype if the content agrees with it */
+                switch (this.doctype)
+                {
+                case Dict.VERS_HTML20:
+                    if ((this.versions & Dict.VERS_HTML20) != 0)
+                        return true;
+
+                    break; /* to replace old version by new */
+
+                case Dict.VERS_HTML32:
+                    if ((this.versions & Dict.VERS_HTML32) != 0)
+                        return true;
+
+                    break; /* to replace old version by new */
+
+                case Dict.VERS_HTML40_STRICT:
+                    if ((this.versions & Dict.VERS_HTML40_STRICT) != 0)
+                        return true;
+
+                    break; /* to replace old version by new */
+
+                case Dict.VERS_HTML40_LOOSE:
+                    if ((this.versions & Dict.VERS_HTML40_LOOSE) != 0)
+                        return true;
+
+                    break; /* to replace old version by new */
+
+                case Dict.VERS_FRAMES:
+                    if ((this.versions & Dict.VERS_FRAMES) != 0)
+                        return true;
+
+                    break; /* to replace old version by new */
+                }
+
+                /* INCONSISTENT_VERSION warning is now issued by apparentVersion() */
+            }
+
+            /* choose new doctype */
+            guessed = HTMLVersion();
+        }
+
+        if (guessed == Dict.VERS_UNKNOWN)
+            return false;
+
+        /* for XML use the Voyager system identifier */
+        if (this.configuration.XmlOut || this.configuration.XmlTags || this.isvoyager)
+        {
+            if (doctype != null)
+                Node.discardElement(doctype);
+
+            for (i = 0; i < W3CVersion.length; ++i)
+            {
+                if (guessed == W3CVersion[i].code)
+                {
+                    fixHTMLNameSpace(root, W3CVersion[i].profile);
+                    break;
+                }
+            }
+
+            return true;
+        }
+
+        if (doctype == null)
+        {
+            /* insert a fresh doctype node as the first child of root */
+            doctype = newNode(Node.DocTypeTag, this.lexbuf, 0, 0);
+            doctype.next = root.content;
+            doctype.parent = root;
+            doctype.prev = null;
+            /* keep the sibling list doubly linked */
+            if (root.content != null)
+                root.content.prev = doctype;
+            root.content = doctype;
+        }
+
+        this.txtstart = this.lexsize;
+        this.txtend = this.lexsize;
+
+        /* use the appropriate public identifier */
+        addStringLiteral("html PUBLIC ");
+
+        if (configuration.docTypeMode == Configuration.DOCTYPE_USER &&
+            configuration.docTypeStr != null)
+            addStringLiteral(configuration.docTypeStr);
+        else if (guessed == Dict.VERS_HTML20)
+            addStringLiteral("\"-//IETF//DTD HTML 2.0//EN\"");
+        else
+        {
+            addStringLiteral("\"-//W3C//DTD ");
+
+            for (i = 0; i < W3CVersion.length; ++i)
+            {
+                if (guessed == W3CVersion[i].code)
+                {
+                    addStringLiteral(W3CVersion[i].name);
+                    break;
+                }
+            }
+
+            addStringLiteral("//EN\"");
+        }
+
+        this.txtend = this.lexsize;
+
+        /*
+         * Rebind the buffer: if lexbuf was still unallocated when the node
+         * was created, the node captured null and was not retargeted by
+         * the first allocation.
+         */
+        doctype.textarray = this.lexbuf;
+        doctype.start = this.txtstart;
+        doctype.end = this.txtend;
+
+        return true;
+    }
+
+    /**
+     * Ensures the XML document starts with an xml processing instruction,
+     * inserting xml version="1.0" (plus an ISO-8859-1 encoding
+     * declaration when the configured encoding is LATIN1) as the first
+     * child of root if it is missing.
+     *
+     * @param root root of the parse tree
+     * @return true when the document already starts with a processing
+     *         instruction beginning "xml"; false after one was inserted
+     */
+    public boolean fixXMLPI(Node root)
+    {
+        Node xml;
+        Node first = root.content;
+        int s;
+
+        if (first != null && first.type == Node.ProcInsTag)
+        {
+            s = first.start;
+
+            /* only inspect bytes that belong to the instruction itself */
+            if (first.end - s >= 3 &&
+                this.lexbuf[s] == (byte)'x' &&
+                this.lexbuf[s+1] == (byte)'m' &&
+                this.lexbuf[s+2] == (byte)'l')
+                return true;
+        }
+
+        /* link the new instruction in as the first child of root */
+        xml = newNode(Node.ProcInsTag, this.lexbuf, 0, 0);
+        xml.parent = root;
+        xml.next = first;
+
+        if (first != null)
+            first.prev = xml;
+
+        root.content = xml;
+
+        this.txtstart = this.lexsize;
+        this.txtend = this.lexsize;
+        addStringLiteral("xml version=\"1.0\"");
+        if (this.configuration.CharEncoding == Configuration.LATIN1)
+            addStringLiteral(" encoding=\"ISO-8859-1\"");
+        this.txtend = this.lexsize;
+
+        /*
+         * Rebind the buffer: if lexbuf was still unallocated when the node
+         * was created, the node captured null and was not retargeted by
+         * the first allocation.
+         */
+        xml.textarray = this.lexbuf;
+        xml.start = this.txtstart;
+        xml.end = this.txtend;
+        return false;
+    }
+
+    /**
+     * Creates a start tag node for the named element, spanning the
+     * current txtstart..txtend range of the lexer buffer, and marks it
+     * as implicit.
+     *
+     * @param name element name
+     * @return the registered, implicit start tag node
+     */
+    public Node inferredTag(String name)
+    {
+        final Node inferred =
+            newNode(Node.StartTag, this.lexbuf, this.txtstart, this.txtend, name);
+        inferred.implicit = true;
+        return inferred;
+    }
+
+ public static boolean expectsContent(Node node)
+ {
+ if (node.type != Node.StartTag)
+ return false;
+
+ /* unknown element? */
+ if (node.tag == null)
+ return true;
+
+ if ((node.tag.model & Dict.CM_EMPTY) != 0)
+ return false;
+
+ return true;
+ }
+
+ /*
+ create a text node for the contents of
+ a CDATA element like style or script
+ which ends with </foo> for some foo.
+
+ Input is copied to lexbuf until the end of the stream or
+ until an end tag whose name equals container.element
+ (ignoring case) is seen. Any other "</...>" sequence is
+ reported as BAD_CDATA_CONTENT and, when the container is
+ JavaScript, a backslash is inserted before its '/'.
+
+ Returns the new text node (also stored in this.token), or
+ null when no content was collected. MISSING_ENDTAG_FOR is
+ reported if the stream ends first.
+ */
+ public Node getCDATA(Node container)
+ {
+ int c, lastc, start, len, i;
+ String str;
+ boolean endtag = false; /* set once a "</" has been seen */
+
+ this.lines = this.in.curline;
+ this.columns = this.in.curcol;
+ this.waswhite = false;
+ this.txtstart = this.lexsize;
+ this.txtend = this.lexsize;
+
+ lastc = (int)'\0';
+ start = -1; /* buffer offset of the candidate end tag name, -1 if none */
+
+ while (true)
+ {
+ c = this.in.readChar();
+ if (c == StreamIn.EndOfStream) break;
+ /* \r\n and \r are turned into \n in the last branch below */
+
+ if (c == (int)'/' && lastc == (int)'<')
+ {
+ if (endtag)
+ {
+ this.lines = this.in.curline;
+ this.columns = this.in.curcol - 3;
+
+ Report.warning(this, null, null, Report.BAD_CDATA_CONTENT);
+ }
+
+ start = this.lexsize + 1; /* to first letter */
+ endtag = true;
+ }
+ else if (c == (int)'>' && start >= 0)
+ {
+ len = this.lexsize - start; /* length of the name between "</" and '>' */
+ if (len == container.element.length())
+ {
+ str = getString( this.lexbuf, start, len );
+ if (Lexer.wstrcasecmp(str, container.element) == 0)
+ {
+ this.txtend = start - 2; /* content ends just before the "</" */
+ break;
+ }
+ }
+
+ this.lines = this.in.curline;
+ this.columns = this.in.curcol - 3;
+
+ Report.warning(this, null, null, Report.BAD_CDATA_CONTENT);
+
+ /* if javascript insert backslash before / */
+
+ if (ParserImpl.isJavaScript(container))
+ {
+ for (i = this.lexsize; i > start-1; --i) /* shift the tail up by one byte, from the '/' onwards */
+ this.lexbuf[i] = this.lexbuf[i-1];
+
+ this.lexbuf[start-1] = (byte)'\\';
+ this.lexsize++;
+ }
+
+ start = -1;
+ }
+ else if (c == (int)'\r')
+ {
+ c = this.in.readChar();
+
+ if (c != (int)'\n')
+ this.in.ungetChar(c);
+
+ c = (int)'\n';
+ }
+
+ addCharToLexer((int)c);
+ this.txtend = this.lexsize;
+ lastc = c;
+ }
+
+ if (c == StreamIn.EndOfStream)
+ Report.warning(this, container, null, Report.MISSING_ENDTAG_FOR);
+
+ if (this.txtend > this.txtstart) /* only non-empty content yields a node */
+ {
+ this.token = newNode(Node.TextNode,
+ this.lexbuf,
+ this.txtstart,
+ this.txtend);
+ return this.token;
+ }
+
+ return null;
+ }
+
+    /**
+     * Pushes the current token back: sets the pushed flag, which
+     * getToken() checks before reading further input.
+     */
+    public void ungetToken()
+    {
+        pushed = true;
+    }
+
+ public static final short IgnoreWhitespace = 0; /* white space at the start of a text token is dropped */
+ public static final short MixedContent = 1;
+ public static final short Preformatted = 2; /* also tested as a bit mask in parseEntity() */
+ public static final short IgnoreMarkup = 3;
+
+ /*
+ modes for getToken()
+
+ IgnoreWhitespace -- white space at the start of content is discarded
+ MixedContent -- for elements which don't accept PCDATA
+ Preformatted -- white space preserved as is
+ IgnoreMarkup -- for CDATA elements such as script, style
+ */
+
+ public Node getToken(short mode)
+ {
+ short map;
+ int c = 0;
+ int lastc;
+ int badcomment = 0;
+ MutableBoolean isempty = new MutableBoolean();
+ AttVal attributes;
+
+ if (this.pushed)
+ {
+ /* duplicate inlines in preference to pushed text nodes when appropriate */
+ if (this.token.type != Node.TextNode ||
+ (this.insert == -1 && this.inode == null))
+ {
+ this.pushed = false;
+ return this.token;
+ }
+ }
+
+ /* at start of block elements, unclosed inline
+ elements are inserted into the token stream */
+
+ if (this.insert != -1 || this.inode != null)
+ return insertedToken();
+
+ this.lines = this.in.curline;
+ this.columns = this.in.curcol;
+ this.waswhite = false;
+
+ this.txtstart = this.lexsize;
+ this.txtend = this.lexsize;
+
+ while (true)
+ {
+ c = this.in.readChar();
+ if (c == StreamIn.EndOfStream) break;
+ if (this.insertspace && mode != IgnoreWhitespace)
+ {
+ addCharToLexer(' ');
+ this.waswhite = true;
+ this.insertspace = false;
+ }
+
+ /* treat \r\n as \n and \r as \n */
+
+ if (c == '\r')
+ {
+ c = this.in.readChar();
+
+ if (c != '\n')
+ this.in.ungetChar(c);
+
+ c = '\n';
+ }
+
+ addCharToLexer(c);
+
+ switch (this.state)
+ {
+ case LEX_CONTENT: /* element content */
+ map = MAP((char)c);
+
+ /*
+ Discard white space if appropriate. Its cheaper
+ to do this here rather than in parser methods
+ for elements that don't have mixed content.
+ */
+ if (((map & WHITE) != 0) && (mode == IgnoreWhitespace)
+ && this.lexsize == this.txtstart + 1)
+ {
+ --this.lexsize;
+ this.waswhite = false;
+ this.lines = this.in.curline;
+ this.columns = this.in.curcol;
+ continue;
+ }
+
+ if (c == '<')
+ {
+ this.state = LEX_GT;
+ continue;
+ }
+
+ if ((map & WHITE) != 0)
+ {
+ /* was previous char white? */
+ if (this.waswhite)
+ {
+ if (mode != Preformatted && mode != IgnoreMarkup)
+ {
+ --this.lexsize;
+ this.lines = this.in.curline;
+ this.columns = this.in.curcol;
+ }
+ }
+ else /* prev char wasn't white */
+ {
+ this.waswhite = true;
+ lastc = c;
+
+ if (mode != Preformatted && mode != IgnoreMarkup && c != ' ')
+ changeChar((byte)' ');
+ }
+
+ continue;
+ }
+ else if (c == '&' && mode != IgnoreMarkup)
+ parseEntity(mode);
+
+ /* this is needed to avoid trimming trailing whitespace */
+ if (mode == IgnoreWhitespace)
+ mode = MixedContent;
+
+ this.waswhite = false;
+ continue;
+
+ case LEX_GT: /* < */
+
+ /* check for endtag */
+ if (c == '/')
+ {
+ c = this.in.readChar();
+ if (c == StreamIn.EndOfStream)
+ {
+ this.in.ungetChar(c);
+ continue;
+ }
+
+ addCharToLexer(c);
+ map = MAP((char)c);
+
+ if ((map & LETTER) != 0)
+ {
+ this.lexsize -= 3;
+ this.txtend = this.lexsize;
+ this.in.ungetChar(c);
+ this.state = LEX_ENDTAG;
+ this.lexbuf[this.lexsize] = (byte)'\0'; /* debug */
+ this.in.curcol -= 2;
+
+ /* if some text before the return it now */
+ if (this.txtend > this.txtstart)
+ {
+ /* trim space char before end tag */
+ if (mode == IgnoreWhitespace && this.lexbuf[this.lexsize - 1] == (byte)' ')
+ {
+ this.lexsize -= 1;
+ this.txtend = this.lexsize;
+ }
+
+ this.token = newNode(Node.TextNode,
+ this.lexbuf,
+ this.txtstart,
+ this.txtend);
+ return this.token;
+ }
+
+ continue; /* no text so keep going */
+ }
+
+ /* otherwise treat as CDATA */
+ this.waswhite = false;
+ this.state = LEX_CONTENT;
+ continue;
+ }
+
+ if (mode == IgnoreMarkup)
+ {
+ /* otherwise treat as CDATA */
+ this.waswhite = false;
+ this.state = LEX_CONTENT;
+ continue;
+ }
+
+ /*
+ look out for comments, doctype or marked sections
+ this isn't quite right, but its getting there ...
+ */
+ if (c == '!')
+ {
+ c = this.in.readChar();
+
+ if (c == '-')
+ {
+ c = this.in.readChar();
+
+ if (c == '-')
+ {
+ this.state = LEX_COMMENT; /* comment */
+ this.lexsize -= 2;
+ this.txtend = this.lexsize;
+
+ /* if some text before < return it now */
+ if (this.txtend > this.txtstart)
+ {
+ this.token = newNode(Node.TextNode,
+ this.lexbuf,
+ this.txtstart,
+ this.txtend);
+ return this.token;
+ }
+
+ this.txtstart = this.lexsize;
+ continue;
+ }
+
+ Report.warning(this, null, null, Report.MALFORMED_COMMENT);
+ }
+ else if (c == 'd' || c == 'D')
+ {
+ this.state = LEX_DOCTYPE; /* doctype */
+ this.lexsize -= 2;
+ this.txtend = this.lexsize;
+ mode = IgnoreWhitespace;
+
+ /* skip until white space or '>' */
+
+ for (;;)
+ {
+ c = this.in.readChar();
+
+ if (c == StreamIn.EndOfStream || c == '>')
+ {
+ this.in.ungetChar(c);
+ break;
+ }
+
+ map = MAP((char)c);
+
+ if ((map & WHITE) == 0)
+ continue;
+
+ /* and skip to end of whitespace */
+
+ for (;;)
+ {
+ c = this.in.readChar();
+
+ if (c == StreamIn.EndOfStream || c == '>')
+ {
+ this.in.ungetChar(c);
+ break;
+ }
+
+ map = MAP((char)c);
+
+ if ((map & WHITE) != 0)
+ continue;
+
+ this.in.ungetChar(c);
+ break;
+ }
+
+ break;
+ }
+
+ /* if some text before < return it now */
+ if (this.txtend > this.txtstart)
+ {
+ this.token = newNode(Node.TextNode,
+ this.lexbuf,
+ this.txtstart,
+ this.txtend);
+ return this.token;
+ }
+
+ this.txtstart = this.lexsize;
+ continue;
+ }
+ else if (c == '[')
+ {
+ /* Word 2000 embeds ... sequences */
+ this.lexsize -= 2;
+ this.state = LEX_SECTION;
+ this.txtend = this.lexsize;
+
+ /* if some text before < return it now */
+ if (this.txtend > this.txtstart)
+ {
+ this.token = newNode(Node.TextNode,
+ this.lexbuf,
+ this.txtstart,
+ this.txtend);
+ return this.token;
+ }
+
+ this.txtstart = this.lexsize;
+ continue;
+ }
+
+ /* otherwise swallow chars up to and including next '>' */
+ while (true)
+ {
+ c = this.in.readChar();
+ if (c == '>') break;
+ if (c == -1)
+ {
+ this.in.ungetChar(c);
+ break;
+ }
+ }
+
+ this.lexsize -= 2;
+ this.lexbuf[this.lexsize] = (byte)'\0';
+ this.state = LEX_CONTENT;
+ continue;
+ }
+
+ /*
+ processing instructions
+ */
+
+ if (c == '?')
+ {
+ this.lexsize -= 2;
+ this.state = LEX_PROCINSTR;
+ this.txtend = this.lexsize;
+
+ /* if some text before < return it now */
+ if (this.txtend > this.txtstart)
+ {
+ this.token = newNode(Node.TextNode,
+ this.lexbuf,
+ this.txtstart,
+ this.txtend);
+ return this.token;
+ }
+
+ this.txtstart = this.lexsize;
+ continue;
+ }
+
+ /* Microsoft ASP's e.g. <% ... server-code ... %> */
+ if (c == '%')
+ {
+ this.lexsize -= 2;
+ this.state = LEX_ASP;
+ this.txtend = this.lexsize;
+
+ /* if some text before < return it now */
+ if (this.txtend > this.txtstart)
+ {
+ this.token = newNode(Node.TextNode,
+ this.lexbuf,
+ this.txtstart,
+ this.txtend);
+ return this.token;
+ }
+
+ this.txtstart = this.lexsize;
+ continue;
+ }
+
+ /* Netscapes JSTE e.g. <# ... server-code ... #> */
+ if (c == '#')
+ {
+ this.lexsize -= 2;
+ this.state = LEX_JSTE;
+ this.txtend = this.lexsize;
+
+ /* if some text before < return it now */
+ if (this.txtend > this.txtstart)
+ {
+ this.token = newNode(Node.TextNode,
+ this.lexbuf,
+ this.txtstart,
+ this.txtend);
+ return this.token;
+ }
+
+ this.txtstart = this.lexsize;
+ continue;
+ }
+
+ map = MAP((char)c);
+
+ /* check for start tag */
+ if ((map & LETTER) != 0)
+ {
+ this.in.ungetChar(c); /* push back letter */
+ this.lexsize -= 2; /* discard "<" + letter */
+ this.txtend = this.lexsize;
+ this.state = LEX_STARTTAG; /* ready to read tag name */
+
+ /* if some text before < return it now */
+ if (this.txtend > this.txtstart)
+ {
+ this.token = newNode(Node.TextNode,
+ this.lexbuf,
+ this.txtstart,
+ this.txtend);
+ return this.token;
+ }
+
+ continue; /* no text so keep going */
+ }
+
+ /* otherwise treat as CDATA */
+ this.state = LEX_CONTENT;
+ this.waswhite = false;
+ continue;
+
+ case LEX_ENDTAG: /* ' */
+ while (c != '>')
+ {
+ c = this.in.readChar();
+
+ if (c == StreamIn.EndOfStream)
+ break;
+ }
+
+ if (c == StreamIn.EndOfStream)
+ {
+ this.in.ungetChar(c);
+ continue;
+ }
+
+ this.state = LEX_CONTENT;
+ this.waswhite = false;
+ return this.token; /* the endtag token */
+
+ case LEX_STARTTAG: /* first letter of tagname */
+ this.txtstart = this.lexsize - 1; /* set txtstart to first letter */
+ c = parseTagName();
+ isempty.value = false;
+ attributes = null;
+ this.token = newNode((isempty.value ? Node.StartEndTag : Node.StartTag),
+ this.lexbuf,
+ this.txtstart,
+ this.txtend,
+ getString(this.lexbuf,
+ this.txtstart,
+ this.txtend - this.txtstart));
+
+ /* parse attributes, consuming closing ">" */
+ if (c != '>')
+ {
+ if (c == '/')
+ this.in.ungetChar(c);
+
+ attributes = parseAttrs(isempty);
+ }
+
+ if (isempty.value)
+ this.token.type = Node.StartEndTag;
+
+ this.token.attributes = attributes;
+ this.lexsize = this.txtstart;
+ this.txtend = this.txtstart;
+
+ /* swallow newline following start tag */
+ /* special check needed for CRLF sequence */
+ /* this doesn't apply to empty elements */
+
+ if (expectsContent(this.token) ||
+ this.token.tag == configuration.tt.tagBr)
+ {
+
+ c = this.in.readChar();
+
+ if (c == '\r')
+ {
+ c = this.in.readChar();
+
+ if (c != '\n')
+ this.in.ungetChar(c);
+ }
+ else if (c != '\n' && c != '\f')
+ this.in.ungetChar(c);
+
+ this.waswhite = true; /* to swallow leading whitespace */
+ }
+ else
+ this.waswhite = false;
+
+ this.state = LEX_CONTENT;
+
+ if (this.token.tag == null)
+ Report.error(this, null, this.token, Report.UNKNOWN_ELEMENT);
+ else if (!this.configuration.XmlTags)
+ {
+ this.versions &= this.token.tag.versions;
+
+ if ((this.token.tag.versions & Dict.VERS_PROPRIETARY) != 0)
+ {
+ if (!this.configuration.MakeClean && (this.token.tag == configuration.tt.tagNobr ||
+ this.token.tag == configuration.tt.tagWbr))
+ Report.warning(this, null, this.token, Report.PROPRIETARY_ELEMENT);
+ }
+
+ if (this.token.tag.chkattrs != null)
+ {
+ this.token.checkUniqueAttributes(this);
+ this.token.tag.chkattrs.check(this, this.token);
+ }
+ else
+ this.token.checkAttributes(this);
+ }
+
+ return this.token; /* return start tag */
+
+ case LEX_COMMENT: /* seen */
+
+ if (c != '-')
+ continue;
+
+ c = this.in.readChar();
+ addCharToLexer(c);
+
+ if (c != '-')
+ continue;
+
+ end_comment: while (true) {
+ c = this.in.readChar();
+
+ if (c == '>')
+ {
+ if (badcomment != 0)
+ Report.warning(this, null, null, Report.MALFORMED_COMMENT);
+
+ this.txtend = this.lexsize - 2; // AQ 8Jul2000
+ this.lexbuf[this.lexsize] = (byte)'\0';
+ this.state = LEX_CONTENT;
+ this.waswhite = false;
+ this.token = newNode(Node.CommentTag,
+ this.lexbuf,
+ this.txtstart,
+ this.txtend);
+
+ /* now look for a line break */
+
+ c = this.in.readChar();
+
+ if (c == '\r')
+ {
+ c = this.in.readChar();
+
+ if (c != '\n')
+ this.token.linebreak = true;
+ }
+
+ if (c == '\n')
+ this.token.linebreak = true;
+ else
+ this.in.ungetChar(c);
+
+ return this.token;
+ }
+
+ /* note position of first such error in the comment */
+ if (badcomment == 0)
+ {
+ this.lines = this.in.curline;
+ this.columns = this.in.curcol - 3;
+ }
+
+ badcomment++;
+ if (this.configuration.FixComments)
+ this.lexbuf[this.lexsize - 2] = (byte)'=';
+
+ addCharToLexer(c);
+
+ /* if '-' then look for '>' to end the comment */
+ if (c != '-')
+ break end_comment;
+
+ }
+ /* otherwise continue to look for --> */
+ this.lexbuf[this.lexsize - 2] = (byte)'=';
+ continue;
+
+ case LEX_DOCTYPE: /* seen ' munging whitespace */
+ map = MAP((char)c);
+
+ if ((map & WHITE) != 0)
+ {
+ if (this.waswhite)
+ this.lexsize -= 1;
+
+ this.waswhite = true;
+ }
+ else
+ this.waswhite = false;
+
+ if (c != '>')
+ continue;
+
+ this.lexsize -= 1;
+ this.txtend = this.lexsize;
+ this.lexbuf[this.lexsize] = (byte)'\0';
+ this.state = LEX_CONTENT;
+ this.waswhite = false;
+ this.token = newNode(Node.DocTypeTag,
+ this.lexbuf,
+ this.txtstart,
+ this.txtend);
+ /* make a note of the version named by the doctype */
+ this.doctype = findGivenVersion(this.token);
+ return this.token;
+
+ case LEX_PROCINSTR: /* seen so look for '>' */
+ /* check for PHP preprocessor instructions */
+
+ if (this.lexsize - this.txtstart == 3)
+ {
+ if ((getString(this.lexbuf, this.txtstart, 3)).equals("php"))
+ {
+ this.state = LEX_PHP;
+ continue;
+ }
+ }
+
+ if (this.configuration.XmlPIs) /* insist on ?> as terminator */
+ {
+ if (c != '?')
+ continue;
+
+ /* now look for '>' */
+ c = this.in.readChar();
+
+ if (c == StreamIn.EndOfStream)
+ {
+ Report.warning(this, null, null, Report.UNEXPECTED_END_OF_FILE);
+ this.in.ungetChar(c);
+ continue;
+ }
+
+ addCharToLexer(c);
+ }
+
+ if (c != '>')
+ continue;
+
+ this.lexsize -= 1;
+ this.txtend = this.lexsize;
+ this.lexbuf[this.lexsize] = (byte)'\0';
+ this.state = LEX_CONTENT;
+ this.waswhite = false;
+ this.token = newNode(Node.ProcInsTag,
+ this.lexbuf,
+ this.txtstart,
+ this.txtend);
+ return this.token;
+
+ case LEX_ASP: /* seen <% so look for "%>" */
+ if (c != '%')
+ continue;
+
+ /* now look for '>' */
+ c = this.in.readChar();
+
+
+ if (c != '>')
+ {
+ this.in.ungetChar(c);
+ continue;
+ }
+
+ this.lexsize -= 1;
+ this.txtend = this.lexsize;
+ this.lexbuf[this.lexsize] = (byte)'\0';
+ this.state = LEX_CONTENT;
+ this.waswhite = false;
+ this.token = newNode(Node.AspTag,
+ this.lexbuf,
+ this.txtstart,
+ this.txtend);
+ return this.token;
+
+ case LEX_JSTE: /* seen <# so look for "#>" */
+ if (c != '#')
+ continue;
+
+ /* now look for '>' */
+ c = this.in.readChar();
+
+
+ if (c != '>')
+ {
+ this.in.ungetChar(c);
+ continue;
+ }
+
+ this.lexsize -= 1;
+ this.txtend = this.lexsize;
+ this.lexbuf[this.lexsize] = (byte)'\0';
+ this.state = LEX_CONTENT;
+ this.waswhite = false;
+ this.token = newNode(Node.JsteTag,
+ this.lexbuf,
+ this.txtstart,
+ this.txtend);
+ return this.token;
+
+ case LEX_PHP: /* seen "" */
+ if (c != '?')
+ continue;
+
+ /* now look for '>' */
+ c = this.in.readChar();
+
+ if (c != '>')
+ {
+ this.in.ungetChar(c);
+ continue;
+ }
+
+ this.lexsize -= 1;
+ this.txtend = this.lexsize;
+ this.lexbuf[this.lexsize] = (byte)'\0';
+ this.state = LEX_CONTENT;
+ this.waswhite = false;
+ this.token = newNode(Node.PhpTag,
+ this.lexbuf,
+ this.txtstart,
+ this.txtend);
+ return this.token;
+
+ case LEX_SECTION: /* seen "" */
+ if (c == '[')
+ {
+ if (this.lexsize == (this.txtstart + 6) &&
+ (getString(this.lexbuf, this.txtstart, 6)).equals("CDATA["))
+ {
+ this.state = LEX_CDATA;
+ this.lexsize -= 6;
+ continue;
+ }
+ }
+
+ if (c != ']')
+ continue;
+
+ /* now look for '>' */
+ c = this.in.readChar();
+
+ if (c != '>')
+ {
+ this.in.ungetChar(c);
+ continue;
+ }
+
+ this.lexsize -= 1;
+ this.txtend = this.lexsize;
+ this.lexbuf[this.lexsize] = (byte)'\0';
+ this.state = LEX_CONTENT;
+ this.waswhite = false;
+ this.token = newNode(Node.SectionTag,
+ this.lexbuf,
+ this.txtstart,
+ this.txtend);
+ return this.token;
+
+ case LEX_CDATA: /* seen "" */
+ if (c != ']')
+ continue;
+
+ /* now look for ']' */
+ c = this.in.readChar();
+
+ if (c != ']')
+ {
+ this.in.ungetChar(c);
+ continue;
+ }
+
+ /* now look for '>' */
+ c = this.in.readChar();
+
+ if (c != '>')
+ {
+ this.in.ungetChar(c);
+ continue;
+ }
+
+ this.lexsize -= 1;
+ this.txtend = this.lexsize;
+ this.lexbuf[this.lexsize] = (byte)'\0';
+ this.state = LEX_CONTENT;
+ this.waswhite = false;
+ this.token = newNode(Node.CDATATag,
+ this.lexbuf,
+ this.txtstart,
+ this.txtend);
+ return this.token;
+ }
+ }
+
+ if (this.state == LEX_CONTENT) /* text string */
+ {
+ this.txtend = this.lexsize;
+
+ if (this.txtend > this.txtstart)
+ {
+ this.in.ungetChar(c);
+
+ if (this.lexbuf[this.lexsize - 1] == (byte)' ')
+ {
+ this.lexsize -= 1;
+ this.txtend = this.lexsize;
+ }
+
+ this.token = newNode(Node.TextNode,
+ this.lexbuf,
+ this.txtstart,
+ this.txtend);
+ return this.token;
+ }
+ }
+ else if (this.state == LEX_COMMENT) /* comment */
+ {
+ if (c == StreamIn.EndOfStream)
+ Report.warning(this, null, null, Report.MALFORMED_COMMENT);
+
+ this.txtend = this.lexsize;
+ this.lexbuf[this.lexsize] = (byte)'\0';
+ this.state = LEX_CONTENT;
+ this.waswhite = false;
+ this.token = newNode(Node.CommentTag,
+ this.lexbuf,
+ this.txtstart,
+ this.txtend);
+ return this.token;
+ }
+
+ return null;
+ }
+
+ /*
+ parser for ASP within start tags
+
+ Some people use ASP to customize attributes.
+ Tidy isn't really well suited to dealing with ASP.
+ This is a workaround for attributes, but it won't
+ deal with the case where the ASP is used to tailor
+ the attribute value. Here is an example of a
+ workaround for using ASP in attribute values:
+
+ href="<%=rsSchool.Fields("ID").Value%>"
+
+ where the ASP that generates the attribute value
+ is masked from Tidy by the quotemarks.
+
+ */
+
+ public Node parseAsp()
+ {
+ int c;
+ Node asp = null;
+
+ this.txtstart = this.lexsize;
+
+ for (;;)
+ {
+ c = this.in.readChar();
+ addCharToLexer(c);
+
+
+ if (c != '%')
+ continue;
+
+ c = this.in.readChar();
+ addCharToLexer(c);
+
+ if (c == '>')
+ break;
+ }
+
+ this.lexsize -= 2;
+ this.txtend = this.lexsize;
+
+ if (this.txtend > this.txtstart)
+ asp = newNode(Node.AspTag,
+ this.lexbuf,
+ this.txtstart,
+ this.txtend);
+
+ this.txtstart = this.txtend;
+ return asp;
+ }
+
+ /*
+ PHP is like ASP but is based upon XML
+ processing instructions, e.g. <?php ... ?>
+ */
+ public Node parsePhp()
+ {
+ int c;
+ Node php = null;
+
+ this.txtstart = this.lexsize;
+
+ for (;;)
+ {
+ c = this.in.readChar();
+ addCharToLexer(c);
+
+
+ if (c != '?')
+ continue;
+
+ c = this.in.readChar();
+ addCharToLexer(c);
+
+ if (c == '>')
+ break;
+ }
+
+ this.lexsize -= 2;
+ this.txtend = this.lexsize;
+
+ if (this.txtend > this.txtstart)
+ php = newNode(Node.PhpTag,
+ this.lexbuf,
+ this.txtstart,
+ this.txtend);
+
+ this.txtstart = this.txtend;
+ return php;
+ }
+
+ /* consumes the '>' terminating start tags
+
+ Reads the next attribute name of the start tag being lexed.
+
+ Returns null when the tag ends: on '>' directly, or on "/>" in which
+ case isempty.value is also set to true. Returns null as well at end of
+ stream and when a '<' is met that does not open "<%" or "<?".
+
+ When "<%" or "<?" is met where a name was expected, the section is read
+ with parseAsp() / parsePhp(), the resulting node is stored in asp / php
+ and null is returned. Both holders are reset to null on entry.
+
+ Otherwise the name is returned, with ASCII upper case letters folded to
+ lower case unless XmlTags is set, or null if no name characters were
+ read. A terminating '=', '>', '<' or end of stream is pushed back;
+ terminating white space is consumed.
+ */
+ public String parseAttribute(MutableBoolean isempty, MutableObject asp,
+ MutableObject php)
+ {
+ int start = 0;
+ // int len = 0; Removed by BUGFIX for 126265
+ short map;
+ String attr;
+ int c = 0;
+
+ asp.setObject(null); /* clear asp pointer */
+ php.setObject(null); /* clear php pointer */
+ /* skip white space before the attribute */
+
+ for (;;)
+ {
+ c = this.in.readChar();
+
+ if (c == '/')
+ {
+ c = this.in.readChar();
+
+ if (c == '>')
+ {
+ isempty.value = true; /* "/>" closes an empty element */
+ return null;
+ }
+
+ this.in.ungetChar(c);
+ c = '/'; /* lone '/': it becomes the first character of the name */
+ break;
+ }
+
+ if (c == '>') /* end of the start tag: no further attributes */
+ return null;
+
+ if (c =='<')
+ {
+ c = this.in.readChar();
+
+ if (c == '%')
+ {
+ asp.setObject(parseAsp());
+ return null;
+ }
+ else if (c == '?')
+ {
+ php.setObject(parsePhp());
+ return null;
+ }
+
+ this.in.ungetChar(c); /* only the char after '<' is pushed back; the '<' is dropped */
+ Report.attrError(this, this.token, null, Report.UNEXPECTED_GT);
+ return null;
+ }
+
+ if (c == '"' || c == '\'')
+ {
+ Report.attrError(this, this.token, null, Report.UNEXPECTED_QUOTEMARK);
+ continue; /* stray quote mark is reported and skipped */
+ }
+
+ if (c == StreamIn.EndOfStream)
+ {
+ Report.attrError(this, this.token, null, Report.UNEXPECTED_END_OF_FILE);
+ this.in.ungetChar(c);
+ return null;
+ }
+
+ map = MAP((char)c);
+
+ if ((map & WHITE) == 0) /* first non-white character starts the name */
+ break;
+ }
+
+ start = this.lexsize; /* the name is accumulated in lexbuf from here */
+
+ for (;;)
+ {
+ /* but push back '=' for parseValue() */
+ if (c == '=' || c == '>')
+ {
+ this.in.ungetChar(c);
+ break;
+ }
+
+ if (c == '<' || c == StreamIn.EndOfStream)
+ {
+ this.in.ungetChar(c);
+ break;
+ }
+
+ map = MAP((char)c);
+
+ if ((map & WHITE) != 0) /* white space ends the name and is not pushed back */
+ break;
+
+ /* what should be done about non-namechar characters? */
+ /* currently these are incorporated into the attr name */
+
+ if (!this.configuration.XmlTags && (map & UPPERCASE) != 0)
+ c += (int)('a' - 'A'); /* fold ASCII upper case to lower case */
+
+ // ++len; Removed by BUGFIX for 126265
+ addCharToLexer(c);
+
+ c = this.in.readChar();
+ }
+
+ // Following line added by GLP to fix BUG 126265. This is a temporary comment
+ // and should be removed when Tidy is fixed.
+ int len = this.lexsize - start; /* length in lexbuf units, not in characters read */
+ attr = (len > 0 ? getString(this.lexbuf, start, len) : null);
+ this.lexsize = start; /* release the buffer space used for the name */
+
+ return attr;
+ }
+
+ /*
+ invoked when < is seen in place of attribute value
+ but terminates on whitespace if not ASP, PHP or Tango
+ this routine recognizes ' and " quoted strings
+ */
+ /**
+  * Reads a server-side instruction that appears in place of an attribute
+  * value; the caller has already consumed the leading '<'.
+  *
+  * Text is appended to the lexer buffer. For ASP, PHP or Tango sections
+  * (first character '%', '?' or '@') reading continues up to and
+  * including the next '>' outside quotes. Otherwise reading stops at the
+  * first white space (consumed) or '>' (pushed back). Quoted strings
+  * delimited by ' or " are copied through verbatim.
+  *
+  * End of input now ends the scan everywhere, including inside a quoted
+  * string (previously the quote loops never terminated at end of input
+  * and the end-of-stream marker was appended to the buffer).
+  *
+  * @return the quote character to use when writing the value: '"' by
+  *         default, '\'' once a double-quoted string was met in the text
+  */
+ public int parseServerInstruction()
+ {
+     int c, map, delim = '"';
+     boolean isrule = false;
+
+     c = this.in.readChar();
+
+     if (c == StreamIn.EndOfStream)
+     {
+         return delim;
+     }
+
+     addCharToLexer(c);
+
+     /* check for ASP, PHP or Tango */
+     if (c == '%' || c == '?' || c == '@')
+     {
+         isrule = true;
+     }
+
+     for (;;)
+     {
+         c = this.in.readChar();
+
+         if (c == StreamIn.EndOfStream)
+         {
+             break;
+         }
+
+         if (c == '>')
+         {
+             if (isrule)
+             {
+                 addCharToLexer(c);
+             }
+             else
+             {
+                 this.in.ungetChar(c);
+             }
+
+             break;
+         }
+
+         /* if not recognized as ASP, PHP or Tango */
+         /* then also finish value on whitespace */
+         if (!isrule)
+         {
+             map = MAP((char)c);
+
+             if ((map & WHITE) != 0)
+             {
+                 break;
+             }
+         }
+
+         addCharToLexer(c);
+
+         if (c == '"')
+         {
+             /* the text contains a '"', so it must be written with ' */
+             delim = '\'';
+
+             do
+             {
+                 c = this.in.readChar();
+
+                 if (c == StreamIn.EndOfStream)
+                 {
+                     return delim; /* unterminated string */
+                 }
+
+                 addCharToLexer(c);
+             }
+             while (c != '"');
+
+             continue;
+         }
+
+         if (c == '\'')
+         {
+             do
+             {
+                 c = this.in.readChar();
+
+                 if (c == StreamIn.EndOfStream)
+                 {
+                     return delim; /* unterminated string */
+                 }
+
+                 addCharToLexer(c);
+             }
+             while (c != '\'');
+         }
+     }
+
+     return delim;
+ }
+
+ /* values start with "=" or " = " etc.
+
+ Reads the value of the attribute called name, if one follows.
+
+ name     attribute name; only used for the isUrl() / isScript() checks
+ foldCase when true, ASCII upper case letters in the value are lower-cased
+ isempty  set to true when an unquoted value is ended by "/>" and the
+          attribute is not a URL attribute
+ pdelim   receives the quote character: the delimiter found in the input,
+          '"' when the value was unquoted, or the result of
+          parseServerInstruction() for a value starting with '<'
+
+ Returns null when no '=' follows the name or when an unquoted value is
+ empty; a quoted empty value yields an empty string.
+ */
+ /* doesn't consume the ">" at end of start tag */
+
+ public String parseValue(String name, boolean foldCase,
+ MutableBoolean isempty, MutableInteger pdelim)
+ {
+ int len = 0;
+ int start;
+ short map;
+ boolean seen_gt = false;
+ boolean munge = true;
+ int c = 0;
+ int lastc, delim, quotewarning;
+ String value;
+
+ delim = 0; /* 0 means "no quote mark seen" */
+ pdelim.value = (int)'"';
+
+ /*
+ Henry Zrepa reports that some folk are using the
+ embed element with script attributes where newlines
+ are significant and must be preserved
+ */
+ if (configuration.LiteralAttribs)
+ munge = false;
+
+ /* skip white space before the '=' */
+
+ for (;;)
+ {
+ c = this.in.readChar();
+
+ if (c == StreamIn.EndOfStream)
+ {
+ this.in.ungetChar(c);
+ break;
+ }
+
+ map = MAP((char)c);
+
+ if ((map & WHITE) == 0)
+ break;
+ }
+
+ /*
+ c should be '=' if there is a value
+ other legal possibilities are white
+ space, '/' and '>'
+ */
+
+ if (c != '=')
+ {
+ this.in.ungetChar(c); /* NOTE(review): at end of stream c was already pushed back in the loop above, so it is pushed back twice - confirm StreamIn tolerates this */
+ return null;
+ }
+
+ /* skip white space after '=' */
+
+ for (;;)
+ {
+ c = this.in.readChar();
+
+ if (c == StreamIn.EndOfStream)
+ {
+ this.in.ungetChar(c);
+ break;
+ }
+
+ map = MAP((char)c);
+
+ if ((map & WHITE) == 0)
+ break;
+ }
+
+ /* check for quote marks */
+
+ if (c == '"' || c == '\'')
+ delim = c;
+ else if (c == '<')
+ {
+ start = this.lexsize;
+ addCharToLexer(c);
+ pdelim.value = parseServerInstruction();
+ len = this.lexsize - start;
+ this.lexsize = start; /* the text has been measured; release the buffer space */
+ return (len > 0 ? getString(this.lexbuf, start, len) : null);
+ }
+ else
+ this.in.ungetChar(c); /* first character of an unquoted value */
+
+ /*
+ and read the value string
+ check for quote mark if needed
+ */
+
+ quotewarning = 0;
+ start = this.lexsize;
+ c = '\0';
+
+ for (;;)
+ {
+ lastc = c; /* track last character */
+ c = this.in.readChar();
+
+ if (c == StreamIn.EndOfStream)
+ {
+ Report.attrError(this, this.token, null, Report.UNEXPECTED_END_OF_FILE);
+ this.in.ungetChar(c);
+ break;
+ }
+
+ if (delim == (char)0)
+ {
+ if (c == '>')
+ {
+ this.in.ungetChar(c);
+ break;
+ }
+
+ if (c == '"' || c == '\'')
+ {
+ Report.attrError(this, this.token, null, Report.UNEXPECTED_QUOTEMARK);
+ break;
+ }
+
+ if (c == '<')
+ {
+ /* this.in.ungetChar(c); */
+ Report.attrError(this, this.token, null, Report.UNEXPECTED_GT);
+ /* break; */
+ }
+
+ /*
+ For cases like <br clear=all/> need to avoid treating /> as
+ part of the attribute value, however care is needed to avoid
+ so treating <a href=http://www.acme.com/> in this way, which
+ would map the <a> tag to <a href="http://www.acme.com"/>
+ */
+ if (c == '/')
+ {
+ /* peek ahead in case of /> */
+ c = this.in.readChar();
+
+ if (c == '>' &&
+ !AttributeTable.getDefaultAttributeTable().isUrl(name))
+ {
+ isempty.value = true;
+ this.in.ungetChar(c);
+ break;
+ }
+
+ /* unget peeked char */
+ this.in.ungetChar(c);
+ c = '/';
+ }
+ }
+ else /* delim is '\'' or '"' */
+ {
+ if (c == delim)
+ break;
+
+ /* treat CRLF, CR and LF as single line break */
+
+ if (c == '\r')
+ {
+ c = this.in.readChar();
+ if (c != '\n')
+ this.in.ungetChar(c);
+
+ c = '\n';
+ }
+
+ if (c == '\n' || c == '<' || c == '>')
+ ++quotewarning; /* evidence that the closing quote may be missing */
+
+ if (c == '>')
+ seen_gt = true;
+ }
+
+ if (c == '&')
+ {
+ addCharToLexer(c);
+ parseEntity((short)0);
+ continue;
+ }
+
+ /*
+ kludge for JavaScript attribute values
+ with line continuations in string literals
+ */
+ if (c == '\\')
+ {
+ c = this.in.readChar();
+
+ if (c != '\n')
+ {
+ this.in.ungetChar(c);
+ c = '\\';
+ }
+ }
+
+ map = MAP((char)c);
+
+ if ((map & WHITE) != 0)
+ {
+ if (delim == (char)0) /* white space ends an unquoted value */
+ break;
+
+ if (munge)
+ {
+ c = ' '; /* any white space becomes a single blank */
+
+ if (lastc == ' ')
+ continue;
+ }
+ }
+ else if (foldCase && (map & UPPERCASE) != 0)
+ c += (int)('a' - 'A');
+
+ addCharToLexer(c);
+ }
+
+ if (quotewarning > 10 && seen_gt && munge)
+ {
+ /*
+ there is almost certainly a missing trailing quote mark
+ as we have seen too many newlines, < or > characters.
+
+ an exception is made for Javascript attributes and the
+ javascript URL scheme which may legitimately include < and >
+ */
+ if (!AttributeTable.getDefaultAttributeTable().isScript(name) &&
+ !(AttributeTable.getDefaultAttributeTable().isUrl(name) &&
+ (getString(this.lexbuf, start, 11)).equals("javascript:")))
+ Report.error(this, null, null, Report.SUSPECTED_MISSING_QUOTE);
+ }
+
+ len = this.lexsize - start;
+ this.lexsize = start; /* release the buffer space used for the value */
+
+ if (len > 0 || delim != 0) /* a quoted value may legitimately be empty */
+ value = getString(this.lexbuf, start, len);
+ else
+ value = null;
+
+ /* note delimiter if given */
+ if (delim != 0)
+ pdelim.value = delim;
+ else
+ pdelim.value = (int)'"';
+
+ return value;
+ }
+
+ /**
+  * Tells whether attr is a well-formed attribute name: a letter followed
+  * by name characters only (as classified by MAP).
+  *
+  * @param attr the candidate name; must be non-null
+  * @return true if attr is a valid name; false otherwise, including for
+  *         the empty string (which previously raised
+  *         StringIndexOutOfBoundsException)
+  */
+ public static boolean isValidAttrName(String attr)
+ {
+     final int length = attr.length();
+
+     /* an empty name has no first character to check */
+     if (length == 0)
+     {
+         return false;
+     }
+
+     /* first character should be a letter */
+     if ((MAP(attr.charAt(0)) & LETTER) == 0)
+     {
+         return false;
+     }
+
+     /* remaining characters should be namechars */
+     for (int i = 1; i < length; i++)
+     {
+         if ((MAP(attr.charAt(i)) & NAMECHAR) == 0)
+         {
+             return false;
+         }
+     }
+
+     return true;
+ }
+
+ /**
+  * Parses the attributes of the current start tag and swallows the
+  * closing '>'.
+  *
+  * Names and values are read with parseAttribute() and parseValue()
+  * until parseAttribute() yields neither a name nor an ASP/PHP node, or
+  * until endOfInput() is true. Each new AttVal is constructed with the
+  * list built so far as its first argument. Names rejected by
+  * isValidAttrName() are reported with BAD_ATTRIBUTE_VALUE and dropped;
+  * no throw-away AttVal is allocated for them any more.
+  *
+  * @param isempty passed through to parseAttribute() / parseValue(),
+  *                which set it when the tag is closed by "/>"
+  * @return the most recently created AttVal, or null if there were none
+  */
+ public AttVal parseAttrs(MutableBoolean isempty)
+ {
+     AttVal av;
+     AttVal list = null;
+     String attribute, value;
+     MutableInteger delim = new MutableInteger();
+     MutableObject asp = new MutableObject();
+     MutableObject php = new MutableObject();
+
+     while (!endOfInput())
+     {
+         attribute = parseAttribute(isempty, asp, php);
+
+         if (attribute == null)
+         {
+             /* check if attributes are created by ASP markup */
+             if (asp.getObject() != null)
+             {
+                 av = new AttVal(list, null, (Node)asp.getObject(), null,
+                                 '\0', null, null );
+                 list = av;
+                 continue;
+             }
+
+             /* check if attributes are created by PHP markup */
+             if (php.getObject() != null)
+             {
+                 av = new AttVal(list, null, null, (Node)php.getObject(),
+                                 '\0', null, null );
+                 list = av;
+                 continue;
+             }
+
+             break;
+         }
+
+         /* the value is always consumed, even if the name is rejected below */
+         value = parseValue(attribute, false, isempty, delim);
+
+         /* attribute cannot be null here: that case was handled above */
+         if (isValidAttrName(attribute))
+         {
+             av = new AttVal( list, null, null, null,
+                              delim.value, attribute, value );
+             av.dict =
+                 AttributeTable.getDefaultAttributeTable().findAttribute(av);
+             list = av;
+         }
+         else
+         {
+             Report.attrError(this, this.token, value, Report.BAD_ATTRIBUTE_VALUE);
+         }
+     }
+
+     return list;
+ }
+
+ /*
+ push a copy of an inline node onto stack
+ but don't push if implicit or OBJECT or APPLET
+ (implicit tags are ones generated from the istack)
+
+ One issue arises with pushing inlines when
+ the tag is already pushed. For instance:
+
+ <p><em>text
+ <p><em>more text
+
+ Shouldn't be mapped to
+
+ <p><em>text</em></p>
+ <p><em><em>more text</em></em>
+ */
+ public void pushInline( Node node )
+ {
+ IStack is;
+
+ if (node.implicit)
+ return;
+
+ if (node.tag == null)
+ return;
+
+ if ((node.tag.model & Dict.CM_INLINE) == 0 )
+ return;
+
+ if ((node.tag.model & Dict.CM_OBJECT) != 0)
+ return;
+
+ if (node.tag != configuration.tt.tagFont && isPushed(node))
+ return;
+
+ // make sure there is enough space for the stack
+ is = new IStack();
+ is.tag = node.tag;
+ is.element = node.element;
+ if (node.attributes != null)
+ is.attributes = cloneAttributes(node.attributes);
+ this.istack.push( is );
+ }
+
+ /* pop inline stack */
+ public void popInline( Node node )
+ {
+ AttVal av;
+ IStack is;
+
+ if (node != null) {
+
+ if (node.tag == null)
+ return;
+
+ if ((node.tag.model & Dict.CM_INLINE) == 0)
+ return;
+
+ if ((node.tag.model & Dict.CM_OBJECT) != 0)
+ return;
+
+ // if node is then pop until we find an
+ if (node.tag == configuration.tt.tagA) {
+
+ while (this.istack.size() > 0) {
+ is = (IStack)this.istack.pop();
+ if (is.tag == configuration.tt.tagA) {
+ break;
+ }
+ }
+
+ if (this.insert >= this.istack.size())
+ this.insert = -1;
+ return;
+ }
+ }
+
+ if (this.istack.size() > 0) {
+ is = (IStack)this.istack.pop();
+ if (this.insert >= this.istack.size())
+ this.insert = -1;
+ }
+ }
+
+ public boolean isPushed( Node node )
+ {
+ int i;
+ IStack is;
+
+ for (i = this.istack.size() - 1; i >= 0; --i) {
+ is = (IStack)this.istack.elementAt(i);
+ if (is.tag == node.tag)
+ return true;
+ }
+
+ return false;
+ }
+
+ /*
+ This has the effect of inserting "missing" inline
+ elements around the contents of blocklevel elements
+ such as P, TD, TH, DIV, PRE etc. This procedure is
+ called at the start of ParseBlock, when the inline
+ stack is not empty, as will be the case in:
+
+ <i><h1>italic heading</h1></i>
+
+ which is then treated as equivalent to
+
+ <h1><i>italic heading</i></h1>
+
+ This is implemented by setting the lexer into a mode
+ where it gets tokens from the inline stack rather than
+ from the input stream.
+ */
+ public int inlineDup( Node node )
+ {
+ int n;
+
+ n = this.istack.size() - this.istackbase;
+ if ( n > 0 ) {
+ this.insert = this.istackbase;
+ this.inode = node;
+ }
+
+ return n;
+ }
+
+ public Node insertedToken()
+ {
+ Node node;
+ IStack is;
+ int n;
+
+ // this will only be null if inode != null
+ if (this.insert == -1) {
+ node = this.inode;
+ this.inode = null;
+ return node;
+ }
+
+ // is this is the "latest" node then update
+ // the position, otherwise use current values
+
+ if (this.inode == null) {
+ this.lines = this.in.curline;
+ this.columns = this.in.curcol;
+ }
+
+ node = newNode(Node.StartTag,
+ this.lexbuf,
+ this.txtstart,
+ this.txtend); // GLP: Bugfix 126261. Remove when this change
+ // is fixed in istack.c in the original Tidy
+ node.implicit = true;
+ is = (IStack)this.istack.elementAt( this.insert );
+ node.element = is.element;
+ node.tag = is.tag;
+ if (is.attributes != null)
+ node.attributes = cloneAttributes(is.attributes);
+
+ // advance lexer to next item on the stack
+ n = this.insert;
+
+ // and recover state if we have reached the end
+ if (++n < this.istack.size() ) {
+ this.insert = n;
+ } else {
+ this.insert = -1;
+ }
+
+ return node;
+ }
+
+ /**
+  * Case-insensitive equality test with a strcmp-like result.
+  * AQ: Try this for speed optimization.
+  *
+  * @return 0 if the strings are equal ignoring case, 1 otherwise
+  */
+ public static int wstrcasecmp(String s1, String s2)
+ {
+     if (s1.equalsIgnoreCase(s2))
+     {
+         return 0;
+     }
+
+     return 1;
+ }
+
+ /**
+  * Case-insensitive lexicographic comparison (ASCII case folding via
+  * toLower).
+  *
+  * The first position where the folded characters differ decides the
+  * result, and it is decided on the folded characters. Previously the
+  * raw characters were compared at that position, so that e.g. "a"
+  * sorted after "B" although "b" and "B" compare equal.
+  * If one string is a prefix of the other, the shorter one sorts first.
+  *
+  * @return 0 if equal ignoring case, -1 if s1 sorts before s2, 1 if
+  *         s1 sorts after s2
+  */
+ public static int wstrcaselexcmp(String s1, String s2)
+ {
+     final int len1 = s1.length();
+     final int len2 = s2.length();
+     final int common = Math.min(len1, len2);
+
+     for (int i = 0; i < common; i++)
+     {
+         char a = toLower(s1.charAt(i));
+         char b = toLower(s2.charAt(i));
+
+         if (a != b)
+         {
+             return (a > b ? 1 : -1);
+         }
+     }
+
+     if (len1 == len2)
+     {
+         return 0;
+     }
+
+     return (len1 < len2 ? -1 : 1);
+ }
+
+ /**
+  * Case-insensitive substring test.
+  *
+  * Each candidate position is compared over exactly s2.length()
+  * characters. Previously s2 was compared with the whole remainder of
+  * s1, so only a match at the very end of s1 was ever found.
+  *
+  * @param s1 the string to search in
+  * @param s2 the string to look for
+  * @return true if s2 occurs somewhere in s1, ignoring case; an empty s2
+  *         is always found
+  */
+ public static boolean wsubstr(String s1, String s2)
+ {
+     final int len1 = s1.length();
+     final int len2 = s2.length();
+
+     for (int i = 0; i <= len1 - len2; ++i)
+     {
+         if (s1.regionMatches(true, i, s2, 0, len2))
+         {
+             return true;
+         }
+     }
+
+     return false;
+ }
+
+ public boolean canPrune(Node element)
+ {
+ if (element.type == Node.TextNode)
+ return true;
+
+ if (element.content != null)
+ return false;
+
+ if (element.tag == configuration.tt.tagA && element.attributes != null)
+ return false;
+
+ if (element.tag == configuration.tt.tagP && !this.configuration.DropEmptyParas)
+ return false;
+
+ if (element.tag == null)
+ return false;
+
+ if ((element.tag.model & Dict.CM_ROW) != 0)
+ return false;
+
+ if (element.tag == configuration.tt.tagApplet)
+ return false;
+
+ if (element.tag == configuration.tt.tagObject)
+ return false;
+
+ if (element.attributes != null &&
+ (element.getAttrByName("id") != null ||
+ element.getAttrByName("name") != null) )
+ return false;
+
+ return true;
+ }
+
+ /**
+  * Duplicates the name attribute as an id.
+  *
+  * If the node has both attributes and their values differ,
+  * ID_NAME_MISMATCH is reported. If it has only a name and XmlOut is
+  * set, an id with the same value is added.
+  *
+  * The values are compared null-safely: an id attribute without a value
+  * previously caused a NullPointerException here.
+  *
+  * @param node the element to check
+  */
+ public void fixId(Node node)
+ {
+     AttVal name = node.getAttrByName("name");
+     AttVal id = node.getAttrByName("id");
+
+     if (name == null)
+     {
+         return;
+     }
+
+     if (id != null)
+     {
+         /* either value may be null when the attribute had no "=value" part */
+         boolean same = (id.value == null)
+             ? (name.value == null)
+             : id.value.equals(name.value);
+
+         if (!same)
+         {
+             Report.attrError(this, node, "name", Report.ID_NAME_MISMATCH);
+         }
+     }
+     else if (this.configuration.XmlOut)
+     {
+         node.addAttribute("id", name.value);
+     }
+ }
+
+ /*
+ defer duplicates when entering a table or other
+ element where the inlines shouldn't be duplicated
+ */
+ /**
+  * Cancels any pending replay of the inline stack: forgets the
+  * remembered node and resets the insert position to -1.
+  */
+ public void deferDup()
+ {
+     this.inode = null;
+     this.insert = -1;
+ }
+
+ /* Private methods and fields */
+
+ /* lexer char types: bit flags, combined per character in lexmap */
+ private static final short DIGIT = 1; /* '0'-'9' */
+ private static final short LETTER = 2; /* 'a'-'z' and 'A'-'Z' */
+ private static final short NAMECHAR = 4; /* letters, digits and "-.:_" */
+ private static final short WHITE = 8; /* space, tab, CR, LF and FF */
+ private static final short NEWLINE = 16; /* CR, LF and FF */
+ private static final short LOWERCASE = 32; /* 'a'-'z' */
+ private static final short UPPERCASE = 64; /* 'A'-'Z' */
+
+ /* lexer GetToken states */
+
+ private static final short LEX_CONTENT = 0; /* reading element content (text) */
+ private static final short LEX_GT = 1; /* presumably: '<' has just been seen - its case starts outside the visible part, confirm */
+ private static final short LEX_ENDTAG = 2; /* skipping to the '>' of an end tag */
+ private static final short LEX_STARTTAG = 3; /* at the first letter of a tag name */
+ private static final short LEX_COMMENT = 4; /* inside a comment, looking for "-->" */
+ private static final short LEX_DOCTYPE = 5; /* inside a doctype, looking for '>' */
+ private static final short LEX_PROCINSTR = 6; /* inside a processing instruction, looking for '>' ("?>" with XmlPIs) */
+ private static final short LEX_ENDCOMMENT = 7; /* NOTE(review): not referenced in the visible part of this file */
+ private static final short LEX_CDATA = 8; /* inside a CDATA section, looking for "]]>" */
+ private static final short LEX_SECTION = 9; /* inside a marked section, looking for "]>" */
+ private static final short LEX_ASP = 10; /* inside "<%", looking for "%>" */
+ private static final short LEX_JSTE = 11; /* inside "<#", looking for "#>" */
+ private static final short LEX_PHP = 12; /* inside a php instruction, looking for "?>" */
+
+ /* used to classify chars for lexical purposes; indexed by char code, ASCII (0-127) only */
+ private static short[] lexmap = new short[128];
+
+ private static void mapStr(String str, short code)
+ {
+ int j;
+
+ for ( int i = 0; i < str.length(); i++ ) {
+ j = (int)str.charAt(i);
+ lexmap[j] |= code;
+ }
+ }
+
+ static {
+ mapStr("\r\n\f", (short)(NEWLINE|WHITE));
+ mapStr(" \t", WHITE);
+ mapStr("-.:_", NAMECHAR);
+ mapStr("0123456789", (short)(DIGIT|NAMECHAR));
+ mapStr("abcdefghijklmnopqrstuvwxyz", (short)(LOWERCASE|LETTER|NAMECHAR));
+ mapStr("ABCDEFGHIJKLMNOPQRSTUVWXYZ", (short)(UPPERCASE|LETTER|NAMECHAR));
+ }
+
+ /**
+  * Looks up the char-type flags of c.
+  *
+  * @return the lexmap entry for char codes below 128, 0 for all others
+  */
+ private static short MAP( char c )
+ {
+     if ((int)c < 128)
+     {
+         return lexmap[(int)c];
+     }
+
+     return 0;
+ }
+
+ /** Returns true if c carries the WHITE flag in the char map. */
+ private static boolean isWhite(char c)
+ {
+     return (MAP(c) & WHITE) != 0;
+ }
+
+ /** Returns true if c carries the DIGIT flag in the char map. */
+ private static boolean isDigit(char c)
+ {
+     return (MAP(c) & DIGIT) != 0;
+ }
+
+ /** Returns true if c carries the LETTER flag in the char map. */
+ private static boolean isLetter(char c)
+ {
+     return (MAP(c) & LETTER) != 0;
+ }
+
+ /**
+  * Folds c to lower case if the char map flags it as UPPERCASE;
+  * any other character is returned unchanged.
+  */
+ private static char toLower(char c)
+ {
+     boolean upper = (MAP(c) & UPPERCASE) != 0;
+
+     if (!upper)
+     {
+         return c;
+     }
+
+     return (char)( (int)c + ((int)'a' - (int)'A') );
+ }
+
+ /**
+  * Folds c to upper case if the char map flags it as LOWERCASE;
+  * any other character is returned unchanged.
+  */
+ private static char toUpper(char c)
+ {
+     boolean lower = (MAP(c) & LOWERCASE) != 0;
+
+     if (!lower)
+     {
+         return c;
+     }
+
+     return (char)( (int)c - ((int)'a' - (int)'A') );
+ }
+
+ public static char foldCase(char c, boolean tocaps, boolean xmlTags)
+ {
+ short m;
+
+ if (!xmlTags)
+ {
+ m = MAP(c);
+
+ if (tocaps)
+ {
+ if ((m & LOWERCASE) != 0)
+ c = (char)( (int)c + (int)'A' - (int)'a' );
+ }
+ else /* force to lower case */
+ {
+ if ((m & UPPERCASE) != 0)
+ c = (char)( (int)c + (int)'a' - (int)'A' );
+ }
+ }
+
+ return c;
+ }
+
+
+ /**
+  * One row of the W3C version table: a version name, its "voyager"
+  * name, the associated profile string and a version code.
+  */
+ private static class W3CVersionInfo
+ {
+     /* version name */
+     String name;
+     /* matching voyager name */
+     String voyagerName;
+     /* profile string */
+     String profile;
+     /* version code */
+     short code;
+
+     /** Creates a row from its four column values. */
+     public W3CVersionInfo( String name,
+                            String voyagerName,
+                            String profile,
+                            short code )
+     {
+         this.code = code;
+         this.profile = profile;
+         this.voyagerName = voyagerName;
+         this.name = name;
+     }
+ }
+
+ /* the 3 URIs for the XHTML 1.0 DTDs; used as the profile column of the W3CVersion table below */
+ private static final String voyager_loose = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd";
+ private static final String voyager_strict = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd";
+ private static final String voyager_frameset = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd";
+
+ private static final String XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"; /* not referenced in the visible part of this file */
+
+ private static Lexer.W3CVersionInfo[] W3CVersion = /* columns: name, voyagerName, profile (DTD URI), Dict version code */
+ {
+ new W3CVersionInfo("HTML 4.01",
+ "XHTML 1.0 Strict",
+ voyager_strict,
+ Dict.VERS_HTML40_STRICT),
+ new W3CVersionInfo("HTML 4.01 Transitional",
+ "XHTML 1.0 Transitional",
+ voyager_loose,
+ Dict.VERS_HTML40_LOOSE),
+ new W3CVersionInfo("HTML 4.01 Frameset",
+ "XHTML 1.0 Frameset",
+ voyager_frameset,
+ Dict.VERS_FRAMES),
+ new W3CVersionInfo("HTML 4.0", /* the 4.0 rows repeat the 4.01 rows under the older name */
+ "XHTML 1.0 Strict",
+ voyager_strict,
+ Dict.VERS_HTML40_STRICT),
+ new W3CVersionInfo("HTML 4.0 Transitional",
+ "XHTML 1.0 Transitional",
+ voyager_loose,
+ Dict.VERS_HTML40_LOOSE),
+ new W3CVersionInfo("HTML 4.0 Frameset",
+ "XHTML 1.0 Frameset",
+ voyager_frameset,
+ Dict.VERS_FRAMES),
+ new W3CVersionInfo("HTML 3.2", /* paired with the transitional DTD */
+ "XHTML 1.0 Transitional",
+ voyager_loose,
+ Dict.VERS_HTML32),
+ new W3CVersionInfo("HTML 2.0", /* paired with the strict DTD */
+ "XHTML 1.0 Strict",
+ voyager_strict,
+ Dict.VERS_HTML20)
+ };
+
+}