(translation to Java)
- * @version 1.0, 1999/05/22
- * @version 1.0.1, 1999/05/29
- * @version 1.1, 1999/06/18 Java Bean
- * @version 1.2, 1999/07/10 Tidy Release 7 Jul 1999
- * @version 1.3, 1999/07/30 Tidy Release 26 Jul 1999
- * @version 1.4, 1999/09/04 DOM support
- * @version 1.5, 1999/10/23 Tidy Release 27 Sep 1999
- * @version 1.6, 1999/11/01 Tidy Release 22 Oct 1999
- * @version 1.7, 1999/12/06 Tidy Release 30 Nov 1999
- * @version 1.8, 2000/01/22 Tidy Release 13 Jan 2000
- * @version 1.9, 2000/06/03 Tidy Release 30 Apr 2000
- * @version 1.10, 2000/07/22 Tidy Release 8 Jul 2000
- * @version 1.11, 2000/08/16 Tidy Release 4 Aug 2000
- */
-
-/*
- Given a file stream fp it returns a sequence of tokens.
-
- GetToken(fp) gets the next token
- UngetToken(fp) provides one level undo
-
- The tags include an attribute list:
-
- - linked list of attribute/value nodes
- - each node has 2 null-terminated strings.
- - entities are replaced in attribute values
-
- white space is compacted if not in preformatted mode
- If not in preformatted mode then leading white space
- is discarded and subsequent white space sequences
- compacted to single space chars.
-
- If XmlTags is no then Tag names are folded to upper
- case and attribute names to lower case.
-
- Not yet done:
- - Doctype subset and marked sections
-*/
-
-import java.io.PrintWriter;
-import java.util.Stack;
-import java.util.Vector;
-
-import org.eclipse.core.resources.IFile;
-import sun.security.krb5.internal.av;
-
-public class Lexer {
-
- private IFile iFile; /* resource passed to the constructor; returned by getIFile() */
- public StreamIn in; /* file stream */
- public PrintWriter errout; /* error output stream */
- public short badAccess; /* for accessibility errors */
- public short badLayout; /* for bad style errors */
- public short badChars; /* for bad char encodings */
- public short badForm; /* for mismatched/mispositioned form tags */
- public short warnings; /* count of warnings in this document */
- public short errors; /* count of errors */
- public int lines; /* lines seen */
- public int columns; /* at start of current token */
- public boolean waswhite; /* used to collapse contiguous white space */
- public boolean pushed; /* true after token has been pushed back */
- public boolean insertspace; /* when space is moved after end tag */
- public boolean excludeBlocks; /* Netscape compatibility */
- public boolean exiled; /* true if moved out of table */
- public boolean isvoyager; /* true if xmlns attribute on html element */
- public short versions; /* bit vector of HTML versions */
- public int doctype; /* version as given by doctype (if any) */
- public boolean badDoctype; /* e.g. if html or PUBLIC is missing */
- public int txtstart; /* start of current node */
- public int txtend; /* end of current node */
- public short state; /* state of lexer's finite state machine */
- public Node token; /* most recently produced token; returned again by getToken() after ungetToken() */
-
- /*
- lexer character buffer
-
- parse tree nodes span onto this buffer
- which contains the concatenated text
- contents of all of the elements.
-
- lexsize must be reset for each file.
-
- addByte() replaces the array with a larger one when it fills up and
- re-points every node registered in nodeList at the new array.
- */
- public byte[] lexbuf; /* byte buffer of UTF-8 chars */
- public int lexlength; /* allocated */
- public int lexsize; /* used */
-
- /* Inline stack for compatibility with Mosaic */
- public Node inode; /* for deferring text node */
- public int insert; /* for inferring inline tags */
- public Stack istack; /* created empty by the constructor */
- public int istackbase; /* start of frame */
-
- public Style styles; /* used for cleaning up presentation markup */
-
- public Configuration configuration; /* settings passed to the constructor */
- protected int seenBodyEndTag; /* used by parser */
- private Vector nodeList; /* nodes handed out by newNode()/cloneNode(); scanned by updateNodeTextArrays() */
-
- /**
-  * Creates a lexer that reads from the given stream.
-  *
-  * @param iFile resource kept for later retrieval through getIFile()
-  * @param in stream the characters are read from
-  * @param configuration settings consulted while lexing
-  */
- public Lexer(IFile iFile, StreamIn in, Configuration configuration)
- {
-     /* collaborators supplied by the caller */
-     this.iFile = iFile;
-     this.in = in;
-     this.configuration = configuration;
-
-     /* input position and state machine */
-     this.lines = 1;
-     this.columns = 1;
-     this.state = LEX_CONTENT;
-     this.waswhite = false;
-     this.pushed = false;
-     this.insertspace = false;
-     this.token = null;
-
-     /* diagnostics counters */
-     this.badAccess = 0;
-     this.badLayout = 0;
-     this.badChars = 0;
-     this.badForm = 0;
-     this.warnings = 0;
-     this.errors = 0;
-
-     /* document version tracking */
-     this.exiled = false;
-     this.isvoyager = false;
-     this.versions = Dict.VERS_EVERYTHING;
-     this.doctype = Dict.VERS_UNKNOWN;
-     this.badDoctype = false;
-
-     /* character buffer starts out unallocated; addByte() creates it on demand */
-     this.lexbuf = null;
-     this.lexlength = 0;
-     this.lexsize = 0;
-     this.txtstart = 0;
-     this.txtend = 0;
-
-     /* inline stack */
-     this.inode = null;
-     this.insert = -1;
-     this.istack = new Stack();
-     this.istackbase = 0;
-
-     this.styles = null;
-     this.seenBodyEndTag = 0;
-     this.nodeList = new Vector();
- }
-
- public IFile getIFile() {
- return iFile;
- }
-
- /**
-  * Creates an empty node and registers it so that its text array
-  * reference can be updated when the lexer buffer is reallocated.
-  *
-  * @return the newly created node
-  */
- public Node newNode()
- {
-     final Node created = new Node();
-     this.nodeList.addElement(created);
-     return created;
- }
-
- /**
-  * Creates a node of the given type spanning textarray[start..end) and
-  * registers it for text array updates.
-  *
-  * @param type node type constant
-  * @param textarray buffer the node's text lives in
-  * @param start offset of the first byte of the node's text
-  * @param end offset just past the node's text
-  * @return the newly created node
-  */
- public Node newNode(short type, byte[] textarray, int start, int end)
- {
-     final Node created = new Node(type, textarray, start, end);
-     this.nodeList.addElement(created);
-     return created;
- }
-
- /**
-  * Creates a node for the named element, passing the configured tag
-  * table to the Node constructor, and registers it for text array updates.
-  *
-  * @param type node type constant
-  * @param textarray buffer the node's text lives in
-  * @param start offset of the first byte of the node's text
-  * @param end offset just past the node's text
-  * @param element element name
-  * @return the newly created node
-  */
- public Node newNode(short type, byte[] textarray, int start, int end, String element)
- {
-     final Node created =
-         new Node(type, textarray, start, end, element, this.configuration.tt);
-     this.nodeList.addElement(created);
-     return created;
- }
-
- /**
-  * Clones a node and registers the clone, together with any asp/php
-  * nodes hanging off its attributes, for text array updates.
-  *
-  * @param node node to copy
-  * @return the registered copy
-  */
- public Node cloneNode(Node node)
- {
-     final Node copy = (Node)node.clone();
-     this.nodeList.addElement(copy);
-
-     AttVal attr = copy.attributes;
-     while (attr != null)
-     {
-         if (attr.asp != null)
-         {
-             this.nodeList.addElement(attr.asp);
-         }
-         if (attr.php != null)
-         {
-             this.nodeList.addElement(attr.php);
-         }
-         attr = attr.next;
-     }
-
-     return copy;
- }
-
- /**
-  * Clones an attribute list and registers the asp/php nodes of the copy
-  * for text array updates.
-  *
-  * @param attrs head of the attribute list; null stands for an empty list
-  * @return head of the copied list, or null when attrs is null
-  */
- public AttVal cloneAttributes(AttVal attrs)
- {
-     /* an element without attributes has a null list; its copy is empty too */
-     if (attrs == null)
-         return null;
-
-     AttVal cattrs = (AttVal)attrs.clone();
-     for (AttVal att = cattrs; att != null; att = att.next) {
-         if (att.asp != null)
-             nodeList.addElement(att.asp);
-         if (att.php != null)
-             nodeList.addElement(att.php);
-     }
-     return cattrs;
- }
-
- protected void updateNodeTextArrays(byte[] oldtextarray, byte[] newtextarray)
- {
- Node node;
- for (int i = 0; i < nodeList.size(); i++) {
- node = (Node)(nodeList.elementAt(i));
- if (node.textarray == oldtextarray)
- node.textarray = newtextarray;
- }
- }
-
- /* used for creating preformatted text from Word2000 */
- /**
-  * Creates a node whose text is a single newline appended to the lexer
-  * buffer.
-  *
-  * @return node spanning the newly appended '\n'
-  */
- public Node newLineNode()
- {
-     Node node = newNode();
-
-     node.start = this.lexsize;
-     addCharToLexer((int)'\n');
-     node.end = this.lexsize;
-
-     /*
-       Take the buffer reference only after the append: if the buffer did
-       not exist yet, addByte() allocates it without re-pointing any node,
-       so a reference captured earlier would stay null.
-     */
-     node.textarray = this.lexbuf;
-     return node;
- }
-
- // Conversion to and from UTF-8 should always be possible, so encoding
- // exceptions are converted to an Error to avoid adding throws
- // declarations to lots of methods.
-
- public static byte[] getBytes(String str) {
- try {
- return str.getBytes("UTF8");
- } catch (java.io.UnsupportedEncodingException e) {
- throw new Error("string to UTF-8 conversion failed: " + e.getMessage());
- }
- }
-
- public static String getString(byte[] bytes, int offset, int length) {
- try {
- return new String(bytes, offset, length, "UTF8");
- } catch (java.io.UnsupportedEncodingException e) {
- throw new Error("UTF-8 to string conversion failed: " + e.getMessage());
- }
- }
-
- /**
-  * @return whatever the input stream reports from isEndOfStream()
-  */
- public boolean endOfInput()
- {
-     final boolean finished = this.in.isEndOfStream();
-     return finished;
- }
-
- public void addByte(int c)
- {
- if (this.lexsize + 1 >= this.lexlength)
- {
- while (this.lexsize + 1 >= this.lexlength)
- {
- if (this.lexlength == 0)
- this.lexlength = 8192;
- else
- this.lexlength = this.lexlength * 2;
- }
-
- byte[] temp = this.lexbuf;
- this.lexbuf = new byte[ this.lexlength ];
- if (temp != null)
- {
- System.arraycopy( temp, 0, this.lexbuf, 0, temp.length );
- updateNodeTextArrays(temp, this.lexbuf);
- }
- }
-
- this.lexbuf[this.lexsize++] = (byte)c;
- this.lexbuf[this.lexsize] = (byte)'\0'; /* debug */
- }
-
- /**
-  * Overwrites the most recently added byte; does nothing while the
-  * buffer is empty.
-  *
-  * @param c replacement byte
-  */
- public void changeChar(byte c)
- {
-     if (this.lexsize <= 0)
-     {
-         return;
-     }
-
-     final int last = this.lexsize - 1;
-     this.lexbuf[last] = c;
- }
-
- /* store char c as UTF-8 encoded byte stream */
- public void addCharToLexer(int c)
- {
- if (c < 128)
- addByte(c);
- else if (c <= 0x7FF)
- {
- addByte(0xC0 | (c >> 6));
- addByte(0x80 | (c & 0x3F));
- }
- else if (c <= 0xFFFF)
- {
- addByte(0xE0 | (c >> 12));
- addByte(0x80 | ((c >> 6) & 0x3F));
- addByte(0x80 | (c & 0x3F));
- }
- else if (c <= 0x1FFFFF)
- {
- addByte(0xF0 | (c >> 18));
- addByte(0x80 | ((c >> 12) & 0x3F));
- addByte(0x80 | ((c >> 6) & 0x3F));
- addByte(0x80 | (c & 0x3F));
- }
- else
- {
- addByte(0xF8 | (c >> 24));
- addByte(0x80 | ((c >> 18) & 0x3F));
- addByte(0x80 | ((c >> 12) & 0x3F));
- addByte(0x80 | ((c >> 6) & 0x3F));
- addByte(0x80 | (c & 0x3F));
- }
- }
-
- public void addStringToLexer(String str)
- {
- for ( int i = 0; i < str.length(); i++ ) {
- addCharToLexer( (int)str.charAt(i) );
- }
- }
-
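- /*
- No longer attempts to insert missing ';' for unknown
- entities unless one was present already, since this
- gives unexpected results.
-
- For example: <a href="something.htm?foo&bar&fred">
- was tidied to: <a href="something.htm?foo&amp;bar;&amp;fred;">
- rather than: <a href="something.htm?foo&amp;bar&amp;fred">
-
- My thanks to Maurice Buxton for spotting this.
- */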
- /**
-  * Reads the rest of an entity reference whose '&amp;' is already the last
-  * byte in the lexer buffer. A recognised reference is replaced in the
-  * buffer by the character it names; an unrecognised one is left as read
-  * and reported.
-  *
-  * @param mode getToken() mode; when its Preformatted bit is set a
-  *        non-breaking space (160) is stored as an ordinary space
-  */
- public void parseEntity(short mode)
- {
-     short map;
-     int c;
-     boolean atFirstChar = true;
-     boolean terminated = false; /* set once the closing ';' has been read */
-     boolean numeric = false;
-
-     final int ampPos = this.lexsize - 1; /* to start at "&" */
-     final int ampCol = this.in.curcol - 1;
-
-     for (;;)
-     {
-         c = this.in.readChar();
-
-         if (c == StreamIn.EndOfStream)
-             break;
-
-         if (c == ';')
-         {
-             terminated = true;
-             break;
-         }
-
-         boolean accepted;
-
-         if (atFirstChar && c == '#')
-         {
-             numeric = true;
-             accepted = true;
-         }
-         else
-         {
-             map = MAP((char)c);
-
-             /* AQ: Added flag for numeric entities so that numeric entities
-                with missing semi-colons are recognized.
-                Eg. "&#114&#101&#112..." is recognized as "rep"
-              */
-             if (numeric)
-                 accepted = (c == 'x') || ((map & DIGIT) != 0);
-             else
-                 accepted = (map & NAMECHAR) != 0;
-         }
-
-         atFirstChar = false;
-
-         if (!accepted)
-         {
-             /* not part of the entity name: put it back */
-             this.in.ungetChar(c);
-             break;
-         }
-
-         addCharToLexer(c);
-     }
-
-     final String name = getString( this.lexbuf, ampPos, this.lexsize - ampPos );
-     int code = EntityTable.getDefaultEntityTable().entityCode( name );
-
-     /* deal with unrecognized entities */
-     if (code <= 0)
-     {
-         /* set error position just before offending character */
-         this.lines = this.in.curline;
-         this.columns = ampCol;
-
-         if (this.lexsize <= ampPos + 1)
-         {
-             /* naked & */
-             Report.entityError(this, Report.UNESCAPED_AMPERSAND, name, code);
-             return;
-         }
-
-         Report.entityError(this, Report.UNKNOWN_ENTITY, name, code);
-
-         if (terminated)
-             addCharToLexer(';');
-
-         return;
-     }
-
-     /* issue warning if not terminated by ';' */
-     if (!terminated)
-     {
-         /* set error position just before offending character */
-         this.lines = this.in.curline;
-         this.columns = ampCol;
-         Report.entityError(this, Report.MISSING_SEMICOLON, name, c);
-     }
-
-     /* drop the reference text and store the character itself */
-     this.lexsize = ampPos;
-
-     if (code == 160 && (mode & Preformatted) != 0)
-         code = ' ';
-
-     addCharToLexer(code);
-
-     if (code == '&' && !this.configuration.QuoteAmpersand)
-     {
-         final String rest = "amp;";
-         for (int k = 0; k < rest.length(); k++)
-             addCharToLexer(rest.charAt(k));
-     }
- }
-
- /**
-  * Reads the remaining name characters of a tag whose first character is
-  * already in the buffer at txtstart. Unless XmlTags is configured,
-  * upper case characters are folded to lower case. Sets txtend to the
-  * end of the name.
-  *
-  * @return the character that ended the name, narrowed to char
-  */
- public char parseTagName()
- {
-     final int toLower = (int)'a' - (int)'A';
-     short map;
-     int c;
-
-     /* fold case of first char in buffer */
-     c = this.lexbuf[this.txtstart];
-     map = MAP((char)c);
-
-     if ((map & UPPERCASE) != 0 && !this.configuration.XmlTags)
-     {
-         c += toLower;
-         this.lexbuf[this.txtstart] = (byte)c;
-     }
-
-     for (;;)
-     {
-         c = this.in.readChar();
-
-         if (c == StreamIn.EndOfStream)
-             break;
-
-         map = MAP((char)c);
-
-         if ((map & NAMECHAR) == 0)
-             break;
-
-         /* fold case of subsequent chars */
-         if ((map & UPPERCASE) != 0 && !this.configuration.XmlTags)
-             c += toLower;
-
-         addCharToLexer(c);
-     }
-
-     this.txtend = this.lexsize;
-     return (char)c;
- }
-
- /**
-  * Appends every character of a string to the lexer buffer.
-  *
-  * This used to be a line-for-line copy of addStringToLexer(); it now
-  * delegates to that method so the two cannot drift apart.
-  *
-  * @param str text to append
-  */
- public void addStringLiteral(String str)
- {
-     addStringToLexer(str);
- }
-
- /* choose what version to use for new doctype */
- /**
-  * Picks the version to use for a new doctype: the first of HTML 2.0,
-  * HTML 3.2, HTML 4.0 strict, HTML 4.0 loose and frameset whose bit is
-  * still set in the versions bit vector.
-  *
-  * @return the chosen version constant, or Dict.VERS_UNKNOWN if none fits
-  */
- public short HTMLVersion()
- {
-     final short candidates = this.versions;
-     final short[] preference = {
-         Dict.VERS_HTML20,
-         Dict.VERS_HTML32,
-         Dict.VERS_HTML40_STRICT,
-         Dict.VERS_HTML40_LOOSE,
-         Dict.VERS_FRAMES
-     };
-
-     for (int k = 0; k < preference.length; k++)
-     {
-         if ((candidates & preference[k]) != 0)
-             return preference[k];
-     }
-
-     return Dict.VERS_UNKNOWN;
- }
-
- public String HTMLVersionName()
- {
- short guessed;
- int j;
-
- guessed = apparentVersion();
-
- for (j = 0; j < W3CVersion.length; ++j)
- {
- if (guessed == W3CVersion[j].code)
- {
- if (this.isvoyager)
- return W3CVersion[j].voyagerName;
-
- return W3CVersion[j].name;
- }
- }
-
- return null;
- }
-
- /* add meta element for Tidy */
- /**
-  * Inserts a generator meta element at the start of the document head,
-  * unless the head already has a meta element named "generator" whose
-  * content starts with "HTML Tidy".
-  *
-  * @param root root of the parse tree
-  * @return true if the element was added; false if there is no head or
-  *         a Tidy generator element is already present
-  */
- public boolean addGenerator(Node root)
- {
-     final Node head = root.findHEAD(configuration.tt);
-
-     if (head == null)
-         return false;
-
-     for (Node child = head.content; child != null; child = child.next)
-     {
-         if (child.tag != configuration.tt.tagMeta)
-             continue;
-
-         AttVal attval = child.getAttrByName("name");
-
-         if (attval == null || attval.value == null ||
-             Lexer.wstrcasecmp(attval.value, "generator") != 0)
-             continue;
-
-         attval = child.getAttrByName("content");
-
-         if (attval == null || attval.value == null)
-             continue;
-
-         if (attval.value.length() >= 9 &&
-             Lexer.wstrcasecmp(attval.value.substring(0, 9), "HTML Tidy") == 0)
-         {
-             return false;
-         }
-     }
-
-     final Node meta = this.inferredTag("meta");
-     meta.addAttribute("content", "HTML Tidy, see www.w3.org");
-     meta.addAttribute("name", "generator");
-     Node.insertNodeAtStart(head, meta);
-     return true;
- }
-
- /* return true if substring s is in p and isn't all in upper case */
- /* this is used to check the case of SYSTEM, PUBLIC, DTD and EN */
- /* len is how many chars to check in p */
- private static boolean findBadSubString(String s, String p, int len)
- {
- int n = s.length();
- int i = 0;
- String ps;
-
- while (n < len)
- {
- ps = p.substring(i, i + n);
- if (wstrcasecmp(s, ps) == 0)
- return (!ps.equals(s.substring(0, n)));
-
- ++i;
- --len;
- }
-
- return false;
- }
-
- /**
-  * Checks the spelling of the keywords SYSTEM, PUBLIC, //DTD, //W3C and
-  * //EN inside a doctype, using findBadSubString() for each in turn.
-  *
-  * @param doctype doctype node whose text lies in the lexer buffer
-  * @return false as soon as one keyword is reported as badly spelled,
-  *         true otherwise
-  */
- public boolean checkDocTypeKeyWords(Node doctype)
- {
-     final int len = doctype.end - doctype.start;
-     final String text = getString(this.lexbuf, doctype.start, len);
-     final String[] keywords = { "SYSTEM", "PUBLIC", "//DTD", "//W3C", "//EN" };
-
-     for (int k = 0; k < keywords.length; k++)
-     {
-         if (findBadSubString(keywords[k], text, len))
-             return false;
-     }
-
-     return true;
- }
-
- /* examine <!DOCTYPE> to identify version */
- /**
-  * Works out which HTML version a doctype declares.
-  *
-  * The text must start with "html "; a SYSTEM or PUBLIC keyword that
-  * follows is rewritten in upper case in the buffer. For a PUBLIC
-  * doctype the first quoted identifier is compared with the names in
-  * the W3CVersion table.
-  *
-  * Every fixed-length read is checked against doctype.end first, so a
-  * short or truncated doctype can no longer read past the node (or past
-  * the end of the buffer, which made getString() throw).
-  *
-  * @param doctype doctype node whose text lies in the lexer buffer
-  * @return the matching version code, or 0 if it is not recognised
-  */
- public short findGivenVersion(Node doctype)
- {
-     final int start = doctype.start;
-     final int end = doctype.end;
-     String p, s;
-     int i, j;
-     int len;
-     String keyword;
-
-     /* if root tag for doctype isn't html give up now */
-     if (end - start < 5)
-         return 0;
-
-     if (wstrcasecmp(getString(this.lexbuf, start, 5), "html ") != 0)
-         return 0;
-
-     if (!checkDocTypeKeyWords(doctype))
-         Report.warning(this, doctype, null, Report.DTYPE_NOT_UPPER_CASE);
-
-     /* the 7 bytes after "html " hold "SYSTEM " or "PUBLIC ", if present at all */
-     keyword = (end - start >= 12) ? getString(this.lexbuf, start + 5, 7) : null;
-
-     /* give up if all we are given is the system id for the doctype */
-     if (keyword != null && wstrcasecmp(keyword, "SYSTEM ") == 0)
-     {
-         /* but at least ensure the case is correct */
-         if (!keyword.substring(0, 6).equals("SYSTEM"))
-             System.arraycopy( getBytes("SYSTEM"), 0,
-                               this.lexbuf, start + 5, 6 );
-         return 0; /* unrecognized */
-     }
-
-     if (keyword != null && wstrcasecmp(keyword, "PUBLIC ") == 0)
-     {
-         if (!keyword.substring(0, 6).equals("PUBLIC"))
-             System.arraycopy( getBytes("PUBLIC "), 0,
-                               this.lexbuf, start + 5, 6 );
-     }
-     else
-         this.badDoctype = true;
-
-     for (i = start; i < end; ++i)
-     {
-         if (this.lexbuf[i] != (byte)'"')
-             continue;
-
-         /* only the first quoted string is examined */
-         if (i + 13 <= end &&
-             getString( this.lexbuf, i + 1, 12 ).equals("-//W3C//DTD "))
-         {
-             /* compute length of identifier e.g. "HTML 4.0 Transitional" */
-             for (j = i + 13; j < end && this.lexbuf[j] != (byte)'/'; ++j);
-             len = j - i - 13;
-             p = getString( this.lexbuf, i + 13, len );
-
-             for (j = 1; j < W3CVersion.length; ++j)
-             {
-                 s = W3CVersion[j].name;
-                 if (len == s.length() && s.equals(p))
-                     return W3CVersion[j].code;
-             }
-
-             /* else unrecognized version */
-         }
-         else if (i + 14 <= end &&
-                  getString( this.lexbuf, i + 1, 13 ).equals("-//IETF//DTD "))
-         {
-             /* compute length of identifier e.g. "HTML 2.0" */
-             for (j = i + 14; j < end && this.lexbuf[j] != (byte)'/'; ++j);
-             len = j - i - 14;
-
-             p = getString( this.lexbuf, i + 14, len );
-             s = W3CVersion[0].name;
-             if (len == s.length() && s.equals(p))
-                 return W3CVersion[0].code;
-
-             /* else unrecognized version */
-         }
-         break;
-     }
-
-     return 0;
- }
-
- /**
-  * Makes sure the html element carries an xmlns attribute equal to the
-  * given profile. A differing value is reported and replaced; a missing
-  * attribute is added at the front of the attribute list. Nothing
-  * happens when root has no html child.
-  *
-  * An xmlns attribute without a value used to cause a
-  * NullPointerException; it is now treated as a mismatch. The unused
-  * "prev" bookkeeping of the earlier version has been dropped.
-  *
-  * @param root root of the parse tree
-  * @param profile namespace the html element should declare
-  */
- public void fixHTMLNameSpace(Node root, String profile)
- {
-     Node node;
-     AttVal attr;
-
-     for (node = root.content;
-             node != null && node.tag != configuration.tt.tagHtml; node = node.next);
-
-     if (node == null)
-         return;
-
-     for (attr = node.attributes; attr != null; attr = attr.next)
-     {
-         if ("xmlns".equals(attr.attribute))
-             break;
-     }
-
-     if (attr != null)
-     {
-         boolean matches = (attr.value == null) ? (profile == null)
-                                                : attr.value.equals(profile);
-         if (!matches)
-         {
-             Report.warning(this, node, null, Report.INCONSISTENT_NAMESPACE);
-             attr.value = profile;
-         }
-     }
-     else
-     {
-         attr = new AttVal( node.attributes, null, (int)'"',
-                            "xmlns", profile );
-         attr.dict =
-             AttributeTable.getDefaultAttributeTable().findAttribute( attr );
-         node.attributes = attr;
-     }
- }
-
- /**
-  * Gives the document an XHTML 1.0 doctype chosen by the configured
-  * doctype mode, and fixes the namespace on the html element.
-  *
-  * Fixes over the earlier version: when a doctype node has to be
-  * created, the former first child now gets its prev link set (as
-  * fixXMLPI() does); an empty user supplied doctype string no longer
-  * makes charAt(0) throw; and the node's text array is set after the
-  * text has been appended, so it is valid even if the buffer was only
-  * allocated by that append.
-  *
-  * @param root root of the parse tree
-  * @return true when the doctype mode is "omit" (any doctype is
-  *         discarded), false otherwise
-  */
- public boolean setXHTMLDocType(Node root)
- {
-     String fpi = " ";
-     String sysid = "";
-     String namespace = XHTML_NAMESPACE;
-     Node doctype;
-
-     doctype = root.findDocType();
-
-     if (configuration.docTypeMode == Configuration.DOCTYPE_OMIT)
-     {
-         if (doctype != null)
-             Node.discardElement(doctype);
-         return true;
-     }
-
-     if (configuration.docTypeMode == Configuration.DOCTYPE_AUTO)
-     {
-         /* see what flavor of XHTML this document matches */
-         if ((this.versions & Dict.VERS_HTML40_STRICT) != 0)
-         { /* use XHTML strict */
-             fpi = "-//W3C//DTD XHTML 1.0 Strict//EN";
-             sysid = voyager_strict;
-         }
-         else if ((this.versions & Dict.VERS_LOOSE) != 0)
-         {
-             fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
-             sysid = voyager_loose;
-         }
-         else if ((this.versions & Dict.VERS_FRAMES) != 0)
-         { /* use XHTML frames */
-             fpi = "-//W3C//DTD XHTML 1.0 Frameset//EN";
-             sysid = voyager_frameset;
-         }
-         else /* lets assume XHTML transitional */
-         {
-             fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
-             sysid = voyager_loose;
-         }
-     }
-     else if (configuration.docTypeMode == Configuration.DOCTYPE_STRICT)
-     {
-         fpi = "-//W3C//DTD XHTML 1.0 Strict//EN";
-         sysid = voyager_strict;
-     }
-     else if (configuration.docTypeMode == Configuration.DOCTYPE_LOOSE)
-     {
-         fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
-         sysid = voyager_loose;
-     }
-
-     fixHTMLNameSpace(root, namespace);
-
-     if (doctype == null)
-     {
-         doctype = newNode(Node.DocTypeTag, this.lexbuf, 0, 0);
-         doctype.next = root.content;
-         doctype.parent = root;
-         doctype.prev = null;
-
-         /* keep the sibling list consistent in both directions */
-         if (root.content != null)
-             root.content.prev = doctype;
-
-         root.content = doctype;
-     }
-
-     if (configuration.docTypeMode == Configuration.DOCTYPE_USER &&
-         configuration.docTypeStr != null)
-     {
-         fpi = configuration.docTypeStr;
-         sysid = "";
-     }
-
-     this.txtstart = this.lexsize;
-     this.txtend = this.lexsize;
-
-     /* add public identifier */
-     addStringLiteral("html PUBLIC ");
-
-     /* check if the fpi is quoted or not */
-     if (fpi.length() > 0 && fpi.charAt(0) == '"')
-         addStringLiteral(fpi);
-     else
-     {
-         addStringLiteral("\"");
-         addStringLiteral(fpi);
-         addStringLiteral("\"");
-     }
-
-     if (sysid.length() + 6 >= this.configuration.wraplen)
-         addStringLiteral("\n\"");
-     else
-         addStringLiteral("\n \"");
-
-     /* add system identifier */
-     addStringLiteral(sysid);
-     addStringLiteral("\"");
-
-     this.txtend = this.lexsize;
-
-     /* start/end index the lexer buffer, so that is the array the node must use */
-     doctype.textarray = this.lexbuf;
-     doctype.start = this.txtstart;
-     doctype.end = this.txtend;
-
-     return false;
- }
-
- /**
-  * Returns the version declared by the doctype if the versions bit
-  * vector still allows it. Without a declared version the result of
-  * HTMLVersion() is returned directly; in every other case an
-  * INCONSISTENT_VERSION warning is reported before falling back to
-  * HTMLVersion().
-  *
-  * @return the version constant that applies to the document
-  */
- public short apparentVersion()
- {
-     final int declared = this.doctype;
-
-     if (declared == Dict.VERS_UNKNOWN)
-         return HTMLVersion();
-
-     final short[] recognised = {
-         Dict.VERS_HTML20,
-         Dict.VERS_HTML32,
-         Dict.VERS_HTML40_STRICT,
-         Dict.VERS_HTML40_LOOSE,
-         Dict.VERS_FRAMES
-     };
-
-     for (int k = 0; k < recognised.length; k++)
-     {
-         if (declared != recognised[k])
-             continue;
-
-         if ((this.versions & recognised[k]) != 0)
-             return recognised[k];
-
-         break; /* declared version is ruled out by the content */
-     }
-
-     Report.warning(this, null, null, Report.INCONSISTENT_VERSION);
-     return this.HTMLVersion();
- }
-
- /* fixup doctype if missing */
- /**
-  * Adds or replaces the doctype according to the configured doctype
-  * mode. Reports MALFORMED_DOCTYPE first if badDoctype is set.
-  *
-  * Fixes over the earlier version: the strict/loose branches check for
-  * a missing doctype before discarding it, as the omit branch already
-  * did; a newly created doctype node is linked in both directions (the
-  * former first child's prev link was left unset); the node's text
-  * array is set after the text has been appended; and the unreachable
-  * VERS_UNKNOWN switch case is gone.
-  *
-  * @param root root of the parse tree
-  * @return false when an existing doctype has an unknown version in
-  *         auto mode, or when no version could be guessed; true otherwise
-  */
- public boolean fixDocType(Node root)
- {
-     Node doctype;
-     int guessed = Dict.VERS_HTML40_STRICT, i;
-
-     if (this.badDoctype)
-         Report.warning(this, null, null, Report.MALFORMED_DOCTYPE);
-
-     if (configuration.XmlOut)
-         return true;
-
-     doctype = root.findDocType();
-
-     if (configuration.docTypeMode == Configuration.DOCTYPE_OMIT)
-     {
-         if (doctype != null)
-             Node.discardElement(doctype);
-         return true;
-     }
-
-     if (configuration.docTypeMode == Configuration.DOCTYPE_STRICT)
-     {
-         if (doctype != null)
-             Node.discardElement(doctype);
-         doctype = null;
-         guessed = Dict.VERS_HTML40_STRICT;
-     }
-     else if (configuration.docTypeMode == Configuration.DOCTYPE_LOOSE)
-     {
-         if (doctype != null)
-             Node.discardElement(doctype);
-         doctype = null;
-         guessed = Dict.VERS_HTML40_LOOSE;
-     }
-     else if (configuration.docTypeMode == Configuration.DOCTYPE_AUTO)
-     {
-         if (doctype != null)
-         {
-             if (this.doctype == Dict.VERS_UNKNOWN)
-                 return false;
-
-             /* keep the existing doctype when the content is consistent with it */
-             switch (this.doctype)
-             {
-             case Dict.VERS_HTML20:
-                 if ((this.versions & Dict.VERS_HTML20) != 0)
-                     return true;
-
-                 break; /* to replace old version by new */
-
-             case Dict.VERS_HTML32:
-                 if ((this.versions & Dict.VERS_HTML32) != 0)
-                     return true;
-
-                 break; /* to replace old version by new */
-
-             case Dict.VERS_HTML40_STRICT:
-                 if ((this.versions & Dict.VERS_HTML40_STRICT) != 0)
-                     return true;
-
-                 break; /* to replace old version by new */
-
-             case Dict.VERS_HTML40_LOOSE:
-                 if ((this.versions & Dict.VERS_HTML40_LOOSE) != 0)
-                     return true;
-
-                 break; /* to replace old version by new */
-
-             case Dict.VERS_FRAMES:
-                 if ((this.versions & Dict.VERS_FRAMES) != 0)
-                     return true;
-
-                 break; /* to replace old version by new */
-             }
-
-             /* INCONSISTENT_VERSION warning is now issued by ApparentVersion() */
-         }
-
-         /* choose new doctype */
-         guessed = HTMLVersion();
-     }
-
-     if (guessed == Dict.VERS_UNKNOWN)
-         return false;
-
-     /* for XML use the Voyager system identifier */
-     if (this.configuration.XmlOut || this.configuration.XmlTags || this.isvoyager)
-     {
-         if (doctype != null)
-             Node.discardElement(doctype);
-
-         for (i = 0; i < W3CVersion.length; ++i)
-         {
-             if (guessed == W3CVersion[i].code)
-             {
-                 fixHTMLNameSpace(root, W3CVersion[i].profile);
-                 break;
-             }
-         }
-
-         return true;
-     }
-
-     if (doctype == null)
-     {
-         doctype = newNode(Node.DocTypeTag, this.lexbuf, 0, 0);
-         doctype.next = root.content;
-         doctype.parent = root;
-         doctype.prev = null;
-
-         /* keep the sibling list consistent in both directions */
-         if (root.content != null)
-             root.content.prev = doctype;
-
-         root.content = doctype;
-     }
-
-     this.txtstart = this.lexsize;
-     this.txtend = this.lexsize;
-
-     /* use the appropriate public identifier */
-     addStringLiteral("html PUBLIC ");
-
-     if (configuration.docTypeMode == Configuration.DOCTYPE_USER &&
-         configuration.docTypeStr != null)
-         addStringLiteral(configuration.docTypeStr);
-     else if (guessed == Dict.VERS_HTML20)
-         addStringLiteral("\"-//IETF//DTD HTML 2.0//EN\"");
-     else
-     {
-         addStringLiteral("\"-//W3C//DTD ");
-
-         for (i = 0; i < W3CVersion.length; ++i)
-         {
-             if (guessed == W3CVersion[i].code)
-             {
-                 addStringLiteral(W3CVersion[i].name);
-                 break;
-             }
-         }
-
-         addStringLiteral("//EN\"");
-     }
-
-     this.txtend = this.lexsize;
-
-     /* start/end index the lexer buffer, so that is the array the node must use */
-     doctype.textarray = this.lexbuf;
-     doctype.start = this.txtstart;
-     doctype.end = this.txtend;
-
-     return true;
- }
-
- /* ensure XML document starts with <?xml version="1.0"?> */
- /**
-  * Makes sure the document begins with an XML declaration. If the first
-  * child of root is a processing instruction whose text starts with
-  * "xml" nothing is changed; otherwise a new one is inserted in front,
-  * with an ISO-8859-1 encoding declaration when the configured character
-  * encoding is LATIN1.
-  *
-  * Fixes over the earlier version: the "xml" probe no longer reads
-  * beyond the end of the first node's text; the inserted node gets its
-  * parent set, as the doctype insertions in this class do; and its text
-  * array is assigned after the text has been appended, so it is valid
-  * even if the buffer was only allocated by that append (empty input).
-  *
-  * @param root root of the parse tree
-  * @return true if a declaration was already present, false if one was added
-  */
- public boolean fixXMLPI(Node root)
- {
-     Node xml;
-     Node first = root.content;
-     int s;
-
-     if (first != null && first.type == Node.ProcInsTag)
-     {
-         s = first.start;
-
-         if (first.end - s >= 3 &&
-             this.lexbuf[s] == (byte)'x' &&
-             this.lexbuf[s+1] == (byte)'m' &&
-             this.lexbuf[s+2] == (byte)'l')
-             return true;
-     }
-
-     xml = newNode(Node.ProcInsTag, this.lexbuf, 0, 0);
-     xml.parent = root;
-     xml.prev = null;
-     xml.next = first;
-
-     if (first != null)
-         first.prev = xml;
-
-     root.content = xml;
-
-     this.txtstart = this.lexsize;
-     this.txtend = this.lexsize;
-     addStringLiteral("xml version=\"1.0\"");
-     if (this.configuration.CharEncoding == Configuration.LATIN1)
-         addStringLiteral(" encoding=\"ISO-8859-1\"");
-     this.txtend = this.lexsize;
-
-     /* start/end index the lexer buffer, so that is the array the node must use */
-     xml.textarray = this.lexbuf;
-     xml.start = this.txtstart;
-     xml.end = this.txtend;
-     return false;
- }
-
- /**
-  * Creates a start tag node for the named element, spanning the current
-  * txtstart..txtend range, and marks it as implicit.
-  *
-  * @param name element name
-  * @return the new node
-  */
- public Node inferredTag(String name)
- {
-     final Node inferred =
-         newNode(Node.StartTag, this.lexbuf, this.txtstart, this.txtend, name);
-
-     inferred.implicit = true;
-     return inferred;
- }
-
- /**
-  * Tells whether a node can have content: it must be a start tag, and
-  * its element must either be unknown (no tag entry) or not have the
-  * CM_EMPTY bit in its content model.
-  *
-  * @param node node to examine
-  * @return true if content may follow
-  */
- public static boolean expectsContent(Node node)
- {
-     return node.type == Node.StartTag
-         && (node.tag == null /* unknown element */
-             || (node.tag.model & Dict.CM_EMPTY) == 0);
- }
-
- /*
- create a text node for the contents of
- a CDATA element like style or script
- which ends with </foo> for some foo.
- */
- public Node getCDATA(Node container)
- { /*
-   Copies characters from the input into the lexer buffer until an end
-   tag whose name matches container.element (per wstrcasecmp) or the end
-   of the stream is reached. The copied text, without the end tag, is
-   returned as a TextNode and also stored in this.token; null is returned
-   when no text was collected.
-   */
- int c, lastc, start, len, i;
- String str;
- boolean endtag = false; /* becomes true at the first "</" and is never cleared */
-
- this.lines = this.in.curline;
- this.columns = this.in.curcol;
- this.waswhite = false;
- this.txtstart = this.lexsize;
- this.txtend = this.lexsize;
-
- lastc = (int)'\0';
- start = -1; /* buffer offset of the name after a pending "</", or -1 if none */
-
- while (true)
- {
- c = this.in.readChar();
- if (c == StreamIn.EndOfStream) break;
- /* (\r\n and a lone \r are folded to \n by the last branch below) */
-
- if (c == (int)'/' && lastc == (int)'<') /* start of a candidate end tag */
- {
- if (endtag) /* a "</" was already seen earlier in this element */
- {
- this.lines = this.in.curline;
- this.columns = this.in.curcol - 3;
-
- Report.warning(this, null, null, Report.BAD_CDATA_CONTENT);
- }
-
- start = this.lexsize + 1; /* to first letter */
- endtag = true;
- }
- else if (c == (int)'>' && start >= 0) /* end of the candidate end tag */
- {
- len = this.lexsize - start;
- if (len == container.element.length())
- {
- str = getString( this.lexbuf, start, len );
- if (Lexer.wstrcasecmp(str, container.element) == 0)
- {
- this.txtend = start - 2; /* leave the "</" out of the text */
- break;
- }
- }
-
- this.lines = this.in.curline;
- this.columns = this.in.curcol - 3;
-
- Report.warning(this, null, null, Report.BAD_CDATA_CONTENT);
-
- /* if javascript insert backslash before / */
-
- if (ParserImpl.isJavaScript(container))
- {
- for (i = this.lexsize; i > start-1; --i) /* shift the tail right by one byte */
- this.lexbuf[i] = this.lexbuf[i-1];
-
- this.lexbuf[start-1] = (byte)'\\';
- this.lexsize++;
- }
-
- start = -1; /* not the end tag we are looking for */
- }
- else if (c == (int)'\r')
- {
- c = this.in.readChar();
-
- if (c != (int)'\n')
- this.in.ungetChar(c);
-
- c = (int)'\n';
- }
-
- addCharToLexer((int)c);
- this.txtend = this.lexsize;
- lastc = c;
- }
-
- if (c == StreamIn.EndOfStream) /* input ended before the matching end tag */
- Report.warning(this, container, null, Report.MISSING_ENDTAG_FOR);
-
- if (this.txtend > this.txtstart)
- {
- this.token = newNode(Node.TextNode,
- this.lexbuf,
- this.txtstart,
- this.txtend);
- return this.token;
- }
-
- return null;
- }
-
- /**
-  * Marks the current token as pushed back, so that getToken() can hand
-  * it out again. Only one level of undo is kept.
-  */
- public void ungetToken()
- {
-     pushed = true;
- }
-
- public static final short IgnoreWhitespace = 0; /* getToken() drops white space at the start of a text token */
- public static final short MixedContent = 1;
- public static final short Preformatted = 2; /* parseEntity() tests this value as a bit mask, so IgnoreMarkup (3) matches too */
- public static final short IgnoreMarkup = 3;
-
- /*
- modes for GetToken()
-
- IgnoreWhitespace -- white space at the start of a text token is discarded;
- getToken() switches to MixedContent once it has stored a non-white character
- MixedContent -- for elements which don't accept PCDATA
- (NOTE(review): this description reads as if it belongs to IgnoreWhitespace -- confirm)
- Preformatted -- white space preserved as is
- IgnoreMarkup -- for CDATA elements such as script, style
- */
-
- public Node getToken(short mode)
- {
- short map;
- int c = 0;
- int lastc;
- int badcomment = 0;
- MutableBoolean isempty = new MutableBoolean();
- AttVal attributes;
-
- if (this.pushed)
- {
- /* duplicate inlines in preference to pushed text nodes when appropriate */
- if (this.token.type != Node.TextNode ||
- (this.insert == -1 && this.inode == null))
- {
- this.pushed = false;
- return this.token;
- }
- }
-
- /* at start of block elements, unclosed inline
- elements are inserted into the token stream */
-
- if (this.insert != -1 || this.inode != null)
- return insertedToken();
-
- this.lines = this.in.curline;
- this.columns = this.in.curcol;
- this.waswhite = false;
-
- this.txtstart = this.lexsize;
- this.txtend = this.lexsize;
-
- while (true)
- {
- c = this.in.readChar();
- if (c == StreamIn.EndOfStream) break;
- if (this.insertspace && mode != IgnoreWhitespace)
- {
- addCharToLexer(' ');
- this.waswhite = true;
- this.insertspace = false;
- }
-
- /* treat \r\n as \n and \r as \n */
-
- if (c == '\r')
- {
- c = this.in.readChar();
-
- if (c != '\n')
- this.in.ungetChar(c);
-
- c = '\n';
- }
-
- addCharToLexer(c);
-
- switch (this.state)
- {
- case LEX_CONTENT: /* element content */
- map = MAP((char)c);
-
- /*
- Discard white space if appropriate. Its cheaper
- to do this here rather than in parser methods
- for elements that don't have mixed content.
- */
- if (((map & WHITE) != 0) && (mode == IgnoreWhitespace)
- && this.lexsize == this.txtstart + 1)
- {
- --this.lexsize;
- this.waswhite = false;
- this.lines = this.in.curline;
- this.columns = this.in.curcol;
- continue;
- }
-
- if (c == '<')
- {
- this.state = LEX_GT;
- continue;
- }
-
- if ((map & WHITE) != 0)
- {
- /* was previous char white? */
- if (this.waswhite)
- {
- if (mode != Preformatted && mode != IgnoreMarkup)
- {
- --this.lexsize;
- this.lines = this.in.curline;
- this.columns = this.in.curcol;
- }
- }
- else /* prev char wasn't white */
- {
- this.waswhite = true;
- lastc = c;
-
- if (mode != Preformatted && mode != IgnoreMarkup && c != ' ')
- changeChar((byte)' ');
- }
-
- continue;
- }
- else if (c == '&' && mode != IgnoreMarkup)
- parseEntity(mode);
-
- /* this is needed to avoid trimming trailing whitespace */
- if (mode == IgnoreWhitespace)
- mode = MixedContent;
-
- this.waswhite = false;
- continue;
-
- case LEX_GT: /* < */
-
- /* check for endtag */
- if (c == '/')
- {
- c = this.in.readChar();
- if (c == StreamIn.EndOfStream)
- {
- this.in.ungetChar(c);
- continue;
- }
-
- addCharToLexer(c);
- map = MAP((char)c);
-
- if ((map & LETTER) != 0)
- {
- this.lexsize -= 3;
- this.txtend = this.lexsize;
- this.in.ungetChar(c);
- this.state = LEX_ENDTAG;
- this.lexbuf[this.lexsize] = (byte)'\0'; /* debug */
- this.in.curcol -= 2;
-
- /* if some text before the return it now */
- if (this.txtend > this.txtstart)
- {
- /* trim space char before end tag */
- if (mode == IgnoreWhitespace && this.lexbuf[this.lexsize - 1] == (byte)' ')
- {
- this.lexsize -= 1;
- this.txtend = this.lexsize;
- }
-
- this.token = newNode(Node.TextNode,
- this.lexbuf,
- this.txtstart,
- this.txtend);
- return this.token;
- }
-
- continue; /* no text so keep going */
- }
-
- /* otherwise treat as CDATA */
- this.waswhite = false;
- this.state = LEX_CONTENT;
- continue;
- }
-
- if (mode == IgnoreMarkup)
- {
- /* otherwise treat as CDATA */
- this.waswhite = false;
- this.state = LEX_CONTENT;
- continue;
- }
-
- /*
- look out for comments, doctype or marked sections
- this isn't quite right, but its getting there ...
- */
- if (c == '!')
- {
- c = this.in.readChar();
-
- if (c == '-')
- {
- c = this.in.readChar();
-
- if (c == '-')
- {
- this.state = LEX_COMMENT; /* comment */
- this.lexsize -= 2;
- this.txtend = this.lexsize;
-
- /* if some text before < return it now */
- if (this.txtend > this.txtstart)
- {
- this.token = newNode(Node.TextNode,
- this.lexbuf,
- this.txtstart,
- this.txtend);
- return this.token;
- }
-
- this.txtstart = this.lexsize;
- continue;
- }
-
- Report.warning(this, null, null, Report.MALFORMED_COMMENT);
- }
- else if (c == 'd' || c == 'D')
- {
- this.state = LEX_DOCTYPE; /* doctype */
- this.lexsize -= 2;
- this.txtend = this.lexsize;
- mode = IgnoreWhitespace;
-
- /* skip until white space or '>' */
-
- for (;;)
- {
- c = this.in.readChar();
-
- if (c == StreamIn.EndOfStream || c == '>')
- {
- this.in.ungetChar(c);
- break;
- }
-
- map = MAP((char)c);
-
- if ((map & WHITE) == 0)
- continue;
-
- /* and skip to end of whitespace */
-
- for (;;)
- {
- c = this.in.readChar();
-
- if (c == StreamIn.EndOfStream || c == '>')
- {
- this.in.ungetChar(c);
- break;
- }
-
- map = MAP((char)c);
-
- if ((map & WHITE) != 0)
- continue;
-
- this.in.ungetChar(c);
- break;
- }
-
- break;
- }
-
- /* if some text before < return it now */
- if (this.txtend > this.txtstart)
- {
- this.token = newNode(Node.TextNode,
- this.lexbuf,
- this.txtstart,
- this.txtend);
- return this.token;
- }
-
- this.txtstart = this.lexsize;
- continue;
- }
- else if (c == '[')
- {
- /* Word 2000 embeds ... sequences */
- this.lexsize -= 2;
- this.state = LEX_SECTION;
- this.txtend = this.lexsize;
-
- /* if some text before < return it now */
- if (this.txtend > this.txtstart)
- {
- this.token = newNode(Node.TextNode,
- this.lexbuf,
- this.txtstart,
- this.txtend);
- return this.token;
- }
-
- this.txtstart = this.lexsize;
- continue;
- }
-
- /* otherwise swallow chars up to and including next '>' */
- while (true)
- {
- c = this.in.readChar();
- if (c == '>') break;
- if (c == -1)
- {
- this.in.ungetChar(c);
- break;
- }
- }
-
- this.lexsize -= 2;
- this.lexbuf[this.lexsize] = (byte)'\0';
- this.state = LEX_CONTENT;
- continue;
- }
-
- /*
- processing instructions
- */
-
- if (c == '?')
- {
- this.lexsize -= 2;
- this.state = LEX_PROCINSTR;
- this.txtend = this.lexsize;
-
- /* if some text before < return it now */
- if (this.txtend > this.txtstart)
- {
- this.token = newNode(Node.TextNode,
- this.lexbuf,
- this.txtstart,
- this.txtend);
- return this.token;
- }
-
- this.txtstart = this.lexsize;
- continue;
- }
-
- /* Microsoft ASP's e.g. <% ... server-code ... %> */
- if (c == '%')
- {
- this.lexsize -= 2;
- this.state = LEX_ASP;
- this.txtend = this.lexsize;
-
- /* if some text before < return it now */
- if (this.txtend > this.txtstart)
- {
- this.token = newNode(Node.TextNode,
- this.lexbuf,
- this.txtstart,
- this.txtend);
- return this.token;
- }
-
- this.txtstart = this.lexsize;
- continue;
- }
-
- /* Netscapes JSTE e.g. <# ... server-code ... #> */
- if (c == '#')
- {
- this.lexsize -= 2;
- this.state = LEX_JSTE;
- this.txtend = this.lexsize;
-
- /* if some text before < return it now */
- if (this.txtend > this.txtstart)
- {
- this.token = newNode(Node.TextNode,
- this.lexbuf,
- this.txtstart,
- this.txtend);
- return this.token;
- }
-
- this.txtstart = this.lexsize;
- continue;
- }
-
- map = MAP((char)c);
-
- /* check for start tag */
- if ((map & LETTER) != 0)
- {
- this.in.ungetChar(c); /* push back letter */
- this.lexsize -= 2; /* discard "<" + letter */
- this.txtend = this.lexsize;
- this.state = LEX_STARTTAG; /* ready to read tag name */
-
- /* if some text before < return it now */
- if (this.txtend > this.txtstart)
- {
- this.token = newNode(Node.TextNode,
- this.lexbuf,
- this.txtstart,
- this.txtend);
- return this.token;
- }
-
- continue; /* no text so keep going */
- }
-
- /* otherwise treat as CDATA */
- this.state = LEX_CONTENT;
- this.waswhite = false;
- continue;
-
- case LEX_ENDTAG: /* ' */
- while (c != '>')
- {
- c = this.in.readChar();
-
- if (c == StreamIn.EndOfStream)
- break;
- }
-
- if (c == StreamIn.EndOfStream)
- {
- this.in.ungetChar(c);
- continue;
- }
-
- this.state = LEX_CONTENT;
- this.waswhite = false;
- return this.token; /* the endtag token */
-
- case LEX_STARTTAG: /* first letter of tagname */
- this.txtstart = this.lexsize - 1; /* set txtstart to first letter */
- c = parseTagName();
- isempty.value = false;
- attributes = null;
- this.token = newNode((isempty.value ? Node.StartEndTag : Node.StartTag),
- this.lexbuf,
- this.txtstart,
- this.txtend,
- getString(this.lexbuf,
- this.txtstart,
- this.txtend - this.txtstart));
-
- /* parse attributes, consuming closing ">" */
- if (c != '>')
- {
- if (c == '/')
- this.in.ungetChar(c);
-
- attributes = parseAttrs(isempty);
- }
-
- if (isempty.value)
- this.token.type = Node.StartEndTag;
-
- this.token.attributes = attributes;
- this.lexsize = this.txtstart;
- this.txtend = this.txtstart;
-
- /* swallow newline following start tag */
- /* special check needed for CRLF sequence */
- /* this doesn't apply to empty elements */
-
- if (expectsContent(this.token) ||
- this.token.tag == configuration.tt.tagBr)
- {
-
- c = this.in.readChar();
-
- if (c == '\r')
- {
- c = this.in.readChar();
-
- if (c != '\n')
- this.in.ungetChar(c);
- }
- else if (c != '\n' && c != '\f')
- this.in.ungetChar(c);
-
- this.waswhite = true; /* to swallow leading whitespace */
- }
- else
- this.waswhite = false;
-
- this.state = LEX_CONTENT;
-
- if (this.token.tag == null)
- Report.error(this, null, this.token, Report.UNKNOWN_ELEMENT);
- else if (!this.configuration.XmlTags)
- {
- this.versions &= this.token.tag.versions;
-
- if ((this.token.tag.versions & Dict.VERS_PROPRIETARY) != 0)
- {
- if (!this.configuration.MakeClean && (this.token.tag == configuration.tt.tagNobr ||
- this.token.tag == configuration.tt.tagWbr))
- Report.warning(this, null, this.token, Report.PROPRIETARY_ELEMENT);
- }
-
- if (this.token.tag.chkattrs != null)
- {
- this.token.checkUniqueAttributes(this);
- this.token.tag.chkattrs.check(this, this.token);
- }
- else
- this.token.checkAttributes(this);
- }
-
- return this.token; /* return start tag */
-
- case LEX_COMMENT: /* seen */
-
- if (c != '-')
- continue;
-
- c = this.in.readChar();
- addCharToLexer(c);
-
- if (c != '-')
- continue;
-
- end_comment: while (true) {
- c = this.in.readChar();
-
- if (c == '>')
- {
- if (badcomment != 0)
- Report.warning(this, null, null, Report.MALFORMED_COMMENT);
-
- this.txtend = this.lexsize - 2; // AQ 8Jul2000
- this.lexbuf[this.lexsize] = (byte)'\0';
- this.state = LEX_CONTENT;
- this.waswhite = false;
- this.token = newNode(Node.CommentTag,
- this.lexbuf,
- this.txtstart,
- this.txtend);
-
- /* now look for a line break */
-
- c = this.in.readChar();
-
- if (c == '\r')
- {
- c = this.in.readChar();
-
- if (c != '\n')
- this.token.linebreak = true;
- }
-
- if (c == '\n')
- this.token.linebreak = true;
- else
- this.in.ungetChar(c);
-
- return this.token;
- }
-
- /* note position of first such error in the comment */
- if (badcomment == 0)
- {
- this.lines = this.in.curline;
- this.columns = this.in.curcol - 3;
- }
-
- badcomment++;
- if (this.configuration.FixComments)
- this.lexbuf[this.lexsize - 2] = (byte)'=';
-
- addCharToLexer(c);
-
- /* if '-' then look for '>' to end the comment */
- if (c != '-')
- break end_comment;
-
- }
- /* otherwise continue to look for --> */
- this.lexbuf[this.lexsize - 2] = (byte)'=';
- continue;
-
- case LEX_DOCTYPE: /* seen ' munging whitespace */
- map = MAP((char)c);
-
- if ((map & WHITE) != 0)
- {
- if (this.waswhite)
- this.lexsize -= 1;
-
- this.waswhite = true;
- }
- else
- this.waswhite = false;
-
- if (c != '>')
- continue;
-
- this.lexsize -= 1;
- this.txtend = this.lexsize;
- this.lexbuf[this.lexsize] = (byte)'\0';
- this.state = LEX_CONTENT;
- this.waswhite = false;
- this.token = newNode(Node.DocTypeTag,
- this.lexbuf,
- this.txtstart,
- this.txtend);
- /* make a note of the version named by the doctype */
- this.doctype = findGivenVersion(this.token);
- return this.token;
-
- case LEX_PROCINSTR: /* seen so look for '>' */
- /* check for PHP preprocessor instructions */
-
- if (this.lexsize - this.txtstart == 3)
- {
- if ((getString(this.lexbuf, this.txtstart, 3)).equals("php"))
- {
- this.state = LEX_PHP;
- continue;
- }
- }
-
- if (this.configuration.XmlPIs) /* insist on ?> as terminator */
- {
- if (c != '?')
- continue;
-
- /* now look for '>' */
- c = this.in.readChar();
-
- if (c == StreamIn.EndOfStream)
- {
- Report.warning(this, null, null, Report.UNEXPECTED_END_OF_FILE);
- this.in.ungetChar(c);
- continue;
- }
-
- addCharToLexer(c);
- }
-
- if (c != '>')
- continue;
-
- this.lexsize -= 1;
- this.txtend = this.lexsize;
- this.lexbuf[this.lexsize] = (byte)'\0';
- this.state = LEX_CONTENT;
- this.waswhite = false;
- this.token = newNode(Node.ProcInsTag,
- this.lexbuf,
- this.txtstart,
- this.txtend);
- return this.token;
-
- case LEX_ASP: /* seen <% so look for "%>" */
- if (c != '%')
- continue;
-
- /* now look for '>' */
- c = this.in.readChar();
-
-
- if (c != '>')
- {
- this.in.ungetChar(c);
- continue;
- }
-
- this.lexsize -= 1;
- this.txtend = this.lexsize;
- this.lexbuf[this.lexsize] = (byte)'\0';
- this.state = LEX_CONTENT;
- this.waswhite = false;
- this.token = newNode(Node.AspTag,
- this.lexbuf,
- this.txtstart,
- this.txtend);
- return this.token;
-
- case LEX_JSTE: /* seen <# so look for "#>" */
- if (c != '#')
- continue;
-
- /* now look for '>' */
- c = this.in.readChar();
-
-
- if (c != '>')
- {
- this.in.ungetChar(c);
- continue;
- }
-
- this.lexsize -= 1;
- this.txtend = this.lexsize;
- this.lexbuf[this.lexsize] = (byte)'\0';
- this.state = LEX_CONTENT;
- this.waswhite = false;
- this.token = newNode(Node.JsteTag,
- this.lexbuf,
- this.txtstart,
- this.txtend);
- return this.token;
-
- case LEX_PHP: /* seen "" */
- if (c != '?')
- continue;
-
- /* now look for '>' */
- c = this.in.readChar();
-
- if (c != '>')
- {
- this.in.ungetChar(c);
- continue;
- }
-
- this.lexsize -= 1;
- this.txtend = this.lexsize;
- this.lexbuf[this.lexsize] = (byte)'\0';
- this.state = LEX_CONTENT;
- this.waswhite = false;
- this.token = newNode(Node.PhpTag,
- this.lexbuf,
- this.txtstart,
- this.txtend);
- return this.token;
-
- case LEX_SECTION: /* seen "" */
- if (c == '[')
- {
- if (this.lexsize == (this.txtstart + 6) &&
- (getString(this.lexbuf, this.txtstart, 6)).equals("CDATA["))
- {
- this.state = LEX_CDATA;
- this.lexsize -= 6;
- continue;
- }
- }
-
- if (c != ']')
- continue;
-
- /* now look for '>' */
- c = this.in.readChar();
-
- if (c != '>')
- {
- this.in.ungetChar(c);
- continue;
- }
-
- this.lexsize -= 1;
- this.txtend = this.lexsize;
- this.lexbuf[this.lexsize] = (byte)'\0';
- this.state = LEX_CONTENT;
- this.waswhite = false;
- this.token = newNode(Node.SectionTag,
- this.lexbuf,
- this.txtstart,
- this.txtend);
- return this.token;
-
- case LEX_CDATA: /* seen "" */
- if (c != ']')
- continue;
-
- /* now look for ']' */
- c = this.in.readChar();
-
- if (c != ']')
- {
- this.in.ungetChar(c);
- continue;
- }
-
- /* now look for '>' */
- c = this.in.readChar();
-
- if (c != '>')
- {
- this.in.ungetChar(c);
- continue;
- }
-
- this.lexsize -= 1;
- this.txtend = this.lexsize;
- this.lexbuf[this.lexsize] = (byte)'\0';
- this.state = LEX_CONTENT;
- this.waswhite = false;
- this.token = newNode(Node.CDATATag,
- this.lexbuf,
- this.txtstart,
- this.txtend);
- return this.token;
- }
- }
-
- if (this.state == LEX_CONTENT) /* text string */
- {
- this.txtend = this.lexsize;
-
- if (this.txtend > this.txtstart)
- {
- this.in.ungetChar(c);
-
- if (this.lexbuf[this.lexsize - 1] == (byte)' ')
- {
- this.lexsize -= 1;
- this.txtend = this.lexsize;
- }
-
- this.token = newNode(Node.TextNode,
- this.lexbuf,
- this.txtstart,
- this.txtend);
- return this.token;
- }
- }
- else if (this.state == LEX_COMMENT) /* comment */
- {
- if (c == StreamIn.EndOfStream)
- Report.warning(this, null, null, Report.MALFORMED_COMMENT);
-
- this.txtend = this.lexsize;
- this.lexbuf[this.lexsize] = (byte)'\0';
- this.state = LEX_CONTENT;
- this.waswhite = false;
- this.token = newNode(Node.CommentTag,
- this.lexbuf,
- this.txtstart,
- this.txtend);
- return this.token;
- }
-
- return null;
- }
-
- /*
- parser for ASP within start tags
-
- Some people use ASP to customize attributes
- Tidy isn't really well suited to dealing with ASP
- This is a workaround for attributes, but won't
- deal with the case where the ASP is used to tailor
- the attribute value. Here is an example of a work
- around for using ASP in attribute values:
-
- href="<%=rsSchool.Fields("ID").Value%>"
-
- where the ASP that generates the attribute value
- is masked from Tidy by the quotemarks.
-
- */
-
- public Node parseAsp()
- {
- int c;
- Node asp = null;
-
- this.txtstart = this.lexsize;
-
- for (;;)
- {
- c = this.in.readChar();
- addCharToLexer(c);
-
-
- if (c != '%')
- continue;
-
- c = this.in.readChar();
- addCharToLexer(c);
-
- if (c == '>')
- break;
- }
-
- this.lexsize -= 2;
- this.txtend = this.lexsize;
-
- if (this.txtend > this.txtstart)
- asp = newNode(Node.AspTag,
- this.lexbuf,
- this.txtstart,
- this.txtend);
-
- this.txtstart = this.txtend;
- return asp;
- }
-
- /*
- PHP is like ASP but is based upon XML
- processing instructions, e.g. <?php ... ?>
- */
- public Node parsePhp()
- {
- int c;
- Node php = null;
-
- this.txtstart = this.lexsize;
-
- for (;;)
- {
- c = this.in.readChar();
- addCharToLexer(c);
-
-
- if (c != '?')
- continue;
-
- c = this.in.readChar();
- addCharToLexer(c);
-
- if (c == '>')
- break;
- }
-
- this.lexsize -= 2;
- this.txtend = this.lexsize;
-
- if (this.txtend > this.txtstart)
- php = newNode(Node.PhpTag,
- this.lexbuf,
- this.txtstart,
- this.txtend);
-
- this.txtstart = this.txtend;
- return php;
- }
-
- /* consumes the '>' terminating start tags */
- public String parseAttribute(MutableBoolean isempty, MutableObject asp,
- MutableObject php)
- {
- int start = 0;
- // int len = 0; Removed by BUGFIX for 126265
- short map;
- String attr;
- int c = 0;
-
- asp.setObject(null); /* clear asp pointer */
- php.setObject(null); /* clear php pointer */
- /* skip white space before the attribute */
-
- for (;;)
- {
- c = this.in.readChar();
-
- if (c == '/')
- {
- c = this.in.readChar();
-
- if (c == '>')
- {
- isempty.value = true;
- return null;
- }
-
- this.in.ungetChar(c);
- c = '/';
- break;
- }
-
- if (c == '>')
- return null;
-
- if (c =='<')
- {
- c = this.in.readChar();
-
- if (c == '%')
- {
- asp.setObject(parseAsp());
- return null;
- }
- else if (c == '?')
- {
- php.setObject(parsePhp());
- return null;
- }
-
- this.in.ungetChar(c);
- Report.attrError(this, this.token, null, Report.UNEXPECTED_GT);
- return null;
- }
-
- if (c == '"' || c == '\'')
- {
- Report.attrError(this, this.token, null, Report.UNEXPECTED_QUOTEMARK);
- continue;
- }
-
- if (c == StreamIn.EndOfStream)
- {
- Report.attrError(this, this.token, null, Report.UNEXPECTED_END_OF_FILE);
- this.in.ungetChar(c);
- return null;
- }
-
- map = MAP((char)c);
-
- if ((map & WHITE) == 0)
- break;
- }
-
- start = this.lexsize;
-
- for (;;)
- {
- /* but push back '=' for parseValue() */
- if (c == '=' || c == '>')
- {
- this.in.ungetChar(c);
- break;
- }
-
- if (c == '<' || c == StreamIn.EndOfStream)
- {
- this.in.ungetChar(c);
- break;
- }
-
- map = MAP((char)c);
-
- if ((map & WHITE) != 0)
- break;
-
- /* what should be done about non-namechar characters? */
- /* currently these are incorporated into the attr name */
-
- if (!this.configuration.XmlTags && (map & UPPERCASE) != 0)
- c += (int)('a' - 'A');
-
- // ++len; Removed by BUGFIX for 126265
- addCharToLexer(c);
-
- c = this.in.readChar();
- }
-
- // Following line added by GLP to fix BUG 126265. This is a temporary comment
- // and should be removed when Tidy is fixed.
- int len = this.lexsize - start;
- attr = (len > 0 ? getString(this.lexbuf, start, len) : null);
- this.lexsize = start;
-
- return attr;
- }
-
- /*
- invoked when < is seen in place of attribute value
- but terminates on whitespace if not ASP, PHP or Tango
- this routine recognizes ' and " quoted strings
- */
- public int parseServerInstruction()
- {
- int c, map, delim = '"';
- boolean isrule = false;
-
- c = this.in.readChar();
- addCharToLexer(c);
-
- /* check for ASP, PHP or Tango */
- if (c == '%' || c == '?' || c == '@')
- isrule = true;
-
- for (;;)
- {
- c = this.in.readChar();
-
- if (c == StreamIn.EndOfStream)
- break;
-
- if (c == '>')
- {
- if (isrule)
- addCharToLexer(c);
- else
- this.in.ungetChar(c);
-
- break;
- }
-
- /* if not recognized as ASP, PHP or Tango */
- /* then also finish value on whitespace */
- if (!isrule)
- {
- map = MAP((char)c);
-
- if ((map & WHITE) != 0)
- break;
- }
-
- addCharToLexer(c);
-
- if (c == '"')
- {
- do
- {
- c = this.in.readChar();
- addCharToLexer(c);
- }
- while (c != '"');
- delim = '\'';
- continue;
- }
-
- if (c == '\'')
- {
- do
- {
- c = this.in.readChar();
- addCharToLexer(c);
- }
- while (c != '\'');
- }
- }
-
- return delim;
- }
-
- /* values start with "=" or " = " etc. */
- /* doesn't consume the ">" at end of start tag */
-
- public String parseValue(String name, boolean foldCase,
- MutableBoolean isempty, MutableInteger pdelim)
- {
- int len = 0;
- int start;
- short map;
- boolean seen_gt = false;
- boolean munge = true;
- int c = 0;
- int lastc, delim, quotewarning;
- String value;
-
- delim = 0;
- pdelim.value = (int)'"';
-
- /*
- Henry Zrepa reports that some folk are using the
- embed element with script attributes where newlines
- are significant and must be preserved
- */
- if (configuration.LiteralAttribs)
- munge = false;
-
- /* skip white space before the '=' */
-
- for (;;)
- {
- c = this.in.readChar();
-
- if (c == StreamIn.EndOfStream)
- {
- this.in.ungetChar(c);
- break;
- }
-
- map = MAP((char)c);
-
- if ((map & WHITE) == 0)
- break;
- }
-
- /*
- c should be '=' if there is a value
- other legal possibilities are white
- space, '/' and '>'
- */
-
- if (c != '=')
- {
- this.in.ungetChar(c);
- return null;
- }
-
- /* skip white space after '=' */
-
- for (;;)
- {
- c = this.in.readChar();
-
- if (c == StreamIn.EndOfStream)
- {
- this.in.ungetChar(c);
- break;
- }
-
- map = MAP((char)c);
-
- if ((map & WHITE) == 0)
- break;
- }
-
- /* check for quote marks */
-
- if (c == '"' || c == '\'')
- delim = c;
- else if (c == '<')
- {
- start = this.lexsize;
- addCharToLexer(c);
- pdelim.value = parseServerInstruction();
- len = this.lexsize - start;
- this.lexsize = start;
- return (len > 0 ? getString(this.lexbuf, start, len) : null);
- }
- else
- this.in.ungetChar(c);
-
- /*
- and read the value string
- check for quote mark if needed
- */
-
- quotewarning = 0;
- start = this.lexsize;
- c = '\0';
-
- for (;;)
- {
- lastc = c; /* track last character */
- c = this.in.readChar();
-
- if (c == StreamIn.EndOfStream)
- {
- Report.attrError(this, this.token, null, Report.UNEXPECTED_END_OF_FILE);
- this.in.ungetChar(c);
- break;
- }
-
- if (delim == (char)0)
- {
- if (c == '>')
- {
- this.in.ungetChar(c);
- break;
- }
-
- if (c == '"' || c == '\'')
- {
- Report.attrError(this, this.token, null, Report.UNEXPECTED_QUOTEMARK);
- break;
- }
-
- if (c == '<')
- {
- /* this.in.ungetChar(c); */
- Report.attrError(this, this.token, null, Report.UNEXPECTED_GT);
- /* break; */
- }
-
- /*
- For cases like
need to avoid treating /> as
- part of the attribute value, however care is needed to avoid
- so treating in this way, which
- would map the tag to
- */
- if (c == '/')
- {
- /* peek ahead in case of /> */
- c = this.in.readChar();
-
- if (c == '>' &&
- !AttributeTable.getDefaultAttributeTable().isUrl(name))
- {
- isempty.value = true;
- this.in.ungetChar(c);
- break;
- }
-
- /* unget peeked char */
- this.in.ungetChar(c);
- c = '/';
- }
- }
- else /* delim is '\'' or '"' */
- {
- if (c == delim)
- break;
-
- /* treat CRLF, CR and LF as single line break */
-
- if (c == '\r')
- {
- c = this.in.readChar();
- if (c != '\n')
- this.in.ungetChar(c);
-
- c = '\n';
- }
-
- if (c == '\n' || c == '<' || c == '>')
- ++quotewarning;
-
- if (c == '>')
- seen_gt = true;
- }
-
- if (c == '&')
- {
- addCharToLexer(c);
- parseEntity((short)0);
- continue;
- }
-
- /*
- kludge for JavaScript attribute values
- with line continuations in string literals
- */
- if (c == '\\')
- {
- c = this.in.readChar();
-
- if (c != '\n')
- {
- this.in.ungetChar(c);
- c = '\\';
- }
- }
-
- map = MAP((char)c);
-
- if ((map & WHITE) != 0)
- {
- if (delim == (char)0)
- break;
-
- if (munge)
- {
- c = ' ';
-
- if (lastc == ' ')
- continue;
- }
- }
- else if (foldCase && (map & UPPERCASE) != 0)
- c += (int)('a' - 'A');
-
- addCharToLexer(c);
- }
-
- if (quotewarning > 10 && seen_gt && munge)
- {
- /*
- there is almost certainly a missing trailling quote mark
- as we have see too many newlines, < or > characters.
-
- an exception is made for Javascript attributes and the
- javascript URL scheme which may legitimately include < and >
- */
- if (!AttributeTable.getDefaultAttributeTable().isScript(name) &&
- !(AttributeTable.getDefaultAttributeTable().isUrl(name) &&
- (getString(this.lexbuf, start, 11)).equals("javascript:")))
- Report.error(this, null, null, Report.SUSPECTED_MISSING_QUOTE);
- }
-
- len = this.lexsize - start;
- this.lexsize = start;
-
- if (len > 0 || delim != 0)
- value = getString(this.lexbuf, start, len);
- else
- value = null;
-
- /* note delimiter if given */
- if (delim != 0)
- pdelim.value = delim;
- else
- pdelim.value = (int)'"';
-
- return value;
- }
-
- /* attr must be non-null */
- public static boolean isValidAttrName(String attr)
- {
- short map;
- char c;
- int i;
-
- /* first character should be a letter */
- c = attr.charAt(0);
- map = MAP(c);
-
- if (!((map & LETTER) != 0))
- return false;
-
- /* remaining characters should be namechars */
- for( i = 1; i < attr.length(); i++)
- {
- c = attr.charAt(i);
- map = MAP(c);
-
- if((map & NAMECHAR) != 0)
- continue;
-
- return false;
- }
-
- return true;
- }
-
- /* swallows closing '>' */
-
- public AttVal parseAttrs(MutableBoolean isempty)
- {
- AttVal av, list;
- String attribute, value;
- MutableInteger delim = new MutableInteger();
- MutableObject asp = new MutableObject();
- MutableObject php = new MutableObject();
-
- list = null;
-
- for (; !endOfInput();)
- {
- attribute = parseAttribute(isempty, asp, php);
-
- if (attribute == null)
- {
- /* check if attributes are created by ASP markup */
- if (asp.getObject() != null)
- {
- av = new AttVal(list, null, (Node)asp.getObject(), null,
- '\0', null, null );
- list = av;
- continue;
- }
-
- /* check if attributes are created by PHP markup */
- if (php.getObject() != null)
- {
- av = new AttVal(list, null, null, (Node)php.getObject(),
- '\0', null, null );
- list = av;
- continue;
- }
-
- break;
- }
-
- value = parseValue(attribute, false, isempty, delim);
-
- if (attribute != null && isValidAttrName(attribute))
- {
- av = new AttVal( list, null, null, null,
- delim.value, attribute, value );
- av.dict =
- AttributeTable.getDefaultAttributeTable().findAttribute(av);
- list = av;
- }
- else
- {
- av = new AttVal( null, null, null, null,
- 0, attribute, value );
- Report.attrError(this, this.token, value, Report.BAD_ATTRIBUTE_VALUE);
- }
- }
-
- return list;
- }
-
- /*
- push a copy of an inline node onto stack
- but don't push if implicit or OBJECT or APPLET
- (implicit tags are ones generated from the istack)
-
- One issue arises with pushing inlines when
- the tag is already pushed. For instance:
-
- <p><em>text
- <p><em>more text
-
- Shouldn't be mapped to
-
- <p><em>text</em></p>
- <p><em><em>more text</em></em>
- */
- public void pushInline( Node node )
- {
- IStack is;
-
- if (node.implicit)
- return;
-
- if (node.tag == null)
- return;
-
- if ((node.tag.model & Dict.CM_INLINE) == 0 )
- return;
-
- if ((node.tag.model & Dict.CM_OBJECT) != 0)
- return;
-
- if (node.tag != configuration.tt.tagFont && isPushed(node))
- return;
-
- // make sure there is enough space for the stack
- is = new IStack();
- is.tag = node.tag;
- is.element = node.element;
- if (node.attributes != null)
- is.attributes = cloneAttributes(node.attributes);
- this.istack.push( is );
- }
-
- /* pop inline stack */
- public void popInline( Node node )
- {
- AttVal av;
- IStack is;
-
- if (node != null) {
-
- if (node.tag == null)
- return;
-
- if ((node.tag.model & Dict.CM_INLINE) == 0)
- return;
-
- if ((node.tag.model & Dict.CM_OBJECT) != 0)
- return;
-
- // if node is then pop until we find an
- if (node.tag == configuration.tt.tagA) {
-
- while (this.istack.size() > 0) {
- is = (IStack)this.istack.pop();
- if (is.tag == configuration.tt.tagA) {
- break;
- }
- }
-
- if (this.insert >= this.istack.size())
- this.insert = -1;
- return;
- }
- }
-
- if (this.istack.size() > 0) {
- is = (IStack)this.istack.pop();
- if (this.insert >= this.istack.size())
- this.insert = -1;
- }
- }
-
- public boolean isPushed( Node node )
- {
- int i;
- IStack is;
-
- for (i = this.istack.size() - 1; i >= 0; --i) {
- is = (IStack)this.istack.elementAt(i);
- if (is.tag == node.tag)
- return true;
- }
-
- return false;
- }
-
- /*
- This has the effect of inserting "missing" inline
- elements around the contents of blocklevel elements
- such as P, TD, TH, DIV, PRE etc. This procedure is
- called at the start of ParseBlock. when the inline
- stack is not empty, as will be the case in:
-
- <i><h3>italic heading</h3></i>
-
- which is then treated as equivalent to
-
- <h3><i>italic heading</i></h3>
-
- This is implemented by setting the lexer into a mode
- where it gets tokens from the inline stack rather than
- from the input stream.
- */
- public int inlineDup( Node node )
- {
- int n;
-
- n = this.istack.size() - this.istackbase;
- if ( n > 0 ) {
- this.insert = this.istackbase;
- this.inode = node;
- }
-
- return n;
- }
-
- public Node insertedToken()
- {
- Node node;
- IStack is;
- int n;
-
- // this will only be null if inode != null
- if (this.insert == -1) {
- node = this.inode;
- this.inode = null;
- return node;
- }
-
- // is this is the "latest" node then update
- // the position, otherwise use current values
-
- if (this.inode == null) {
- this.lines = this.in.curline;
- this.columns = this.in.curcol;
- }
-
- node = newNode(Node.StartTag,
- this.lexbuf,
- this.txtstart,
- this.txtend); // GLP: Bugfix 126261. Remove when this change
- // is fixed in istack.c in the original Tidy
- node.implicit = true;
- is = (IStack)this.istack.elementAt( this.insert );
- node.element = is.element;
- node.tag = is.tag;
- if (is.attributes != null)
- node.attributes = cloneAttributes(is.attributes);
-
- // advance lexer to next item on the stack
- n = this.insert;
-
- // and recover state if we have reached the end
- if (++n < this.istack.size() ) {
- this.insert = n;
- } else {
- this.insert = -1;
- }
-
- return node;
- }
-
- /* AQ: Try this for speed optimization */
- public static int wstrcasecmp(String s1, String s2)
- {
- return (s1.equalsIgnoreCase(s2) ? 0 : 1);
- }
-
- public static int wstrcaselexcmp(String s1, String s2)
- {
- char c;
- int i = 0;
-
- while ( i < s1.length() && i < s2.length() ) {
- c = s1.charAt(i);
- if ( toLower(c) != toLower( s2.charAt(i) ) ) {
- break;
- }
- i += 1;
- }
- if ( i == s1.length() && i == s2.length() ) {
- return 0;
- } else if ( i == s1.length() ) {
- return -1;
- } else if ( i == s2.length() ) {
- return 1;
- } else {
- return ( s1.charAt(i) > s2.charAt(i) ? 1 : -1 );
- }
- }
-
- public static boolean wsubstr(String s1, String s2)
- {
- int i;
- int len1 = s1.length();
- int len2 = s2.length();
-
- for (i = 0; i <= len1 - len2; ++i)
- {
- if (s2.equalsIgnoreCase(s1.substring(i)))
- return true;
- }
-
- return false;
- }
-
- public boolean canPrune(Node element)
- {
- if (element.type == Node.TextNode)
- return true;
-
- if (element.content != null)
- return false;
-
- if (element.tag == configuration.tt.tagA && element.attributes != null)
- return false;
-
- if (element.tag == configuration.tt.tagP && !this.configuration.DropEmptyParas)
- return false;
-
- if (element.tag == null)
- return false;
-
- if ((element.tag.model & Dict.CM_ROW) != 0)
- return false;
-
- if (element.tag == configuration.tt.tagApplet)
- return false;
-
- if (element.tag == configuration.tt.tagObject)
- return false;
-
- if (element.attributes != null &&
- (element.getAttrByName("id") != null ||
- element.getAttrByName("name") != null) )
- return false;
-
- return true;
- }
-
- /* duplicate name attribute as an id */
- public void fixId(Node node)
- {
- AttVal name = node.getAttrByName("name");
- AttVal id = node.getAttrByName("id");
-
- if (name != null)
- {
- if (id != null)
- {
- if (!id.value.equals(name.value))
- Report.attrError(this, node, "name", Report.ID_NAME_MISMATCH);
- }
- else if (this.configuration.XmlOut)
- node.addAttribute("id", name.value);
- }
- }
-
- /*
- defer duplicates when entering a table or other
- element where the inlines shouldn't be duplicated
- */
- public void deferDup()
- {
- this.insert = -1;
- this.inode = null;
- }
-
- /* Private methods and fields */
-
- /* lexer char types */
/* Character class flags; one lexmap entry may combine several of them. */
private static final short DIGIT     = 1 << 0;
private static final short LETTER    = 1 << 1;
private static final short NAMECHAR  = 1 << 2;
private static final short WHITE     = 1 << 3;
private static final short NEWLINE   = 1 << 4;
private static final short LOWERCASE = 1 << 5;
private static final short UPPERCASE = 1 << 6;

/* lexer GetToken states */

private static final short LEX_CONTENT    = 0;
private static final short LEX_GT         = 1;
private static final short LEX_ENDTAG     = 2;
private static final short LEX_STARTTAG   = 3;
private static final short LEX_COMMENT    = 4;
private static final short LEX_DOCTYPE    = 5;
private static final short LEX_PROCINSTR  = 6;
private static final short LEX_ENDCOMMENT = 7;
private static final short LEX_CDATA      = 8;
private static final short LEX_SECTION    = 9;
private static final short LEX_ASP        = 10;
private static final short LEX_JSTE       = 11;
private static final short LEX_PHP        = 12;

/* Classification table for the 128 ASCII characters, filled in below. */
private static short[] lexmap = new short[128];

/* Adds the flags in code to the table entry of every character of str. */
private static void mapStr(String str, short code)
{
    final char[] members = str.toCharArray();

    for (int k = 0; k < members.length; k++)
        lexmap[members[k]] |= code;
}

static {
    mapStr("\r\n\f", (short)(NEWLINE|WHITE));
    mapStr(" \t", WHITE);
    mapStr("-.:_", NAMECHAR);
    mapStr("0123456789", (short)(DIGIT|NAMECHAR));
    mapStr("abcdefghijklmnopqrstuvwxyz", (short)(LOWERCASE|LETTER|NAMECHAR));
    mapStr("ABCDEFGHIJKLMNOPQRSTUVWXYZ", (short)(UPPERCASE|LETTER|NAMECHAR));
}

/* Returns the class flags of c; characters outside ASCII have none. */
private static short MAP( char c )
{
    if (c < 128)
        return lexmap[c];

    return 0;
}

/* True for space, tab, CR, LF and form feed. */
private static boolean isWhite(char c)
{
    return (MAP(c) & WHITE) != 0;
}

/* True for the ASCII digits 0-9. */
private static boolean isDigit(char c)
{
    return (MAP(c) & DIGIT) != 0;
}

/* True for the ASCII letters a-z and A-Z. */
private static boolean isLetter(char c)
{
    return (MAP(c) & LETTER) != 0;
}

/* Lower-cases an ASCII upper-case letter; other characters pass through. */
private static char toLower(char c)
{
    return ((MAP(c) & UPPERCASE) != 0) ? (char)( (int)c + (int)'a' - (int)'A' ) : c;
}

/* Upper-cases an ASCII lower-case letter; other characters pass through. */
private static char toUpper(char c)
{
    return ((MAP(c) & LOWERCASE) != 0) ? (char)( (int)c + (int)'A' - (int)'a' ) : c;
}

/**
 * Folds the case of an ASCII letter unless XML tag mode is active.
 *
 * @param c       the character to fold
 * @param tocaps  true to fold to upper case, false to fold to lower case
 * @param xmlTags when true the character is returned unchanged
 * @return the folded character
 */
public static char foldCase(char c, boolean tocaps, boolean xmlTags)
{
    if (xmlTags)
        return c;

    return tocaps ? toUpper(c) : toLower(c);
}
-
-
- private static class W3CVersionInfo
- {
- String name;
- String voyagerName;
- String profile;
- short code;
-
- public W3CVersionInfo( String name,
- String voyagerName,
- String profile,
- short code )
- {
- this.name = name;
- this.voyagerName = voyagerName;
- this.profile = profile;
- this.code = code;
- }
- }
-
- /* the 3 URIs for the XHTML 1.0 DTDs */
- private static final String voyager_loose = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd";
- private static final String voyager_strict = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd";
- private static final String voyager_frameset = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd";
-
- private static final String XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml";
-
- private static Lexer.W3CVersionInfo[] W3CVersion =
- {
- new W3CVersionInfo("HTML 4.01",
- "XHTML 1.0 Strict",
- voyager_strict,
- Dict.VERS_HTML40_STRICT),
- new W3CVersionInfo("HTML 4.01 Transitional",
- "XHTML 1.0 Transitional",
- voyager_loose,
- Dict.VERS_HTML40_LOOSE),
- new W3CVersionInfo("HTML 4.01 Frameset",
- "XHTML 1.0 Frameset",
- voyager_frameset,
- Dict.VERS_FRAMES),
- new W3CVersionInfo("HTML 4.0",
- "XHTML 1.0 Strict",
- voyager_strict,
- Dict.VERS_HTML40_STRICT),
- new W3CVersionInfo("HTML 4.0 Transitional",
- "XHTML 1.0 Transitional",
- voyager_loose,
- Dict.VERS_HTML40_LOOSE),
- new W3CVersionInfo("HTML 4.0 Frameset",
- "XHTML 1.0 Frameset",
- voyager_frameset,
- Dict.VERS_FRAMES),
- new W3CVersionInfo("HTML 3.2",
- "XHTML 1.0 Transitional",
- voyager_loose,
- Dict.VERS_HTML32),
- new W3CVersionInfo("HTML 2.0",
- "XHTML 1.0 Strict",
- voyager_strict,
- Dict.VERS_HTML20)
- };
-
-}