2 * @(#)Lexer.java 1.11 2000/08/16
6 package net.sourceforge.phpdt.tidy.w3c;
10 * Lexer for html parser
12 * (c) 1998-2000 (W3C) MIT, INRIA, Keio University
13 * See Tidy.java for the copyright notice.
14 * Derived from <a href="http://www.w3.org/People/Raggett/tidy">
15 * HTML Tidy Release 4 Aug 2000</a>
17 * @author Dave Raggett <dsr@w3.org>
18 * @author Andy Quick <ac.quick@sympatico.ca> (translation to Java)
19 * @version 1.0, 1999/05/22
20 * @version 1.0.1, 1999/05/29
21 * @version 1.1, 1999/06/18 Java Bean
22 * @version 1.2, 1999/07/10 Tidy Release 7 Jul 1999
23 * @version 1.3, 1999/07/30 Tidy Release 26 Jul 1999
24 * @version 1.4, 1999/09/04 DOM support
25 * @version 1.5, 1999/10/23 Tidy Release 27 Sep 1999
26 * @version 1.6, 1999/11/01 Tidy Release 22 Oct 1999
27 * @version 1.7, 1999/12/06 Tidy Release 30 Nov 1999
28 * @version 1.8, 2000/01/22 Tidy Release 13 Jan 2000
29 * @version 1.9, 2000/06/03 Tidy Release 30 Apr 2000
30 * @version 1.10, 2000/07/22 Tidy Release 8 Jul 2000
31 * @version 1.11, 2000/08/16 Tidy Release 4 Aug 2000
35 Given a file stream fp it returns a sequence of tokens.
37 GetToken(fp) gets the next token
38 UngetToken(fp) provides one level undo
40 The tags include an attribute list:
42 - linked list of attribute/value nodes
43 - each node has 2 null-terminated strings.
44 - entities are replaced in attribute values
46 white space is compacted if not in preformatted mode
47 If not in preformatted mode then leading white space
48 is discarded and subsequent white space sequences
49 compacted to single space chars.
51 If XmlTags is no then Tag names are folded to upper
52 case and attribute names to lower case.
55 - Doctype subset and marked sections
58 import java.io.PrintWriter;
59 import java.util.Stack;
60 import java.util.Vector;
62 import org.eclipse.core.resources.IFile;
/* Lexer state. Most fields are public; the trailing comment on each line describes it. */
67 public StreamIn in; /* file stream */
68 public PrintWriter errout; /* error output stream */
69 public short badAccess; /* for accessibility errors */
70 public short badLayout; /* for bad style errors */
71 public short badChars; /* for bad char encodings */
72 public short badForm; /* for mismatched/mispositioned form tags */
73 public short warnings; /* count of warnings in this document */
74 public short errors; /* count of errors */
75 public int lines; /* lines seen */
76 public int columns; /* at start of current token */
77 public boolean waswhite; /* used to collapse contiguous white space */
78 public boolean pushed; /* true after token has been pushed back */
79 public boolean insertspace; /* when space is moved after end tag */
80 public boolean excludeBlocks; /* Netscape compatibility */
81 public boolean exiled; /* true if moved out of table */
82 public boolean isvoyager; /* true if xmlns attribute on html element */
83 public short versions; /* bit vector of HTML versions */
84 public int doctype; /* version as given by doctype (if any) */
85 public boolean badDoctype; /* e.g. if html or PUBLIC is missing */
86 public int txtstart; /* start of current node */
87 public int txtend; /* end of current node */
88 public short state; /* state of lexer's finite state machine */
92 lexer character buffer
94 parse tree nodes span onto this buffer
95 which contains the concatenated text
96 contents of all of the elements.
98 lexsize must be reset for each file.
100 public byte[] lexbuf; /* byte buffer of UTF-8 chars */
101 public int lexlength; /* allocated */
102 public int lexsize; /* used */
104 /* Inline stack for compatibility with Mosaic */
105 public Node inode; /* for deferring text node */
106 public int insert; /* for inferring inline tags */
108 public int istackbase; /* start of frame */
110 public Style styles; /* used for cleaning up presentation markup */
// Configuration object handed to the constructor; read throughout the lexer
// (e.g. XmlTags, docTypeMode, wraplen).
112 public Configuration configuration;
113 protected int seenBodyEndTag; /* used by parser */
// Every node created or cloned through this lexer. addByte() replaces lexbuf
// when it grows, and updateNodeTextArrays() walks this list to re-point the
// nodes that still reference the old array.
114 private Vector nodeList;
/**
 * Creates a lexer in its initial state: the state machine starts in
 * LEX_CONTENT, every HTML version is still considered possible
 * (Dict.VERS_EVERYTHING), no doctype has been seen (Dict.VERS_UNKNOWN),
 * and the inline stack and node registry are empty.
 *
 * NOTE(review): several lines of this constructor are not visible in this
 * listing; the assignments of iFile and in are presumably among them.
 *
 * @param iFile         resource associated with this lexer (presumably the
 *                      value returned by getIFile() - confirm)
 * @param in            input stream to tokenize
 * @param configuration configuration stored in this.configuration
 */
116 public Lexer(IFile iFile, StreamIn in, Configuration configuration)
122 this.state = LEX_CONTENT;
129 this.waswhite = false;
131 this.insertspace = false;
133 this.isvoyager = false;
134 this.versions = Dict.VERS_EVERYTHING;
135 this.doctype = Dict.VERS_UNKNOWN;
136 this.badDoctype = false;
145 this.istack = new Stack();
148 this.configuration = configuration;
149 this.seenBodyEndTag = 0;
150 this.nodeList = new Vector();
/**
 * Accessor for the IFile associated with this lexer.
 * NOTE(review): the body is not visible in this listing; presumably it
 * returns the iFile passed to the constructor - confirm.
 */
153 public IFile getIFile() {
/**
 * Creates an empty Node and registers it in nodeList so that
 * updateNodeTextArrays() can re-point its text buffer if lexbuf is replaced.
 * (The return statement is not visible in this listing; presumably the new
 * node is returned.)
 */
157 public Node newNode()
159 Node node = new Node();
160 nodeList.addElement(node);
/**
 * Creates a Node of the given type spanning textarray[start..end) and
 * registers it in nodeList (see updateNodeTextArrays()).
 * (The return statement is not visible in this listing.)
 *
 * @param type      node type constant, e.g. Node.TextNode
 * @param textarray buffer holding the node's text, normally this.lexbuf
 * @param start     offset of the first byte of the node's text
 * @param end       offset just past the node's text
 */
164 public Node newNode(short type, byte[] textarray, int start, int end)
166 Node node = new Node(type, textarray, start, end);
167 nodeList.addElement(node);
/**
 * Creates a Node for a named element, passing the tag table
 * (configuration.tt) to the Node constructor, and registers the node in
 * nodeList (see updateNodeTextArrays()).
 * (The return statement is not visible in this listing.)
 *
 * @param type      node type constant, e.g. Node.StartTag
 * @param textarray buffer holding the node's text, normally this.lexbuf
 * @param start     offset of the first byte of the node's text
 * @param end       offset just past the node's text
 * @param element   element name
 */
171 public Node newNode(short type, byte[] textarray, int start, int end, String element)
173 Node node = new Node(type, textarray, start, end, element, configuration.tt);
174 nodeList.addElement(node);
/**
 * Clones a node and registers the clone, plus the asp/php nodes hanging off
 * each of its attributes, in nodeList.
 *
 * NOTE(review): the lines between the loop header and the two addElement
 * calls are not visible here. They presumably guard against null asp/php;
 * confirm, because updateNodeTextArrays() dereferences every entry.
 */
178 public Node cloneNode(Node node)
180 Node cnode = (Node)node.clone();
181 nodeList.addElement(cnode);
182 for (AttVal att = cnode.attributes; att != null; att = att.next) {
184 nodeList.addElement(att.asp);
186 nodeList.addElement(att.php);
/**
 * Clones an attribute list and registers the asp/php nodes of every cloned
 * attribute in nodeList.
 *
 * NOTE(review): attrs is dereferenced unconditionally by attrs.clone(), so a
 * null list would throw unless a guard sits on a line not visible here.
 * The lines before each addElement call are likewise not visible.
 */
191 public AttVal cloneAttributes(AttVal attrs)
193 AttVal cattrs = (AttVal)attrs.clone();
194 for (AttVal att = cattrs; att != null; att = att.next) {
196 nodeList.addElement(att.asp);
198 nodeList.addElement(att.php);
/**
 * Re-points every registered node that still references oldtextarray to
 * newtextarray. The comparison is by array identity, so nodes backed by any
 * other array are left alone. addByte() calls this after replacing lexbuf.
 *
 * @param oldtextarray the buffer being retired
 * @param newtextarray the buffer that replaces it
 */
203 protected void updateNodeTextArrays(byte[] oldtextarray, byte[] newtextarray)
206 for (int i = 0; i < nodeList.size(); i++) {
207 node = (Node)(nodeList.elementAt(i));
208 if (node.textarray == oldtextarray)
209 node.textarray = newtextarray;
213 /* used for creating preformatted text from Word2000 */
/**
 * Creates a node whose text is a single '\n' appended to the lexer buffer.
 */
214 public Node newLineNode()
216 Node node = newNode();
// lexbuf is captured before the append below, which may replace the buffer.
// That is safe: newNode() registered the node, so addByte() re-points it.
218 node.textarray = this.lexbuf;
219 node.start = this.lexsize;
220 addCharToLexer((int)'\n');
221 node.end = this.lexsize;
225 // Should always be able to convert to/from UTF-8, so encoding exceptions are
226 // converted to an Error to avoid adding throws declarations to every caller.
229 public static byte[] getBytes(String str) {
231 return str.getBytes("UTF8");
232 } catch (java.io.UnsupportedEncodingException e) {
233 throw new Error("string to UTF-8 conversion failed: " + e.getMessage());
237 public static String getString(byte[] bytes, int offset, int length) {
239 return new String(bytes, offset, length, "UTF8");
240 } catch (java.io.UnsupportedEncodingException e) {
241 throw new Error("UTF-8 to string conversion failed: " + e.getMessage());
/**
 * Reports whether the underlying input stream has been exhausted.
 *
 * @return the value of this.in.isEndOfStream()
 */
245 public boolean endOfInput()
247 return this.in.isEndOfStream();
/**
 * Appends one byte to the lexer buffer, growing the buffer first if needed.
 *
 * The growth test uses lexsize + 1 so that a spare slot always remains after
 * the appended byte; the final statement writes a '\0' marker into it.
 * When the buffer is replaced, the old contents are copied over and all
 * registered nodes are re-pointed through updateNodeTextArrays().
 *
 * NOTE(review): some lines are not visible in this listing (e.g. the branch
 * joining the 8192 initial size to the doubling step, and whatever guards
 * the copy when the old buffer is still null) - confirm against the full file.
 *
 * @param c the byte to append; only the low 8 bits are stored
 */
250 public void addByte(int c)
252 if (this.lexsize + 1 >= this.lexlength)
254 while (this.lexsize + 1 >= this.lexlength)
256 if (this.lexlength == 0)
257 this.lexlength = 8192;
259 this.lexlength = this.lexlength * 2;
262 byte[] temp = this.lexbuf;
263 this.lexbuf = new byte[ this.lexlength ];
266 System.arraycopy( temp, 0, this.lexbuf, 0, temp.length );
267 updateNodeTextArrays(temp, this.lexbuf);
271 this.lexbuf[this.lexsize++] = (byte)c;
272 this.lexbuf[this.lexsize] = (byte)'\0'; /* debug */
/**
 * Overwrites the most recently appended byte of the lexer buffer with c.
 * Does nothing when the buffer is empty.
 *
 * @param c replacement byte
 */
275 public void changeChar(byte c)
277 if (this.lexsize > 0)
279 this.lexbuf[this.lexsize-1] = c;
283 /* store char c as UTF-8 encoded byte stream */
/**
 * Appends code point c to the lexer buffer as a UTF-8 style byte sequence,
 * one addByte() call per byte.
 *
 * NOTE(review): the first branches (presumably the single-byte case and the
 * test guarding the two-byte case) are not visible in this listing.
 * NOTE(review): the last branch emits a 5-byte sequence with lead byte 0xF8,
 * a form RFC 3629 does not allow; it is only reachable for c > 0x1FFFFF.
 *
 * @param c the character (code point) to encode
 */
284 public void addCharToLexer(int c)
// two-byte form: 110xxxxx 10xxxxxx
290 addByte(0xC0 | (c >> 6));
291 addByte(0x80 | (c & 0x3F));
// three-byte form: 1110xxxx 10xxxxxx 10xxxxxx
293 else if (c <= 0xFFFF)
295 addByte(0xE0 | (c >> 12));
296 addByte(0x80 | ((c >> 6) & 0x3F));
297 addByte(0x80 | (c & 0x3F));
// four-byte form: 11110xxx followed by three continuation bytes
299 else if (c <= 0x1FFFFF)
301 addByte(0xF0 | (c >> 18));
302 addByte(0x80 | ((c >> 12) & 0x3F));
303 addByte(0x80 | ((c >> 6) & 0x3F));
304 addByte(0x80 | (c & 0x3F));
// five-byte form: 111110xx followed by four continuation bytes
308 addByte(0xF8 | (c >> 24));
309 addByte(0x80 | ((c >> 18) & 0x3F));
310 addByte(0x80 | ((c >> 12) & 0x3F));
311 addByte(0x80 | ((c >> 6) & 0x3F));
312 addByte(0x80 | (c & 0x3F));
/**
 * Appends every char of str to the lexer buffer via addCharToLexer().
 *
 * NOTE(review): the loop walks UTF-16 code units (charAt), so a character
 * outside the Basic Multilingual Plane is encoded as two separate 3-byte
 * surrogate sequences rather than one 4-byte sequence.
 *
 * @param str text to append; must not be null
 */
316 public void addStringToLexer(String str)
318 for ( int i = 0; i < str.length(); i++ ) {
319 addCharToLexer( (int)str.charAt(i) );
324 No longer attempts to insert missing ';' for unknown
325 entities unless one was present already, since this
326 gives unexpected results.
328 For example: <a href="something.htm?foo&bar&fred">
329 was tidied to: <a href="something.htm?foo&bar;&fred;">
330 rather than: <a href="something.htm?foo&bar&fred">
332 My thanks to Maurice Buxton for spotting this.
/**
 * Parses an entity reference whose leading '&' is already the last byte in
 * the lexer buffer (start is set to lexsize - 1).
 *
 * Characters are read from the input while they can belong to the entity:
 * digits and 'x' for a numeric reference, NAMECHAR characters for a named
 * one. The first character that does not belong is pushed back. The
 * collected text is then looked up through EntityTable.entityCode().
 *
 * Diagnostics raised through Report.entityError():
 *   UNKNOWN_ENTITY      - lookup failed and something followed the '&'
 *   UNESCAPED_AMPERSAND - lookup failed on a bare '&'
 *   MISSING_SEMICOLON   - entity recognized but not terminated by ';'
 *
 * For a recognized entity lexsize is rewound to start, so the raw reference
 * text is dropped from the buffer.
 *
 * NOTE(review): many lines of this method are not visible in this listing
 * (loop header, handling of '#', the code that writes the replacement
 * character); the summary above covers only what is visible.
 * NOTE(review): the nbsp test uses (mode & Preformatted), but the mode
 * constants are the sequential values 0..3, not bit flags, so it is also
 * true for IgnoreMarkup (3 & 2 != 0) - confirm that is intended.
 *
 * @param mode one of the getToken() mode constants
 */
334 public void parseEntity(short mode)
338 boolean first = true;
339 boolean semicolon = false;
340 boolean numeric = false;
344 start = this.lexsize - 1; /* to start at "&" */
345 startcol = this.in.curcol - 1;
349 c = this.in.readChar();
350 if (c == StreamIn.EndOfStream) break;
357 if (first && c == '#')
368 /* AQ: Added flag for numeric entities so that numeric entities
369 with missing semi-colons are recognized.
370 Eg. "rep..." is recognized as "rep"
// NOTE(review): the example above looks entity-decoded; presumably it once read "&#114e&#112;..." is recognized as "rep"
372 if (numeric && ((c == 'x') || ((map & DIGIT) != 0)))
377 if (!numeric && ((map & NAMECHAR) != 0))
383 /* otherwise put it back */
385 this.in.ungetChar(c);
389 str = getString( this.lexbuf, start, this.lexsize - start );
390 ch = EntityTable.getDefaultEntityTable().entityCode( str );
392 /* deal with unrecognized entities */
395 /* set error position just before offending character */
396 this.lines = this.in.curline;
397 this.columns = startcol;
399 if (this.lexsize > start +1 )
401 Report.entityError(this, Report.UNKNOWN_ENTITY, str, ch);
408 Report.entityError(this, Report.UNESCAPED_AMPERSAND, str, ch);
413 if (c != ';') /* issue warning if not terminated by ';' */
415 /* set error position just before offending character */
416 this.lines = this.in.curline;
417 this.columns = startcol;
418 Report.entityError(this, Report.MISSING_SEMICOLON, str, c);
// drop the raw entity text; the buffer is rewound to where the '&' was
421 this.lexsize = start;
423 if (ch == 160 && (mode & Preformatted) != 0)
428 if (ch == '&' && !this.configuration.QuoteAmpersand)
/**
 * Reads the rest of a tag name from the input. The first letter is already
 * in the lexer buffer at txtstart.
 *
 * Unless configuration.XmlTags is set, upper-case letters are folded to
 * lower case by adding 'a' - 'A'. Reading stops at end of stream or at the
 * first non-NAMECHAR character. txtend is then set to the end of the name.
 *
 * NOTE(review): the lines that compute map, append c to the buffer and
 * return the terminating character are not visible in this listing.
 */
438 public char parseTagName()
443 /* fold case of first char in buffer */
445 c = this.lexbuf[this.txtstart];
448 if (!this.configuration.XmlTags && (map & UPPERCASE) != 0)
450 c += (int)((int)'a' - (int)'A');
451 this.lexbuf[this.txtstart] = (byte)c;
456 c = this.in.readChar();
457 if (c == StreamIn.EndOfStream) break;
460 if ((map & NAMECHAR) == 0)
463 /* fold case of subsequent chars */
465 if (!this.configuration.XmlTags && (map & UPPERCASE) != 0)
466 c += (int)((int)'a' - (int)'A');
471 this.txtend = this.lexsize;
/**
 * Appends every char of str to the lexer buffer via addCharToLexer().
 * The visible body is identical to addStringToLexer().
 *
 * @param str text to append; must not be null
 */
475 public void addStringLiteral(String str)
477 for ( int i = 0; i < str.length(); i++ ) {
478 addCharToLexer( (int)str.charAt(i) );
482 /* choose what version to use for new doctype */
/**
 * Picks the HTML version for a new doctype from the this.versions bit
 * vector. Versions are tried in a fixed order (HTML 2.0, HTML 3.2,
 * HTML 4.0 strict, HTML 4.0 loose, frameset) and the first whose bit is
 * still set wins.
 *
 * @return the chosen Dict.VERS_* constant, or Dict.VERS_UNKNOWN when none of
 *         the tested bits is set
 */
483 public short HTMLVersion()
487 versions = this.versions;
489 if ((versions & Dict.VERS_HTML20) != 0)
490 return Dict.VERS_HTML20;
492 if ((versions & Dict.VERS_HTML32) != 0)
493 return Dict.VERS_HTML32;
495 if ((versions & Dict.VERS_HTML40_STRICT) != 0)
496 return Dict.VERS_HTML40_STRICT;
498 if ((versions & Dict.VERS_HTML40_LOOSE) != 0)
499 return Dict.VERS_HTML40_LOOSE;
501 if ((versions & Dict.VERS_FRAMES) != 0)
502 return Dict.VERS_FRAMES;
504 return Dict.VERS_UNKNOWN;
/**
 * Returns the display name of the version reported by apparentVersion().
 * The W3CVersion table is searched for the matching code, and either the
 * entry's voyagerName or its name is returned.
 *
 * NOTE(review): the condition choosing between voyagerName and name, and
 * the value returned when no entry matches, are not visible in this listing
 * (presumably this.isvoyager and null respectively - confirm).
 */
507 public String HTMLVersionName()
512 guessed = apparentVersion();
514 for (j = 0; j < W3CVersion.length; ++j)
516 if (guessed == W3CVersion[j].code)
519 return W3CVersion[j].voyagerName;
521 return W3CVersion[j].name;
528 /* add meta element for Tidy */
/**
 * Ensures the document head carries a Tidy "generator" meta element.
 *
 * The children of the head are scanned for a meta element whose name
 * attribute equals "generator" (case-insensitive) and whose content starts
 * with "HTML Tidy". Otherwise a new inferred meta element is built as
 * name="generator" content="HTML Tidy, see www.w3.org" and inserted as the
 * first child of the head.
 *
 * NOTE(review): the return statements and the head == null handling are not
 * visible in this listing.
 *
 * @param root document root node
 */
529 public boolean addGenerator(Node root)
533 Node head = root.findHEAD(configuration.tt);
537 for (node = head.content; node != null; node = node.next)
539 if (node.tag == configuration.tt.tagMeta)
541 attval = node.getAttrByName("name");
543 if (attval != null && attval.value != null &&
544 Lexer.wstrcasecmp(attval.value, "generator") == 0)
546 attval = node.getAttrByName("content");
// length check first so that substring(0, 9) cannot run past the value
548 if (attval != null && attval.value != null &&
549 attval.value.length() >= 9 &&
550 Lexer.wstrcasecmp(attval.value.substring(0, 9), "HTML Tidy") == 0)
558 node = this.inferredTag("meta");
559 node.addAttribute("content", "HTML Tidy, see www.w3.org");
560 node.addAttribute("name", "generator");
561 Node.insertNodeAtStart(head, node);
568 /* return true if substring s is in p and isn't all in upper case */
569 /* this is used to check the case of SYSTEM, PUBLIC, DTD and EN */
570 /* len is how many chars to check in p */
/**
 * NOTE(review): the loop header and the declarations of i, n and ps are not
 * visible in this listing; n is presumably s.length().
 *
 * @param s   keyword to look for, given in its correct (upper-case) spelling
 * @param p   text to search
 * @param len number of chars of p to examine
 */
571 private static boolean findBadSubString(String s, String p, int len)
579 ps = p.substring(i, i + n);
// case-insensitive hit: it is "bad" only when the spelling differs from s
580 if (wstrcasecmp(s, ps) == 0)
581 return (!ps.equals(s.substring(0, n)));
/**
 * Checks the spelling case of the keywords SYSTEM, PUBLIC, //DTD, //W3C and
 * //EN inside a doctype node's text.
 *
 * NOTE(review): the statement wrapping the condition below is not visible.
 * findGivenVersion() warns DTYPE_NOT_UPPER_CASE when this method returns
 * false, so the result is presumably the negation of the OR chain - confirm.
 * NOTE(review): len is a byte count (end - start) but is passed on as a
 * character count; the two differ if the doctype contains non-ASCII text.
 *
 * @param doctype doctype node whose text lies in lexbuf[start..end)
 */
590 public boolean checkDocTypeKeyWords(Node doctype)
592 int len = doctype.end - doctype.start;
593 String s = getString(this.lexbuf, doctype.start, len);
596 findBadSubString("SYSTEM", s, len) ||
597 findBadSubString("PUBLIC", s, len) ||
598 findBadSubString("//DTD", s, len) ||
599 findBadSubString("//W3C", s, len) ||
600 findBadSubString("//EN", s, len)
604 /* examine <!DOCTYPE> to identify version */
/**
 * Works out which HTML version a doctype declaration names.
 *
 * Visible steps:
 *   1. The text must start with "html " (case-insensitive).
 *   2. A DTYPE_NOT_UPPER_CASE warning is issued if checkDocTypeKeyWords()
 *      returns false.
 *   3. If the keyword after "html " is SYSTEM, its case is normalized in
 *      place in lexbuf and 0 (unrecognized) is returned.
 *   4. If the keyword is PUBLIC, its case is normalized in place.
 *      badDoctype is set on a path whose condition is not visible.
 *   5. The text after each double quote is tested for the prefix
 *      "-//W3C//DTD " or "-//IETF//DTD ". The identifier that follows, up to
 *      the next '/', is compared with the names in the W3CVersion table.
 *
 * NOTE(review): many lines are not visible in this listing, including the
 * assignments of len and the remaining return statements.
 * NOTE(review): the fixed-length getString() reads (5, 7, 12 and 13 bytes)
 * are not bounded by doctype.end in the visible code; a very short doctype
 * near the end of lexbuf could index past the array - confirm.
 *
 * @param doctype doctype node whose text lies in lexbuf[start..end)
 * @return the matching version code from the W3CVersion table, or 0 for a
 *         SYSTEM-only doctype
 */
605 public short findGivenVersion(Node doctype)
613 /* if root tag for doctype isn't html give up now */
614 str1 = getString(this.lexbuf, doctype.start, 5);
615 if (wstrcasecmp(str1, "html ") != 0)
618 if (!checkDocTypeKeyWords(doctype))
619 Report.warning(this, doctype, null, Report.DTYPE_NOT_UPPER_CASE);
621 /* give up if all we are given is the system id for the doctype */
622 str1 = getString(this.lexbuf, doctype.start + 5, 7);
623 if (wstrcasecmp(str1, "SYSTEM ") == 0)
625 /* but at least ensure the case is correct */
626 if (!str1.substring(0, 6).equals("SYSTEM"))
627 System.arraycopy( getBytes("SYSTEM"), 0,
628 this.lexbuf, doctype.start + 5, 6 );
629 return 0; /* unrecognized */
632 if (wstrcasecmp(str1, "PUBLIC ") == 0)
634 if (!str1.substring(0, 6).equals("PUBLIC"))
// only the 6 keyword bytes are copied; the trailing space is not written
635 System.arraycopy( getBytes("PUBLIC "), 0,
636 this.lexbuf, doctype.start + 5, 6 );
639 this.badDoctype = true;
641 for (i = doctype.start; i < doctype.end; ++i)
643 if (this.lexbuf[i] == (byte)'"')
// str1 and str2 are the 12 and 13 bytes after the quote: the two prefix lengths
645 str1 = getString( this.lexbuf, i + 1, 12 );
646 str2 = getString( this.lexbuf, i + 1, 13 );
647 if (str1.equals("-//W3C//DTD "))
649 /* compute length of identifier e.g. "HTML 4.0 Transitional" */
650 for (j = i + 13; j < doctype.end && this.lexbuf[j] != (byte)'/'; ++j);
652 p = getString( this.lexbuf, i + 13, len );
// entry 0 is skipped here; it is the one matched in the IETF branch below
654 for (j = 1; j < W3CVersion.length; ++j)
656 s = W3CVersion[j].name;
657 if (len == s.length() && s.equals(p))
658 return W3CVersion[j].code;
661 /* else unrecognized version */
663 else if (str2.equals("-//IETF//DTD "))
665 /* compute length of identifier e.g. "HTML 2.0" */
666 for (j = i + 14; j < doctype.end && this.lexbuf[j] != (byte)'/'; ++j);
669 p = getString( this.lexbuf, i + 14, len );
670 s = W3CVersion[0].name;
671 if (len == s.length() && s.equals(p))
672 return W3CVersion[0].code;
674 /* else unrecognized version */
/**
 * Makes the xmlns attribute of the html element agree with profile.
 *
 * The html element is located among the children of root. If it already has
 * an xmlns attribute with a different value, an INCONSISTENT_NAMESPACE
 * warning is issued and the value is overwritten. On another path a new
 * AttVal is created, resolved through the default AttributeTable and put at
 * the head of the element's attribute list.
 *
 * NOTE(review): several lines are not visible in this listing, including
 * the handling of "html element not found" and the remaining arguments of
 * the AttVal constructor (presumably "xmlns" and profile - confirm).
 *
 * @param root    document root node
 * @param profile namespace URI the html element should declare
 */
683 public void fixHTMLNameSpace(Node root, String profile)
// empty-bodied loop: it only advances node to the html element (or null)
688 for (node = root.content;
689 node != null && node.tag != configuration.tt.tagHtml; node = node.next);
695 for (attr = node.attributes; attr != null; attr = attr.next)
697 if (attr.attribute.equals("xmlns"))
705 if (!attr.value.equals(profile))
707 Report.warning(this, node, null, Report.INCONSISTENT_NAMESPACE);
708 attr.value = profile;
713 attr = new AttVal( node.attributes, null, (int)'"',
716 AttributeTable.getDefaultAttributeTable().findAttribute( attr );
717 node.attributes = attr;
/**
 * Installs an XHTML 1.0 doctype on the document.
 *
 * The formal public identifier (fpi) and system identifier (sysid) are
 * chosen from configuration.docTypeMode:
 *   DOCTYPE_OMIT   - the existing doctype node is discarded
 *   DOCTYPE_AUTO   - strict, transitional or frameset according to the bits
 *                    left in this.versions (transitional as the fallback)
 *   DOCTYPE_STRICT - XHTML 1.0 Strict
 *   DOCTYPE_LOOSE  - XHTML 1.0 Transitional
 *   DOCTYPE_USER   - fpi taken from configuration.docTypeStr when non-null
 * The html element's namespace is fixed up through fixHTMLNameSpace(). The
 * declaration text is then appended to the lexer buffer and the doctype
 * node's start/end are pointed at it.
 *
 * NOTE(review): many lines are not visible in this listing (return
 * statements, the conditions around node creation and discarding).
 * NOTE(review): fpi.charAt(0) would throw for an empty docTypeStr unless a
 * guard sits on a line not visible here.
 *
 * @param root document root node
 */
722 public boolean setXHTMLDocType(Node root)
726 String namespace = XHTML_NAMESPACE;
729 doctype = root.findDocType();
731 if (configuration.docTypeMode == Configuration.DOCTYPE_OMIT)
734 Node.discardElement(doctype);
738 if (configuration.docTypeMode == Configuration.DOCTYPE_AUTO)
740 /* see what flavor of XHTML this document matches */
741 if ((this.versions & Dict.VERS_HTML40_STRICT) != 0)
742 { /* use XHTML strict */
743 fpi = "-//W3C//DTD XHTML 1.0 Strict//EN";
744 sysid = voyager_strict;
746 else if ((this.versions & Dict.VERS_LOOSE) != 0)
748 fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
749 sysid = voyager_loose;
751 else if ((this.versions & Dict.VERS_FRAMES) != 0)
752 { /* use XHTML frames */
753 fpi = "-//W3C//DTD XHTML 1.0 Frameset//EN";
754 sysid = voyager_frameset;
756 else /* let's assume XHTML transitional */
758 fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
759 sysid = voyager_loose;
762 else if (configuration.docTypeMode == Configuration.DOCTYPE_STRICT)
764 fpi = "-//W3C//DTD XHTML 1.0 Strict//EN";
765 sysid = voyager_strict;
767 else if (configuration.docTypeMode == Configuration.DOCTYPE_LOOSE)
769 fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
770 sysid = voyager_loose;
773 fixHTMLNameSpace(root, namespace);
// new doctype node becomes the first child of root; its text span is set below
777 doctype = newNode(Node.DocTypeTag, this.lexbuf, 0, 0);
778 doctype.next = root.content;
779 doctype.parent = root;
781 root.content = doctype;
784 if (configuration.docTypeMode == Configuration.DOCTYPE_USER &&
785 configuration.docTypeStr != null)
787 fpi = configuration.docTypeStr;
791 this.txtstart = this.lexsize;
792 this.txtend = this.lexsize;
794 /* add public identifier */
795 addStringLiteral("html PUBLIC ");
797 /* check if the fpi is quoted or not */
798 if (fpi.charAt(0) == '"')
799 addStringLiteral(fpi);
802 addStringLiteral("\"");
803 addStringLiteral(fpi);
804 addStringLiteral("\"");
// choose the line-break form before the system identifier based on wraplen
807 if (sysid.length() + 6 >= this.configuration.wraplen)
808 addStringLiteral("\n\"");
810 addStringLiteral("\n \"");
812 /* add system identifier */
813 addStringLiteral(sysid);
814 addStringLiteral("\"");
816 this.txtend = this.lexsize;
818 doctype.start = this.txtstart;
819 doctype.end = this.txtend;
/**
 * Returns the HTML version the document appears to use.
 *
 * If no doctype was seen (this.doctype == Dict.VERS_UNKNOWN) the answer is
 * HTMLVersion(). Otherwise the declared version is returned as long as its
 * bit is still set in this.versions, i.e. the content seen so far is
 * consistent with the declaration. When it is not, an INCONSISTENT_VERSION
 * warning is issued and HTMLVersion() chooses a version instead.
 *
 * NOTE(review): some break statements are not visible in this listing.
 *
 * @return a Dict.VERS_* constant
 */
824 public short apparentVersion()
826 switch (this.doctype)
828 case Dict.VERS_UNKNOWN:
829 return HTMLVersion();
831 case Dict.VERS_HTML20:
832 if ((this.versions & Dict.VERS_HTML20) != 0)
833 return Dict.VERS_HTML20;
837 case Dict.VERS_HTML32:
838 if ((this.versions & Dict.VERS_HTML32) != 0)
839 return Dict.VERS_HTML32;
841 break; /* to replace old version by new */
843 case Dict.VERS_HTML40_STRICT:
844 if ((this.versions & Dict.VERS_HTML40_STRICT) != 0)
845 return Dict.VERS_HTML40_STRICT;
849 case Dict.VERS_HTML40_LOOSE:
850 if ((this.versions & Dict.VERS_HTML40_LOOSE) != 0)
851 return Dict.VERS_HTML40_LOOSE;
853 break; /* to replace old version by new */
855 case Dict.VERS_FRAMES:
856 if ((this.versions & Dict.VERS_FRAMES) != 0)
857 return Dict.VERS_FRAMES;
// reached only when the declared version conflicts with the content seen
862 Report.warning(this, null, null, Report.INCONSISTENT_VERSION);
863 return this.HTMLVersion();
866 /* fixup doctype if missing */
/**
 * Replaces or creates the document's doctype declaration.
 *
 * Visible behaviour:
 *   - a MALFORMED_DOCTYPE warning may be issued (condition not visible);
 *   - DOCTYPE_OMIT discards the existing doctype node;
 *   - DOCTYPE_STRICT and DOCTYPE_LOOSE discard it and force HTML 4.0
 *     strict / loose;
 *   - DOCTYPE_AUTO inspects this.doctype against this.versions and falls
 *     back to HTMLVersion() to choose a new version;
 *   - for XML output, XML tags or a voyager document, the html element's
 *     namespace is set from the matching W3CVersion profile;
 *   - otherwise a doctype node is inserted as the first child of root and
 *     'html PUBLIC "..."' is appended to the lexer buffer. The identifier
 *     is configuration.docTypeStr for DOCTYPE_USER, the IETF identifier
 *     for HTML 2.0, or "-//W3C//DTD <name>//EN" built from the table.
 *
 * NOTE(review): many lines are not visible in this listing (return
 * statements, the bodies of the switch cases, several conditions); verify
 * the summary above against the full file before relying on it.
 *
 * @param root document root node
 */
867 public boolean fixDocType(Node root)
870 int guessed = Dict.VERS_HTML40_STRICT, i;
873 Report.warning(this, null, null, Report.MALFORMED_DOCTYPE);
875 if (configuration.XmlOut)
878 doctype = root.findDocType();
880 if (configuration.docTypeMode == Configuration.DOCTYPE_OMIT)
883 Node.discardElement(doctype);
887 if (configuration.docTypeMode == Configuration.DOCTYPE_STRICT)
889 Node.discardElement(doctype);
891 guessed = Dict.VERS_HTML40_STRICT;
893 else if (configuration.docTypeMode == Configuration.DOCTYPE_LOOSE)
895 Node.discardElement(doctype);
897 guessed = Dict.VERS_HTML40_LOOSE;
899 else if (configuration.docTypeMode == Configuration.DOCTYPE_AUTO)
903 if (this.doctype == Dict.VERS_UNKNOWN)
906 switch (this.doctype)
908 case Dict.VERS_UNKNOWN:
911 case Dict.VERS_HTML20:
912 if ((this.versions & Dict.VERS_HTML20) != 0)
915 break; /* to replace old version by new */
917 case Dict.VERS_HTML32:
918 if ((this.versions & Dict.VERS_HTML32) != 0)
921 break; /* to replace old version by new */
923 case Dict.VERS_HTML40_STRICT:
924 if ((this.versions & Dict.VERS_HTML40_STRICT) != 0)
927 break; /* to replace old version by new */
929 case Dict.VERS_HTML40_LOOSE:
930 if ((this.versions & Dict.VERS_HTML40_LOOSE) != 0)
933 break; /* to replace old version by new */
935 case Dict.VERS_FRAMES:
936 if ((this.versions & Dict.VERS_FRAMES) != 0)
939 break; /* to replace old version by new */
942 /* INCONSISTENT_VERSION warning is now issued by ApparentVersion() */
945 /* choose new doctype */
946 guessed = HTMLVersion();
949 if (guessed == Dict.VERS_UNKNOWN)
952 /* for XML use the Voyager system identifier */
953 if (this.configuration.XmlOut || this.configuration.XmlTags || this.isvoyager)
956 Node.discardElement(doctype);
958 for (i = 0; i < W3CVersion.length; ++i)
960 if (guessed == W3CVersion[i].code)
962 fixHTMLNameSpace(root, W3CVersion[i].profile);
// new doctype node becomes the first child of root; its text span is set below
972 doctype = newNode(Node.DocTypeTag, this.lexbuf, 0, 0);
973 doctype.next = root.content;
974 doctype.parent = root;
976 root.content = doctype;
979 this.txtstart = this.lexsize;
980 this.txtend = this.lexsize;
982 /* use the appropriate public identifier */
983 addStringLiteral("html PUBLIC ");
985 if (configuration.docTypeMode == Configuration.DOCTYPE_USER &&
986 configuration.docTypeStr != null)
987 addStringLiteral(configuration.docTypeStr);
988 else if (guessed == Dict.VERS_HTML20)
989 addStringLiteral("\"-//IETF//DTD HTML 2.0//EN\"");
992 addStringLiteral("\"-//W3C//DTD ");
994 for (i = 0; i < W3CVersion.length; ++i)
996 if (guessed == W3CVersion[i].code)
998 addStringLiteral(W3CVersion[i].name);
1003 addStringLiteral("//EN\"");
1006 this.txtend = this.lexsize;
1008 doctype.start = this.txtstart;
1009 doctype.end = this.txtend;
1014 /* ensure XML document starts with <?XML version="1.0"?> */
/**
 * Ensures the document begins with an XML declaration.
 *
 * If the first child of root is a processing instruction whose text starts
 * with the bytes "xml", it is treated as the declaration. Otherwise a new
 * processing-instruction node is linked in front of root's content. Its
 * text, xml version="1.0" plus encoding="ISO-8859-1" when the configured
 * encoding is LATIN1, is appended to the lexer buffer.
 *
 * NOTE(review): the return statements and the line that makes the new node
 * root.content are not visible in this listing.
 * NOTE(review): xml.next is assigned root.content twice in the visible code
 * (once unconditionally, once inside the null check); the second looks
 * redundant.
 *
 * @param root document root node
 */
1015 public boolean fixXMLPI(Node root)
1020 if( root.content != null && root.content.type == Node.ProcInsTag)
1022 s = root.content.start;
1024 if (this.lexbuf[s] == (byte)'x' &&
1025 this.lexbuf[s+1] == (byte)'m' &&
1026 this.lexbuf[s+2] == (byte)'l')
1030 xml = newNode(Node.ProcInsTag, this.lexbuf, 0, 0);
1031 xml.next = root.content;
1033 if (root.content != null)
1035 root.content.prev = xml;
1036 xml.next = root.content;
1041 this.txtstart = this.lexsize;
1042 this.txtend = this.lexsize;
1043 addStringLiteral("xml version=\"1.0\"");
1044 if (this.configuration.CharEncoding == Configuration.LATIN1)
1045 addStringLiteral(" encoding=\"ISO-8859-1\"");
1046 this.txtend = this.lexsize;
1048 xml.start = this.txtstart;
1049 xml.end = this.txtend;
/**
 * Creates a start-tag node for an element that was not present in the input
 * and marks it as implicit.
 *
 * NOTE(review): the remaining newNode() arguments and the return statement
 * are not visible in this listing.
 *
 * @param name element name, e.g. "meta"
 */
1053 public Node inferredTag(String name)
1057 node = newNode(Node.StartTag,
1062 node.implicit = true;
/**
 * Tells whether a node is a start tag that can have content.
 *
 * Three conditions are tested in order: the node is not a start tag, the
 * element is unknown (tag == null), and the element's content model has
 * the CM_EMPTY bit set.
 *
 * NOTE(review): the return statements are not visible in this listing, so
 * the result of each test is not asserted here.
 *
 * @param node node to examine
 */
1066 public static boolean expectsContent(Node node)
1068 if (node.type != Node.StartTag)
1071 /* unknown element? */
1072 if (node.tag == null)
1075 if ((node.tag.model & Dict.CM_EMPTY) != 0)
1082 create a text node for the contents of
1083 a CDATA element like style or script
1084 which ends with </foo> for some foo.
/**
 * Reads the raw content of a CDATA-style element up to its matching end
 * tag and returns it as a text token.
 *
 * Characters are copied into the lexer buffer while watching for "</".
 * When a '>' follows, the text collected since the "</" is compared
 * (case-insensitively, same length) with container.element. On a match
 * txtend is set to just before the "</". Otherwise a BAD_CDATA_CONTENT
 * warning is issued and, for JavaScript containers, a backslash is inserted
 * before the '/' by shifting the buffer one byte to the right.
 * A MISSING_ENDTAG_FOR warning is issued if the input ends first.
 *
 * NOTE(review): many lines are not visible in this listing (loop header,
 * initialisation of lastc/start, loop exits, return statements).
 * NOTE(review): the shift loop writes lexbuf[lexsize]; the visible code
 * relies on the spare byte addByte() keeps after the last used position.
 * The matching lexsize increment is presumably on a line not visible here.
 *
 * @param container the element whose content is being read
 */
1086 public Node getCDATA(Node container)
1088 int c, lastc, start, len, i;
1090 boolean endtag = false;
1092 this.lines = this.in.curline;
1093 this.columns = this.in.curcol;
1094 this.waswhite = false;
1095 this.txtstart = this.lexsize;
1096 this.txtend = this.lexsize;
1103 c = this.in.readChar();
1104 if (c == StreamIn.EndOfStream) break;
1105 /* treat \r\n as \n and \r as \n */
1107 if (c == (int)'/' && lastc == (int)'<')
1111 this.lines = this.in.curline;
1112 this.columns = this.in.curcol - 3;
1114 Report.warning(this, null, null, Report.BAD_CDATA_CONTENT);
1117 start = this.lexsize + 1; /* to first letter */
1120 else if (c == (int)'>' && start >= 0)
1122 len = this.lexsize - start;
1123 if (len == container.element.length())
1125 str = getString( this.lexbuf, start, len );
1126 if (Lexer.wstrcasecmp(str, container.element) == 0)
// start is the first letter of the tag name, so start - 2 is the '<' of "</"
1128 this.txtend = start - 2;
1133 this.lines = this.in.curline;
1134 this.columns = this.in.curcol - 3;
1136 Report.warning(this, null, null, Report.BAD_CDATA_CONTENT);
1138 /* if javascript insert backslash before / */
1140 if (ParserImpl.isJavaScript(container))
// shift everything from the '/' onward one byte right to make room for '\'
1142 for (i = this.lexsize; i > start-1; --i)
1143 this.lexbuf[i] = this.lexbuf[i-1];
1145 this.lexbuf[start-1] = (byte)'\\';
1151 else if (c == (int)'\r')
1153 c = this.in.readChar();
1156 this.in.ungetChar(c);
1161 addCharToLexer((int)c);
1162 this.txtend = this.lexsize;
1166 if (c == StreamIn.EndOfStream)
1167 Report.warning(this, container, null, Report.MISSING_ENDTAG_FOR);
1169 if (this.txtend > this.txtstart)
1171 this.token = newNode(Node.TextNode,
/**
 * Pushes the current token back so that it is returned again.
 * NOTE(review): the body is not visible in this listing; presumably it sets
 * this.pushed, the flag documented as "true after token has been pushed
 * back" - confirm.
 */
1181 public void ungetToken()
/*
 * Mode values accepted by getToken() and parseEntity().
 * These are sequential values (0..3), not independent bit flags: a test
 * such as (mode & Preformatted) is also non-zero for IgnoreMarkup.
 */
1186 public static final short IgnoreWhitespace = 0;
1187 public static final short MixedContent = 1;
1188 public static final short Preformatted = 2;
1189 public static final short IgnoreMarkup = 3;
1192 modes for GetToken()
1194 MixedContent -- for elements which don't accept PCDATA
1195 Preformatted -- white space preserved as is
1196 IgnoreMarkup -- for CDATA elements such as script, style
1199 public Node getToken(short mode)
1205 MutableBoolean isempty = new MutableBoolean();
1210 /* duplicate inlines in preference to pushed text nodes when appropriate */
1211 if (this.token.type != Node.TextNode ||
1212 (this.insert == -1 && this.inode == null))
1214 this.pushed = false;
1219 /* at start of block elements, unclosed inline
1220 elements are inserted into the token stream */
1222 if (this.insert != -1 || this.inode != null)
1223 return insertedToken();
1225 this.lines = this.in.curline;
1226 this.columns = this.in.curcol;
1227 this.waswhite = false;
1229 this.txtstart = this.lexsize;
1230 this.txtend = this.lexsize;
1234 c = this.in.readChar();
1235 if (c == StreamIn.EndOfStream) break;
1236 if (this.insertspace && mode != IgnoreWhitespace)
1238 addCharToLexer(' ');
1239 this.waswhite = true;
1240 this.insertspace = false;
1243 /* treat \r\n as \n and \r as \n */
1247 c = this.in.readChar();
1250 this.in.ungetChar(c);
1259 case LEX_CONTENT: /* element content */
1263 Discard white space if appropriate. It's cheaper
1264 to do this here rather than in parser methods
1265 for elements that don't have mixed content.
1267 if (((map & WHITE) != 0) && (mode == IgnoreWhitespace)
1268 && this.lexsize == this.txtstart + 1)
1271 this.waswhite = false;
1272 this.lines = this.in.curline;
1273 this.columns = this.in.curcol;
1279 this.state = LEX_GT;
1283 if ((map & WHITE) != 0)
1285 /* was previous char white? */
1288 if (mode != Preformatted && mode != IgnoreMarkup)
1291 this.lines = this.in.curline;
1292 this.columns = this.in.curcol;
1295 else /* prev char wasn't white */
1297 this.waswhite = true;
1300 if (mode != Preformatted && mode != IgnoreMarkup && c != ' ')
1301 changeChar((byte)' ');
1306 else if (c == '&' && mode != IgnoreMarkup)
1309 /* this is needed to avoid trimming trailing whitespace */
1310 if (mode == IgnoreWhitespace)
1311 mode = MixedContent;
1313 this.waswhite = false;
1316 case LEX_GT: /* < */
1318 /* check for endtag */
1321 c = this.in.readChar();
1322 if (c == StreamIn.EndOfStream)
1324 this.in.ungetChar(c);
1331 if ((map & LETTER) != 0)
1334 this.txtend = this.lexsize;
1335 this.in.ungetChar(c);
1336 this.state = LEX_ENDTAG;
1337 this.lexbuf[this.lexsize] = (byte)'\0'; /* debug */
1338 this.in.curcol -= 2;
1340 /* if some text before the </ return it now */
1341 if (this.txtend > this.txtstart)
1343 /* trim space char before end tag */
1344 if (mode == IgnoreWhitespace && this.lexbuf[this.lexsize - 1] == (byte)' ')
1347 this.txtend = this.lexsize;
1350 this.token = newNode(Node.TextNode,
1357 continue; /* no text so keep going */
1360 /* otherwise treat as CDATA */
1361 this.waswhite = false;
1362 this.state = LEX_CONTENT;
1366 if (mode == IgnoreMarkup)
1368 /* otherwise treat as CDATA */
1369 this.waswhite = false;
1370 this.state = LEX_CONTENT;
1375 look out for comments, doctype or marked sections
1376 this isn't quite right, but it's getting there ...
1380 c = this.in.readChar();
1384 c = this.in.readChar();
1388 this.state = LEX_COMMENT; /* comment */
1390 this.txtend = this.lexsize;
1392 /* if some text before < return it now */
1393 if (this.txtend > this.txtstart)
1395 this.token = newNode(Node.TextNode,
1402 this.txtstart = this.lexsize;
1406 Report.warning(this, null, null, Report.MALFORMED_COMMENT);
1408 else if (c == 'd' || c == 'D')
1410 this.state = LEX_DOCTYPE; /* doctype */
1412 this.txtend = this.lexsize;
1413 mode = IgnoreWhitespace;
1415 /* skip until white space or '>' */
1419 c = this.in.readChar();
1421 if (c == StreamIn.EndOfStream || c == '>')
1423 this.in.ungetChar(c);
1429 if ((map & WHITE) == 0)
1432 /* and skip to end of whitespace */
1436 c = this.in.readChar();
1438 if (c == StreamIn.EndOfStream || c == '>')
1440 this.in.ungetChar(c);
1446 if ((map & WHITE) != 0)
1449 this.in.ungetChar(c);
1456 /* if some text before < return it now */
1457 if (this.txtend > this.txtstart)
1459 this.token = newNode(Node.TextNode,
1466 this.txtstart = this.lexsize;
1471 /* Word 2000 embeds <![if ...]> ... <![endif]> sequences */
1473 this.state = LEX_SECTION;
1474 this.txtend = this.lexsize;
1476 /* if some text before < return it now */
1477 if (this.txtend > this.txtstart)
1479 this.token = newNode(Node.TextNode,
1486 this.txtstart = this.lexsize;
1490 /* otherwise swallow chars up to and including next '>' */
1493 c = this.in.readChar();
1494 if (c == '>') break;
1497 this.in.ungetChar(c);
1503 this.lexbuf[this.lexsize] = (byte)'\0';
1504 this.state = LEX_CONTENT;
1509 processing instructions
1515 this.state = LEX_PROCINSTR;
1516 this.txtend = this.lexsize;
1518 /* if some text before < return it now */
1519 if (this.txtend > this.txtstart)
1521 this.token = newNode(Node.TextNode,
1528 this.txtstart = this.lexsize;
1532 /* Microsoft ASP's e.g. <% ... server-code ... %> */
1536 this.state = LEX_ASP;
1537 this.txtend = this.lexsize;
1539 /* if some text before < return it now */
1540 if (this.txtend > this.txtstart)
1542 this.token = newNode(Node.TextNode,
1549 this.txtstart = this.lexsize;
1553 /* Netscape's JSTE e.g. <# ... server-code ... #> */
1557 this.state = LEX_JSTE;
1558 this.txtend = this.lexsize;
1560 /* if some text before < return it now */
1561 if (this.txtend > this.txtstart)
1563 this.token = newNode(Node.TextNode,
1570 this.txtstart = this.lexsize;
1576 /* check for start tag */
1577 if ((map & LETTER) != 0)
1579 this.in.ungetChar(c); /* push back letter */
1580 this.lexsize -= 2; /* discard "<" + letter */
1581 this.txtend = this.lexsize;
1582 this.state = LEX_STARTTAG; /* ready to read tag name */
1584 /* if some text before < return it now */
1585 if (this.txtend > this.txtstart)
1587 this.token = newNode(Node.TextNode,
1594 continue; /* no text so keep going */
1597 /* otherwise treat as CDATA */
1598 this.state = LEX_CONTENT;
1599 this.waswhite = false;
1602 case LEX_ENDTAG: /* </letter */
1603 this.txtstart = this.lexsize - 1;
1604 this.in.curcol += 2;
1606 this.token = newNode(Node.EndTag, /* create endtag token */
1610 getString(this.lexbuf,
1612 this.txtend - this.txtstart));
1613 this.lexsize = this.txtstart;
1614 this.txtend = this.txtstart;
1619 c = this.in.readChar();
1621 if (c == StreamIn.EndOfStream)
1625 if (c == StreamIn.EndOfStream)
1627 this.in.ungetChar(c);
1631 this.state = LEX_CONTENT;
1632 this.waswhite = false;
1633 return this.token; /* the endtag token */
1635 case LEX_STARTTAG: /* first letter of tagname */
1636 this.txtstart = this.lexsize - 1; /* set txtstart to first letter */
1638 isempty.value = false;
1640 this.token = newNode((isempty.value ? Node.StartEndTag : Node.StartTag),
1644 getString(this.lexbuf,
1646 this.txtend - this.txtstart));
1648 /* parse attributes, consuming closing ">" */
1652 this.in.ungetChar(c);
1654 attributes = parseAttrs(isempty);
1658 this.token.type = Node.StartEndTag;
1660 this.token.attributes = attributes;
1661 this.lexsize = this.txtstart;
1662 this.txtend = this.txtstart;
1664 /* swallow newline following start tag */
1665 /* special check needed for CRLF sequence */
1666 /* this doesn't apply to empty elements */
1668 if (expectsContent(this.token) ||
1669 this.token.tag == configuration.tt.tagBr)
1672 c = this.in.readChar();
1676 c = this.in.readChar();
1679 this.in.ungetChar(c);
1681 else if (c != '\n' && c != '\f')
1682 this.in.ungetChar(c);
1684 this.waswhite = true; /* to swallow leading whitespace */
1687 this.waswhite = false;
1689 this.state = LEX_CONTENT;
1691 if (this.token.tag == null)
1692 Report.error(this, null, this.token, Report.UNKNOWN_ELEMENT);
1693 else if (!this.configuration.XmlTags)
1695 this.versions &= this.token.tag.versions;
1697 if ((this.token.tag.versions & Dict.VERS_PROPRIETARY) != 0)
1699 if (!this.configuration.MakeClean && (this.token.tag == configuration.tt.tagNobr ||
1700 this.token.tag == configuration.tt.tagWbr))
1701 Report.warning(this, null, this.token, Report.PROPRIETARY_ELEMENT);
1704 if (this.token.tag.chkattrs != null)
1706 this.token.checkUniqueAttributes(this);
1707 this.token.tag.chkattrs.check(this, this.token);
1710 this.token.checkAttributes(this);
1713 return this.token; /* return start tag */
1715 case LEX_COMMENT: /* seen <!-- so look for --> */
1720 c = this.in.readChar();
1726 end_comment: while (true) {
1727 c = this.in.readChar();
1731 if (badcomment != 0)
1732 Report.warning(this, null, null, Report.MALFORMED_COMMENT);
1734 this.txtend = this.lexsize - 2; // AQ 8Jul2000
1735 this.lexbuf[this.lexsize] = (byte)'\0';
1736 this.state = LEX_CONTENT;
1737 this.waswhite = false;
1738 this.token = newNode(Node.CommentTag,
1743 /* now look for a line break */
1745 c = this.in.readChar();
1749 c = this.in.readChar();
1752 this.token.linebreak = true;
1756 this.token.linebreak = true;
1758 this.in.ungetChar(c);
1763 /* note position of first such error in the comment */
1764 if (badcomment == 0)
1766 this.lines = this.in.curline;
1767 this.columns = this.in.curcol - 3;
1771 if (this.configuration.FixComments)
1772 this.lexbuf[this.lexsize - 2] = (byte)'=';
1776 /* if '-' then look for '>' to end the comment */
1781 /* otherwise continue to look for --> */
1782 this.lexbuf[this.lexsize - 2] = (byte)'=';
1785 case LEX_DOCTYPE: /* seen <!d so look for '>' munging whitespace */
1788 if ((map & WHITE) != 0)
1793 this.waswhite = true;
1796 this.waswhite = false;
1802 this.txtend = this.lexsize;
1803 this.lexbuf[this.lexsize] = (byte)'\0';
1804 this.state = LEX_CONTENT;
1805 this.waswhite = false;
1806 this.token = newNode(Node.DocTypeTag,
1810 /* make a note of the version named by the doctype */
1811 this.doctype = findGivenVersion(this.token);
1814 case LEX_PROCINSTR: /* seen <? so look for '>' */
1815 /* check for PHP preprocessor instructions <?php ... ?> */
1817 if (this.lexsize - this.txtstart == 3)
1819 if ((getString(this.lexbuf, this.txtstart, 3)).equals("php"))
1821 this.state = LEX_PHP;
1826 if (this.configuration.XmlPIs) /* insist on ?> as terminator */
1831 /* now look for '>' */
1832 c = this.in.readChar();
1834 if (c == StreamIn.EndOfStream)
1836 Report.warning(this, null, null, Report.UNEXPECTED_END_OF_FILE);
1837 this.in.ungetChar(c);
1848 this.txtend = this.lexsize;
1849 this.lexbuf[this.lexsize] = (byte)'\0';
1850 this.state = LEX_CONTENT;
1851 this.waswhite = false;
1852 this.token = newNode(Node.ProcInsTag,
1858 case LEX_ASP: /* seen <% so look for "%>" */
1862 /* now look for '>' */
1863 c = this.in.readChar();
1868 this.in.ungetChar(c);
1873 this.txtend = this.lexsize;
1874 this.lexbuf[this.lexsize] = (byte)'\0';
1875 this.state = LEX_CONTENT;
1876 this.waswhite = false;
1877 this.token = newNode(Node.AspTag,
1883 case LEX_JSTE: /* seen <# so look for "#>" */
1887 /* now look for '>' */
1888 c = this.in.readChar();
1893 this.in.ungetChar(c);
1898 this.txtend = this.lexsize;
1899 this.lexbuf[this.lexsize] = (byte)'\0';
1900 this.state = LEX_CONTENT;
1901 this.waswhite = false;
1902 this.token = newNode(Node.JsteTag,
1908 case LEX_PHP: /* seen "<?php" so look for "?>" */
1912 /* now look for '>' */
1913 c = this.in.readChar();
1917 this.in.ungetChar(c);
1922 this.txtend = this.lexsize;
1923 this.lexbuf[this.lexsize] = (byte)'\0';
1924 this.state = LEX_CONTENT;
1925 this.waswhite = false;
1926 this.token = newNode(Node.PhpTag,
1932 case LEX_SECTION: /* seen "<![" so look for "]>" */
1935 if (this.lexsize == (this.txtstart + 6) &&
1936 (getString(this.lexbuf, this.txtstart, 6)).equals("CDATA["))
1938 this.state = LEX_CDATA;
1947 /* now look for '>' */
1948 c = this.in.readChar();
1952 this.in.ungetChar(c);
1957 this.txtend = this.lexsize;
1958 this.lexbuf[this.lexsize] = (byte)'\0';
1959 this.state = LEX_CONTENT;
1960 this.waswhite = false;
1961 this.token = newNode(Node.SectionTag,
1967 case LEX_CDATA: /* seen "<![CDATA[" so look for "]]>" */
1971 /* now look for ']' */
1972 c = this.in.readChar();
1976 this.in.ungetChar(c);
1980 /* now look for '>' */
1981 c = this.in.readChar();
1985 this.in.ungetChar(c);
1990 this.txtend = this.lexsize;
1991 this.lexbuf[this.lexsize] = (byte)'\0';
1992 this.state = LEX_CONTENT;
1993 this.waswhite = false;
1994 this.token = newNode(Node.CDATATag,
2002 if (this.state == LEX_CONTENT) /* text string */
2004 this.txtend = this.lexsize;
2006 if (this.txtend > this.txtstart)
2008 this.in.ungetChar(c);
2010 if (this.lexbuf[this.lexsize - 1] == (byte)' ')
2013 this.txtend = this.lexsize;
2016 this.token = newNode(Node.TextNode,
2023 else if (this.state == LEX_COMMENT) /* comment */
2025 if (c == StreamIn.EndOfStream)
2026 Report.warning(this, null, null, Report.MALFORMED_COMMENT);
2028 this.txtend = this.lexsize;
2029 this.lexbuf[this.lexsize] = (byte)'\0';
2030 this.state = LEX_CONTENT;
2031 this.waswhite = false;
2032 this.token = newNode(Node.CommentTag,
2043 parser for ASP within start tags
2045 Some people use ASP to customize attributes
2046 Tidy isn't really well suited to dealing with ASP
2047 This is a workaround for attributes, but won't
2048 deal with the case where the ASP is used to tailor
2049 the attribute value. Here is an example of a work
2050 around for using ASP in attribute values:
2052 href="<%=rsSchool.Fields("ID").Value%>"
2054 where the ASP that generates the attribute value
2055 is masked from Tidy by the quotemarks.
// Collects ASP markup met inside a start tag into the lexer buffer and
// wraps the collected text in a Node.AspTag node.
2059 public Node parseAsp()
// the ASP text starts at the current end of the lexer buffer
2064 this.txtstart = this.lexsize;
2068 c = this.in.readChar();
2075 c = this.in.readChar();
// the ASP text ends at the current end of the lexer buffer
2083 this.txtend = this.lexsize;
// only a non-empty span yields a node
2085 if (this.txtend > this.txtstart)
2086 asp = newNode(Node.AspTag,
// collapse the span so the same text is not picked up again
2091 this.txtstart = this.txtend;
2096 PHP is like ASP but is based upon XML
2097 processing instructions, e.g. <?php ... ?>
// Collects PHP markup met inside a start tag into the lexer buffer and
// wraps the collected text in a Node.PhpTag node.
2099 public Node parsePhp()
// the PHP text starts at the current end of the lexer buffer
2104 this.txtstart = this.lexsize;
2108 c = this.in.readChar();
2115 c = this.in.readChar();
// the PHP text ends at the current end of the lexer buffer
2123 this.txtend = this.lexsize;
// only a non-empty span yields a node
2125 if (this.txtend > this.txtstart)
2126 php = newNode(Node.PhpTag,
// collapse the span so the same text is not picked up again
2131 this.txtstart = this.txtend;
2135 /* consumes the '>' terminating start tags */
// Reads the next attribute name inside a start tag.
// isempty is set when the tag turns out to be an empty element; asp and php
// receive the node built by parseAsp() / parsePhp() when server markup is
// met in place of a name. The name is staged in lexbuf and is null when no
// characters were collected.
2136 public String parseAttribute(MutableBoolean isempty, MutableObject asp,
2140 // int len = 0; Removed by BUGFIX for 126265
2145 asp.setObject(null); /* clear asp pointer */
2146 php.setObject(null); /* clear php pointer */
2147 /* skip white space before the attribute */
2151 c = this.in.readChar();
2155 c = this.in.readChar();
// mark the tag as an empty element
2159 isempty.value = true;
2163 this.in.ungetChar(c);
2173 c = this.in.readChar();
// hand the embedded server markup back through the holder objects
2177 asp.setObject(parseAsp());
2182 php.setObject(parsePhp());
2186 this.in.ungetChar(c);
2187 Report.attrError(this, this.token, null, Report.UNEXPECTED_GT);
// a quote mark where a name was expected is reported as an error
2191 if (c == '"' || c == '\'')
2193 Report.attrError(this, this.token, null, Report.UNEXPECTED_QUOTEMARK);
// end of input: report it and push the marker back
2197 if (c == StreamIn.EndOfStream)
2199 Report.attrError(this, this.token, null, Report.UNEXPECTED_END_OF_FILE);
2200 this.in.ungetChar(c);
2206 if ((map & WHITE) == 0)
// remember where the name begins in the lexer buffer
2210 start = this.lexsize;
2214 /* but push back '=' for parseValue() */
2215 if (c == '=' || c == '>')
2217 this.in.ungetChar(c);
// a new tag or end of input is pushed back as well
2221 if (c == '<' || c == StreamIn.EndOfStream)
2223 this.in.ungetChar(c);
2229 if ((map & WHITE) != 0)
2232 /* what should be done about non-namechar characters? */
2233 /* currently these are incorporated into the attr name */
// fold ASCII upper case to lower case unless XmlTags is set
2235 if (!this.configuration.XmlTags && (map & UPPERCASE) != 0)
2236 c += (int)('a' - 'A');
2238 // ++len; Removed by BUGFIX for 126265
2241 c = this.in.readChar();
2244 // Following line added by GLP to fix BUG 126265. This is a temporary comment
2245 // and should be removed when Tidy is fixed.
// the name length is derived from how far the buffer grew
2246 int len = this.lexsize - start;
2247 attr = (len > 0 ? getString(this.lexbuf, start, len) : null);
// rewind the buffer: the name was only staged there
2248 this.lexsize = start;
2254 invoked when < is seen in place of attribute value
2255 but terminates on whitespace if not ASP, PHP or Tango
2256 this routine recognizes ' and " quoted strings
// Reads a server-side instruction that appears where an attribute value
// was expected. The int result is stored by parseValue() as the value
// delimiter.
2258 public int parseServerInstruction()
// delim starts out as the double quote character
2260 int c, map, delim = '"';
2261 boolean isrule = false;
// first character after the opening angle bracket selects the markup kind
2263 c = this.in.readChar();
2266 /* check for ASP, PHP or Tango */
2267 if (c == '%' || c == '?' || c == '@')
2272 c = this.in.readChar();
2274 if (c == StreamIn.EndOfStream)
2282 this.in.ungetChar(c);
2287 /* if not recognized as ASP, PHP or Tango */
2288 /* then also finish value on whitespace */
2293 if ((map & WHITE) != 0)
2303 c = this.in.readChar();
2315 c = this.in.readChar();
2325 /* values start with "=" or " = " etc. */
2326 /* doesn't consume the ">" at end of start tag */
// Reads the value that follows an attribute name.
// name     - attribute name, used for the URL and script special cases
// foldCase - when true, ASCII upper case characters are folded to lower case
// isempty  - set when a trailing slash shows the tag is an empty element
// pdelim   - receives the quote character that delimited the value
// The value text is staged in lexbuf and the buffer is rewound afterwards.
2328 public String parseValue(String name, boolean foldCase,
2329 MutableBoolean isempty, MutableInteger pdelim)
2334 boolean seen_gt = false;
2335 boolean munge = true;
2337 int lastc, delim, quotewarning;
// the reported delimiter defaults to a double quote
2341 pdelim.value = (int)'"';
2344 Henry Zrepa reports that some folk are using the
2345 embed element with script attributes where newlines
2346 are significant and must be preserved
2348 if (configuration.LiteralAttribs)
2351 /* skip white space before the '=' */
2355 c = this.in.readChar();
2357 if (c == StreamIn.EndOfStream)
2359 this.in.ungetChar(c);
2365 if ((map & WHITE) == 0)
2370 c should be '=' if there is a value
2371 other legal possibilities are white
2377 this.in.ungetChar(c);
2381 /* skip white space after '=' */
2385 c = this.in.readChar();
2387 if (c == StreamIn.EndOfStream)
2389 this.in.ungetChar(c);
2395 if ((map & WHITE) == 0)
2399 /* check for quote marks */
2401 if (c == '"' || c == '\'')
// server instruction in place of a value: collect it and return it directly
2405 start = this.lexsize;
2407 pdelim.value = parseServerInstruction();
2408 len = this.lexsize - start;
2409 this.lexsize = start;
2410 return (len > 0 ? getString(this.lexbuf, start, len) : null);
2413 this.in.ungetChar(c);
2416 and read the value string
2417 check for quote mark if needed
// remember where the value begins in the lexer buffer
2421 start = this.lexsize;
2426 lastc = c; /* track last character */
2427 c = this.in.readChar();
2429 if (c == StreamIn.EndOfStream)
2431 Report.attrError(this, this.token, null, Report.UNEXPECTED_END_OF_FILE);
2432 this.in.ungetChar(c);
// delim of 0 means the value is unquoted
2436 if (delim == (char)0)
2440 this.in.ungetChar(c);
2444 if (c == '"' || c == '\'')
2446 Report.attrError(this, this.token, null, Report.UNEXPECTED_QUOTEMARK);
2452 /* this.in.ungetChar(c); */
2453 Report.attrError(this, this.token, null, Report.UNEXPECTED_GT);
2458 For cases like <br clear=all/> need to avoid treating /> as
2459 part of the attribute value, however care is needed to avoid
2460 so treating <a href=http://www.acme.com/> in this way, which
2461 would map the <a> tag to <a href="http://www.acme.com"/>
2465 /* peek ahead in case of /> */
2466 c = this.in.readChar();
2469 !AttributeTable.getDefaultAttributeTable().isUrl(name))
2471 isempty.value = true;
2472 this.in.ungetChar(c);
2476 /* unget peeked char */
2477 this.in.ungetChar(c);
2481 else /* delim is '\'' or '"' */
2486 /* treat CRLF, CR and LF as single line break */
2490 c = this.in.readChar();
2492 this.in.ungetChar(c);
2497 if (c == '\n' || c == '<' || c == '>')
2507 parseEntity((short)0);
2512 kludge for JavaScript attribute values
2513 with line continuations in string literals
2517 c = this.in.readChar();
2521 this.in.ungetChar(c);
2528 if ((map & WHITE) != 0)
2530 if (delim == (char)0)
2541 else if (foldCase && (map & UPPERCASE) != 0)
2542 c += (int)('a' - 'A');
2547 if (quotewarning > 10 && seen_gt && munge)
2550 there is almost certainly a missing trailling quote mark
2551 as we have see too many newlines, < or > characters.
2553 an exception is made for Javascript attributes and the
2554 javascript URL scheme which may legitimately include < and >
// NOTE(review): this reads 11 bytes from start without a visible length check - confirm getString tolerates shorter values
2556 if (!AttributeTable.getDefaultAttributeTable().isScript(name) &&
2557 !(AttributeTable.getDefaultAttributeTable().isUrl(name) &&
2558 (getString(this.lexbuf, start, 11)).equals("javascript:")))
2559 Report.error(this, null, null, Report.SUSPECTED_MISSING_QUOTE);
// the value length is derived from how far the buffer grew; then rewind
2562 len = this.lexsize - start;
2563 this.lexsize = start;
// a quoted value is kept even when it is empty
2565 if (len > 0 || delim != 0)
2566 value = getString(this.lexbuf, start, len);
2570 /* note delimiter if given */
2572 pdelim.value = delim;
2574 pdelim.value = (int)'"';
2579 /* attr must be non-null */
2580 public static boolean isValidAttrName(String attr)
2586 /* first character should be a letter */
2590 if (!((map & LETTER) != 0))
2593 /* remaining characters should be namechars */
2594 for( i = 1; i < attr.length(); i++)
2599 if((map & NAMECHAR) != 0)
2608 /* swallows closing '>' */
// Parses the attribute list of a start tag by repeatedly calling
// parseAttribute() and parseValue(), creating one AttVal per attribute.
// isempty is passed through to both helpers.
2610 public AttVal parseAttrs(MutableBoolean isempty)
2613 String attribute, value;
// holders filled in by the helper parsers
2614 MutableInteger delim = new MutableInteger();
2615 MutableObject asp = new MutableObject();
2616 MutableObject php = new MutableObject();
// keep reading attributes until the input is exhausted
2620 for (; !endOfInput();)
2622 attribute = parseAttribute(isempty, asp, php);
// no name was read: the slot may still hold server-side markup
2624 if (attribute == null)
2626 /* check if attributes are created by ASP markup */
2627 if (asp.getObject() != null)
2629 av = new AttVal(list, null, (Node)asp.getObject(), null,
2635 /* check if attributes are created by PHP markup */
2636 if (php.getObject() != null)
2638 av = new AttVal(list, null, null, (Node)php.getObject(),
// attribute values are read without case folding
2647 value = parseValue(attribute, false, isempty, delim);
// a well-formed name is linked in front of list and looked up in the attribute table
2649 if (attribute != null && isValidAttrName(attribute))
2651 av = new AttVal( list, null, null, null,
2652 delim.value, attribute, value );
2654 AttributeTable.getDefaultAttributeTable().findAttribute(av);
// otherwise a detached AttVal is built and the attribute is reported
2659 av = new AttVal( null, null, null, null,
2660 0, attribute, value );
2661 Report.attrError(this, this.token, value, Report.BAD_ATTRIBUTE_VALUE);
2669 push a copy of an inline node onto stack
2670 but don't push if implicit or OBJECT or APPLET
2671 (implicit tags are ones generated from the istack)
2673 One issue arises with pushing inlines when
2674 the tag is already pushed. For instance:
2679 Shouldn't be mapped to
2681 <p><em>text</em></p>
2682 <p><em><em>more text</em></em>
// Pushes a copy of an inline element onto the inline stack (istack).
2684 public void pushInline( Node node )
// the checks below test for a missing tag, a non-inline tag and an object-like tag
2691 if (node.tag == null)
2694 if ((node.tag.model & Dict.CM_INLINE) == 0 )
2697 if ((node.tag.model & Dict.CM_OBJECT) != 0)
// every tag except FONT is also checked against the entries already on the stack
2700 if (node.tag != configuration.tt.tagFont && isPushed(node))
2703 // make sure there is enough space for the stack
// the stack entry copies the element name and clones the attributes
2706 is.element = node.element;
2707 if (node.attributes != null)
2708 is.attributes = cloneAttributes(node.attributes);
2709 this.istack.push( is );
2712 /* pop inline stack */
// Pops entries from the inline stack (istack) for the given node.
2713 public void popInline( Node node )
// the checks below test for a missing tag, a non-inline tag and an object-like tag
2720 if (node.tag == null)
2723 if ((node.tag.model & Dict.CM_INLINE) == 0)
2726 if ((node.tag.model & Dict.CM_OBJECT) != 0)
2729 // if node is </a> then pop until we find an <a>
2730 if (node.tag == configuration.tt.tagA) {
2732 while (this.istack.size() > 0) {
2733 is = (IStack)this.istack.pop();
2734 if (is.tag == configuration.tt.tagA) {
// checks whether the insert index now lies beyond the shortened stack
2739 if (this.insert >= this.istack.size())
// any other inline element pops a single entry
2745 if (this.istack.size() > 0) {
2746 is = (IStack)this.istack.pop();
2747 if (this.insert >= this.istack.size())
// Looks for an inline stack entry whose tag is the same object as node.tag.
2752 public boolean isPushed( Node node )
// scan from the top of the stack downwards
2757 for (i = this.istack.size() - 1; i >= 0; --i) {
2758 is = (IStack)this.istack.elementAt(i);
// tags are compared by identity
2759 if (is.tag == node.tag)
2767 This has the effect of inserting "missing" inline
2768 elements around the contents of blocklevel elements
2769 such as P, TD, TH, DIV, PRE etc. This procedure is
2770 called at the start of ParseBlock when the inline
2771 stack is not empty, as will be the case in:
2773 <i><h1>italic heading</h1></i>
2775 which is then treated as equivalent to
2777 <h1><i>italic heading</i></h1>
2779 This is implemented by setting the lexer into a mode
2780 where it gets tokens from the inline stack rather than
2781 from the input stream.
// Prepares the lexer to take tokens from the inline stack.
2783 public int inlineDup( Node node )
// number of stack entries above the current base
2787 n = this.istack.size() - this.istackbase;
// insertion starts at the base of the stack
2789 this.insert = this.istackbase;
// Builds an implicit start tag from the inline stack entry at index
// this.insert, copying its element name and attributes.
2796 public Node insertedToken()
2802 // this will only be null if inode != null
2803 if (this.insert == -1) {
2809 // is this is the "latest" node then update
2810 // the position, otherwise use current values
2812 if (this.inode == null) {
2813 this.lines = this.in.curline;
2814 this.columns = this.in.curcol;
2817 node = newNode(Node.StartTag,
2820 this.txtend); // GLP: Bugfix 126261. Remove when this change
2821 // is fixed in istack.c in the original Tidy
// flag the node as generated rather than read from the input
2822 node.implicit = true;
2823 is = (IStack)this.istack.elementAt( this.insert );
2824 node.element = is.element;
// attributes are cloned, not shared with the stack entry
2826 if (is.attributes != null)
2827 node.attributes = cloneAttributes(is.attributes);
2829 // advance lexer to next item on the stack
2832 // and recover state if we have reached the end
2833 if (++n < this.istack.size() ) {
2842 /* AQ: Try this for speed optimization */
// Case-insensitive equality test in strcmp style: the result is 0 when
// the strings are equal ignoring case and 1 otherwise. The result carries
// no ordering information.
2843 public static int wstrcasecmp(String s1, String s2)
2845 return (s1.equalsIgnoreCase(s2) ? 0 : 1);
2848 public static int wstrcaselexcmp(String s1, String s2)
2853 while ( i < s1.length() && i < s2.length() ) {
2855 if ( toLower(c) != toLower( s2.charAt(i) ) ) {
2860 if ( i == s1.length() && i == s2.length() ) {
2862 } else if ( i == s1.length() ) {
2864 } else if ( i == s2.length() ) {
2867 return ( s1.charAt(i) > s2.charAt(i) ? 1 : -1 );
2871 public static boolean wsubstr(String s1, String s2)
2874 int len1 = s1.length();
2875 int len2 = s2.length();
2877 for (i = 0; i <= len1 - len2; ++i)
2879 if (s2.equalsIgnoreCase(s1.substring(i)))
// Decides whether an element may be pruned. Each check below tests one
// property of the element that is relevant to that decision.
2886 public boolean canPrune(Node element)
// text nodes
2888 if (element.type == Node.TextNode)
// elements that have content
2891 if (element.content != null)
// anchors that carry attributes
2894 if (element.tag == configuration.tt.tagA && element.attributes != null)
// paragraphs, when DropEmptyParas is off
2897 if (element.tag == configuration.tt.tagP && !this.configuration.DropEmptyParas)
// elements with no dictionary entry
2900 if (element.tag == null)
// elements whose content model includes CM_ROW
2903 if ((element.tag.model & Dict.CM_ROW) != 0)
// applet and object elements
2906 if (element.tag == configuration.tt.tagApplet)
2909 if (element.tag == configuration.tt.tagObject)
// elements identified by an id or name attribute
2912 if (element.attributes != null &&
2913 (element.getAttrByName("id") != null ||
2914 element.getAttrByName("name") != null) )
2920 /* duplicate name attribute as an id */
// Reconciles the name and id attributes of a node: reports a mismatch
// when the two values differ, and copies name into a new id attribute
// when XmlOut is set.
2921 public void fixId(Node node)
2923 AttVal name = node.getAttrByName("name");
2924 AttVal id = node.getAttrByName("id");
// NOTE(review): id.value is dereferenced here - confirm it cannot be null
2930 if (!id.value.equals(name.value))
2931 Report.attrError(this, node, "name", Report.ID_NAME_MISMATCH);
2933 else if (this.configuration.XmlOut)
2934 node.addAttribute("id", name.value);
2939 defer duplicates when entering a table or other
2940 element where the inlines shouldn't be duplicated
// Defers duplication of inline elements, for use when entering a table or
// another element where the inlines should not be duplicated.
2942 public void deferDup()
2948 /* Private methods and fields */
2950 /* lexer char types */
// Bit flags stored in lexmap; one character may carry several of them.
// set for 0-9
2951 private static final short DIGIT = 1;
// set for a-z and A-Z
2952 private static final short LETTER = 2;
// set for letters, digits and the characters - . : _
2953 private static final short NAMECHAR = 4;
// set for space, tab, CR, LF and FF
2954 private static final short WHITE = 8;
// set for CR, LF and FF
2955 private static final short NEWLINE = 16;
// set for a-z
2956 private static final short LOWERCASE = 32;
// set for A-Z
2957 private static final short UPPERCASE = 64;
2959 /* lexer GetToken states */
// Values held in this.state by the token reader.
// text content
2961 private static final short LEX_CONTENT = 0;
2962 private static final short LEX_GT = 1;
2963 private static final short LEX_ENDTAG = 2;
// first letter of a tag name
2964 private static final short LEX_STARTTAG = 3;
// inside a comment, looking for the closing -->
2965 private static final short LEX_COMMENT = 4;
// inside a doctype, looking for the closing >
2966 private static final short LEX_DOCTYPE = 5;
// inside a processing instruction, looking for the closing >
2967 private static final short LEX_PROCINSTR = 6;
2968 private static final short LEX_ENDCOMMENT = 7;
// inside a CDATA section, looking for the closing ]]>
2969 private static final short LEX_CDATA = 8;
// inside a marked section, looking for the closing ]>
2970 private static final short LEX_SECTION = 9;
// inside ASP markup, looking for the closing %>
2971 private static final short LEX_ASP = 10;
// inside JSTE markup, looking for the closing #>
2972 private static final short LEX_JSTE = 11;
// inside PHP markup, looking for the closing ?>
2973 private static final short LEX_PHP = 12;
2975 /* used to classify chars for lexical purposes */
// One entry per 7-bit character code; each entry is the OR of the
// char-type flags that apply to that character.
2976 private static short[] lexmap = new short[128];
// Table-building helper: walks every character of str and uses its
// character code as index j; code is the flag value supplied by the caller.
2978 private static void mapStr(String str, short code)
2982 for ( int i = 0; i < str.length(); i++ ) {
// the character code is the table index
2983 j = (int)str.charAt(i);
// Fills the classification table, one character class per call.
// line terminators also count as white space
2989 mapStr("\r\n\f", (short)(NEWLINE|WHITE));
2990 mapStr(" \t", WHITE);
// punctuation allowed inside names
2991 mapStr("-.:_", NAMECHAR);
2992 mapStr("0123456789", (short)(DIGIT|NAMECHAR));
2993 mapStr("abcdefghijklmnopqrstuvwxyz", (short)(LOWERCASE|LETTER|NAMECHAR));
2994 mapStr("ABCDEFGHIJKLMNOPQRSTUVWXYZ", (short)(UPPERCASE|LETTER|NAMECHAR));
// Returns the char-type flags for c. Characters with a code of 128 or
// above are outside the table and get 0 (no flags).
2997 private static short MAP( char c )
2999 return ((int)c < 128 ? lexmap[(int)c] : 0);
// True when the WHITE flag is set in m.
3002 private static boolean isWhite(char c)
3006 return (m & WHITE) != 0;
// True when the DIGIT flag is set in m.
3009 private static boolean isDigit(char c)
3015 return (m & DIGIT) != 0;
// True when the LETTER flag is set in m.
3018 private static boolean isLetter(char c)
3024 return (m & LETTER) != 0;
// Converts a character flagged UPPERCASE to lower case.
3027 private static char toLower(char c)
3031 if ((m & UPPERCASE) != 0)
// shift by the distance between 'A' and 'a'
3032 c = (char)( (int)c + (int)'a' - (int)'A' );
// Converts a character flagged LOWERCASE to upper case.
3037 private static char toUpper(char c)
3041 if ((m & LOWERCASE) != 0)
// shift by the distance between 'a' and 'A'
3042 c = (char)( (int)c + (int)'A' - (int)'a' );
// Folds the case of a character. The two visible branches convert
// LOWERCASE characters to upper case and UPPERCASE characters to lower case.
// NOTE(review): how tocaps and xmlTags select between them is not visible here - confirm
3047 public static char foldCase(char c, boolean tocaps, boolean xmlTags)
// lower case to upper case
3057 if ((m & LOWERCASE) != 0)
3058 c = (char)( (int)c + (int)'A' - (int)'a' );
3060 else /* force to lower case */
3062 if ((m & UPPERCASE) != 0)
3063 c = (char)( (int)c + (int)'a' - (int)'A' );
// One entry of the version table used by the lexer.
// NOTE(review): the table entries appear to pair an HTML version name with an XHTML name and a Dict.VERS_* value - confirm field meanings
3071 private static class W3CVersionInfo
3078 public W3CVersionInfo( String name,
// constructor arguments are stored unchanged
3084 this.voyagerName = voyagerName;
3085 this.profile = profile;
3090 /* the 3 URIs for the XHTML 1.0 DTDs */
// XHTML 1.0 Transitional DTD
3091 private static final String voyager_loose = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd";
// XHTML 1.0 Strict DTD
3092 private static final String voyager_strict = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd";
// XHTML 1.0 Frameset DTD
3093 private static final String voyager_frameset = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd";
// XHTML namespace URI
3095 private static final String XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml";
3097 private static Lexer.W3CVersionInfo[] W3CVersion =
3099 new W3CVersionInfo("HTML 4.01",
3102 Dict.VERS_HTML40_STRICT),
3103 new W3CVersionInfo("HTML 4.01 Transitional",
3104 "XHTML 1.0 Transitional",
3106 Dict.VERS_HTML40_LOOSE),
3107 new W3CVersionInfo("HTML 4.01 Frameset",
3108 "XHTML 1.0 Frameset",
3111 new W3CVersionInfo("HTML 4.0",
3114 Dict.VERS_HTML40_STRICT),
3115 new W3CVersionInfo("HTML 4.0 Transitional",
3116 "XHTML 1.0 Transitional",
3118 Dict.VERS_HTML40_LOOSE),
3119 new W3CVersionInfo("HTML 4.0 Frameset",
3120 "XHTML 1.0 Frameset",
3123 new W3CVersionInfo("HTML 3.2",
3124 "XHTML 1.0 Transitional",
3127 new W3CVersionInfo("HTML 2.0",