2 * @(#)Tidy.java 1.11 2000/08/16
7 HTML parser and pretty printer
9 Copyright (c) 1998-2000 World Wide Web Consortium (Massachusetts
10 Institute of Technology, Institut National de Recherche en
11 Informatique et en Automatique, Keio University). All Rights
14 Contributing Author(s):
16 Dave Raggett <dsr@w3.org>
17 Andy Quick <ac.quick@sympatico.ca> (translation to Java)
19 The contributing author(s) would like to thank all those who
20 helped with testing, bug fixes, and patience. This wouldn't
21 have been possible without all of you.
25 This software and documentation is provided "as is," and
26 the copyright holders and contributing author(s) make no
27 representations or warranties, express or implied, including
28 but not limited to, warranties of merchantability or fitness
29 for any particular purpose or that the use of the software or
30 documentation will not infringe any third party patents,
31 copyrights, trademarks or other rights.
33 The copyright holders and contributing author(s) will not be
34 liable for any direct, indirect, special or consequential damages
35 arising out of any use of the software or documentation, even if
36 advised of the possibility of such damage.
38 Permission is hereby granted to use, copy, modify, and distribute
39 this source code, or portions hereof, documentation and executables,
40 for any purpose, without fee, subject to the following restrictions:
42 1. The origin of this source code must not be misrepresented.
43 2. Altered versions must be plainly marked as such and must
44 not be misrepresented as being the original source.
45 3. This Copyright notice may not be removed or altered from any
46 source or altered source distribution.
48 The copyright holders and contributing author(s) specifically
49 permit, without fee, and encourage the use of this source code
50 as a component for supporting the Hypertext Markup Language in
51 commercial products. If you use this source code in a product,
52 acknowledgment is not required but would be appreciated.
55 package net.sourceforge.phpdt.tidy.w3c;
57 import java.io.FileInputStream;
58 import java.io.FileNotFoundException;
59 import java.io.FileOutputStream;
60 import java.io.FileWriter;
61 import java.io.IOException;
62 import java.io.InputStream;
63 import java.io.OutputStream;
64 import java.io.PrintWriter;
65 import java.util.Properties;
67 import org.eclipse.core.resources.IFile;
71 * <p>HTML parser and pretty printer</p>
74 * (c) 1998-2000 (W3C) MIT, INRIA, Keio University
75 * See Tidy.java for the copyright notice.
76 * Derived from <a href="http://www.w3.org/People/Raggett/tidy">
77 * HTML Tidy Release 4 Aug 2000</a>
81 * Copyright (c) 1998-2000 World Wide Web Consortium (Massachusetts
82 * Institute of Technology, Institut National de Recherche en
83 * Informatique et en Automatique, Keio University). All Rights
88 * Contributing Author(s):<br>
89 * <a href="mailto:dsr@w3.org">Dave Raggett</a><br>
90 * <a href="mailto:ac.quick@sympatico.ca">Andy Quick</a> (translation to Java)
94 * The contributing author(s) would like to thank all those who
95 * helped with testing, bug fixes, and patience. This wouldn't
96 * have been possible without all of you.
100 * COPYRIGHT NOTICE:<br>
102 * This software and documentation is provided "as is," and
103 * the copyright holders and contributing author(s) make no
104 * representations or warranties, express or implied, including
105 * but not limited to, warranties of merchantability or fitness
106 * for any particular purpose or that the use of the software or
107 * documentation will not infringe any third party patents,
108 * copyrights, trademarks or other rights.
112 * The copyright holders and contributing author(s) will not be
113 * liable for any direct, indirect, special or consequential damages
114 * arising out of any use of the software or documentation, even if
115 * advised of the possibility of such damage.
119 * Permission is hereby granted to use, copy, modify, and distribute
120 * this source code, or portions hereof, documentation and executables,
121 * for any purpose, without fee, subject to the following restrictions:
126 * <li>The origin of this source code must not be misrepresented.</li>
127 * <li>Altered versions must be plainly marked as such and must
128 * not be misrepresented as being the original source.</li>
129 * <li>This Copyright notice may not be removed or altered from any
130 * source or altered source distribution.</li>
135 * The copyright holders and contributing author(s) specifically
136 * permit, without fee, and encourage the use of this source code
137 * as a component for supporting the Hypertext Markup Language in
138 * commercial products. If you use this source code in a product,
139 * acknowledgment is not required but would be appreciated.
142 * @author Dave Raggett <dsr@w3.org>
143 * @author Andy Quick <ac.quick@sympatico.ca> (translation to Java)
144 * @version 1.0, 1999/05/22
145 * @version 1.0.1, 1999/05/29
146 * @version 1.1, 1999/06/18 Java Bean
147 * @version 1.2, 1999/07/10 Tidy Release 7 Jul 1999
148 * @version 1.3, 1999/07/30 Tidy Release 26 Jul 1999
149 * @version 1.4, 1999/09/04 DOM support
150 * @version 1.5, 1999/10/23 Tidy Release 27 Sep 1999
151 * @version 1.6, 1999/11/01 Tidy Release 22 Oct 1999
152 * @version 1.7, 1999/12/06 Tidy Release 30 Nov 1999
153 * @version 1.8, 2000/01/22 Tidy Release 13 Jan 2000
154 * @version 1.9, 2000/06/03 Tidy Release 30 Apr 2000
155 * @version 1.10, 2000/07/22 Tidy Release 8 Jul 2000
156 * @version 1.11, 2000/08/16 Tidy Release 4 Aug 2000
160 public class Tidy implements java.io.Serializable {
// Serialization version identifier for this Serializable class.
162 static final long serialVersionUID = -2794371560623987718L;
// Declared false here; no assignment to it is visible in this view.
164 private boolean initialized = false;
165 private PrintWriter errout = null; /* error output stream */
// Writer over System.err, created in init().
166 private PrintWriter stderr = null;
// Option holder created in init(); the accessors of this class read and write its fields.
167 private Configuration configuration = null;
// Name handed to the report routines; parse() replaces it with the file name or "stdin".
168 private String inputStreamName = "InputStream";
// Counts copied from the lexer by parse().
169 private int parseErrors = 0;
170 private int parseWarnings = 0;
176 public Configuration getConfiguration() {
177 return configuration;
// Accessor for the stderr writer.
// NOTE(review): the method body is not visible in this view; presumably returns the stderr field — confirm.
180 public PrintWriter getStderr() {
185 * ParseErrors - the number of errors that occurred in the most
186 * recent parse operation
// NOTE(review): the return statement is not visible in this view; presumably returns parseErrors — confirm.
189 public int getParseErrors() {
194 * ParseWarnings - the number of warnings that occurred in the most
195 * recent parse operation
198 public int getParseWarnings() {
199 return parseWarnings;
203 * Errout - the error output stream
// NOTE(review): the return statement is not visible in this view; presumably returns errout — confirm.
206 public PrintWriter getErrout() {
210 public void setErrout(PrintWriter errout) {
211 this.errout = errout;
215 * Spaces - default indentation
216 * @see net.sourceforge.phpdt.tidy.w3c.Configuration#spaces
219 public void setSpaces(int spaces) {
220 configuration.spaces = spaces;
223 public int getSpaces() {
224 return configuration.spaces;
228 * Wraplen - default wrap margin
229 * @see net.sourceforge.phpdt.tidy.w3c.Configuration#wraplen
232 public void setWraplen(int wraplen) {
233 configuration.wraplen = wraplen;
236 public int getWraplen() {
237 return configuration.wraplen;
242 * @see net.sourceforge.phpdt.tidy.w3c.Configuration#CharEncoding
245 public void setCharEncoding(int charencoding) {
246 configuration.CharEncoding = charencoding;
249 public int getCharEncoding() {
250 return configuration.CharEncoding;
255 * @see net.sourceforge.phpdt.tidy.w3c.Configuration#tabsize
258 public void setTabsize(int tabsize) {
259 configuration.tabsize = tabsize;
262 public int getTabsize() {
263 return configuration.tabsize;
267 * Errfile - file name to write errors to
268 * @see net.sourceforge.phpdt.tidy.w3c.Configuration#errfile
271 public void setErrfile(String errfile) {
272 configuration.errfile = errfile;
275 public String getErrfile() {
276 return configuration.errfile;
280 * Writeback - if true then output tidied markup
281 * NOTE: this property is ignored when parsing from an InputStream.
282 * @see net.sourceforge.phpdt.tidy.w3c.Configuration#writeback
285 public void setWriteback(boolean writeback) {
286 configuration.writeback = writeback;
289 public boolean getWriteback() {
290 return configuration.writeback;
294 * OnlyErrors - if true normal output is suppressed
295 * @see net.sourceforge.phpdt.tidy.w3c.Configuration#OnlyErrors
298 public void setOnlyErrors(boolean OnlyErrors) {
299 configuration.OnlyErrors = OnlyErrors;
302 public boolean getOnlyErrors() {
303 return configuration.OnlyErrors;
307 * ShowWarnings - however errors are always shown
308 * @see net.sourceforge.phpdt.tidy.w3c.Configuration#ShowWarnings
311 public void setShowWarnings(boolean ShowWarnings) {
312 configuration.ShowWarnings = ShowWarnings;
315 public boolean getShowWarnings() {
316 return configuration.ShowWarnings;
320 * Quiet - no 'Parsing X', guessed DTD or summary
321 * @see net.sourceforge.phpdt.tidy.w3c.Configuration#Quiet
324 public void setQuiet(boolean Quiet) {
325 configuration.Quiet = Quiet;
328 public boolean getQuiet() {
329 return configuration.Quiet;
333 * IndentContent - indent content of appropriate tags
334 * @see net.sourceforge.phpdt.tidy.w3c.Configuration#IndentContent
337 public void setIndentContent(boolean IndentContent) {
338 configuration.IndentContent = IndentContent;
341 public boolean getIndentContent() {
342 return configuration.IndentContent;
346 * SmartIndent - does text/block level content effect indentation
347 * @see net.sourceforge.phpdt.tidy.w3c.Configuration#SmartIndent
350 public void setSmartIndent(boolean SmartIndent) {
351 configuration.SmartIndent = SmartIndent;
354 public boolean getSmartIndent() {
355 return configuration.SmartIndent;
359 * HideEndTags - suppress optional end tags
360 * @see net.sourceforge.phpdt.tidy.w3c.Configuration#HideEndTags
363 public void setHideEndTags(boolean HideEndTags) {
364 configuration.HideEndTags = HideEndTags;
367 public boolean getHideEndTags() {
368 return configuration.HideEndTags;
372 * XmlTags - treat input as XML
373 * @see net.sourceforge.phpdt.tidy.w3c.Configuration#XmlTags
376 public void setXmlTags(boolean XmlTags) {
377 configuration.XmlTags = XmlTags;
380 public boolean getXmlTags() {
381 return configuration.XmlTags;
385 * XmlOut - create output as XML
386 * @see net.sourceforge.phpdt.tidy.w3c.Configuration#XmlOut
389 public void setXmlOut(boolean XmlOut) {
390 configuration.XmlOut = XmlOut;
393 public boolean getXmlOut() {
394 return configuration.XmlOut;
398 * XHTML - output extensible HTML
399 * @see net.sourceforge.phpdt.tidy.w3c.Configuration#xHTML
402 public void setXHTML(boolean xHTML) {
403 configuration.xHTML = xHTML;
406 public boolean getXHTML() {
407 return configuration.xHTML;
411 * RawOut - avoid mapping values > 127 to entities
412 * @see net.sourceforge.phpdt.tidy.w3c.Configuration#RawOut
415 public void setRawOut(boolean RawOut) {
416 configuration.RawOut = RawOut;
419 public boolean getRawOut() {
420 return configuration.RawOut;
424 * UpperCaseTags - output tags in upper not lower case
425 * @see net.sourceforge.phpdt.tidy.w3c.Configuration#UpperCaseTags
428 public void setUpperCaseTags(boolean UpperCaseTags) {
429 configuration.UpperCaseTags = UpperCaseTags;
432 public boolean getUpperCaseTags() {
433 return configuration.UpperCaseTags;
437 * UpperCaseAttrs - output attributes in upper not lower case
438 * @see net.sourceforge.phpdt.tidy.w3c.Configuration#UpperCaseAttrs
441 public void setUpperCaseAttrs(boolean UpperCaseAttrs) {
442 configuration.UpperCaseAttrs = UpperCaseAttrs;
445 public boolean getUpperCaseAttrs() {
446 return configuration.UpperCaseAttrs;
450 * MakeClean - remove presentational clutter
451 * @see net.sourceforge.phpdt.tidy.w3c.Configuration#MakeClean
454 public void setMakeClean(boolean MakeClean) {
455 configuration.MakeClean = MakeClean;
458 public boolean getMakeClean() {
459 return configuration.MakeClean;
463 * BreakBeforeBR - o/p newline before <br> or not?
464 * @see net.sourceforge.phpdt.tidy.w3c.Configuration#BreakBeforeBR
467 public void setBreakBeforeBR(boolean BreakBeforeBR) {
468 configuration.BreakBeforeBR = BreakBeforeBR;
471 public boolean getBreakBeforeBR() {
472 return configuration.BreakBeforeBR;
476 * BurstSlides - create slides on each h2 element
477 * @see net.sourceforge.phpdt.tidy.w3c.Configuration#BurstSlides
480 public void setBurstSlides(boolean BurstSlides) {
481 configuration.BurstSlides = BurstSlides;
484 public boolean getBurstSlides() {
485 return configuration.BurstSlides;
489 * NumEntities - use numeric entities
490 * @see net.sourceforge.phpdt.tidy.w3c.Configuration#NumEntities
493 public void setNumEntities(boolean NumEntities) {
494 configuration.NumEntities = NumEntities;
497 public boolean getNumEntities() {
498 return configuration.NumEntities;
502 * QuoteMarks - output " marks as &quot;
503 * @see net.sourceforge.phpdt.tidy.w3c.Configuration#QuoteMarks
506 public void setQuoteMarks(boolean QuoteMarks) {
507 configuration.QuoteMarks = QuoteMarks;
510 public boolean getQuoteMarks() {
511 return configuration.QuoteMarks;
515 * QuoteNbsp - output non-breaking space as entity
516 * @see net.sourceforge.phpdt.tidy.w3c.Configuration#QuoteNbsp
519 public void setQuoteNbsp(boolean QuoteNbsp) {
520 configuration.QuoteNbsp = QuoteNbsp;
523 public boolean getQuoteNbsp() {
524 return configuration.QuoteNbsp;
528 * QuoteAmpersand - output naked ampersand as &
529 * @see net.sourceforge.phpdt.tidy.w3c.Configuration#QuoteAmpersand
532 public void setQuoteAmpersand(boolean QuoteAmpersand) {
533 configuration.QuoteAmpersand = QuoteAmpersand;
536 public boolean getQuoteAmpersand() {
537 return configuration.QuoteAmpersand;
541 * WrapAttVals - wrap within attribute values
542 * @see net.sourceforge.phpdt.tidy.w3c.Configuration#WrapAttVals
545 public void setWrapAttVals(boolean WrapAttVals) {
546 configuration.WrapAttVals = WrapAttVals;
549 public boolean getWrapAttVals() {
550 return configuration.WrapAttVals;
554 * WrapScriptlets - wrap within JavaScript string literals
555 * @see net.sourceforge.phpdt.tidy.w3c.Configuration#WrapScriptlets
558 public void setWrapScriptlets(boolean WrapScriptlets) {
559 configuration.WrapScriptlets = WrapScriptlets;
562 public boolean getWrapScriptlets() {
563 return configuration.WrapScriptlets;
567 * WrapSection - wrap within <![ ... ]> section tags
568 * @see net.sourceforge.phpdt.tidy.w3c.Configuration#WrapSection
571 public void setWrapSection(boolean WrapSection) {
572 configuration.WrapSection = WrapSection;
575 public boolean getWrapSection() {
576 return configuration.WrapSection;
580 * AltText - default text for alt attribute
581 * @see net.sourceforge.phpdt.tidy.w3c.Configuration#altText
584 public void setAltText(String altText) {
585 configuration.altText = altText;
588 public String getAltText() {
589 return configuration.altText;
593 * Slidestyle - style sheet for slides
594 * @see net.sourceforge.phpdt.tidy.w3c.Configuration#slidestyle
597 public void setSlidestyle(String slidestyle) {
598 configuration.slidestyle = slidestyle;
601 public String getSlidestyle() {
602 return configuration.slidestyle;
606 * XmlPi - add <?xml?> for XML docs
607 * @see net.sourceforge.phpdt.tidy.w3c.Configuration#XmlPi
610 public void setXmlPi(boolean XmlPi) {
611 configuration.XmlPi = XmlPi;
614 public boolean getXmlPi() {
615 return configuration.XmlPi;
619 * DropFontTags - discard presentation tags
620 * @see net.sourceforge.phpdt.tidy.w3c.Configuration#DropFontTags
623 public void setDropFontTags(boolean DropFontTags) {
624 configuration.DropFontTags = DropFontTags;
627 public boolean getDropFontTags() {
628 return configuration.DropFontTags;
631 //gschadow patch start
633 * Remove all those stupid pseudo-XML tags. If only XSLT had been
634 * around 5 years earlier, we wouldn't need to bother about all
635 * this ASP, JSP, PHP and other sh..!
637 * @see net.sourceforge.phpdt.tidy.w3c.Configuration#DropFontTags
639 public void setDropPseudoXMLCrap(boolean DropPseudoXMLCrap) {
640 configuration.DropPseudoXMLCrap = DropPseudoXMLCrap;
644 public boolean getDropPseudoXMLCrap() {
645 return configuration.DropPseudoXMLCrap;
649 * DropEmptyParas - discard empty p elements
650 * @see net.sourceforge.phpdt.tidy.w3c.Configuration#DropEmptyParas
653 public void setDropEmptyParas(boolean DropEmptyParas) {
654 configuration.DropEmptyParas = DropEmptyParas;
657 public boolean getDropEmptyParas() {
658 return configuration.DropEmptyParas;
662 * FixComments - fix comments with adjacent hyphens
663 * @see net.sourceforge.phpdt.tidy.w3c.Configuration#FixComments
666 public void setFixComments(boolean FixComments) {
667 configuration.FixComments = FixComments;
670 public boolean getFixComments() {
671 return configuration.FixComments;
675 * WrapAsp - wrap within ASP pseudo elements
676 * @see net.sourceforge.phpdt.tidy.w3c.Configuration#WrapAsp
679 public void setWrapAsp(boolean WrapAsp) {
680 configuration.WrapAsp = WrapAsp;
683 public boolean getWrapAsp() {
684 return configuration.WrapAsp;
688 * WrapJste - wrap within JSTE pseudo elements
689 * @see net.sourceforge.phpdt.tidy.w3c.Configuration#WrapJste
692 public void setWrapJste(boolean WrapJste) {
693 configuration.WrapJste = WrapJste;
696 public boolean getWrapJste() {
697 return configuration.WrapJste;
701 * WrapPhp - wrap within PHP pseudo elements
702 * @see net.sourceforge.phpdt.tidy.w3c.Configuration#WrapPhp
705 public void setWrapPhp(boolean WrapPhp) {
706 configuration.WrapPhp = WrapPhp;
709 public boolean getWrapPhp() {
710 return configuration.WrapPhp;
714 * FixBackslash - fix URLs by replacing \ with /
715 * @see net.sourceforge.phpdt.tidy.w3c.Configuration#FixBackslash
718 public void setFixBackslash(boolean FixBackslash) {
719 configuration.FixBackslash = FixBackslash;
722 public boolean getFixBackslash() {
723 return configuration.FixBackslash;
727 * IndentAttributes - newline+indent before each attribute
728 * @see net.sourceforge.phpdt.tidy.w3c.Configuration#IndentAttributes
731 public void setIndentAttributes(boolean IndentAttributes) {
732 configuration.IndentAttributes = IndentAttributes;
735 public boolean getIndentAttributes() {
736 return configuration.IndentAttributes;
740 * DocType - user specified doctype
741 * omit | auto | strict | loose | <i>fpi</i>
742 * where the <i>fpi</i> is a string similar to
743 * "-//ACME//DTD HTML 3.14159//EN"
744 * Note: for <i>fpi</i> include the double-quotes in the string.
745 * @see net.sourceforge.phpdt.tidy.w3c.Configuration#docTypeStr
746 * @see net.sourceforge.phpdt.tidy.w3c.Configuration#docTypeMode
// Stores the result of configuration.parseDocType(doctype, "doctype") in configuration.docTypeStr.
// NOTE(review): a line between the signature and the assignment is not visible in this view
// (possibly a guard on doctype) — confirm against the full file.
749 public void setDocType(String doctype) {
751 configuration.docTypeStr = configuration.parseDocType(doctype, "doctype");
// Maps configuration.docTypeMode to a string result.
// Only the DOCTYPE_USER branch is visible here; it yields configuration.docTypeStr.
// NOTE(review): the results for the other modes and the final return are not visible in this view.
754 public String getDocType() {
755 String result = null;
756 switch (configuration.docTypeMode) {
757 case Configuration.DOCTYPE_OMIT :
760 case Configuration.DOCTYPE_AUTO :
763 case Configuration.DOCTYPE_STRICT :
766 case Configuration.DOCTYPE_LOOSE :
769 case Configuration.DOCTYPE_USER :
770 result = configuration.docTypeStr;
777 * LogicalEmphasis - replace i by em and b by strong
778 * @see net.sourceforge.phpdt.tidy.w3c.Configuration#LogicalEmphasis
781 public void setLogicalEmphasis(boolean LogicalEmphasis) {
782 configuration.LogicalEmphasis = LogicalEmphasis;
785 public boolean getLogicalEmphasis() {
786 return configuration.LogicalEmphasis;
790 * XmlPIs - if set to true PIs must end with ?>
791 * @see net.sourceforge.phpdt.tidy.w3c.Configuration#XmlPIs
794 public void setXmlPIs(boolean XmlPIs) {
795 configuration.XmlPIs = XmlPIs;
798 public boolean getXmlPIs() {
799 return configuration.XmlPIs;
803 * EncloseText - if true text at body is wrapped in <p>'s
804 * @see net.sourceforge.phpdt.tidy.w3c.Configuration#EncloseBodyText
807 public void setEncloseText(boolean EncloseText) {
808 configuration.EncloseBodyText = EncloseText;
811 public boolean getEncloseText() {
812 return configuration.EncloseBodyText;
816 * EncloseBlockText - if true text in blocks is wrapped in <p>'s
817 * @see net.sourceforge.phpdt.tidy.w3c.Configuration#EncloseBlockText
820 public void setEncloseBlockText(boolean EncloseBlockText) {
821 configuration.EncloseBlockText = EncloseBlockText;
824 public boolean getEncloseBlockText() {
825 return configuration.EncloseBlockText;
829 * KeepFileTimes - if true last modified time is preserved<br>
830 * <b>this is NOT supported at this time.</b>
831 * @see net.sourceforge.phpdt.tidy.w3c.Configuration#KeepFileTimes
834 public void setKeepFileTimes(boolean KeepFileTimes) {
835 configuration.KeepFileTimes = KeepFileTimes;
838 public boolean getKeepFileTimes() {
839 return configuration.KeepFileTimes;
843 * Word2000 - draconian cleaning for Word2000
844 * @see net.sourceforge.phpdt.tidy.w3c.Configuration#Word2000
847 public void setWord2000(boolean Word2000) {
848 configuration.Word2000 = Word2000;
851 public boolean getWord2000() {
852 return configuration.Word2000;
856 * TidyMark - add meta element indicating tidied doc
857 * @see net.sourceforge.phpdt.tidy.w3c.Configuration#TidyMark
860 public void setTidyMark(boolean TidyMark) {
861 configuration.TidyMark = TidyMark;
864 public boolean getTidyMark() {
865 return configuration.TidyMark;
869 * XmlSpace - if set to yes adds xml:space attr as needed
870 * @see net.sourceforge.phpdt.tidy.w3c.Configuration#XmlSpace
873 public void setXmlSpace(boolean XmlSpace) {
874 configuration.XmlSpace = XmlSpace;
877 public boolean getXmlSpace() {
878 return configuration.XmlSpace;
882 * Emacs - if true format error output for GNU Emacs
883 * @see net.sourceforge.phpdt.tidy.w3c.Configuration#Emacs
886 public void setEmacs(boolean Emacs) {
887 configuration.Emacs = Emacs;
890 public boolean getEmacs() {
891 return configuration.Emacs;
895 * LiteralAttribs - if true attributes may use newlines
896 * @see net.sourceforge.phpdt.tidy.w3c.Configuration#LiteralAttribs
899 public void setLiteralAttribs(boolean LiteralAttribs) {
900 configuration.LiteralAttribs = LiteralAttribs;
903 public boolean getLiteralAttribs() {
904 return configuration.LiteralAttribs;
908 * InputStreamName - the name of the input stream (printed in the
909 * header information).
// Stores name in the inputStreamName field.
// NOTE(review): a line between the signature and the assignment is not visible in this view
// (possibly a guard on name) — confirm against the full file.
911 public void setInputStreamName(String name) {
913 inputStreamName = name;
916 public String getInputStreamName() {
917 return inputStreamName;
921 * Sets the configuration from a configuration file.
924 public void setConfigurationFromFile(String filename) {
925 configuration.parseFile(filename);
929 * Sets the configuration from a properties object.
932 public void setConfigurationFromProps(Properties props) {
933 configuration.addProps(props);
937 * first time initialization which should
938 * precede reading the command line
// Creates the Configuration, wires a new TagTable to it, clears errfile and creates the
// stderr writer. NOTE(review): several lines of this method are not visible in this view,
// including whatever follows the null check and any use of the attribute/entity tables.
941 private void init() {
942 configuration = new Configuration();
// NOTE(review): the operand was just produced by `new`, which never yields null in Java,
// so this test cannot be true; the guarded statement is not visible here.
943 if (configuration == null)
946 AttributeTable at = AttributeTable.getDefaultAttributeTable();
949 TagTable tt = new TagTable();
// The tag table and the configuration reference each other.
952 tt.setConfiguration(configuration);
953 configuration.tt = tt;
954 EntityTable et = EntityTable.getDefaultEntityTable();
958 /* Unnecessary - same initial values in Configuration
959 Configuration.XmlTags = false;
960 Configuration.XmlOut = false;
961 Configuration.HideEndTags = false;
962 Configuration.UpperCaseTags = false;
963 Configuration.MakeClean = false;
964 Configuration.writeback = false;
965 Configuration.OnlyErrors = false;
968 configuration.errfile = null;
// Auto-flushing writer over System.err.
969 stderr = new PrintWriter(System.err, true);
975 * Parses InputStream in and returns the root Node.
976 * If out is non-null, pretty prints to OutputStream out.
// Public entry point: delegates to the four-argument parse with a null file name.
// FileNotFoundException and IOException from the delegate are caught here.
// NOTE(review): no statements are visible in either catch clause, so failures may be silently
// swallowed; the try line and the return are also not visible in this view — confirm.
979 public Node parse(IFile iFile, InputStream in, OutputStream out) {
980 Node document = null;
983 document = parse(iFile, in, null, out);
985 } catch (FileNotFoundException fnfe) {
986 } catch (IOException e) {
993 * Internal routine that actually does the parsing. The caller
994 * can pass either an InputStream or file name. If both are passed,
995 * the file name is preferred.
// Core parse routine used by the public parse method and by main. It parses the input
// (as XML or HTML depending on configuration.XmlTags), runs the configured clean-up passes,
// reports diagnostics to errout, records the warning/error counts and, when allowed,
// pretty-prints the result.
// NOTE(review): many lines of this method (declarations, if/else/try lines, closing braces)
// are not visible in this view; the comments below describe only the lines that are shown.
998 private Node parse(IFile iFile, InputStream in, String file, OutputStream out)
999 throws FileNotFoundException, IOException {
1001 Node document = null;
1003 Out o = new OutImpl(); /* normal output stream */
1015 /* ensure config is self-consistent */
1016 configuration.adjust();
// File-name branch: the stream is opened from the file and its name is used in reports
// (the guarding condition is not visible in this view).
1019 in = new FileInputStream(file);
1020 inputStreamName = file;
1021 } else if (in == null) {
1023 inputStreamName = "stdin";
// Input wrapper built with the configured character encoding and tab size.
1030 new StreamInImpl(in, configuration.CharEncoding, configuration.tabsize),
1032 lexer.errout = errout;
1035 store pointer to lexer in input stream
1036 to allow character encoding errors to be
1039 lexer.in.lexer = lexer;
1041 /* Tidy doesn't alter the doctype for generic XML docs */
1042 if (configuration.XmlTags)
1043 document = ParserImpl.parseXMLDocument(lexer);
1046 if (!configuration.Quiet)
1047 Report.helloMessage(errout, Report.RELEASE_DATE, inputStreamName);
1049 document = ParserImpl.parseDocument(lexer);
// Report a tree that fails the integrity check after parsing.
1051 if (!document.checkNodeIntegrity()) {
1052 Report.badTree(errout);
1056 Clean cleaner = new Clean(configuration.tt);
1058 /* simplifies <b><b> ... </b> ...</b> etc. */
1059 cleaner.nestedEmphasis(document);
1061 /* cleans up <dir>indented text</dir> etc. */
1062 cleaner.list2BQ(document);
1063 cleaner.bQ2Div(document);
1065 /* replaces i by em and b by strong */
1066 if (configuration.LogicalEmphasis)
1067 cleaner.emFromI(document);
1069 if (configuration.Word2000 && cleaner.isWord2000(document, configuration.tt)) {
1070 /* prune Word2000's <![if ...]> ... <![endif]> */
1071 cleaner.dropSections(lexer, document);
1073 /* drop style & class attributes and empty p, span elements */
1074 cleaner.cleanWord2000(lexer, document);
1077 //gschadow patch start
1078 if (configuration.DropPseudoXMLCrap) {
1079 cleaner.dropPseudoXMLCrap(lexer, document);
1081 //gschadow patch end
1083 /* replaces presentational markup by style rules */
1084 if (configuration.MakeClean || configuration.DropFontTags)
1085 cleaner.cleanTree(lexer, document);
// Second integrity check, after the clean-up passes.
1087 if (!document.checkNodeIntegrity()) {
1088 Report.badTree(errout);
1091 doctype = document.findDocType();
1092 if (document.content != null) {
1093 if (configuration.xHTML)
1094 lexer.setXHTMLDocType(document);
1096 lexer.fixDocType(document);
1098 if (configuration.TidyMark)
1099 lexer.addGenerator(document);
1102 /* ensure presence of initial <?XML version="1.0"?> */
1103 if (configuration.XmlOut && configuration.XmlPi)
1104 lexer.fixXMLPI(document);
1106 if (!configuration.Quiet && document.content != null) {
1107 Report.reportVersion(errout, lexer, inputStreamName, doctype);
1108 Report.reportNumWarnings(errout, lexer);
// Record the lexer's warning and error counts on this instance.
1112 parseWarnings = lexer.warnings;
1113 parseErrors = lexer.errors;
1115 // Try to close the InputStream but only if we created it.
1117 if ((file != null) && (in != System.in)) {
1120 } catch (IOException e) {
1124 if (lexer.errors > 0)
1125 Report.needsAuthorIntervention(errout);
1127 o.state = StreamIn.FSM_ASCII;
1128 o.encoding = configuration.CharEncoding;
// Output is produced only when errors-only mode is off and the lexer reported no errors.
1130 if (!configuration.OnlyErrors && lexer.errors == 0) {
1131 if (configuration.BurstSlides) {
1136 remove doctype to avoid potential clash with
1137 markup introduced when bursting into slides
1139 /* discard the document type */
1140 doctype = document.findDocType();
1142 if (doctype != null)
1143 Node.discardElement(doctype);
1145 /* slides use transitional features */
1146 lexer.versions |= Dict.VERS_HTML40_LOOSE;
1148 /* and patch up doctype to match */
1149 if (configuration.xHTML)
1150 lexer.setXHTMLDocType(document);
1152 lexer.fixDocType(document);
1154 /* find the body element which may be implicit */
1155 body = document.findBody(configuration.tt);
1158 pprint = new PPrint(configuration);
1159 Report.reportNumberOfSlides(errout, pprint.countSlides(body));
1160 pprint.createSlides(lexer, document);
1162 Report.missingBody(errout);
// Write-back mode: the tidied markup is written to the same file name the input was read from.
1163 } else if (configuration.writeback && (file != null)) {
1165 pprint = new PPrint(configuration);
1166 o.out = new FileOutputStream(file);
1168 if (configuration.XmlTags)
1169 pprint.printXMLTree(o, (short) 0, 0, lexer, document);
1171 pprint.printTree(o, (short) 0, 0, lexer, document);
1173 pprint.flushLine(o, 0);
1175 } catch (IOException e) {
1176 errout.println(file + e.toString());
// Otherwise print to the caller-supplied stream when one was given.
1178 } else if (out != null) {
1179 pprint = new PPrint(configuration);
1182 if (configuration.XmlTags)
1183 pprint.printXMLTree(o, (short) 0, 0, lexer, document);
1185 pprint.printTree(o, (short) 0, 0, lexer, document);
1187 pprint.flushLine(o, 0);
1192 Report.errorSummary(lexer);
1198 * Parses InputStream in and returns a DOM Document node.
1199 * If out is non-null, pretty prints to OutputStream out.
// Parses via parse(file, in, out) and, when a root node comes back, returns its adapter
// cast to org.w3c.dom.Document.
// NOTE(review): the branch taken when document is null is not visible in this view.
1202 public org.w3c.dom.Document parseDOM(IFile file, InputStream in, OutputStream out) {
1203 Node document = parse(file, in, out);
1204 if (document != null)
1205 return (org.w3c.dom.Document) document.getAdapter();
1211 * Creates an empty DOM Document.
// Builds a root node, inserts an "html" start-tag node (created with a fresh TagTable) at its
// start, and returns the root's adapter cast to org.w3c.dom.Document.
// NOTE(review): both tested references come straight from `new`, which never yields null in Java,
// so the condition always holds; the alternative branch is not visible in this view.
1214 public static org.w3c.dom.Document createEmptyDocument() {
1215 Node document = new Node(Node.RootNode, new byte[0], 0, 0);
1216 Node node = new Node(Node.StartTag, new byte[0], 0, 0, "html", new TagTable());
1217 if (document != null && node != null) {
1218 Node.insertNodeAtStart(document, node);
1219 return (org.w3c.dom.Document) document.getAdapter();
1226 * Pretty-prints a DOM Document.
// Pretty-prints a DOM document that is backed by this package's DOMDocumentImpl.
// NOTE(review): local declarations, the handling of a non-DOMDocumentImpl argument and the
// assignment of the output stream are not visible in this view.
1229 public void pprint(org.w3c.dom.Document doc, OutputStream out) {
1230 Out o = new OutImpl();
// Only DOMDocumentImpl instances expose the underlying Node tree (adaptee).
1234 if (!(doc instanceof DOMDocumentImpl)) {
1237 document = ((DOMDocumentImpl) doc).adaptee;
1239 o.state = StreamIn.FSM_ASCII;
1240 o.encoding = configuration.CharEncoding;
1243 pprint = new PPrint(configuration);
// No lexer is available here, so null is passed to the print routines.
1246 if (configuration.XmlTags)
1247 pprint.printXMLTree(o, (short) 0, 0, null, document);
1249 pprint.printTree(o, (short) 0, 0, null, document);
1251 pprint.flushLine(o, 0);
1256 * Command line interface to parser and pretty printer.
1259 public static void main(String[] argv) {
1260 int totalerrors = 0;
1261 int totalwarnings = 0;
1264 String prog = "Tidy";
1269 Out out = new OutImpl(); /* normal output stream */
1271 int argc = argv.length + 1;
1274 Configuration configuration;
1276 String current_errorfile = "stderr";
1279 configuration = tidy.getConfiguration();
1281 /* read command line */
1284 if (argc > 1 && argv[argIndex].startsWith("-")) {
1285 /* support -foo and --foo */
1286 arg = argv[argIndex].substring(1);
1288 if (arg.length() > 0 && arg.charAt(0) == '-')
1289 arg = arg.substring(1);
1291 if (arg.equals("xml"))
1292 configuration.XmlTags = true;
1293 else if (arg.equals("asxml") || arg.equals("asxhtml"))
1294 configuration.xHTML = true;
1295 else if (arg.equals("indent")) {
1296 configuration.IndentContent = true;
1297 configuration.SmartIndent = true;
1298 } else if (arg.equals("omit"))
1299 configuration.HideEndTags = true;
1300 else if (arg.equals("upper"))
1301 configuration.UpperCaseTags = true;
1302 else if (arg.equals("clean"))
1303 configuration.MakeClean = true;
1304 else if (arg.equals("raw"))
1305 configuration.CharEncoding = Configuration.RAW;
1306 else if (arg.equals("ascii"))
1307 configuration.CharEncoding = Configuration.ASCII;
1308 else if (arg.equals("latin1"))
1309 configuration.CharEncoding = Configuration.LATIN1;
1310 else if (arg.equals("utf8"))
1311 configuration.CharEncoding = Configuration.UTF8;
1312 else if (arg.equals("iso2022"))
1313 configuration.CharEncoding = Configuration.ISO2022;
1314 else if (arg.equals("mac"))
1315 configuration.CharEncoding = Configuration.MACROMAN;
1316 else if (arg.equals("numeric"))
1317 configuration.NumEntities = true;
1318 else if (arg.equals("modify"))
1319 configuration.writeback = true;
1320 else if (arg.equals("change")) /* obsolete */
1321 configuration.writeback = true;
1322 else if (arg.equals("update")) /* obsolete */
1323 configuration.writeback = true;
1324 else if (arg.equals("errors"))
1325 configuration.OnlyErrors = true;
1326 else if (arg.equals("quiet"))
1327 configuration.Quiet = true;
1328 else if (arg.equals("slides"))
1329 configuration.BurstSlides = true;
1330 else if (arg.equals("help") || argv[argIndex].charAt(1) == '?' || argv[argIndex].charAt(1) == 'h') {
1331 Report.helpText(new PrintWriter(System.out, true), prog);
1333 } else if (arg.equals("config")) {
1335 configuration.parseFile(argv[argIndex + 1]);
1340 argv[argIndex].equals("-file") || argv[argIndex].equals("--file") || argv[argIndex].equals("-f")) {
1342 configuration.errfile = argv[argIndex + 1];
1347 argv[argIndex].equals("-wrap") || argv[argIndex].equals("--wrap") || argv[argIndex].equals("-w")) {
1349 configuration.wraplen = Integer.parseInt(argv[argIndex + 1]);
1354 argv[argIndex].equals("-version")
1355 || argv[argIndex].equals("--version")
1356 || argv[argIndex].equals("-v")) {
1357 Report.showVersion(tidy.getErrout());
1362 for (int i = 1; i < s.length(); i++) {
1363 if (s.charAt(i) == 'i') {
1364 configuration.IndentContent = true;
1365 configuration.SmartIndent = true;
1366 } else if (s.charAt(i) == 'o')
1367 configuration.HideEndTags = true;
1368 else if (s.charAt(i) == 'u')
1369 configuration.UpperCaseTags = true;
1370 else if (s.charAt(i) == 'c')
1371 configuration.MakeClean = true;
1372 else if (s.charAt(i) == 'n')
1373 configuration.NumEntities = true;
1374 else if (s.charAt(i) == 'm')
1375 configuration.writeback = true;
1376 else if (s.charAt(i) == 'e')
1377 configuration.OnlyErrors = true;
1378 else if (s.charAt(i) == 'q')
1379 configuration.Quiet = true;
1381 Report.unknownOption(tidy.getErrout(), s.charAt(i));
1390 /* ensure config is self-consistent */
1391 configuration.adjust();
1393 /* user specified error file */
1394 if (configuration.errfile != null) {
1395 /* is it same as the currently opened file? */
1396 if (!configuration.errfile.equals(current_errorfile)) {
1397 /* no so close previous error file */
1399 if (tidy.getErrout() != tidy.getStderr())
1400 tidy.getErrout().close();
1402 /* and try to open the new error file */
1404 tidy.setErrout(new PrintWriter(new FileWriter(configuration.errfile), true));
1405 current_errorfile = configuration.errfile;
1406 } catch (IOException e) {
1407 /* can't be opened so fall back to stderr */
1408 current_errorfile = "stderr";
1409 tidy.setErrout(tidy.getStderr());
1415 file = argv[argIndex];
1421 document = tidy.parse(null, null, file, System.out);
1422 totalwarnings += tidy.parseWarnings;
1423 totalerrors += tidy.parseErrors;
1424 } catch (FileNotFoundException fnfe) {
1425 Report.unknownFile(tidy.getErrout(), prog, file);
1426 } catch (IOException ioe) {
1427 Report.unknownFile(tidy.getErrout(), prog, file);
1437 if (totalerrors + totalwarnings > 0)
1438 Report.generalInfo(tidy.getErrout());
1440 if (tidy.getErrout() != tidy.getStderr())
1441 tidy.getErrout().close();
1443 /* return status can be used by scripts */
1445 if (totalerrors > 0)
1448 if (totalwarnings > 0)
1451 /* 0 signifies all is ok */