2 * @(#)Clean.java 1.11 2000/08/16
6 package net.sourceforge.phpdt.tidy.w3c;
10 * Clean up misuse of presentation markup
12 * (c) 1998-2000 (W3C) MIT, INRIA, Keio University
13 * See Tidy.java for the copyright notice.
14 * Derived from <a href="http://www.w3.org/People/Raggett/tidy">
15 * HTML Tidy Release 4 Aug 2000</a>
17 * @author Dave Raggett <dsr@w3.org>
18 * @author Andy Quick <ac.quick@sympatico.ca> (translation to Java)
19 * @version 1.0, 1999/05/22
20 * @version 1.0.1, 1999/05/29
21 * @version 1.1, 1999/06/18 Java Bean
22 * @version 1.2, 1999/07/10 Tidy Release 7 Jul 1999
23 * @version 1.3, 1999/07/30 Tidy Release 26 Jul 1999
24 * @version 1.4, 1999/09/04 DOM support
25 * @version 1.5, 1999/10/23 Tidy Release 27 Sep 1999
26 * @version 1.6, 1999/11/01 Tidy Release 22 Oct 1999
27 * @version 1.7, 1999/12/06 Tidy Release 30 Nov 1999
28 * @version 1.8, 2000/01/22 Tidy Release 13 Jan 2000
29 * @version 1.9, 2000/06/03 Tidy Release 30 Apr 2000
30 * @version 1.10, 2000/07/22 Tidy Release 8 Jul 2000
31 * @version 1.11, 2000/08/16 Tidy Release 4 Aug 2000
35 Filters from other formats such as Microsoft Word
36 often make excessive use of presentation markup such
37 as font tags, B, I, and the align attribute. By applying
38 a set of production rules, it is straightforward to
39 transform this to use CSS.
41 Some rules replace some of the children of an element by
42 style properties on the element, e.g.
44 <p><b>...</b></p> -> <p style="font-weight: bold">...</p>
46 Such rules are applied to the element's content and then
47 to the element itself until no more rules apply.
48 Having applied all the rules to an element, it will have
49 a style attribute with one or more properties.
51 Other rules strip the element they apply to, replacing
52 it by style properties on the contents, e.g.
54 <dir><li><p>...</li></dir> -> <p style="margin-left: 1em">...
56 These rules are applied to an element before processing
57 its content and replace the current element by the first
58 element in the exposed content.
60 After applying both sets of rules, you can replace the
61 style attribute by a class value and style rule in the
62 document head. To support this, an association of styles
63 and class names is built.
65 A naive approach is to rely on string matching to test
66 when two property lists are the same. A better approach
67 would be to first sort the properties before matching.
72 private int classNum = 1;
76 public Clean(TagTable tt) {
80 private StyleProp insertProperty(StyleProp props, String name, String value) {
81 StyleProp first, prev, prop;
87 while (props != null) {
88 cmp = props.name.compareTo(name);
91 /* this property is already defined, ignore new value */
95 if (cmp > 0) // props.name > name
97 /* insert before this */
99 prop = new StyleProp(name, value, props);
113 prop = new StyleProp(name, value);
124 Create sorted linked list of properties from style string
125 It temporarily places nulls in place of ':' and ';' to
126 delimit the strings for the property name and value.
127 Some systems don't allow you to null literal strings,
128 so to avoid this, a copy is made first.
130 private StyleProp createProps(StyleProp prop, String style) {
138 while (name_start < style.length()) {
139 while (name_start < style.length() && style.charAt(name_start) == ' ')
142 name_end = name_start;
144 while (name_end < style.length()) {
145 if (style.charAt(name_end) == ':') {
146 value_start = name_end + 1;
153 if (name_end >= style.length() || style.charAt(name_end) != ':')
156 while (value_start < style.length() && style.charAt(value_start) == ' ')
159 value_end = value_start;
162 while (value_end < style.length()) {
163 if (style.charAt(value_end) == ';') {
171 prop = insertProperty(prop, style.substring(name_start, name_end), style.substring(value_start, value_end));
174 name_start = value_end + 1;
184 private String createPropString(StyleProp props) {
191 for (len = 0, prop = props; prop != null; prop = prop.next) {
192 len += prop.name.length() + 2;
193 len += prop.value.length() + 2;
196 for (prop = props; prop != null; prop = prop.next) {
197 style = style.concat(prop.name);
198 style = style.concat(": ");
200 style = style.concat(prop.value);
202 if (prop.next == null)
205 style = style.concat("; ");
212 create string with merged properties
214 private String addProperty(String style, String property) {
217 prop = createProps(null, style);
218 prop = createProps(prop, property);
219 style = createPropString(prop);
223 private String gensymClass(String tag) {
226 str = "c" + classNum;
231 private String findStyle(Lexer lexer, String tag, String properties) {
234 for (style = lexer.styles; style != null; style = style.next) {
235 if (style.tag.equals(tag) && style.properties.equals(properties))
236 return style.tagClass;
239 style = new Style(tag, gensymClass(tag), properties, lexer.styles);
240 lexer.styles = style;
241 return style.tagClass;
245 Find style attribute in node, and replace it
246 by corresponding class attribute. Search for
247 class in style dictionary otherwise gensym
248 new class and add to dictionary.
250 Assumes that node doesn't have a class attribute
252 private void style2Rule(Lexer lexer, Node node) {
253 AttVal styleattr, classattr;
256 styleattr = node.getAttrByName("style");
258 if (styleattr != null) {
259 classname = findStyle(lexer, node.element, styleattr.value);
260 classattr = node.getAttrByName("class");
263 if there already is a class attribute
264 then append class name after a space
266 if (classattr != null) {
267 classattr.value = classattr.value + " " + classname;
268 node.removeAttribute(styleattr);
269 } else /* reuse style attribute for class attribute */ {
270 styleattr.attribute = "class";
271 styleattr.value = classname;
276 private void addColorRule(Lexer lexer, String selector, String color) {
278 lexer.addStringLiteral(selector);
279 lexer.addStringLiteral(" { color: ");
280 lexer.addStringLiteral(color);
281 lexer.addStringLiteral(" }\n");
286 move presentation attribs from body to style element
288 background="foo" -> body { background-image: url(foo) }
289 bgcolor="foo" -> body { background-color: foo }
290 text="foo" -> body { color: foo }
291 link="foo" -> :link { color: foo }
292 vlink="foo" -> :visited { color: foo }
293 alink="foo" -> :active { color: foo }
295 private void cleanBodyAttrs(Lexer lexer, Node body) {
298 String bgcolor = null;
301 attr = body.getAttrByName("background");
306 body.removeAttribute(attr);
309 attr = body.getAttrByName("bgcolor");
312 bgcolor = attr.value;
314 body.removeAttribute(attr);
317 attr = body.getAttrByName("text");
322 body.removeAttribute(attr);
325 if (bgurl != null || bgcolor != null || color != null) {
326 lexer.addStringLiteral(" body {\n");
329 lexer.addStringLiteral(" background-image: url(");
330 lexer.addStringLiteral(bgurl);
331 lexer.addStringLiteral(");\n");
334 if (bgcolor != null) {
335 lexer.addStringLiteral(" background-color: ");
336 lexer.addStringLiteral(bgcolor);
337 lexer.addStringLiteral(";\n");
341 lexer.addStringLiteral(" color: ");
342 lexer.addStringLiteral(color);
343 lexer.addStringLiteral(";\n");
346 lexer.addStringLiteral(" }\n");
349 attr = body.getAttrByName("link");
352 addColorRule(lexer, " :link", attr.value);
353 body.removeAttribute(attr);
356 attr = body.getAttrByName("vlink");
359 addColorRule(lexer, " :visited", attr.value);
360 body.removeAttribute(attr);
363 attr = body.getAttrByName("alink");
366 addColorRule(lexer, " :active", attr.value);
367 body.removeAttribute(attr);
371 private boolean niceBody(Lexer lexer, Node doc) {
372 Node body = doc.findBody(lexer.configuration.tt);
375 if (body.getAttrByName("background") != null
376 || body.getAttrByName("bgcolor") != null
377 || body.getAttrByName("text") != null
378 || body.getAttrByName("link") != null
379 || body.getAttrByName("vlink") != null
380 || body.getAttrByName("alink") != null) {
381 lexer.badLayout |= Report.USING_BODY;
389 /* create style element using rules from dictionary */
390 private void createStyleElement(Lexer lexer, Node doc) {
391 Node node, head, body;
395 if (lexer.styles == null && niceBody(lexer, doc))
398 node = lexer.newNode(Node.StartTag, null, 0, 0, "style");
399 node.implicit = true;
401 /* insert type attribute */
402 av = new AttVal(null, null, '"', "type", "text/css");
403 av.dict = AttributeTable.getDefaultAttributeTable().findAttribute(av);
404 node.attributes = av;
406 body = doc.findBody(lexer.configuration.tt);
408 lexer.txtstart = lexer.lexsize;
411 cleanBodyAttrs(lexer, body);
413 for (style = lexer.styles; style != null; style = style.next) {
414 lexer.addCharToLexer(' ');
415 lexer.addStringLiteral(style.tag);
416 lexer.addCharToLexer('.');
417 lexer.addStringLiteral(style.tagClass);
418 lexer.addCharToLexer(' ');
419 lexer.addCharToLexer('{');
420 lexer.addStringLiteral(style.properties);
421 lexer.addCharToLexer('}');
422 lexer.addCharToLexer('\n');
425 lexer.txtend = lexer.lexsize;
427 Node.insertNodeAtEnd(node, lexer.newNode(Node.TextNode, lexer.lexbuf, lexer.txtstart, lexer.txtend));
430 now insert style element into document head
432 doc is root node. search its children for html node
433 the head node should be first child of html node
436 head = doc.findHEAD(lexer.configuration.tt);
439 Node.insertNodeAtEnd(head, node);
442 /* ensure bidirectional links are consistent */
443 private void fixNodeLinks(Node node) {
446 if (node.prev != null)
447 node.prev.next = node;
449 node.parent.content = node;
451 if (node.next != null)
452 node.next.prev = node;
454 node.parent.last = node;
456 for (child = node.content; child != null; child = child.next)
461 used to strip child of node when
462 the node has one and only one child
464 private void stripOnlyChild(Node node) {
467 child = node.content;
468 node.content = child.content;
469 node.last = child.last;
470 child.content = null;
472 for (child = node.content; child != null; child = child.next)
476 /* used to strip font start and end tags */
477 private void discardContainer(Node element, MutableObject pnode) {
479 Node parent = element.parent;
481 if (element.content != null) {
482 element.last.next = element.next;
484 if (element.next != null) {
485 element.next.prev = element.last;
486 element.last.next = element.next;
488 parent.last = element.last;
490 if (element.prev != null) {
491 element.content.prev = element.prev;
492 element.prev.next = element.content;
494 parent.content = element.content;
496 for (node = element.content; node != null; node = node.next)
497 node.parent = parent;
499 pnode.setObject(element.content);
501 if (element.next != null)
502 element.next.prev = element.prev;
504 parent.last = element.prev;
506 if (element.prev != null)
507 element.prev.next = element.next;
509 parent.content = element.next;
511 pnode.setObject(element.next);
515 element.content = null;
519 Add style property to element, creating style
520 attribute as needed and adding ; delimiter
522 private void addStyleProperty(Node node, String property) {
525 for (av = node.attributes; av != null; av = av.next) {
526 if (av.attribute.equals("style"))
530 /* if style attribute already exists then insert property */
535 s = addProperty(av.value, property);
537 } else /* else create new style attribute */ {
538 av = new AttVal(node.attributes, null, '"', "style", property);
539 av.dict = AttributeTable.getDefaultAttributeTable().findAttribute(av);
540 node.attributes = av;
545 Create new string that consists of the
546 combined style properties in s1 and s2
548 To merge property lists, we build a linked
549 list of property/values and insert properties
550 into the list in order, merging values for
551 the same property name.
553 private String mergeProperties(String s1, String s2) {
557 prop = createProps(null, s1);
558 prop = createProps(prop, s2);
559 s = createPropString(prop);
563 private void mergeStyles(Node node, Node child) {
565 String s1, s2, style;
567 for (s2 = null, av = child.attributes; av != null; av = av.next) {
568 if (av.attribute.equals("style")) {
574 for (s1 = null, av = node.attributes; av != null; av = av.next) {
575 if (av.attribute.equals("style")) {
582 if (s2 != null) /* merge styles from both */ {
583 style = mergeProperties(s1, s2);
586 } else if (s2 != null) /* copy style of child */ {
587 av = new AttVal(node.attributes, null, '"', "style", s2);
588 av.dict = AttributeTable.getDefaultAttributeTable().findAttribute(av);
589 node.attributes = av;
593 private String fontSize2Name(String size) {
607 String[] sizes = { "60%", "70%", "80%", null, "120%", "150%", "200%" };
610 if (size.length() > 0 && '0' <= size.charAt(0) && size.charAt(0) <= '6') {
611 int n = size.charAt(0) - '0';
615 if (size.length() > 0 && size.charAt(0) == '-') {
616 if (size.length() > 1 && '0' <= size.charAt(1) && size.charAt(1) <= '6') {
617 int n = size.charAt(1) - '0';
620 for (x = 1.0; n > 0; --n)
624 buf = "" + (int) x + "%";
629 return "smaller"; /*"70%"; */
632 if (size.length() > 1 && '0' <= size.charAt(1) && size.charAt(1) <= '6') {
633 int n = size.charAt(1) - '0';
636 for (x = 1.0; n > 0; --n)
640 buf = "" + (int) x + "%";
645 return "larger"; /* "140%" */
648 private void addFontFace(Node node, String face) {
649 addStyleProperty(node, "font-family: " + face);
652 private void addFontSize(Node node, String size) {
655 if (size.equals("6") && node.tag == tt.tagP) {
661 if (size.equals("5") && node.tag == tt.tagP) {
667 if (size.equals("4") && node.tag == tt.tagP) {
673 value = fontSize2Name(size);
676 addStyleProperty(node, "font-size: " + value);
680 private void addFontColor(Node node, String color) {
681 addStyleProperty(node, "color: " + color);
684 private void addAlign(Node node, String align) {
685 /* force alignment value to lower case */
686 addStyleProperty(node, "text-align: " + align.toLowerCase());
690 add style properties to node corresponding to
691 the font face, size and color attributes
693 private void addFontStyles(Node node, AttVal av) {
695 if (av.attribute.equals("face"))
696 addFontFace(node, av.value);
697 else if (av.attribute.equals("size"))
698 addFontSize(node, av.value);
699 else if (av.attribute.equals("color"))
700 addFontColor(node, av.value);
707 Symptom: <p align=center>
708 Action: <p style="text-align: center">
710 private void textAlign(Lexer lexer, Node node) {
715 for (av = node.attributes; av != null; av = av.next) {
716 if (av.attribute.equals("align")) {
720 node.attributes = av.next;
722 if (av.value != null) {
723 addAlign(node, av.value);
734 The clean up rules use the pnode argument to return the
735 next node when the original node has been deleted
739 Symptom: <dir> <li> where <li> is only child
740 Action: coerce <dir> <li> to <div> with indent.
743 private boolean dir2Div(Lexer lexer, Node node, MutableObject pnode) {
746 if (node.tag == tt.tagDir || node.tag == tt.tagUl || node.tag == tt.tagOl) {
747 child = node.content;
752 /* check child has no peers */
754 if (child.next != null)
757 if (child.tag != tt.tagLi)
763 /* coerce dir to div */
765 node.tag = tt.tagDiv;
766 node.element = "div";
767 addStyleProperty(node, "margin-left: 2em");
768 stripOnlyChild(node);
774 //content = child.content;
776 //child.content = null;
778 /* adjust parent and set margin on contents of <li> */
780 //for (child = content; child != null; child = child.next)
782 // child.parent = node.parent;
783 // addStyleProperty(child, "margin-left: 1em");
786 /* hook first/last into sequence */
788 //if (content != null)
790 // content.prev = node.prev;
791 // last.next = node.next;
792 // fixNodeLinks(content);
793 // fixNodeLinks(last);
798 /* ensure that new node is cleaned */
799 //pnode.setObject(cleanNode(lexer, content));
809 Action: replace <center> by <div style="text-align: center">
812 private boolean center2Div(Lexer lexer, Node node, MutableObject pnode) {
813 if (node.tag == tt.tagCenter) {
814 if (lexer.configuration.DropFontTags) {
815 if (node.content != null) {
816 Node last = node.last;
817 Node parent = node.parent;
819 discardContainer(node, pnode);
821 node = lexer.inferredTag("br");
823 if (last.next != null)
824 last.next.prev = node;
826 node.next = last.next;
830 if (parent.last == last)
833 node.parent = parent;
835 Node prev = node.prev;
836 Node next = node.next;
837 Node parent = node.parent;
838 discardContainer(node, pnode);
840 node = lexer.inferredTag("br");
843 node.parent = parent;
853 parent.content = node;
858 node.tag = tt.tagDiv;
859 node.element = "div";
860 addStyleProperty(node, "text-align: center");
868 Symptom: <div><div>...</div></div>
869 Action: merge the two divs
871 This is useful after nested <dir>s used by Word
872 for indenting have been converted to <div>s
874 private boolean mergeDivs(Lexer lexer, Node node, MutableObject pnode) {
877 if (node.tag != tt.tagDiv)
880 child = node.content;
885 if (child.tag != tt.tagDiv)
888 if (child.next != null)
891 mergeStyles(node, child);
892 stripOnlyChild(node);
897 Symptom: <ul><li><ul>...</ul></li></ul>
898 Action: discard outer list
901 private boolean nestedList(Lexer lexer, Node node, MutableObject pnode) {
904 if (node.tag == tt.tagUl || node.tag == tt.tagOl) {
905 child = node.content;
910 /* check child has no peers */
912 if (child.next != null)
915 list = child.content;
920 if (list.tag != node.tag)
923 pnode.setObject(node.next);
925 /* move inner list node into position of outer node */
926 list.prev = node.prev;
927 list.next = node.next;
928 list.parent = node.parent;
931 /* get rid of outer ul and its li */
932 child.content = null;
937 If prev node was a list the chances are this node
938 should be appended to that list. Word has no way of
939 recognizing nested lists and just uses indents
942 if (list.prev != null) {
946 if (list.tag == tt.tagUl || list.tag == tt.tagOl) {
947 list.next = node.next;
949 if (list.next != null)
950 list.next.prev = list;
952 child = list.last; /* <li> */
956 node.prev = child.last;
961 cleanNode(lexer, node);
969 Symptom: the only child of a block-level element is a
970 presentation element such as B, I or FONT
972 Action: add style "font-weight: bold" to the block and
973 strip the <b> element, leaving its children.
978 <b><font face="Arial" size="6">Draft Recommended Practice</font></b>
983 <p style="font-weight: bold; font-family: Arial; font-size: 6">
984 Draft Recommended Practice
987 This code also replaces the align attribute by a style attribute.
988 However, to avoid CSS problems with Navigator 4, this isn't done
989 for the elements: caption, tr and table
991 private boolean blockStyle(Lexer lexer, Node node, MutableObject pnode) {
994 if ((node.tag.model & (Dict.CM_BLOCK | Dict.CM_LIST | Dict.CM_DEFLIST | Dict.CM_TABLE)) != 0) {
995 if (node.tag != tt.tagTable && node.tag != tt.tagTr && node.tag != tt.tagLi) {
996 /* check for align attribute */
997 if (node.tag != tt.tagCaption)
998 textAlign(lexer, node);
1000 child = node.content;
1005 /* check child has no peers */
1007 if (child.next != null)
1010 if (child.tag == tt.tagB) {
1011 mergeStyles(node, child);
1012 addStyleProperty(node, "font-weight: bold");
1013 stripOnlyChild(node);
1017 if (child.tag == tt.tagI) {
1018 mergeStyles(node, child);
1019 addStyleProperty(node, "font-style: italic");
1020 stripOnlyChild(node);
1024 if (child.tag == tt.tagFont) {
1025 mergeStyles(node, child);
1026 addFontStyles(node, child.attributes);
1027 stripOnlyChild(node);
1036 /* the only child of table cell or an inline element such as em */
1037 private boolean inlineStyle(Lexer lexer, Node node, MutableObject pnode) {
1040 if (node.tag != tt.tagFont && (node.tag.model & (Dict.CM_INLINE | Dict.CM_ROW)) != 0) {
1041 child = node.content;
1046 /* check child has no peers */
1048 if (child.next != null)
1051 if (child.tag == tt.tagB && lexer.configuration.LogicalEmphasis) {
1052 mergeStyles(node, child);
1053 addStyleProperty(node, "font-weight: bold");
1054 stripOnlyChild(node);
1058 if (child.tag == tt.tagI && lexer.configuration.LogicalEmphasis) {
1059 mergeStyles(node, child);
1060 addStyleProperty(node, "font-style: italic");
1061 stripOnlyChild(node);
1065 if (child.tag == tt.tagFont) {
1066 mergeStyles(node, child);
1067 addFontStyles(node, child.attributes);
1068 stripOnlyChild(node);
1077 Replace font elements by span elements, deleting
1078 the font element's attributes and replacing them
1079 by a single style attribute.
1081 private boolean font2Span(Lexer lexer, Node node, MutableObject pnode) {
1082 AttVal av, style, next;
1084 if (node.tag == tt.tagFont) {
1085 if (lexer.configuration.DropFontTags) {
1086 discardContainer(node, pnode);
1090 /* if FONT is only child of parent element then leave alone */
1091 if (node.parent.content == node && node.next == null)
1094 addFontStyles(node, node.attributes);
1096 /* extract style attribute and free the rest */
1097 av = node.attributes;
1100 while (av != null) {
1103 if (av.attribute.equals("style")) {
1111 node.attributes = style;
1113 node.tag = tt.tagSpan;
1114 node.element = "span";
1123 Applies all matching rules to a node.
1125 private Node cleanNode(Lexer lexer, Node node) {
1127 MutableObject o = new MutableObject();
1130 for (next = node; node.isElement(); node = next) {
1133 b = dir2Div(lexer, node, o);
1134 next = (Node) o.getObject();
1138 b = nestedList(lexer, node, o);
1139 next = (Node) o.getObject();
1143 b = center2Div(lexer, node, o);
1144 next = (Node) o.getObject();
1148 b = mergeDivs(lexer, node, o);
1149 next = (Node) o.getObject();
1153 b = blockStyle(lexer, node, o);
1154 next = (Node) o.getObject();
1158 b = inlineStyle(lexer, node, o);
1159 next = (Node) o.getObject();
1163 b = font2Span(lexer, node, o);
1164 next = (Node) o.getObject();
1174 private Node createStyleProperties(Lexer lexer, Node node) {
1177 if (node.content != null) {
1178 for (child = node.content; child != null; child = child.next) {
1179 child = createStyleProperties(lexer, child);
1183 return cleanNode(lexer, node);
1186 private void defineStyleRules(Lexer lexer, Node node) {
1189 if (node.content != null) {
1190 for (child = node.content; child != null; child = child.next) {
1191 defineStyleRules(lexer, child);
1195 style2Rule(lexer, node);
1198 public void cleanTree(Lexer lexer, Node doc) {
1199 doc = createStyleProperties(lexer, doc);
1201 if (!lexer.configuration.MakeClean) {
1202 defineStyleRules(lexer, doc);
1203 createStyleElement(lexer, doc);
1207 /* simplifies <b><b> ... </b> ...</b> etc. */
1208 public void nestedEmphasis(Node node) {
1209 MutableObject o = new MutableObject();
1212 while (node != null) {
1215 if ((node.tag == tt.tagB || node.tag == tt.tagI) && node.parent != null && node.parent.tag == node.tag) {
1216 /* strip redundant inner element */
1218 discardContainer(node, o);
1219 next = (Node) o.getObject();
1224 if (node.content != null)
1225 nestedEmphasis(node.content);
1231 /* replace i by em and b by strong */
1232 public void emFromI(Node node) {
1233 while (node != null) {
1234 if (node.tag == tt.tagI) {
1235 node.element = tt.tagEm.name;
1236 node.tag = tt.tagEm;
1237 } else if (node.tag == tt.tagB) {
1238 node.element = tt.tagStrong.name;
1239 node.tag = tt.tagStrong;
1242 if (node.content != null)
1243 emFromI(node.content);
1250 Some people use dir or ul without an li
1251 to indent the content. The pattern to
1252 look for is a list with a single implicit
1253 li. This is recursively replaced by an
1254 implicit blockquote.
1256 public void list2BQ(Node node) {
1257 while (node != null) {
1258 if (node.content != null)
1259 list2BQ(node.content);
1261 if (node.tag != null && node.tag.parser == ParserImpl.getParseList() && node.hasOneChild() && node.content.implicit) {
1262 stripOnlyChild(node);
1263 node.element = tt.tagBlockquote.name;
1264 node.tag = tt.tagBlockquote;
1265 node.implicit = true;
1273 Replace implicit blockquote by div with an indent
1274 taking care to reduce nested blockquotes to a single
1275 div with the indent set to match the nesting depth
1277 public void bQ2Div(Node node) {
1281 while (node != null) {
1282 if (node.tag == tt.tagBlockquote && node.implicit) {
1285 while (node.hasOneChild() && node.content.tag == tt.tagBlockquote && node.implicit) {
1287 stripOnlyChild(node);
1290 if (node.content != null)
1291 bQ2Div(node.content);
1293 indent_buf = "margin-left: " + (new Integer(2 * indent)).toString() + "em";
1295 node.element = tt.tagDiv.name;
1296 node.tag = tt.tagDiv;
1297 node.addAttribute("style", indent_buf);
1298 } else if (node.content != null)
1299 bQ2Div(node.content);
1305 /* node is <![if ...]> prune up to <![endif]> */
1306 public Node pruneSection(Lexer lexer, Node node) {
1308 /* discard node and returns next */
1309 node = Node.discardElement(node);
1314 if (node.type == Node.SectionTag) {
1315 if ((Lexer.getString(node.textarray, node.start, 2)).equals("if")) {
1316 node = pruneSection(lexer, node);
1320 if ((Lexer.getString(node.textarray, node.start, 5)).equals("endif")) {
1321 node = Node.discardElement(node);
1330 public void dropSections(Lexer lexer, Node node) {
1331 while (node != null) {
1332 if (node.type == Node.SectionTag) {
1333 /* prune up to matching endif */
1334 if ((Lexer.getString(node.textarray, node.start, 2)).equals("if")) {
1335 node = pruneSection(lexer, node);
1339 /* discard others as well */
1340 node = Node.discardElement(node);
1344 if (node.content != null)
1345 dropSections(lexer, node.content);
1351 // gschadow patch start
1352 /** Get rid of all this pseudo-XML crap, sections, Asp tags, JSP tags, etc.
1354 public void dropPseudoXMLCrap(Lexer lexer, Node node) {
1355 while (node != null) {
1356 switch (node.type) {
1360 case Node.SectionTag :
1361 node = Node.discardElement(node);
1365 if (node.content != null)
1366 dropPseudoXMLCrap(lexer, node.content);
1372 // gschadow patch end
1374 public void purgeAttributes(Node node) {
1375 AttVal attr = node.attributes;
1379 while (attr != null) {
1382 /* special check for class="Code" denoting pre text */
1383 if (attr.attribute != null && attr.value != null && attr.attribute.equals("class") && attr.value.equals("Code")) {
1386 attr.attribute != null
1387 && (attr.attribute.equals("class")
1388 || attr.attribute.equals("style")
1389 || attr.attribute.equals("lang")
1390 || attr.attribute.startsWith("x:")
1391 || ((attr.attribute.equals("height") || attr.attribute.equals("width"))
1392 && (node.tag == tt.tagTd || node.tag == tt.tagTr || node.tag == tt.tagTh)))) {
1396 node.attributes = next;
1405 /* Word2000 uses span excessively, so we strip span out */
1406 public Node stripSpan(Lexer lexer, Node span) {
1412 deal with span elements that have content
1413 by splicing the content in place of the span
1414 after having processed it
1417 cleanWord2000(lexer, span.content);
1418 content = span.content;
1420 if (span.prev != null)
1422 else if (content != null) {
1424 content = content.next;
1425 Node.removeNode(node);
1426 Node.insertNodeBeforeElement(span, node);
1430 while (content != null) {
1432 content = content.next;
1433 Node.removeNode(node);
1434 Node.insertNodeAfterElement(prev, node);
1438 if (span.next == null)
1439 span.parent.last = prev;
1442 span.content = null;
1443 Node.discardElement(span);
1447 /* map non-breaking spaces to regular spaces */
1448 private void normalizeSpaces(Lexer lexer, Node node) {
1449 while (node != null) {
1450 if (node.content != null)
1451 normalizeSpaces(lexer, node.content);
1453 if (node.type == Node.TextNode) {
1455 MutableInteger c = new MutableInteger();
1458 for (i = node.start; i < node.end; ++i) {
1459 c.value = (int) node.textarray[i];
1461 /* look for UTF-8 multibyte character */
1463 i += PPrint.getUTF8(node.textarray, i, c);
1468 p = PPrint.putUTF8(node.textarray, p, c.value);
1477 This is a major clean up to strip out all the extra stuff you get
1478 when you save as web page from Word 2000. It doesn't yet know what
1479 to do with VML tags, but these will appear as errors unless you
1480 declare them as new tags, such as o:p which needs to be declared
1483 public void cleanWord2000(Lexer lexer, Node node) {
1484 /* used to build a list from a sequence of bulleted p's */
1487 while (node != null) {
1488 /* discard Word's style verbiage */
1489 if (node.tag == tt.tagStyle || node.tag == tt.tagMeta || node.type == Node.CommentTag) {
1490 node = Node.discardElement(node);
1494 /* strip out all span tags Word scatters so liberally! */
1495 if (node.tag == tt.tagSpan) {
1496 node = stripSpan(lexer, node);
1500 /* get rid of Word's xmlns attributes */
1501 if (node.tag == tt.tagHtml) {
1502 /* check that it's a Word 2000 document */
1503 if (node.getAttrByName("xmlns:o") == null)
1507 if (node.tag == tt.tagLink) {
1508 AttVal attr = node.getAttrByName("rel");
1510 if (attr != null && attr.value != null && attr.value.equals("File-List")) {
1511 node = Node.discardElement(node);
1516 /* discard empty paragraphs */
1517 if (node.content == null && node.tag == tt.tagP) {
1518 node = Node.discardElement(node);
1522 if (node.tag == tt.tagP) {
1523 AttVal attr = node.getAttrByName("class");
1525 /* map sequence of <p class="MsoListBullet"> to <ul>...</ul> */
1526 if (attr != null && attr.value != null && attr.value.equals("MsoListBullet")) {
1527 Node.coerceNode(lexer, node, tt.tagLi);
1529 if (list == null || list.tag != tt.tagUl) {
1530 list = lexer.inferredTag("ul");
1531 Node.insertNodeBeforeElement(node, list);
1534 purgeAttributes(node);
1536 if (node.content != null)
1537 cleanWord2000(lexer, node.content);
1539 /* remove node and append to contents of list */
1540 Node.removeNode(node);
1541 Node.insertNodeAtEnd(list, node);
1544 /* map sequence of <p class="Code"> to <pre>...</pre> */
1545 else if (attr != null && attr.value != null && attr.value.equals("Code")) {
1546 Node br = lexer.newLineNode();
1547 normalizeSpaces(lexer, node);
1549 if (list == null || list.tag != tt.tagPre) {
1550 list = lexer.inferredTag("pre");
1551 Node.insertNodeBeforeElement(node, list);
1554 /* remove node and append to contents of list */
1555 Node.removeNode(node);
1556 Node.insertNodeAtEnd(list, node);
1557 stripSpan(lexer, node);
1558 Node.insertNodeAtEnd(list, br);
1565 /* strip out style and class attributes */
1566 if (node.type == Node.StartTag || node.type == Node.StartEndTag)
1567 purgeAttributes(node);
1569 if (node.content != null)
1570 cleanWord2000(lexer, node.content);
1576 public boolean isWord2000(Node root, TagTable tt) {
1577 Node html = root.findHTML(tt);
1579 return (html != null && html.getAttrByName("xmlns:o") != null);