--- /dev/null
+/*
+ * @(#)Clean.java 1.11 2000/08/16
+ *
+ */
+
+package net.sourceforge.phpdt.tidy;
+
+/**
+ *
+ * Clean up misuse of presentation markup
+ *
+ * (c) 1998-2000 (W3C) MIT, INRIA, Keio University
+ * See Tidy.java for the copyright notice.
+ * Derived from <a href="http://www.w3.org/People/Raggett/tidy">
+ * HTML Tidy Release 4 Aug 2000</a>
+ *
+ * @author Dave Raggett <dsr@w3.org>
+ * @author Andy Quick <ac.quick@sympatico.ca> (translation to Java)
+ * @version 1.0, 1999/05/22
+ * @version 1.0.1, 1999/05/29
+ * @version 1.1, 1999/06/18 Java Bean
+ * @version 1.2, 1999/07/10 Tidy Release 7 Jul 1999
+ * @version 1.3, 1999/07/30 Tidy Release 26 Jul 1999
+ * @version 1.4, 1999/09/04 DOM support
+ * @version 1.5, 1999/10/23 Tidy Release 27 Sep 1999
+ * @version 1.6, 1999/11/01 Tidy Release 22 Oct 1999
+ * @version 1.7, 1999/12/06 Tidy Release 30 Nov 1999
+ * @version 1.8, 2000/01/22 Tidy Release 13 Jan 2000
+ * @version 1.9, 2000/06/03 Tidy Release 30 Apr 2000
+ * @version 1.10, 2000/07/22 Tidy Release 8 Jul 2000
+ * @version 1.11, 2000/08/16 Tidy Release 4 Aug 2000
+ */
+
+/*
+ Filters from other formats such as Microsoft Word
+ often make excessive use of presentation markup such
+ as font tags, B, I, and the align attribute. By applying
+ a set of production rules, it is straightforward to
+ transform this to use CSS.
+
+ Some rules replace some of the children of an element by
+ style properties on the element, e.g.
+
+ <p><b>...</b></p> -> <p style="font-weight: bold">...</p>
+
+ Such rules are applied to the element's content and then
+ to the element itself until no more rules apply.
+ Having applied all the rules to an element, it will have
+ a style attribute with one or more properties.
+
+ Other rules strip the element they apply to, replacing
+ it by style properties on the contents, e.g.
+
+ <dir><li><p>...</li></dir> -> <p style="margin-left: 1em">...
+
+ These rules are applied to an element before processing
+ its content and replace the current element by the first
+ element in the exposed content.
+
+ After applying both sets of rules, you can replace the
+ style attribute by a class value and style rule in the
+ document head. To support this, an association of styles
+ and class names is built.
+
+ A naive approach is to rely on string matching to test
+ when two property lists are the same. A better approach
+ would be to first sort the properties before matching.
+*/
+
+public class Clean {
+
+ private int classNum = 1;
+
+ private TagTable tt;
+
+ /**
+  * Creates a cleaner that resolves and retargets elements through
+  * the supplied tag table.
+  *
+  * @param tt tag table stored for later tag comparisons and lookups
+  */
+ public Clean(TagTable tt) {
+     this.tt = tt;
+ }
+
+ /**
+  * Inserts a name/value pair into a property list that is kept sorted
+  * by name (String.compareTo order).
+  *
+  * If a property with the same name is already present the list is
+  * returned untouched, i.e. the first value seen wins.
+  *
+  * @param props head of the sorted list, may be null for an empty list
+  * @param name  property name to insert
+  * @param value property value to insert
+  * @return the (possibly new) head of the list
+  */
+ private StyleProp insertProperty(StyleProp props, String name,
+                                  String value)
+ {
+     StyleProp before = null;
+     StyleProp cursor = props;
+
+     /* skip every entry whose name sorts strictly before the new one */
+     while (cursor != null)
+     {
+         int order = cursor.name.compareTo(name);
+
+         if (order == 0)
+             return props; /* already defined, ignore new value */
+
+         if (order > 0)
+             break;
+
+         before = cursor;
+         cursor = cursor.next;
+     }
+
+     /* cursor is the entry to insert in front of, or null for "append" */
+     StyleProp created = (cursor != null)
+         ? new StyleProp(name, value, cursor)
+         : new StyleProp(name, value);
+
+     if (before == null)
+         return created;
+
+     before.next = created;
+     return props;
+ }
+
+ /*
+ Parse a style string of the form "name: value; name: value"
+ and insert each declaration into the sorted property list,
+ returning the head of the list.
+
+ Spaces in front of a name or a value are skipped. Parsing
+ stops at the first declaration that has no ':' or at the
+ last declaration that is not followed by ';'.
+ */
+ private StyleProp createProps(StyleProp prop, String style)
+ {
+     int cursor = 0;
+
+     while (cursor < style.length())
+     {
+         int nameStart = cursor;
+
+         while (nameStart < style.length() &&
+                style.charAt(nameStart) == ' ')
+             ++nameStart;
+
+         /* the name runs up to the next ':'; without one we are done */
+         int colon = style.indexOf(':', nameStart);
+
+         if (colon < 0)
+             break;
+
+         int valueStart = colon + 1;
+
+         while (valueStart < style.length() &&
+                style.charAt(valueStart) == ' ')
+             ++valueStart;
+
+         /* the value runs up to the next ';' or to the end of the string */
+         int semicolon = style.indexOf(';', valueStart);
+         int valueEnd = (semicolon < 0) ? style.length() : semicolon;
+
+         prop = insertProperty(prop,
+                               style.substring(nameStart, colon),
+                               style.substring(valueStart, valueEnd));
+
+         if (semicolon < 0)
+             break;
+
+         cursor = semicolon + 1;
+     }
+
+     return prop;
+ }
+
+ /**
+  * Serialises a property list as "name: value; name: value".
+  *
+  * Entries are written in list order, separated by "; " with no
+  * trailing separator. An empty (null) list yields the empty string.
+  *
+  * @param props head of the property list, may be null
+  * @return the combined style string
+  */
+ private String createPropString(StyleProp props)
+ {
+     /* StringBuffer rather than repeated String.concat: one buffer, linear cost */
+     StringBuffer style = new StringBuffer();
+
+     for (StyleProp prop = props; prop != null; prop = prop.next)
+     {
+         style.append(prop.name).append(": ").append(prop.value);
+
+         if (prop.next != null)
+             style.append("; ");
+     }
+
+     return style.toString();
+ }
+
+ /*
+ Create string with merged properties: the declarations of
+ "style" are parsed first, then those of "property", so a
+ name already present in "style" keeps its value.
+ */
+ private String addProperty(String style, String property)
+ {
+     StyleProp merged = createProps(createProps(null, style), property);
+
+     return createPropString(merged);
+ }
+
+ /**
+  * Generates the next class name of the form "c1", "c2", ...
+  *
+  * @param tag element name the class is generated for (not used
+  *            when building the name)
+  * @return a class name unique for this Clean instance
+  */
+ private String gensymClass(String tag)
+ {
+     int serial = classNum;
+
+     classNum = serial + 1;
+     return "c" + serial;
+ }
+
+ /**
+  * Looks up the class name registered for a tag/properties pair in
+  * the lexer's style dictionary, registering a new entry at the head
+  * of the dictionary when there is no exact match.
+  *
+  * @param lexer      lexer holding the style dictionary (lexer.styles)
+  * @param tag        element name
+  * @param properties style property string, compared verbatim
+  * @return the class name associated with the pair
+  */
+ private String findStyle(Lexer lexer, String tag, String properties)
+ {
+     Style match = lexer.styles;
+
+     while (match != null &&
+            !(match.tag.equals(tag) && match.properties.equals(properties)))
+         match = match.next;
+
+     if (match == null)
+     {
+         match = new Style(tag, gensymClass(tag), properties, lexer.styles);
+         lexer.styles = match;
+     }
+
+     return match.tagClass;
+ }
+
+ /*
+ Find style attribute in node, and replace it
+ by corresponding class attribute. Search for
+ class in style dictionary otherwise gensym
+ new class and add to dictionary.
+
+ If the node already has a class attribute the
+ new class name is appended to it after a space
+ and the style attribute is removed; otherwise
+ the style attribute itself is turned into the
+ class attribute.
+ */
+ private void style2Rule(Lexer lexer, Node node)
+ {
+     AttVal styleAttr = node.getAttrByName("style");
+
+     if (styleAttr == null)
+         return;
+
+     String className = findStyle(lexer, node.element, styleAttr.value);
+     AttVal classAttr = node.getAttrByName("class");
+
+     if (classAttr == null)
+     {
+         /* reuse style attribute for class attribute */
+         styleAttr.attribute = "class";
+         styleAttr.value = className;
+         return;
+     }
+
+     classAttr.value = classAttr.value + " " + className;
+     node.removeAttribute(styleAttr);
+ }
+
+ /**
+  * Appends "selector { color: value }" followed by a newline to the
+  * lexer buffer; does nothing when no colour is given.
+  *
+  * @param lexer    lexer whose buffer receives the rule text
+  * @param selector CSS selector text, written verbatim
+  * @param color    colour value, or null to emit nothing
+  */
+ private void addColorRule(Lexer lexer, String selector, String color)
+ {
+     if (color == null)
+         return;
+
+     lexer.addStringLiteral(selector);
+     lexer.addStringLiteral(" { color: ");
+     lexer.addStringLiteral(color);
+     lexer.addStringLiteral(" }\n");
+ }
+
+ /*
+ move presentation attribs from body to style element
+
+ background="foo" -> body { background-image: url(foo) }
+ bgcolor="foo" -> body { background-color: foo }
+ text="foo" -> body { color: foo }
+ link="foo" -> :link { color: foo }
+ vlink="foo" -> :visited { color: foo }
+ alink="foo" -> :active { color: foo }
+
+ The rule text is appended to the lexer buffer and each
+ attribute that was found is removed from the body node.
+ */
+ private void cleanBodyAttrs(Lexer lexer, Node body)
+ {
+     String bgurl = takeBodyAttrValue(body, "background");
+     String bgcolor = takeBodyAttrValue(body, "bgcolor");
+     String color = takeBodyAttrValue(body, "text");
+
+     if (bgurl != null || bgcolor != null || color != null)
+     {
+         lexer.addStringLiteral(" body {\n");
+
+         if (bgurl != null)
+         {
+             lexer.addStringLiteral(" background-image: url(");
+             lexer.addStringLiteral(bgurl);
+             lexer.addStringLiteral(");\n");
+         }
+
+         if (bgcolor != null)
+         {
+             lexer.addStringLiteral(" background-color: ");
+             lexer.addStringLiteral(bgcolor);
+             lexer.addStringLiteral(";\n");
+         }
+
+         if (color != null)
+         {
+             lexer.addStringLiteral(" color: ");
+             lexer.addStringLiteral(color);
+             lexer.addStringLiteral(";\n");
+         }
+
+         lexer.addStringLiteral(" }\n");
+     }
+
+     /* attribute name -> selector, handled in this order */
+     String[][] linkRules =
+     {
+         { "link", " :link" },
+         { "vlink", " :visited" },
+         { "alink", " :active" }
+     };
+
+     for (int i = 0; i < linkRules.length; ++i)
+     {
+         AttVal attr = body.getAttrByName(linkRules[i][0]);
+
+         if (attr != null)
+         {
+             addColorRule(lexer, linkRules[i][1], attr.value);
+             body.removeAttribute(attr);
+         }
+     }
+ }
+
+ /*
+ Detach the named attribute from body and return the value
+ it carried (null when the attribute is absent). The value
+ is cleared on the attribute before it is removed.
+ */
+ private String takeBodyAttrValue(Node body, String name)
+ {
+     AttVal attr = body.getAttrByName(name);
+
+     if (attr == null)
+         return null;
+
+     String value = attr.value;
+
+     attr.value = null;
+     body.removeAttribute(attr);
+     return value;
+ }
+
+ /**
+  * Reports whether the document body is free of presentational
+  * attributes (background, bgcolor, text, link, vlink, alink).
+  *
+  * When one is found, Report.USING_BODY is recorded in
+  * lexer.badLayout. A document without a body counts as "nice".
+  *
+  * @param lexer lexer supplying the tag table and receiving the flag
+  * @param doc   node whose body is looked up
+  * @return false if the body carries any of the attributes above
+  */
+ private boolean niceBody(Lexer lexer, Node doc)
+ {
+     Node body = doc.findBody(lexer.configuration.tt);
+
+     if (body == null)
+         return true;
+
+     String[] presentational =
+         { "background", "bgcolor", "text", "link", "vlink", "alink" };
+
+     for (int i = 0; i < presentational.length; ++i)
+     {
+         if (body.getAttrByName(presentational[i]) != null)
+         {
+             lexer.badLayout |= Report.USING_BODY;
+             return false;
+         }
+     }
+
+     return true;
+ }
+
+ /*
+ Create style element using rules from dictionary.
+
+ The rule text (body rules first, then one "tag.class {props}"
+ line per dictionary entry) is written into the lexer buffer
+ and wrapped in a text node inside an implicit
+ <style type="text/css"> element, which is appended to the
+ document head when a head is found.
+ */
+ private void createStyleElement(Lexer lexer, Node doc)
+ {
+     /* nothing to emit: no rules collected and no body attributes to move */
+     if (lexer.styles == null && niceBody(lexer, doc))
+         return;
+
+     Node styleNode = lexer.newNode(Node.StartTag, null, 0, 0, "style");
+     styleNode.implicit = true;
+
+     /* insert type attribute */
+     AttVal typeAttr = new AttVal(null, null, '"', "type", "text/css");
+     typeAttr.dict = AttributeTable.getDefaultAttributeTable().findAttribute(typeAttr);
+     styleNode.attributes = typeAttr;
+
+     Node body = doc.findBody(lexer.configuration.tt);
+
+     /* everything appended from here on becomes the style text */
+     lexer.txtstart = lexer.lexsize;
+
+     if (body != null)
+         cleanBodyAttrs(lexer, body);
+
+     Style rule = lexer.styles;
+
+     while (rule != null)
+     {
+         lexer.addCharToLexer(' ');
+         lexer.addStringLiteral(rule.tag);
+         lexer.addCharToLexer('.');
+         lexer.addStringLiteral(rule.tagClass);
+         lexer.addCharToLexer(' ');
+         lexer.addCharToLexer('{');
+         lexer.addStringLiteral(rule.properties);
+         lexer.addCharToLexer('}');
+         lexer.addCharToLexer('\n');
+         rule = rule.next;
+     }
+
+     lexer.txtend = lexer.lexsize;
+
+     Node text = lexer.newNode(Node.TextNode,
+                               lexer.lexbuf,
+                               lexer.txtstart,
+                               lexer.txtend);
+     Node.insertNodeAtEnd(styleNode, text);
+
+     /* now insert style element into document head, if there is one */
+     Node head = doc.findHEAD(lexer.configuration.tt);
+
+     if (head != null)
+         Node.insertNodeAtEnd(head, styleNode);
+ }
+
+ /*
+ Ensure bidirectional links are consistent: make the
+ neighbours (or, at either end, the parent's content/last
+ pointer) refer back to node, and make every child of node
+ point to node as its parent.
+ */
+ private void fixNodeLinks(Node node)
+ {
+     Node before = node.prev;
+     Node after = node.next;
+
+     if (before == null)
+         node.parent.content = node;
+     else
+         before.next = node;
+
+     if (after == null)
+         node.parent.last = node;
+     else
+         after.prev = node;
+
+     Node child = node.content;
+
+     while (child != null)
+     {
+         child.parent = node;
+         child = child.next;
+     }
+ }
+
+ /*
+ Used to strip child of node when the node has one and
+ only one child: the grandchildren are adopted by node
+ and the child is left without content.
+ */
+ private void stripOnlyChild(Node node)
+ {
+     Node only = node.content;
+
+     node.content = only.content;
+     node.last = only.last;
+     only.content = null;
+
+     Node adopted = node.content;
+
+     while (adopted != null)
+     {
+         adopted.parent = node;
+         adopted = adopted.next;
+     }
+ }
+
+ /*
+ Used to strip font start and end tags.
+
+ Unlinks element from its parent's child list. If element
+ has content, the content is spliced into the sibling list
+ in element's place and pnode is set to the first spliced
+ node; otherwise pnode is set to the node that followed
+ element (possibly null). element.next and element.content
+ are cleared afterwards.
+ */
+ private void discardContainer(Node element, MutableObject pnode)
+ {
+ Node node;
+ Node parent = element.parent;
+
+ if (element.content != null)
+ {
+ element.last.next = element.next; // tail of the content now leads to element's successor
+
+ if (element.next != null)
+ {
+ element.next.prev = element.last;
+ element.last.next = element.next; // repeats the assignment made just above
+ }
+ else
+ parent.last = element.last;
+
+ if (element.prev != null)
+ {
+ element.content.prev = element.prev;
+ element.prev.next = element.content;
+ }
+ else
+ parent.content = element.content;
+
+ // the content tail is already linked to element.next, so this walk
+ // also passes over the following siblings
+ for (node = element.content; node != null; node = node.next)
+ node.parent = parent;
+
+ pnode.setObject(element.content);
+ }
+ else
+ {
+ if (element.next != null)
+ element.next.prev = element.prev;
+ else
+ parent.last = element.prev;
+
+ if (element.prev != null)
+ element.prev.next = element.next;
+ else
+ parent.content = element.next;
+
+ pnode.setObject(element.next); // may be null when element was the last child
+ }
+
+ element.next = null;
+ element.content = null;
+ }
+
+ /*
+ Add style property to element, creating style
+ attribute as needed and adding ; delimiter.
+
+ When the element already has a style attribute the new
+ property is merged into it (an existing property of the
+ same name keeps its value); otherwise a new style
+ attribute is put at the front of the attribute list.
+ */
+ private void addStyleProperty(Node node, String property)
+ {
+     AttVal av;
+
+     for (av = node.attributes; av != null; av = av.next)
+     {
+         /* constant-first compare: attribute names may be null */
+         if ("style".equals(av.attribute))
+             break;
+     }
+
+     if (av != null)
+     {
+         /* style attribute already exists: insert property */
+         av.value = addProperty(av.value, property);
+     }
+     else
+     {
+         /* create new style attribute */
+         av = new AttVal(node.attributes, null, '"', "style", property);
+         av.dict = AttributeTable.getDefaultAttributeTable().findAttribute(av);
+         node.attributes = av;
+     }
+ }
+
+ /*
+ Create new string that consists of the
+ combined style properties in s1 and s2.
+
+ Both strings are parsed into one sorted list of
+ property/values, s1 first, so for a property name
+ that occurs in both the value from s1 is kept.
+ */
+ private String mergeProperties(String s1, String s2)
+ {
+     StyleProp combined = createProps(null, s1);
+
+     combined = createProps(combined, s2);
+     return createPropString(combined);
+ }
+
+ /**
+  * Merges the style attribute of child into the style attribute of
+  * node.
+  *
+  * Nothing happens when child has no style value. If node has no
+  * style attribute one is created at the front of its attribute list
+  * holding the child's style; if it has one, the two property lists
+  * are merged with node's own values taking precedence.
+  *
+  * @param node  element receiving the merged style
+  * @param child element whose style is merged in
+  */
+ private void mergeStyles(Node node, Node child)
+ {
+     AttVal av;
+     String childStyle = null;
+
+     for (av = child.attributes; av != null; av = av.next)
+     {
+         /* constant-first compare: attribute names may be null */
+         if ("style".equals(av.attribute))
+         {
+             childStyle = av.value;
+             break;
+         }
+     }
+
+     if (childStyle == null)
+         return;
+
+     AttVal own = null;
+
+     for (av = node.attributes; av != null; av = av.next)
+     {
+         if ("style".equals(av.attribute))
+         {
+             own = av;
+             break;
+         }
+     }
+
+     if (own == null)
+     {
+         /* copy style of child */
+         own = new AttVal(node.attributes, null, '"', "style", childStyle);
+         own.dict = AttributeTable.getDefaultAttributeTable().findAttribute(own);
+         node.attributes = own;
+     }
+     else if (own.value == null)
+     {
+         /* empty style attribute: reuse it instead of adding a duplicate */
+         own.value = childStyle;
+     }
+     else
+     {
+         /* merge styles from both */
+         own.value = mergeProperties(own.value, childStyle);
+     }
+ }
+
+ /**
+  * Maps the value of a font size attribute to a CSS font-size value.
+  *
+  * A leading digit 0-6 selects a fixed percentage (size 3 maps to
+  * null, meaning "no property needed"). Otherwise, if the second
+  * character is a digit 0-6 the result is a percentage obtained by
+  * scaling 100% by 0.8 per step when the value starts with '-', and
+  * by 1.2 per step for any other first character. Anything else
+  * yields "smaller" (leading '-') or "larger".
+  *
+  * @param size attribute value, e.g. "4", "+2" or "-1"
+  * @return the CSS value, or null for absolute size 3
+  */
+ private String fontSize2Name(String size)
+ {
+     String[] absolute =
+     {
+         "60%",
+         "70%",
+         "80%",
+         null,
+         "120%",
+         "150%",
+         "200%"
+     };
+
+     /* '\0' stands in for "no first character" and matches no test below */
+     char first = (size.length() > 0) ? size.charAt(0) : '\0';
+
+     if ('0' <= first && first <= '6')
+         return absolute[first - '0'];
+
+     boolean shrink = (first == '-');
+
+     if (size.length() > 1 &&
+         '0' <= size.charAt(1) && size.charAt(1) <= '6')
+     {
+         double step = shrink ? 0.8 : 1.2;
+         double scale = 1.0;
+
+         for (int n = size.charAt(1) - '0'; n > 0; --n)
+             scale *= step;
+
+         scale *= 100.0;
+         return "" + (int)scale + "%";
+     }
+
+     return shrink ? "smaller" : "larger";
+ }
+
+ /** Adds a "font-family" style property with the given face to node. */
+ private void addFontFace(Node node, String face)
+ {
+     String declaration = "font-family: " + face;
+
+     addStyleProperty(node, declaration);
+ }
+
+ /**
+  * Applies a font size to node.
+  *
+  * A paragraph with size 6, 5 or 4 is renamed to h1, h2 or h3 and its
+  * tag entry is looked up again; in every other case the size is
+  * translated by fontSize2Name and, unless that yields null, added as
+  * a "font-size" style property.
+  *
+  * @param node element to restyle
+  * @param size value of the size attribute
+  */
+ private void addFontSize(Node node, String size)
+ {
+     String heading = null;
+
+     if (size.equals("6"))
+         heading = "h1";
+     else if (size.equals("5"))
+         heading = "h2";
+     else if (size.equals("4"))
+         heading = "h3";
+
+     if (heading != null && node.tag == tt.tagP)
+     {
+         node.element = heading;
+         tt.findTag(node);
+         return;
+     }
+
+     String value = fontSize2Name(size);
+
+     if (value != null)
+         addStyleProperty(node, "font-size: " + value);
+ }
+
+ /** Adds a "color" style property with the given colour to node. */
+ private void addFontColor(Node node, String color)
+ {
+     String declaration = "color: " + color;
+
+     addStyleProperty(node, declaration);
+ }
+
+ /**
+  * Adds a "text-align" style property to node, with the alignment
+  * value forced to lower case.
+  *
+  * @param node  element to restyle
+  * @param align value of the align attribute
+  */
+ private void addAlign(Node node, String align)
+ {
+     /*
+     Lower-case with a fixed locale: under e.g. a Turkish default
+     locale "RIGHT" would otherwise become "r\u0131ght".
+     */
+     addStyleProperty(node,
+         "text-align: " + align.toLowerCase(java.util.Locale.ENGLISH));
+ }
+
+ /*
+ Add style properties to node corresponding to
+ the font face, size and color attributes found
+ in the attribute list starting at av.
+
+ Attributes without a name or without a value are
+ skipped, since there is nothing to translate.
+ */
+ private void addFontStyles(Node node, AttVal av)
+ {
+     for (; av != null; av = av.next)
+     {
+         if (av.attribute == null || av.value == null)
+             continue;
+
+         if (av.attribute.equals("face"))
+             addFontFace(node, av.value);
+         else if (av.attribute.equals("size"))
+             addFontSize(node, av.value);
+         else if (av.attribute.equals("color"))
+             addFontColor(node, av.value);
+     }
+ }
+
+ /*
+ Symptom: <p align=center>
+ Action: <p style="text-align: center">
+
+ The first align attribute is unlinked from the node; if
+ it has a value, that value becomes a text-align property.
+ */
+ private void textAlign(Lexer lexer, Node node)
+ {
+     AttVal prev = null;
+
+     for (AttVal av = node.attributes; av != null; av = av.next)
+     {
+         /* constant-first compare: attribute names may be null */
+         if ("align".equals(av.attribute))
+         {
+             if (prev != null)
+                 prev.next = av.next;
+             else
+                 node.attributes = av.next;
+
+             if (av.value != null)
+                 addAlign(node, av.value);
+
+             break;
+         }
+
+         prev = av;
+     }
+ }
+
+ /*
+ The clean up rules use the pnode argument to return the
+ next node when the original node has been deleted
+ */
+
+ /*
+ Symptom: <dir> <li> where <li> is only child
+ Action: coerce <dir> <li> to <div> with indent.
+
+ Applies to dir, ul and ol whose single child is an
+ implicit li; pnode is not modified.
+ */
+
+ private boolean dir2Div(Lexer lexer, Node node, MutableObject pnode)
+ {
+     boolean isList = node.tag == tt.tagDir ||
+                      node.tag == tt.tagUl ||
+                      node.tag == tt.tagOl;
+
+     if (!isList)
+         return false;
+
+     Node item = node.content;
+
+     /* need exactly one child, and it must be an implicit <li> */
+     if (item == null || item.next != null)
+         return false;
+
+     if (item.tag != tt.tagLi || !item.implicit)
+         return false;
+
+     /* coerce dir to div */
+     node.tag = tt.tagDiv;
+     node.element = "div";
+     addStyleProperty(node, "margin-left: 2em");
+     stripOnlyChild(node);
+     return true;
+ }
+
+ /*
+ Symptom: <center>
+ Action: replace <center> by <div style="text-align: center">
+
+ When the DropFontTags option is set the <center> element
+ is discarded instead and an inferred <br> is linked in
+ after its former content (or in its place when it was
+ empty); pnode then holds whatever discardContainer set.
+ Returns true whenever node is a <center> element.
+ */
+
+ private boolean center2Div(Lexer lexer, Node node, MutableObject pnode)
+ {
+ if (node.tag == tt.tagCenter)
+ {
+ if (lexer.configuration.DropFontTags)
+ {
+ if (node.content != null)
+ {
+ Node last = node.last; // remembered before the container is unlinked
+ Node parent = node.parent;
+
+ discardContainer(node, pnode);
+
+ node = lexer.inferredTag("br"); // from here on node is the new <br>
+
+ if (last.next != null)
+ last.next.prev = node;
+
+ node.next = last.next;
+ last.next = node;
+ node.prev = last;
+
+ if (parent.last == last)
+ parent.last = node;
+
+ node.parent = parent;
+ }
+ else
+ {
+ Node prev = node.prev;
+ Node next = node.next;
+ Node parent = node.parent;
+ discardContainer(node, pnode); // pnode becomes the old successor, not the <br>
+
+ node = lexer.inferredTag("br");
+ node.next = next;
+ node.prev = prev;
+ node.parent = parent;
+
+ if (next != null)
+ next.prev = node;
+ else
+ parent.last = node;
+
+ if (prev != null)
+ prev.next = node;
+ else
+ parent.content = node;
+ }
+
+ return true;
+ }
+ node.tag = tt.tagDiv; // default: keep the element, retagged as div
+ node.element = "div";
+ addStyleProperty(node, "text-align: center");
+ return true;
+ }
+
+ return false;
+ }
+
+ /*
+ Symptom <div><div>...</div></div>
+ Action: merge the two divs
+
+ This is useful after nested <dir>s used by Word
+ for indenting have been converted to <div>s.
+ The inner div must be the only child; its style is
+ merged into the outer div before it is stripped.
+ */
+ private boolean mergeDivs(Lexer lexer, Node node, MutableObject pnode)
+ {
+     if (node.tag != tt.tagDiv)
+         return false;
+
+     Node inner = node.content;
+
+     if (inner == null || inner.tag != tt.tagDiv || inner.next != null)
+         return false;
+
+     mergeStyles(node, inner);
+     stripOnlyChild(node);
+     return true;
+ }
+
+ /*
+ Symptom: <ul><li><ul>...</ul></li></ul>
+ Action: discard outer list
+
+ Applies when node is a ul/ol with a single child whose
+ first child has the same tag as node. pnode is set to the
+ node that followed the outer list (possibly null) before
+ the inner list is moved into the outer list's position.
+ */
+
+ private boolean nestedList(Lexer lexer, Node node, MutableObject pnode)
+ {
+ Node child, list;
+
+ if (node.tag == tt.tagUl || node.tag == tt.tagOl)
+ {
+ child = node.content;
+
+ if (child == null)
+ return false;
+
+ /* check child has no peers */
+
+ if (child.next != null)
+ return false;
+
+ list = child.content; // first child of the single item
+
+ if (list == null)
+ return false;
+
+ if (list.tag != node.tag)
+ return false;
+
+ pnode.setObject(node.next);
+
+ /* move inner list node into position of outer node */
+ list.prev = node.prev;
+ list.next = node.next;
+ list.parent = node.parent;
+ fixNodeLinks(list);
+
+ /* get rid of outer ul and its li */
+ child.content = null;
+ node.content = null;
+ node.next = null;
+
+ /*
+ If prev node was a list the chances are this node
+ should be appended to that list. Word has no way of
+ recognizing nested lists and just uses indents
+ */
+
+ if (list.prev != null)
+ {
+ node = list; // node: the relocated inner list
+ list = node.prev; // list: the sibling in front of it
+
+ if (list.tag == tt.tagUl || list.tag == tt.tagOl)
+ {
+ list.next = node.next;
+
+ if (list.next != null)
+ list.next.prev = list;
+
+ child = list.last; /* <li> */ // NOTE(review): dereferenced below, so this assumes the previous list is not empty
+
+ node.parent = child;
+ node.next = null;
+ node.prev = child.last;
+ fixNodeLinks(node);
+ }
+ }
+
+ cleanNode(lexer, node); // NOTE(review): node is still the detached outer list here when list.prev was null; result is ignored
+ return true;
+ }
+
+ return false;
+ }
+
+ /*
+ Symptom: the only child of a block-level element is a
+ presentation element such as B, I or FONT
+
+ Action: add style "font-weight: bold" to the block and
+ strip the <b> element, leaving its children.
+
+ example:
+
+ <p>
+ <b><font face="Arial" size="6">Draft Recommended Practice</font></b>
+ </p>
+
+ becomes:
+
+ <p style="font-weight: bold; font-family: Arial; font-size: 6">
+ Draft Recommended Practice
+ </p>
+
+ This code also replaces the align attribute by a style attribute.
+ However, to avoid CSS problems with Navigator 4, this isn't done
+ for the elements: caption, tr and table
+
+ Elements without a tag table entry (node.tag == null) are
+ left alone.
+ */
+ private boolean blockStyle(Lexer lexer, Node node, MutableObject pnode)
+ {
+     /* an element with no tag entry has no content model to test */
+     if (node.tag == null)
+         return false;
+
+     if ((node.tag.model & (Dict.CM_BLOCK | Dict.CM_LIST | Dict.CM_DEFLIST | Dict.CM_TABLE)) == 0)
+         return false;
+
+     if (node.tag == tt.tagTable
+         || node.tag == tt.tagTr
+         || node.tag == tt.tagLi)
+         return false;
+
+     /* check for align attribute */
+     if (node.tag != tt.tagCaption)
+         textAlign(lexer, node);
+
+     Node child = node.content;
+
+     /* need exactly one child */
+     if (child == null || child.next != null)
+         return false;
+
+     if (child.tag == tt.tagB)
+     {
+         mergeStyles(node, child);
+         addStyleProperty(node, "font-weight: bold");
+         stripOnlyChild(node);
+         return true;
+     }
+
+     if (child.tag == tt.tagI)
+     {
+         mergeStyles(node, child);
+         addStyleProperty(node, "font-style: italic");
+         stripOnlyChild(node);
+         return true;
+     }
+
+     if (child.tag == tt.tagFont)
+     {
+         mergeStyles(node, child);
+         addFontStyles(node, child.attributes);
+         stripOnlyChild(node);
+         return true;
+     }
+
+     return false;
+ }
+
+ /*
+ The only child of table cell or an inline element such as em
+ is a presentation element (B, I or FONT): fold it into a style
+ attribute on the parent and strip it. B and I are only folded
+ when the LogicalEmphasis option is set. Elements without a tag
+ table entry (node.tag == null) and FONT elements are left alone.
+ */
+ private boolean inlineStyle(Lexer lexer, Node node, MutableObject pnode)
+ {
+     /* an element with no tag entry has no content model to test */
+     if (node.tag == null || node.tag == tt.tagFont)
+         return false;
+
+     if ((node.tag.model & (Dict.CM_INLINE|Dict.CM_ROW)) == 0)
+         return false;
+
+     Node child = node.content;
+
+     /* need exactly one child */
+     if (child == null || child.next != null)
+         return false;
+
+     if (child.tag == tt.tagB && lexer.configuration.LogicalEmphasis)
+     {
+         mergeStyles(node, child);
+         addStyleProperty(node, "font-weight: bold");
+         stripOnlyChild(node);
+         return true;
+     }
+
+     if (child.tag == tt.tagI && lexer.configuration.LogicalEmphasis)
+     {
+         mergeStyles(node, child);
+         addStyleProperty(node, "font-style: italic");
+         stripOnlyChild(node);
+         return true;
+     }
+
+     if (child.tag == tt.tagFont)
+     {
+         mergeStyles(node, child);
+         addFontStyles(node, child.attributes);
+         stripOnlyChild(node);
+         return true;
+     }
+
+     return false;
+ }
+
+ /*
+ Replace font elements by span elements, deleting
+ the font element's attributes and replacing them
+ by a single style attribute.
+
+ With the DropFontTags option the font element is
+ discarded instead (pnode receives the continuation
+ node) and false is returned. A font element that is
+ the only child of its parent is left alone.
+ */
+ private boolean font2Span(Lexer lexer, Node node, MutableObject pnode)
+ {
+     if (node.tag != tt.tagFont)
+         return false;
+
+     if (lexer.configuration.DropFontTags)
+     {
+         discardContainer(node, pnode);
+         return false;
+     }
+
+     /* if FONT is only child of parent element then leave alone */
+     if (node.parent.content == node
+         && node.next == null)
+         return false;
+
+     addFontStyles(node, node.attributes);
+
+     /* extract style attribute and drop the rest */
+     AttVal style = null;
+     AttVal av = node.attributes;
+
+     while (av != null)
+     {
+         AttVal next = av.next;
+
+         /* constant-first compare: attribute names may be null */
+         if ("style".equals(av.attribute))
+         {
+             av.next = null;
+             style = av;
+         }
+
+         av = next;
+     }
+
+     node.attributes = style;
+
+     node.tag = tt.tagSpan;
+     node.element = "span";
+
+     return true;
+ }
+
+ /*
+ Applies all matching rules to a node.
+
+ The rules are tried in a fixed order; whenever one reports
+ that it changed something, the whole sequence is restarted
+ on the node that rule handed back. A rule may hand back
+ null (for instance after discarding the last child of a
+ parent), in which case processing stops and null is
+ returned. Otherwise the node at which no rule applied any
+ more is returned.
+ */
+ private Node cleanNode(Lexer lexer, Node node)
+ {
+     Node next = null;
+     MutableObject o = new MutableObject();
+     boolean b = false;
+
+     /* node can become null when a rule discards it and nothing follows */
+     for (next = node; node != null && node.isElement(); node = next)
+     {
+         o.setObject(next);
+
+         b = dir2Div(lexer, node, o);
+         next = (Node)o.getObject();
+         if (b)
+             continue;
+
+         b = nestedList(lexer, node, o);
+         next = (Node)o.getObject();
+         if (b)
+             continue;
+
+         b = center2Div(lexer, node, o);
+         next = (Node)o.getObject();
+         if (b)
+             continue;
+
+         b = mergeDivs(lexer, node, o);
+         next = (Node)o.getObject();
+         if (b)
+             continue;
+
+         b = blockStyle(lexer, node, o);
+         next = (Node)o.getObject();
+         if (b)
+             continue;
+
+         b = inlineStyle(lexer, node, o);
+         next = (Node)o.getObject();
+         if (b)
+             continue;
+
+         b = font2Span(lexer, node, o);
+         next = (Node)o.getObject();
+         if (b)
+             continue;
+
+         break;
+     }
+
+     return next;
+ }
+
+ /**
+  * Applies the clean-up rules bottom-up: first to every child of
+  * node, then to node itself.
+  *
+  * @param lexer lexer passed through to the rules
+  * @param node  root of the subtree to clean
+  * @return the node cleanNode returns for node, which may differ
+  *         from node or be null when a rule removed it
+  */
+ private Node createStyleProperties(Lexer lexer, Node node)
+ {
+     for (Node child = node.content; child != null; child = child.next)
+     {
+         /* continue from whatever node the rules left in child's place */
+         child = createStyleProperties(lexer, child);
+
+         /* the rules removed the child and nothing follows it */
+         if (child == null)
+             break;
+     }
+
+     return cleanNode(lexer, node);
+ }
+
+ /**
+  * Replaces style attributes by class attributes throughout the
+  * subtree, children before the node itself.
+  *
+  * @param lexer lexer holding the style dictionary
+  * @param node  root of the subtree to process
+  */
+ private void defineStyleRules(Lexer lexer, Node node)
+ {
+     Node child = node.content;
+
+     while (child != null)
+     {
+         defineStyleRules(lexer, child);
+         child = child.next;
+     }
+
+     style2Rule(lexer, node);
+ }
+
+ /**
+  * Runs the clean-up rules over the whole tree and, when the
+  * MakeClean option is not set, turns the resulting style
+  * attributes into class rules collected in a style element.
+  *
+  * @param lexer lexer supplying configuration and the style dictionary
+  * @param doc   root node of the document
+  */
+ public void cleanTree(Lexer lexer, Node doc)
+ {
+     Node root = createStyleProperties(lexer, doc);
+
+     if (lexer.configuration.MakeClean)
+         return;
+
+     defineStyleRules(lexer, root);
+     createStyleElement(lexer, root);
+ }
+
+ /*
+ Simplifies <b><b> ... </b> ...</b> etc.: a b or i element
+ whose parent has the same tag is stripped, leaving its
+ content in place. Walks node, its following siblings and
+ their descendants.
+ */
+ public void nestedEmphasis(Node node)
+ {
+     MutableObject holder = new MutableObject();
+     Node current = node;
+
+     while (current != null)
+     {
+         Node following = current.next;
+         boolean emphasis = current.tag == tt.tagB || current.tag == tt.tagI;
+
+         if (emphasis
+             && current.parent != null && current.parent.tag == current.tag)
+         {
+             /* strip redundant inner element and resume where discardContainer says */
+             holder.setObject(following);
+             discardContainer(current, holder);
+             current = (Node)holder.getObject();
+         }
+         else
+         {
+             if (current.content != null)
+                 nestedEmphasis(current.content);
+
+             current = following;
+         }
+     }
+ }
+
+ /*
+ Replace i by em and b by strong, for node, its
+ following siblings and all their descendants.
+ */
+ public void emFromI(Node node)
+ {
+     for (Node current = node; current != null; current = current.next)
+     {
+         if (current.tag == tt.tagI)
+         {
+             current.element = tt.tagEm.name;
+             current.tag = tt.tagEm;
+         }
+         else if (current.tag == tt.tagB)
+         {
+             current.element = tt.tagStrong.name;
+             current.tag = tt.tagStrong;
+         }
+
+         if (current.content != null)
+             emFromI(current.content);
+     }
+ }
+
+ /*
+ Some people use dir or ul without an li
+ to indent the content. The pattern to
+ look for is a list with a single implicit
+ li. This is recursively replaced by an
+ implicit blockquote.
+ */
+ public void list2BQ(Node node)
+ {
+ while (node != null)
+ {
+ if (node.content != null)
+ list2BQ(node.content);
+
+ if (node.tag != null && node.tag.parser == ParserImpl.getParseList() &&
+ node.hasOneChild() && node.content.implicit)
+ {
+ stripOnlyChild(node);
+ node.element = tt.tagBlockquote.name;
+ node.tag = tt.tagBlockquote;
+ node.implicit = true;
+ }
+
+ node = node.next;
+ }
+ }
+
+ /*
+ Replace implicit blockquote by div with an indent
+ taking care to reduce nested blockquotes to a single
+ div with the indent set to match the nesting depth
+ */
+ public void bQ2Div(Node node)
+ {
+ int indent;
+ String indent_buf;
+
+ while (node != null)
+ {
+ if (node.tag == tt.tagBlockquote && node.implicit)
+ {
+ indent = 1;
+
+ while(node.hasOneChild() &&
+ node.content.tag == tt.tagBlockquote &&
+ node.implicit)
+ {
+ ++indent;
+ stripOnlyChild(node);
+ }
+
+ if (node.content != null)
+ bQ2Div(node.content);
+
+ indent_buf = "margin-left: " +
+ (new Integer(2*indent)).toString() + "em";
+
+ node.element = tt.tagDiv.name;
+ node.tag = tt.tagDiv;
+ node.addAttribute("style", indent_buf);
+ }
+ else if (node.content != null)
+ bQ2Div(node.content);
+
+
+ node = node.next;
+ }
+ }
+
+ /*
+ node is <![if ...]> prune up to <![endif]>
+
+ Discards node and the nodes after it until a section tag
+ whose text starts with "endif" has been discarded too, and
+ returns what Node.discardElement returned for that endif
+ tag. Nested "if" sections are pruned recursively. Returns
+ null when the nodes run out before an endif is found.
+ */
+ public Node pruneSection(Lexer lexer, Node node)
+ {
+ for (;;)
+ {
+ /* discard node and returns next */
+ node = Node.discardElement(node);
+
+ if (node == null)
+ return null;
+
+ if (node.type == Node.SectionTag)
+ {
+ if ((Lexer.getString(node.textarray, node.start, 2)).equals("if"))
+ {
+ node = pruneSection(lexer, node); // NOTE(review): the result (possibly null) is passed straight to discardElement at the top of the loop - confirm that is intended
+ continue;
+ }
+
+ if ((Lexer.getString(node.textarray, node.start, 5)).equals("endif"))
+ {
+ node = Node.discardElement(node);
+ break;
+ }
+ }
+ }
+
+ return node;
+ }
+
+ public void dropSections(Lexer lexer, Node node)
+ {
+ while (node != null)
+ {
+ if (node.type == Node.SectionTag)
+ {
+ /* prune up to matching endif */
+ if ((Lexer.getString(node.textarray, node.start, 2)).equals("if"))
+ {
+ node = pruneSection(lexer, node);
+ continue;
+ }
+
+ /* discard others as well */
+ node = Node.discardElement(node);
+ continue;
+ }
+
+ if (node.content != null)
+ dropSections(lexer, node.content);
+
+ node = node.next;
+ }
+ }
+
+ /**
+  * Unlinks Word-specific and presentational attributes from node:
+  * class, style, lang, anything starting with "x:", and height/width
+  * on td, tr and th. A class="Code" attribute is kept because it
+  * denotes pre text.
+  *
+  * @param node element whose attribute list is filtered in place
+  */
+ public void purgeAttributes(Node node)
+ {
+     AttVal kept = null; /* last attribute that stays in the list */
+     AttVal attr = node.attributes;
+
+     while (attr != null)
+     {
+         AttVal following = attr.next;
+         String name = attr.attribute;
+
+         /* special check for class="Code" denoting pre text */
+         boolean codeClass = name != null &&
+                             attr.value != null &&
+                             name.equals("class") &&
+                             attr.value.equals("Code");
+
+         boolean cellSize = name != null &&
+             (name.equals("height") || name.equals("width")) &&
+             (node.tag == tt.tagTd || node.tag == tt.tagTr || node.tag == tt.tagTh);
+
+         boolean purge = !codeClass && name != null &&
+             (name.equals("class") ||
+              name.equals("style") ||
+              name.equals("lang") ||
+              name.startsWith("x:") ||
+              cellSize);
+
+         if (!purge)
+             kept = attr;
+         else if (kept != null)
+             kept.next = following;
+         else
+             node.attributes = following;
+
+         attr = following;
+     }
+ }
+
+ /*
+ Word2000 uses span excessively, so we strip span out.
+
+ The span's content is first cleaned with cleanWord2000,
+ then moved node by node into the span's sibling list, and
+ finally the emptied span is discarded. Returns span.next
+ as read just before the span is discarded.
+ */
+ public Node stripSpan(Lexer lexer, Node span)
+ {
+ Node node;
+ Node prev = null;
+ Node content;
+
+ /*
+ deal with span elements that have content
+ by splicing the content in place of the span
+ after having processed it
+ */
+
+ cleanWord2000(lexer, span.content);
+ content = span.content; // re-read after cleaning
+
+ if (span.prev != null)
+ prev = span.prev;
+ else if (content != null)
+ {
+ node = content; // no previous sibling: the first content node goes in front of the span
+ content = content.next;
+ Node.removeNode(node);
+ Node.insertNodeBeforeElement(span, node);
+ prev = node;
+ }
+
+ while (content != null)
+ {
+ node = content; // each remaining node goes after the one placed before it
+ content = content.next;
+ Node.removeNode(node);
+ Node.insertNodeAfterElement(prev, node);
+ prev = node;
+ }
+
+ if (span.next == null)
+ span.parent.last = prev;
+
+ node = span.next;
+ span.content = null;
+ Node.discardElement(span);
+ return node;
+ }
+
+ /*
+ Map non-breaking spaces to regular spaces in every text
+ node among node, its following siblings and their
+ descendants.
+
+ The text is decoded and re-encoded in place. Because a
+ non-breaking space is re-encoded as a plain space the text
+ can get shorter, so the node's end offset is moved to the
+ end of the rewritten text.
+ */
+ private void normalizeSpaces(Lexer lexer, Node node)
+ {
+     while (node != null)
+     {
+         if (node.content != null)
+             normalizeSpaces(lexer, node.content);
+
+         if (node.type == Node.TextNode)
+         {
+             MutableInteger c = new MutableInteger();
+             int p = node.start; /* write position, never ahead of the read position */
+
+             for (int i = node.start; i < node.end; ++i)
+             {
+                 /* NOTE(review): if textarray holds signed bytes this cast yields
+                    negative values for bytes >= 0x80 and the test below never
+                    fires - confirm the element type */
+                 c.value = (int)node.textarray[i];
+
+                 /* look for UTF-8 multibyte character */
+                 if (c.value > 0x7F)
+                     i += PPrint.getUTF8(node.textarray, i, c);
+
+                 if (c.value == 160)
+                     c.value = ' ';
+
+                 p = PPrint.putUTF8(node.textarray, p, c.value);
+             }
+
+             /* drop the stale bytes left behind when the text shrank */
+             node.end = p;
+         }
+
+         node = node.next;
+     }
+ }
+
+ /*
+ This is a major clean up to strip out all the extra stuff you get
+ when you save as web page from Word 2000. It doesn't yet know what
+ to do with VML tags, but these will appear as errors unless you
+ declare them as new tags, such as o:p which needs to be declared
+ as inline.
+
+ Processes node, its following siblings and their descendants.
+ Runs of <p class="MsoListBullet"> become one <ul>, runs of
+ <p class="Code"> become one <pre>. Processing stops at an html
+ element that lacks the xmlns:o attribute.
+ */
+ public void cleanWord2000(Lexer lexer, Node node)
+ {
+     /* used to build a list from a sequence of bulleted p's */
+     Node list = null;
+
+     while (node != null)
+     {
+         /* discard Word's style verbiage */
+         if (node.tag == tt.tagStyle ||
+             node.tag == tt.tagMeta ||
+             node.type == Node.CommentTag)
+         {
+             node = Node.discardElement(node);
+             continue;
+         }
+
+         /* strip out all span tags Word scatters so liberally! */
+         if (node.tag == tt.tagSpan)
+         {
+             node = stripSpan(lexer, node);
+             continue;
+         }
+
+         /* get rid of Word's xmlns attributes */
+         if (node.tag == tt.tagHtml)
+         {
+             /* check that it's a Word 2000 document */
+             if (node.getAttrByName("xmlns:o") == null)
+                 return;
+         }
+
+         if (node.tag == tt.tagLink)
+         {
+             AttVal attr = node.getAttrByName("rel");
+
+             if (attr != null && attr.value != null &&
+                 attr.value.equals("File-List"))
+             {
+                 node = Node.discardElement(node);
+                 continue;
+             }
+         }
+
+         /* discard empty paragraphs */
+         if (node.content == null && node.tag == tt.tagP)
+         {
+             node = Node.discardElement(node);
+             continue;
+         }
+
+         if (node.tag == tt.tagP)
+         {
+             AttVal attr = node.getAttrByName("class");
+
+             /* map sequence of <p class="MsoListBullet"> to <ul>...</ul> */
+             if (attr != null && attr.value != null &&
+                 attr.value.equals("MsoListBullet"))
+             {
+                 Node.coerceNode(lexer, node, tt.tagLi);
+
+                 if (list == null || list.tag != tt.tagUl)
+                 {
+                     list = lexer.inferredTag("ul");
+                     Node.insertNodeBeforeElement(node, list);
+                 }
+
+                 purgeAttributes(node);
+
+                 if (node.content != null)
+                     cleanWord2000(lexer, node.content);
+
+                 /* remove node and append to contents of list */
+                 Node.removeNode(node);
+                 Node.insertNodeAtEnd(list, node);
+
+                 /*
+                 The moved node is fully processed. Resume with the
+                 sibling after the list so that it gets classified
+                 like any other node (it may be the next bullet, or
+                 there may be none at all).
+                 */
+                 node = list.next;
+                 continue;
+             }
+             /* map sequence of <p class="Code"> to <pre>...</pre> */
+             else if (attr != null && attr.value != null &&
+                      attr.value.equals("Code"))
+             {
+                 Node br = lexer.newLineNode();
+                 normalizeSpaces(lexer, node);
+
+                 if (list == null || list.tag != tt.tagPre)
+                 {
+                     list = lexer.inferredTag("pre");
+                     Node.insertNodeBeforeElement(node, list);
+                 }
+
+                 /* remove node and append to contents of list */
+                 Node.removeNode(node);
+                 Node.insertNodeAtEnd(list, node);
+                 stripSpan(lexer, node);
+                 Node.insertNodeAtEnd(list, br);
+
+                 /* as above: resume with the sibling after the <pre> */
+                 node = list.next;
+                 continue;
+             }
+             else
+                 list = null;
+         }
+         else
+             list = null;
+
+         /* strip out style and class attributes */
+         if (node.type == Node.StartTag || node.type == Node.StartEndTag)
+             purgeAttributes(node);
+
+         if (node.content != null)
+             cleanWord2000(lexer, node.content);
+
+         node = node.next;
+     }
+ }
+
+ /**
+  * Tells whether the document looks like Word 2000 output, i.e. its
+  * html element carries an xmlns:o attribute.
+  *
+  * @param root root node of the document
+  * @param tt   tag table used to locate the html element
+  * @return true if an html element with xmlns:o is present
+  */
+ public boolean isWord2000(Node root, TagTable tt)
+ {
+     Node html = root.findHTML(tt);
+
+     if (html == null)
+         return false;
+
+     return html.getAttrByName("xmlns:o") != null;
+ }
+}