experimental jtidy version adapted for phpeclipse
[phpeclipse.git] / net.sourceforge.phpeclipse / src / net / sourceforge / phpdt / tidy / Clean.java
diff --git a/net.sourceforge.phpeclipse/src/net/sourceforge/phpdt/tidy/Clean.java b/net.sourceforge.phpeclipse/src/net/sourceforge/phpdt/tidy/Clean.java
new file mode 100644 (file)
index 0000000..66d2331
--- /dev/null
@@ -0,0 +1,1779 @@
+/*
+ * @(#)Clean.java   1.11 2000/08/16
+ *
+ */
+
+package net.sourceforge.phpdt.tidy;
+
+/**
+ *
+ * Clean up misuse of presentation markup
+ *
+ * (c) 1998-2000 (W3C) MIT, INRIA, Keio University
+ * See Tidy.java for the copyright notice.
+ * Derived from <a href="http://www.w3.org/People/Raggett/tidy">
+ * HTML Tidy Release 4 Aug 2000</a>
+ *
+ * @author  Dave Raggett <dsr@w3.org>
+ * @author  Andy Quick <ac.quick@sympatico.ca> (translation to Java)
+ * @version 1.0, 1999/05/22
+ * @version 1.0.1, 1999/05/29
+ * @version 1.1, 1999/06/18 Java Bean
+ * @version 1.2, 1999/07/10 Tidy Release 7 Jul 1999
+ * @version 1.3, 1999/07/30 Tidy Release 26 Jul 1999
+ * @version 1.4, 1999/09/04 DOM support
+ * @version 1.5, 1999/10/23 Tidy Release 27 Sep 1999
+ * @version 1.6, 1999/11/01 Tidy Release 22 Oct 1999
+ * @version 1.7, 1999/12/06 Tidy Release 30 Nov 1999
+ * @version 1.8, 2000/01/22 Tidy Release 13 Jan 2000
+ * @version 1.9, 2000/06/03 Tidy Release 30 Apr 2000
+ * @version 1.10, 2000/07/22 Tidy Release 8 Jul 2000
+ * @version 1.11, 2000/08/16 Tidy Release 4 Aug 2000
+ */
+
+/*
+  Filters from other formats such as Microsoft Word
+  often make excessive use of presentation markup such
+  as font tags, B, I, and the align attribute. By applying
+  a set of production rules, it is straightforward to
+  transform this to use CSS.
+
+  Some rules replace some of the children of an element by
+  style properties on the element, e.g.
+
+  <p><b>...</b></p> -> <p style="font-weight: bold">...</p>
+
+  Such rules are applied to the element's content and then
+  to the element itself until no more rules apply.
+  Having applied all the rules to an element, it will have
+  a style attribute with one or more properties. 
+
+  Other rules strip the element they apply to, replacing
+  it by style properties on the contents, e.g.
+  
+  <dir><li><p>...</li></dir> -> <p style="margin-left: 1em">...
+      
+  These rules are applied to an element before processing
+  its content and replace the current element by the first
+  element in the exposed content.
+
+  After applying both sets of rules, you can replace the
+  style attribute by a class value and style rule in the
+  document head. To support this, an association of styles
+  and class names is built.
+
+  A naive approach is to rely on string matching to test
+  when two property lists are the same. A better approach
+  would be to first sort the properties before matching.
+*/
+
+public class Clean {
+
+    private int classNum = 1;
+
+    private TagTable tt;
+
+    public Clean(TagTable tt)
+    {
+      this.tt = tt;
+    }
+
+    /**
+     * Inserts a name/value pair into a property list that is kept sorted
+     * by property name. When the name is already present the list is left
+     * untouched, i.e. the value already stored wins.
+     *
+     * @param props head of the sorted list, or null for an empty list
+     * @param name  name of the property to insert
+     * @param value value of the property to insert
+     * @return head of the list after the insertion
+     */
+    private StyleProp insertProperty(StyleProp props, String name,
+                                            String value)
+    {
+        StyleProp head = props;
+        StyleProp before = null;
+        StyleProp cursor = props;
+
+        /* walk to the first entry whose name is not smaller than the new one */
+        while (cursor != null)
+        {
+            int order = cursor.name.compareTo(name);
+
+            if (order == 0)
+            {
+                /* already defined: keep the existing value */
+                return head;
+            }
+
+            if (order > 0)
+            {
+                break;
+            }
+
+            before = cursor;
+            cursor = cursor.next;
+        }
+
+        /* link the new entry in front of cursor, or at the tail */
+        StyleProp created;
+
+        if (cursor != null)
+        {
+            created = new StyleProp(name, value, cursor);
+        }
+        else
+        {
+            created = new StyleProp(name, value);
+        }
+
+        if (before == null)
+        {
+            head = created;
+        }
+        else
+        {
+            before.next = created;
+        }
+
+        return head;
+    }
+
+    /*
+     Create sorted linked list of properties from style string.
+     Each "name: value" pair is located by scanning for ':' and ';'
+     and copied out with substring(), so the style string itself is
+     never modified. (The original C code temporarily wrote nulls
+     over the delimiters of a copy; the Java port has no need to.)
+    */
+    private StyleProp createProps(StyleProp prop, String style)
+    {
+        int name_end;
+        int value_end;
+        int value_start = 0;
+        int name_start = 0;
+        boolean more;
+
+        name_start = 0;
+        while (name_start < style.length())
+        {
+            while (name_start < style.length() &&
+                       style.charAt(name_start) == ' ')
+                ++name_start;
+
+            name_end = name_start;
+
+            while (name_end < style.length())
+            {
+                if (style.charAt(name_end) == ':')
+                {
+                    value_start = name_end + 1;
+                    break;
+                }
+
+                ++name_end;
+            }
+
+            if (name_end >= style.length() || style.charAt(name_end) != ':')
+                break;
+
+            while (value_start < style.length() &&
+                       style.charAt(value_start) == ' ')
+                ++value_start;
+
+            value_end = value_start;
+            more = false;
+
+            while (value_end < style.length())
+            {
+                if (style.charAt(value_end) == ';')
+                {
+                    more = true;
+                    break;
+                }
+
+                ++value_end;
+            }
+
+            prop = insertProperty(prop,
+                                  style.substring(name_start, name_end),
+                                  style.substring(value_start, value_end));
+
+            if (more)
+            {
+                name_start = value_end + 1;
+                continue;
+            }
+
+            break;
+        }
+
+        return prop;
+    }
+
+    /**
+     * Serialises a property list as "name: value; name: value".
+     * An empty (null) list yields the empty string.
+     *
+     * @param props head of the property list, may be null
+     * @return the style text for the list
+     */
+    private String createPropString(StyleProp props)
+    {
+        /*
+         The C original pre-computed the buffer length for malloc; that
+         value was never used here and has been dropped. StringBuffer is
+         used rather than repeated String.concat to avoid re-copying the
+         text for every property (and keeps the code usable on old JDKs).
+        */
+        StringBuffer style = new StringBuffer();
+
+        for (StyleProp prop = props; prop != null; prop = prop.next)
+        {
+            style.append(prop.name).append(": ").append(prop.value);
+
+            /* the separator goes between properties only, not after the last */
+            if (prop.next != null)
+                style.append("; ");
+        }
+
+        return style.toString();
+    }
+
+    /*
+      create string with merged properties
+    */
+    /**
+     * Combines an existing style string with an additional property string.
+     * Properties already present in style take precedence over those in
+     * property, because style is inserted into the list first.
+     *
+     * @param style    existing style text
+     * @param property property text to add
+     * @return the combined style text, ordered by property name
+     */
+    private String addProperty(String style, String property)
+    {
+        StyleProp merged = createProps(createProps(null, style), property);
+
+        return createPropString(merged);
+    }
+
+    /**
+     * Generates the next class name of the form "c" + counter and
+     * advances the counter.
+     *
+     * @param tag element name; not used when building the name
+     * @return the generated class name
+     */
+    private String gensymClass(String tag)
+    {
+        return "c" + classNum++;
+    }
+
+    /**
+     * Looks up the class name recorded for a tag/properties pair in the
+     * lexer's style list, registering a newly generated class at the head
+     * of that list when the pair has not been seen before.
+     *
+     * @param lexer      lexer holding the style list
+     * @param tag        element name
+     * @param properties style text, compared by exact string equality
+     * @return the class name associated with the pair
+     */
+    private String findStyle(Lexer lexer, String tag, String properties)
+    {
+        Style known = lexer.styles;
+
+        while (known != null)
+        {
+            boolean sameTag = known.tag.equals(tag);
+
+            if (sameTag && known.properties.equals(properties))
+            {
+                return known.tagClass;
+            }
+
+            known = known.next;
+        }
+
+        /* not seen before: register a new entry in front of the list */
+        Style created = new Style(tag, gensymClass(tag), properties, lexer.styles);
+        lexer.styles = created;
+        return created.tagClass;
+    }
+
+    /*
+     Find style attribute in node, and replace it
+     by corresponding class attribute. Search for
+     class in style dictionary otherwise gensym
+     new class and add to dictionary.
+
+     If the node already has a class attribute, the new
+     class name is appended to it after a space.
+    */
+    /**
+     * Replaces the node's style attribute by a class attribute whose name
+     * is taken from (or added to) the lexer's style dictionary.
+     * A style attribute without a value carries no properties, so it is
+     * simply removed instead of being registered in the dictionary.
+     *
+     * @param lexer lexer holding the style dictionary
+     * @param node  element whose style attribute is converted
+     */
+    private void style2Rule(Lexer lexer, Node node)
+    {
+        AttVal styleattr = node.getAttrByName("style");
+
+        if (styleattr == null)
+            return;
+
+        /*
+         A null value would be stored as the properties of a Style entry
+         and make every later properties.equals() lookup throw; drop the
+         empty attribute instead.
+        */
+        if (styleattr.value == null)
+        {
+            node.removeAttribute(styleattr);
+            return;
+        }
+
+        String classname = findStyle(lexer, node.element, styleattr.value);
+        AttVal classattr = node.getAttrByName("class");
+
+        if (classattr != null)
+        {
+            /* append the class name to the existing class attribute */
+            classattr.value = classattr.value + " " + classname;
+            node.removeAttribute(styleattr);
+        }
+        else
+        {
+            /* reuse style attribute for class attribute */
+            styleattr.attribute = "class";
+            styleattr.value = classname;
+        }
+    }
+
+    /**
+     * Appends the rule "selector { color: value }" followed by a newline
+     * to the lexer via addStringLiteral. Nothing is written when color
+     * is null.
+     *
+     * @param lexer    lexer receiving the text
+     * @param selector selector text, written as given
+     * @param color    colour value, or null to skip the rule
+     */
+    private void addColorRule(Lexer lexer, String selector, String color)
+    {
+        if (color == null)
+        {
+            return;
+        }
+
+        lexer.addStringLiteral(selector);
+        lexer.addStringLiteral(" { color: ");
+        lexer.addStringLiteral(color);
+        lexer.addStringLiteral(" }\n");
+    }
+
+    /*
+     move presentation attribs from body to style element
+
+     background="foo" ->  body { background-image: url(foo) }
+     bgcolor="foo"    ->  body { background-color: foo }
+     text="foo"       ->  body { color: foo }
+     link="foo"       ->  :link { color: foo }
+     vlink="foo"      ->  :visited { color: foo }
+     alink="foo"      ->  :active { color: foo }
+    */
+    /**
+     * Removes the presentation attributes background, bgcolor, text, link,
+     * vlink and alink from the body element and writes the equivalent
+     * style rules to the lexer.
+     *
+     * @param lexer lexer receiving the rule text
+     * @param body  the body element to clean
+     */
+    private void cleanBodyAttrs(Lexer lexer, Node body)
+    {
+        String imageUrl = null;
+        String background = null;
+        String foreground = null;
+        AttVal found;
+
+        /* collect and strip the attributes that end up in the body rule */
+        found = body.getAttrByName("background");
+        if (found != null)
+        {
+            imageUrl = found.value;
+            found.value = null;
+            body.removeAttribute(found);
+        }
+
+        found = body.getAttrByName("bgcolor");
+        if (found != null)
+        {
+            background = found.value;
+            found.value = null;
+            body.removeAttribute(found);
+        }
+
+        found = body.getAttrByName("text");
+        if (found != null)
+        {
+            foreground = found.value;
+            found.value = null;
+            body.removeAttribute(found);
+        }
+
+        boolean needsBodyRule =
+            !(imageUrl == null && background == null && foreground == null);
+
+        if (needsBodyRule)
+        {
+            lexer.addStringLiteral(" body {\n");
+
+            if (imageUrl != null)
+            {
+                lexer.addStringLiteral("  background-image: url(");
+                lexer.addStringLiteral(imageUrl);
+                lexer.addStringLiteral(");\n");
+            }
+
+            if (background != null)
+            {
+                lexer.addStringLiteral("  background-color: ");
+                lexer.addStringLiteral(background);
+                lexer.addStringLiteral(";\n");
+            }
+
+            if (foreground != null)
+            {
+                lexer.addStringLiteral("  color: ");
+                lexer.addStringLiteral(foreground);
+                lexer.addStringLiteral(";\n");
+            }
+
+            lexer.addStringLiteral(" }\n");
+        }
+
+        /* link colours each become a rule of their own, in this order */
+        String[] linkAttrs = { "link", "vlink", "alink" };
+        String[] selectors = { " :link", " :visited", " :active" };
+
+        for (int i = 0; i < linkAttrs.length; i++)
+        {
+            found = body.getAttrByName(linkAttrs[i]);
+
+            if (found != null)
+            {
+                addColorRule(lexer, selectors[i], found.value);
+                body.removeAttribute(found);
+            }
+        }
+    }
+
+    /**
+     * Reports whether the document body is free of the presentation
+     * attributes background, bgcolor, text, link, vlink and alink.
+     * When one of them is present, Report.USING_BODY is also recorded
+     * in lexer.badLayout.
+     *
+     * @param lexer lexer supplying the tag table and receiving the flag
+     * @param doc   node on which findBody is called
+     * @return false if the body carries any of the attributes, true
+     *         otherwise (including when no body is found)
+     */
+    private boolean niceBody(Lexer lexer, Node doc)
+    {
+        Node body = doc.findBody(lexer.configuration.tt);
+
+        if (body == null)
+        {
+            return true;
+        }
+
+        boolean presentational =
+            body.getAttrByName("background") != null ||
+            body.getAttrByName("bgcolor") != null ||
+            body.getAttrByName("text") != null ||
+            body.getAttrByName("link") != null ||
+            body.getAttrByName("vlink") != null ||
+            body.getAttrByName("alink") != null;
+
+        if (!presentational)
+        {
+            return true;
+        }
+
+        lexer.badLayout |= Report.USING_BODY;
+        return false;
+    }
+
+    /* create style element using rules from dictionary */
+    /*
+     Builds a <style type="text/css"> element whose text consists of the
+     rules produced by cleanBodyAttrs followed by one "tag.class {props}"
+     rule per entry of lexer.styles, and appends it to the document head.
+
+     lexer - supplies the style list, the tag table and the text buffer
+     doc   - node on which findBody / findHEAD are called
+    */
+    private void createStyleElement(Lexer lexer, Node doc)
+    {
+        Node node, head, body;
+        Style style;
+        AttVal av;
+
+        /*
+         nothing to do when no style was recorded and the body has no
+         presentation attributes (niceBody is only consulted when the
+         style list is empty)
+        */
+        if (lexer.styles == null && niceBody(lexer, doc))
+            return;
+
+        node = lexer.newNode(Node.StartTag, null, 0, 0, "style");
+        node.implicit = true;
+
+        /* insert type attribute */
+        av = new AttVal(null, null, '"', "type", "text/css");
+        av.dict = AttributeTable.getDefaultAttributeTable().findAttribute(av);
+        node.attributes = av;
+
+        body = doc.findBody(lexer.configuration.tt);
+
+        /* remember where the generated rule text starts in the lexer buffer */
+        lexer.txtstart = lexer.lexsize;
+
+        /* body rules come first; this also strips the attributes from body */
+        if (body != null)
+            cleanBodyAttrs(lexer, body);
+
+        /* one line per dictionary entry: " tag.class {properties}" */
+        for (style = lexer.styles; style != null; style = style.next)
+        {
+            lexer.addCharToLexer(' ');
+            lexer.addStringLiteral(style.tag);
+            lexer.addCharToLexer('.');
+            lexer.addStringLiteral(style.tagClass);
+            lexer.addCharToLexer(' ');
+            lexer.addCharToLexer('{');
+            lexer.addStringLiteral(style.properties);
+            lexer.addCharToLexer('}');
+            lexer.addCharToLexer('\n');
+        }
+
+        /* ... and where it ends */
+        lexer.txtend = lexer.lexsize;
+
+        /* the text node is created over the txtstart..txtend range of lexbuf */
+        Node.insertNodeAtEnd(node,
+                             lexer.newNode(Node.TextNode,
+                                      lexer.lexbuf,
+                                      lexer.txtstart,
+                                      lexer.txtend));
+
+        /*
+         now insert style element into document head
+
+         doc is root node. search its children for html node
+         the head node should be first child of html node
+        */
+
+        head = doc.findHEAD(lexer.configuration.tt);
+    
+        /*
+         NOTE(review): when no head is found the style element is not
+         attached anywhere, although the body attributes have already
+         been removed above.
+        */
+        if (head != null)
+            Node.insertNodeAtEnd(head, node);
+    }
+
+    /* ensure bidirectional links are consistent */
+    /**
+     * Makes the neighbours, the parent and the children of a node point
+     * back at it, using the node's own prev/next/parent/content fields
+     * as the source of truth.
+     *
+     * @param node node whose surrounding links are repaired
+     */
+    private void fixNodeLinks(Node node)
+    {
+        /* backward direction: previous sibling, or parent's first child */
+        if (node.prev == null)
+        {
+            node.parent.content = node;
+        }
+        else
+        {
+            node.prev.next = node;
+        }
+
+        /* forward direction: next sibling, or parent's last child */
+        if (node.next == null)
+        {
+            node.parent.last = node;
+        }
+        else
+        {
+            node.next.prev = node;
+        }
+
+        /* every child must name this node as its parent */
+        Node kid = node.content;
+
+        while (kid != null)
+        {
+            kid.parent = node;
+            kid = kid.next;
+        }
+    }
+
+    /*
+     used to strip child of node when
+     the node has one and only one child
+    */
+    /**
+     * Removes the first child of a node and adopts that child's children
+     * in its place. Intended for nodes that have exactly one child.
+     *
+     * @param node node whose only child is stripped
+     */
+    private void stripOnlyChild(Node node)
+    {
+        Node only = node.content;
+
+        /* take over the grandchildren and detach them from the child */
+        node.content = only.content;
+        node.last = only.last;
+        only.content = null;
+
+        for (Node adopted = node.content; adopted != null; adopted = adopted.next)
+        {
+            adopted.parent = node;
+        }
+    }
+
+    /* used to strip font start and end tags */
+    /*
+     Unlinks element from its parent, splicing element's children (if any)
+     into the sibling chain at the position element occupied.
+
+     element - the container to remove
+     pnode   - receives the node that now stands where element was: the
+               first child when element had content, otherwise
+               element.next (which may be null)
+    */
+    private void discardContainer(Node element, MutableObject pnode)
+    {
+        Node node;
+        Node parent = element.parent;
+
+        if (element.content != null)
+        {
+            /* the last child is followed by whatever followed element */
+            element.last.next = element.next;
+
+            if (element.next != null)
+            {
+                element.next.prev = element.last;
+                /* repeats the assignment made just above the if */
+                element.last.next = element.next;
+            }
+            else
+                parent.last = element.last;
+
+            /* the first child is preceded by whatever preceded element */
+            if (element.prev != null)
+            {
+                element.content.prev = element.prev;
+                element.prev.next = element.content;
+            }
+            else
+                parent.content = element.content;
+
+            /* the children now belong to element's parent */
+            for (node = element.content; node != null; node = node.next)
+                node.parent = parent;
+
+            pnode.setObject(element.content);
+        }
+        else
+        {
+            /* no children: simply bridge over element */
+            if (element.next != null)
+                element.next.prev = element.prev;
+            else
+                parent.last = element.prev;
+
+            if (element.prev != null)
+                element.prev.next = element.next;
+            else
+                parent.content = element.next;
+
+            pnode.setObject(element.next);
+        }
+
+        /* detach element; its prev and parent fields are left as they were */
+        element.next = null;
+        element.content = null;
+    }
+
+    /*
+     Add style property to element, creating style
+     attribute as needed and adding ; delimiter
+    */
+    /**
+     * Adds a style property to an element. An existing style attribute is
+     * extended (properties it already defines keep their value); otherwise
+     * a new style attribute is put in front of the attribute list.
+     *
+     * @param node     element to receive the property
+     * @param property property text such as "font-weight: bold"
+     */
+    private void addStyleProperty(Node node, String property)
+    {
+        AttVal av = node.attributes;
+
+        /* look for an existing style attribute */
+        while (av != null && !av.attribute.equals("style"))
+        {
+            av = av.next;
+        }
+
+        if (av == null)
+        {
+            /* none yet: create a new style attribute */
+            av = new AttVal(node.attributes, null, '"', "style", property);
+            av.dict = AttributeTable.getDefaultAttributeTable().findAttribute(av);
+            node.attributes = av;
+        }
+        else if (av.value == null)
+        {
+            /*
+             a style attribute without a value has nothing to merge with;
+             passing null on to addProperty would throw
+            */
+            av.value = property;
+        }
+        else
+        {
+            av.value = addProperty(av.value, property);
+        }
+    }
+
+    /*
+      Create new string that consists of the
+      combined style properties in s1 and s2
+
+      To merge property lists, we build a linked
+      list of property/values and insert properties
+      into the list in name order. When both strings
+      define the same property name, the value from s1
+      is kept and the one from s2 is ignored.
+    */
+    /**
+     * Combines the style properties of two style strings into one string
+     * ordered by property name. For a property defined in both, the value
+     * from s1 is kept.
+     *
+     * @param s1 first style text; its values take precedence
+     * @param s2 second style text
+     * @return the combined style text
+     */
+    private String mergeProperties(String s1, String s2)
+    {
+        /* same operation as addProperty; share the single implementation */
+        return addProperty(s1, s2);
+    }
+
+    /**
+     * Folds the style attribute of child into the style attribute of node.
+     * If node already has a style value the two are merged (node's values
+     * win for duplicate properties); if only child has one, node gets a
+     * new style attribute with the child's value.
+     *
+     * @param node  element receiving the style
+     * @param child element supplying the style
+     */
+    private void mergeStyles(Node node, Node child)
+    {
+        /* first style attribute of the child, if any */
+        AttVal childStyle = child.attributes;
+
+        while (childStyle != null && !childStyle.attribute.equals("style"))
+        {
+            childStyle = childStyle.next;
+        }
+
+        /* first style attribute of the node, if any */
+        AttVal ownStyle = node.attributes;
+
+        while (ownStyle != null && !ownStyle.attribute.equals("style"))
+        {
+            ownStyle = ownStyle.next;
+        }
+
+        String s2 = (childStyle == null) ? null : childStyle.value;
+        String s1 = (ownStyle == null) ? null : ownStyle.value;
+
+        if (s2 == null)
+        {
+            /* the child contributes nothing */
+            return;
+        }
+
+        if (s1 != null)
+        {
+            /* merge styles from both */
+            ownStyle.value = mergeProperties(s1, s2);
+        }
+        else
+        {
+            /* copy style of child */
+            AttVal created = new AttVal(node.attributes, null, '"', "style", s2);
+            created.dict = AttributeTable.getDefaultAttributeTable().findAttribute(created);
+            node.attributes = created;
+        }
+    }
+
+    /**
+     * Translates the value of a font size attribute into a CSS font-size
+     * value.
+     *
+     * A leading digit 0-6 selects an entry of a fixed table (null for 3).
+     * A leading '-' followed by a digit 0-6 gives 100% scaled by 0.8 per
+     * step. Any other leading character followed by a digit 0-6 gives 100%
+     * scaled by 1.2 per step. Without a usable digit the result is
+     * "smaller" after '-' and "larger" otherwise.
+     *
+     * @param size attribute value, e.g. "2", "+1" or "-2"
+     * @return the CSS value, or null when no font-size should be set
+     */
+    private String fontSize2Name(String size)
+    {
+        String[] sizes =
+        {
+            "60%",
+            "70%",
+            "80%",
+            null,
+            "120%",
+            "150%",
+            "200%"
+        };
+
+        if (size.length() == 0)
+        {
+            return "larger"; /* "140%" */
+        }
+
+        char lead = size.charAt(0);
+
+        /* absolute size: direct table lookup */
+        if (lead >= '0' && lead <= '6')
+        {
+            return sizes[lead - '0'];
+        }
+
+        boolean shrink = (lead == '-');
+        boolean hasStep = size.length() > 1 &&
+                          size.charAt(1) >= '0' && size.charAt(1) <= '6';
+
+        if (!hasStep)
+        {
+            return shrink ? "smaller" /*"70%"; */ : "larger"; /* "140%" */
+        }
+
+        /* relative size: compound the factor once per step */
+        double factor = shrink ? 0.8 : 1.2;
+        double scale = 1.0;
+
+        for (int steps = size.charAt(1) - '0'; steps > 0; --steps)
+        {
+            scale *= factor;
+        }
+
+        scale *= 100.0;
+
+        return "" + (int)scale + "%";
+    }
+
+    /**
+     * Adds a font-family style property for the given face to the node.
+     *
+     * @param node element to receive the property
+     * @param face value of the face attribute
+     */
+    private void addFontFace(Node node, String face)
+    {
+        String declaration = "font-family: " + face;
+
+        addStyleProperty(node, declaration);
+    }
+
+    /**
+     * Applies a font size to a node. A p element with size 6, 5 or 4 is
+     * renamed to h1, h2 or h3 respectively; in every other case a
+     * font-size style property is added, unless fontSize2Name yields null.
+     *
+     * @param node element to receive the size
+     * @param size value of the size attribute
+     */
+    private void addFontSize(Node node, String size)
+    {
+        if (node.tag == tt.tagP)
+        {
+            String heading = null;
+
+            if (size.equals("6"))
+                heading = "h1";
+            else if (size.equals("5"))
+                heading = "h2";
+            else if (size.equals("4"))
+                heading = "h3";
+
+            if (heading != null)
+            {
+                /* turn the paragraph into a heading and look up its tag */
+                node.element = heading;
+                tt.findTag(node);
+                return;
+            }
+        }
+
+        String value = fontSize2Name(size);
+
+        if (value == null)
+        {
+            return;
+        }
+
+        addStyleProperty(node, "font-size: " + value);
+    }
+
+    /**
+     * Adds a color style property for the given colour to the node.
+     *
+     * @param node  element to receive the property
+     * @param color value of the color attribute
+     */
+    private void addFontColor(Node node, String color)
+    {
+        String declaration = "color: " + color;
+
+        addStyleProperty(node, declaration);
+    }
+
+    /**
+     * Adds a text-align style property to the node, with the alignment
+     * value forced to lower case.
+     *
+     * @param node  element to receive the property
+     * @param align value of the align attribute, must not be null
+     */
+    private void addAlign(Node node, String align)
+    {
+        /*
+         Use a fixed locale: with the default locale "RIGHT" would become
+         "rıght" (dotless i) on a Turkish system, which is not a valid
+         CSS keyword.
+        */
+        String lowered = align.toLowerCase(java.util.Locale.ENGLISH);
+
+        addStyleProperty(node, "text-align: " + lowered);
+    }
+
+    /*
+     add style properties to node corresponding to
+     the font face, size and color attributes
+    */
+    /**
+     * Walks an attribute list and applies each face, size and color
+     * attribute to the node as style; other attributes are ignored.
+     *
+     * @param node element to receive the styles
+     * @param av   head of the attribute list to read, may be null
+     */
+    private void addFontStyles(Node node, AttVal av)
+    {
+        for (AttVal attr = av; attr != null; attr = attr.next)
+        {
+            String name = attr.attribute;
+
+            if (name.equals("face"))
+            {
+                addFontFace(node, attr.value);
+            }
+            else if (name.equals("size"))
+            {
+                addFontSize(node, attr.value);
+            }
+            else if (name.equals("color"))
+            {
+                addFontColor(node, attr.value);
+            }
+        }
+    }
+
+    /*
+        Symptom: <p align=center>
+        Action: <p style="text-align: center">
+    */
+    /**
+     * Removes the first align attribute from the node and, when it has a
+     * value, adds the matching text-align style property.
+     *
+     * @param lexer not used by this method
+     * @param node  element to convert
+     */
+    private void textAlign(Lexer lexer, Node node)
+    {
+        AttVal before = null;
+        AttVal current = node.attributes;
+
+        /* find the first align attribute, remembering its predecessor */
+        while (current != null && !current.attribute.equals("align"))
+        {
+            before = current;
+            current = current.next;
+        }
+
+        if (current == null)
+        {
+            return;
+        }
+
+        /* unlink it from the attribute list */
+        if (before == null)
+        {
+            node.attributes = current.next;
+        }
+        else
+        {
+            before.next = current.next;
+        }
+
+        if (current.value != null)
+        {
+            addAlign(node, current.value);
+        }
+    }
+
+    /*
+       The clean up rules use the pnode argument to return the
+       next node when the original node has been deleted
+    */
+
+    /*
+        Symptom: <dir> <li> where <li> is only child
+        Action: coerce <dir> <li> to <div> with indent.
+    */
+
+    /**
+     * Turns a dir, ul or ol element whose only child is an implicit li
+     * into a div with a left margin of 2em, dropping the li and keeping
+     * its content.
+     *
+     * @param lexer not used by this method
+     * @param node  candidate element
+     * @param pnode not used by this method
+     * @return true when the element was converted
+     */
+    private boolean dir2Div(Lexer lexer, Node node, MutableObject pnode)
+    {
+        boolean listLike = node.tag == tt.tagDir ||
+                           node.tag == tt.tagUl ||
+                           node.tag == tt.tagOl;
+
+        if (!listLike)
+        {
+            return false;
+        }
+
+        Node item = node.content;
+
+        /* need exactly one child, and it must be an implicit <li> */
+        if (item == null || item.next != null ||
+            item.tag != tt.tagLi || !item.implicit)
+        {
+            return false;
+        }
+
+        /*
+         coerce dir to div; a variant that instead hoisted the <li>
+         content into the parent was disabled in the code this replaces
+        */
+        node.tag = tt.tagDiv;
+        node.element = "div";
+        addStyleProperty(node, "margin-left: 2em");
+        stripOnlyChild(node);
+        return true;
+    }
+
+    /*
+        Symptom: <center>
+        Action: replace <center> by <div style="text-align: center">
+    */
+
+    /*
+     Handles a <center> element.
+
+     With the DropFontTags option set, the element is discarded and a <br>
+     (from lexer.inferredTag) is linked in after its former content, or in
+     its place when it was empty; pnode is set by discardContainer.
+     Otherwise the element is renamed to <div> and given the style
+     property "text-align: center".
+
+     Returns true when node was a <center> element, false otherwise.
+    */
+    private boolean center2Div(Lexer lexer, Node node, MutableObject pnode)
+    {
+        if (node.tag == tt.tagCenter)
+        {
+            if (lexer.configuration.DropFontTags)
+            {
+                if (node.content != null)
+                {
+                    /* remember these before the container is unlinked */
+                    Node last = node.last;
+                    Node parent = node.parent;
+
+                    discardContainer(node, pnode);
+
+                    /* from here on node refers to the new <br> */
+                    node = lexer.inferredTag("br");
+
+                    /* link the <br> in right after the former last child */
+                    if (last.next != null)
+                        last.next.prev = node;
+
+                    node.next = last.next;
+                    last.next = node;
+                    node.prev = last;
+
+                    if (parent.last == last)
+                        parent.last = node;
+
+                    node.parent = parent;
+                }
+                else
+                {
+                    /* empty <center>: the <br> takes over its position */
+                    Node prev = node.prev;
+                    Node next = node.next;
+                    Node parent = node.parent;
+                    discardContainer(node, pnode);
+
+                    node = lexer.inferredTag("br");
+                    node.next = next;
+                    node.prev = prev;
+                    node.parent = parent;
+
+                    if (next != null)
+                        next.prev = node;
+                    else
+                        parent.last = node;
+
+                    if (prev != null)
+                        prev.next = node;
+                    else
+                        parent.content = node;
+                }
+
+                return true;
+            }
+            /* default: keep the container, expressed as a styled <div> */
+            node.tag = tt.tagDiv;
+            node.element = "div";
+            addStyleProperty(node, "text-align: center");
+            return true;
+        }
+
+        return false;
+    }
+
+    /*
+        Symptom <div><div>...</div></div>
+        Action: merge the two divs
+
+      This is useful after nested <dir>s used by Word
+      for indenting have been converted to <div>s
+    */
+    /**
+     * Collapses a div whose only child is another div: the child's style
+     * is merged into the outer div and the child element is stripped,
+     * leaving its content.
+     *
+     * @param lexer not used by this method
+     * @param node  candidate outer div
+     * @param pnode not used by this method
+     * @return true when the two divs were merged
+     */
+    private boolean mergeDivs(Lexer lexer, Node node, MutableObject pnode)
+    {
+        if (node.tag != tt.tagDiv)
+        {
+            return false;
+        }
+
+        Node inner = node.content;
+
+        /* the inner div must exist and be the only child */
+        boolean mergeable = inner != null &&
+                            inner.tag == tt.tagDiv &&
+                            inner.next == null;
+
+        if (mergeable)
+        {
+            mergeStyles(node, inner);
+            stripOnlyChild(node);
+        }
+
+        return mergeable;
+    }
+
+    /*
+        Symptom: <ul><li><ul>...</ul></li></ul>
+        Action: discard outer list
+    */
+
+    /*
+     Applies when node is a <ul>/<ol> with a single child whose first
+     child is a list with the same tag as node. The inner list is moved
+     into the position of the outer one; if the node in front of it is
+     itself a <ul>/<ol>, the inner list is then appended inside the last
+     child of that preceding list.
+
+     pnode receives node.next as it was before any relinking.
+     Returns true when the rule was applied, false otherwise.
+    */
+    private boolean nestedList(Lexer lexer, Node node, MutableObject pnode)
+    {
+        Node child, list;
+
+        if (node.tag == tt.tagUl || node.tag == tt.tagOl)
+        {
+            child = node.content;
+
+            if (child == null)
+                return false;
+
+            /* check child has no peers */
+
+            if (child.next != null)
+                return false;
+
+            /*
+             NOTE(review): only the first child of the <li> is examined;
+             any siblings of the inner list are not checked for and
+             become unreachable once list.next is overwritten below.
+            */
+            list = child.content;
+
+            if (list == null)
+                return false;
+
+            if (list.tag != node.tag)
+                return false;
+
+            pnode.setObject(node.next);
+
+            /* move inner list node into position of outer node */
+            list.prev = node.prev;
+            list.next = node.next;
+            list.parent = node.parent;
+            fixNodeLinks(list);
+
+            /* get rid of outer ul and its li */
+            child.content = null;
+            node.content = null;
+            node.next = null;
+
+            /*
+              If prev node was a list the chances are this node
+              should be appended to that list. Word has no way of
+              recognizing nested lists and just uses indents
+            */
+
+            if (list.prev != null)
+            {
+                /* swap roles: node is the moved list, list its predecessor */
+                node = list;
+                list = node.prev;
+
+                if (list.tag == tt.tagUl || list.tag == tt.tagOl)
+                {
+                    /* unlink node from the sibling chain ... */
+                    list.next = node.next;
+
+                    if (list.next != null)
+                        list.next.prev = list;
+
+                    child = list.last;  /* <li> */
+
+                    /* ... and append it as the last child of that <li> */
+                    node.parent = child;
+                    node.next = null;
+                    node.prev = child.last;
+                    fixNodeLinks(node);
+                }
+            }
+
+            /*
+             NOTE(review): when list.prev was null, node still refers to
+             the detached outer list (content already cleared) at this
+             point, not to the moved inner list - confirm this is intended.
+            */
+            cleanNode(lexer, node);
+            return true;
+        }
+
+        return false;
+    }
+
+    /*
+        Symptom: the only child of a block-level element is a
+        presentation element such as B, I or FONT
+
+        Action: add style "font-weight: bold" to the block and
+        strip the <b> element, leaving its children.
+
+      example:
+
+        <p>
+          <b><font face="Arial" size="6">Draft Recommended Practice</font></b>
+        </p>
+
+      becomes:
+
+          <p style="font-weight: bold; font-family: Arial; font-size: 6">
+            Draft Recommended Practice
+          </p>
+
+      This code also replaces the align attribute by a style attribute.
+      However, to avoid CSS problems with Navigator 4, this isn't done
+      for the elements: caption, tr and table
+    */
+    /**
+     * Block-level rule: converts the align attribute to a text-align
+     * style (except on caption) and, when the element's only child is a
+     * b, i or font element, replaces that child by style properties on
+     * the element. table, tr and li elements are left alone.
+     *
+     * @param lexer passed on to textAlign
+     * @param node  candidate block element
+     * @param pnode not used by this method
+     * @return true when a child element was replaced by style properties
+     */
+    private boolean blockStyle(Lexer lexer, Node node, MutableObject pnode)
+    {
+        if ((node.tag.model & (Dict.CM_BLOCK | Dict.CM_LIST | Dict.CM_DEFLIST | Dict.CM_TABLE)) == 0)
+        {
+            return false;
+        }
+
+        if (node.tag == tt.tagTable
+                || node.tag == tt.tagTr
+                || node.tag == tt.tagLi)
+        {
+            return false;
+        }
+
+        /* check for align attribute; happens even if nothing is stripped */
+        if (node.tag != tt.tagCaption)
+        {
+            textAlign(lexer, node);
+        }
+
+        Node only = node.content;
+
+        /* the child must exist and have no peers */
+        if (only == null || only.next != null)
+        {
+            return false;
+        }
+
+        if (only.tag == tt.tagB)
+        {
+            mergeStyles(node, only);
+            addStyleProperty(node, "font-weight: bold");
+        }
+        else if (only.tag == tt.tagI)
+        {
+            mergeStyles(node, only);
+            addStyleProperty(node, "font-style: italic");
+        }
+        else if (only.tag == tt.tagFont)
+        {
+            mergeStyles(node, only);
+            addFontStyles(node, only.attributes);
+        }
+        else
+        {
+            return false;
+        }
+
+        stripOnlyChild(node);
+        return true;
+    }
+
+    /* the only child of table cell or an inline element such as em */
+    private boolean inlineStyle(Lexer lexer, Node node, MutableObject pnode)
+    {
+        /* font itself is excluded; otherwise the element must be inline or row content */
+        if (node.tag == tt.tagFont || (node.tag.model & (Dict.CM_INLINE|Dict.CM_ROW)) == 0)
+            return false;
+
+        Node only = node.content;
+
+        /* the presentation element must be the sole child */
+        if (only == null || only.next != null)
+            return false;
+
+        if (only.tag == tt.tagB && lexer.configuration.LogicalEmphasis)
+        {
+            mergeStyles(node, only);
+            addStyleProperty(node, "font-weight: bold");
+        }
+        else if (only.tag == tt.tagI && lexer.configuration.LogicalEmphasis)
+        {
+            mergeStyles(node, only);
+            addStyleProperty(node, "font-style: italic");
+        }
+        else if (only.tag == tt.tagFont)
+        {
+            mergeStyles(node, only);
+            addFontStyles(node, only.attributes);
+        }
+        else
+            return false;
+
+        /* styling has been transferred to node, so the wrapper child can go */
+        stripOnlyChild(node);
+        return true;
+    }
+
+    /*
+      Replace font elements by span elements, deleting
+      the font element's attributes and replacing them
+      by a single style attribute.
+    */
+    private boolean font2Span(Lexer lexer, Node node, MutableObject pnode)
+    {
+        AttVal av, style, next;
+
+        /* only font elements are of interest; anything else is left for other rules */
+        if (node.tag != tt.tagFont)
+            return false;
+
+        if (lexer.configuration.DropFontTags)
+        {
+            /* discardContainer receives pnode so it can report the node to continue with */
+            discardContainer(node, pnode);
+            return false;
+        }
+
+        /* if FONT is only child of parent element then leave alone */
+        if (node.parent.content == node
+            && node.next == null)
+            return false;
+
+        addFontStyles(node, node.attributes);
+
+        /* extract style attribute and free the rest */
+        av = node.attributes;
+        style = null;
+
+        while (av != null)
+        {
+            next = av.next;
+
+            /*
+              constant-first equals: an attribute without a name must not
+              raise a NullPointerException here (purgeAttributes guards
+              against null names in the same way)
+            */
+            if ("style".equals(av.attribute))
+            {
+                av.next = null;
+                style = av;
+            }
+
+            av = next;
+        }
+
+        /* the span keeps nothing but the style attribute (or no attributes at all) */
+        node.attributes = style;
+
+        node.tag = tt.tagSpan;
+        node.element = "span";
+
+        return true;
+    }
+
+    /*
+      Applies all matching rules to a node.
+    */
+    private Node cleanNode(Lexer lexer, Node node)
+    {
+        MutableObject holder = new MutableObject();
+        Node replacement = node;
+
+        while (node.isElement())
+        {
+            holder.setObject(replacement);
+
+            /* rules are tried in this fixed order; || stops at the first one that fires */
+            boolean applied = dir2Div(lexer, node, holder)
+                    || nestedList(lexer, node, holder)
+                    || center2Div(lexer, node, holder)
+                    || mergeDivs(lexer, node, holder)
+                    || blockStyle(lexer, node, holder)
+                    || inlineStyle(lexer, node, holder)
+                    || font2Span(lexer, node, holder);
+
+            /* whatever the rules left in the holder is the node to carry on with */
+            replacement = (Node)holder.getObject();
+
+            if (!applied)
+                break;
+
+            /* a rule fired: run the whole rule set again on the resulting node */
+            node = replacement;
+        }
+
+        return replacement;
+    }
+
+    /*
+      Applies the clean-up rules bottom-up: first to every child of
+      node (recursively), then to node itself. Returns the node that
+      cleanNode reports for node.
+    */
+    private Node createStyleProperties(Lexer lexer, Node node)
+    {
+        Node child = node.content;
+
+        while (child != null)
+        {
+            /* the rules may replace the child; continue from the node handed back */
+            child = createStyleProperties(lexer, child);
+
+            /*
+              cleanNode can hand back null (e.g. when a discarded
+              container was the last sibling); nothing is left to visit
+              then, and stepping to child.next would fail
+            */
+            if (child == null)
+                break;
+
+            child = child.next;
+        }
+
+        return cleanNode(lexer, node);
+    }
+
+    /*
+      Post-order walk: style2Rule is called for every descendant of
+      node before it is called for node itself.
+    */
+    private void defineStyleRules(Lexer lexer, Node node)
+    {
+        Node current = node.content;
+
+        while (current != null)
+        {
+            defineStyleRules(lexer, current);
+            current = current.next;
+        }
+
+        style2Rule(lexer, node);
+    }
+
+    public void cleanTree(Lexer lexer, Node doc)
+    {
+        doc = createStyleProperties(lexer, doc);
+
+        if (!lexer.configuration.MakeClean)
+        {
+            defineStyleRules(lexer, doc);
+            createStyleElement(lexer, doc);
+        }
+    }
+
+    /* simplifies <b><b> ... </b> ...</b> etc. */
+    public void nestedEmphasis(Node node)
+    {
+        MutableObject o = new MutableObject();
+        Node next;
+
+        while (node != null)
+        {
+            next = node.next;
+
+            if ((node.tag == tt.tagB || node.tag == tt.tagI)
+                && node.parent != null && node.parent.tag == node.tag)
+            {
+                /* strip redundant inner element */
+                o.setObject(next);
+                discardContainer(node, o);
+                next = (Node)o.getObject();
+                node = next;
+                continue;
+            }
+
+            if (node.content != null)
+                nestedEmphasis(node.content);
+
+            node = next;
+        }
+    }
+
+    /* replace i by em and b by strong */
+    public void emFromI(Node node)
+    {
+        while (node != null)
+        {
+            if (node.tag == tt.tagI)
+            {
+                node.element = tt.tagEm.name;
+                node.tag = tt.tagEm;
+            }
+            else if (node.tag == tt.tagB)
+            {
+                node.element = tt.tagStrong.name;
+                node.tag = tt.tagStrong;
+            }
+
+            if (node.content != null)
+                emFromI(node.content);
+
+            node = node.next;
+        }
+    }
+
+    /*
+     Some people use dir or ul without an li
+     to indent the content. The pattern to
+     look for is a list with a single implicit
+     li. This is recursively replaced by an
+     implicit blockquote.
+    */
+    public void list2BQ(Node node)
+    {
+        while (node != null)
+        {
+            if (node.content != null)
+                list2BQ(node.content);
+
+            if (node.tag != null && node.tag.parser == ParserImpl.getParseList() &&
+                node.hasOneChild() && node.content.implicit)
+            {
+                stripOnlyChild(node);
+                node.element = tt.tagBlockquote.name;
+                node.tag = tt.tagBlockquote;
+                node.implicit = true;
+            }
+
+            node = node.next;
+        }
+    }
+
+    /*
+     Replace implicit blockquote by div with an indent
+     taking care to reduce nested blockquotes to a single
+     div with the indent set to match the nesting depth
+    */
+    public void bQ2Div(Node node)
+    {
+        int indent;
+        String indent_buf;
+
+        while (node != null)
+        {
+            if (node.tag == tt.tagBlockquote && node.implicit)
+            {
+                indent = 1;
+
+                while(node.hasOneChild() &&
+                      node.content.tag == tt.tagBlockquote &&
+                      node.implicit)
+                {
+                    ++indent;
+                    stripOnlyChild(node);
+                }
+
+                if (node.content != null)
+                    bQ2Div(node.content);
+
+                indent_buf = "margin-left: " +
+                             (new Integer(2*indent)).toString() + "em";
+
+                node.element = tt.tagDiv.name;
+                node.tag = tt.tagDiv;
+                node.addAttribute("style", indent_buf);
+            }
+            else if (node.content != null)
+                bQ2Div(node.content);
+
+
+            node = node.next;
+        }
+    }
+
+    /* node is <![if ...]> prune up to <![endif]> */
+    public Node pruneSection(Lexer lexer, Node node)
+    {
+        for (;;)
+        {
+            /* discard node and returns next */
+            node = Node.discardElement(node);
+
+            if (node == null)
+                return null;
+        
+            if (node.type == Node.SectionTag)
+            {
+                if ((Lexer.getString(node.textarray, node.start, 2)).equals("if"))
+                {
+                    node = pruneSection(lexer, node);
+                    continue;
+                }
+
+                if ((Lexer.getString(node.textarray, node.start, 5)).equals("endif"))
+                {
+                    node = Node.discardElement(node);
+                    break;
+                }
+            }
+        }
+
+        return node;
+    }
+
+    public void dropSections(Lexer lexer, Node node)
+    {
+        while (node != null)
+        {
+            if (node.type == Node.SectionTag)
+            {
+                /* prune up to matching endif */
+                if ((Lexer.getString(node.textarray, node.start, 2)).equals("if"))
+                {
+                    node = pruneSection(lexer, node);
+                    continue;
+                }
+
+                /* discard others as well */
+                node = Node.discardElement(node);
+                continue;
+            }
+
+            if (node.content != null)
+                dropSections(lexer, node.content);
+
+            node = node.next;
+        }
+    }
+
+    public void purgeAttributes(Node node)
+    {
+        AttVal attr = node.attributes;
+        AttVal next = null;
+        AttVal prev = null;
+
+        while (attr != null)
+        {
+            next = attr.next;
+
+            /* special check for class="Code" denoting pre text */
+            if (attr.attribute != null &&
+                attr.value != null &&
+                attr.attribute.equals("class") &&
+                attr.value.equals("Code"))
+            {
+                prev = attr;
+            }
+            else if (attr.attribute != null &&
+                (attr.attribute.equals("class") ||
+                 attr.attribute.equals("style") ||
+                 attr.attribute.equals("lang") ||
+                 attr.attribute.startsWith("x:") ||
+                 ((attr.attribute.equals("height") || attr.attribute.equals("width")) &&
+                    (node.tag == tt.tagTd || node.tag == tt.tagTr || node.tag == tt.tagTh))))
+            {
+                if (prev != null)
+                    prev.next = next;
+                else
+                    node.attributes = next;
+
+            }
+            else
+                prev = attr;
+
+            attr = next;
+        }
+    }
+
+    /* Word2000 uses span excessively, so we strip span out */
+    /*
+      Replaces span by its own content: the content is first run through
+      cleanWord2000, then each content node is moved out to become a
+      sibling at the span's position, and finally the emptied span is
+      discarded. Returns span.next as read just before the span is
+      discarded (may be null).
+    */
+    public Node stripSpan(Lexer lexer, Node span)
+    {
+        Node node;
+        Node prev = null;
+        Node content;
+
+        /*
+         deal with span elements that have content
+         by splicing the content in place of the span
+         after having processed it
+        */
+
+        cleanWord2000(lexer, span.content);
+        content = span.content;
+
+        /*
+          prev is the node after which the next content node is inserted:
+          the span's previous sibling when there is one, otherwise the
+          first content node, which is placed in front of the span
+        */
+        if (span.prev != null)
+            prev = span.prev;
+        else if (content != null)
+        {
+            node = content;
+            content = content.next;
+            Node.removeNode(node);
+            Node.insertNodeBeforeElement(span, node);
+            prev = node;
+        }
+
+        /* move the remaining content nodes out one by one, each after the previous one */
+        while (content != null)
+        {
+            node = content;
+            content = content.next;
+            Node.removeNode(node);
+            Node.insertNodeAfterElement(prev, node);
+            prev = node;
+        }
+
+        /* span was the last child: record prev as the parent's last child */
+        if (span.next == null)
+            span.parent.last = prev;
+
+        /* remember the successor, then drop the span with its content cleared */
+        node = span.next;
+        span.content = null;
+        Node.discardElement(span);
+        return node;
+    }
+
+    /* map non-breaking spaces to regular spaces */
+    private void normalizeSpaces(Lexer lexer, Node node)
+    {
+        /*
+          Rewrites, in place, the text of every text node found in node,
+          its following siblings and their content, replacing U+00A0
+          (non-breaking space) by an ordinary space.
+          Assumes node.textarray holds UTF-8 bytes - TODO confirm.
+        */
+        while (node != null)
+        {
+            if (node.content != null)
+                normalizeSpaces(lexer, node.content);
+
+            if (node.type == Node.TextNode)
+            {
+                MutableInteger c = new MutableInteger();
+                int p = node.start;    /* write position; never ahead of the read position i */
+
+                for (int i = node.start; i < node.end; ++i)
+                {
+                    /*
+                      mask to an unsigned value: a plain widening of a byte
+                      >= 0x80 is negative, so the multibyte test below could
+                      never succeed and 160 was never recognised
+                    */
+                    c.value = node.textarray[i] & 0xFF;
+
+                    /* look for UTF-8 multibyte character */
+                    if (c.value > 0x7F)
+                        i += PPrint.getUTF8(node.textarray, i, c);
+
+                    if (c.value == 160)
+                        c.value = ' ';
+
+                    p = PPrint.putUTF8(node.textarray, p, c.value);
+                }
+
+                /*
+                  a replaced non-breaking space is written back shorter than
+                  it was read, so the text may have shrunk; cut off the
+                  stale bytes left behind the write position
+                */
+                node.end = p;
+            }
+
+            node = node.next;
+        }
+    }
+
+    /*
+     This is a major clean up to strip out all the extra stuff you get
+     when you save as web page from Word 2000. It doesn't yet know what
+     to do with VML tags, but these will appear as errors unless you
+     declare them as new tags, such as o:p which needs to be declared
+     as inline.
+    */
+    public void cleanWord2000(Lexer lexer, Node node)
+    {
+        /* used to a list from a sequence of bulletted p's */
+        Node list = null;
+
+        while (node != null)
+        {
+            /* discard Word's style verbiage */
+            if (node.tag == tt.tagStyle ||
+                node.tag == tt.tagMeta ||
+                node.type == Node.CommentTag)
+            {
+                node = Node.discardElement(node);
+                continue;
+            }
+
+            /* strip out all span tags Word scatters so liberally! */
+            if (node.tag == tt.tagSpan)
+            {
+                node = stripSpan(lexer, node);
+                continue;
+            }
+
+            /* get rid of Word's xmlns attributes */
+            if (node.tag == tt.tagHtml)
+            {
+                /* check that it's a Word 2000 document */
+                if (node.getAttrByName("xmlns:o") == null)
+                    return;
+            }
+
+            if (node.tag == tt.tagLink)
+            {
+                AttVal attr = node.getAttrByName("rel");
+
+                if (attr != null && attr.value != null &&
+                    attr.value.equals("File-List"))
+                {
+                    node = Node.discardElement(node);
+                    continue;
+                }
+            }
+
+            /* discard empty paragraphs */
+            if (node.content == null && node.tag == tt.tagP)
+            {
+                node = Node.discardElement(node);
+                continue;
+            }
+
+            if (node.tag == tt.tagP)
+            {
+                AttVal attr = node.getAttrByName("class");
+
+                /* map sequence of <p class="MsoListBullet"> to <ul>...</ul> */
+                if (attr != null && attr.value != null &&
+                    attr.value.equals("MsoListBullet"))
+                {
+                    Node.coerceNode(lexer, node, tt.tagLi);
+
+                    if (list == null || list.tag != tt.tagUl)
+                    {
+                        list = lexer.inferredTag("ul");
+                        Node.insertNodeBeforeElement(node, list);
+                    }
+
+                    purgeAttributes(node);
+
+                    if (node.content != null)
+                        cleanWord2000(lexer, node.content);
+
+                    /* remove node and append to contents of list */
+                    Node.removeNode(node);
+                    Node.insertNodeAtEnd(list, node);
+                    node = list.next;
+                }
+                /* map sequence of <p class="Code"> to <pre>...</pre> */
+                else if (attr != null && attr.value != null &&
+                         attr.value.equals("Code"))
+                {
+                    Node br = lexer.newLineNode();
+                    normalizeSpaces(lexer, node);
+
+                    if (list == null || list.tag != tt.tagPre)
+                    {
+                        list = lexer.inferredTag("pre");
+                        Node.insertNodeBeforeElement(node, list);
+                    }
+
+                    /* remove node and append to contents of list */
+                    Node.removeNode(node);
+                    Node.insertNodeAtEnd(list, node);
+                    stripSpan(lexer, node);
+                    Node.insertNodeAtEnd(list, br);
+                    node = list.next;
+                }
+                else
+                    list = null;
+            }
+            else
+                list = null;
+
+            /* strip out style and class attributes */
+            if (node.type == Node.StartTag || node.type == Node.StartEndTag)
+                purgeAttributes(node);
+
+            if (node.content != null)
+                cleanWord2000(lexer, node.content);
+
+            node = node.next;
+        }
+    }
+
+    public boolean isWord2000(Node root, TagTable tt)
+    {
+        Node html = root.findHTML(tt);
+
+        return (html != null && html.getAttrByName("xmlns:o") != null);
+    }
+}