/*
 * @(#)Clean.java   1.11 2000/08/16
 *
 */

package net.sourceforge.phpdt.tidy;

import java.util.Locale;

/*
  Filters from other formats such as Microsoft Word
  often make excessive use of presentation markup such
  as font tags, B, I, and the align attribute. By applying
  a set of production rules, it is straightforward to
  transform this to use CSS.

  Some rules replace some of the children of an element by
  style properties on the element, e.g.

  <p><b>...</b></p> -> <p style="font-weight: bold">...</p>

  Such rules are applied to the element's content and then
  to the element itself until no more rules apply.
  Having applied all the rules to an element, it will have
  a style attribute with one or more properties.

  Other rules strip the element they apply to, replacing
  it by style properties on the contents, e.g.

  <dir><li><p>...</li></dir> -> <p style="margin-left: 1em">...

  These rules are applied to an element before processing
  its content and replace the current element by the first
  element in the exposed content.

  After applying both sets of rules, you can replace the
  style attribute by a class value and style rule in the
  document head. To support this, an association of styles
  and class names is built.

  A naive approach is to rely on string matching to test
  when two property lists are the same. A better approach
  would be to first sort the properties before matching.
*/

/**
 *
 * Clean up misuse of presentation markup
 *
 * (c) 1998-2000 (W3C) MIT, INRIA, Keio University
 * See Tidy.java for the copyright notice.
 * Derived from
 * HTML Tidy Release 4 Aug 2000
 *
 * @author  Dave Raggett
 * @author  Andy Quick (translation to Java)
 * @version 1.0, 1999/05/22
 * @version 1.0.1, 1999/05/29
 * @version 1.1, 1999/06/18 Java Bean
 * @version 1.2, 1999/07/10 Tidy Release 7 Jul 1999
 * @version 1.3, 1999/07/30 Tidy Release 26 Jul 1999
 * @version 1.4, 1999/09/04 DOM support
 * @version 1.5, 1999/10/23 Tidy Release 27 Sep 1999
 * @version 1.6, 1999/11/01 Tidy Release 22 Oct 1999
 * @version 1.7, 1999/12/06 Tidy Release 30 Nov 1999
 * @version 1.8, 2000/01/22 Tidy Release 13 Jan 2000
 * @version 1.9, 2000/06/03 Tidy Release 30 Apr 2000
 * @version 1.10, 2000/07/22 Tidy Release 8 Jul 2000
 * @version 1.11, 2000/08/16 Tidy Release 4 Aug 2000
 */
public class Clean {

    /** Font size table indexed by the digit of an absolute size ("0".."6"); index 3 means "leave as is". */
    private static final String[] FONT_SIZES = {
        "60%", "70%", "80%", null, "120%", "150%", "200%"
    };

    /** Counter used to generate unique class names ("c1", "c2", ...). */
    private int classNum = 1;

    private TagTable tt;

    public Clean(TagTable tt) {
        this.tt = tt;
    }

    /**
     * Inserts a property into a list kept sorted by property name.
     * If a property with the same name already exists the list is left
     * untouched, i.e. the first definition wins.
     *
     * @param props head of the sorted list, may be null
     * @param name  property name
     * @param value property value
     * @return the (possibly new) head of the list
     */
    private StyleProp insertProperty(StyleProp props, String name, String value) {
        StyleProp first = props;
        StyleProp prev = null;
        StyleProp prop;

        while (props != null) {
            int cmp = props.name.compareTo(name);

            if (cmp == 0) {
                /* this property is already defined, ignore new value */
                return first;
            }

            if (cmp > 0) {
                /* props.name > name: insert before this entry */
                prop = new StyleProp(name, value, props);

                if (prev != null) {
                    prev.next = prop;
                } else {
                    first = prop;
                }

                return first;
            }

            prev = props;
            props = props.next;
        }

        /* larger than every existing name (or empty list): append */
        prop = new StyleProp(name, value);

        if (prev != null) {
            prev.next = prop;
        } else {
            first = prop;
        }

        return first;
    }

    /**
     * Parses a style string of the form "name: value; name: value" and
     * inserts each property into the sorted list headed by prop.
     * Leading spaces before a name and before a value are skipped; parsing
     * stops at the first name that is not followed by ':'.
     *
     * @param prop  head of an existing sorted list, may be null
     * @param style style string to parse; null is treated as empty
     * @return head of the resulting list
     */
    private StyleProp createProps(StyleProp prop, String style) {
        if (style == null) {
            return prop;
        }

        int length = style.length();
        int nameStart = 0;

        while (nameStart < length) {
            while (nameStart < length && style.charAt(nameStart) == ' ') {
                ++nameStart;
            }

            int nameEnd = style.indexOf(':', nameStart);

            if (nameEnd < 0) {
                break;
            }

            int valueStart = nameEnd + 1;

            while (valueStart < length && style.charAt(valueStart) == ' ') {
                ++valueStart;
            }

            int valueEnd = style.indexOf(';', valueStart);
            boolean more = valueEnd >= 0;

            if (!more) {
                valueEnd = length;
            }

            prop = insertProperty(prop,
                                  style.substring(nameStart, nameEnd),
                                  style.substring(valueStart, valueEnd));

            if (!more) {
                break;
            }

            nameStart = valueEnd + 1;
        }

        return prop;
    }

    /**
     * Serializes a property list as "name: value; name: value".
     *
     * @param props head of the list, may be null
     * @return the style string, empty for an empty list
     */
    private String createPropString(StyleProp props) {
        StringBuffer style = new StringBuffer();

        for (StyleProp prop = props; prop != null; prop = prop.next) {
            style.append(prop.name).append(": ").append(prop.value);

            if (prop.next != null) {
                style.append("; ");
            }
        }

        return style.toString();
    }

    /**
     * Creates a string with the properties of style merged with the
     * given property; existing properties take precedence.
     */
    private String addProperty(String style, String property) {
        return mergeProperties(style, property);
    }

    /**
     * Generates a fresh class name. The tag argument is currently unused.
     */
    private String gensymClass(String tag) {
        return "c" + classNum++;
    }

    /**
     * Looks up the class name registered in lexer.styles for the given
     * tag/properties pair, registering a new one when none matches.
     */
    private String findStyle(Lexer lexer, String tag, String properties) {
        Style style;

        for (style = lexer.styles; style != null; style = style.next) {
            if (style.tag.equals(tag) && style.properties.equals(properties)) {
                return style.tagClass;
            }
        }

        style = new Style(tag, gensymClass(tag), properties, lexer.styles);
        lexer.styles = style;
        return style.tagClass;
    }

    /*
     Find style attribute in node, and replace it
     by corresponding class attribute. Search for
     class in style dictionary otherwise gensym
     new class and add to dictionary.

     If the node already has a class attribute the new
     class name is appended to it after a space.
    */
    private void style2Rule(Lexer lexer, Node node) {
        AttVal styleattr = node.getAttrByName("style");

        if (styleattr == null) {
            return;
        }

        String classname = findStyle(lexer, node.element, styleattr.value);
        AttVal classattr = node.getAttrByName("class");

        if (classattr != null) {
            /* append class name after a space and drop the style attribute */
            classattr.value = classattr.value + " " + classname;
            node.removeAttribute(styleattr);
        } else {
            /* reuse style attribute for class attribute */
            styleattr.attribute = "class";
            styleattr.value = classname;
        }
    }

    /** Emits "selector { color: value }" into the lexer buffer; does nothing for a null color. */
    private void addColorRule(Lexer lexer, String selector, String color) {
        if (color != null) {
            lexer.addStringLiteral(selector);
            lexer.addStringLiteral(" { color: ");
            lexer.addStringLiteral(color);
            lexer.addStringLiteral(" }\n");
        }
    }

    /**
     * Removes the named attribute from body and returns the value it had;
     * returns null when the attribute is absent.
     */
    private String detachAttrValue(Node body, String name) {
        AttVal attr = body.getAttrByName(name);

        if (attr == null) {
            return null;
        }

        String value = attr.value;
        attr.value = null;
        body.removeAttribute(attr);
        return value;
    }

    /** Turns a body link-colour attribute into a colour rule for selector and removes the attribute. */
    private void moveLinkColor(Lexer lexer, Node body, String name, String selector) {
        AttVal attr = body.getAttrByName(name);

        if (attr != null) {
            addColorRule(lexer, selector, attr.value);
            body.removeAttribute(attr);
        }
    }

    /*
     move presentation attribs from body to style element

     background="foo" -> body { background-image: url(foo) }
     bgcolor="foo"    -> body { background-color: foo }
     text="foo"       -> body { color: foo }
     link="foo"       -> :link { color: foo }
     vlink="foo"      -> :visited { color: foo }
     alink="foo"      -> :active { color: foo }
    */
    private void cleanBodyAttrs(Lexer lexer, Node body) {
        String bgurl = detachAttrValue(body, "background");
        String bgcolor = detachAttrValue(body, "bgcolor");
        String color = detachAttrValue(body, "text");

        if (bgurl != null || bgcolor != null || color != null) {
            lexer.addStringLiteral(" body {\n");

            if (bgurl != null) {
                lexer.addStringLiteral(" background-image: url(");
                lexer.addStringLiteral(bgurl);
                lexer.addStringLiteral(");\n");
            }

            if (bgcolor != null) {
                lexer.addStringLiteral(" background-color: ");
                lexer.addStringLiteral(bgcolor);
                lexer.addStringLiteral(";\n");
            }

            if (color != null) {
                lexer.addStringLiteral(" color: ");
                lexer.addStringLiteral(color);
                lexer.addStringLiteral(";\n");
            }

            lexer.addStringLiteral(" }\n");
        }

        moveLinkColor(lexer, body, "link", " :link");
        moveLinkColor(lexer, body, "vlink", " :visited");
        moveLinkColor(lexer, body, "alink", " :active");
    }

    /**
     * Returns false (and flags Report.USING_BODY in lexer.badLayout) when
     * the body element carries any presentation attribute.
     */
    private boolean niceBody(Lexer lexer, Node doc) {
        Node body = doc.findBody(lexer.configuration.tt);

        if (body != null) {
            if (body.getAttrByName("background") != null
                || body.getAttrByName("bgcolor") != null
                || body.getAttrByName("text") != null
                || body.getAttrByName("link") != null
                || body.getAttrByName("vlink") != null
                || body.getAttrByName("alink") != null) {
                lexer.badLayout |= Report.USING_BODY;
                return false;
            }
        }

        return true;
    }

    /* create style element using rules from dictionary */
    private void createStyleElement(Lexer lexer, Node doc) {
        Node node, head, body;
        Style style;
        AttVal av;

        /* nothing to emit: no style rules and no body attributes to move */
        if (lexer.styles == null && niceBody(lexer, doc)) {
            return;
        }

        node = lexer.newNode(Node.StartTag, null, 0, 0, "style");
        node.implicit = true;

        /* insert type attribute */
        av = new AttVal(null, null, '"', "type", "text/css");
        av.dict = AttributeTable.getDefaultAttributeTable().findAttribute(av);
        node.attributes = av;

        body = doc.findBody(lexer.configuration.tt);

        lexer.txtstart = lexer.lexsize;

        if (body != null) {
            cleanBodyAttrs(lexer, body);
        }

        for (style = lexer.styles; style != null; style = style.next) {
            lexer.addCharToLexer(' ');
            lexer.addStringLiteral(style.tag);
            lexer.addCharToLexer('.');
            lexer.addStringLiteral(style.tagClass);
            lexer.addCharToLexer(' ');
            lexer.addCharToLexer('{');
            lexer.addStringLiteral(style.properties);
            lexer.addCharToLexer('}');
            lexer.addCharToLexer('\n');
        }

        lexer.txtend = lexer.lexsize;

        Node.insertNodeAtEnd(node,
                             lexer.newNode(Node.TextNode,
                                           lexer.lexbuf,
                                           lexer.txtstart,
                                           lexer.txtend));

        /*
         now insert style element into document head

         doc is root node. search its children for html node
         the head node should be first child of html node
        */
        head = doc.findHEAD(lexer.configuration.tt);

        if (head != null) {
            Node.insertNodeAtEnd(head, node);
        }
    }

    /* ensure bidirectional links are consistent */
    private void fixNodeLinks(Node node) {
        if (node.prev != null) {
            node.prev.next = node;
        } else {
            node.parent.content = node;
        }

        if (node.next != null) {
            node.next.prev = node;
        } else {
            node.parent.last = node;
        }

        for (Node child = node.content; child != null; child = child.next) {
            child.parent = node;
        }
    }

    /*
     used to strip child of node when
     the node has one and only one child
    */
    private void stripOnlyChild(Node node) {
        Node only = node.content;

        node.content = only.content;
        node.last = only.last;
        only.content = null;

        for (Node child = node.content; child != null; child = child.next) {
            child.parent = node;
        }
    }

    /*
     used to strip font start and end tags: the element's children take
     its place in the parent. pnode receives the first exposed child, or
     the element's next sibling when it had no content.
    */
    private void discardContainer(Node element, MutableObject pnode) {
        Node parent = element.parent;

        if (element.content != null) {
            element.last.next = element.next;

            if (element.next != null) {
                element.next.prev = element.last;
            } else {
                parent.last = element.last;
            }

            if (element.prev != null) {
                element.content.prev = element.prev;
                element.prev.next = element.content;
            } else {
                parent.content = element.content;
            }

            for (Node node = element.content; node != null; node = node.next) {
                node.parent = parent;
            }

            pnode.setObject(element.content);
        } else {
            if (element.next != null) {
                element.next.prev = element.prev;
            } else {
                parent.last = element.prev;
            }

            if (element.prev != null) {
                element.prev.next = element.next;
            } else {
                parent.content = element.next;
            }

            pnode.setObject(element.next);
        }

        element.next = null;
        element.content = null;
    }

    /** Returns the first attribute of node named "style", or null. */
    private AttVal findStyleAttr(Node node) {
        for (AttVal av = node.attributes; av != null; av = av.next) {
            /* constant-first comparison: attribute names may be null */
            if ("style".equals(av.attribute)) {
                return av;
            }
        }

        return null;
    }

    /*
     Add style property to element, creating style
     attribute as needed and adding ; delimiter
    */
    private void addStyleProperty(Node node, String property) {
        AttVal av = findStyleAttr(node);

        if (av != null) {
            /* style attribute already exists: merge the property in */
            av.value = addProperty(av.value, property);
        } else {
            /* create new style attribute at the head of the list */
            av = new AttVal(node.attributes, null, '"', "style", property);
            av.dict = AttributeTable.getDefaultAttributeTable().findAttribute(av);
            node.attributes = av;
        }
    }

    /*
     Create new string that consists of the
     combined style properties in s1 and s2

     To merge property lists, we build a linked
     list of property/values and insert properties
     into the list in order; when both define the
     same property name the value from s1 is kept.
    */
    private String mergeProperties(String s1, String s2) {
        StyleProp prop = createProps(null, s1);
        prop = createProps(prop, s2);
        return createPropString(prop);
    }

    /**
     * Merges the style attribute of child into that of node. When node has
     * no style value a new style attribute holding the child's style is
     * prepended to node's attributes.
     */
    private void mergeStyles(Node node, Node child) {
        AttVal childStyle = findStyleAttr(child);
        AttVal nodeStyle = findStyleAttr(node);
        String s2 = (childStyle != null) ? childStyle.value : null;
        String s1 = (nodeStyle != null) ? nodeStyle.value : null;

        if (s1 != null) {
            if (s2 != null) {
                /* merge styles from both */
                nodeStyle.value = mergeProperties(s1, s2);
            }
        } else if (s2 != null) {
            /* copy style of child */
            AttVal av = new AttVal(node.attributes, null, '"', "style", s2);
            av.dict = AttributeTable.getDefaultAttributeTable().findAttribute(av);
            node.attributes = av;
        }
    }

    /** Returns factor raised to the power steps as a truncated percentage, e.g. "64%". */
    private String scaledPercent(int steps, double factor) {
        double x = 1.0;

        for (int n = steps; n > 0; --n) {
            x *= factor;
        }

        x *= 100.0;
        return "" + (int) x + "%";
    }

    /**
     * Maps an HTML font size value to a CSS font-size value.
     * Absolute sizes "0".."6" use a fixed table (null for "3"), "-N" shrinks
     * by 0.8 per step, and a digit in second position (e.g. "+N") grows by
     * 1.2 per step. Anything else yields "smaller" or "larger".
     */
    private String fontSize2Name(String size) {
        if (size.length() > 0 && '0' <= size.charAt(0) && size.charAt(0) <= '6') {
            return FONT_SIZES[size.charAt(0) - '0'];
        }

        if (size.length() > 0 && size.charAt(0) == '-') {
            if (size.length() > 1 && '0' <= size.charAt(1) && size.charAt(1) <= '6') {
                return scaledPercent(size.charAt(1) - '0', 0.8);
            }

            return "smaller"; /* "70%" */
        }

        if (size.length() > 1 && '0' <= size.charAt(1) && size.charAt(1) <= '6') {
            return scaledPercent(size.charAt(1) - '0', 1.2);
        }

        return "larger"; /* "140%" */
    }

    private void addFontFace(Node node, String face) {
        addStyleProperty(node, "font-family: " + face);
    }

    /**
     * Applies a font size to node. A paragraph with size 6, 5 or 4 is
     * turned into h1, h2 or h3 respectively; otherwise a font-size
     * property is added when the size maps to one.
     */
    private void addFontSize(Node node, String size) {
        if (node.tag == tt.tagP) {
            String heading = null;

            if (size.equals("6")) {
                heading = "h1";
            } else if (size.equals("5")) {
                heading = "h2";
            } else if (size.equals("4")) {
                heading = "h3";
            }

            if (heading != null) {
                node.element = heading;
                tt.findTag(node);
                return;
            }
        }

        String value = fontSize2Name(size);

        if (value != null) {
            addStyleProperty(node, "font-size: " + value);
        }
    }

    private void addFontColor(Node node, String color) {
        addStyleProperty(node, "color: " + color);
    }

    private void addAlign(Node node, String align) {
        /* force alignment value to lower case, independent of the default locale */
        addStyleProperty(node, "text-align: " + align.toLowerCase(Locale.ENGLISH));
    }

    /*
     add style properties to node corresponding to
     the font face, size and color attributes;
     attributes without a value are ignored
    */
    private void addFontStyles(Node node, AttVal av) {
        for (; av != null; av = av.next) {
            if (av.value == null) {
                continue;
            }

            if ("face".equals(av.attribute)) {
                addFontFace(node, av.value);
            } else if ("size".equals(av.attribute)) {
                addFontSize(node, av.value);
            } else if ("color".equals(av.attribute)) {
                addFontColor(node, av.value);
            }
        }
    }

    /*
     Symptom: <p align=center>
     Action: <p style="text-align: center">

     The first align attribute is unlinked from the node; a text-align
     property is added only when it had a value.
    */
    private void textAlign(Lexer lexer, Node node) {
        AttVal prev = null;

        for (AttVal av = node.attributes; av != null; av = av.next) {
            if ("align".equals(av.attribute)) {
                if (prev != null) {
                    prev.next = av.next;
                } else {
                    node.attributes = av.next;
                }

                if (av.value != null) {
                    addAlign(node, av.value);
                }

                break;
            }

            prev = av;
        }
    }

    /*
     The clean up rules use the pnode argument to return the
     next node when the original node has been deleted
    */

    /*
     Symptom: <dir> <li> where <li> is only child
     Action: coerce <dir> <li> to <div> with indent.

     Also applies to ul and ol; the single li must be implicit.
    */
    private boolean dir2Div(Lexer lexer, Node node, MutableObject pnode) {
        if (node.tag != tt.tagDir && node.tag != tt.tagUl && node.tag != tt.tagOl) {
            return false;
        }

        Node child = node.content;

        /* child must exist, have no peers, and be an implicit li */
        if (child == null || child.next != null) {
            return false;
        }

        if (child.tag != tt.tagLi || !child.implicit) {
            return false;
        }

        /* coerce dir to div */
        node.tag = tt.tagDiv;
        node.element = "div";
        addStyleProperty(node, "margin-left: 2em");
        stripOnlyChild(node);
        return true;
    }

    /*
     Symptom: <center>
     Action: replace <center> by <div style="text-align: center">

     With the DropFontTags option the center element is discarded
     instead and a br is inserted after its former content.
    */
    private boolean center2Div(Lexer lexer, Node node, MutableObject pnode) {
        if (node.tag != tt.tagCenter) {
            return false;
        }

        if (lexer.configuration.DropFontTags) {
            if (node.content != null) {
                Node last = node.last;
                Node parent = node.parent;

                discardContainer(node, pnode);

                /* insert br after the last exposed child */
                node = lexer.inferredTag("br");

                if (last.next != null) {
                    last.next.prev = node;
                }

                node.next = last.next;
                last.next = node;
                node.prev = last;

                if (parent.last == last) {
                    parent.last = node;
                }

                node.parent = parent;
            } else {
                Node prev = node.prev;
                Node next = node.next;
                Node parent = node.parent;

                discardContainer(node, pnode);

                /* put br where the empty center element used to be */
                node = lexer.inferredTag("br");
                node.next = next;
                node.prev = prev;
                node.parent = parent;

                if (next != null) {
                    next.prev = node;
                } else {
                    parent.last = node;
                }

                if (prev != null) {
                    prev.next = node;
                } else {
                    parent.content = node;
                }
            }

            return true;
        }

        node.tag = tt.tagDiv;
        node.element = "div";
        addStyleProperty(node, "text-align: center");
        return true;
    }

    /*
     Symptom <div><div>...</div></div>
     Action: merge the two divs

     This is useful after nested <dir>s used by Word
     for indenting have been converted to <div>s
    */
    private boolean mergeDivs(Lexer lexer, Node node, MutableObject pnode) {
        if (node.tag != tt.tagDiv) {
            return false;
        }

        Node child = node.content;

        if (child == null || child.tag != tt.tagDiv || child.next != null) {
            return false;
        }

        mergeStyles(node, child);
        stripOnlyChild(node);
        return true;
    }

    /*
     Symptom: <ul><li><ul>...</ul></li></ul>
     Action: discard outer list
    */
    private boolean nestedList(Lexer lexer, Node node, MutableObject pnode) {
        if (node.tag != tt.tagUl && node.tag != tt.tagOl) {
            return false;
        }

        Node child = node.content;

        /* child must exist and have no peers */
        if (child == null || child.next != null) {
            return false;
        }

        Node list = child.content;

        if (list == null || list.tag != node.tag) {
            return false;
        }

        pnode.setObject(node.next);

        /* move inner list node into position of outer node */
        list.prev = node.prev;
        list.next = node.next;
        list.parent = node.parent;
        fixNodeLinks(list);

        /* get rid of outer ul and its li */
        child.content = null;
        node.content = null;
        node.next = null;

        /*
         If prev node was a list the chances are this node
         should be appended to that list. Word has no way of
         recognizing nested lists and just uses indents
        */
        if (list.prev != null) {
            node = list;
            list = node.prev;

            /* an empty previous list has no item to append to, so leave it alone */
            if ((list.tag == tt.tagUl || list.tag == tt.tagOl) && list.last != null) {
                list.next = node.next;

                if (list.next != null) {
                    list.next.prev = list;
                }

                child = list.last; /* <li> */

                node.parent = child;
                node.next = null;
                node.prev = child.last;
                fixNodeLinks(node);
            }
        }

        cleanNode(lexer, node);
        return true;
    }

    /*
     Symptom: the only child of a block-level element is a
     presentation element such as B, I or FONT

     Action: add style "font-weight: bold" to the block and
     strip the element, leaving its children.

     example:

       <p>
         <b><font face="Arial" size="6">Draft Recommended Practice</font></b>
       </p>

     becomes:

       <p style="font-weight: bold; font-family: Arial; font-size: 6">
         Draft Recommended Practice
       </p>

     This code also replaces the align attribute by a style attribute.
     However, to avoid CSS problems with Navigator 4, this isn't done
     for the elements: caption, tr and table
    */
    private boolean blockStyle(Lexer lexer, Node node, MutableObject pnode) {
        /* elements without a dictionary entry have no content model to test */
        if (node.tag == null) {
            return false;
        }

        if ((node.tag.model & (Dict.CM_BLOCK | Dict.CM_LIST | Dict.CM_DEFLIST | Dict.CM_TABLE)) == 0) {
            return false;
        }

        if (node.tag == tt.tagTable || node.tag == tt.tagTr || node.tag == tt.tagLi) {
            return false;
        }

        /* check for align attribute */
        if (node.tag != tt.tagCaption) {
            textAlign(lexer, node);
        }

        Node child = node.content;

        /* child must exist and have no peers */
        if (child == null || child.next != null) {
            return false;
        }

        if (child.tag == tt.tagB) {
            mergeStyles(node, child);
            addStyleProperty(node, "font-weight: bold");
            stripOnlyChild(node);
            return true;
        }

        if (child.tag == tt.tagI) {
            mergeStyles(node, child);
            addStyleProperty(node, "font-style: italic");
            stripOnlyChild(node);
            return true;
        }

        if (child.tag == tt.tagFont) {
            mergeStyles(node, child);
            addFontStyles(node, child.attributes);
            stripOnlyChild(node);
            return true;
        }

        return false;
    }

    /* the only child of table cell or an inline element such as em */
    private boolean inlineStyle(Lexer lexer, Node node, MutableObject pnode) {
        /* elements without a dictionary entry have no content model to test */
        if (node.tag == null || node.tag == tt.tagFont) {
            return false;
        }

        if ((node.tag.model & (Dict.CM_INLINE | Dict.CM_ROW)) == 0) {
            return false;
        }

        Node child = node.content;

        /* child must exist and have no peers */
        if (child == null || child.next != null) {
            return false;
        }

        if (child.tag == tt.tagB && lexer.configuration.LogicalEmphasis) {
            mergeStyles(node, child);
            addStyleProperty(node, "font-weight: bold");
            stripOnlyChild(node);
            return true;
        }

        if (child.tag == tt.tagI && lexer.configuration.LogicalEmphasis) {
            mergeStyles(node, child);
            addStyleProperty(node, "font-style: italic");
            stripOnlyChild(node);
            return true;
        }

        if (child.tag == tt.tagFont) {
            mergeStyles(node, child);
            addFontStyles(node, child.attributes);
            stripOnlyChild(node);
            return true;
        }

        return false;
    }

    /*
     Replace font elements by span elements, deleting
     the font element's attributes and replacing them
     by a single style attribute.
    */
    private boolean font2Span(Lexer lexer, Node node, MutableObject pnode) {
        if (node.tag != tt.tagFont) {
            return false;
        }

        if (lexer.configuration.DropFontTags) {
            discardContainer(node, pnode);
            return false;
        }

        /* if FONT is only child of parent element then leave alone */
        if (node.parent.content == node && node.next == null) {
            return false;
        }

        addFontStyles(node, node.attributes);

        /* extract style attribute and drop the rest */
        AttVal style = null;
        AttVal av = node.attributes;

        while (av != null) {
            AttVal next = av.next;

            if ("style".equals(av.attribute)) {
                av.next = null;
                style = av;
            }

            av = next;
        }

        node.attributes = style;

        node.tag = tt.tagSpan;
        node.element = "span";

        return true;
    }

    /*
     Applies all matching rules to a node, repeating until no rule
     fires. Returns the node that now stands in the original node's
     place, which may be null when a rule removed the node and nothing
     follows it.
    */
    private Node cleanNode(Lexer lexer, Node node) {
        Node next;
        MutableObject o = new MutableObject();

        /* a rule may leave no replacement node, hence the null test */
        for (next = node; node != null && node.isElement(); node = next) {
            o.setObject(next);

            /* rules are tried in order; evaluation stops at the first that fires */
            boolean changed = dir2Div(lexer, node, o)
                || nestedList(lexer, node, o)
                || center2Div(lexer, node, o)
                || mergeDivs(lexer, node, o)
                || blockStyle(lexer, node, o)
                || inlineStyle(lexer, node, o)
                || font2Span(lexer, node, o);

            next = (Node) o.getObject();

            if (!changed) {
                break;
            }
        }

        return next;
    }

    /** Cleans the children of node first, then node itself; returns the result of cleaning node. */
    private Node createStyleProperties(Lexer lexer, Node node) {
        Node child = node.content;

        while (child != null) {
            child = createStyleProperties(lexer, child);

            /* the child was removed and had no successor: nothing left to visit */
            if (child == null) {
                break;
            }

            child = child.next;
        }

        return cleanNode(lexer, node);
    }

    /** Replaces style attributes by class attributes throughout the subtree, children first. */
    private void defineStyleRules(Lexer lexer, Node node) {
        for (Node child = node.content; child != null; child = child.next) {
            defineStyleRules(lexer, child);
        }

        style2Rule(lexer, node);
    }

    /**
     * Applies the clean-up rules to the whole document and, unless the
     * MakeClean option is set, replaces the resulting style attributes by
     * classes backed by a generated style element.
     */
    public void cleanTree(Lexer lexer, Node doc) {
        doc = createStyleProperties(lexer, doc);

        if (!lexer.configuration.MakeClean) {
            defineStyleRules(lexer, doc);
            createStyleElement(lexer, doc);
        }
    }

    /* simplifies <b><b> ... </b> ...</b> etc. */
    public void nestedEmphasis(Node node) {
        MutableObject o = new MutableObject();

        while (node != null) {
            Node next = node.next;

            if ((node.tag == tt.tagB || node.tag == tt.tagI)
                && node.parent != null && node.parent.tag == node.tag) {
                /* strip redundant inner element */
                o.setObject(next);
                discardContainer(node, o);
                node = (Node) o.getObject();
                continue;
            }

            if (node.content != null) {
                nestedEmphasis(node.content);
            }

            node = next;
        }
    }

    /* replace i by em and b by strong */
    public void emFromI(Node node) {
        for (; node != null; node = node.next) {
            if (node.tag == tt.tagI) {
                node.element = tt.tagEm.name;
                node.tag = tt.tagEm;
            } else if (node.tag == tt.tagB) {
                node.element = tt.tagStrong.name;
                node.tag = tt.tagStrong;
            }

            if (node.content != null) {
                emFromI(node.content);
            }
        }
    }

    /*
     Some people use dir or ul without an li
     to indent the content. The pattern to
     look for is a list with a single implicit
     li. This is recursively replaced by an
     implicit blockquote.
    */
    public void list2BQ(Node node) {
        for (; node != null; node = node.next) {
            if (node.content != null) {
                list2BQ(node.content);
            }

            if (node.tag != null && node.tag.parser == ParserImpl.getParseList()
                && node.hasOneChild() && node.content.implicit) {
                stripOnlyChild(node);
                node.element = tt.tagBlockquote.name;
                node.tag = tt.tagBlockquote;
                node.implicit = true;
            }
        }
    }

    /*
     Replace implicit blockquote by div with an indent
     taking care to reduce nested blockquotes to a single
     div with the indent set to match the nesting depth
    */
    public void bQ2Div(Node node) {
        for (; node != null; node = node.next) {
            if (node.tag == tt.tagBlockquote && node.implicit) {
                int indent = 1;

                while (node.hasOneChild()
                       && node.content.tag == tt.tagBlockquote
                       && node.implicit) {
                    ++indent;
                    stripOnlyChild(node);
                }

                if (node.content != null) {
                    bQ2Div(node.content);
                }

                String indentBuf = "margin-left: " + (2 * indent) + "em";

                node.element = tt.tagDiv.name;
                node.tag = tt.tagDiv;
                node.addAttribute("style", indentBuf);
            } else if (node.content != null) {
                bQ2Div(node.content);
            }
        }
    }

    /* node is <![if ...]>; prune up to the matching <![endif]> */
    public Node pruneSection(Lexer lexer, Node node) {
        for (;;) {
            /* discard node and returns next */
            node = Node.discardElement(node);

            if (node == null) {
                return null;
            }

            if (node.type == Node.SectionTag) {
                if ((Lexer.getString(node.textarray, node.start, 2)).equals("if")) {
                    /* nested section */
                    node = pruneSection(lexer, node);
                    continue;
                }

                if ((Lexer.getString(node.textarray, node.start, 5)).equals("endif")) {
                    node = Node.discardElement(node);
                    break;
                }
            }
        }

        return node;
    }

    /** Removes all section tags from the tree, together with everything inside "if" sections. */
    public void dropSections(Lexer lexer, Node node) {
        while (node != null) {
            if (node.type == Node.SectionTag) {
                /* prune up to matching endif */
                if ((Lexer.getString(node.textarray, node.start, 2)).equals("if")) {
                    node = pruneSection(lexer, node);
                    continue;
                }

                /* discard others as well */
                node = Node.discardElement(node);
                continue;
            }

            if (node.content != null) {
                dropSections(lexer, node.content);
            }

            node = node.next;
        }
    }

    /**
     * Unlinks class, style, lang and "x:"-prefixed attributes from node,
     * plus height/width on td, tr and th. A class attribute whose value is
     * "Code" is kept because it marks preformatted text.
     */
    public void purgeAttributes(Node node) {
        AttVal attr = node.attributes;
        AttVal prev = null;

        while (attr != null) {
            AttVal next = attr.next;
            String name = attr.attribute;
            boolean purge = false;

            if (name != null) {
                /* special check for class="Code" denoting pre text */
                boolean codeClass = attr.value != null
                    && name.equals("class")
                    && attr.value.equals("Code");

                if (!codeClass) {
                    boolean cellOrRow = node.tag == tt.tagTd
                        || node.tag == tt.tagTr
                        || node.tag == tt.tagTh;

                    purge = name.equals("class")
                        || name.equals("style")
                        || name.equals("lang")
                        || name.startsWith("x:")
                        || ((name.equals("height") || name.equals("width")) && cellOrRow);
                }
            }

            if (purge) {
                if (prev != null) {
                    prev.next = next;
                } else {
                    node.attributes = next;
                }
            } else {
                prev = attr;
            }

            attr = next;
        }
    }

    /* Word2000 uses span excessively, so we strip span out */
    public Node stripSpan(Lexer lexer, Node span) {
        Node node;
        Node prev = null;
        Node content;

        /*
         deal with span elements that have content
         by splicing the content in place of the span
         after having processed it
        */
        cleanWord2000(lexer, span.content);
        content = span.content;

        if (span.prev != null) {
            prev = span.prev;
        } else if (content != null) {
            /* no previous sibling: the first child goes in front of the span */
            node = content;
            content = content.next;
            Node.removeNode(node);
            Node.insertNodeBeforeElement(span, node);
            prev = node;
        }

        while (content != null) {
            node = content;
            content = content.next;
            Node.removeNode(node);
            Node.insertNodeAfterElement(prev, node);
            prev = node;
        }

        if (span.next == null) {
            span.parent.last = prev;
        }

        node = span.next;
        span.content = null;
        Node.discardElement(span);
        return node;
    }

    /*
     map non-breaking spaces to regular spaces

     NOTE(review): node.end is not adjusted after the text is rewritten
     in place, and the raw array element is compared against 0x7F
     without masking - confirm both against the element type of
     Node.textarray and the contract of PPrint.getUTF8/putUTF8.
    */
    private void normalizeSpaces(Lexer lexer, Node node) {
        for (; node != null; node = node.next) {
            if (node.content != null) {
                normalizeSpaces(lexer, node.content);
            }

            if (node.type == Node.TextNode) {
                MutableInteger c = new MutableInteger();
                int p = node.start;

                for (int i = node.start; i < node.end; ++i) {
                    c.value = (int) node.textarray[i];

                    /* look for UTF-8 multibyte character */
                    if (c.value > 0x7F) {
                        i += PPrint.getUTF8(node.textarray, i, c);
                    }

                    if (c.value == 160) {
                        c.value = ' ';
                    }

                    p = PPrint.putUTF8(node.textarray, p, c.value);
                }
            }
        }
    }

    /*
     This is a major clean up to strip out all the extra stuff you get
     when you save as web page from Word 2000. It doesn't yet know what
     to do with VML tags, but these will appear as errors unless you
     declare them as new tags, such as o:p which needs to be declared
     as inline.
    */
    public void cleanWord2000(Lexer lexer, Node node) {
        /* used to build a list from a sequence of bulleted p's */
        Node list = null;

        while (node != null) {
            /* discard Word's style verbiage */
            if (node.tag == tt.tagStyle
                || node.tag == tt.tagMeta
                || node.type == Node.CommentTag) {
                node = Node.discardElement(node);
                continue;
            }

            /* strip out all span tags Word scatters so liberally! */
            if (node.tag == tt.tagSpan) {
                node = stripSpan(lexer, node);
                continue;
            }

            /* get rid of Word's xmlns attributes */
            if (node.tag == tt.tagHtml) {
                /* check that it's a Word 2000 document */
                if (node.getAttrByName("xmlns:o") == null) {
                    return;
                }
            }

            if (node.tag == tt.tagLink) {
                AttVal attr = node.getAttrByName("rel");

                if (attr != null && attr.value != null
                    && attr.value.equals("File-List")) {
                    node = Node.discardElement(node);
                    continue;
                }
            }

            /* discard empty paragraphs */
            if (node.content == null && node.tag == tt.tagP) {
                node = Node.discardElement(node);
                continue;
            }

            if (node.tag == tt.tagP) {
                AttVal attr = node.getAttrByName("class");

                /* map sequence of <p class="MsoListBullet"> to <ul><li>...</li></ul> */
                if (attr != null && attr.value != null
                    && attr.value.equals("MsoListBullet")) {
                    Node.coerceNode(lexer, node, tt.tagLi);

                    if (list == null || list.tag != tt.tagUl) {
                        list = lexer.inferredTag("ul");
                        Node.insertNodeBeforeElement(node, list);
                    }

                    purgeAttributes(node);

                    if (node.content != null) {
                        cleanWord2000(lexer, node.content);
                    }

                    /* remove node and append to contents of list */
                    Node.removeNode(node);
                    Node.insertNodeAtEnd(list, node);

                    /*
                     Resume with the sibling after the list and run the full
                     set of checks on it, so that the next bulleted paragraph
                     joins this list; it may be null at the end of the run.
                    */
                    node = list.next;
                    continue;
                }

                /* map sequence of <p class="Code"> to <pre>...</pre> */
                if (attr != null && attr.value != null
                    && attr.value.equals("Code")) {
                    Node br = lexer.newLineNode();

                    /*
                     NOTE(review): this also walks the following siblings of
                     node, not only its content - confirm that is intended.
                    */
                    normalizeSpaces(lexer, node);

                    if (list == null || list.tag != tt.tagPre) {
                        list = lexer.inferredTag("pre");
                        Node.insertNodeBeforeElement(node, list);
                    }

                    /* remove node and append to contents of list */
                    Node.removeNode(node);
                    Node.insertNodeAtEnd(list, node);
                    stripSpan(lexer, node);
                    Node.insertNodeAtEnd(list, br);

                    /* as above: re-examine the sibling following the pre */
                    node = list.next;
                    continue;
                }

                list = null;
            } else {
                list = null;
            }

            /* strip out style and class attributes */
            if (node.type == Node.StartTag || node.type == Node.StartEndTag) {
                purgeAttributes(node);
            }

            if (node.content != null) {
                cleanWord2000(lexer, node.content);
            }

            node = node.next;
        }
    }

    /** Returns true when the html element carries Word 2000's "xmlns:o" attribute. */
    public boolean isWord2000(Node root, TagTable tt) {
        Node html = root.findHTML(tt);

        return (html != null && html.getAttrByName("xmlns:o") != null);
    }
}