+++ /dev/null
-/*
- * @(#)Clean.java 1.11 2000/08/16
- *
- */
-
-package net.sourceforge.phpdt.tidy;
-
-/**
- *
- * Clean up misuse of presentation markup
- *
- * (c) 1998-2000 (W3C) MIT, INRIA, Keio University
- * See Tidy.java for the copyright notice.
- * Derived from <a href="http://www.w3.org/People/Raggett/tidy">
- * HTML Tidy Release 4 Aug 2000</a>
- *
- * @author Dave Raggett <dsr@w3.org>
- * @author Andy Quick <ac.quick@sympatico.ca> (translation to Java)
- * @version 1.0, 1999/05/22
- * @version 1.0.1, 1999/05/29
- * @version 1.1, 1999/06/18 Java Bean
- * @version 1.2, 1999/07/10 Tidy Release 7 Jul 1999
- * @version 1.3, 1999/07/30 Tidy Release 26 Jul 1999
- * @version 1.4, 1999/09/04 DOM support
- * @version 1.5, 1999/10/23 Tidy Release 27 Sep 1999
- * @version 1.6, 1999/11/01 Tidy Release 22 Oct 1999
- * @version 1.7, 1999/12/06 Tidy Release 30 Nov 1999
- * @version 1.8, 2000/01/22 Tidy Release 13 Jan 2000
- * @version 1.9, 2000/06/03 Tidy Release 30 Apr 2000
- * @version 1.10, 2000/07/22 Tidy Release 8 Jul 2000
- * @version 1.11, 2000/08/16 Tidy Release 4 Aug 2000
- */
-
-/*
- Filters from other formats such as Microsoft Word
- often make excessive use of presentation markup such
- as font tags, B, I, and the align attribute. By applying
- a set of production rules, it is straight forward to
- transform this to use CSS.
-
- Some rules replace some of the children of an element by
- style properties on the element, e.g.
-
- <p><b>...</b></p> -> <p style="font-weight: bold">...</p>
-
- Such rules are applied to the element's content and then
- to the element itself until no more rules apply.
- Having applied all the rules to an element, it will have
- a style attribute with one or more properties.
-
- Other rules strip the element they apply to, replacing
- it by style properties on the contents, e.g.
-
- <dir><li><p>...</li></dir> -> <p style="margin-left: 1em">...
-
- These rules are applied to an element before processing
- its content and replace the current element by the first
- element in the exposed content.
-
- After applying both sets of rules, you can replace the
- style attribute by a class value and style rule in the
- document head. To support this, an association of styles
- and class names is built.
-
- A naive approach is to rely on string matching to test
- when two property lists are the same. A better approach
- would be to first sort the properties before matching.
-*/
-
-public class Clean {
-
- private int classNum = 1; /* counter appended to the letter c by gensymClass to form generated class names */
-
- private TagTable tt; /* tag table whose tagXxx entries are compared against node.tag throughout this class */
-
- public Clean(TagTable tt) /* stores the given tag table; does nothing else */
- {
- this.tt = tt;
- }
-
-    /*
-      Insert a name/value pair into a property list that is kept
-      sorted by name (String.compareTo order). When a property of
-      the same name is already present the list is left untouched
-      and the new value is dropped. Returns the head of the list,
-      which changes only when the new entry becomes the first one.
-    */
-    private StyleProp insertProperty(StyleProp props, String name,
-                                     String value)
-    {
-        StyleProp before = null;
-        StyleProp cursor = props;
-
-        /* advance to the first entry whose name is not smaller than ours */
-        while (cursor != null)
-        {
-            int order = cursor.name.compareTo(name);
-
-            if (order == 0)
-                return props; /* already defined: keep the existing value */
-
-            if (order > 0)
-                break;
-
-            before = cursor;
-            cursor = cursor.next;
-        }
-
-        /* cursor is the successor of the new entry, or null at the tail */
-        StyleProp added = (cursor != null)
-            ? new StyleProp(name, value, cursor)
-            : new StyleProp(name, value);
-
-        if (before == null)
-            return added;
-
-        before.next = added;
-        return props;
-    }
-
-    /*
-      Parse a CSS declaration list of the form
-      "name: value; name: value" and insert every declaration into
-      the sorted property list prop by way of insertProperty.
-
-      Spaces in front of a name and in front of a value are skipped;
-      everything else (including trailing spaces) is kept verbatim.
-      Parsing stops at the first name that is not followed by ':'.
-
-      A null style string, e.g. from an attribute that is present
-      but has no value, contributes nothing instead of raising a
-      NullPointerException.
-
-      Returns the head of the resulting list.
-    */
-    private StyleProp createProps(StyleProp prop, String style)
-    {
-        if (style == null)
-            return prop;
-
-        int length = style.length();
-        int name_start = 0;
-
-        while (name_start < length)
-        {
-            while (name_start < length && style.charAt(name_start) == ' ')
-                ++name_start;
-
-            /* the name runs up to the next ':'; without one we are done */
-            int name_end = style.indexOf(':', name_start);
-
-            if (name_end < 0)
-                break;
-
-            int value_start = name_end + 1;
-
-            while (value_start < length && style.charAt(value_start) == ' ')
-                ++value_start;
-
-            /* the value runs up to the next ';' or to the end of the string */
-            int value_end = style.indexOf(';', value_start);
-            boolean more = value_end >= 0;
-
-            if (!more)
-                value_end = length;
-
-            prop = insertProperty(prop,
-                                  style.substring(name_start, name_end),
-                                  style.substring(value_start, value_end));
-
-            if (!more)
-                break;
-
-            name_start = value_end + 1;
-        }
-
-        return prop;
-    }
-
-    /*
-      Serialize a property list as "name: value; name: value".
-      An empty (null) list yields the empty string. The required
-      length is computed first so that the buffer is allocated once
-      instead of building a new String for every fragment.
-    */
-    private String createPropString(StyleProp props)
-    {
-        int len = 0;
-
-        /* two characters for ": " and two for "; " per property */
-        for (StyleProp prop = props; prop != null; prop = prop.next)
-            len += prop.name.length() + prop.value.length() + 4;
-
-        StringBuffer style = new StringBuffer(len);
-
-        for (StyleProp prop = props; prop != null; prop = prop.next)
-        {
-            style.append(prop.name).append(": ").append(prop.value);
-
-            /* separator between properties only, none after the last */
-            if (prop.next != null)
-                style.append("; ");
-        }
-
-        return style.toString();
-    }
-
-    /*
-      Build the style string that results from parsing style and
-      then property into one sorted property list. A declaration
-      already present in style wins over a declaration of the same
-      name in property.
-    */
-    private String addProperty(String style, String property)
-    {
-        StyleProp merged = createProps(createProps(null, style), property);
-        return createPropString(merged);
-    }
-
-    /*
-      Generate a fresh class name: the letter "c" followed by the
-      current value of classNum, which is then incremented.
-      The tag argument is not used.
-    */
-    private String gensymClass(String tag)
-    {
-        return "c" + classNum++;
-    }
-
-    /*
-      Look up the class name registered in lexer.styles for the
-      given tag name and property string. When there is no exact
-      match a new Style entry with a generated class name is put
-      at the front of lexer.styles and its class name is returned.
-    */
-    private String findStyle(Lexer lexer, String tag, String properties)
-    {
-        Style known = lexer.styles;
-
-        while (known != null)
-        {
-            boolean sameTag = known.tag.equals(tag);
-
-            if (sameTag && known.properties.equals(properties))
-                return known.tagClass;
-
-            known = known.next;
-        }
-
-        Style created = new Style(tag, gensymClass(tag), properties, lexer.styles);
-        lexer.styles = created;
-        return created.tagClass;
-    }
-
-    /*
-      Replace the style attribute of node, if it has one, by a
-      class attribute. The class name is looked up in the style
-      dictionary (see findStyle), which generates a new class when
-      the tag/property combination has not been seen before.
-
-      When the node already carries a class attribute the new class
-      name is appended to it after a space and the style attribute
-      is removed; otherwise the style attribute object itself is
-      turned into the class attribute.
-    */
-    private void style2Rule(Lexer lexer, Node node)
-    {
-        AttVal styleattr = node.getAttrByName("style");
-
-        if (styleattr == null)
-            return;
-
-        String classname = findStyle(lexer, node.element, styleattr.value);
-        AttVal classattr = node.getAttrByName("class");
-
-        if (classattr == null)
-        {
-            /* reuse style attribute for class attribute */
-            styleattr.attribute = "class";
-            styleattr.value = classname;
-            return;
-        }
-
-        classattr.value = classattr.value + " " + classname;
-        node.removeAttribute(styleattr);
-    }
-
-    /*
-      Append the rule "<selector> { color: <color> }" followed by a
-      newline to the lexer buffer. Nothing is written for a null color.
-    */
-    private void addColorRule(Lexer lexer, String selector, String color)
-    {
-        if (color == null)
-            return;
-
-        String[] pieces = { selector, " { color: ", color, " }\n" };
-
-        for (int i = 0; i < pieces.length; ++i)
-            lexer.addStringLiteral(pieces[i]);
-    }
-
-    /*
-      Move presentation attributes from body into style rules that
-      are appended to the lexer buffer:
-
-      background="foo" -> body { background-image: url(foo) }
-      bgcolor="foo" -> body { background-color: foo }
-      text="foo" -> body { color: foo }
-      link="foo" -> :link { color: foo }
-      vlink="foo" -> :visited { color: foo }
-      alink="foo" -> :active { color: foo }
-
-      Each attribute that is found is removed from body, whether
-      or not it had a value.
-    */
-    private void cleanBodyAttrs(Lexer lexer, Node body)
-    {
-        String bgurl = takeBodyAttr(body, "background");
-        String bgcolor = takeBodyAttr(body, "bgcolor");
-        String color = takeBodyAttr(body, "text");
-
-        if (bgurl != null || bgcolor != null || color != null)
-        {
-            lexer.addStringLiteral(" body {\n");
-
-            if (bgurl != null)
-            {
-                lexer.addStringLiteral(" background-image: url(");
-                lexer.addStringLiteral(bgurl);
-                lexer.addStringLiteral(");\n");
-            }
-
-            if (bgcolor != null)
-            {
-                lexer.addStringLiteral(" background-color: ");
-                lexer.addStringLiteral(bgcolor);
-                lexer.addStringLiteral(";\n");
-            }
-
-            if (color != null)
-            {
-                lexer.addStringLiteral(" color: ");
-                lexer.addStringLiteral(color);
-                lexer.addStringLiteral(";\n");
-            }
-
-            lexer.addStringLiteral(" }\n");
-        }
-
-        moveLinkColor(lexer, body, "link", " :link");
-        moveLinkColor(lexer, body, "vlink", " :visited");
-        moveLinkColor(lexer, body, "alink", " :active");
-    }
-
-    /*
-      Detach the named attribute from body and return the value it
-      had (null when the attribute is absent or has no value).
-    */
-    private String takeBodyAttr(Node body, String name)
-    {
-        AttVal attr = body.getAttrByName(name);
-
-        if (attr == null)
-            return null;
-
-        String value = attr.value;
-        attr.value = null;
-        body.removeAttribute(attr);
-        return value;
-    }
-
-    /*
-      Turn the named link colour attribute of body into a colour
-      rule for selector and remove the attribute.
-    */
-    private void moveLinkColor(Lexer lexer, Node body, String name, String selector)
-    {
-        AttVal attr = body.getAttrByName(name);
-
-        if (attr == null)
-            return;
-
-        addColorRule(lexer, selector, attr.value);
-        body.removeAttribute(attr);
-    }
-
-    /*
-      Returns true when the document has no body or the body has
-      none of the presentation attributes background, bgcolor,
-      text, link, vlink and alink. Otherwise the USING_BODY flag is
-      set in lexer.badLayout and false is returned.
-    */
-    private boolean niceBody(Lexer lexer, Node doc)
-    {
-        Node body = doc.findBody(lexer.configuration.tt);
-
-        if (body == null)
-            return true;
-
-        String[] presentational =
-        {
-            "background", "bgcolor", "text", "link", "vlink", "alink"
-        };
-
-        for (int i = 0; i < presentational.length; ++i)
-        {
-            if (body.getAttrByName(presentational[i]) != null)
-            {
-                lexer.badLayout |= Report.USING_BODY;
-                return false;
-            }
-        }
-
-        return true;
-    }
-
- /*
-  Create a style element (type text/css) using the rules from the
-  dictionary in lexer.styles, preceded by the rules that
-  cleanBodyAttrs derives from the body attributes, and append it
-  to the document head.
-
-  Nothing is done when the dictionary is empty and niceBody finds
-  no presentation attributes on the body. The rule text is
-  appended to the lexer buffer; the style element gets a single
-  text node covering the range txtstart..txtend of that buffer.
- */
- private void createStyleElement(Lexer lexer, Node doc)
- {
- Node node, head, body;
- Style style;
- AttVal av;
-
- if (lexer.styles == null && niceBody(lexer, doc))
- return;
-
- node = lexer.newNode(Node.StartTag, null, 0, 0, "style");
- node.implicit = true;
-
- /* insert type attribute */
- av = new AttVal(null, null, '"', "type", "text/css");
- av.dict = AttributeTable.getDefaultAttributeTable().findAttribute(av);
- node.attributes = av;
-
- body = doc.findBody(lexer.configuration.tt);
-
- lexer.txtstart = lexer.lexsize; /* rule text begins at the current end of the lexer buffer */
-
- if (body != null)
- cleanBodyAttrs(lexer, body); /* writes the body and link colour rules first */
-
- /* one rule per dictionary entry: tag.class {properties} */
- for (style = lexer.styles; style != null; style = style.next)
- {
- lexer.addCharToLexer(' ');
- lexer.addStringLiteral(style.tag);
- lexer.addCharToLexer('.');
- lexer.addStringLiteral(style.tagClass);
- lexer.addCharToLexer(' ');
- lexer.addCharToLexer('{');
- lexer.addStringLiteral(style.properties);
- lexer.addCharToLexer('}');
- lexer.addCharToLexer('\n');
- }
-
- lexer.txtend = lexer.lexsize; /* rule text ends here */
-
- Node.insertNodeAtEnd(node,
- lexer.newNode(Node.TextNode,
- lexer.lexbuf,
- lexer.txtstart,
- lexer.txtend));
-
- /*
- now insert style element into document head
-
- doc is root node. search its children for html node
- the head node should be first child of html node
- */
-
- head = doc.findHEAD(lexer.configuration.tt);
-
- if (head != null) /* without a head the new style element is not attached anywhere */
- Node.insertNodeAtEnd(head, node);
- }
-
-    /*
-      Make the neighbours, the parent and the children of node
-      point back at node, based on node's own prev, next, parent
-      and content fields. The parent is dereferenced whenever prev
-      or next is null.
-    */
-    private void fixNodeLinks(Node node)
-    {
-        Node before = node.prev;
-
-        if (before == null)
-            node.parent.content = node;
-        else
-            before.next = node;
-
-        Node after = node.next;
-
-        if (after == null)
-            node.parent.last = node;
-        else
-            after.prev = node;
-
-        Node child = node.content;
-
-        while (child != null)
-        {
-            child.parent = node;
-            child = child.next;
-        }
-    }
-
-    /*
-      Strip the child of node, for use when node has exactly one
-      child: the grandchildren are hoisted up to become the
-      children of node and the stripped child loses its content.
-    */
-    private void stripOnlyChild(Node node)
-    {
-        Node only = node.content;
-        Node grandchild = only.content;
-
-        node.content = grandchild;
-        node.last = only.last;
-        only.content = null;
-
-        while (grandchild != null)
-        {
-            grandchild.parent = node;
-            grandchild = grandchild.next;
-        }
-    }
-
- /*
-  Used to strip font start and end tags: element is unlinked from
-  its parent while its children are spliced into the position it
-  occupied among its siblings.
-
-  On return pnode holds the first spliced child or, when element
-  had no content, the former next sibling of element (possibly
-  null). Only next and content of element are cleared; its prev,
-  last and parent fields are left as they were.
- */
- private void discardContainer(Node element, MutableObject pnode)
- {
- Node node;
- Node parent = element.parent;
-
- if (element.content != null)
- {
- element.last.next = element.next; /* last child now precedes the old next sibling */
-
- if (element.next != null)
- {
- element.next.prev = element.last;
- element.last.next = element.next; /* repeats the assignment made before the if */
- }
- else
- parent.last = element.last;
-
- if (element.prev != null)
- {
- element.content.prev = element.prev;
- element.prev.next = element.content;
- }
- else
- parent.content = element.content;
-
- /* the children now belong to the parent of the discarded element */
- for (node = element.content; node != null; node = node.next)
- node.parent = parent;
-
- pnode.setObject(element.content);
- }
- else
- {
- /* no children: simply unlink element from the sibling chain */
- if (element.next != null)
- element.next.prev = element.prev;
- else
- parent.last = element.prev;
-
- if (element.prev != null)
- element.prev.next = element.next;
- else
- parent.content = element.next;
-
- pnode.setObject(element.next);
- }
-
- element.next = null;
- element.content = null;
- }
-
-    /*
-      Add a style property ("name: value") to node.
-
-      When node already has a style attribute the property is
-      merged into its value by addProperty, so a property of the
-      same name that is already there is kept. A style attribute
-      without a value simply receives the property. When there is
-      no style attribute a new one is created at the front of the
-      attribute list.
-
-      Attribute names are compared with the constant first so that
-      an attribute whose name is null is skipped rather than
-      causing a NullPointerException.
-    */
-    private void addStyleProperty(Node node, String property)
-    {
-        AttVal av = node.attributes;
-
-        while (av != null && !"style".equals(av.attribute))
-            av = av.next;
-
-        /* if style attribute already exists then insert property */
-        if (av != null)
-        {
-            av.value = (av.value == null)
-                ? property
-                : addProperty(av.value, property);
-            return;
-        }
-
-        /* else create new style attribute */
-        av = new AttVal(node.attributes, null, '"', "style", property);
-        av.dict = AttributeTable.getDefaultAttributeTable().findAttribute(av);
-        node.attributes = av;
-    }
-
-    /*
-      Create a new string that consists of the combined style
-      properties in s1 and s2.
-
-      Both strings are parsed into one linked list of
-      property/value pairs that is kept sorted by property name.
-      When the same property name occurs more than once the value
-      seen first is kept, so s1 takes precedence over s2.
-    */
-    private String mergeProperties(String s1, String s2)
-    {
-        StyleProp combined = createProps(null, s1);
-        combined = createProps(combined, s2);
-        return createPropString(combined);
-    }
-
-    /*
-      Merge the style of child into the style of node.
-
-      Nothing happens when child has no style value. When node has
-      a style value the two are combined with mergeProperties
-      (node's own properties win). When node has a style attribute
-      without a value, that attribute receives the child's style
-      instead of a second style attribute being added. When node has
-      no style attribute, a new one holding the child's style is put
-      at the front of node's attribute list.
-
-      Only the first style attribute of each node is considered.
-      Attributes whose name is null are skipped.
-    */
-    private void mergeStyles(Node node, Node child)
-    {
-        String childStyle = null;
-
-        for (AttVal av = child.attributes; av != null; av = av.next)
-        {
-            if ("style".equals(av.attribute))
-            {
-                childStyle = av.value;
-                break;
-            }
-        }
-
-        if (childStyle == null)
-            return;
-
-        AttVal own = null;
-
-        for (AttVal av = node.attributes; av != null; av = av.next)
-        {
-            if ("style".equals(av.attribute))
-            {
-                own = av;
-                break;
-            }
-        }
-
-        if (own == null) /* copy style of child */
-        {
-            own = new AttVal(node.attributes, null, '"', "style", childStyle);
-            own.dict = AttributeTable.getDefaultAttributeTable().findAttribute(own);
-            node.attributes = own;
-        }
-        else if (own.value == null) /* valueless style attribute: fill it in */
-        {
-            own.value = childStyle;
-        }
-        else /* merge styles from both */
-        {
-            own.value = mergeProperties(own.value, childStyle);
-        }
-    }
-
- private String fontSize2Name(String size)
- {
- /*
- String[] sizes =
- {
- "50%",
- "60%",
- "80%",
- null,
- "120%",
- "150%",
- "200%"
- };
- */
-
- String[] sizes =
- {
- "60%",
- "70%",
- "80%",
- null,
- "120%",
- "150%",
- "200%"
- };
- String buf;
-
- if (size.length() > 0 &&
- '0' <= size.charAt(0) && size.charAt(0) <= '6')
- {
- int n = size.charAt(0) - '0';
- return sizes[n];
- }
-
- if (size.length() > 0 && size.charAt(0) == '-')
- {
- if (size.length() > 1 &&
- '0' <= size.charAt(1) && size.charAt(1) <= '6')
- {
- int n = size.charAt(1) - '0';
- double x;
-
- for (x = 1.0; n > 0; --n)
- x *= 0.8;
-
- x *= 100.0;
- buf = "" + (int)x + "%";
-
- return buf;
- }
-
- return "smaller"; /*"70%"; */
- }
-
- if (size.length() > 1 &&
- '0' <= size.charAt(1) && size.charAt(1) <= '6')
- {
- int n = size.charAt(1) - '0';
- double x;
-
- for (x = 1.0; n > 0; --n)
- x *= 1.2;
-
- x *= 100.0;
- buf = "" + (int)x + "%";
-
- return buf;
- }
-
- return "larger"; /* "140%" */
- }
-
-    /* add a font-family property with the given face to node */
-    private void addFontFace(Node node, String face)
-    {
-        String declaration = "font-family: " + face;
-        addStyleProperty(node, declaration);
-    }
-
-    /*
-      Apply a font size attribute value to node.
-
-      On a p element the sizes "6", "5" and "4" turn the paragraph
-      into h1, h2 and h3 respectively (the tag is looked up again
-      through the tag table). Otherwise the size is translated by
-      fontSize2Name and added as a font-size property, unless the
-      translation is null.
-
-      A null size (size attribute without a value) is ignored.
-    */
-    private void addFontSize(Node node, String size)
-    {
-        if (size == null)
-            return;
-
-        if (node.tag == tt.tagP)
-        {
-            String heading = null;
-
-            if (size.equals("6"))
-                heading = "h1";
-            else if (size.equals("5"))
-                heading = "h2";
-            else if (size.equals("4"))
-                heading = "h3";
-
-            if (heading != null)
-            {
-                node.element = heading;
-                tt.findTag(node);
-                return;
-            }
-        }
-
-        String value = fontSize2Name(size);
-
-        if (value != null)
-            addStyleProperty(node, "font-size: " + value);
-    }
-
-    /* add a color property with the given colour value to node */
-    private void addFontColor(Node node, String color)
-    {
-        String declaration = "color: " + color;
-        addStyleProperty(node, declaration);
-    }
-
-    /*
-      Add a text-align property to node. The alignment value is
-      forced to lower case using the English locale so that the
-      result does not depend on the default locale of the JVM
-      (under a Turkish locale "RIGHT" would otherwise be lower-cased
-      with a dotless i).
-    */
-    private void addAlign(Node node, String align)
-    {
-        addStyleProperty(node,
-            "text-align: " + align.toLowerCase(java.util.Locale.ENGLISH));
-    }
-
-    /*
-      Add style properties to node corresponding to the font face,
-      size and color attributes found in the attribute list
-      starting at av. Other attributes are ignored, as are
-      attributes without a name or without a value, which carry no
-      style information.
-    */
-    private void addFontStyles(Node node, AttVal av)
-    {
-        for (; av != null; av = av.next)
-        {
-            if (av.value == null)
-                continue;
-
-            String name = av.attribute;
-
-            if ("face".equals(name))
-                addFontFace(node, av.value);
-            else if ("size".equals(name))
-                addFontSize(node, av.value);
-            else if ("color".equals(name))
-                addFontColor(node, av.value);
-        }
-    }
-
-    /*
-      Symptom: <p align=center>
-      Action: <p style="text-align: center">
-
-      The first align attribute of node is unlinked from the
-      attribute list; when it has a value, the value is added as a
-      text-align property. Attributes whose name is null are
-      skipped instead of causing a NullPointerException.
-    */
-    private void textAlign(Lexer lexer, Node node)
-    {
-        AttVal prev = null;
-
-        for (AttVal av = node.attributes; av != null; prev = av, av = av.next)
-        {
-            if (!"align".equals(av.attribute))
-                continue;
-
-            if (prev != null)
-                prev.next = av.next;
-            else
-                node.attributes = av.next;
-
-            if (av.value != null)
-                addAlign(node, av.value);
-
-            return;
-        }
-    }
-
- /*
- The clean up rules use the pnode argument to return the
- next node when the original node has been deleted
- */
-
-    /*
-      Symptom: <dir>, <ul> or <ol> whose only child is an implicit <li>
-      Action: coerce the list to <div style="margin-left: 2em">
-      and strip the <li>, keeping its content.
-
-      Returns true when the rule was applied. pnode is not modified.
-    */
-    private boolean dir2Div(Lexer lexer, Node node, MutableObject pnode)
-    {
-        boolean isList = node.tag == tt.tagDir
-            || node.tag == tt.tagUl
-            || node.tag == tt.tagOl;
-
-        if (!isList)
-            return false;
-
-        Node item = node.content;
-
-        /* need exactly one child, and it must be an implicit <li> */
-        if (item == null || item.next != null
-            || item.tag != tt.tagLi || !item.implicit)
-            return false;
-
-        /* coerce dir to div */
-        node.tag = tt.tagDiv;
-        node.element = "div";
-        addStyleProperty(node, "margin-left: 2em");
-        stripOnlyChild(node);
-        return true;
-    }
-
- /*
- Symptom: <center>
- Action: replace <center> by <div style="text-align: center">
-
- When the DropFontTags option is set the center element is
- discarded instead (its children are kept in place, see
- discardContainer) and an inferred br element is linked in after
- the former last child, or in the position of the center element
- when it was empty. In that case pnode is set by discardContainer.
-
- Returns true whenever node is a center element.
- */
-
- private boolean center2Div(Lexer lexer, Node node, MutableObject pnode)
- {
- if (node.tag == tt.tagCenter)
- {
- if (lexer.configuration.DropFontTags)
- {
- if (node.content != null)
- {
- Node last = node.last; /* remembered before the container is discarded */
- Node parent = node.parent;
-
- discardContainer(node, pnode);
-
- node = lexer.inferredTag("br"); /* node now refers to the new br */
-
- /* link the br in directly after the former last child */
- if (last.next != null)
- last.next.prev = node;
-
- node.next = last.next;
- last.next = node;
- node.prev = last;
-
- if (parent.last == last)
- parent.last = node;
-
- node.parent = parent;
- }
- else
- {
- Node prev = node.prev;
- Node next = node.next;
- Node parent = node.parent;
- discardContainer(node, pnode);
-
- /* the br takes the place the empty center element had */
- node = lexer.inferredTag("br");
- node.next = next;
- node.prev = prev;
- node.parent = parent;
-
- if (next != null)
- next.prev = node;
- else
- parent.last = node;
-
- if (prev != null)
- prev.next = node;
- else
- parent.content = node;
- }
-
- return true;
- }
- node.tag = tt.tagDiv;
- node.element = "div";
- addStyleProperty(node, "text-align: center");
- return true;
- }
-
- return false;
- }
-
-    /*
-      Symptom <div><div>...</div></div>
-      Action: merge the two divs
-
-      Applies when node is a div whose one and only child is a div:
-      the child's style is merged into node's style and the child
-      is stripped. This is useful after nested <dir>s used by Word
-      for indenting have been converted to <div>s.
-    */
-    private boolean mergeDivs(Lexer lexer, Node node, MutableObject pnode)
-    {
-        if (node.tag != tt.tagDiv)
-            return false;
-
-        Node inner = node.content;
-        boolean soleDiv = inner != null
-            && inner.tag == tt.tagDiv
-            && inner.next == null;
-
-        if (!soleDiv)
-            return false;
-
-        mergeStyles(node, inner);
-        stripOnlyChild(node);
-        return true;
-    }
-
- /*
- Symptom: <ul><li><ul>...</ul></li></ul>
- Action: discard outer list
-
- Applies when node is a ul or ol with a single child whose first
- child is a list with the same tag as node. The inner list is
- moved into the position of the outer list. pnode is set to the
- next sibling the outer list had before the move (possibly null)
- and true is returned.
-
- NOTE(review): the single child is not checked to be an li, and
- siblings that follow the inner list inside that child are
- unlinked because list.next is overwritten - confirm intended.
- */
-
- private boolean nestedList(Lexer lexer, Node node, MutableObject pnode)
- {
- Node child, list;
-
- if (node.tag == tt.tagUl || node.tag == tt.tagOl)
- {
- child = node.content;
-
- if (child == null)
- return false;
-
- /* check child has no peers */
-
- if (child.next != null)
- return false;
-
- list = child.content;
-
- if (list == null)
- return false;
-
- if (list.tag != node.tag)
- return false;
-
- pnode.setObject(node.next); /* taken before the links are rewritten below */
-
- /* move inner list node into position of outer node */
- list.prev = node.prev;
- list.next = node.next;
- list.parent = node.parent;
- fixNodeLinks(list);
-
- /* get rid of outer ul and its li */
- child.content = null;
- node.content = null;
- node.next = null;
-
- /*
- If prev node was a list the chances are this node
- should be appended to that list. Word has no way of
- recognizing nested lists and just uses indents
- */
-
- if (list.prev != null)
- {
- node = list; /* node: the moved inner list */
- list = node.prev; /* list: its previous sibling */
-
- if (list.tag == tt.tagUl || list.tag == tt.tagOl)
- {
- /* unlink node from the sibling chain ... */
- list.next = node.next;
-
- if (list.next != null)
- list.next.prev = list;
-
- child = list.last; /* <li> */
-
- /* ... and append it to the last child of the previous list */
- node.parent = child;
- node.next = null;
- node.prev = child.last;
- fixNodeLinks(node);
- }
- }
-
- cleanNode(lexer, node); /* node is the moved list only if it had a previous sibling, else still the emptied outer list */
- return true;
- }
-
- return false;
- }
-
-    /*
-      Symptom: the only child of a block-level element is a
-      presentation element such as B, I or FONT
-
-      Action: add style "font-weight: bold" to the block and
-      strip the <b> element, leaving its children.
-
-      example:
-
-      <p>
-      <b><font face="Arial" size="6">Draft Recommended Practice</font></b>
-      </p>
-
-      becomes:
-
-      <p style="font-weight: bold; font-family: Arial; font-size: 6">
-      Draft Recommended Practice
-      </p>
-
-      This code also replaces the align attribute by a style attribute.
-      However, to avoid CSS problems with Navigator 4, this isn't done
-      for the elements: caption, tr and table
-
-      The rule applies to elements whose content model includes
-      block, list, definition list or table, except table, tr and li.
-      Nodes without a tag table entry (node.tag == null) are left
-      alone instead of causing a NullPointerException.
-      Returns true when a child element was stripped; pnode is not
-      modified.
-    */
-    private boolean blockStyle(Lexer lexer, Node node, MutableObject pnode)
-    {
-        if (node.tag == null)
-            return false;
-
-        int blockModels = Dict.CM_BLOCK | Dict.CM_LIST | Dict.CM_DEFLIST | Dict.CM_TABLE;
-
-        if ((node.tag.model & blockModels) == 0)
-            return false;
-
-        if (node.tag == tt.tagTable
-            || node.tag == tt.tagTr
-            || node.tag == tt.tagLi)
-            return false;
-
-        /* check for align attribute */
-        if (node.tag != tt.tagCaption)
-            textAlign(lexer, node);
-
-        Node child = node.content;
-
-        /* need exactly one child */
-        if (child == null || child.next != null)
-            return false;
-
-        if (child.tag == tt.tagB)
-        {
-            mergeStyles(node, child);
-            addStyleProperty(node, "font-weight: bold");
-            stripOnlyChild(node);
-            return true;
-        }
-
-        if (child.tag == tt.tagI)
-        {
-            mergeStyles(node, child);
-            addStyleProperty(node, "font-style: italic");
-            stripOnlyChild(node);
-            return true;
-        }
-
-        if (child.tag == tt.tagFont)
-        {
-            mergeStyles(node, child);
-            addFontStyles(node, child.attributes);
-            stripOnlyChild(node);
-            return true;
-        }
-
-        return false;
-    }
-
-    /*
-      Same idea as blockStyle, for the only child of a table cell
-      or an inline element such as em: a sole B, I or FONT child is
-      replaced by style properties on node. B and I are only
-      treated this way when the LogicalEmphasis option is set.
-      Font elements themselves are never used as the parent here.
-
-      Nodes without a tag table entry (node.tag == null) are left
-      alone instead of causing a NullPointerException.
-      Returns true when a child element was stripped; pnode is not
-      modified.
-    */
-    private boolean inlineStyle(Lexer lexer, Node node, MutableObject pnode)
-    {
-        if (node.tag == null || node.tag == tt.tagFont)
-            return false;
-
-        if ((node.tag.model & (Dict.CM_INLINE|Dict.CM_ROW)) == 0)
-            return false;
-
-        Node child = node.content;
-
-        /* need exactly one child */
-        if (child == null || child.next != null)
-            return false;
-
-        if (child.tag == tt.tagB && lexer.configuration.LogicalEmphasis)
-        {
-            mergeStyles(node, child);
-            addStyleProperty(node, "font-weight: bold");
-            stripOnlyChild(node);
-            return true;
-        }
-
-        if (child.tag == tt.tagI && lexer.configuration.LogicalEmphasis)
-        {
-            mergeStyles(node, child);
-            addStyleProperty(node, "font-style: italic");
-            stripOnlyChild(node);
-            return true;
-        }
-
-        if (child.tag == tt.tagFont)
-        {
-            mergeStyles(node, child);
-            addFontStyles(node, child.attributes);
-            stripOnlyChild(node);
-            return true;
-        }
-
-        return false;
-    }
-
-    /*
-      Replace font elements by span elements, deleting
-      the font element's attributes and replacing them
-      by a single style attribute.
-
-      With the DropFontTags option the font element is discarded
-      instead (pnode is then set by discardContainer) and false is
-      returned. A font element that is the only child of its parent
-      is left alone, also with result false.
-    */
-    private boolean font2Span(Lexer lexer, Node node, MutableObject pnode)
-    {
-        if (node.tag != tt.tagFont)
-            return false;
-
-        if (lexer.configuration.DropFontTags)
-        {
-            discardContainer(node, pnode);
-            return false;
-        }
-
-        /* if FONT is only child of parent element then leave alone */
-        boolean onlyChild = node.parent.content == node && node.next == null;
-
-        if (onlyChild)
-            return false;
-
-        addFontStyles(node, node.attributes);
-
-        /* keep the style attribute (the last one, if several), drop the rest */
-        AttVal style = null;
-
-        for (AttVal av = node.attributes, following; av != null; av = following)
-        {
-            following = av.next;
-
-            if (av.attribute.equals("style"))
-            {
-                av.next = null;
-                style = av;
-            }
-        }
-
-        node.attributes = style;
-        node.tag = tt.tagSpan;
-        node.element = "span";
-        return true;
-    }
-
-    /*
-      Applies all matching rules to a node.
-
-      The rules are tried in a fixed order (dir2Div, nestedList,
-      center2Div, mergeDivs, blockStyle, inlineStyle, font2Span);
-      as soon as one of them reports success the loop starts over
-      with the node the rule left in its pnode argument. The loop
-      ends when no rule applies, when the current node is not an
-      element, or when a rule left no node to continue with (a rule
-      that deletes the last sibling hands back null).
-
-      Returns the node to continue with, which may be null.
-    */
-    private Node cleanNode(Lexer lexer, Node node)
-    {
-        Node next = node;
-        MutableObject o = new MutableObject();
-
-        for (; node != null && node.isElement(); node = next)
-        {
-            o.setObject(next);
-
-            /* evaluation stops at the first rule that applies */
-            boolean applied = dir2Div(lexer, node, o)
-                || nestedList(lexer, node, o)
-                || center2Div(lexer, node, o)
-                || mergeDivs(lexer, node, o)
-                || blockStyle(lexer, node, o)
-                || inlineStyle(lexer, node, o)
-                || font2Span(lexer, node, o);
-
-            next = (Node)o.getObject();
-
-            if (!applied)
-                break;
-        }
-
-        return next;
-    }
-
-    /*
-      Apply the clean up rules to the subtree rooted at node,
-      children first and then node itself. Returns whatever
-      cleanNode returns for node.
-
-      The recursive call may replace or delete the child it was
-      given; the loop continues from the node that is returned and
-      stops when that node is null, i.e. when the child was deleted
-      and had no following sibling.
-    */
-    private Node createStyleProperties(Lexer lexer, Node node)
-    {
-        Node child;
-
-        for (child = node.content; child != null; child = child.next)
-        {
-            child = createStyleProperties(lexer, child);
-
-            if (child == null)
-                break;
-        }
-
-        return cleanNode(lexer, node);
-    }
-
-    /*
-      Walk the subtree rooted at node, children before the node
-      itself, and apply style2Rule to every node.
-    */
-    private void defineStyleRules(Lexer lexer, Node node)
-    {
-        Node child = node.content;
-
-        while (child != null)
-        {
-            defineStyleRules(lexer, child);
-            child = child.next;
-        }
-
-        style2Rule(lexer, node);
-    }
-
-    /*
-      Entry point of the clean up: create style properties for the
-      whole tree and, unless the MakeClean option is set, turn the
-      style attributes into classes and add the style element to
-      the document.
-    */
-    public void cleanTree(Lexer lexer, Node doc)
-    {
-        Node root = createStyleProperties(lexer, doc);
-
-        if (lexer.configuration.MakeClean)
-            return;
-
-        defineStyleRules(lexer, root);
-        createStyleElement(lexer, root);
-    }
-
- /* simplifies <b><b> ... </b> ...</b> etc. */
- public void nestedEmphasis(Node node)
- {
- MutableObject o = new MutableObject();
- Node next;
-
- while (node != null)
- {
- next = node.next;
-
- if ((node.tag == tt.tagB || node.tag == tt.tagI)
- && node.parent != null && node.parent.tag == node.tag)
- {
- /* strip redundant inner element */
- o.setObject(next);
- discardContainer(node, o);
- next = (Node)o.getObject();
- node = next;
- continue;
- }
-
- if (node.content != null)
- nestedEmphasis(node.content);
-
- node = next;
- }
- }
-
-    /*
-      Replace i by em and b by strong in node, its following
-      siblings and all of their descendants.
-    */
-    public void emFromI(Node node)
-    {
-        for (Node current = node; current != null; current = current.next)
-        {
-            if (current.tag == tt.tagI)
-            {
-                current.element = tt.tagEm.name;
-                current.tag = tt.tagEm;
-            }
-            else if (current.tag == tt.tagB)
-            {
-                current.element = tt.tagStrong.name;
-                current.tag = tt.tagStrong;
-            }
-
-            if (current.content != null)
-                emFromI(current.content);
-        }
-    }
-
- /*
- Some people use dir or ul without an li
- to indent the content. The pattern to
- look for is a list with a single implicit
- li. This is recursively replaced by an
- implicit blockquote.
- */
- public void list2BQ(Node node)
- {
- while (node != null)
- {
- if (node.content != null)
- list2BQ(node.content);
-
- if (node.tag != null && node.tag.parser == ParserImpl.getParseList() &&
- node.hasOneChild() && node.content.implicit)
- {
- stripOnlyChild(node);
- node.element = tt.tagBlockquote.name;
- node.tag = tt.tagBlockquote;
- node.implicit = true;
- }
-
- node = node.next;
- }
- }
-
- /*
- Replace implicit blockquote by div with an indent
- taking care to reduce nested blockquotes to a single
- div with the indent set to match the nesting depth
- */
- public void bQ2Div(Node node)
- {
- int indent;
- String indent_buf;
-
- while (node != null)
- {
- if (node.tag == tt.tagBlockquote && node.implicit)
- {
- indent = 1;
-
- while(node.hasOneChild() &&
- node.content.tag == tt.tagBlockquote &&
- node.implicit)
- {
- ++indent;
- stripOnlyChild(node);
- }
-
- if (node.content != null)
- bQ2Div(node.content);
-
- indent_buf = "margin-left: " +
- (new Integer(2*indent)).toString() + "em";
-
- node.element = tt.tagDiv.name;
- node.tag = tt.tagDiv;
- node.addAttribute("style", indent_buf);
- }
- else if (node.content != null)
- bQ2Div(node.content);
-
-
- node = node.next;
- }
- }
-
- /*
-  node is <![if ...]> prune up to <![endif]>
-
-  Discards node and the nodes that follow it until a section tag
-  whose text starts with endif has been discarded as well. Nested
-  if sections are pruned recursively. Returns the node after the
-  endif, or null when the nodes run out first.
-
-  NOTE(review): after a nested section has been pruned the node
-  returned by the recursive call is discarded at the top of the
-  loop without being examined, and it is passed to discardElement
-  even when it is null - confirm against Node.discardElement.
- */
- public Node pruneSection(Lexer lexer, Node node)
- {
- for (;;)
- {
- /* discard node and returns next */
- node = Node.discardElement(node);
-
- if (node == null)
- return null;
-
- if (node.type == Node.SectionTag)
- {
- if ((Lexer.getString(node.textarray, node.start, 2)).equals("if")) /* nested section */
- {
- node = pruneSection(lexer, node);
- continue;
- }
-
- if ((Lexer.getString(node.textarray, node.start, 5)).equals("endif")) /* end of this section */
- {
- node = Node.discardElement(node);
- break;
- }
- }
- }
-
- return node;
- }
-
- public void dropSections(Lexer lexer, Node node)
- {
- while (node != null)
- {
- if (node.type == Node.SectionTag)
- {
- /* prune up to matching endif */
- if ((Lexer.getString(node.textarray, node.start, 2)).equals("if"))
- {
- node = pruneSection(lexer, node);
- continue;
- }
-
- /* discard others as well */
- node = Node.discardElement(node);
- continue;
- }
-
- if (node.content != null)
- dropSections(lexer, node.content);
-
- node = node.next;
- }
- }
-
-    /*
-      Remove from node the attributes class, style, lang, every
-      attribute whose name starts with "x:" and, on td, tr and th
-      elements, height and width. The attribute class="Code" is
-      kept because it denotes pre text. Attributes without a name
-      are kept.
-    */
-    public void purgeAttributes(Node node)
-    {
-        boolean sizedCell = node.tag == tt.tagTd
-            || node.tag == tt.tagTr
-            || node.tag == tt.tagTh;
-        AttVal kept = null;
-
-        for (AttVal attr = node.attributes, following; attr != null; attr = following)
-        {
-            following = attr.next;
-
-            String name = attr.attribute;
-            boolean drop = false;
-
-            if (name != null)
-            {
-                /* special check for class="Code" denoting pre text */
-                boolean codeClass = attr.value != null
-                    && name.equals("class")
-                    && attr.value.equals("Code");
-
-                if (!codeClass)
-                {
-                    drop = name.equals("class")
-                        || name.equals("style")
-                        || name.equals("lang")
-                        || name.startsWith("x:")
-                        || ((name.equals("height") || name.equals("width")) && sizedCell);
-                }
-            }
-
-            if (!drop)
-            {
-                kept = attr;
-                continue;
-            }
-
-            /* unlink attr from the list */
-            if (kept != null)
-                kept.next = following;
-            else
-                node.attributes = following;
-        }
-    }
-
- /*
-  Word2000 uses span excessively, so we strip span out.
-
-  The content of span is first cleaned with cleanWord2000 and then
-  moved, node by node, in front of span; finally span itself is
-  discarded. Returns the node that followed span.
- */
- public Node stripSpan(Lexer lexer, Node span)
- {
- Node node;
- Node prev = null;
- Node content;
-
- /*
- deal with span elements that have content
- by splicing the content in place of the span
- after having processed it
- */
-
- cleanWord2000(lexer, span.content);
- content = span.content;
-
- if (span.prev != null)
- prev = span.prev;
- else if (content != null)
- {
- /* span is a first child: its first content node goes in front of it */
- node = content;
- content = content.next;
- Node.removeNode(node);
- Node.insertNodeBeforeElement(span, node);
- prev = node;
- }
-
- /* each remaining content node goes after the one moved before it */
- while (content != null)
- {
- node = content;
- content = content.next;
- Node.removeNode(node);
- Node.insertNodeAfterElement(prev, node);
- prev = node;
- }
-
- if (span.next == null)
- span.parent.last = prev; /* prev is null here when span had neither a previous sibling nor content */
-
- node = span.next; /* saved before span is discarded */
- span.content = null;
- Node.discardElement(span);
- return node;
- }
-
-    /*
-      Map non-breaking spaces (character 160) to regular spaces in
-      all text nodes of node, its following siblings and their
-      descendants.
-
-      The text is rewritten in place: i is the read position and p
-      the write position, which never runs ahead of i. Afterwards
-      node.end is moved to the write position so that the bytes
-      freed by shorter replacements are no longer part of the text.
-
-      NOTE(review): if textarray is a byte array, bytes above 0x7F
-      are negative after the int cast and the multibyte test never
-      fires - confirm the element type and mask with 0xFF if so.
-    */
-    private void normalizeSpaces(Lexer lexer, Node node)
-    {
-        while (node != null)
-        {
-            if (node.content != null)
-                normalizeSpaces(lexer, node.content);
-
-            if (node.type == Node.TextNode)
-            {
-                MutableInteger c = new MutableInteger();
-                int p = node.start;
-
-                for (int i = node.start; i < node.end; ++i)
-                {
-                    c.value = (int)node.textarray[i];
-
-                    /* look for UTF-8 multibyte character */
-                    if (c.value > 0x7F)
-                        i += PPrint.getUTF8(node.textarray, i, c);
-
-                    if (c.value == 160)
-                        c.value = ' ';
-
-                    p = PPrint.putUTF8(node.textarray, p, c.value);
-                }
-
-                /* drop the stale tail left behind by the compaction */
-                node.end = p;
-            }
-
-            node = node.next;
-        }
-    }
-
-    /*
-      This is a major clean up to strip out all the extra stuff you get
-      when you save as web page from Word 2000. It doesn't yet know what
-      to do with VML tags, but these will appear as errors unless you
-      declare them as new tags, such as o:p which needs to be declared
-      as inline.
-
-      Works through node and its following siblings, recursing into
-      their content:
-      - style, meta and comment nodes are discarded
-      - span elements are replaced by their content
-      - link elements with rel="File-List" and empty paragraphs are
-        discarded
-      - runs of <p class="MsoListBullet"> become <ul><li>...</ul>
-      - runs of <p class="Code"> become <pre>...</pre>
-      - style, class and similar attributes are purged
-      Processing stops at an html element without an xmlns:o attribute.
-
-      After a paragraph has been moved into a list or pre element
-      the loop continues with the sibling that follows that element,
-      so that the next paragraph of the run is examined from the top
-      (and a run that ends the sibling list does not dereference null).
-    */
-    public void cleanWord2000(Lexer lexer, Node node)
-    {
-        /* used to build a list from a sequence of bulleted p's */
-        Node list = null;
-
-        while (node != null)
-        {
-            /* discard Word's style verbiage */
-            if (node.tag == tt.tagStyle ||
-                node.tag == tt.tagMeta ||
-                node.type == Node.CommentTag)
-            {
-                node = Node.discardElement(node);
-                continue;
-            }
-
-            /* strip out all span tags Word scatters so liberally! */
-            if (node.tag == tt.tagSpan)
-            {
-                node = stripSpan(lexer, node);
-                continue;
-            }
-
-            /* get rid of Word's xmlns attributes */
-            if (node.tag == tt.tagHtml)
-            {
-                /* check that it's a Word 2000 document */
-                if (node.getAttrByName("xmlns:o") == null)
-                    return;
-            }
-
-            if (node.tag == tt.tagLink)
-            {
-                AttVal attr = node.getAttrByName("rel");
-
-                if (attr != null && attr.value != null &&
-                    attr.value.equals("File-List"))
-                {
-                    node = Node.discardElement(node);
-                    continue;
-                }
-            }
-
-            /* discard empty paragraphs */
-            if (node.content == null && node.tag == tt.tagP)
-            {
-                node = Node.discardElement(node);
-                continue;
-            }
-
-            if (node.tag == tt.tagP)
-            {
-                AttVal attr = node.getAttrByName("class");
-
-                /* map sequence of <p class="MsoListBullet"> to <ul>...</ul> */
-                if (attr != null && attr.value != null &&
-                    attr.value.equals("MsoListBullet"))
-                {
-                    Node.coerceNode(lexer, node, tt.tagLi);
-
-                    if (list == null || list.tag != tt.tagUl)
-                    {
-                        list = lexer.inferredTag("ul");
-                        Node.insertNodeBeforeElement(node, list);
-                    }
-
-                    purgeAttributes(node);
-
-                    if (node.content != null)
-                        cleanWord2000(lexer, node.content);
-
-                    /* remove node and append to contents of list */
-                    Node.removeNode(node);
-                    Node.insertNodeAtEnd(list, node);
-
-                    /* the item is fully processed: go on after the list */
-                    node = list.next;
-                    continue;
-                }
-                /* map sequence of <p class="Code"> to <pre>...</pre> */
-                else if (attr != null && attr.value != null &&
-                         attr.value.equals("Code"))
-                {
-                    Node br = lexer.newLineNode();
-                    normalizeSpaces(lexer, node);
-
-                    if (list == null || list.tag != tt.tagPre)
-                    {
-                        list = lexer.inferredTag("pre");
-                        Node.insertNodeBeforeElement(node, list);
-                    }
-
-                    /* remove node and append to contents of list */
-                    Node.removeNode(node);
-                    Node.insertNodeAtEnd(list, node);
-                    stripSpan(lexer, node);
-                    Node.insertNodeAtEnd(list, br);
-
-                    /* the paragraph is fully processed: go on after the pre */
-                    node = list.next;
-                    continue;
-                }
-                else
-                    list = null;
-            }
-            else
-                list = null;
-
-            /* strip out style and class attributes */
-            if (node.type == Node.StartTag || node.type == Node.StartEndTag)
-                purgeAttributes(node);
-
-            if (node.content != null)
-                cleanWord2000(lexer, node.content);
-
-            node = node.next;
-        }
-    }
-
-    /*
-      A document counts as a Word 2000 document when its html
-      element carries an xmlns:o attribute.
-    */
-    public boolean isWord2000(Node root, TagTable tt)
-    {
-        Node html = root.findHTML(tt);
-
-        if (html == null)
-            return false;
-
-        return html.getAttrByName("xmlns:o") != null;
-    }
-}