archive/net.sourceforge.phpeclipse.jtidy/src/net/sourceforge/phpdt/tidy/w3c/Clean.java

   1 /*
   2  * @(#)Clean.java   1.11 2000/08/16
   3  *
   4  */
   5
   6 package net.sourceforge.phpdt.tidy.w3c;
   7
   8 /**
   9  *
  10  * Clean up misuse of presentation markup
  11  *
  12  * (c) 1998-2000 (W3C) MIT, INRIA, Keio University
  13  * See Tidy.java for the copyright notice.
  14  * Derived from <a href="http://www.w3.org/People/Raggett/tidy">
  15  * HTML Tidy Release 4 Aug 2000</a>
  16  *
  17  * @author  Dave Raggett <dsr@w3.org>
  18  * @author  Andy Quick <ac.quick@sympatico.ca> (translation to Java)
  19  * @version 1.0, 1999/05/22
  20  * @version 1.0.1, 1999/05/29
  21  * @version 1.1, 1999/06/18 Java Bean
  22  * @version 1.2, 1999/07/10 Tidy Release 7 Jul 1999
  23  * @version 1.3, 1999/07/30 Tidy Release 26 Jul 1999
  24  * @version 1.4, 1999/09/04 DOM support
  25  * @version 1.5, 1999/10/23 Tidy Release 27 Sep 1999
  26  * @version 1.6, 1999/11/01 Tidy Release 22 Oct 1999
  27  * @version 1.7, 1999/12/06 Tidy Release 30 Nov 1999
  28  * @version 1.8, 2000/01/22 Tidy Release 13 Jan 2000
  29  * @version 1.9, 2000/06/03 Tidy Release 30 Apr 2000
  30  * @version 1.10, 2000/07/22 Tidy Release 8 Jul 2000
  31  * @version 1.11, 2000/08/16 Tidy Release 4 Aug 2000
  32  */
  33
  34 /*
  35   Filters from other formats such as Microsoft Word
  36   often make excessive use of presentation markup such
  37   as font tags, B, I, and the align attribute. By applying
  38   a set of production rules, it is straightforward to
  39   transform this to use CSS.
  40
  41   Some rules replace some of the children of an element by
  42   style properties on the element, e.g.
  43
  44   <p><b>...</b></p> -> <p style="font-weight: bold">...</p>
  45
  46   Such rules are applied to the element's content and then
  47   to the element itself until no more of the rules apply.
  48   Having applied all the rules to an element, it will have
  49   a style attribute with one or more properties.
  50
  51   Other rules strip the element they apply to, replacing
  52   it by style properties on the contents, e.g.
  53
  54   <dir><li><p>...</li></dir> -> <p style="margin-left: 1em">...
  55
  56   These rules are applied to an element before processing
  57   its content and replace the current element by the first
  58   element in the exposed content.
  59
  60   After applying both sets of rules, you can replace the
  61   style attribute by a class value and style rule in the
  62   document head. To support this, an association of styles
  63   and class names is built.
  64
  65   A naive approach is to rely on string matching to test
  66   when two property lists are the same. A better approach
  67   would be to first sort the properties before matching.
  68 */
  69
  70 public class Clean {
  71
  /* Counter for generated class names; gensymClass reads it and then increments it. */
  private int classNum = 1;

  /* Tag table handed to the constructor; used to compare and look up tags (tagP, tagDiv, ...). */
  private TagTable tt;
  75
  /*
   Creates a cleaner bound to the given tag table.

   tt - tag table stored for later tag comparisons and lookups
  */
  public Clean(TagTable tt) {
    this.tt = tt;
  }
  79
  80   private StyleProp insertProperty(StyleProp props, String name, String value) {
  81     StyleProp first, prev, prop;
  82     int cmp;
  83
  84     prev = null;
  85     first = props;
  86
  87     while (props != null) {
  88       cmp = props.name.compareTo(name);
  89
  90       if (cmp == 0) {
  91         /* this property is already defined, ignore new value */
  92         return first;
  93       }
  94
  95       if (cmp > 0) // props.name > name
  96         {
  97         /* insert before this */
  98
  99         prop = new StyleProp(name, value, props);
 100
 101         if (prev != null)
 102           prev.next = prop;
 103         else
 104           first = prop;
 105
 106         return first;
 107       }
 108
 109       prev = props;
 110       props = props.next;
 111     }
 112
 113     prop = new StyleProp(name, value);
 114
 115     if (prev != null)
 116       prev.next = prop;
 117     else
 118       first = prop;
 119
 120     return first;
 121   }
 122
 123   /*
 124    Create sorted linked list of properties from style string
 125    The C original temporarily placed nulls in place of ':'
 126    and ';' to delimit the property name and value, working
 127    on a copy of the string. This Java version extracts them
 128    with substring and leaves the input string untouched.
 129   */
 130   private StyleProp createProps(StyleProp prop, String style) {
 131     int name_end;
 132     int value_end;
 133     int value_start = 0;
 134     int name_start = 0;
 135     boolean more;
 136
 137     name_start = 0;
 138     while (name_start < style.length()) {
 139       while (name_start < style.length() && style.charAt(name_start) == ' ')
 140         ++name_start;
 141
 142       name_end = name_start;
 143
 144       while (name_end < style.length()) {
 145         if (style.charAt(name_end) == ':') {
 146           value_start = name_end + 1;
 147           break;
 148         }
 149
 150         ++name_end;
 151       }
 152
 153       if (name_end >= style.length() || style.charAt(name_end) != ':')
 154         break;
 155
 156       while (value_start < style.length() && style.charAt(value_start) == ' ')
 157         ++value_start;
 158
 159       value_end = value_start;
 160       more = false;
 161
 162       while (value_end < style.length()) {
 163         if (style.charAt(value_end) == ';') {
 164           more = true;
 165           break;
 166         }
 167
 168         ++value_end;
 169       }
 170
 171       prop = insertProperty(prop, style.substring(name_start, name_end), style.substring(value_start, value_end));
 172
 173       if (more) {
 174         name_start = value_end + 1;
 175         continue;
 176       }
 177
 178       break;
 179     }
 180
 181     return prop;
 182   }
 183
 184   private String createPropString(StyleProp props) {
 185     String style = "";
 186     int len;
 187     StyleProp prop;
 188
 189     /* compute length */
 190
 191     for (len = 0, prop = props; prop != null; prop = prop.next) {
 192       len += prop.name.length() + 2;
 193       len += prop.value.length() + 2;
 194     }
 195
 196     for (prop = props; prop != null; prop = prop.next) {
 197       style = style.concat(prop.name);
 198       style = style.concat(": ");
 199
 200       style = style.concat(prop.value);
 201
 202       if (prop.next == null)
 203         break;
 204
 205       style = style.concat("; ");
 206     }
 207
 208     return style;
 209   }
 210
 211   /*
 212     create string with merged properties
 213   */
 214   private String addProperty(String style, String property) {
 215     StyleProp prop;
 216
 217     prop = createProps(null, style);
 218     prop = createProps(prop, property);
 219     style = createPropString(prop);
 220     return style;
 221   }
 222
 223   private String gensymClass(String tag) {
 224     String str;
 225
 226     str = "c" + classNum;
 227     classNum++;
 228     return str;
 229   }
 230
 231   private String findStyle(Lexer lexer, String tag, String properties) {
 232     Style style;
 233
 234     for (style = lexer.styles; style != null; style = style.next) {
 235       if (style.tag.equals(tag) && style.properties.equals(properties))
 236         return style.tagClass;
 237     }
 238
 239     style = new Style(tag, gensymClass(tag), properties, lexer.styles);
 240     lexer.styles = style;
 241     return style.tagClass;
 242   }
 243
 244   /*
 245    Find style attribute in node, and replace it
 246    by corresponding class attribute. Search for
 247    class in style dictionary otherwise gensym
 248    new class and add to dictionary.
 249
 250    Assumes that node doesn't have a class attribute
 251   */
 252   private void style2Rule(Lexer lexer, Node node) {
 253     AttVal styleattr, classattr;
 254     String classname;
 255
 256     styleattr = node.getAttrByName("style");
 257
 258     if (styleattr != null) {
 259       classname = findStyle(lexer, node.element, styleattr.value);
 260       classattr = node.getAttrByName("class");
 261
 262       /*
 263       if there already is a class attribute
 264       then append class name after a space
 265       */
 266       if (classattr != null) {
 267         classattr.value = classattr.value + " " + classname;
 268         node.removeAttribute(styleattr);
 269       } else /* reuse style attribute for class attribute */ {
 270         styleattr.attribute = "class";
 271         styleattr.value = classname;
 272       }
 273     }
 274   }
 275
 276   private void addColorRule(Lexer lexer, String selector, String color) {
 277     if (color != null) {
 278       lexer.addStringLiteral(selector);
 279       lexer.addStringLiteral(" { color: ");
 280       lexer.addStringLiteral(color);
 281       lexer.addStringLiteral(" }\n");
 282     }
 283   }
 284
 285   /*
 286    move presentation attribs from body to style element
 287
 288    background="foo" ->  body { background-image: url(foo) }
 289    bgcolor="foo"    ->  body { background-color: foo }
 290    text="foo"       ->  body { color: foo }
 291    link="foo"       ->  :link { color: foo }
 292    vlink="foo"      ->  :visited { color: foo }
 293    alink="foo"      ->  :active { color: foo }
 294   */
 295   private void cleanBodyAttrs(Lexer lexer, Node body) {
 296     AttVal attr;
 297     String bgurl = null;
 298     String bgcolor = null;
 299     String color = null;
 300
 301     attr = body.getAttrByName("background");
 302
 303     if (attr != null) {
 304       bgurl = attr.value;
 305       attr.value = null;
 306       body.removeAttribute(attr);
 307     }
 308
 309     attr = body.getAttrByName("bgcolor");
 310
 311     if (attr != null) {
 312       bgcolor = attr.value;
 313       attr.value = null;
 314       body.removeAttribute(attr);
 315     }
 316
 317     attr = body.getAttrByName("text");
 318
 319     if (attr != null) {
 320       color = attr.value;
 321       attr.value = null;
 322       body.removeAttribute(attr);
 323     }
 324
 325     if (bgurl != null || bgcolor != null || color != null) {
 326       lexer.addStringLiteral(" body {\n");
 327
 328       if (bgurl != null) {
 329         lexer.addStringLiteral("  background-image: url(");
 330         lexer.addStringLiteral(bgurl);
 331         lexer.addStringLiteral(");\n");
 332       }
 333
 334       if (bgcolor != null) {
 335         lexer.addStringLiteral("  background-color: ");
 336         lexer.addStringLiteral(bgcolor);
 337         lexer.addStringLiteral(";\n");
 338       }
 339
 340       if (color != null) {
 341         lexer.addStringLiteral("  color: ");
 342         lexer.addStringLiteral(color);
 343         lexer.addStringLiteral(";\n");
 344       }
 345
 346       lexer.addStringLiteral(" }\n");
 347     }
 348
 349     attr = body.getAttrByName("link");
 350
 351     if (attr != null) {
 352       addColorRule(lexer, " :link", attr.value);
 353       body.removeAttribute(attr);
 354     }
 355
 356     attr = body.getAttrByName("vlink");
 357
 358     if (attr != null) {
 359       addColorRule(lexer, " :visited", attr.value);
 360       body.removeAttribute(attr);
 361     }
 362
 363     attr = body.getAttrByName("alink");
 364
 365     if (attr != null) {
 366       addColorRule(lexer, " :active", attr.value);
 367       body.removeAttribute(attr);
 368     }
 369   }
 370
 371   private boolean niceBody(Lexer lexer, Node doc) {
 372     Node body = doc.findBody(lexer.configuration.tt);
 373
 374     if (body != null) {
 375       if (body.getAttrByName("background") != null
 376         || body.getAttrByName("bgcolor") != null
 377         || body.getAttrByName("text") != null
 378         || body.getAttrByName("link") != null
 379         || body.getAttrByName("vlink") != null
 380         || body.getAttrByName("alink") != null) {
 381         lexer.badLayout |= Report.USING_BODY;
 382         return false;
 383       }
 384     }
 385
 386     return true;
 387   }
 388
 389   /* create style element using rules from dictionary */
 390   private void createStyleElement(Lexer lexer, Node doc) {
 391     Node node, head, body;
 392     Style style;
 393     AttVal av;
 394
 395     if (lexer.styles == null && niceBody(lexer, doc))
 396       return;
 397
 398     node = lexer.newNode(Node.StartTag, null, 0, 0, "style");
 399     node.implicit = true;
 400
 401     /* insert type attribute */
 402     av = new AttVal(null, null, '"', "type", "text/css");
 403     av.dict = AttributeTable.getDefaultAttributeTable().findAttribute(av);
 404     node.attributes = av;
 405
 406     body = doc.findBody(lexer.configuration.tt);
 407
 408     lexer.txtstart = lexer.lexsize;
 409
 410     if (body != null)
 411       cleanBodyAttrs(lexer, body);
 412
 413     for (style = lexer.styles; style != null; style = style.next) {
 414       lexer.addCharToLexer(' ');
 415       lexer.addStringLiteral(style.tag);
 416       lexer.addCharToLexer('.');
 417       lexer.addStringLiteral(style.tagClass);
 418       lexer.addCharToLexer(' ');
 419       lexer.addCharToLexer('{');
 420       lexer.addStringLiteral(style.properties);
 421       lexer.addCharToLexer('}');
 422       lexer.addCharToLexer('\n');
 423     }
 424
 425     lexer.txtend = lexer.lexsize;
 426
 427     Node.insertNodeAtEnd(node, lexer.newNode(Node.TextNode, lexer.lexbuf, lexer.txtstart, lexer.txtend));
 428
 429     /*
 430      now insert style element into document head
 431
 432      doc is root node. search its children for html node
 433      the head node should be first child of html node
 434     */
 435
 436     head = doc.findHEAD(lexer.configuration.tt);
 437
 438     if (head != null)
 439       Node.insertNodeAtEnd(head, node);
 440   }
 441
 442   /* ensure bidirectional links are consistent */
 443   private void fixNodeLinks(Node node) {
 444     Node child;
 445
 446     if (node.prev != null)
 447       node.prev.next = node;
 448     else
 449       node.parent.content = node;
 450
 451     if (node.next != null)
 452       node.next.prev = node;
 453     else
 454       node.parent.last = node;
 455
 456     for (child = node.content; child != null; child = child.next)
 457       child.parent = node;
 458   }
 459
 460   /*
 461    used to strip child of node when
 462    the node has one and only one child
 463   */
 464   private void stripOnlyChild(Node node) {
 465     Node child;
 466
 467     child = node.content;
 468     node.content = child.content;
 469     node.last = child.last;
 470     child.content = null;
 471
 472     for (child = node.content; child != null; child = child.next)
 473       child.parent = node;
 474   }
 475
 476   /* used to strip font start and end tags */
 477   private void discardContainer(Node element, MutableObject pnode) {
 478     Node node;
 479     Node parent = element.parent;
 480
 481     if (element.content != null) {
 482       element.last.next = element.next;
 483
 484       if (element.next != null) {
 485         element.next.prev = element.last;
 486         element.last.next = element.next;
 487       } else
 488         parent.last = element.last;
 489
 490       if (element.prev != null) {
 491         element.content.prev = element.prev;
 492         element.prev.next = element.content;
 493       } else
 494         parent.content = element.content;
 495
 496       for (node = element.content; node != null; node = node.next)
 497         node.parent = parent;
 498
 499       pnode.setObject(element.content);
 500     } else {
 501       if (element.next != null)
 502         element.next.prev = element.prev;
 503       else
 504         parent.last = element.prev;
 505
 506       if (element.prev != null)
 507         element.prev.next = element.next;
 508       else
 509         parent.content = element.next;
 510
 511       pnode.setObject(element.next);
 512     }
 513
 514     element.next = null;
 515     element.content = null;
 516   }
 517
 518   /*
 519    Add style property to element, creating style
 520    attribute as needed and adding ; delimiter
 521   */
 522   private void addStyleProperty(Node node, String property) {
 523     AttVal av;
 524
 525     for (av = node.attributes; av != null; av = av.next) {
 526       if (av.attribute.equals("style"))
 527         break;
 528     }
 529
 530     /* if style attribute already exists then insert property */
 531
 532     if (av != null) {
 533       String s;
 534
 535       s = addProperty(av.value, property);
 536       av.value = s;
 537     } else /* else create new style attribute */ {
 538       av = new AttVal(node.attributes, null, '"', "style", property);
 539       av.dict = AttributeTable.getDefaultAttributeTable().findAttribute(av);
 540       node.attributes = av;
 541     }
 542   }
 543
 544   /*
 545     Create new string that consists of the
 546     combined style properties in s1 and s2
 547
 548     To merge property lists, we build a linked
 549     list of property/values and insert properties
 550     into the list in name order; when a name occurs
 551     twice, the first value seen (from s1) is kept.
 552   */
 553   private String mergeProperties(String s1, String s2) {
 554     String s;
 555     StyleProp prop;
 556
 557     prop = createProps(null, s1);
 558     prop = createProps(prop, s2);
 559     s = createPropString(prop);
 560     return s;
 561   }
 562
 563   private void mergeStyles(Node node, Node child) {
 564     AttVal av;
 565     String s1, s2, style;
 566
 567     for (s2 = null, av = child.attributes; av != null; av = av.next) {
 568       if (av.attribute.equals("style")) {
 569         s2 = av.value;
 570         break;
 571       }
 572     }
 573
 574     for (s1 = null, av = node.attributes; av != null; av = av.next) {
 575       if (av.attribute.equals("style")) {
 576         s1 = av.value;
 577         break;
 578       }
 579     }
 580
 581     if (s1 != null) {
 582       if (s2 != null) /* merge styles from both */ {
 583         style = mergeProperties(s1, s2);
 584         av.value = style;
 585       }
 586     } else if (s2 != null) /* copy style of child */ {
 587       av = new AttVal(node.attributes, null, '"', "style", s2);
 588       av.dict = AttributeTable.getDefaultAttributeTable().findAttribute(av);
 589       node.attributes = av;
 590     }
 591   }
 592
 593   private String fontSize2Name(String size) {
 594     /*
 595     String[] sizes =
 596     {
 597         "50%",
 598         "60%",
 599         "80%",
 600         null,
 601         "120%",
 602         "150%",
 603         "200%"
 604     };
 605     */
 606
 607     String[] sizes = { "60%", "70%", "80%", null, "120%", "150%", "200%" };
 608     String buf;
 609
 610     if (size.length() > 0 && '0' <= size.charAt(0) && size.charAt(0) <= '6') {
 611       int n = size.charAt(0) - '0';
 612       return sizes[n];
 613     }
 614
 615     if (size.length() > 0 && size.charAt(0) == '-') {
 616       if (size.length() > 1 && '0' <= size.charAt(1) && size.charAt(1) <= '6') {
 617         int n = size.charAt(1) - '0';
 618         double x;
 619
 620         for (x = 1.0; n > 0; --n)
 621           x *= 0.8;
 622
 623         x *= 100.0;
 624         buf = "" + (int) x + "%";
 625
 626         return buf;
 627       }
 628
 629       return "smaller"; /*"70%"; */
 630     }
 631
 632     if (size.length() > 1 && '0' <= size.charAt(1) && size.charAt(1) <= '6') {
 633       int n = size.charAt(1) - '0';
 634       double x;
 635
 636       for (x = 1.0; n > 0; --n)
 637         x *= 1.2;
 638
 639       x *= 100.0;
 640       buf = "" + (int) x + "%";
 641
 642       return buf;
 643     }
 644
 645     return "larger"; /* "140%" */
 646   }
 647
 648   private void addFontFace(Node node, String face) {
 649     addStyleProperty(node, "font-family: " + face);
 650   }
 651
 652   private void addFontSize(Node node, String size) {
 653     String value;
 654
 655     if (size.equals("6") && node.tag == tt.tagP) {
 656       node.element = "h1";
 657       tt.findTag(node);
 658       return;
 659     }
 660
 661     if (size.equals("5") && node.tag == tt.tagP) {
 662       node.element = "h2";
 663       tt.findTag(node);
 664       return;
 665     }
 666
 667     if (size.equals("4") && node.tag == tt.tagP) {
 668       node.element = "h3";
 669       tt.findTag(node);
 670       return;
 671     }
 672
 673     value = fontSize2Name(size);
 674
 675     if (value != null) {
 676       addStyleProperty(node, "font-size: " + value);
 677     }
 678   }
 679
 680   private void addFontColor(Node node, String color) {
 681     addStyleProperty(node, "color: " + color);
 682   }
 683
 684   private void addAlign(Node node, String align) {
 685     /* force alignment value to lower case */
 686     addStyleProperty(node, "text-align: " + align.toLowerCase());
 687   }
 688
 689   /*
 690    add style properties to node corresponding to
 691    the font face, size and color attributes
 692   */
 693   private void addFontStyles(Node node, AttVal av) {
 694     while (av != null) {
 695       if (av.attribute.equals("face"))
 696         addFontFace(node, av.value);
 697       else if (av.attribute.equals("size"))
 698         addFontSize(node, av.value);
 699       else if (av.attribute.equals("color"))
 700         addFontColor(node, av.value);
 701
 702       av = av.next;
 703     }
 704   }
 705
 706   /*
 707       Symptom: <p align=center>
 708       Action: <p style="text-align: center">
 709   */
 710   private void textAlign(Lexer lexer, Node node) {
 711     AttVal av, prev;
 712
 713     prev = null;
 714
 715     for (av = node.attributes; av != null; av = av.next) {
 716       if (av.attribute.equals("align")) {
 717         if (prev != null)
 718           prev.next = av.next;
 719         else
 720           node.attributes = av.next;
 721
 722         if (av.value != null) {
 723           addAlign(node, av.value);
 724         }
 725
 726         break;
 727       }
 728
 729       prev = av;
 730     }
 731   }
 732
 733   /*
 734      The clean up rules use the pnode argument to return the
 735      next node when the original node has been deleted
 736   */
 737
 738   /*
 739       Symptom: <dir> <li> where <li> is only child
 740       Action: coerce <dir> <li> to <div> with indent.
 741   */
 742
 743   private boolean dir2Div(Lexer lexer, Node node, MutableObject pnode) {
 744     Node child;
 745
 746     if (node.tag == tt.tagDir || node.tag == tt.tagUl || node.tag == tt.tagOl) {
 747       child = node.content;
 748
 749       if (child == null)
 750         return false;
 751
 752       /* check child has no peers */
 753
 754       if (child.next != null)
 755         return false;
 756
 757       if (child.tag != tt.tagLi)
 758         return false;
 759
 760       if (!child.implicit)
 761         return false;
 762
 763       /* coerce dir to div */
 764
 765       node.tag = tt.tagDiv;
 766       node.element = "div";
 767       addStyleProperty(node, "margin-left: 2em");
 768       stripOnlyChild(node);
 769       return true;
 770
 771       //#if 0
 772       //Node content;
 773       //Node last;
 774       //content = child.content;
 775       //last = child.last;
 776       //child.content = null;
 777
 778       /* adjust parent and set margin on contents of <li> */
 779
 780       //for (child = content; child != null; child = child.next)
 781       //{
 782       //    child.parent = node.parent;
 783       //    addStyleProperty(child, "margin-left: 1em");
 784       //}
 785
 786       /* hook first/last into sequence */
 787
 788       //if (content != null)
 789       //{
 790       //    content.prev = node.prev;
 791       //    last.next = node.next;
 792       //    fixNodeLinks(content);
 793       //    fixNodeLinks(last);
 794       //}
 795
 796       //node.next = null;
 797
 798       /* ensure that new node is cleaned */
 799       //pnode.setObject(cleanNode(lexer, content));
 800       //return true;
 801       //#endif
 802     }
 803
 804     return false;
 805   }
 806
 807   /*
 808       Symptom: <center>
 809       Action: replace <center> by <div style="text-align: center">
 810   */
 811
 812   private boolean center2Div(Lexer lexer, Node node, MutableObject pnode) {
 813     if (node.tag == tt.tagCenter) {
 814       if (lexer.configuration.DropFontTags) {
 815         if (node.content != null) {
 816           Node last = node.last;
 817           Node parent = node.parent;
 818
 819           discardContainer(node, pnode);
 820
 821           node = lexer.inferredTag("br");
 822
 823           if (last.next != null)
 824             last.next.prev = node;
 825
 826           node.next = last.next;
 827           last.next = node;
 828           node.prev = last;
 829
 830           if (parent.last == last)
 831             parent.last = node;
 832
 833           node.parent = parent;
 834         } else {
 835           Node prev = node.prev;
 836           Node next = node.next;
 837           Node parent = node.parent;
 838           discardContainer(node, pnode);
 839
 840           node = lexer.inferredTag("br");
 841           node.next = next;
 842           node.prev = prev;
 843           node.parent = parent;
 844
 845           if (next != null)
 846             next.prev = node;
 847           else
 848             parent.last = node;
 849
 850           if (prev != null)
 851             prev.next = node;
 852           else
 853             parent.content = node;
 854         }
 855
 856         return true;
 857       }
 858       node.tag = tt.tagDiv;
 859       node.element = "div";
 860       addStyleProperty(node, "text-align: center");
 861       return true;
 862     }
 863
 864     return false;
 865   }
 866
 867   /*
 868       Symptom <div><div>...</div></div>
 869       Action: merge the two divs
 870
 871     This is useful after nested <dir>s used by Word
 872     for indenting have been converted to <div>s
 873   */
 874   private boolean mergeDivs(Lexer lexer, Node node, MutableObject pnode) {
 875     Node child;
 876
 877     if (node.tag != tt.tagDiv)
 878       return false;
 879
 880     child = node.content;
 881
 882     if (child == null)
 883       return false;
 884
 885     if (child.tag != tt.tagDiv)
 886       return false;
 887
 888     if (child.next != null)
 889       return false;
 890
 891     mergeStyles(node, child);
 892     stripOnlyChild(node);
 893     return true;
 894   }
 895
 896   /*
 897       Symptom: <ul><li><ul>...</ul></li></ul>
 898       Action: discard outer list
 899   */
 900
 901   private boolean nestedList(Lexer lexer, Node node, MutableObject pnode) {
 902     Node child, list;
 903
 904     if (node.tag == tt.tagUl || node.tag == tt.tagOl) {
 905       child = node.content;
 906
 907       if (child == null)
 908         return false;
 909
 910       /* check child has no peers */
 911
 912       if (child.next != null)
 913         return false;
 914
 915       list = child.content;
 916
 917       if (list == null)
 918         return false;
 919
 920       if (list.tag != node.tag)
 921         return false;
 922
 923       pnode.setObject(node.next);
 924
 925       /* move inner list node into position of outer node */
 926       list.prev = node.prev;
 927       list.next = node.next;
 928       list.parent = node.parent;
 929       fixNodeLinks(list);
 930
 931       /* get rid of outer ul and its li */
 932       child.content = null;
 933       node.content = null;
 934       node.next = null;
 935
 936       /*
 937         If prev node was a list the chances are this node
 938         should be appended to that list. Word has no way of
 939         recognizing nested lists and just uses indents
 940       */
 941
 942       if (list.prev != null) {
 943         node = list;
 944         list = node.prev;
 945
 946         if (list.tag == tt.tagUl || list.tag == tt.tagOl) {
 947           list.next = node.next;
 948
 949           if (list.next != null)
 950             list.next.prev = list;
 951
 952           child = list.last; /* <li> */
 953
 954           node.parent = child;
 955           node.next = null;
 956           node.prev = child.last;
 957           fixNodeLinks(node);
 958         }
 959       }
 960
 961       cleanNode(lexer, node);
 962       return true;
 963     }
 964
 965     return false;
 966   }
 967
 968   /*
 969       Symptom: the only child of a block-level element is a
 970       presentation element such as B, I or FONT
 971
 972       Action: add style "font-weight: bold" to the block and
 973       strip the <b> element, leaving its children.
 974
 975     example:
 976
 977       <p>
 978         <b><font face="Arial" size="6">Draft Recommended Practice</font></b>
 979       </p>
 980
 981     becomes:
 982
 983         <p style="font-weight: bold; font-family: Arial; font-size: 6">
 984           Draft Recommended Practice
 985         </p>
 986
 987     This code also replaces the align attribute by a style attribute.
 988     However, to avoid CSS problems with Navigator 4, this isn't done
 989     for the elements: caption, tr and table
 990   */
 991   private boolean blockStyle(Lexer lexer, Node node, MutableObject pnode) {
 992     Node child;
 993
 994     if ((node.tag.model & (Dict.CM_BLOCK | Dict.CM_LIST | Dict.CM_DEFLIST | Dict.CM_TABLE)) != 0) {
 995       if (node.tag != tt.tagTable && node.tag != tt.tagTr && node.tag != tt.tagLi) {
 996         /* check for align attribute */
 997         if (node.tag != tt.tagCaption)
 998           textAlign(lexer, node);
 999
1000         child = node.content;
1001
1002         if (child == null)
1003           return false;
1004
1005         /* check child has no peers */
1006
1007         if (child.next != null)
1008           return false;
1009
1010         if (child.tag == tt.tagB) {
1011           mergeStyles(node, child);
1012           addStyleProperty(node, "font-weight: bold");
1013           stripOnlyChild(node);
1014           return true;
1015         }
1016
1017         if (child.tag == tt.tagI) {
1018           mergeStyles(node, child);
1019           addStyleProperty(node, "font-style: italic");
1020           stripOnlyChild(node);
1021           return true;
1022         }
1023
1024         if (child.tag == tt.tagFont) {
1025           mergeStyles(node, child);
1026           addFontStyles(node, child.attributes);
1027           stripOnlyChild(node);
1028           return true;
1029         }
1030       }
1031     }
1032
1033     return false;
1034   }
1035
1036   /* the only child of table cell or an inline element such as em */
1037   private boolean inlineStyle(Lexer lexer, Node node, MutableObject pnode) {
1038     Node child;
1039
1040     if (node.tag != tt.tagFont && (node.tag.model & (Dict.CM_INLINE | Dict.CM_ROW)) != 0) {
1041       child = node.content;
1042
1043       if (child == null)
1044         return false;
1045
1046       /* check child has no peers */
1047
1048       if (child.next != null)
1049         return false;
1050
1051       if (child.tag == tt.tagB && lexer.configuration.LogicalEmphasis) {
1052         mergeStyles(node, child);
1053         addStyleProperty(node, "font-weight: bold");
1054         stripOnlyChild(node);
1055         return true;
1056       }
1057
1058       if (child.tag == tt.tagI && lexer.configuration.LogicalEmphasis) {
1059         mergeStyles(node, child);
1060         addStyleProperty(node, "font-style: italic");
1061         stripOnlyChild(node);
1062         return true;
1063       }
1064
1065       if (child.tag == tt.tagFont) {
1066         mergeStyles(node, child);
1067         addFontStyles(node, child.attributes);
1068         stripOnlyChild(node);
1069         return true;
1070       }
1071     }
1072
1073     return false;
1074   }
1075
1076   /*
1077     Replace font elements by span elements, deleting
1078     the font element's attributes and replacing them
1079     by a single style attribute.
1080   */
1081   private boolean font2Span(Lexer lexer, Node node, MutableObject pnode) {
1082     AttVal av, style, next;
1083
1084     if (node.tag == tt.tagFont) {
1085       if (lexer.configuration.DropFontTags) {
1086         discardContainer(node, pnode);
1087         return false;
1088       }
1089
1090       /* if FONT is only child of parent element then leave alone */
1091       if (node.parent.content == node && node.next == null)
1092         return false;
1093
1094       addFontStyles(node, node.attributes);
1095
1096       /* extract style attribute and free the rest */
1097       av = node.attributes;
1098       style = null;
1099
1100       while (av != null) {
1101         next = av.next;
1102
1103         if (av.attribute.equals("style")) {
1104           av.next = null;
1105           style = av;
1106         }
1107
1108         av = next;
1109       }
1110
1111       node.attributes = style;
1112
1113       node.tag = tt.tagSpan;
1114       node.element = "span";
1115
1116       return true;
1117     }
1118
1119     return false;
1120   }
1121
1122   /*
1123     Applies all matching rules to a node.
1124   */
1125   private Node cleanNode(Lexer lexer, Node node) {
1126     Node next = null;
1127     MutableObject o = new MutableObject();
1128     boolean b = false;
1129
1130     for (next = node; node.isElement(); node = next) {
1131       o.setObject(next);
1132
1133       b = dir2Div(lexer, node, o);
1134       next = (Node) o.getObject();
1135       if (b)
1136         continue;
1137
1138       b = nestedList(lexer, node, o);
1139       next = (Node) o.getObject();
1140       if (b)
1141         continue;
1142
1143       b = center2Div(lexer, node, o);
1144       next = (Node) o.getObject();
1145       if (b)
1146         continue;
1147
1148       b = mergeDivs(lexer, node, o);
1149       next = (Node) o.getObject();
1150       if (b)
1151         continue;
1152
1153       b = blockStyle(lexer, node, o);
1154       next = (Node) o.getObject();
1155       if (b)
1156         continue;
1157
1158       b = inlineStyle(lexer, node, o);
1159       next = (Node) o.getObject();
1160       if (b)
1161         continue;
1162
1163       b = font2Span(lexer, node, o);
1164       next = (Node) o.getObject();
1165       if (b)
1166         continue;
1167
1168       break;
1169     }
1170
1171     return next;
1172   }
1173
1174   private Node createStyleProperties(Lexer lexer, Node node) {
1175     Node child;
1176
1177     if (node.content != null) {
1178       for (child = node.content; child != null; child = child.next) {
1179         child = createStyleProperties(lexer, child);
1180       }
1181     }
1182
1183     return cleanNode(lexer, node);
1184   }
1185
1186   private void defineStyleRules(Lexer lexer, Node node) {
1187     Node child;
1188
1189     if (node.content != null) {
1190       for (child = node.content; child != null; child = child.next) {
1191         defineStyleRules(lexer, child);
1192       }
1193     }
1194
1195     style2Rule(lexer, node);
1196   }
1197
1198   public void cleanTree(Lexer lexer, Node doc) {
1199     doc = createStyleProperties(lexer, doc);
1200
1201     if (!lexer.configuration.MakeClean) {
1202       defineStyleRules(lexer, doc);
1203       createStyleElement(lexer, doc);
1204     }
1205   }
1206
1207   /* simplifies <b><b> ... </b> ...</b> etc. */
1208   public void nestedEmphasis(Node node) {
1209     MutableObject o = new MutableObject();
1210     Node next;
1211
1212     while (node != null) {
1213       next = node.next;
1214
1215       if ((node.tag == tt.tagB || node.tag == tt.tagI) && node.parent != null && node.parent.tag == node.tag) {
1216         /* strip redundant inner element */
1217         o.setObject(next);
1218         discardContainer(node, o);
1219         next = (Node) o.getObject();
1220         node = next;
1221         continue;
1222       }
1223
1224       if (node.content != null)
1225         nestedEmphasis(node.content);
1226
1227       node = next;
1228     }
1229   }
1230
1231   /* replace i by em and b by strong */
1232   public void emFromI(Node node) {
1233     while (node != null) {
1234       if (node.tag == tt.tagI) {
1235         node.element = tt.tagEm.name;
1236         node.tag = tt.tagEm;
1237       } else if (node.tag == tt.tagB) {
1238         node.element = tt.tagStrong.name;
1239         node.tag = tt.tagStrong;
1240       }
1241
1242       if (node.content != null)
1243         emFromI(node.content);
1244
1245       node = node.next;
1246     }
1247   }
1248
1249   /*
1250    Some people use dir or ul without an li
1251    to indent the content. The pattern to
1252    look for is a list with a single implicit
1253    li. This is recursively replaced by an
1254    implicit blockquote.
1255   */
1256   public void list2BQ(Node node) {
1257     while (node != null) {
1258       if (node.content != null)
1259         list2BQ(node.content);
1260
1261       if (node.tag != null && node.tag.parser == ParserImpl.getParseList() && node.hasOneChild() && node.content.implicit) {
1262         stripOnlyChild(node);
1263         node.element = tt.tagBlockquote.name;
1264         node.tag = tt.tagBlockquote;
1265         node.implicit = true;
1266       }
1267
1268       node = node.next;
1269     }
1270   }
1271
1272   /*
1273    Replace implicit blockquote by div with an indent
1274    taking care to reduce nested blockquotes to a single
1275    div with the indent set to match the nesting depth
1276   */
1277   public void bQ2Div(Node node) {
1278     int indent;
1279     String indent_buf;
1280
1281     while (node != null) {
1282       if (node.tag == tt.tagBlockquote && node.implicit) {
1283         indent = 1;
1284
1285         while (node.hasOneChild() && node.content.tag == tt.tagBlockquote && node.implicit) {
1286           ++indent;
1287           stripOnlyChild(node);
1288         }
1289
1290         if (node.content != null)
1291           bQ2Div(node.content);
1292
1293         indent_buf = "margin-left: " + (new Integer(2 * indent)).toString() + "em";
1294
1295         node.element = tt.tagDiv.name;
1296         node.tag = tt.tagDiv;
1297         node.addAttribute("style", indent_buf);
1298       } else if (node.content != null)
1299         bQ2Div(node.content);
1300
1301       node = node.next;
1302     }
1303   }
1304
1305   /* node is <![if ...]> prune up to <![endif]> */
1306   public Node pruneSection(Lexer lexer, Node node) {
1307     for (;;) {
1308       /* discard node and returns next */
1309       node = Node.discardElement(node);
1310
1311       if (node == null)
1312         return null;
1313
1314       if (node.type == Node.SectionTag) {
1315         if ((Lexer.getString(node.textarray, node.start, 2)).equals("if")) {
1316           node = pruneSection(lexer, node);
1317           continue;
1318         }
1319
1320         if ((Lexer.getString(node.textarray, node.start, 5)).equals("endif")) {
1321           node = Node.discardElement(node);
1322           break;
1323         }
1324       }
1325     }
1326
1327     return node;
1328   }
1329
1330   public void dropSections(Lexer lexer, Node node) {
1331     while (node != null) {
1332       if (node.type == Node.SectionTag) {
1333         /* prune up to matching endif */
1334         if ((Lexer.getString(node.textarray, node.start, 2)).equals("if")) {
1335           node = pruneSection(lexer, node);
1336           continue;
1337         }
1338
1339         /* discard others as well */
1340         node = Node.discardElement(node);
1341         continue;
1342       }
1343
1344       if (node.content != null)
1345         dropSections(lexer, node.content);
1346
1347       node = node.next;
1348     }
1349   }
1350
1351   // gschadow patch start
1352   /** Get rid of all this pseudo-XML crap, sections, Asp tags, JSP tags, etc.
1353    **/
1354   public void dropPseudoXMLCrap(Lexer lexer, Node node) {
1355     while (node != null) {
1356       switch (node.type) {
1357         case Node.AspTag :
1358         case Node.JsteTag :
1359         case Node.PhpTag :
1360         case Node.SectionTag :
1361           node = Node.discardElement(node);
1362           break;
1363
1364         default :
1365           if (node.content != null)
1366             dropPseudoXMLCrap(lexer, node.content);
1367           node = node.next;
1368           break;
1369       }
1370     }
1371   }
1372   // gschadow patch end
1373
1374   public void purgeAttributes(Node node) {
1375     AttVal attr = node.attributes;
1376     AttVal next = null;
1377     AttVal prev = null;
1378
1379     while (attr != null) {
1380       next = attr.next;
1381
1382       /* special check for class="Code" denoting pre text */
1383       if (attr.attribute != null && attr.value != null && attr.attribute.equals("class") && attr.value.equals("Code")) {
1384         prev = attr;
1385       } else if (
1386         attr.attribute != null
1387           && (attr.attribute.equals("class")
1388             || attr.attribute.equals("style")
1389             || attr.attribute.equals("lang")
1390             || attr.attribute.startsWith("x:")
1391             || ((attr.attribute.equals("height") || attr.attribute.equals("width"))
1392               && (node.tag == tt.tagTd || node.tag == tt.tagTr || node.tag == tt.tagTh)))) {
1393         if (prev != null)
1394           prev.next = next;
1395         else
1396           node.attributes = next;
1397
1398       } else
1399         prev = attr;
1400
1401       attr = next;
1402     }
1403   }
1404
1405   /* Word2000 uses span excessively, so we strip span out */
1406   public Node stripSpan(Lexer lexer, Node span) {
1407     Node node;
1408     Node prev = null;
1409     Node content;
1410
1411     /*
1412      deal with span elements that have content
1413      by splicing the content in place of the span
1414      after having processed it
1415     */
1416
1417     cleanWord2000(lexer, span.content);
1418     content = span.content;
1419
1420     if (span.prev != null)
1421       prev = span.prev;
1422     else if (content != null) {
1423       node = content;
1424       content = content.next;
1425       Node.removeNode(node);
1426       Node.insertNodeBeforeElement(span, node);
1427       prev = node;
1428     }
1429
1430     while (content != null) {
1431       node = content;
1432       content = content.next;
1433       Node.removeNode(node);
1434       Node.insertNodeAfterElement(prev, node);
1435       prev = node;
1436     }
1437
1438     if (span.next == null)
1439       span.parent.last = prev;
1440
1441     node = span.next;
1442     span.content = null;
1443     Node.discardElement(span);
1444     return node;
1445   }
1446
1447   /* map non-breaking spaces to regular spaces */
1448   private void normalizeSpaces(Lexer lexer, Node node) {
1449     while (node != null) {
1450       if (node.content != null)
1451         normalizeSpaces(lexer, node.content);
1452
1453       if (node.type == Node.TextNode) {
1454         int i;
1455         MutableInteger c = new MutableInteger();
1456         int p = node.start;
1457
1458         for (i = node.start; i < node.end; ++i) {
1459           c.value = (int) node.textarray[i];
1460
1461           /* look for UTF-8 multibyte character */
1462           if (c.value > 0x7F)
1463             i += PPrint.getUTF8(node.textarray, i, c);
1464
1465           if (c.value == 160)
1466             c.value = ' ';
1467
1468           p = PPrint.putUTF8(node.textarray, p, c.value);
1469         }
1470       }
1471
1472       node = node.next;
1473     }
1474   }
1475
1476   /*
1477    This is a major clean up to strip out all the extra stuff you get
1478    when you save as web page from Word 2000. It doesn't yet know what
1479    to do with VML tags, but these will appear as errors unless you
1480    declare them as new tags, such as o:p which needs to be declared
1481    as inline.
1482   */
1483   public void cleanWord2000(Lexer lexer, Node node) {
1484     /* used to a list from a sequence of bulletted p's */
1485     Node list = null;
1486
1487     while (node != null) {
1488       /* discard Word's style verbiage */
1489       if (node.tag == tt.tagStyle || node.tag == tt.tagMeta || node.type == Node.CommentTag) {
1490         node = Node.discardElement(node);
1491         continue;
1492       }
1493
1494       /* strip out all span tags Word scatters so liberally! */
1495       if (node.tag == tt.tagSpan) {
1496         node = stripSpan(lexer, node);
1497         continue;
1498       }
1499
1500       /* get rid of Word's xmlns attributes */
1501       if (node.tag == tt.tagHtml) {
1502         /* check that it's a Word 2000 document */
1503         if (node.getAttrByName("xmlns:o") == null)
1504           return;
1505       }
1506
1507       if (node.tag == tt.tagLink) {
1508         AttVal attr = node.getAttrByName("rel");
1509
1510         if (attr != null && attr.value != null && attr.value.equals("File-List")) {
1511           node = Node.discardElement(node);
1512           continue;
1513         }
1514       }
1515
1516       /* discard empty paragraphs */
1517       if (node.content == null && node.tag == tt.tagP) {
1518         node = Node.discardElement(node);
1519         continue;
1520       }
1521
1522       if (node.tag == tt.tagP) {
1523         AttVal attr = node.getAttrByName("class");
1524
1525         /* map sequence of <p class="MsoListBullet"> to <ul>...</ul> */
1526         if (attr != null && attr.value != null && attr.value.equals("MsoListBullet")) {
1527           Node.coerceNode(lexer, node, tt.tagLi);
1528
1529           if (list == null || list.tag != tt.tagUl) {
1530             list = lexer.inferredTag("ul");
1531             Node.insertNodeBeforeElement(node, list);
1532           }
1533
1534           purgeAttributes(node);
1535
1536           if (node.content != null)
1537             cleanWord2000(lexer, node.content);
1538
1539           /* remove node and append to contents of list */
1540           Node.removeNode(node);
1541           Node.insertNodeAtEnd(list, node);
1542           node = list.next;
1543         }
1544         /* map sequence of <p class="Code"> to <pre>...</pre> */
1545         else if (attr != null && attr.value != null && attr.value.equals("Code")) {
1546           Node br = lexer.newLineNode();
1547           normalizeSpaces(lexer, node);
1548
1549           if (list == null || list.tag != tt.tagPre) {
1550             list = lexer.inferredTag("pre");
1551             Node.insertNodeBeforeElement(node, list);
1552           }
1553
1554           /* remove node and append to contents of list */
1555           Node.removeNode(node);
1556           Node.insertNodeAtEnd(list, node);
1557           stripSpan(lexer, node);
1558           Node.insertNodeAtEnd(list, br);
1559           node = list.next;
1560         } else
1561           list = null;
1562       } else
1563         list = null;
1564
1565       /* strip out style and class attributes */
1566       if (node.type == Node.StartTag || node.type == Node.StartEndTag)
1567         purgeAttributes(node);
1568
1569       if (node.content != null)
1570         cleanWord2000(lexer, node.content);
1571
1572       node = node.next;
1573     }
1574   }
1575
1576   public boolean isWord2000(Node root, TagTable tt) {
1577     Node html = root.findHTML(tt);
1578
1579     return (html != null && html.getAttrByName("xmlns:o") != null);
1580   }
1581 }