archive/net.sourceforge.phpeclipse.jtidy/src/net/sourceforge/phpdt/tidy/w3c/Lexer.java

   1 /*
   2  * @(#)Lexer.java   1.11 2000/08/16
   3  *
   4  */
   5
   6 package net.sourceforge.phpdt.tidy.w3c;
   7
   8 /**
   9  *
  10  * Lexer for html parser
  11  *
  12  * (c) 1998-2000 (W3C) MIT, INRIA, Keio University
  13  * See Tidy.java for the copyright notice.
  14  * Derived from <a href="http://www.w3.org/People/Raggett/tidy">
  15  * HTML Tidy Release 4 Aug 2000</a>
  16  *
  17  * @author  Dave Raggett <dsr@w3.org>
  18  * @author  Andy Quick <ac.quick@sympatico.ca> (translation to Java)
  19  * @version 1.0, 1999/05/22
  20  * @version 1.0.1, 1999/05/29
  21  * @version 1.1, 1999/06/18 Java Bean
  22  * @version 1.2, 1999/07/10 Tidy Release 7 Jul 1999
  23  * @version 1.3, 1999/07/30 Tidy Release 26 Jul 1999
  24  * @version 1.4, 1999/09/04 DOM support
  25  * @version 1.5, 1999/10/23 Tidy Release 27 Sep 1999
  26  * @version 1.6, 1999/11/01 Tidy Release 22 Oct 1999
  27  * @version 1.7, 1999/12/06 Tidy Release 30 Nov 1999
  28  * @version 1.8, 2000/01/22 Tidy Release 13 Jan 2000
  29  * @version 1.9, 2000/06/03 Tidy Release 30 Apr 2000
  30  * @version 1.10, 2000/07/22 Tidy Release 8 Jul 2000
  31  * @version 1.11, 2000/08/16 Tidy Release 4 Aug 2000
  32  */
  33
  34 /*
  35   Given a file stream fp it returns a sequence of tokens.
  36
  37      GetToken(fp) gets the next token
  38      UngetToken(fp) provides one level undo
  39
  40   The tags include an attribute list:
  41
  42     - linked list of attribute/value nodes
  43     - each node has 2 null-terminated strings.
  44     - entities are replaced in attribute values
  45
  46   white space is compacted if not in preformatted mode
  47   If not in preformatted mode then leading white space
  48   is discarded and subsequent white space sequences
  49   compacted to single space chars.
  50
  If XmlTags is no then tag names are folded to lower
  case, as are attribute names.
  53
  54  Not yet done:
  55     -   Doctype subset and marked sections
  56 */
  57
  58 import java.io.PrintWriter;
  59 import java.util.Stack;
  60 import java.util.Vector;
  61
  62 import org.eclipse.core.resources.IFile;
  63
  64 public class Lexer {
  65
    /* workspace file handed to the constructor; exposed through getIFile() */
    private IFile iFile;
    public StreamIn in;   /* file stream */
    public PrintWriter errout;   /* error output stream (not set by the constructor) */
    public short badAccess; /* for accessibility errors */
    public short badLayout; /* for bad style errors */
    public short badChars;  /* for bad char encodings */
    public short badForm;   /* for mismatched/mispositioned form tags */
    public short warnings;  /* count of warnings in this document */
    public short errors;    /* count of errors */
    public int   lines;     /* lines seen */
    public int   columns;   /* at start of current token */
    public boolean waswhite;  /* used to collapse contiguous white space */
    public boolean pushed;    /* true after token has been pushed back */
    public boolean insertspace;   /* when space is moved after end tag */
    public boolean excludeBlocks;  /* Netscape compatibility */
    public boolean exiled;    /* true if moved out of table */
    public boolean isvoyager; /* true if xmlns attribute on html element */
    public short versions;  /* bit vector of HTML versions; starts as Dict.VERS_EVERYTHING */
    public int doctype;    /* version as given by doctype (if any); starts as Dict.VERS_UNKNOWN */
    public boolean badDoctype; /* e.g. if html or PUBLIC is missing */
    public int txtstart;  /* start of current node (index into lexbuf) */
    public int txtend;    /* end of current node (index into lexbuf) */
    public short state;     /* state of lexer's finite state machine */
    public Node token;      /* NOTE(review): presumably the current/pushed-back token -- confirm against getToken */

    /*
      lexer character buffer

      parse tree nodes span onto this buffer
      which contains the concatenated text
      contents of all of the elements.

      The buffer is null until the first addByte() call, which
      allocates 8192 bytes; after that it doubles whenever it
      fills up, and every node recorded in nodeList is repointed
      at the new array.

      lexsize must be reset for each file.
    */
    public byte[] lexbuf;   /* byte buffer of UTF-8 chars */
    public int lexlength;   /* allocated */
    public int lexsize;     /* used */

    /* Inline stack for compatibility with Mosaic */
    public Node inode;        /* for deferring text node */
    public int insert;        /* for inferring inline tags; starts as -1 */
    public Stack istack;
    public int istackbase;    /* start of frame */

    public Style styles;      /* used for cleaning up presentation markup */

    public Configuration configuration;
    protected int seenBodyEndTag; /* used by parser */
    /*
      every Node handed out by newNode()/cloneNode() (plus the asp/php
      nodes of cloned attributes), so that updateNodeTextArrays() can
      repoint them when lexbuf is reallocated
    */
    private Vector nodeList;
 115
 116     public Lexer(IFile iFile, StreamIn in, Configuration configuration)
 117     {
 118         this.iFile = iFile;
 119         this.in = in;
 120         this.lines = 1;
 121         this.columns = 1;
 122         this.state = LEX_CONTENT;
 123         this.badAccess = 0;
 124         this.badLayout = 0;
 125         this.badChars = 0;
 126         this.badForm = 0;
 127         this.warnings = 0;
 128         this.errors = 0;
 129         this.waswhite = false;
 130         this.pushed = false;
 131         this.insertspace = false;
 132         this.exiled = false;
 133         this.isvoyager = false;
 134         this.versions = Dict.VERS_EVERYTHING;
 135         this.doctype = Dict.VERS_UNKNOWN;
 136         this.badDoctype = false;
 137         this.txtstart = 0;
 138         this.txtend = 0;
 139         this.token = null;
 140         this.lexbuf =  null;
 141         this.lexlength = 0;
 142         this.lexsize = 0;
 143         this.inode = null;
 144         this.insert = -1;
 145         this.istack = new Stack();
 146         this.istackbase = 0;
 147         this.styles = null;
 148         this.configuration = configuration;
 149         this.seenBodyEndTag = 0;
 150         this.nodeList = new Vector();
 151     }
 152
 153     public IFile getIFile() {
 154       return iFile;
 155     }
 156
 157     public Node newNode()
 158     {
 159         Node node = new Node();
 160         nodeList.addElement(node);
 161         return node;
 162     }
 163
 164     public Node newNode(short type, byte[] textarray, int start, int end)
 165     {
 166         Node node = new Node(type, textarray, start, end);
 167         nodeList.addElement(node);
 168         return node;
 169     }
 170
 171     public Node newNode(short type, byte[] textarray, int start, int end, String element)
 172     {
 173         Node node = new Node(type, textarray, start, end, element, configuration.tt);
 174         nodeList.addElement(node);
 175         return node;
 176     }
 177
 178     public Node cloneNode(Node node)
 179     {
 180         Node cnode = (Node)node.clone();
 181         nodeList.addElement(cnode);
 182         for (AttVal att = cnode.attributes; att != null; att = att.next) {
 183             if (att.asp != null)
 184                 nodeList.addElement(att.asp);
 185             if (att.php != null)
 186                 nodeList.addElement(att.php);
 187         }
 188         return cnode;
 189     }
 190
 191     public AttVal cloneAttributes(AttVal attrs)
 192     {
 193         AttVal cattrs = (AttVal)attrs.clone();
 194         for (AttVal att = cattrs; att != null; att = att.next) {
 195             if (att.asp != null)
 196                 nodeList.addElement(att.asp);
 197             if (att.php != null)
 198                 nodeList.addElement(att.php);
 199         }
 200         return cattrs;
 201     }
 202
 203     protected void updateNodeTextArrays(byte[] oldtextarray, byte[] newtextarray)
 204     {
 205         Node node;
 206         for (int i = 0; i < nodeList.size(); i++) {
 207             node = (Node)(nodeList.elementAt(i));
 208             if (node.textarray == oldtextarray)
 209                 node.textarray = newtextarray;
 210         }
 211     }
 212
 213     /* used for creating preformatted text from Word2000 */
 214     public Node newLineNode()
 215     {
 216         Node node = newNode();
 217
 218         node.textarray = this.lexbuf;
 219         node.start = this.lexsize;
 220         addCharToLexer((int)'\n');
 221         node.end = this.lexsize;
 222         return node;
 223     }
 224
    // It should always be possible to convert to/from UTF-8, so encoding
    // exceptions are converted to an Error to avoid adding throws
    // declarations to lots of methods.
 228
 229     public static byte[] getBytes(String str) {
 230         try {
 231             return str.getBytes("UTF8");
 232         } catch (java.io.UnsupportedEncodingException e) {
 233             throw new Error("string to UTF-8 conversion failed: " + e.getMessage());
 234         }
 235     }
 236
 237     public static String getString(byte[] bytes, int offset, int length) {
 238         try {
 239             return new String(bytes, offset, length, "UTF8");
 240         } catch (java.io.UnsupportedEncodingException e) {
 241             throw new Error("UTF-8 to string conversion failed: " + e.getMessage());
 242         }
 243     }
 244
 245     public boolean endOfInput()
 246     {
 247         return this.in.isEndOfStream();
 248     }
 249
 250     public void addByte(int c)
 251     {
 252         if (this.lexsize + 1 >= this.lexlength)
 253         {
 254             while (this.lexsize + 1 >= this.lexlength)
 255             {
 256                 if (this.lexlength == 0)
 257                     this.lexlength = 8192;
 258                 else
 259                     this.lexlength = this.lexlength * 2;
 260             }
 261
 262             byte[] temp = this.lexbuf;
 263             this.lexbuf = new byte[ this.lexlength ];
 264             if (temp != null)
 265             {
 266                 System.arraycopy( temp, 0, this.lexbuf, 0, temp.length );
 267                 updateNodeTextArrays(temp, this.lexbuf);
 268             }
 269         }
 270
 271         this.lexbuf[this.lexsize++] = (byte)c;
 272         this.lexbuf[this.lexsize] = (byte)'\0';  /* debug */
 273     }
 274
 275     public void changeChar(byte c)
 276     {
 277         if (this.lexsize > 0)
 278         {
 279             this.lexbuf[this.lexsize-1] = c;
 280         }
 281     }
 282
 283     /* store char c as UTF-8 encoded byte stream */
 284     public void addCharToLexer(int c)
 285     {
 286         if (c < 128)
 287             addByte(c);
 288         else if (c <= 0x7FF)
 289         {
 290             addByte(0xC0 | (c >> 6));
 291             addByte(0x80 | (c & 0x3F));
 292         }
 293         else if (c <= 0xFFFF)
 294         {
 295             addByte(0xE0 | (c >> 12));
 296             addByte(0x80 | ((c >> 6) & 0x3F));
 297             addByte(0x80 | (c & 0x3F));
 298         }
 299         else if (c <= 0x1FFFFF)
 300         {
 301             addByte(0xF0 | (c >> 18));
 302             addByte(0x80 | ((c >> 12) & 0x3F));
 303             addByte(0x80 | ((c >> 6) & 0x3F));
 304             addByte(0x80 | (c & 0x3F));
 305         }
 306         else
 307         {
 308             addByte(0xF8 | (c >> 24));
 309             addByte(0x80 | ((c >> 18) & 0x3F));
 310             addByte(0x80 | ((c >> 12) & 0x3F));
 311             addByte(0x80 | ((c >> 6) & 0x3F));
 312             addByte(0x80 | (c & 0x3F));
 313         }
 314     }
 315
 316     public void addStringToLexer(String str)
 317     {
 318         for ( int i = 0; i < str.length(); i++ ) {
 319             addCharToLexer( (int)str.charAt(i) );
 320         }
 321     }
 322
    /*
      No longer attempts to insert a missing ';' for unknown
      entities unless one was already present, since this
      gives unexpected results.

      For example:   <a href="something.htm?foo&bar&fred">
      was tidied to: <a href="something.htm?foo&amp;bar;&amp;fred;">
      rather than:   <a href="something.htm?foo&amp;bar&amp;fred">

      My thanks to Maurice Buxton for spotting this.
    */
 334     public void parseEntity(short mode)
 335     {
 336         short map;
 337         int start;
 338         boolean first = true;
 339         boolean semicolon = false;
 340         boolean numeric = false;
 341         int c, ch, startcol;
 342         String str;
 343
 344         start = this.lexsize - 1;  /* to start at "&" */
 345         startcol = this.in.curcol - 1;
 346
 347         while (true)
 348         {
 349             c = this.in.readChar();
 350             if (c == StreamIn.EndOfStream) break;
 351             if (c == ';')
 352             {
 353                 semicolon = true;
 354                 break;
 355             }
 356
 357             if (first && c == '#')
 358             {
 359                 addCharToLexer(c);
 360                 first = false;
 361                 numeric = true;
 362                 continue;
 363             }
 364
 365             first = false;
 366             map = MAP((char)c);
 367
 368             /* AQ: Added flag for numeric entities so that numeric entities
 369                with missing semi-colons are recognized.
 370                Eg. "&#114e&#112;..." is recognized as "rep"
 371             */
 372             if (numeric && ((c == 'x') || ((map & DIGIT) != 0)))
 373             {
 374                 addCharToLexer(c);
 375                 continue;
 376             }
 377             if (!numeric && ((map & NAMECHAR) != 0))
 378             {
 379                 addCharToLexer(c);
 380                 continue;
 381             }
 382
 383             /* otherwise put it back */
 384
 385             this.in.ungetChar(c);
 386             break;
 387         }
 388
 389         str = getString( this.lexbuf, start, this.lexsize - start );
 390         ch = EntityTable.getDefaultEntityTable().entityCode( str );
 391
 392         /* deal with unrecognized entities */
 393         if (ch <= 0)
 394         {
 395             /* set error position just before offending chararcter */
 396             this.lines = this.in.curline;
 397             this.columns = startcol;
 398
 399             if (this.lexsize > start +1 )
 400             {
 401                 Report.entityError(this, Report.UNKNOWN_ENTITY, str, ch);
 402
 403                 if (semicolon)
 404                     addCharToLexer(';');
 405             }
 406             else /* naked & */
 407             {
 408                 Report.entityError(this, Report.UNESCAPED_AMPERSAND, str, ch);
 409             }
 410         }
 411         else
 412         {
 413             if (c != ';')    /* issue warning if not terminated by ';' */
 414             {
 415                 /* set error position just before offending chararcter */
 416                 this.lines = this.in.curline;
 417                 this.columns = startcol;
 418                 Report.entityError(this, Report.MISSING_SEMICOLON, str, c);
 419             }
 420
 421             this.lexsize = start;
 422
 423             if (ch == 160 && (mode & Preformatted) != 0)
 424                 ch = ' ';
 425
 426             addCharToLexer(ch);
 427
 428             if (ch == '&' && !this.configuration.QuoteAmpersand)
 429             {
 430                 addCharToLexer('a');
 431                 addCharToLexer('m');
 432                 addCharToLexer('p');
 433                 addCharToLexer(';');
 434             }
 435         }
 436     }
 437
 438     public char parseTagName()
 439     {
 440         short map;
 441         int c;
 442
 443         /* fold case of first char in buffer */
 444
 445         c = this.lexbuf[this.txtstart];
 446         map = MAP((char)c);
 447
 448         if (!this.configuration.XmlTags && (map & UPPERCASE) != 0)
 449         {
 450             c += (int)((int)'a' - (int)'A');
 451             this.lexbuf[this.txtstart] = (byte)c;
 452         }
 453
 454         while (true)
 455         {
 456             c = this.in.readChar();
 457             if (c == StreamIn.EndOfStream) break;
 458             map = MAP((char)c);
 459
 460             if ((map & NAMECHAR) == 0)
 461                 break;
 462
 463             /* fold case of subsequent chars */
 464
 465             if (!this.configuration.XmlTags && (map & UPPERCASE) != 0)
 466                 c += (int)((int)'a' - (int)'A');
 467
 468             addCharToLexer(c);
 469         }
 470
 471         this.txtend = this.lexsize;
 472         return (char)c;
 473     }
 474
 475     public void addStringLiteral(String str)
 476     {
 477         for ( int i = 0; i < str.length(); i++ ) {
 478             addCharToLexer( (int)str.charAt(i) );
 479         }
 480     }
 481
 482     /* choose what version to use for new doctype */
 483     public short HTMLVersion()
 484     {
 485         short versions;
 486
 487         versions = this.versions;
 488
 489         if ((versions & Dict.VERS_HTML20) != 0)
 490             return Dict.VERS_HTML20;
 491
 492         if ((versions & Dict.VERS_HTML32) != 0)
 493             return Dict.VERS_HTML32;
 494
 495         if ((versions & Dict.VERS_HTML40_STRICT) != 0)
 496             return Dict.VERS_HTML40_STRICT;
 497
 498         if ((versions & Dict.VERS_HTML40_LOOSE) != 0)
 499             return Dict.VERS_HTML40_LOOSE;
 500
 501         if ((versions & Dict.VERS_FRAMES) != 0)
 502             return Dict.VERS_FRAMES;
 503
 504         return Dict.VERS_UNKNOWN;
 505     }
 506
 507     public String HTMLVersionName()
 508     {
 509         short guessed;
 510         int j;
 511
 512         guessed = apparentVersion();
 513
 514         for (j = 0; j < W3CVersion.length; ++j)
 515         {
 516             if (guessed == W3CVersion[j].code)
 517             {
 518                 if (this.isvoyager)
 519                     return W3CVersion[j].voyagerName;
 520
 521                 return W3CVersion[j].name;
 522             }
 523         }
 524
 525         return null;
 526     }
 527
 528     /* add meta element for Tidy */
 529     public boolean addGenerator(Node root)
 530     {
 531         AttVal attval;
 532         Node node;
 533         Node head = root.findHEAD(configuration.tt);
 534
 535         if (head != null)
 536         {
 537             for (node = head.content; node != null; node = node.next)
 538             {
 539                 if (node.tag == configuration.tt.tagMeta)
 540                 {
 541                     attval = node.getAttrByName("name");
 542
 543                     if (attval != null && attval.value != null &&
 544                         Lexer.wstrcasecmp(attval.value, "generator") == 0)
 545                     {
 546                         attval = node.getAttrByName("content");
 547
 548                         if (attval != null && attval.value != null &&
 549                             attval.value.length() >= 9 &&
 550                             Lexer.wstrcasecmp(attval.value.substring(0, 9), "HTML Tidy") == 0)
 551                         {
 552                             return false;
 553                         }
 554                     }
 555                 }
 556             }
 557
 558             node = this.inferredTag("meta");
 559             node.addAttribute("content", "HTML Tidy, see www.w3.org");
 560             node.addAttribute("name", "generator");
 561             Node.insertNodeAtStart(head, node);
 562             return true;
 563         }
 564
 565         return false;
 566     }
 567
 568     /* return true if substring s is in p and isn't all in upper case */
 569     /* this is used to check the case of SYSTEM, PUBLIC, DTD and EN */
 570     /* len is how many chars to check in p */
 571     private static boolean findBadSubString(String s, String p, int len)
 572     {
 573         int n = s.length();
 574         int i = 0;
 575         String ps;
 576
 577         while (n < len)
 578         {
 579             ps = p.substring(i, i + n);
 580             if (wstrcasecmp(s, ps) == 0)
 581                 return (!ps.equals(s.substring(0, n)));
 582
 583             ++i;
 584             --len;
 585         }
 586
 587         return false;
 588     }
 589
 590     public boolean checkDocTypeKeyWords(Node doctype)
 591     {
 592         int len = doctype.end - doctype.start;
 593         String s = getString(this.lexbuf, doctype.start, len);
 594
 595         return !(
 596             findBadSubString("SYSTEM", s, len) ||
 597             findBadSubString("PUBLIC", s, len) ||
 598             findBadSubString("//DTD", s, len) ||
 599             findBadSubString("//W3C", s, len) ||
 600             findBadSubString("//EN", s, len)
 601             );
 602     }
 603
 604     /* examine <!DOCTYPE> to identify version */
 605     public short findGivenVersion(Node doctype)
 606     {
 607         String p, s;
 608         int i, j;
 609         int len;
 610         String str1;
 611         String str2;
 612
 613         /* if root tag for doctype isn't html give up now */
 614         str1 = getString(this.lexbuf, doctype.start, 5);
 615         if (wstrcasecmp(str1, "html ") != 0)
 616             return 0;
 617
 618         if (!checkDocTypeKeyWords(doctype))
 619             Report.warning(this, doctype, null, Report.DTYPE_NOT_UPPER_CASE);
 620
 621         /* give up if all we are given is the system id for the doctype */
 622         str1 = getString(this.lexbuf, doctype.start + 5, 7);
 623         if (wstrcasecmp(str1, "SYSTEM ") == 0)
 624         {
 625             /* but at least ensure the case is correct */
 626             if (!str1.substring(0, 6).equals("SYSTEM"))
 627                 System.arraycopy( getBytes("SYSTEM"), 0,
 628                                   this.lexbuf, doctype.start + 5, 6 );
 629             return 0;  /* unrecognized */
 630         }
 631
 632         if (wstrcasecmp(str1, "PUBLIC ") == 0)
 633         {
 634             if (!str1.substring(0, 6).equals("PUBLIC"))
 635                 System.arraycopy( getBytes("PUBLIC "), 0,
 636                                   this.lexbuf, doctype.start + 5, 6 );
 637         }
 638         else
 639             this.badDoctype = true;
 640
 641         for (i = doctype.start; i < doctype.end; ++i)
 642         {
 643             if (this.lexbuf[i] == (byte)'"')
 644             {
 645                 str1 = getString( this.lexbuf, i + 1, 12 );
 646                 str2 = getString( this.lexbuf, i + 1, 13 );
 647                 if (str1.equals("-//W3C//DTD "))
 648                 {
 649                     /* compute length of identifier e.g. "HTML 4.0 Transitional" */
 650                     for (j = i + 13; j < doctype.end && this.lexbuf[j] != (byte)'/'; ++j);
 651                     len = j - i - 13;
 652                     p = getString( this.lexbuf, i + 13, len );
 653
 654                     for (j = 1; j < W3CVersion.length; ++j)
 655                     {
 656                         s = W3CVersion[j].name;
 657                         if (len == s.length() && s.equals(p))
 658                             return W3CVersion[j].code;
 659                     }
 660
 661                     /* else unrecognized version */
 662                 }
 663                 else if (str2.equals("-//IETF//DTD "))
 664                 {
 665                     /* compute length of identifier e.g. "HTML 2.0" */
 666                     for (j = i + 14; j < doctype.end && this.lexbuf[j] != (byte)'/'; ++j);
 667                     len = j - i - 14;
 668
 669                     p = getString( this.lexbuf, i + 14, len );
 670                     s = W3CVersion[0].name;
 671                     if (len == s.length() && s.equals(p))
 672                         return W3CVersion[0].code;
 673
 674                     /* else unrecognized version */
 675                 }
 676                 break;
 677             }
 678         }
 679
 680         return 0;
 681     }
 682
 683     public void fixHTMLNameSpace(Node root, String profile)
 684     {
 685         Node node;
 686         AttVal prev, attr;
 687
 688         for (node = root.content;
 689                 node != null && node.tag != configuration.tt.tagHtml; node = node.next);
 690
 691         if (node != null)
 692         {
 693             prev = null;
 694
 695             for (attr = node.attributes; attr != null; attr = attr.next)
 696             {
 697                 if (attr.attribute.equals("xmlns"))
 698                     break;
 699
 700                 prev = attr;
 701             }
 702
 703             if (attr != null)
 704             {
 705                 if (!attr.value.equals(profile))
 706                 {
 707                     Report.warning(this, node, null, Report.INCONSISTENT_NAMESPACE);
 708                     attr.value = profile;
 709                 }
 710             }
 711             else
 712             {
 713                 attr = new AttVal( node.attributes, null, (int)'"',
 714                                    "xmlns", profile );
 715                 attr.dict =
 716                     AttributeTable.getDefaultAttributeTable().findAttribute( attr );
 717                 node.attributes = attr;
 718             }
 719         }
 720     }
 721
 722     public boolean setXHTMLDocType(Node root)
 723     {
 724         String fpi = " ";
 725         String sysid = "";
 726         String namespace = XHTML_NAMESPACE;
 727         Node doctype;
 728
 729         doctype = root.findDocType();
 730
 731         if (configuration.docTypeMode == Configuration.DOCTYPE_OMIT)
 732         {
 733             if (doctype != null)
 734                 Node.discardElement(doctype);
 735             return true;
 736         }
 737
 738         if (configuration.docTypeMode == Configuration.DOCTYPE_AUTO)
 739         {
 740             /* see what flavor of XHTML this document matches */
 741             if ((this.versions & Dict.VERS_HTML40_STRICT) != 0)
 742             {  /* use XHTML strict */
 743                 fpi = "-//W3C//DTD XHTML 1.0 Strict//EN";
 744                 sysid = voyager_strict;
 745             }
 746             else if ((this.versions & Dict.VERS_LOOSE) != 0)
 747             {
 748                 fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
 749                 sysid = voyager_loose;
 750             }
 751             else if ((this.versions & Dict.VERS_FRAMES) != 0)
 752             {   /* use XHTML frames */
 753                 fpi = "-//W3C//DTD XHTML 1.0 Frameset//EN";
 754                 sysid = voyager_frameset;
 755             }
 756             else /* lets assume XHTML transitional */
 757             {
 758                 fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
 759                 sysid = voyager_loose;
 760             }
 761         }
 762         else if (configuration.docTypeMode == Configuration.DOCTYPE_STRICT)
 763         {
 764             fpi = "-//W3C//DTD XHTML 1.0 Strict//EN";
 765             sysid = voyager_strict;
 766         }
 767         else if (configuration.docTypeMode == Configuration.DOCTYPE_LOOSE)
 768         {
 769             fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
 770             sysid = voyager_loose;
 771         }
 772
 773         fixHTMLNameSpace(root, namespace);
 774
 775         if (doctype == null)
 776         {
 777             doctype = newNode(Node.DocTypeTag, this.lexbuf, 0, 0);
 778             doctype.next = root.content;
 779             doctype.parent = root;
 780             doctype.prev = null;
 781             root.content = doctype;
 782         }
 783
 784         if (configuration.docTypeMode == Configuration.DOCTYPE_USER &&
 785             configuration.docTypeStr != null)
 786         {
 787             fpi = configuration.docTypeStr;
 788             sysid = "";
 789         }
 790
 791         this.txtstart = this.lexsize;
 792         this.txtend = this.lexsize;
 793
 794         /* add public identifier */
 795         addStringLiteral("html PUBLIC ");
 796
 797         /* check if the fpi is quoted or not */
 798         if (fpi.charAt(0) == '"')
 799             addStringLiteral(fpi);
 800         else
 801         {
 802             addStringLiteral("\"");
 803             addStringLiteral(fpi);
 804             addStringLiteral("\"");
 805         }
 806
 807         if (sysid.length() + 6 >= this.configuration.wraplen)
 808             addStringLiteral("\n\"");
 809         else
 810             addStringLiteral("\n    \"");
 811
 812         /* add system identifier */
 813         addStringLiteral(sysid);
 814         addStringLiteral("\"");
 815
 816         this.txtend = this.lexsize;
 817
 818         doctype.start = this.txtstart;
 819         doctype.end = this.txtend;
 820
 821         return false;
 822     }
 823
 824     public short apparentVersion()
 825     {
 826         switch (this.doctype)
 827         {
 828         case Dict.VERS_UNKNOWN:
 829             return HTMLVersion();
 830
 831         case Dict.VERS_HTML20:
 832             if ((this.versions & Dict.VERS_HTML20) != 0)
 833                 return Dict.VERS_HTML20;
 834
 835             break;
 836
 837         case Dict.VERS_HTML32:
 838             if ((this.versions & Dict.VERS_HTML32) != 0)
 839                 return Dict.VERS_HTML32;
 840
 841             break; /* to replace old version by new */
 842
 843         case Dict.VERS_HTML40_STRICT:
 844             if ((this.versions & Dict.VERS_HTML40_STRICT) != 0)
 845                 return Dict.VERS_HTML40_STRICT;
 846
 847             break;
 848
 849         case Dict.VERS_HTML40_LOOSE:
 850             if ((this.versions & Dict.VERS_HTML40_LOOSE) != 0)
 851                 return Dict.VERS_HTML40_LOOSE;
 852
 853             break; /* to replace old version by new */
 854
 855         case Dict.VERS_FRAMES:
 856             if ((this.versions & Dict.VERS_FRAMES) != 0)
 857                 return Dict.VERS_FRAMES;
 858
 859             break;
 860         }
 861
 862         Report.warning(this, null, null, Report.INCONSISTENT_VERSION);
 863         return this.HTMLVersion();
 864     }
 865
 866     /* fixup doctype if missing */
 867     public boolean fixDocType(Node root)
 868     {
 869         Node doctype;
 870         int guessed = Dict.VERS_HTML40_STRICT, i;
 871
 872         if (this.badDoctype)
 873             Report.warning(this, null, null, Report.MALFORMED_DOCTYPE);
 874
 875         if (configuration.XmlOut)
 876             return true;
 877
 878         doctype = root.findDocType();
 879
 880         if (configuration.docTypeMode == Configuration.DOCTYPE_OMIT)
 881         {
 882             if (doctype != null)
 883                 Node.discardElement(doctype);
 884             return true;
 885         }
 886
 887         if (configuration.docTypeMode == Configuration.DOCTYPE_STRICT)
 888         {
 889             Node.discardElement(doctype);
 890             doctype = null;
 891             guessed = Dict.VERS_HTML40_STRICT;
 892         }
 893         else if (configuration.docTypeMode == Configuration.DOCTYPE_LOOSE)
 894         {
 895             Node.discardElement(doctype);
 896             doctype = null;
 897             guessed = Dict.VERS_HTML40_LOOSE;
 898         }
 899         else if (configuration.docTypeMode == Configuration.DOCTYPE_AUTO)
 900         {
 901             if (doctype != null)
 902             {
 903                 if (this.doctype == Dict.VERS_UNKNOWN)
 904                     return false;
 905
 906                 switch (this.doctype)
 907                 {
 908                 case Dict.VERS_UNKNOWN:
 909                     return false;
 910
 911                 case Dict.VERS_HTML20:
 912                     if ((this.versions & Dict.VERS_HTML20) != 0)
 913                         return true;
 914
 915                     break; /* to replace old version by new */
 916
 917                 case Dict.VERS_HTML32:
 918                     if ((this.versions & Dict.VERS_HTML32) != 0)
 919                         return true;
 920
 921                     break; /* to replace old version by new */
 922
 923                 case Dict.VERS_HTML40_STRICT:
 924                     if ((this.versions & Dict.VERS_HTML40_STRICT) != 0)
 925                         return true;
 926
 927                     break; /* to replace old version by new */
 928
 929                 case Dict.VERS_HTML40_LOOSE:
 930                     if ((this.versions & Dict.VERS_HTML40_LOOSE) != 0)
 931                         return true;
 932
 933                     break; /* to replace old version by new */
 934
 935                 case Dict.VERS_FRAMES:
 936                     if ((this.versions & Dict.VERS_FRAMES) != 0)
 937                         return true;
 938
 939                     break; /* to replace old version by new */
 940                 }
 941
 942                 /* INCONSISTENT_VERSION warning is now issued by ApparentVersion() */
 943             }
 944
 945             /* choose new doctype */
 946             guessed = HTMLVersion();
 947         }
 948
 949         if (guessed == Dict.VERS_UNKNOWN)
 950             return false;
 951
 952         /* for XML use the Voyager system identifier */
 953         if (this.configuration.XmlOut || this.configuration.XmlTags || this.isvoyager)
 954         {
 955             if (doctype != null)
 956                 Node.discardElement(doctype);
 957
 958             for (i = 0; i < W3CVersion.length; ++i)
 959             {
 960                 if (guessed == W3CVersion[i].code)
 961                 {
 962                     fixHTMLNameSpace(root, W3CVersion[i].profile);
 963                     break;
 964                 }
 965             }
 966
 967             return true;
 968         }
 969
 970         if (doctype == null)
 971         {
 972             doctype = newNode(Node.DocTypeTag, this.lexbuf, 0, 0);
 973             doctype.next = root.content;
 974             doctype.parent = root;
 975             doctype.prev = null;
 976             root.content = doctype;
 977         }
 978
 979         this.txtstart = this.lexsize;
 980         this.txtend = this.lexsize;
 981
 982         /* use the appropriate public identifier */
 983         addStringLiteral("html PUBLIC ");
 984
 985         if (configuration.docTypeMode == Configuration.DOCTYPE_USER &&
 986             configuration.docTypeStr != null)
 987             addStringLiteral(configuration.docTypeStr);
 988         else if (guessed == Dict.VERS_HTML20)
 989             addStringLiteral("\"-//IETF//DTD HTML 2.0//EN\"");
 990         else
 991         {
 992             addStringLiteral("\"-//W3C//DTD ");
 993
 994             for (i = 0; i < W3CVersion.length; ++i)
 995             {
 996                 if (guessed == W3CVersion[i].code)
 997                 {
 998                     addStringLiteral(W3CVersion[i].name);
 999                     break;
1000                 }
1001             }
1002
1003             addStringLiteral("//EN\"");
1004         }
1005
1006         this.txtend = this.lexsize;
1007
1008         doctype.start = this.txtstart;
1009         doctype.end = this.txtend;
1010
1011         return true;
1012     }
1013
1014     /* ensure XML document starts with <?XML version="1.0"?> */
1015     public boolean fixXMLPI(Node root)
1016     {
1017         Node xml;
1018         int s;
1019
1020         if( root.content != null && root.content.type == Node.ProcInsTag)
1021         {
1022             s = root.content.start;
1023
1024             if (this.lexbuf[s] == (byte)'x' &&
1025                 this.lexbuf[s+1] == (byte)'m' &&
1026                 this.lexbuf[s+2] == (byte)'l')
1027                 return true;
1028         }
1029
1030         xml = newNode(Node.ProcInsTag, this.lexbuf, 0, 0);
1031         xml.next = root.content;
1032
1033         if (root.content != null)
1034         {
1035             root.content.prev = xml;
1036             xml.next = root.content;
1037         }
1038
1039         root.content = xml;
1040
1041         this.txtstart = this.lexsize;
1042         this.txtend = this.lexsize;
1043         addStringLiteral("xml version=\"1.0\"");
1044         if (this.configuration.CharEncoding == Configuration.LATIN1)
1045             addStringLiteral(" encoding=\"ISO-8859-1\"");
1046         this.txtend = this.lexsize;
1047
1048         xml.start = this.txtstart;
1049         xml.end = this.txtend;
1050         return false;
1051     }
1052
1053     public Node inferredTag(String name)
1054     {
1055         Node node;
1056
1057         node = newNode(Node.StartTag,
1058                         this.lexbuf,
1059                         this.txtstart,
1060                         this.txtend,
1061                         name);
1062         node.implicit = true;
1063         return node;
1064     }
1065
1066     public static boolean expectsContent(Node node)
1067     {
1068         if (node.type != Node.StartTag)
1069             return false;
1070
1071         /* unknown element? */
1072         if (node.tag == null)
1073             return true;
1074
1075         if ((node.tag.model & Dict.CM_EMPTY) != 0)
1076             return false;
1077
1078         return true;
1079     }
1080
1081     /*
1082       create a text node for the contents of
1083       a CDATA element like style or script
1084       which ends with </foo> for some foo.
1085     */
1086     public Node getCDATA(Node container)
1087     {
1088         int c, lastc, start, len, i;
1089         String str;
1090         boolean endtag = false;
1091
1092         this.lines = this.in.curline;
1093         this.columns = this.in.curcol;
1094         this.waswhite = false;
1095         this.txtstart = this.lexsize;
1096         this.txtend = this.lexsize;
1097
1098         lastc = (int)'\0';
1099         start = -1;
1100
1101         while (true)
1102         {
1103             c = this.in.readChar();
1104             if (c == StreamIn.EndOfStream) break;
1105             /* treat \r\n as \n and \r as \n */
1106
1107             if (c == (int)'/' && lastc == (int)'<')
1108             {
1109                 if (endtag)
1110                 {
1111                     this.lines = this.in.curline;
1112                     this.columns = this.in.curcol - 3;
1113
1114                     Report.warning(this, null, null, Report.BAD_CDATA_CONTENT);
1115                 }
1116
1117                 start = this.lexsize + 1;  /* to first letter */
1118                 endtag = true;
1119             }
1120             else if (c == (int)'>' && start >= 0)
1121             {
1122                 len = this.lexsize - start;
1123                 if (len == container.element.length())
1124                 {
1125                     str = getString( this.lexbuf, start, len );
1126                     if (Lexer.wstrcasecmp(str, container.element) == 0)
1127                     {
1128                         this.txtend = start - 2;
1129                         break;
1130                     }
1131                 }
1132
1133                 this.lines = this.in.curline;
1134                 this.columns = this.in.curcol - 3;
1135
1136                 Report.warning(this, null, null, Report.BAD_CDATA_CONTENT);
1137
1138                 /* if javascript insert backslash before / */
1139
1140                 if (ParserImpl.isJavaScript(container))
1141                 {
1142                     for (i = this.lexsize; i > start-1; --i)
1143                         this.lexbuf[i] = this.lexbuf[i-1];
1144
1145                     this.lexbuf[start-1] = (byte)'\\';
1146                     this.lexsize++;
1147                 }
1148
1149                 start = -1;
1150             }
1151             else if (c == (int)'\r')
1152             {
1153                 c = this.in.readChar();
1154
1155                 if (c != (int)'\n')
1156                     this.in.ungetChar(c);
1157
1158                 c = (int)'\n';
1159             }
1160
1161             addCharToLexer((int)c);
1162             this.txtend = this.lexsize;
1163             lastc = c;
1164         }
1165
1166         if (c == StreamIn.EndOfStream)
1167             Report.warning(this, container, null, Report.MISSING_ENDTAG_FOR);
1168
1169         if (this.txtend > this.txtstart)
1170         {
1171             this.token = newNode(Node.TextNode,
1172                                   this.lexbuf,
1173                                   this.txtstart,
1174                                   this.txtend);
1175             return this.token;
1176         }
1177
1178         return null;
1179     }
1180
1181     public void ungetToken()
1182     {
1183         this.pushed = true;
1184     }
1185
    /* lexing modes passed to getToken() */
    public static final short IgnoreWhitespace    = 0;
    public static final short MixedContent        = 1;
    public static final short Preformatted        = 2;
    public static final short IgnoreMarkup        = 3;

    /*
      modes for getToken()

      IgnoreWhitespace   -- leading white space is discarded; once a
                            non-white character is seen the mode becomes
                            MixedContent
      MixedContent       -- runs of white space are compacted to a
                            single space character
      Preformatted       -- white space preserved as is
      IgnoreMarkup       -- for CDATA elements such as script, style:
                            white space is preserved, entities are not
                            parsed and only end tags are recognised
    */
1198
1199     public Node getToken(short mode)
1200     {
1201         short map;
1202         int c = 0;
1203         int lastc;
1204         int badcomment = 0;
1205         MutableBoolean isempty = new MutableBoolean();
1206         AttVal attributes;
1207
1208         if (this.pushed)
1209         {
1210             /* duplicate inlines in preference to pushed text nodes when appropriate */
1211             if (this.token.type != Node.TextNode ||
1212                 (this.insert == -1 && this.inode == null))
1213             {
1214                 this.pushed = false;
1215                 return this.token;
1216             }
1217         }
1218
1219         /* at start of block elements, unclosed inline
1220            elements are inserted into the token stream */
1221
1222         if (this.insert != -1 || this.inode != null)
1223             return insertedToken();
1224
1225         this.lines = this.in.curline;
1226         this.columns = this.in.curcol;
1227         this.waswhite = false;
1228
1229         this.txtstart = this.lexsize;
1230         this.txtend = this.lexsize;
1231
1232         while (true)
1233         {
1234             c = this.in.readChar();
1235             if (c == StreamIn.EndOfStream) break;
1236             if (this.insertspace && mode != IgnoreWhitespace)
1237             {
1238                 addCharToLexer(' ');
1239                 this.waswhite = true;
1240                 this.insertspace = false;
1241             }
1242
1243             /* treat \r\n as \n and \r as \n */
1244
1245             if (c == '\r')
1246             {
1247                 c = this.in.readChar();
1248
1249                 if (c != '\n')
1250                     this.in.ungetChar(c);
1251
1252                 c = '\n';
1253             }
1254
1255             addCharToLexer(c);
1256
1257             switch (this.state)
1258             {
1259             case LEX_CONTENT:  /* element content */
1260                 map = MAP((char)c);
1261
1262                 /*
1263                  Discard white space if appropriate. Its cheaper
1264                  to do this here rather than in parser methods
1265                  for elements that don't have mixed content.
1266                 */
1267                 if (((map & WHITE) != 0) && (mode == IgnoreWhitespace)
1268                       && this.lexsize == this.txtstart + 1)
1269                 {
1270                     --this.lexsize;
1271                     this.waswhite = false;
1272                     this.lines = this.in.curline;
1273                     this.columns = this.in.curcol;
1274                     continue;
1275                 }
1276
1277                 if (c == '<')
1278                 {
1279                     this.state = LEX_GT;
1280                     continue;
1281                 }
1282
1283                 if ((map & WHITE) != 0)
1284                 {
1285                     /* was previous char white? */
1286                     if (this.waswhite)
1287                     {
1288                         if (mode != Preformatted && mode != IgnoreMarkup)
1289                         {
1290                             --this.lexsize;
1291                             this.lines = this.in.curline;
1292                             this.columns = this.in.curcol;
1293                         }
1294                     }
1295                     else /* prev char wasn't white */
1296                     {
1297                         this.waswhite = true;
1298                         lastc = c;
1299
1300                         if (mode != Preformatted && mode != IgnoreMarkup && c != ' ')
1301                             changeChar((byte)' ');
1302                     }
1303
1304                     continue;
1305                 }
1306                 else if (c == '&' && mode != IgnoreMarkup)
1307                     parseEntity(mode);
1308
1309                 /* this is needed to avoid trimming trailing whitespace */
1310                 if (mode == IgnoreWhitespace)
1311                     mode = MixedContent;
1312
1313                 this.waswhite = false;
1314                 continue;
1315
1316             case LEX_GT:  /* < */
1317
1318                 /* check for endtag */
1319                 if (c == '/')
1320                 {
1321                     c = this.in.readChar();
1322                     if (c == StreamIn.EndOfStream)
1323                     {
1324                         this.in.ungetChar(c);
1325                         continue;
1326                     }
1327
1328                     addCharToLexer(c);
1329                     map = MAP((char)c);
1330
1331                     if ((map & LETTER) != 0)
1332                     {
1333                         this.lexsize -= 3;
1334                         this.txtend = this.lexsize;
1335                         this.in.ungetChar(c);
1336                         this.state = LEX_ENDTAG;
1337                         this.lexbuf[this.lexsize] = (byte)'\0';  /* debug */
1338                         this.in.curcol -= 2;
1339
1340                         /* if some text before the </ return it now */
1341                         if (this.txtend > this.txtstart)
1342                         {
1343                             /* trim space char before end tag */
1344                             if (mode == IgnoreWhitespace && this.lexbuf[this.lexsize - 1] == (byte)' ')
1345                             {
1346                                 this.lexsize -= 1;
1347                                 this.txtend = this.lexsize;
1348                             }
1349
1350                             this.token = newNode(Node.TextNode,
1351                                                   this.lexbuf,
1352                                                   this.txtstart,
1353                                                   this.txtend);
1354                             return this.token;
1355                         }
1356
1357                         continue;       /* no text so keep going */
1358                     }
1359
1360                     /* otherwise treat as CDATA */
1361                     this.waswhite = false;
1362                     this.state = LEX_CONTENT;
1363                     continue;
1364                 }
1365
1366                 if (mode == IgnoreMarkup)
1367                 {
1368                     /* otherwise treat as CDATA */
1369                     this.waswhite = false;
1370                     this.state = LEX_CONTENT;
1371                     continue;
1372                 }
1373
1374                 /*
1375                    look out for comments, doctype or marked sections
1376                    this isn't quite right, but its getting there ...
1377                 */
1378                 if (c == '!')
1379                 {
1380                     c = this.in.readChar();
1381
1382                     if (c == '-')
1383                     {
1384                         c = this.in.readChar();
1385
1386                         if (c == '-')
1387                         {
1388                             this.state = LEX_COMMENT;  /* comment */
1389                             this.lexsize -= 2;
1390                             this.txtend = this.lexsize;
1391
1392                             /* if some text before < return it now */
1393                             if (this.txtend > this.txtstart)
1394                             {
1395                                 this.token = newNode(Node.TextNode,
1396                                                       this.lexbuf,
1397                                                       this.txtstart,
1398                                                       this.txtend);
1399                                 return this.token;
1400                             }
1401
1402                             this.txtstart = this.lexsize;
1403                             continue;
1404                         }
1405
1406                         Report.warning(this, null, null, Report.MALFORMED_COMMENT);
1407                     }
1408                     else if (c == 'd' || c == 'D')
1409                     {
1410                         this.state = LEX_DOCTYPE; /* doctype */
1411                         this.lexsize -= 2;
1412                         this.txtend = this.lexsize;
1413                         mode = IgnoreWhitespace;
1414
1415                         /* skip until white space or '>' */
1416
1417                         for (;;)
1418                         {
1419                             c = this.in.readChar();
1420
1421                             if (c == StreamIn.EndOfStream || c == '>')
1422                             {
1423                                 this.in.ungetChar(c);
1424                                 break;
1425                             }
1426
1427                             map = MAP((char)c);
1428
1429                             if ((map & WHITE) == 0)
1430                                 continue;
1431
1432                             /* and skip to end of whitespace */
1433
1434                             for (;;)
1435                             {
1436                                 c = this.in.readChar();
1437
1438                                 if (c == StreamIn.EndOfStream || c == '>')
1439                                 {
1440                                     this.in.ungetChar(c);
1441                                     break;
1442                                 }
1443
1444                                 map = MAP((char)c);
1445
1446                                 if ((map & WHITE) != 0)
1447                                     continue;
1448
1449                                 this.in.ungetChar(c);
1450                                     break;
1451                             }
1452
1453                             break;
1454                         }
1455
1456                         /* if some text before < return it now */
1457                         if (this.txtend > this.txtstart)
1458                         {
1459                                 this.token = newNode(Node.TextNode,
1460                                                       this.lexbuf,
1461                                                       this.txtstart,
1462                                                       this.txtend);
1463                                 return this.token;
1464                         }
1465
1466                         this.txtstart = this.lexsize;
1467                         continue;
1468                     }
1469                     else if (c == '[')
1470                     {
1471                         /* Word 2000 embeds <![if ...]> ... <![endif]> sequences */
1472                         this.lexsize -= 2;
1473                         this.state = LEX_SECTION;
1474                         this.txtend = this.lexsize;
1475
1476                         /* if some text before < return it now */
1477                         if (this.txtend > this.txtstart)
1478                         {
1479                                 this.token = newNode(Node.TextNode,
1480                                                       this.lexbuf,
1481                                                       this.txtstart,
1482                                                       this.txtend);
1483                                 return this.token;
1484                         }
1485
1486                         this.txtstart = this.lexsize;
1487                         continue;
1488                     }
1489
1490                     /* otherwise swallow chars up to and including next '>' */
1491                     while (true)
1492                     {
1493                         c = this.in.readChar();
1494                         if (c == '>') break;
1495                         if (c == -1)
1496                         {
1497                             this.in.ungetChar(c);
1498                             break;
1499                         }
1500                     }
1501
1502                     this.lexsize -= 2;
1503                     this.lexbuf[this.lexsize] = (byte)'\0';
1504                     this.state = LEX_CONTENT;
1505                     continue;
1506                 }
1507
1508                 /*
1509                    processing instructions
1510                 */
1511
1512                 if (c == '?')
1513                 {
1514                     this.lexsize -= 2;
1515                     this.state = LEX_PROCINSTR;
1516                     this.txtend = this.lexsize;
1517
1518                     /* if some text before < return it now */
1519                     if (this.txtend > this.txtstart)
1520                     {
1521                         this.token = newNode(Node.TextNode,
1522                                               this.lexbuf,
1523                                               this.txtstart,
1524                                               this.txtend);
1525                         return this.token;
1526                     }
1527
1528                     this.txtstart = this.lexsize;
1529                     continue;
1530                 }
1531
1532                 /* Microsoft ASP's e.g. <% ... server-code ... %> */
1533                 if (c == '%')
1534                 {
1535                     this.lexsize -= 2;
1536                     this.state = LEX_ASP;
1537                     this.txtend = this.lexsize;
1538
1539                     /* if some text before < return it now */
1540                     if (this.txtend > this.txtstart)
1541                     {
1542                         this.token = newNode(Node.TextNode,
1543                                               this.lexbuf,
1544                                               this.txtstart,
1545                                               this.txtend);
1546                         return this.token;
1547                     }
1548
1549                     this.txtstart = this.lexsize;
1550                     continue;
1551                 }
1552
1553                 /* Netscapes JSTE e.g. <# ... server-code ... #> */
1554                 if (c == '#')
1555                 {
1556                     this.lexsize -= 2;
1557                     this.state = LEX_JSTE;
1558                     this.txtend = this.lexsize;
1559
1560                     /* if some text before < return it now */
1561                     if (this.txtend > this.txtstart)
1562                     {
1563                         this.token = newNode(Node.TextNode,
1564                                               this.lexbuf,
1565                                               this.txtstart,
1566                                               this.txtend);
1567                         return this.token;
1568                     }
1569
1570                     this.txtstart = this.lexsize;
1571                     continue;
1572                 }
1573
1574                 map = MAP((char)c);
1575
1576                 /* check for start tag */
1577                 if ((map & LETTER) != 0)
1578                 {
1579                     this.in.ungetChar(c);     /* push back letter */
1580                     this.lexsize -= 2;      /* discard "<" + letter */
1581                     this.txtend = this.lexsize;
1582                     this.state = LEX_STARTTAG;         /* ready to read tag name */
1583
1584                     /* if some text before < return it now */
1585                     if (this.txtend > this.txtstart)
1586                     {
1587                         this.token = newNode(Node.TextNode,
1588                                               this.lexbuf,
1589                                               this.txtstart,
1590                                               this.txtend);
1591                         return this.token;
1592                     }
1593
1594                     continue;       /* no text so keep going */
1595                 }
1596
1597                 /* otherwise treat as CDATA */
1598                 this.state = LEX_CONTENT;
1599                 this.waswhite = false;
1600                 continue;
1601
1602             case LEX_ENDTAG:  /* </letter */
1603                 this.txtstart = this.lexsize - 1;
1604                 this.in.curcol += 2;
1605                 c = parseTagName();
1606                 this.token = newNode(Node.EndTag, /* create endtag token */
1607                                       this.lexbuf,
1608                                       this.txtstart,
1609                                       this.txtend,
1610                                       getString(this.lexbuf,
1611                                                  this.txtstart,
1612                                                  this.txtend - this.txtstart));
1613                 this.lexsize = this.txtstart;
1614                 this.txtend = this.txtstart;
1615
1616                 /* skip to '>' */
1617                 while (c != '>')
1618                 {
1619                     c = this.in.readChar();
1620
1621                     if (c == StreamIn.EndOfStream)
1622                         break;
1623                 }
1624
1625                 if (c == StreamIn.EndOfStream)
1626                 {
1627                     this.in.ungetChar(c);
1628                     continue;
1629                 }
1630
1631                 this.state = LEX_CONTENT;
1632                 this.waswhite = false;
1633                 return this.token;  /* the endtag token */
1634
1635             case LEX_STARTTAG: /* first letter of tagname */
1636                 this.txtstart = this.lexsize - 1; /* set txtstart to first letter */
1637                 c = parseTagName();
1638                 isempty.value = false;
1639                 attributes = null;
1640                 this.token = newNode((isempty.value ? Node.StartEndTag : Node.StartTag),
1641                                       this.lexbuf,
1642                                       this.txtstart,
1643                                       this.txtend,
1644                                       getString(this.lexbuf,
1645                                                  this.txtstart,
1646                                                  this.txtend - this.txtstart));
1647
1648                 /* parse attributes, consuming closing ">" */
1649                 if (c != '>')
1650                 {
1651                     if (c == '/')
1652                         this.in.ungetChar(c);
1653
1654                     attributes = parseAttrs(isempty);
1655                 }
1656
1657                 if (isempty.value)
1658                     this.token.type = Node.StartEndTag;
1659
1660                 this.token.attributes = attributes;
1661                 this.lexsize = this.txtstart;
1662                 this.txtend = this.txtstart;
1663
1664                 /* swallow newline following start tag */
1665                 /* special check needed for CRLF sequence */
1666                 /* this doesn't apply to empty elements */
1667
1668                 if (expectsContent(this.token) ||
1669                     this.token.tag == configuration.tt.tagBr)
1670                 {
1671
1672                     c = this.in.readChar();
1673
1674                     if (c == '\r')
1675                     {
1676                         c = this.in.readChar();
1677
1678                         if (c != '\n')
1679                             this.in.ungetChar(c);
1680                     }
1681                     else if (c != '\n' && c != '\f')
1682                         this.in.ungetChar(c);
1683
1684                     this.waswhite = true;  /* to swallow leading whitespace */
1685                 }
1686                 else
1687                     this.waswhite = false;
1688
1689                 this.state = LEX_CONTENT;
1690
1691                 if (this.token.tag == null)
1692                     Report.error(this, null, this.token, Report.UNKNOWN_ELEMENT);
1693                 else if (!this.configuration.XmlTags)
1694                 {
1695                     this.versions &= this.token.tag.versions;
1696
1697                     if ((this.token.tag.versions & Dict.VERS_PROPRIETARY) != 0)
1698                     {
1699                         if (!this.configuration.MakeClean && (this.token.tag == configuration.tt.tagNobr ||
1700                                                 this.token.tag == configuration.tt.tagWbr))
1701                             Report.warning(this, null, this.token, Report.PROPRIETARY_ELEMENT);
1702                     }
1703
1704                     if (this.token.tag.chkattrs != null)
1705                     {
1706                         this.token.checkUniqueAttributes(this);
1707                         this.token.tag.chkattrs.check(this, this.token);
1708                     }
1709                     else
1710                         this.token.checkAttributes(this);
1711                 }
1712
1713                 return this.token;  /* return start tag */
1714
1715             case LEX_COMMENT:  /* seen <!-- so look for --> */
1716
1717                 if (c != '-')
1718                     continue;
1719
1720                 c = this.in.readChar();
1721                 addCharToLexer(c);
1722
1723                 if (c != '-')
1724                     continue;
1725
1726                 end_comment: while (true) {
1727                     c = this.in.readChar();
1728
1729                     if (c == '>')
1730                     {
1731                         if (badcomment != 0)
1732                             Report.warning(this, null, null, Report.MALFORMED_COMMENT);
1733
1734                         this.txtend = this.lexsize - 2; // AQ 8Jul2000
1735                         this.lexbuf[this.lexsize] = (byte)'\0';
1736                         this.state = LEX_CONTENT;
1737                         this.waswhite = false;
1738                         this.token = newNode(Node.CommentTag,
1739                                               this.lexbuf,
1740                                               this.txtstart,
1741                                               this.txtend);
1742
1743                         /* now look for a line break */
1744
1745                         c = this.in.readChar();
1746
1747                         if (c == '\r')
1748                         {
1749                             c = this.in.readChar();
1750
1751                             if (c != '\n')
1752                                 this.token.linebreak = true;
1753                         }
1754
1755                         if (c == '\n')
1756                             this.token.linebreak = true;
1757                         else
1758                             this.in.ungetChar(c);
1759
1760                         return this.token;
1761                     }
1762
1763                     /* note position of first such error in the comment */
1764                     if (badcomment == 0)
1765                     {
1766                         this.lines = this.in.curline;
1767                         this.columns = this.in.curcol - 3;
1768                     }
1769
1770                     badcomment++;
1771                     if (this.configuration.FixComments)
1772                         this.lexbuf[this.lexsize - 2] = (byte)'=';
1773
1774                     addCharToLexer(c);
1775
1776                     /* if '-' then look for '>' to end the comment */
1777                     if (c != '-')
1778                         break end_comment;
1779
1780                 }
1781                 /* otherwise continue to look for --> */
1782                 this.lexbuf[this.lexsize - 2] = (byte)'=';
1783                 continue;
1784
1785             case LEX_DOCTYPE:  /* seen <!d so look for '>' munging whitespace */
1786                 map = MAP((char)c);
1787
1788                 if ((map & WHITE) != 0)
1789                 {
1790                     if (this.waswhite)
1791                         this.lexsize -= 1;
1792
1793                     this.waswhite = true;
1794                 }
1795                 else
1796                     this.waswhite = false;
1797
1798                 if (c != '>')
1799                     continue;
1800
1801                 this.lexsize -= 1;
1802                 this.txtend = this.lexsize;
1803                 this.lexbuf[this.lexsize] = (byte)'\0';
1804                 this.state = LEX_CONTENT;
1805                 this.waswhite = false;
1806                 this.token = newNode(Node.DocTypeTag,
1807                                       this.lexbuf,
1808                                       this.txtstart,
1809                                       this.txtend);
1810                 /* make a note of the version named by the doctype */
1811                 this.doctype = findGivenVersion(this.token);
1812                 return this.token;
1813
1814             case LEX_PROCINSTR:  /* seen <? so look for '>' */
1815                 /* check for PHP preprocessor instructions <?php ... ?> */
1816
1817                 if  (this.lexsize - this.txtstart == 3)
1818                 {
1819                     if ((getString(this.lexbuf, this.txtstart, 3)).equals("php"))
1820                     {
1821                         this.state = LEX_PHP;
1822                         continue;
1823                     }
1824                 }
1825
1826                 if (this.configuration.XmlPIs)  /* insist on ?> as terminator */
1827                 {
1828                     if (c != '?')
1829                         continue;
1830
1831                     /* now look for '>' */
1832                     c = this.in.readChar();
1833
1834                     if (c == StreamIn.EndOfStream)
1835                     {
1836                         Report.warning(this, null, null, Report.UNEXPECTED_END_OF_FILE);
1837                         this.in.ungetChar(c);
1838                         continue;
1839                     }
1840
1841                     addCharToLexer(c);
1842                 }
1843
1844                 if (c != '>')
1845                     continue;
1846
1847                 this.lexsize -= 1;
1848                 this.txtend = this.lexsize;
1849                 this.lexbuf[this.lexsize] = (byte)'\0';
1850                 this.state = LEX_CONTENT;
1851                 this.waswhite = false;
1852                 this.token = newNode(Node.ProcInsTag,
1853                                       this.lexbuf,
1854                                       this.txtstart,
1855                                       this.txtend);
1856                 return this.token;
1857
1858             case LEX_ASP:  /* seen <% so look for "%>" */
1859                 if (c != '%')
1860                     continue;
1861
1862                 /* now look for '>' */
1863                 c = this.in.readChar();
1864
1865
1866                 if (c != '>')
1867                 {
1868                     this.in.ungetChar(c);
1869                     continue;
1870                 }
1871
1872                 this.lexsize -= 1;
1873                 this.txtend = this.lexsize;
1874                 this.lexbuf[this.lexsize] = (byte)'\0';
1875                 this.state = LEX_CONTENT;
1876                 this.waswhite = false;
1877                 this.token = newNode(Node.AspTag,
1878                                       this.lexbuf,
1879                                       this.txtstart,
1880                                       this.txtend);
1881                 return this.token;
1882
1883             case LEX_JSTE:  /* seen <# so look for "#>" */
1884                 if (c != '#')
1885                     continue;
1886
1887                 /* now look for '>' */
1888                 c = this.in.readChar();
1889
1890
1891                 if (c != '>')
1892                 {
1893                     this.in.ungetChar(c);
1894                     continue;
1895                 }
1896
1897                 this.lexsize -= 1;
1898                 this.txtend = this.lexsize;
1899                 this.lexbuf[this.lexsize] = (byte)'\0';
1900                 this.state = LEX_CONTENT;
1901                 this.waswhite = false;
1902                 this.token = newNode(Node.JsteTag,
1903                                       this.lexbuf,
1904                                       this.txtstart,
1905                                       this.txtend);
1906                 return this.token;
1907
1908             case LEX_PHP: /* seen "<?php" so look for "?>" */
1909                 if (c != '?')
1910                     continue;
1911
1912                 /* now look for '>' */
1913                 c = this.in.readChar();
1914
1915                 if (c != '>')
1916                 {
1917                     this.in.ungetChar(c);
1918                     continue;
1919                 }
1920
1921                 this.lexsize -= 1;
1922                 this.txtend = this.lexsize;
1923                 this.lexbuf[this.lexsize] = (byte)'\0';
1924                 this.state = LEX_CONTENT;
1925                 this.waswhite = false;
1926                 this.token = newNode(Node.PhpTag,
1927                                       this.lexbuf,
1928                                       this.txtstart,
1929                                       this.txtend);
1930                 return this.token;
1931
1932             case LEX_SECTION: /* seen "<![" so look for "]>" */
1933                 if (c == '[')
1934                 {
1935                     if (this.lexsize == (this.txtstart + 6) &&
1936                         (getString(this.lexbuf, this.txtstart, 6)).equals("CDATA["))
1937                     {
1938                         this.state = LEX_CDATA;
1939                         this.lexsize -= 6;
1940                         continue;
1941                     }
1942                 }
1943
1944                 if (c != ']')
1945                     continue;
1946
1947                 /* now look for '>' */
1948                 c = this.in.readChar();
1949
1950                 if (c != '>')
1951                 {
1952                     this.in.ungetChar(c);
1953                     continue;
1954                 }
1955
1956                 this.lexsize -= 1;
1957                 this.txtend = this.lexsize;
1958                 this.lexbuf[this.lexsize] = (byte)'\0';
1959                 this.state = LEX_CONTENT;
1960                 this.waswhite = false;
1961                 this.token = newNode(Node.SectionTag,
1962                                       this.lexbuf,
1963                                       this.txtstart,
1964                                       this.txtend);
1965                 return this.token;
1966
1967             case LEX_CDATA: /* seen "<![CDATA[" so look for "]]>" */
1968                 if (c != ']')
1969                     continue;
1970
1971                 /* now look for ']' */
1972                 c = this.in.readChar();
1973
1974                 if (c != ']')
1975                 {
1976                     this.in.ungetChar(c);
1977                     continue;
1978                 }
1979
1980                 /* now look for '>' */
1981                 c = this.in.readChar();
1982
1983                 if (c != '>')
1984                 {
1985                     this.in.ungetChar(c);
1986                     continue;
1987                 }
1988
1989                 this.lexsize -= 1;
1990                 this.txtend = this.lexsize;
1991                 this.lexbuf[this.lexsize] = (byte)'\0';
1992                 this.state = LEX_CONTENT;
1993                 this.waswhite = false;
1994                 this.token = newNode(Node.CDATATag,
1995                                       this.lexbuf,
1996                                       this.txtstart,
1997                                       this.txtend);
1998                 return this.token;
1999             }
2000         }
2001
2002         if (this.state == LEX_CONTENT)  /* text string */
2003         {
2004             this.txtend = this.lexsize;
2005
2006             if (this.txtend > this.txtstart)
2007             {
2008                 this.in.ungetChar(c);
2009
2010                 if (this.lexbuf[this.lexsize - 1] == (byte)' ')
2011                 {
2012                     this.lexsize -= 1;
2013                     this.txtend = this.lexsize;
2014                 }
2015
2016                 this.token = newNode(Node.TextNode,
2017                                       this.lexbuf,
2018                                       this.txtstart,
2019                                       this.txtend);
2020                 return this.token;
2021             }
2022         }
2023         else if (this.state == LEX_COMMENT) /* comment */
2024         {
2025             if (c == StreamIn.EndOfStream)
2026                 Report.warning(this, null, null, Report.MALFORMED_COMMENT);
2027
2028             this.txtend = this.lexsize;
2029             this.lexbuf[this.lexsize] = (byte)'\0';
2030             this.state = LEX_CONTENT;
2031             this.waswhite = false;
2032             this.token = newNode(Node.CommentTag,
2033                                   this.lexbuf,
2034                                   this.txtstart,
2035                                   this.txtend);
2036             return this.token;
2037         }
2038
2039         return null;
2040     }
2041
2042     /*
2043      parser for ASP within start tags
2044
2045      Some people use ASP to customize attributes
2046      Tidy isn't really well suited to dealing with ASP
2047      This is a workaround for attributes, but won't
2048      deal with the case where the ASP is used to tailor
2049      the attribute value. Here is an example of a work
2050      around for using ASP in attribute values:
2051
2052       href="<%=rsSchool.Fields("ID").Value%>"
2053
2054      where the ASP that generates the attribute value
2055      is masked from Tidy by the quotemarks.
2056
2057     */
2058
2059     public Node parseAsp()
2060     {
2061         int c;
2062         Node asp = null;
2063
2064         this.txtstart = this.lexsize;
2065
2066         for (;;)
2067         {
2068             c = this.in.readChar();
2069             addCharToLexer(c);
2070
2071
2072             if (c != '%')
2073                 continue;
2074
2075             c = this.in.readChar();
2076             addCharToLexer(c);
2077
2078             if (c == '>')
2079                 break;
2080         }
2081
2082         this.lexsize -= 2;
2083         this.txtend = this.lexsize;
2084
2085         if (this.txtend > this.txtstart)
2086             asp = newNode(Node.AspTag,
2087                            this.lexbuf,
2088                            this.txtstart,
2089                            this.txtend);
2090
2091         this.txtstart = this.txtend;
2092         return asp;
2093     }
2094
2095     /*
2096      PHP is like ASP but is based upon XML
2097      processing instructions, e.g. <?php ... ?>
2098     */
2099     public Node parsePhp()
2100     {
2101         int c;
2102         Node php = null;
2103
2104         this.txtstart = this.lexsize;
2105
2106         for (;;)
2107         {
2108             c = this.in.readChar();
2109             addCharToLexer(c);
2110
2111
2112             if (c != '?')
2113                 continue;
2114
2115             c = this.in.readChar();
2116             addCharToLexer(c);
2117
2118             if (c == '>')
2119                 break;
2120         }
2121
2122         this.lexsize -= 2;
2123         this.txtend = this.lexsize;
2124
2125         if (this.txtend > this.txtstart)
2126             php = newNode(Node.PhpTag,
2127                            this.lexbuf,
2128                            this.txtstart,
2129                            this.txtend);
2130
2131         this.txtstart = this.txtend;
2132         return php;
2133     }
2134
2135     /* consumes the '>' terminating start tags */
2136     public String parseAttribute(MutableBoolean isempty, MutableObject asp,
2137                                  MutableObject php)
2138     {
2139         int start = 0;
2140         // int len = 0;   Removed by BUGFIX for 126265
2141         short map;
2142         String attr;
2143         int c = 0;
2144
2145         asp.setObject(null);  /* clear asp pointer */
2146         php.setObject(null);  /* clear php pointer */
2147         /* skip white space before the attribute */
2148
2149         for (;;)
2150         {
2151             c = this.in.readChar();
2152
2153             if (c == '/')
2154             {
2155                 c = this.in.readChar();
2156
2157                 if (c == '>')
2158                 {
2159                     isempty.value = true;
2160                     return null;
2161                 }
2162
2163                 this.in.ungetChar(c);
2164                 c = '/';
2165                 break;
2166             }
2167
2168             if (c == '>')
2169                 return null;
2170
2171             if (c =='<')
2172             {
2173                 c = this.in.readChar();
2174
2175                 if (c == '%')
2176                 {
2177                     asp.setObject(parseAsp());
2178                     return null;
2179                 }
2180                 else if (c == '?')
2181                 {
2182                     php.setObject(parsePhp());
2183                     return null;
2184                 }
2185
2186                 this.in.ungetChar(c);
2187                 Report.attrError(this, this.token, null, Report.UNEXPECTED_GT);
2188                 return null;
2189             }
2190
2191             if (c == '"' || c == '\'')
2192             {
2193                 Report.attrError(this, this.token, null, Report.UNEXPECTED_QUOTEMARK);
2194                 continue;
2195             }
2196
2197             if (c == StreamIn.EndOfStream)
2198             {
2199                 Report.attrError(this, this.token, null, Report.UNEXPECTED_END_OF_FILE);
2200                 this.in.ungetChar(c);
2201                 return null;
2202             }
2203
2204             map = MAP((char)c);
2205
2206             if ((map & WHITE) == 0)
2207                 break;
2208         }
2209
2210         start = this.lexsize;
2211
2212         for (;;)
2213         {
2214          /* but push back '=' for parseValue() */
2215             if (c == '=' || c == '>')
2216             {
2217                 this.in.ungetChar(c);
2218                 break;
2219             }
2220
2221             if (c == '<' || c == StreamIn.EndOfStream)
2222             {
2223                 this.in.ungetChar(c);
2224                 break;
2225             }
2226
2227             map = MAP((char)c);
2228
2229             if ((map & WHITE) != 0)
2230                 break;
2231
2232          /* what should be done about non-namechar characters? */
2233          /* currently these are incorporated into the attr name */
2234
2235             if (!this.configuration.XmlTags && (map & UPPERCASE) != 0)
2236                 c += (int)('a' - 'A');
2237
2238             //  ++len;    Removed by BUGFIX for 126265
2239             addCharToLexer(c);
2240
2241             c = this.in.readChar();
2242         }
2243
2244         // Following line added by GLP to fix BUG 126265.  This is a temporary comment
2245         // and should be removed when Tidy is fixed.
2246         int len = this.lexsize - start;
2247         attr = (len > 0 ? getString(this.lexbuf, start, len) : null);
2248         this.lexsize = start;
2249
2250         return attr;
2251     }
2252
2253     /*
2254      invoked when < is seen in place of attribute value
2255      but terminates on whitespace if not ASP, PHP or Tango
2256      this routine recognizes ' and " quoted strings
2257     */
2258     public int parseServerInstruction()
2259     {
2260         int c, map, delim = '"';
2261         boolean isrule = false;
2262
2263         c = this.in.readChar();
2264         addCharToLexer(c);
2265
2266         /* check for ASP, PHP or Tango */
2267         if (c == '%' || c == '?' || c == '@')
2268             isrule = true;
2269
2270         for (;;)
2271         {
2272             c = this.in.readChar();
2273
2274             if (c == StreamIn.EndOfStream)
2275                 break;
2276
2277             if (c == '>')
2278             {
2279                 if (isrule)
2280                     addCharToLexer(c);
2281                 else
2282                     this.in.ungetChar(c);
2283
2284                 break;
2285             }
2286
2287             /* if not recognized as ASP, PHP or Tango */
2288             /* then also finish value on whitespace */
2289             if (!isrule)
2290             {
2291                 map = MAP((char)c);
2292
2293                 if ((map & WHITE) != 0)
2294                     break;
2295             }
2296
2297             addCharToLexer(c);
2298
2299             if (c == '"')
2300             {
2301                 do
2302                 {
2303                     c = this.in.readChar();
2304                     addCharToLexer(c);
2305                 }
2306                 while (c != '"');
2307                 delim = '\'';
2308                 continue;
2309             }
2310
2311             if (c == '\'')
2312             {
2313                 do
2314                 {
2315                     c = this.in.readChar();
2316                     addCharToLexer(c);
2317                 }
2318                 while (c != '\'');
2319             }
2320         }
2321
2322         return delim;
2323     }
2324
2325     /* values start with "=" or " = " etc. */
2326     /* doesn't consume the ">" at end of start tag */
2327
2328     public String parseValue(String name, boolean foldCase,
2329                              MutableBoolean isempty, MutableInteger pdelim)
2330     {
2331         int len = 0;
2332         int start;
2333         short map;
2334         boolean seen_gt = false;
2335         boolean munge = true;
2336         int c = 0;
2337         int lastc, delim, quotewarning;
2338         String value;
2339
2340         delim = 0;
2341         pdelim.value = (int)'"';
2342
2343         /*
2344          Henry Zrepa reports that some folk are using the
2345          embed element with script attributes where newlines
2346          are significant and must be preserved
2347         */
2348         if (configuration.LiteralAttribs)
2349             munge = false;
2350
2351         /* skip white space before the '=' */
2352
2353         for (;;)
2354         {
2355             c = this.in.readChar();
2356
2357             if (c == StreamIn.EndOfStream)
2358             {
2359                 this.in.ungetChar(c);
2360                 break;
2361             }
2362
2363             map = MAP((char)c);
2364
2365             if ((map & WHITE) == 0)
2366                break;
2367         }
2368
2369     /*
2370       c should be '=' if there is a value
2371       other legal possibilities are white
2372       space, '/' and '>'
2373     */
2374
2375         if (c != '=')
2376         {
2377             this.in.ungetChar(c);
2378             return null;
2379         }
2380
2381      /* skip white space after '=' */
2382
2383         for (;;)
2384         {
2385             c = this.in.readChar();
2386
2387             if (c == StreamIn.EndOfStream)
2388             {
2389                 this.in.ungetChar(c);
2390                 break;
2391             }
2392
2393             map = MAP((char)c);
2394
2395             if ((map & WHITE) == 0)
2396                break;
2397         }
2398
2399      /* check for quote marks */
2400
2401         if (c == '"' || c == '\'')
2402             delim = c;
2403         else if (c == '<')
2404         {
2405             start = this.lexsize;
2406             addCharToLexer(c);
2407             pdelim.value = parseServerInstruction();
2408             len = this.lexsize - start;
2409             this.lexsize = start;
2410             return (len > 0 ? getString(this.lexbuf, start, len) : null);
2411         }
2412         else
2413             this.in.ungetChar(c);
2414
2415      /*
2416        and read the value string
2417        check for quote mark if needed
2418      */
2419
2420         quotewarning = 0;
2421         start = this.lexsize;
2422         c = '\0';
2423
2424         for (;;)
2425         {
2426             lastc = c;  /* track last character */
2427             c = this.in.readChar();
2428
2429             if (c == StreamIn.EndOfStream)
2430             {
2431                 Report.attrError(this, this.token, null, Report.UNEXPECTED_END_OF_FILE);
2432                 this.in.ungetChar(c);
2433                 break;
2434             }
2435
2436             if (delim == (char)0)
2437             {
2438                 if (c == '>')
2439                 {
2440                     this.in.ungetChar(c);
2441                     break;
2442                 }
2443
2444                 if (c == '"' || c == '\'')
2445                 {
2446                     Report.attrError(this, this.token, null, Report.UNEXPECTED_QUOTEMARK);
2447                     break;
2448                 }
2449
2450                 if (c == '<')
2451                 {
2452                     /* this.in.ungetChar(c); */
2453                     Report.attrError(this, this.token, null, Report.UNEXPECTED_GT);
2454                     /* break; */
2455                 }
2456
2457                 /*
2458                  For cases like <br clear=all/> need to avoid treating /> as
2459                  part of the attribute value, however care is needed to avoid
2460                  so treating <a href=http://www.acme.com/> in this way, which
2461                  would map the <a> tag to <a href="http://www.acme.com"/>
2462                 */
2463                 if (c == '/')
2464                 {
2465                     /* peek ahead in case of /> */
2466                     c = this.in.readChar();
2467
2468                     if (c == '>' &&
2469                         !AttributeTable.getDefaultAttributeTable().isUrl(name))
2470                     {
2471                         isempty.value = true;
2472                         this.in.ungetChar(c);
2473                         break;
2474                     }
2475
2476                     /* unget peeked char */
2477                     this.in.ungetChar(c);
2478                     c = '/';
2479                 }
2480             }
2481             else  /* delim is '\'' or '"' */
2482             {
2483                 if (c == delim)
2484                     break;
2485
2486                 /* treat CRLF, CR and LF as single line break */
2487
2488                 if (c == '\r')
2489                 {
2490                     c = this.in.readChar();
2491                     if (c != '\n')
2492                         this.in.ungetChar(c);
2493
2494                     c = '\n';
2495                 }
2496
2497                 if (c == '\n' || c == '<' || c == '>')
2498                     ++quotewarning;
2499
2500                 if (c == '>')
2501                     seen_gt = true;
2502             }
2503
2504             if (c == '&')
2505             {
2506                 addCharToLexer(c);
2507                 parseEntity((short)0);
2508                 continue;
2509             }
2510
2511             /*
2512              kludge for JavaScript attribute values
2513              with line continuations in string literals
2514             */
2515             if (c == '\\')
2516             {
2517                 c = this.in.readChar();
2518
2519                 if (c != '\n')
2520                 {
2521                     this.in.ungetChar(c);
2522                     c = '\\';
2523                 }
2524             }
2525
2526             map = MAP((char)c);
2527
2528             if ((map & WHITE) != 0)
2529             {
2530                 if (delim == (char)0)
2531                     break;
2532
2533                 if (munge)
2534                 {
2535                     c = ' ';
2536
2537                     if (lastc == ' ')
2538                         continue;
2539                 }
2540             }
2541             else if (foldCase && (map & UPPERCASE) != 0)
2542                 c += (int)('a' - 'A');
2543
2544             addCharToLexer(c);
2545         }
2546
2547         if (quotewarning > 10 && seen_gt && munge)
2548         {
2549             /*
2550                there is almost certainly a missing trailling quote mark
2551                as we have see too many newlines, < or > characters.
2552
2553                an exception is made for Javascript attributes and the
2554                javascript URL scheme which may legitimately include < and >
2555             */
2556             if (!AttributeTable.getDefaultAttributeTable().isScript(name) &&
2557                 !(AttributeTable.getDefaultAttributeTable().isUrl(name) &&
2558                   (getString(this.lexbuf, start, 11)).equals("javascript:")))
2559                     Report.error(this, null, null, Report.SUSPECTED_MISSING_QUOTE);
2560         }
2561
2562         len = this.lexsize - start;
2563         this.lexsize = start;
2564
2565         if (len > 0 || delim != 0)
2566             value = getString(this.lexbuf, start, len);
2567         else
2568             value = null;
2569
2570         /* note delimiter if given */
2571         if (delim != 0)
2572             pdelim.value = delim;
2573         else
2574             pdelim.value = (int)'"';
2575
2576         return value;
2577     }
2578
2579     /* attr must be non-null */
2580     public static boolean isValidAttrName(String attr)
2581     {
2582         short map;
2583         char c;
2584         int i;
2585
2586         /* first character should be a letter */
2587         c = attr.charAt(0);
2588         map = MAP(c);
2589
2590         if (!((map & LETTER) != 0))
2591             return false;
2592
2593         /* remaining characters should be namechars */
2594         for( i = 1; i < attr.length(); i++)
2595         {
2596             c = attr.charAt(i);
2597             map = MAP(c);
2598
2599             if((map & NAMECHAR) != 0)
2600                 continue;
2601
2602             return false;
2603         }
2604
2605         return true;
2606     }
2607
2608     /* swallows closing '>' */
2609
2610     public AttVal parseAttrs(MutableBoolean isempty)
2611     {
2612         AttVal av, list;
2613         String attribute, value;
2614         MutableInteger delim = new MutableInteger();
2615         MutableObject asp = new MutableObject();
2616         MutableObject php = new MutableObject();
2617
2618         list = null;
2619
2620         for (; !endOfInput();)
2621         {
2622             attribute = parseAttribute(isempty, asp, php);
2623
2624             if (attribute == null)
2625             {
2626                 /* check if attributes are created by ASP markup */
2627                 if (asp.getObject() != null)
2628                 {
2629                     av = new AttVal(list, null, (Node)asp.getObject(), null,
2630                                     '\0', null, null );
2631                     list = av;
2632                     continue;
2633                 }
2634
2635                 /* check if attributes are created by PHP markup */
2636                 if (php.getObject() != null)
2637                 {
2638                     av = new AttVal(list, null, null, (Node)php.getObject(),
2639                                     '\0', null, null );
2640                     list = av;
2641                     continue;
2642                 }
2643
2644                 break;
2645             }
2646
2647             value = parseValue(attribute, false, isempty, delim);
2648
2649             if (attribute != null && isValidAttrName(attribute))
2650             {
2651                 av = new AttVal( list, null, null, null,
2652                                  delim.value, attribute, value );
2653                 av.dict =
2654                     AttributeTable.getDefaultAttributeTable().findAttribute(av);
2655                 list = av;
2656             }
2657             else
2658             {
2659                 av = new AttVal( null, null, null, null,
2660                                  0, attribute, value );
2661                 Report.attrError(this, this.token, value, Report.BAD_ATTRIBUTE_VALUE);
2662             }
2663         }
2664
2665         return list;
2666     }
2667
2668     /*
2669       push a copy of an inline node onto stack
2670       but don't push if implicit or OBJECT or APPLET
2671       (implicit tags are ones generated from the istack)
2672
2673       One issue arises with pushing inlines when
2674       the tag is already pushed. For instance:
2675
2676           <p><em>text
2677           <p><em>more text
2678
2679       Shouldn't be mapped to
2680
2681           <p><em>text</em></p>
2682           <p><em><em>more text</em></em>
2683     */
2684     public void pushInline( Node node )
2685     {
2686         IStack is;
2687
2688         if (node.implicit)
2689             return;
2690
2691         if (node.tag == null)
2692             return;
2693
2694         if ((node.tag.model & Dict.CM_INLINE) == 0 )
2695             return;
2696
2697         if ((node.tag.model & Dict.CM_OBJECT) != 0)
2698             return;
2699
2700         if (node.tag != configuration.tt.tagFont && isPushed(node))
2701             return;
2702
2703         // make sure there is enough space for the stack
2704         is = new IStack();
2705         is.tag = node.tag;
2706         is.element = node.element;
2707         if (node.attributes != null)
2708             is.attributes = cloneAttributes(node.attributes);
2709         this.istack.push( is );
2710     }
2711
2712     /* pop inline stack */
2713     public void popInline( Node node )
2714     {
2715         AttVal av;
2716         IStack is;
2717
2718         if (node != null) {
2719
2720             if (node.tag == null)
2721                 return;
2722
2723             if ((node.tag.model & Dict.CM_INLINE) == 0)
2724                 return;
2725
2726             if ((node.tag.model & Dict.CM_OBJECT) != 0)
2727                 return;
2728
2729             // if node is </a> then pop until we find an <a>
2730             if (node.tag == configuration.tt.tagA) {
2731
2732                 while (this.istack.size() > 0) {
2733                     is = (IStack)this.istack.pop();
2734                     if (is.tag == configuration.tt.tagA) {
2735                         break;
2736                     }
2737                 }
2738
2739                 if (this.insert >= this.istack.size())
2740                     this.insert = -1;
2741                 return;
2742             }
2743         }
2744
2745         if (this.istack.size() > 0) {
2746             is = (IStack)this.istack.pop();
2747             if (this.insert >= this.istack.size())
2748                 this.insert = -1;
2749         }
2750     }
2751
2752     public boolean isPushed( Node node )
2753     {
2754         int i;
2755         IStack is;
2756
2757         for (i = this.istack.size() - 1; i >= 0; --i) {
2758             is = (IStack)this.istack.elementAt(i);
2759             if (is.tag == node.tag)
2760                 return true;
2761         }
2762
2763         return false;
2764     }
2765
2766     /*
2767       This has the effect of inserting "missing" inline
2768       elements around the contents of blocklevel elements
2769       such as P, TD, TH, DIV, PRE etc. This procedure is
      called at the start of ParseBlock, when the inline
2771       stack is not empty, as will be the case in:
2772
2773         <i><h1>italic heading</h1></i>
2774
2775       which is then treated as equivalent to
2776
2777         <h1><i>italic heading</i></h1>
2778
2779       This is implemented by setting the lexer into a mode
2780       where it gets tokens from the inline stack rather than
2781       from the input stream.
2782     */
2783     public int inlineDup( Node node )
2784     {
2785         int n;
2786
2787         n = this.istack.size() - this.istackbase;
2788         if ( n > 0 ) {
2789             this.insert = this.istackbase;
2790             this.inode = node;
2791         }
2792
2793         return n;
2794     }
2795
2796     public Node insertedToken()
2797     {
2798         Node node;
2799         IStack is;
2800         int n;
2801
2802         // this will only be null if inode != null
2803         if (this.insert == -1) {
2804             node = this.inode;
2805             this.inode = null;
2806             return node;
2807         }
2808
2809         // is this is the "latest" node then update
2810         // the position, otherwise use current values
2811
2812         if (this.inode == null) {
2813             this.lines = this.in.curline;
2814             this.columns = this.in.curcol;
2815         }
2816
2817         node = newNode(Node.StartTag,
2818                         this.lexbuf,
2819                         this.txtstart,
2820                         this.txtend);   // GLP:  Bugfix 126261.  Remove when this change
2821                                         //       is fixed in istack.c in the original Tidy
2822         node.implicit = true;
2823         is = (IStack)this.istack.elementAt( this.insert );
2824         node.element = is.element;
2825         node.tag = is.tag;
2826         if (is.attributes != null)
2827             node.attributes = cloneAttributes(is.attributes);
2828
2829         // advance lexer to next item on the stack
2830         n = this.insert;
2831
2832         // and recover state if we have reached the end
2833         if (++n < this.istack.size() ) {
2834             this.insert = n;
2835         } else {
2836             this.insert = -1;
2837         }
2838
2839         return node;
2840     }
2841
2842     /* AQ: Try this for speed optimization */
2843     public static int wstrcasecmp(String s1, String s2)
2844     {
2845         return (s1.equalsIgnoreCase(s2) ? 0 : 1);
2846     }
2847
2848     public static int wstrcaselexcmp(String s1, String s2)
2849     {
2850         char c;
2851         int i = 0;
2852
2853         while ( i < s1.length() && i < s2.length() ) {
2854             c = s1.charAt(i);
2855             if ( toLower(c) != toLower( s2.charAt(i) ) ) {
2856                 break;
2857             }
2858             i += 1;
2859         }
2860         if ( i == s1.length() && i == s2.length() ) {
2861             return 0;
2862         } else if ( i == s1.length() ) {
2863             return -1;
2864         } else if ( i == s2.length() ) {
2865             return 1;
2866         } else {
2867             return ( s1.charAt(i) > s2.charAt(i) ? 1 : -1 );
2868         }
2869     }
2870
2871     public static boolean wsubstr(String s1, String s2)
2872     {
2873         int i;
2874         int len1 = s1.length();
2875         int len2 = s2.length();
2876
2877         for (i = 0; i <= len1 - len2; ++i)
2878         {
2879             if (s2.equalsIgnoreCase(s1.substring(i)))
2880                 return true;
2881         }
2882
2883         return false;
2884     }
2885
2886     public boolean canPrune(Node element)
2887     {
2888         if (element.type == Node.TextNode)
2889             return true;
2890
2891         if (element.content != null)
2892             return false;
2893
2894         if (element.tag == configuration.tt.tagA && element.attributes != null)
2895             return false;
2896
2897         if (element.tag == configuration.tt.tagP && !this.configuration.DropEmptyParas)
2898             return false;
2899
2900         if (element.tag == null)
2901             return false;
2902
2903         if ((element.tag.model & Dict.CM_ROW) != 0)
2904             return false;
2905
2906         if (element.tag == configuration.tt.tagApplet)
2907             return false;
2908
2909         if (element.tag == configuration.tt.tagObject)
2910             return false;
2911
2912         if (element.attributes != null &&
2913             (element.getAttrByName("id") != null ||
2914                element.getAttrByName("name") != null) )
2915             return false;
2916
2917         return true;
2918     }
2919
2920     /* duplicate name attribute as an id */
2921     public void fixId(Node node)
2922     {
2923         AttVal name = node.getAttrByName("name");
2924         AttVal id = node.getAttrByName("id");
2925
2926         if (name != null)
2927         {
2928             if (id != null)
2929             {
2930                 if (!id.value.equals(name.value))
2931                     Report.attrError(this, node, "name", Report.ID_NAME_MISMATCH);
2932             }
2933             else if (this.configuration.XmlOut)
2934                 node.addAttribute("id", name.value);
2935         }
2936     }
2937
2938     /*
2939      defer duplicates when entering a table or other
2940      element where the inlines shouldn't be duplicated
2941     */
2942     public void deferDup()
2943     {
2944         this.insert = -1;
2945         this.inode = null;
2946     }
2947
2948     /* Private methods and fields */
2949
2950     /* lexer char types */
2951     private static final short DIGIT       = 1;
2952     private static final short LETTER      = 2;
2953     private static final short NAMECHAR    = 4;
2954     private static final short WHITE       = 8;
2955     private static final short NEWLINE     = 16;
2956     private static final short LOWERCASE   = 32;
2957     private static final short UPPERCASE   = 64;
2958
2959     /* lexer GetToken states */
2960
2961     private static final short LEX_CONTENT     = 0;
2962     private static final short LEX_GT          = 1;
2963     private static final short LEX_ENDTAG      = 2;
2964     private static final short LEX_STARTTAG    = 3;
2965     private static final short LEX_COMMENT     = 4;
2966     private static final short LEX_DOCTYPE     = 5;
2967     private static final short LEX_PROCINSTR   = 6;
2968     private static final short LEX_ENDCOMMENT  = 7;
2969     private static final short LEX_CDATA       = 8;
2970     private static final short LEX_SECTION     = 9;
2971     private static final short LEX_ASP         = 10;
2972     private static final short LEX_JSTE        = 11;
2973     private static final short LEX_PHP         = 12;
2974
2975     /* used to classify chars for lexical purposes */
2976     private static short[] lexmap = new short[128];
2977
2978     private static void mapStr(String str, short code)
2979     {
2980         int j;
2981
2982         for ( int i = 0; i < str.length(); i++ ) {
2983             j = (int)str.charAt(i);
2984             lexmap[j] |= code;
2985         }
2986     }
2987
2988     static {
2989         mapStr("\r\n\f", (short)(NEWLINE|WHITE));
2990         mapStr(" \t", WHITE);
2991         mapStr("-.:_", NAMECHAR);
2992         mapStr("0123456789", (short)(DIGIT|NAMECHAR));
2993         mapStr("abcdefghijklmnopqrstuvwxyz", (short)(LOWERCASE|LETTER|NAMECHAR));
2994         mapStr("ABCDEFGHIJKLMNOPQRSTUVWXYZ", (short)(UPPERCASE|LETTER|NAMECHAR));
2995     }
2996
2997     private static short MAP( char c )
2998     {
2999         return ((int)c < 128 ? lexmap[(int)c] : 0);
3000     }
3001
3002     private static boolean isWhite(char c)
3003     {
3004         short m = MAP(c);
3005
3006         return (m & WHITE) != 0;
3007     }
3008
3009     private static boolean isDigit(char c)
3010     {
3011         short m;
3012
3013         m = MAP(c);
3014
3015         return (m & DIGIT) != 0;
3016     }
3017
3018     private static boolean isLetter(char c)
3019     {
3020         short m;
3021
3022         m = MAP(c);
3023
3024         return (m & LETTER) != 0;
3025     }
3026
3027     private static char toLower(char c)
3028     {
3029         short m = MAP(c);
3030
3031         if ((m & UPPERCASE) != 0)
3032             c = (char)( (int)c + (int)'a' - (int)'A' );
3033
3034         return c;
3035     }
3036
3037     private static char toUpper(char c)
3038     {
3039         short m = MAP(c);
3040
3041         if ((m & LOWERCASE) != 0)
3042             c = (char)( (int)c + (int)'A' - (int)'a' );
3043
3044         return c;
3045     }
3046
3047     public static char foldCase(char c, boolean tocaps, boolean xmlTags)
3048     {
3049         short m;
3050
3051         if (!xmlTags)
3052         {
3053             m = MAP(c);
3054
3055             if (tocaps)
3056             {
3057                 if ((m & LOWERCASE) != 0)
3058                     c = (char)( (int)c + (int)'A' - (int)'a' );
3059             }
3060             else /* force to lower case */
3061             {
3062                 if ((m & UPPERCASE) != 0)
3063                     c = (char)( (int)c + (int)'a' - (int)'A' );
3064             }
3065         }
3066
3067         return c;
3068     }
3069
3070
3071     private static class W3CVersionInfo
3072     {
3073         String name;
3074         String voyagerName;
3075         String profile;
3076         short code;
3077
3078         public W3CVersionInfo( String name,
3079                                String voyagerName,
3080                                String profile,
3081                                short code )
3082         {
3083             this.name = name;
3084             this.voyagerName = voyagerName;
3085             this.profile = profile;
3086             this.code = code;
3087         }
3088     }
3089
3090     /* the 3 URIs  for the XHTML 1.0 DTDs */
3091     private static final String voyager_loose    = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd";
3092     private static final String voyager_strict   = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd";
3093     private static final String voyager_frameset = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd";
3094
3095     private static final String XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml";
3096
3097     private static Lexer.W3CVersionInfo[] W3CVersion =
3098     {
3099         new W3CVersionInfo("HTML 4.01",
3100                            "XHTML 1.0 Strict",
3101                            voyager_strict,
3102                            Dict.VERS_HTML40_STRICT),
3103         new W3CVersionInfo("HTML 4.01 Transitional",
3104                            "XHTML 1.0 Transitional",
3105                            voyager_loose,
3106                            Dict.VERS_HTML40_LOOSE),
3107         new W3CVersionInfo("HTML 4.01 Frameset",
3108                            "XHTML 1.0 Frameset",
3109                            voyager_frameset,
3110                            Dict.VERS_FRAMES),
3111         new W3CVersionInfo("HTML 4.0",
3112                            "XHTML 1.0 Strict",
3113                            voyager_strict,
3114                            Dict.VERS_HTML40_STRICT),
3115         new W3CVersionInfo("HTML 4.0 Transitional",
3116                            "XHTML 1.0 Transitional",
3117                            voyager_loose,
3118                            Dict.VERS_HTML40_LOOSE),
3119         new W3CVersionInfo("HTML 4.0 Frameset",
3120                            "XHTML 1.0 Frameset",
3121                            voyager_frameset,
3122                            Dict.VERS_FRAMES),
3123         new W3CVersionInfo("HTML 3.2",
3124                            "XHTML 1.0 Transitional",
3125                            voyager_loose,
3126                            Dict.VERS_HTML32),
3127         new W3CVersionInfo("HTML 2.0",
3128                            "XHTML 1.0 Strict",
3129                            voyager_strict,
3130                            Dict.VERS_HTML20)
3131     };
3132
3133 }