2 * @(#)StreamInImpl.java 1.11 2000/08/16
6 package net.sourceforge.phpdt.tidy.w3c;
10 * Input Stream Implementation
12 * (c) 1998-2000 (W3C) MIT, INRIA, Keio University
13 * Derived from <a href="http://www.w3.org/People/Raggett/tidy">
14 * HTML Tidy Release 4 Aug 2000</a>
16 * @author Dave Raggett <dsr@w3.org>
17 * @author Andy Quick <ac.quick@sympatico.ca> (translation to Java)
18 * @version 1.0, 1999/05/22
19 * @version 1.0.1, 1999/05/29
20 * @version 1.1, 1999/06/18 Java Bean
21 * @version 1.2, 1999/07/10 Tidy Release 7 Jul 1999
22 * @version 1.3, 1999/07/30 Tidy Release 26 Jul 1999
23 * @version 1.4, 1999/09/04 DOM support
24 * @version 1.5, 1999/10/23 Tidy Release 27 Sep 1999
25 * @version 1.6, 1999/11/01 Tidy Release 22 Oct 1999
26 * @version 1.7, 1999/12/06 Tidy Release 30 Nov 1999
27 * @version 1.8, 2000/01/22 Tidy Release 13 Jan 2000
28 * @version 1.9, 2000/06/03 Tidy Release 30 Apr 2000
29 * @version 1.10, 2000/07/22 Tidy Release 8 Jul 2000
30 * @version 1.11, 2000/08/16 Tidy Release 4 Aug 2000
33 import java.io.IOException;
34 import java.io.InputStream;
/*
 * StreamIn implementation that reads characters from a java.io.InputStream.
 * The visible code handles ISO-2022 escape-sequence tracking, UTF-8
 * multi-byte decoding, and remapping of Windows (128-159) and MacRoman
 * bytes to Unicode via the static tables below.
 * NOTE(review): fields used here (stream, encoding, state, tabsize, curcol,
 * lastcol, tabs, endOfStream, lexer) are not declared in the visible lines;
 * presumably they are inherited from StreamIn -- confirm.
 */
36 public class StreamInImpl extends StreamIn {
38 /* Mapping for Windows Western character set (128-159) to Unicode */
/*
 * 32 entries; readChar() indexes this table with (c - 128) for
 * 127 < c < 160. Entries of 0x0000 (positions for 0x81, 0x8D, 0x8F, 0x90
 * and 0x9D) carry no Unicode value in this table.
 * NOTE(review): how a 0x0000 result is handled by the caller is not
 * visible in this listing -- confirm against the full readChar().
 */
39 private static int[] Win2Unicode =
41 0x20AC, 0x0000, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
42 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x0000, 0x017D, 0x0000,
43 0x0000, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
44 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x0000, 0x017E, 0x0178
48 John Love-Jensen contributed this table for mapping MacRoman
49 character set to Unicode
/*
 * 256 entries laid out as 32 rows of 8. The first 128 entries
 * (0x00-0x7F) map each value to itself; the remaining 128 give the
 * Unicode code point for MacRoman bytes 0x80-0xFF.
 * NOTE(review): the lookup itself is not visible in this listing;
 * presumably readChar() indexes the table directly with the byte value
 * when the encoding is Configuration.MACROMAN -- confirm.
 */
52 private static int[] Mac2Unicode =
/* 0x00-0x7F: identity mapping */
55 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
56 0x0008, 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x000E, 0x000F,
58 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,
59 0x0018, 0x0019, 0x001A, 0x001B, 0x001C, 0x001D, 0x001E, 0x001F,
61 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027,
62 0x0028, 0x0029, 0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F,
64 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
65 0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F,
67 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
68 0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F,
70 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
71 0x0058, 0x0059, 0x005A, 0x005B, 0x005C, 0x005D, 0x005E, 0x005F,
73 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
74 0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F,
76 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
77 0x0078, 0x0079, 0x007A, 0x007B, 0x007C, 0x007D, 0x007E, 0x007F,
/* 0x80-0xFF: MacRoman-specific characters */
79 0x00C4, 0x00C5, 0x00C7, 0x00C9, 0x00D1, 0x00D6, 0x00DC, 0x00E1,
80 0x00E0, 0x00E2, 0x00E4, 0x00E3, 0x00E5, 0x00E7, 0x00E9, 0x00E8,
82 0x00EA, 0x00EB, 0x00ED, 0x00EC, 0x00EE, 0x00EF, 0x00F1, 0x00F3,
83 0x00F2, 0x00F4, 0x00F6, 0x00F5, 0x00FA, 0x00F9, 0x00FB, 0x00FC,
85 0x2020, 0x00B0, 0x00A2, 0x00A3, 0x00A7, 0x2022, 0x00B6, 0x00DF,
86 0x00AE, 0x00A9, 0x2122, 0x00B4, 0x00A8, 0x2260, 0x00C6, 0x00D8,
88 0x221E, 0x00B1, 0x2264, 0x2265, 0x00A5, 0x00B5, 0x2202, 0x2211,
89 0x220F, 0x03C0, 0x222B, 0x00AA, 0x00BA, 0x03A9, 0x00E6, 0x00F8,
91 0x00BF, 0x00A1, 0x00AC, 0x221A, 0x0192, 0x2248, 0x2206, 0x00AB,
92 0x00BB, 0x2026, 0x00A0, 0x00C0, 0x00C3, 0x00D5, 0x0152, 0x0153,
94 0x2013, 0x2014, 0x201C, 0x201D, 0x2018, 0x2019, 0x00F7, 0x25CA,
95 0x00FF, 0x0178, 0x2044, 0x20AC, 0x2039, 0x203A, 0xFB01, 0xFB02,
97 0x2021, 0x00B7, 0x201A, 0x201E, 0x2030, 0x00C2, 0x00CA, 0x00C1,
98 0x00CB, 0x00C8, 0x00CD, 0x00CE, 0x00CF, 0x00CC, 0x00D3, 0x00D4,
99 /* xF0 = Apple Logo */
/* 0xF8FF below is a Private Use Area code point, so it has no standard glyph. */
100 0xF8FF, 0x00D2, 0x00DA, 0x00DB, 0x00D9, 0x0131, 0x02C6, 0x02DC,
101 0x00AF, 0x02D8, 0x02D9, 0x02DA, 0x00B8, 0x02DD, 0x02DB, 0x02C7
/*
 * Creates a reader over the given input stream.
 *
 * stream   - source of raw bytes; read one value at a time via read().
 * encoding - compared elsewhere in this class against the Configuration
 *            constants ISO2022, UTF8, RAW and MACROMAN.
 * tabsize  - tab width used by readChar() when computing this.tabs.
 *
 * The ISO-2022 state machine starts in FSM_ASCII and the end-of-stream
 * flag starts cleared.
 * NOTE(review): this listing skips some original lines inside the
 * constructor, so other fields may be initialised here as well.
 */
104 public StreamInImpl(InputStream stream, int encoding, int tabsize)
106 this.stream = stream;
110 this.tabsize = tabsize;
113 this.encoding = encoding;
114 this.state = FSM_ASCII;
115 this.endOfStream = false;
/*
 * Reads the next character from the underlying stream.
 *
 * Behaviour visible in this excerpt:
 *  - a read() result equal to EndOfStream sets this.endOfStream;
 *  - for Configuration.ISO2022 input, an ESC byte (0x1b) moves this.state
 *    to FSM_ESC, and later lines move it between FSM_ESCD, FSM_ESCP,
 *    FSM_ESCDP, FSM_NONASCII and FSM_ASCII to follow the designator
 *    sequences described in the note below;
 *  - for Configuration.UTF8 input, the continuation bytes of a multi-byte
 *    sequence are folded into n six bits at a time; other encodings take
 *    the branch guarded by "encoding != UTF8" before the decoder;
 *  - an IOException is caught and reported on System.err.
 *
 * NOTE(review): this listing omits lines (see the gaps in the embedded
 * line numbers), so the return statements, local declarations and the
 * conditions driving each state transition are not visible here --
 * confirm the details against the full file.
 */
118 /* read char from stream */
119 public int readCharFromStream()
124 c = this.stream.read();
126 if (c == EndOfStream) {
127 this.endOfStream = true;
132 A document in ISO-2022 based encoding uses some ESC sequences
133 called "designator" to switch character sets. The designators
134 defined and used in ISO-2022-JP are:
136 "ESC" + "(" + ? for ISO646 variants
139 "ESC" + "$" + "(" + ? for multibyte character sets
141 Where ? stands for a single character used to indicate the
142 character set for multibyte characters.
144 Tidy handles this by preserving the escape sequence and
145 setting the top bit of each byte for non-ascii chars. This
146 bit is then cleared on output. The input stream keeps track
147 of the state to determine when to set/clear the bit.
150 if (this.encoding == Configuration.ISO2022)
152 if (c == 0x1b) /* ESC */
154 this.state = FSM_ESC;
162 this.state = FSM_ESCD;
164 this.state = FSM_ESCP;
166 this.state = FSM_ASCII;
171 this.state = FSM_ESCDP;
173 this.state = FSM_NONASCII;
177 this.state = FSM_NONASCII;
181 this.state = FSM_ASCII;
/* Only UTF-8 input goes through the multi-byte decoding below. */
192 if (this.encoding != Configuration.UTF8)
195 /* deal with UTF-8 encoded char */
/*
 * The high bits of the lead byte select the sequence length.
 * NOTE(review): the five- and six-byte forms date from the original UTF-8
 * definition; RFC 3629 restricts UTF-8 to at most four bytes -- confirm
 * whether accepting them is still intended.
 */
197 if ((c & 0xE0) == 0xC0) /* 110X XXXX two bytes */
202 else if ((c & 0xF0) == 0xE0) /* 1111 XXXX three bytes */
207 else if ((c & 0xF8) == 0xF0) /* 1111 0XXX four bytes */
212 else if ((c & 0xFC) == 0xF8) /* 1111 10XX five bytes */
217 else if ((c & 0xFE) == 0xFC) /* 1111 110X six bytes */
222 else /* 0XXX XXXX one byte */
225 /* successor bytes should have the form 10XX XXXX */
226 for (i = 1; i <= count; ++i)
228 c = this.stream.read();
/* Running out of input in the middle of a sequence also flags end of stream. */
230 if (c == EndOfStream) {
231 this.endOfStream = true;
/*
 * Shift the accumulated value and append the low six bits of this byte.
 * NOTE(review): the visible lines mask with 0x3F but do not check that the
 * byte actually carries the 10XX XXXX prefix -- confirm in the full file.
 */
235 n = (n << 6) | (c & 0x3F);
238 catch (IOException e) {
239 System.err.println("StreamInImpl.readCharFromStream: " + e.toString());
/*
 * Returns the next character, layered on top of readCharFromStream().
 *
 * Behaviour visible in this excerpt:
 *  - saves the current column in lastcol (ungetChar() restores from it);
 *  - derives this.tabs from tabsize and curcol;
 *  - has dedicated checks for RAW/ISO2022 and for MACROMAN input (their
 *    bodies are not shown in this listing);
 *  - reports values 128-159 through Report.encodingError with
 *    Report.WINDOWS_CHARS and replaces them using Win2Unicode.
 *
 * NOTE(review): this listing omits many lines of the method (loop
 * structure, newline and tab handling, return statements); confirm the
 * details against the full file.
 */
246 public int readChar()
266 this.lastcol = this.curcol;
277 c = readCharFromStream();
291 c = readCharFromStream();
/*
 * Columns left before the next tab stop, minus one.
 * NOTE(review): a tabsize of 0 would make this modulo throw
 * ArithmeticException; presumably callers always pass tabsize > 0.
 */
304 this.tabs = this.tabsize - ((this.curcol - 1) % this.tabsize) - 1;
310 /* strip control characters, except for Esc */
318 /* watch out for ISO2022 */
320 if (this.encoding == Configuration.RAW ||
321 this.encoding == Configuration.ISO2022)
327 if (this.encoding == Configuration.MACROMAN)
330 /* produced e.g. as a side-effect of smart quotes in Word */
/* 128-159 is exactly the range covered by the 32-entry Win2Unicode table. */
332 if (127 < c && c < 160)
334 Report.encodingError((Lexer)this.lexer, Report.WINDOWS_CHARS, c);
336 c = Win2Unicode[c - 128];
/*
 * Presumably pushes c back so that it is returned by a later read; the
 * lines that store c are not shown in this listing -- confirm.
 * The visible line restores curcol from lastcol, the column that
 * readChar() saved before consuming the character.
 */
349 public void ungetChar(int c)
359 this.curcol = this.lastcol;
/*
 * Returns the endOfStream flag, which readCharFromStream() sets once the
 * underlying stream's read() yields EndOfStream.
 */
362 public boolean isEndOfStream()
364 return this.endOfStream;