Optimized Scanner
[phpeclipse.git] / net.sourceforge.phpeclipse / src / net / sourceforge / phpdt / internal / ui / text / HTML2TextReader.java
1 package net.sourceforge.phpdt.internal.ui.text;
2
3
4 /*
5  * (c) Copyright IBM Corp. 2000, 2001.
6  * All Rights Reserved.
7  */
8
9
10 import java.io.IOException;
11 import java.io.PushbackReader;
12 import java.io.Reader;
13 import java.util.HashMap;
14 import java.util.HashSet;
15 import java.util.Map;
16 import java.util.Set;
17
18 import net.sourceforge.phpdt.internal.ui.PHPUIMessages;
19
20 import org.eclipse.jface.text.TextPresentation;
21 import org.eclipse.swt.SWT;
22 import org.eclipse.swt.custom.StyleRange;
23
24
25
26
27 /**
28  * Reads the text contents from a reader of HTML contents and translates 
29  * the tags or cut them out.
30  */
31 public class HTML2TextReader extends SubstitutionTextReader {
32         
33         
34         private static final String LINE_DELIM= System.getProperty("line.separator", "\n"); //$NON-NLS-1$ //$NON-NLS-2$
35         private static final String EMPTY_STRING= ""; //$NON-NLS-1$
36         private static final Map fgEntityLookup;
37         private static final Set fgTags;
38         
39         static {
40                 
41                 fgTags= new HashSet();
42                 fgTags.add("b"); //$NON-NLS-1$
43                 fgTags.add("br"); //$NON-NLS-1$
44                 fgTags.add("h5"); //$NON-NLS-1$
45                 fgTags.add("p"); //$NON-NLS-1$
46                 fgTags.add("dl"); //$NON-NLS-1$
47                 fgTags.add("dt"); //$NON-NLS-1$
48                 fgTags.add("dd"); //$NON-NLS-1$
49                 fgTags.add("li"); //$NON-NLS-1$
50                 fgTags.add("ul"); //$NON-NLS-1$
51                 fgTags.add("pre"); //$NON-NLS-1$
52                 
53                 fgEntityLookup= new HashMap(7);
54                 fgEntityLookup.put("lt", "<"); //$NON-NLS-1$ //$NON-NLS-2$
55                 fgEntityLookup.put("gt", ">"); //$NON-NLS-1$ //$NON-NLS-2$
56                 fgEntityLookup.put("nbsp", " "); //$NON-NLS-1$ //$NON-NLS-2$
57                 fgEntityLookup.put("amp", "&"); //$NON-NLS-1$ //$NON-NLS-2$
58                 fgEntityLookup.put("circ", "^"); //$NON-NLS-1$ //$NON-NLS-2$
59                 fgEntityLookup.put("tilde", "~"); //$NON-NLS-2$ //$NON-NLS-1$
60                 fgEntityLookup.put("quot", "\"");                //$NON-NLS-1$ //$NON-NLS-2$
61         }
62         
63         private int fCounter= 0;
64         private TextPresentation fTextPresentation;
65         private int fBold= 0;
66         private int fStartOffset= -1;
67         private boolean fInParagraph= false;
68         private boolean fIsPreformattedText= false;
69         
70         /**
71          * Transforms the html text from the reader to formatted text.
72          * @param presentation If not <code>null</code>, formattings will be applied to 
73          * the presentation.
74         */
75         public HTML2TextReader(Reader reader, TextPresentation presentation) {
76                 super(new PushbackReader(reader));
77                 fTextPresentation= presentation;
78         }
79         
80         public int read() throws IOException {
81                 int c= super.read();
82                 if (c != -1)
83                         ++ fCounter;
84                 return c;
85         }
86         
87         protected void startBold() {
88                 if (fBold == 0)
89                         fStartOffset= fCounter;
90                 ++ fBold;
91         }
92
93         protected void startPreformattedText() {
94                 fIsPreformattedText= true;
95                 setSkipWhitespace(false);
96         }
97
98         protected void stopPreformattedText() {
99                 fIsPreformattedText= false;
100                 setSkipWhitespace(true);
101         }
102         
103         protected void stopBold() {
104                 -- fBold;
105                 if (fBold == 0) {
106                         if (fTextPresentation != null) {
107                                 fTextPresentation.addStyleRange(new StyleRange(fStartOffset, fCounter - fStartOffset, null, null, SWT.BOLD));
108                         }
109                         fStartOffset= -1;
110                 }
111         }
112         
113         /**
114          * @see SubstitutionTextReader#computeSubstitution(char)
115          */
116         protected String computeSubstitution(int c) throws IOException {
117                 
118                 if (c == '<')
119                         return  processHTMLTag();
120                 else if (c == '&')
121                         return processEntity();
122                 else if (fIsPreformattedText)
123                         return processPreformattedText(c);
124                 
125                 return null;
126         }
127
128         private String html2Text(String html) {
129                 
130                 String tag= html;
131                 if ('/' == tag.charAt(0))
132                         tag= tag.substring(1);
133                         
134                 if (!fgTags.contains(tag))
135                         return EMPTY_STRING;
136
137
138                 if ("pre".equals(html)) { //$NON-NLS-1$
139                         startPreformattedText();
140                         return EMPTY_STRING;
141                 }
142
143                 if ("/pre".equals(html)) { //$NON-NLS-1$
144                         stopPreformattedText();
145                         return EMPTY_STRING;
146                 }
147
148                 if (fIsPreformattedText)
149                         return EMPTY_STRING;
150
151                 if ("b".equals(html)) { //$NON-NLS-1$
152                         startBold();
153                         return EMPTY_STRING;
154                 }
155                                 
156                 if ("h5".equals(html) || "dt".equals(html)) { //$NON-NLS-1$ //$NON-NLS-2$
157                         startBold();
158                         return EMPTY_STRING;
159                 }
160                 
161                 if ("dl".equals(html)) //$NON-NLS-1$
162                         return LINE_DELIM;
163                 
164                 if ("dd".equals(html)) //$NON-NLS-1$
165                         return "\t"; //$NON-NLS-1$
166                 
167                 if ("li".equals(html)) //$NON-NLS-1$
168                         return LINE_DELIM + "\t" + PHPUIMessages.getString("HTML2TextReader.dash"); //$NON-NLS-1$ //$NON-NLS-2$
169                                         
170                 if ("/b".equals(html)) { //$NON-NLS-1$
171                         stopBold();
172                         return EMPTY_STRING;
173                 }
174
175                 if ("p".equals(html))  { //$NON-NLS-1$
176                         fInParagraph= true;
177                         return LINE_DELIM;
178                 }
179
180                 if ("br".equals(html)) //$NON-NLS-1$
181                         return LINE_DELIM;
182                 
183                 if ("/p".equals(html))  { //$NON-NLS-1$
184                         boolean inParagraph= fInParagraph;
185                         fInParagraph= false;
186                         return inParagraph ? EMPTY_STRING : LINE_DELIM;
187                 }
188                         
189                 if ("/h5".equals(html) || "/dt".equals(html)) { //$NON-NLS-1$ //$NON-NLS-2$
190                         stopBold();
191                         return LINE_DELIM;
192                 }
193                 
194                 if ("/dd".equals(html)) //$NON-NLS-1$
195                         return LINE_DELIM;
196                                 
197                 return EMPTY_STRING;
198         }
199         
200         /*
201          * A '<' has been read. Process a html tag
202          */ 
203         private String processHTMLTag() throws IOException {
204                 
205                 StringBuffer buf= new StringBuffer();
206                 int ch;
207                 do {            
208                         
209                         ch= nextChar();
210                         
211                         while (ch != -1 && ch != '>') {
212                                 buf.append(Character.toLowerCase((char) ch));
213                                 ch= nextChar();
214                                 if (ch == '"'){
215                                         buf.append(Character.toLowerCase((char) ch));
216                                         ch= nextChar();
217                                         while (ch != -1 && ch != '"'){
218                                                 buf.append(Character.toLowerCase((char) ch));
219                                                 ch= nextChar();
220                                         }       
221                                 }
222                                 if (ch == '<'){
223                                         unread(ch);
224                                         return '<' + buf.toString();
225                                 }       
226                         }
227                         
228                         if (ch == -1)
229                                 return null;
230                         
231                         int tagLen= buf.length();
232                         // needs special treatment for comments 
233                         if ((tagLen >= 3 && "!--".equals(buf.substring(0, 3))) //$NON-NLS-1$
234                                 && !(tagLen >= 5 && "--!".equals(buf.substring(tagLen - 3)))) { //$NON-NLS-1$
235                                 // unfinished comment
236                                 buf.append(ch);
237                         } else {
238                                 break;
239                         }
240                 } while (true);
241                  
242                 return html2Text(buf.toString());
243         }
244
245         private String processPreformattedText(int c) {
246                 if  (c == '\r' || c == '\n')
247                         fCounter++;
248                 return null;
249         }
250
251         
252         private void unread(int ch) throws IOException {
253                 ((PushbackReader) getReader()).unread(ch);
254         }
255         
256         protected String entity2Text(String symbol) {
257                 if (symbol.length() > 1 && symbol.charAt(0) == '#') {
258                         int ch;
259                         try {
260                                 if (symbol.charAt(1) == 'x') {
261                                         ch= Integer.parseInt(symbol.substring(2), 16);
262                                 } else {
263                                         ch= Integer.parseInt(symbol.substring(1), 10);
264                                 }
265                                 return EMPTY_STRING + (char)ch;
266                         } catch (NumberFormatException e) {
267                         }
268                 } else {
269                         String str= (String) fgEntityLookup.get(symbol);
270                         if (str != null) {
271                                 return str;
272                         }
273                 }
274                 return "&" + symbol; // not found //$NON-NLS-1$
275         }
276         
277         /*
278          * A '&' has been read. Process a entity
279          */     
280         private String processEntity() throws IOException {
281                 StringBuffer buf= new StringBuffer();
282                 int ch= nextChar();
283                 while (Character.isLetterOrDigit((char)ch) || ch == '#') {
284                         buf.append((char) ch);
285                         ch= nextChar();
286                 }
287                 
288                 if (ch == ';') 
289                         return entity2Text(buf.toString());
290                 
291                 buf.insert(0, '&');
292                 if (ch != -1)
293                         buf.append((char) ch);
294                 return buf.toString();
295         }
296 }