1 package net.sourceforge.phpdt.internal.ui.text;
5 * (c) Copyright IBM Corp. 2000, 2001.
10 import java.io.IOException;
11 import java.io.PushbackReader;
12 import java.io.Reader;
13 import java.util.HashMap;
14 import java.util.HashSet;
18 import net.sourceforge.phpdt.internal.ui.PHPUIMessages;
20 import org.eclipse.jface.text.TextPresentation;
21 import org.eclipse.swt.SWT;
22 import org.eclipse.swt.custom.StyleRange;
28 * Reads the text contents from a reader of HTML contents and translates
29 * the tags or cuts them out.
31 public class HTML2TextReader extends SubstitutionTextReader {
34 private static final String LINE_DELIM= System.getProperty("line.separator", "\n"); //$NON-NLS-1$ //$NON-NLS-2$
35 private static final String EMPTY_STRING= ""; //$NON-NLS-1$
36 private static final Map fgEntityLookup;
37 private static final Set fgTags;
41 fgTags= new HashSet();
42 fgTags.add("b"); //$NON-NLS-1$
43 fgTags.add("br"); //$NON-NLS-1$
44 fgTags.add("h5"); //$NON-NLS-1$
45 fgTags.add("p"); //$NON-NLS-1$
46 fgTags.add("dl"); //$NON-NLS-1$
47 fgTags.add("dt"); //$NON-NLS-1$
48 fgTags.add("dd"); //$NON-NLS-1$
49 fgTags.add("li"); //$NON-NLS-1$
50 fgTags.add("ul"); //$NON-NLS-1$
51 fgTags.add("pre"); //$NON-NLS-1$
53 fgEntityLookup= new HashMap(7);
54 fgEntityLookup.put("lt", "<"); //$NON-NLS-1$ //$NON-NLS-2$
55 fgEntityLookup.put("gt", ">"); //$NON-NLS-1$ //$NON-NLS-2$
56 fgEntityLookup.put("nbsp", " "); //$NON-NLS-1$ //$NON-NLS-2$
57 fgEntityLookup.put("amp", "&"); //$NON-NLS-1$ //$NON-NLS-2$
58 fgEntityLookup.put("circ", "^"); //$NON-NLS-1$ //$NON-NLS-2$
59 fgEntityLookup.put("tilde", "~"); //$NON-NLS-2$ //$NON-NLS-1$
60 fgEntityLookup.put("quot", "\""); //$NON-NLS-1$ //$NON-NLS-2$
63 private int fCounter= 0;
64 private TextPresentation fTextPresentation;
66 private int fStartOffset= -1;
67 private boolean fInParagraph= false;
68 private boolean fIsPreformattedText= false;
71 * Transforms the HTML text from the reader into formatted text.
72 * @param presentation If not <code>null</code>, formattings will be applied to
75 public HTML2TextReader(Reader reader, TextPresentation presentation) {
76 super(new PushbackReader(reader));
77 fTextPresentation= presentation;
80 public int read() throws IOException {
87 protected void startBold() {
89 fStartOffset= fCounter;
/**
 * Enters preformatted mode: sets fIsPreformattedText to true and calls
 * setSkipWhitespace(false). While the flag is set, computeSubstitution
 * routes ordinary characters through processPreformattedText.
 */
93 protected void startPreformattedText() {
94 fIsPreformattedText= true;
95 setSkipWhitespace(false);
/**
 * Leaves preformatted mode: resets fIsPreformattedText to false and calls
 * setSkipWhitespace(true), reversing what startPreformattedText set.
 */
98 protected void stopPreformattedText() {
99 fIsPreformattedText= false;
100 setSkipWhitespace(true);
103 protected void stopBold() {
106 if (fTextPresentation != null) {
107 fTextPresentation.addStyleRange(new StyleRange(fStartOffset, fCounter - fStartOffset, null, null, SWT.BOLD));
114 * @see SubstitutionTextReader#computeSubstitution(int)
116 protected String computeSubstitution(int c) throws IOException {
119 return processHTMLTag();
121 return processEntity();
122 else if (fIsPreformattedText)
123 return processPreformattedText(c);
128 private String html2Text(String html) {
// Drop a leading '/' so that a closing tag is looked up in fgTags under the
// same name as its opening tag.
// NOTE(review): charAt(0) throws StringIndexOutOfBoundsException when the
// tag text is empty (possible if a '>' directly follows the opening bracket).
// No emptiness guard is visible in this excerpt; confirm one exists earlier.
131 if ('/' == tag.charAt(0))
132 tag= tag.substring(1);
134 if (!fgTags.contains(tag))
138 if ("pre".equals(html)) { //$NON-NLS-1$
139 startPreformattedText();
143 if ("/pre".equals(html)) { //$NON-NLS-1$
144 stopPreformattedText();
148 if (fIsPreformattedText)
151 if ("b".equals(html)) { //$NON-NLS-1$
156 if ("h5".equals(html) || "dt".equals(html)) { //$NON-NLS-1$ //$NON-NLS-2$
161 if ("dl".equals(html)) //$NON-NLS-1$
164 if ("dd".equals(html)) //$NON-NLS-1$
165 return "\t"; //$NON-NLS-1$
167 if ("li".equals(html)) //$NON-NLS-1$
168 return LINE_DELIM + "\t" + PHPUIMessages.getString("HTML2TextReader.dash"); //$NON-NLS-1$ //$NON-NLS-2$
170 if ("/b".equals(html)) { //$NON-NLS-1$
175 if ("p".equals(html)) { //$NON-NLS-1$
180 if ("br".equals(html)) //$NON-NLS-1$
183 if ("/p".equals(html)) { //$NON-NLS-1$
184 boolean inParagraph= fInParagraph;
186 return inParagraph ? EMPTY_STRING : LINE_DELIM;
189 if ("/h5".equals(html) || "/dt".equals(html)) { //$NON-NLS-1$ //$NON-NLS-2$
194 if ("/dd".equals(html)) //$NON-NLS-1$
201 * A '<' has been read. Process an HTML tag.
203 private String processHTMLTag() throws IOException {
205 StringBuffer buf= new StringBuffer();
211 while (ch != -1 && ch != '>') {
212 buf.append(Character.toLowerCase((char) ch));
215 buf.append(Character.toLowerCase((char) ch));
217 while (ch != -1 && ch != '"'){
218 buf.append(Character.toLowerCase((char) ch));
224 return '<' + buf.toString();
231 int tagLen= buf.length();
232 // needs special treatment for comments
// The tag text counts as an unfinished comment when it starts with "!--"
// but does not end with the terminator tested on the second line below.
// NOTE(review): the read loop earlier in this method stops on the closing
// bracket and appends only the characters read before it. For a regular
// HTML comment (terminated by two dashes and the closing bracket) buf
// would therefore end in "--", not "--!". As written, ordinary comments
// appear to be classified as unfinished. Probably intended:
// "--".equals(buf.substring(tagLen - 2)); confirm against the full method.
233 if ((tagLen >= 3 && "!--".equals(buf.substring(0, 3))) //$NON-NLS-1$
234 && !(tagLen >= 5 && "--!".equals(buf.substring(tagLen - 3)))) { //$NON-NLS-1$
235 // unfinished comment
242 return html2Text(buf.toString());
245 private String processPreformattedText(int c) {
246 if (c == '\r' || c == '\n')
/**
 * Pushes a character back so that it is returned by the next read.
 * The reader obtained from getReader() is cast to PushbackReader;
 * presumably this is the PushbackReader the constructor hands to the
 * superclass (verify that getReader() returns that instance).
 *
 * @param ch the character to push back
 * @throws IOException if PushbackReader.unread fails
 */
252 private void unread(int ch) throws IOException {
253 ((PushbackReader) getReader()).unread(ch);
256 protected String entity2Text(String symbol) {
257 if (symbol.length() > 1 && symbol.charAt(0) == '#') {
260 if (symbol.charAt(1) == 'x') {
// Hexadecimal numeric reference: parse the digits that follow "#x".
// NOTE(review): only a lower-case 'x' is recognised, and processEntity
// appends characters without changing their case, so an upper-case
// "#X41" does not take this branch. Confirm whether that is intended.
261 ch= Integer.parseInt(symbol.substring(2), 16);
// Decimal numeric reference (presumably the else branch): parse the digits after '#'.
263 ch= Integer.parseInt(symbol.substring(1), 10);
// NOTE(review): the cast to char keeps only the low 16 bits of the parsed
// value, so references above U+FFFF cannot be represented correctly here.
265 return EMPTY_STRING + (char)ch;
266 } catch (NumberFormatException e) {
269 String str= (String) fgEntityLookup.get(symbol);
274 return "&" + symbol; // not found //$NON-NLS-1$
278 * A '&' has been read. Process an entity.
280 private String processEntity() throws IOException {
281 StringBuffer buf= new StringBuffer();
283 while (Character.isLetterOrDigit((char)ch) || ch == '#') {
284 buf.append((char) ch);
289 return entity2Text(buf.toString());
293 buf.append((char) ch);
294 return buf.toString();