improved PHP parser
[phpeclipse.git] / net.sourceforge.phpeclipse / src / net / sourceforge / phpdt / internal / ui / text / JavaBreakIterator.java
1 /*******************************************************************************
2  * Copyright (c) 2000, 2004 IBM Corporation and others.
3  * All rights reserved. This program and the accompanying materials 
4  * are made available under the terms of the Common Public License v1.0
5  * which accompanies this distribution, and is available at
6  * http://www.eclipse.org/legal/cpl-v10.html
7  * 
8  * Contributors:
9  *     IBM Corporation - initial API and implementation
10  *******************************************************************************/
11 package net.sourceforge.phpdt.internal.ui.text;
12
13 import java.text.BreakIterator;
14 import java.text.CharacterIterator;
15
16 import net.sourceforge.phpdt.internal.compiler.parser.Scanner;
17
18 import org.eclipse.jface.text.Assert;
19
20
21 /**
22  * A java break iterator. It returns all breaks, including before and after 
23  * whitespace, and it returns all camelcase breaks.
24  * <p>
25  * A line break may be any of "\n", "\r", "\r\n", "\n\r".
26  * </p>
27  * 
28  * @since 3.0
29  */
30 public class JavaBreakIterator extends BreakIterator {
31
32         /**
33          * A run of common characters.
34          */
35         protected static abstract class Run {
36                 /** The length of this run. */
37                 protected int length;
38                 
39                 public Run() {
40                         init();
41                 }
42                 
43                 /**
44                  * Returns <code>true</code> if this run consumes <code>ch</code>,
45                  * <code>false</code> otherwise. If <code>true</code> is returned,
46                  * the length of the receiver is adjusted accordingly.
47                  * 
48                  * @param ch the character to test
49                  * @return <code>true</code> if <code>ch</code> was consumed
50                  */
51                 protected boolean consume(char ch) {
52                         if (isValid(ch)) {
53                                 length++;
54                                 return true;
55                         }
56                         return false;
57                 }
58                 
59                 /**
60                  * Whether this run accepts that character; does not update state. Called
61                  * from the default implementation of <code>consume</code>.
62                  * 
63                  * @param ch the character to test
64                  * @return <code>true</code> if <code>ch</code> is accepted
65                  */
66                 protected abstract boolean isValid(char ch);
67                 
68                 /**
69                  * Resets this run to the initial state.
70                  */
71                 protected void init() {
72                         length= 0;
73                 }
74         }
75         
76         static final class Whitespace extends Run {
77                 protected boolean isValid(char ch) {
78                         return Character.isWhitespace(ch) && ch != '\n' && ch != '\r';
79                 }
80         }
81         
82         static final class LineDelimiter extends Run {
83                 /** State: INIT -> delimiter -> EXIT. */
84                 private char fState;
85                 private static final char INIT= '\0';
86                 private static final char EXIT= '\1';
87                 
88                 /*
89                  * @see net.sourceforge.phpdt.internal.ui.text.JavaBreakIterator.Run#init()
90                  */
91                 protected void init() {
92                         super.init();
93                         fState= INIT;
94                 }
95                 
96                 /*
97                  * @see net.sourceforge.phpdt.internal.ui.text.JavaBreakIterator.Run#consume(char)
98                  */
99                 protected boolean consume(char ch) {
100                         if (!isValid(ch) || fState == EXIT)
101                                 return false;
102                         
103                         if (fState == INIT) {
104                                 fState= ch;
105                                 length++;
106                                 return true;
107                         } else if (fState != ch) {
108                                 fState= EXIT;
109                                 length++;
110                                 return true;
111                         } else {
112                                 return false;
113                         }
114                 }
115                 
116                 protected boolean isValid(char ch) {
117                         return ch == '\n' || ch == '\r';
118                 }
119         }
120         
121         static final class Identifier extends Run {
122                 /*
123                  * @see net.sourceforge.phpdt.internal.ui.text.JavaBreakIterator.Run#isValid(char)
124                  */
125                 protected boolean isValid(char ch) {
126                         return Scanner.isPHPIdentifierPart(ch);
127                 }
128         }
129         
130         static final class CamelCaseIdentifier extends Run {
131                 /* states */
132                 private static final int S_INIT= 0;
133                 private static final int S_LOWER= 1;
134                 private static final int S_ONE_CAP= 2;
135                 private static final int S_ALL_CAPS= 3;
136                 private static final int S_EXIT= 4;
137                 private static final int S_EXIT_MINUS_ONE= 5;
138
139                 /* character types */
140                 private static final int K_INVALID= 0;
141                 private static final int K_LOWER= 1;
142                 private static final int K_UPPER= 2;
143                 private static final int K_OTHER= 3;
144                 
145                 private int fState;
146                 
147                 private final static int[][] MATRIX= new int[][] {
148                                 // K_INVALID, K_LOWER,           K_UPPER,    K_OTHER
149                                 {  S_EXIT,    S_LOWER,           S_ONE_CAP,  S_LOWER }, // S_INIT
150                                 {  S_EXIT,    S_LOWER,           S_EXIT,     S_LOWER }, // S_LOWER
151                                 {  S_EXIT,    S_LOWER,           S_ALL_CAPS, S_LOWER }, // S_ONE_CAP
152                                 {  S_EXIT,    S_EXIT_MINUS_ONE,  S_ALL_CAPS, S_LOWER }, // S_ALL_CAPS
153                 };
154                 
155                 /*
156                  * @see net.sourceforge.phpdt.internal.ui.text.JavaBreakIterator.Run#init()
157                  */
158                 protected void init() {
159                         super.init();
160                         fState= S_INIT;
161                 }
162                 
163                 /*
164                  * @see net.sourceforge.phpdt.internal.ui.text.JavaBreakIterator.Run#consumes(char)
165                  */
166                 protected boolean consume(char ch) {
167                         int kind= getKind(ch);
168                         fState= MATRIX[fState][kind];
169                         switch (fState) {
170                                 case S_LOWER:
171                                 case S_ONE_CAP:
172                                 case S_ALL_CAPS:
173                                         length++;
174                                         return true;
175                                 case S_EXIT:
176                                         return false;
177                                 case S_EXIT_MINUS_ONE:
178                                         length--;
179                                         return false;
180                                 default:
181                                         Assert.isTrue(false);
182                                         return false;
183                         }
184                 }
185                 
186                 /**
187                  * Determines the kind of a character.
188                  * 
189                  * @param ch the character to test
190                  */
191                 private int getKind(char ch) {
192                         if (Character.isUpperCase(ch))
193                                 return K_UPPER;
194                         if (Character.isLowerCase(ch))
195                                 return K_LOWER;
196                         if (Scanner.isPHPIdentifierPart(ch)) // _, digits...
197                                 return K_OTHER;
198                         return K_INVALID;
199                 }
200
201                 /*
202                  * @see net.sourceforge.phpdt.internal.ui.text.JavaBreakIterator.Run#isValid(char)
203                  */
204                 protected boolean isValid(char ch) {
205                         return Scanner.isPHPIdentifierPart(ch);
206                 }
207         }
208
209         static final class Other extends Run {
210                 /*
211                  * @see net.sourceforge.phpdt.internal.ui.text.JavaBreakIterator.Run#isValid(char)
212                  */
213                 protected boolean isValid(char ch) {
214                         return !Character.isWhitespace(ch) && !Scanner.isPHPIdentifierPart(ch);
215                 }
216         }
217         
218         private static final Run WHITESPACE= new Whitespace();
219         private static final Run DELIMITER= new LineDelimiter();
220         private static final Run CAMELCASE= new CamelCaseIdentifier(); // new Identifier();
221         private static final Run OTHER= new Other();
222         
223         /** The platform break iterator (word instance) used as a base. */ 
224         protected final BreakIterator fIterator;
225         /** The text we operate on. */
226         protected CharSequence fText;
227         /** our current position for the stateful methods. */
228         private int fIndex;
229         
230         
231         /**
232          * Creates a new break iterator.
233          */
234         public JavaBreakIterator() {
235                 fIterator= BreakIterator.getWordInstance();
236                 fIndex= fIterator.current();
237         }
238
239         /*
240          * @see java.text.BreakIterator#current()
241          */
242         public int current() {
243                 return fIndex;
244         }
245
246         /*
247          * @see java.text.BreakIterator#first()
248          */
249         public int first() {
250                 fIndex= fIterator.first();
251                 return fIndex;
252         }
253
254         /*
255          * @see java.text.BreakIterator#following(int)
256          */
257         public int following(int offset) {
258                 // work around too eager IAEs in standard impl
259                 if (offset == getText().getEndIndex())
260                         return DONE;
261                 
262                 int next= fIterator.following(offset);
263                 if (next == DONE)
264                         return DONE;
265                 
266                 // TODO deal with complex script word boundaries
267                 // Math.min(offset + run.length, next) does not work
268                 // since wordinstance considers _ as boundaries
269                 // seems to work fine, however
270                 Run run= consumeRun(offset);
271                 return offset + run.length;
272                 
273         }
274
275         /**
276          * Consumes a run of characters at the limits of which we introduce a break.
277          * @param offset the offset to start at
278          * @return the run that was consumed
279          */
280         private Run consumeRun(int offset) {
281                 // assert offset < length
282                 
283                 char ch= fText.charAt(offset);
284                 int length= fText.length();
285                 Run run= getRun(ch);
286                 while (run.consume(ch) && offset < length - 1) {
287                         offset++;
288                         ch= fText.charAt(offset);
289                 }
290                 
291                 return run;
292         }
293
294         /**
295          * Retunrs a run based on a character.
296          * 
297          * @param ch the character to test
298          * @return the correct character given <code>ch</code>
299          */
300         private Run getRun(char ch) {
301                 Run run;
302                 if (WHITESPACE.isValid(ch))
303                         run= WHITESPACE;
304                 else if (DELIMITER.isValid(ch))
305                         run= DELIMITER;
306                 else if (CAMELCASE.isValid(ch))
307                         run= CAMELCASE;
308                 else if (OTHER.isValid(ch))
309                         run= OTHER;
310                 else {
311                         Assert.isTrue(false);
312                         return null;
313                 }
314                 
315                 run.init();
316                 return run;
317         }
318         
319         /*
320          * @see java.text.BreakIterator#getText()
321          */
322         public CharacterIterator getText() {
323                 return fIterator.getText();
324         }
325
326         /*
327          * @see java.text.BreakIterator#isBoundary(int)
328          */
329         public boolean isBoundary(int offset) {
330         if (offset == getText().getBeginIndex())
331             return true;
332         else
333             return following(offset - 1) == offset;
334         }
335
336         /*
337          * @see java.text.BreakIterator#last()
338          */
339         public int last() {
340                 fIndex= fIterator.last();
341                 return fIndex;
342         }
343
344         /*
345          * @see java.text.BreakIterator#next()
346          */
347         public int next() {
348                 fIndex= following(fIndex);
349                 return fIndex;
350         }
351
352         /*
353          * @see java.text.BreakIterator#next(int)
354          */
355         public int next(int n) {
356                 return fIterator.next(n);
357         }
358         
359         /*
360          * @see java.text.BreakIterator#preceding(int)
361          */
362         public int preceding(int offset) {
363                 if (offset == getText().getBeginIndex())
364                         return DONE;
365                 
366                 if (isBoundary(offset - 1))
367                         return offset - 1;
368
369                 int previous= offset - 1;
370                 do {
371                         previous= fIterator.preceding(previous);
372                 } while (!isBoundary(previous));
373                 
374                 int last= DONE;
375                 while (previous < offset) {
376                         last= previous;
377                         previous= following(previous);
378                 }
379                 
380                 return last;
381         }
382
383         /*
384          * @see java.text.BreakIterator#previous()
385          */
386         public int previous() {
387                 fIndex= preceding(fIndex);
388                 return fIndex;
389         }
390
391         /*
392          * @see java.text.BreakIterator#setText(java.lang.String)
393          */
394         public void setText(String newText) {
395                 setText((CharSequence) newText);
396         }
397
398         /**
399          * Creates a break iterator given a char sequence.
400          * @param newText the new text
401          */
402         public void setText(CharSequence newText) {
403                 fText= newText;
404                 fIterator.setText(new SequenceCharacterIterator(newText));
405                 first();
406         }
407
408         /*
409          * @see java.text.BreakIterator#setText(java.text.CharacterIterator)
410          */
411         public void setText(CharacterIterator newText) {
412                 if (newText instanceof CharSequence) {
413                         fText= (CharSequence) newText;
414                         fIterator.setText(newText);
415                         first();
416                 } else {
417                         throw new UnsupportedOperationException("CharacterIterator not supported"); //$NON-NLS-1$
418                 }
419         }
420 }