improved PHP Scanner
[phpeclipse.git] / net.sourceforge.phpeclipse / src / net / sourceforge / phpdt / internal / compiler / parser / Scanner.java
index c5d042b..0826535 100644 (file)
@@ -156,12 +156,31 @@ public class Scanner implements IScanner, ITerminalSymbols {
   public static final int SquareBracket = 1;
   public static final int CurlyBracket = 2;
   public static final int BracketKinds = 3;
+
+  public static final boolean DEBUG = false;
   public Scanner() {
     this(false, false);
   }
   public Scanner(boolean tokenizeComments, boolean tokenizeWhiteSpace) {
     this(tokenizeComments, tokenizeWhiteSpace, false);
   }
+
+  /**
+   * Returns true if ch may begin a PHP identifier: Character.isLetter(ch) or '_'.
+   * NOTE(review): PHP's own grammar also admits any non-ASCII byte here - confirm.
+   */
+  public static boolean isPHPIdentifierStart(char ch) {
+    return Character.isLetter(ch) || (ch == '_');
+  }
+
+  /**
+   * Returns true if ch may continue a PHP identifier: Character.isLetterOrDigit(ch)
+   * or '_'. NOTE(review): PHP itself also admits any non-ASCII byte here - confirm.
+   */
+  public static boolean isPHPIdentifierPart(char ch) {
+    return Character.isLetterOrDigit(ch) || (ch == '_');
+  }
+
   public final boolean atEnd() {
     // This code is not relevant if source is 
     // Only a part of the real stream input
@@ -232,7 +251,7 @@ public class Scanner implements IScanner, ITerminalSymbols {
     }
     return result;
   }
-  
+
   public final char[] getCurrentTokenSource(int startPos) {
     // Return the token REAL source (aka unicodes are precomputed)
 
@@ -256,7 +275,7 @@ public class Scanner implements IScanner, ITerminalSymbols {
     }
     return result;
   }
-  
+
   public final char[] getCurrentTokenSourceString() {
     //return the token REAL source (aka unicodes are precomputed).
     //REMOVE the two " that are at the beginning and the end.
@@ -663,7 +682,7 @@ public class Scanner implements IScanner, ITerminalSymbols {
         }
 
         currentCharacter = (char) (((c1 * 16 + c2) * 16 + c3) * 16 + c4);
-        if (!Character.isJavaIdentifierPart(currentCharacter)) {
+        if (!isPHPIdentifierPart(currentCharacter)) {
           currentPosition = temp;
           return false;
         }
@@ -684,7 +703,7 @@ public class Scanner implements IScanner, ITerminalSymbols {
         return true;
       } //-------------end unicode traitement--------------
       else {
-        if (!Character.isJavaIdentifierPart(currentCharacter)) {
+        if (!isPHPIdentifierPart(currentCharacter)) {
           currentPosition = temp;
           return false;
         }
@@ -860,12 +879,12 @@ public class Scanner implements IScanner, ITerminalSymbols {
                     int heredocStart = currentPosition;
                     int heredocLength = 0;
                     currentCharacter = source[currentPosition++];
-                    if (Character.isJavaIdentifierStart(currentCharacter)) {
+                    if (isPHPIdentifierStart(currentCharacter)) {
                       currentCharacter = source[currentPosition++];
                     } else {
                       return TokenNameERROR;
                     }
-                    while (Character.isJavaIdentifierPart(currentCharacter)) {
+                    while (isPHPIdentifierPart(currentCharacter)) {
                       currentCharacter = source[currentPosition++];
                     }
 
@@ -1047,28 +1066,28 @@ public class Scanner implements IScanner, ITerminalSymbols {
                 }
 
                 while (currentCharacter != '\'') {
-                  
+
                   /**** in PHP \r and \n are valid in string literals ****/
-//                  if ((currentCharacter == '\n')
-//                    || (currentCharacter == '\r')) {
-//                    // relocate if finding another quote fairly close: thus unicode '/u000D' will be fully consumed
-//                    for (int lookAhead = 0; lookAhead < 50; lookAhead++) {
-//                      if (currentPosition + lookAhead == source.length)
-//                        break;
-//                      if (source[currentPosition + lookAhead] == '\n')
-//                        break;
-//                      if (source[currentPosition + lookAhead] == '\"') {
-//                        currentPosition += lookAhead + 1;
-//                        break;
-//                      }
-//                    }
-//                    throw new InvalidInputException(INVALID_CHAR_IN_STRING);
-//                  }
+                  //                  if ((currentCharacter == '\n')
+                  //                    || (currentCharacter == '\r')) {
+                  //                    // relocate if finding another quote fairly close: thus unicode '/u000D' will be fully consumed
+                  //                    for (int lookAhead = 0; lookAhead < 50; lookAhead++) {
+                  //                      if (currentPosition + lookAhead == source.length)
+                  //                        break;
+                  //                      if (source[currentPosition + lookAhead] == '\n')
+                  //                        break;
+                  //                      if (source[currentPosition + lookAhead] == '\"') {
+                  //                        currentPosition += lookAhead + 1;
+                  //                        break;
+                  //                      }
+                  //                    }
+                  //                    throw new InvalidInputException(INVALID_CHAR_IN_STRING);
+                  //                  }
                   if (currentCharacter == '\\') {
                     int escapeSize = currentPosition;
                     boolean backSlashAsUnicodeInString = unicodeAsBackSlash;
                     //scanEscapeCharacter make a side effect on this value and we need the previous value few lines down this one
-                    scanEscapeCharacter();
+                    scanSingleQuotedEscapeCharacter();
                     escapeSize = currentPosition - escapeSize;
                     if (withoutUnicodePtr == 0) {
                       //buffer all the entries that have been left aside....
@@ -1149,29 +1168,28 @@ public class Scanner implements IScanner, ITerminalSymbols {
                 }
 
                 while (currentCharacter != '"') {
-                  
-                  
+
                   /**** in PHP \r and \n are valid in string literals ****/
-//                  if ((currentCharacter == '\n')
-//                    || (currentCharacter == '\r')) {
-//                    // relocate if finding another quote fairly close: thus unicode '/u000D' will be fully consumed
-//                    for (int lookAhead = 0; lookAhead < 50; lookAhead++) {
-//                      if (currentPosition + lookAhead == source.length)
-//                        break;
-//                      if (source[currentPosition + lookAhead] == '\n')
-//                        break;
-//                      if (source[currentPosition + lookAhead] == '\"') {
-//                        currentPosition += lookAhead + 1;
-//                        break;
-//                      }
-//                    }
-//                    throw new InvalidInputException(INVALID_CHAR_IN_STRING);
-//                  }
+                  //                  if ((currentCharacter == '\n')
+                  //                    || (currentCharacter == '\r')) {
+                  //                    // relocate if finding another quote fairly close: thus unicode '/u000D' will be fully consumed
+                  //                    for (int lookAhead = 0; lookAhead < 50; lookAhead++) {
+                  //                      if (currentPosition + lookAhead == source.length)
+                  //                        break;
+                  //                      if (source[currentPosition + lookAhead] == '\n')
+                  //                        break;
+                  //                      if (source[currentPosition + lookAhead] == '\"') {
+                  //                        currentPosition += lookAhead + 1;
+                  //                        break;
+                  //                      }
+                  //                    }
+                  //                    throw new InvalidInputException(INVALID_CHAR_IN_STRING);
+                  //                  }
                   if (currentCharacter == '\\') {
                     int escapeSize = currentPosition;
                     boolean backSlashAsUnicodeInString = unicodeAsBackSlash;
                     //scanEscapeCharacter make a side effect on this value and we need the previous value few lines down this one
-                    scanEscapeCharacter();
+                    scanDoubleQuotedEscapeCharacter();
                     escapeSize = currentPosition - escapeSize;
                     if (withoutUnicodePtr == 0) {
                       //buffer all the entries that have been left aside....
@@ -1238,64 +1256,7 @@ public class Scanner implements IScanner, ITerminalSymbols {
               }
               return TokenNameStringLiteral;
             case '`' :
-            try {
-              // consume next character
-              unicodeAsBackSlash = false;
-              if (((currentCharacter = source[currentPosition++]) == '\\')
-                && (source[currentPosition] == 'u')) {
-                getNextUnicodeChar();
-              } else {
-                if (withoutUnicodePtr != 0) {
-                  withoutUnicodeBuffer[++withoutUnicodePtr] =
-                    currentCharacter;
-                }
-              }
-
-              while (currentCharacter != '`') {
-                  
-                  
-                /**** in PHP \r and \n are valid in string literals ****/
-//                if ((currentCharacter == '\n')
-//                  || (currentCharacter == '\r')) {
-//                  // relocate if finding another quote fairly close: thus unicode '/u000D' will be fully consumed
-//                  for (int lookAhead = 0; lookAhead < 50; lookAhead++) {
-//                    if (currentPosition + lookAhead == source.length)
-//                      break;
-//                    if (source[currentPosition + lookAhead] == '\n')
-//                      break;
-//                    if (source[currentPosition + lookAhead] == '\"') {
-//                      currentPosition += lookAhead + 1;
-//                      break;
-//                    }
-//                  }
-//                  throw new InvalidInputException(INVALID_CHAR_IN_STRING);
-//                }
-                if (currentCharacter == '\\') {
-                  int escapeSize = currentPosition;
-                  boolean backSlashAsUnicodeInString = unicodeAsBackSlash;
-                  //scanEscapeCharacter make a side effect on this value and we need the previous value few lines down this one
-                  scanEscapeCharacter();
-                  escapeSize = currentPosition - escapeSize;
-                  if (withoutUnicodePtr == 0) {
-                    //buffer all the entries that have been left aside....
-                    withoutUnicodePtr =
-                      currentPosition - escapeSize - 1 - startPosition;
-                    System.arraycopy(
-                      source,
-                      startPosition,
-                      withoutUnicodeBuffer,
-                      1,
-                      withoutUnicodePtr);
-                    withoutUnicodeBuffer[++withoutUnicodePtr] =
-                      currentCharacter;
-                  } else { //overwrite the / in the buffer
-                    withoutUnicodeBuffer[withoutUnicodePtr] =
-                      currentCharacter;
-                    if (backSlashAsUnicodeInString) { //there are TWO \ in the stream where only one is correct
-                      withoutUnicodePtr--;
-                    }
-                  }
-                }
+              try {
                 // consume next character
                 unicodeAsBackSlash = false;
                 if (((currentCharacter = source[currentPosition++]) == '\\')
@@ -1308,38 +1269,94 @@ public class Scanner implements IScanner, ITerminalSymbols {
                   }
                 }
 
-              }
-            } catch (IndexOutOfBoundsException e) {
-              throw new InvalidInputException(UNTERMINATED_STRING);
-            } catch (InvalidInputException e) {
-              if (e.getMessage().equals(INVALID_ESCAPE)) {
-                // relocate if finding another quote fairly close: thus unicode '/u000D' will be fully consumed
-                for (int lookAhead = 0; lookAhead < 50; lookAhead++) {
-                  if (currentPosition + lookAhead == source.length)
-                    break;
-                  if (source[currentPosition + lookAhead] == '\n')
-                    break;
-                  if (source[currentPosition + lookAhead] == '`') {
-                    currentPosition += lookAhead + 1;
-                    break;
+                while (currentCharacter != '`') {
+
+                  /**** in PHP \r and \n are valid in string literals ****/
+                  //                if ((currentCharacter == '\n')
+                  //                  || (currentCharacter == '\r')) {
+                  //                  // relocate if finding another quote fairly close: thus unicode '/u000D' will be fully consumed
+                  //                  for (int lookAhead = 0; lookAhead < 50; lookAhead++) {
+                  //                    if (currentPosition + lookAhead == source.length)
+                  //                      break;
+                  //                    if (source[currentPosition + lookAhead] == '\n')
+                  //                      break;
+                  //                    if (source[currentPosition + lookAhead] == '\"') {
+                  //                      currentPosition += lookAhead + 1;
+                  //                      break;
+                  //                    }
+                  //                  }
+                  //                  throw new InvalidInputException(INVALID_CHAR_IN_STRING);
+                  //                }
+                  if (currentCharacter == '\\') {
+                    int escapeSize = currentPosition;
+                    boolean backSlashAsUnicodeInString = unicodeAsBackSlash;
+                    //scanEscapeCharacter make a side effect on this value and we need the previous value few lines down this one
+                    scanDoubleQuotedEscapeCharacter();
+                    escapeSize = currentPosition - escapeSize;
+                    if (withoutUnicodePtr == 0) {
+                      //buffer all the entries that have been left aside....
+                      withoutUnicodePtr =
+                        currentPosition - escapeSize - 1 - startPosition;
+                      System.arraycopy(
+                        source,
+                        startPosition,
+                        withoutUnicodeBuffer,
+                        1,
+                        withoutUnicodePtr);
+                      withoutUnicodeBuffer[++withoutUnicodePtr] =
+                        currentCharacter;
+                    } else { //overwrite the / in the buffer
+                      withoutUnicodeBuffer[withoutUnicodePtr] =
+                        currentCharacter;
+                      if (backSlashAsUnicodeInString) { //there are TWO \ in the stream where only one is correct
+                        withoutUnicodePtr--;
+                      }
+                    }
                   }
+                  // consume next character
+                  unicodeAsBackSlash = false;
+                  if (((currentCharacter = source[currentPosition++]) == '\\')
+                    && (source[currentPosition] == 'u')) {
+                    getNextUnicodeChar();
+                  } else {
+                    if (withoutUnicodePtr != 0) {
+                      withoutUnicodeBuffer[++withoutUnicodePtr] =
+                        currentCharacter;
+                    }
+                  }
+
                 }
+              } catch (IndexOutOfBoundsException e) {
+                throw new InvalidInputException(UNTERMINATED_STRING);
+              } catch (InvalidInputException e) {
+                if (e.getMessage().equals(INVALID_ESCAPE)) {
+                  // relocate if finding another quote fairly close: thus unicode '/u000D' will be fully consumed
+                  for (int lookAhead = 0; lookAhead < 50; lookAhead++) {
+                    if (currentPosition + lookAhead == source.length)
+                      break;
+                    if (source[currentPosition + lookAhead] == '\n')
+                      break;
+                    if (source[currentPosition + lookAhead] == '`') {
+                      currentPosition += lookAhead + 1;
+                      break;
+                    }
+                  }
 
+                }
+                throw e; // rethrow
               }
-              throw e; // rethrow
-            }
-            if (checkNonExternalizedStringLiterals) { // check for presence of NLS tags //$NON-NLS-?$ where ? is an int.
-              if (currentLine == null) {
-                currentLine = new NLSLine();
-                lines.add(currentLine);
+              if (checkNonExternalizedStringLiterals) { // check for presence of       NLS tags //$NON-NLS-?$ where ? is an int.
+                if (currentLine == null) {
+                  currentLine = new NLSLine();
+                  lines.add(currentLine);
+                }
+                currentLine.add(
+                  new StringLiteral(
+                    getCurrentTokenSourceString(),
+                    startPosition,
+                    currentPosition - 1));
               }
-              currentLine.add(
-                new StringLiteral(
-                  getCurrentTokenSourceString(),
-                  startPosition,
-                  currentPosition - 1));
-            }
-            return TokenNameStringInterpolated;
+              return TokenNameStringInterpolated;
             case '#' :
             case '/' :
               {
@@ -1568,11 +1585,15 @@ public class Scanner implements IScanner, ITerminalSymbols {
 
             default :
               if (currentCharacter == '$') {
-                if (getNextChar('{'))
+                while ( (currentCharacter = source[currentPosition++])=='$') {
+                }
+                if (currentCharacter == '{')
                   return TokenNameDOLLAR_LBRACE;
-                return scanIdentifierOrKeyword(true);
+                if (isPHPIdentifierStart(currentCharacter))
+                  return scanIdentifierOrKeyword(true);
+                return TokenNameERROR;
               }
-              if (Character.isJavaIdentifierStart(currentCharacter))
+              if (isPHPIdentifierStart(currentCharacter))
                 return scanIdentifierOrKeyword(false);
               if (Character.isDigit(currentCharacter))
                 return scanNumber(false);
@@ -1669,7 +1690,7 @@ public class Scanner implements IScanner, ITerminalSymbols {
               test = getNextChar('\\');
               if (test) {
                 try {
-                  scanEscapeCharacter();
+                  scanDoubleQuotedEscapeCharacter();
                 } catch (InvalidInputException ex) {
                 };
               } else {
@@ -1718,7 +1739,7 @@ public class Scanner implements IScanner, ITerminalSymbols {
                 }
                 if (currentCharacter == '\\') {
                   try {
-                    scanEscapeCharacter();
+                    scanDoubleQuotedEscapeCharacter();
                   } catch (InvalidInputException ex) {
                   };
                 }
@@ -1930,7 +1951,7 @@ public class Scanner implements IScanner, ITerminalSymbols {
             }
 
           default :
-            if (Character.isJavaIdentifierStart(currentCharacter)
+            if (isPHPIdentifierStart(currentCharacter)
               || currentCharacter == '$') {
               try {
                 scanIdentifierOrKeyword((currentCharacter == '$'));
@@ -2421,7 +2442,8 @@ public class Scanner implements IScanner, ITerminalSymbols {
     commentPtr = -1; // reset comment stack
   }
 
-  public final void scanEscapeCharacter() throws InvalidInputException {
+  public final void scanSingleQuotedEscapeCharacter()
+    throws InvalidInputException {
     // the string with "\\u" is a legal string of two chars \ and u
     //thus we use a direct access to the source (for regular cases).
 
@@ -2439,18 +2461,49 @@ public class Scanner implements IScanner, ITerminalSymbols {
     } else
       currentCharacter = source[currentPosition++];
     switch (currentCharacter) {
-      case 'b' :
-        currentCharacter = '\b';
+      case '\'' :
+        currentCharacter = '\'';
         break;
+      case '\\' :
+        currentCharacter = '\\';
+        break;
+      default :
+        currentCharacter = '\\';
+        currentPosition--;
+    }
+  }
+
+  public final void scanDoubleQuotedEscapeCharacter()
+    throws InvalidInputException {
+    // the string with "\\u" is a legal string of two chars \ and u
+    //thus we use a direct access to the source (for regular cases).
+
+    if (unicodeAsBackSlash) {
+      // consume next character
+      unicodeAsBackSlash = false;
+      if (((currentCharacter = source[currentPosition++]) == '\\')
+        && (source[currentPosition] == 'u')) {
+        getNextUnicodeChar();
+      } else {
+        if (withoutUnicodePtr != 0) {
+          withoutUnicodeBuffer[++withoutUnicodePtr] = currentCharacter;
+        }
+      }
+    } else
+      currentCharacter = source[currentPosition++];
+    switch (currentCharacter) {
+      //      case 'b' :
+      //        currentCharacter = '\b';
+      //        break;
       case 't' :
         currentCharacter = '\t';
         break;
       case 'n' :
         currentCharacter = '\n';
         break;
-      case 'f' :
-        currentCharacter = '\f';
-        break;
+        //      case 'f' :
+        //        currentCharacter = '\f';
+        //        break;
       case 'r' :
         currentCharacter = '\r';
         break;
@@ -2463,6 +2516,9 @@ public class Scanner implements IScanner, ITerminalSymbols {
       case '\\' :
         currentCharacter = '\\';
         break;
+      case '$' :
+        currentCharacter = '$';
+        break;
       default :
         // -----------octal escape--------------
         // OctalDigit
@@ -2522,6 +2578,7 @@ public class Scanner implements IScanner, ITerminalSymbols {
     //disptach on the second char :-)...cool....but fast !
 
     useAssertAsAnIndentifier = false;
+    
     while (getNextCharAsJavaIdentifierPart()) {
     };