Implemented action for uploading Wikipedia articles (thanks to D.Wunsch)
[phpeclipse.git] / archive / net.sourceforge.phpeclipse.quantum.sql / src / com / quantum / sql / parser / SQLLexx.java
index 42c6276..8c83886 100644 (file)
@@ -2,10 +2,36 @@ package com.quantum.sql.parser;
 
 import java.util.Vector;
 
+/**
+ * <p>An SQL Lexer.  From 
+ * <a href="http://www.dictionary.com/">dictionary.com</a>:
+ * 
+ * <blockquote>
+ * <p><b>lexer</b>
+ *
+ * <p>/lek'sr/ n. Common hacker shorthand for 'lexical
+ * analyzer', the input-tokenizing stage in the parser for a language
+ * (the part that breaks it into word-like pieces).
+ * </blockquote>
+ * 
+ * <p>Note that this class has nothing to do with the Sci-fi channel's
+ * <a href="http://www.scifi.com/lexx/">Lexx</a> TV series.
+ */
 public class SQLLexx {
-       private static String endline = ";"; //$NON-NLS-1$
-       private static String dash = "-"; //$NON-NLS-1$
-       private static String group = "/"; //$NON-NLS-1$
+       private final static char CHAR_EOL = '\n'; // newline: emitted as an END_OF_LINE token and stops AddTokenUntil scans
+       private final static char CHAR_DASH = '-'; // one dash is a SYMBOL, two dashes start a COMMENT
+       private final static char CHAR_ESCAPE = '\\'; // backslash: when followed by CHAR_SEPARATOR it escapes it
+       private final static char CHAR_SEPARATOR = ';'; // produces a SEPARATOR token unless preceded by CHAR_ESCAPE
+       
+       private final static int CONDITION_WHITESPACE = 1; // the CONDITION_* values select the test applied by CheckCondition
+       private final static int CONDITION_IDENTIFIER = 2; // characters accepted inside an identifier
+       private final static int CONDITION_IDENTIFIER_INITIAL = 3; // characters accepted as the first one of an identifier
+       private final static int CONDITION_LITERAL_SIMPLE_QUOTE = 4; // the single quote character
+       private final static int CONDITION_LITERAL_DOUBLE_QUOTE = 5; // the double quote character
+       private final static int CONDITION_NUMERIC = 6; // a digit or '.'
+       private final static int CONDITION_EOL = 7; // the CHAR_EOL character
+       
+       
        /**
         * Parses a SQL text into tokens. 
         * @param text
@@ -19,124 +45,75 @@ public class SQLLexx {
                                int offset = p.getOffset();
                                char c = p.getNext();
                                // Adds END_OF_LINE token
-                               if (c == '\n') {
-                                       tokens.addElement(new Token(Token.END_OF_LINE, "\n", offset, offset + 1));      
+                               if (c == CHAR_EOL) {
+                                       tokens.addElement(new Token(Token.END_OF_LINE, CHAR_EOL, offset));      
                                }
                                // Adds WHITESPACE token;
-                               else if (Character.isWhitespace(c)) {
-                                       StringBuffer value = new StringBuffer();
-                                       while (Character.isWhitespace(c) && !p.isDone()) {
-                                               value.append(c);
-                                               c = p.getNext();
-                                       }
-                                       // done because of is done
-                                       if (Character.isWhitespace(c)) {
-                                               value.append(c);
-                                       } else if (!p.isDone()){
-                                               p.back();
-                                       }
+                               else if (CheckCondition( c, CONDITION_WHITESPACE)) 
+                               {
+                                       StringBuffer value = AddTokenWhile(p, c, CONDITION_WHITESPACE);
                                        tokens.addElement(new Token(Token.WHITESPACE, value.toString(), offset, offset + value.length()));
                                // Adds IDENTIFIER token (can be reserved SQL word or not);
-                               } else if (Character.isLetter(c) || c == '_' || c == '$') {
-                                       StringBuffer value = new StringBuffer();
-                                       while ((Character.isLetterOrDigit(c) || c == '_'  || c == '$') && !p.isDone()) {
-                                               value.append(c);
-                                               c = p.getNext();
-                                       }
-                                       if ((Character.isLetterOrDigit(c) || c == '_')) {
-                                               value.append(c);
-                                       } else if (!p.isDone()){
-                                               p.back();
-                                       }
+                               } else if (CheckCondition( c , CONDITION_IDENTIFIER_INITIAL)) 
+                               {
+                                       StringBuffer value = AddTokenWhile(p, c, CONDITION_IDENTIFIER);
                                        tokens.addElement(new Token(Token.IDENTIFIER, value.toString(), offset, offset + value.length()));
                                // Adds LITERAL token;
-                               } else if (c == '\'') {
-                                       StringBuffer value = new StringBuffer();
-                                       value.append(c);
-                                       if (!p.isDone()) {
-                                               c = p.getNext();
-                                               while (c != '\'' && c != '\n' && !p.isDone()) {
-                                                       value.append(c);
-                                                       c = p.getNext();
-                                               }
-                                               if (c == '\'' || p.isDone()) {
-                                                       value.append(c);
-                                               } else if (!p.isDone()){
-                                                       p.back();
-                                               }
-                                       }
+                               } else if (CheckCondition(c, CONDITION_LITERAL_SIMPLE_QUOTE)) {
+                                       StringBuffer value = AddTokenUntil(p, c, CONDITION_LITERAL_SIMPLE_QUOTE);
+                                       tokens.addElement(new Token(Token.LITERAL, value.toString(), offset, offset + value.length()));
+                               // Adds LITERAL token;
+                               } else if (CheckCondition(c, CONDITION_LITERAL_DOUBLE_QUOTE)) {
+                                       StringBuffer value = AddTokenUntil(p, c, CONDITION_LITERAL_SIMPLE_QUOTE);
                                        tokens.addElement(new Token(Token.LITERAL, value.toString(), offset, offset + value.length()));
-                               // Adds COMMENT token (or SYMBOL (dash) if only one dash);
-                               } else if (c == '-') {
-                                       p.mark();
+                               // Adds NUMERIC token;
+                               } else if (Character.isDigit(c)) {
+                                       StringBuffer value = AddTokenWhile(p, c, CONDITION_NUMERIC);
+                                       tokens.addElement(new Token(Token.NUMERIC, value.toString(), offset, offset + value.length()));
+                               // Adds COMMENT token if two dashes (or SYMBOL (dash) if only one dash);
+                               } else if (c == CHAR_DASH) {
                                        if (p.isDone()) {
-                                               tokens.addElement(new Token(Token.SYMBOL, dash, offset, offset + 1));
+                                               tokens.addElement(new Token(Token.SYMBOL, new Character(CHAR_DASH).toString(), offset, offset + 1));
                                        } else {
-                                               char next = p.getNext();
-                                               if (next == '-') {
-                                                       StringBuffer value = new StringBuffer("--"); //$NON-NLS-1$
-                                                       if (!p.isDone()) {
-                                                               c = p.getNext();
-                                                               while (c != '\n' && !p.isDone()) {
-                                                                       value.append(c);
-                                                                       c = p.getNext();
-                                                               }
-                                                               if (p.isDone()) {
-                                                                       value.append(c);
-                                                               } else {
-                                                                       p.back();
-                                                               }
-                                                       }
+                                               char next = p.peek();
+                                               if (next == CHAR_DASH) {
+                                                       StringBuffer value = AddTokenUntil(p, CHAR_DASH, CONDITION_EOL);
                                                        tokens.addElement(new Token(Token.COMMENT, value.toString(), offset, offset + value.length()));
                                                } else {
-                                                       tokens.addElement(new Token(Token.SYMBOL, dash, offset, offset + 1));
-                                                       p.reset();
+                                                       tokens.addElement(new Token(Token.SYMBOL, new Character(CHAR_DASH).toString(), offset, offset + 1));
                                                }
+                                       }       
+                 //    Determine if the ';' is escaped or not
+                               } else if (c == CHAR_ESCAPE) {
+                                       if (p.peek() == CHAR_SEPARATOR) {
+                                               p.getNext();    // We advance the pointer so the separator is not marked again
+                                               // We DON'T SAVE the escape character in the tokens.
+                                               // For correct syntax highlighting the token spans both characters (offset to offset + 2).
+                                               // This is so far the only case where a character is eliminated and not saved to the tokens.
+                                               // That means it won't be sent to the database when executed.
+                                               // This allows procedure definitions to use ';' as an end-of-statement marker,
+                                               //  not as an execution symbol for SQL.
+                                               tokens.addElement(new Token(Token.SYMBOL, new Character(CHAR_SEPARATOR).toString() , offset, offset + 2));
+                                       }       else {
+                                               tokens.addElement(new Token(Token.SYMBOL, new Character(CHAR_ESCAPE).toString() , offset, offset + 1));
                                        }
                                // Adds SEPARATOR token (;),  considers the rest of the line as COMMENT token;
-                               } else if (c == ';') {
-                                       tokens.addElement(new Token(Token.SEPARATOR, endline, offset, offset + 1));
-                                       StringBuffer value = new StringBuffer();
+                               } else if (c == CHAR_SEPARATOR) {
+                                       tokens.addElement(new Token(Token.SEPARATOR, new Character(CHAR_SEPARATOR).toString(), offset, offset + 1));
+                                       // The rest of the line will be a comment
                                        if (!p.isDone()) {
-                                               c = p.getNext();
-                                               while (c != '\n' && !p.isDone()) {
-                                                       value.append(c);
-                                                       c = p.getNext();
-                                               }
-                                               if (p.isDone()) {
-                                                       value.append(c);
-                                               } else {
-                                                       p.back();
-                                               }
-                                               // We add to the offset so as to skip the initial ';'
+                                               StringBuffer value = AddTokenUntil(p, "", CONDITION_EOL);
+                                               //      We add to the offset so as to skip the initial ';'
                                                offset++;
                                                tokens.addElement(new Token(Token.COMMENT, value.toString(), offset, offset + value.length()));
                                        }
-                               // Adds NUMERIC token;
-                               } else if (Character.isDigit(c)) {
-                                       StringBuffer value = new StringBuffer();
-                                       while ((Character.isDigit(c) || c == '.') && !p.isDone()) {
-                                               value.append(c);
-                                               c = p.getNext();
-                                       }
-                                       if ((Character.isDigit(c) || c == '.')) {
-                                               value.append(c);
-                                       } else {
-                                               p.back();
-                                       }
-                                       tokens.addElement(new Token(Token.NUMERIC, value.toString(), offset, offset + value.length()));
-                               // Adds COMMENT token (or GROUP (slash) if only one slash);
+                               // Adds COMMENT token, for several lines;
                                } else if (c == '/') {
-                                       p.mark();
                                        // If we have '/*', it's a comment till '*/' found or eof
                                        if (p.peek() == '*') {
                                                tokens.addElement(tokenizeComment(p, offset));
                                        } else {
-                                               // It's not '/*' , so it's a group token
-                                               // BCH ??? what's this business about groups?  
-                                               // Shouldn't '/' be a divide operator?
-                                               tokens.addElement(new Token(Token.SYMBOL, new String(new char[] {c}) /*group*/, offset, offset + 1));
-                                               p.reset();
+                                               tokens.addElement(new Token(Token.SYMBOL, new String(new char[] {c}) , offset, offset + 1));
                                        }
                                // Adds SYMBOL token;
                                } else {
@@ -154,6 +131,84 @@ public class SQLLexx {
                return tokens;
        }
        /**
+        * Reads characters from the StringPointer UNTIL one meets the condition, a newline is read, or the pointer is done.
+        * The terminating character is appended to the StringBuffer too, except a newline, which is read but never appended.
+        * @param p The StringPointer to read from
+        * @param s A string with the first character(s) of the token, already extracted from the StringPointer (may be empty)
+        * @param condition One of the CONDITION_* constants, evaluated through CheckCondition, that ends the token
+        * @return a StringBuffer with the complete token
+        */
+       private static StringBuffer AddTokenUntil(StringPointer p, String s, int condition) {
+               StringBuffer value = new StringBuffer(s);
+               if (p.isDone()) return value; // nothing left to read: the token is just the seed string
+               for(;;) {
+                       char c = p.getNext();
+                       if (c != CHAR_EOL) value.append(c); // a newline ends the token but is not part of it
+                       if (CheckCondition (c, condition) || c == CHAR_EOL || p.isDone()) { // NOTE(review): a newline read here is not pushed back with p.back(), unlike the removed inline loops; confirm intended
+                               break; 
+                       } 
+               }       
+               return value;
+       }
+       private static StringBuffer AddTokenUntil(StringPointer p, char c, int condition) { // convenience overload: seeds the token with the single character c
+               return AddTokenUntil(p, new Character(c).toString(), condition);
+       }
+       /**
+        * Reads characters from the StringPointer WHILE they meet the condition, or until the end of the StringPointer.
+        * @param p             The StringPointer where the original stream is
+        * @param s             A string with the first character of the token, already extracted from the StringPointer
+        * @param condition     One of the CONDITION_* constants; the token continues while CheckCondition is true for it
+        * @return a StringBuffer with the complete token
+        */
+       private static StringBuffer AddTokenWhile(StringPointer p, String s, int condition) {
+               StringBuffer value = new StringBuffer(s);
+               if (p.isDone()) return value; // nothing left to read: the token is just the seed string
+               for(;;) {
+                       char c = p.getNext();
+                       if (CheckCondition (c, condition)) {
+                               value.append(c);
+                               if (p.isDone()) break; // input exhausted while still inside the token
+                       } 
+                       else
+                       {
+                               p.back(); // c does not belong to this token: hand it back to the caller
+                               break;
+                       }
+               }       
+               return value;
+       }
+       private static StringBuffer AddTokenWhile(StringPointer p, char c, int condition) { // convenience overload: seeds the token with the single character c
+               return AddTokenWhile(p, new Character(c).toString(), condition);
+       }
+       /**
+        * Returns true if the character meets the condition, and false if not (also false for an unknown condition).
+        * New conditions should be defined in this function.
+        * @param c     The character to test
+        * @param condition One of the CONDITION_* constants selecting the test to apply
+        * @return true when c satisfies the selected condition, false otherwise
+        */
+       private static boolean CheckCondition(char c, int condition) {
+               switch (condition) {
+               case CONDITION_WHITESPACE:
+                       return Character.isWhitespace(c);
+               case CONDITION_IDENTIFIER_INITIAL:
+                       return (Character.isLetter(c) || c == '$' || c == '#'); // NOTE(review): '_' is accepted by CONDITION_IDENTIFIER but not here; confirm a leading underscore should not start an identifier
+               case CONDITION_IDENTIFIER:
+                       return (Character.isLetter(c) || Character.isDigit(c) || c == '_' || c == '$' || c == '#'); 
+               case CONDITION_LITERAL_SIMPLE_QUOTE:
+                       return (c == '\''); 
+               case CONDITION_LITERAL_DOUBLE_QUOTE:
+                       return (c == '\"');
+               case CONDITION_NUMERIC:
+                       return (Character.isDigit(c) || c == '.'); // '.' is accepted anywhere, so more than one may appear in a token
+               case CONDITION_EOL:
+                       return (c == CHAR_EOL); 
+               default:
+                       break; // unknown condition: falls through to the final return false
+               }
+               return false;
+       }
+       /**
         * @param tokens
         * @param p
         * @param offset