1) Fixed calculation of the new indentation method of split strings.
[phpeclipse.git] / archive / net.sourceforge.phpeclipse.quantum.sql / src / com / quantum / sql / parser / SQLLexx.java
1 package com.quantum.sql.parser;
2
3 import java.util.Vector;
4
5 /**
6  * <p>An SQL Lexer.  From 
7  * <a href="http://www.dictionary.com/">dictionary.com</a>:
8  * 
9  * <blockquote>
10  * <p><b>lexer</b>
11  *
12  * <p>/lek'sr/ n. Common hacker shorthand for 'lexical
13  * analyzer', the input-tokenizing stage in the parser for a language
14  * (the part that breaks it into word-like pieces).
15  * </blockquote>
16  * 
17  * <p>Note that this class has nothing to do with the Sci-fi channel's
18  * <a href="http://www.scifi.com/lexx/">Lexx</a> TV series.
19  */
20 public class SQLLexx {
21         private final static char CHAR_EOL = '\n';
22         private final static char CHAR_DASH = '-';
23         private final static char CHAR_ESCAPE = '\\';
24         private final static char CHAR_SEPARATOR = ';';
25         
26         private final static int CONDITION_WHITESPACE = 1;
27         private final static int CONDITION_IDENTIFIER = 2;
28         private final static int CONDITION_IDENTIFIER_INITIAL = 3;
29         private final static int CONDITION_LITERAL_SIMPLE_QUOTE = 4;
30         private final static int CONDITION_LITERAL_DOUBLE_QUOTE = 5;
31         private final static int CONDITION_NUMERIC = 6;
32         private final static int CONDITION_EOL = 7;
33         
34         
35         /**
36          * Parses a SQL text into tokens. 
37          * @param text
38          * @return a vector of Token objects.
39          */
40         public static Vector parse(String text) {
41                 Vector tokens = new Vector();
42                 StringPointer p = new StringPointer(text);
43                 try {
44                         while (!p.isDone()) {
45                                 int offset = p.getOffset();
46                                 char c = p.getNext();
47                                 // Adds END_OF_LINE token
48                                 if (c == CHAR_EOL) {
49                                         tokens.addElement(new Token(Token.END_OF_LINE, CHAR_EOL, offset));      
50                                 }
51                                 // Adds WHITESPACE token;
52                                 else if (CheckCondition( c, CONDITION_WHITESPACE)) 
53                                 {
54                                         StringBuffer value = AddTokenWhile(p, c, CONDITION_WHITESPACE);
55                                         tokens.addElement(new Token(Token.WHITESPACE, value.toString(), offset, offset + value.length()));
56                                 // Adds IDENTIFIER token (can be reserved SQL word or not);
57                                 } else if (CheckCondition( c , CONDITION_IDENTIFIER_INITIAL)) 
58                                 {
59                                         StringBuffer value = AddTokenWhile(p, c, CONDITION_IDENTIFIER);
60                                         tokens.addElement(new Token(Token.IDENTIFIER, value.toString(), offset, offset + value.length()));
61                                 // Adds LITERAL token;
62                                 } else if (CheckCondition(c, CONDITION_LITERAL_SIMPLE_QUOTE)) {
63                                         StringBuffer value = AddTokenUntil(p, c, CONDITION_LITERAL_SIMPLE_QUOTE);
64                                         tokens.addElement(new Token(Token.LITERAL, value.toString(), offset, offset + value.length()));
65                                 // Adds LITERAL token;
66                                 } else if (CheckCondition(c, CONDITION_LITERAL_DOUBLE_QUOTE)) {
67                                         StringBuffer value = AddTokenUntil(p, c, CONDITION_LITERAL_SIMPLE_QUOTE);
68                                         tokens.addElement(new Token(Token.LITERAL, value.toString(), offset, offset + value.length()));
69                                 // Adds NUMERIC token;
70                                 } else if (Character.isDigit(c)) {
71                                         StringBuffer value = AddTokenWhile(p, c, CONDITION_NUMERIC);
72                                         tokens.addElement(new Token(Token.NUMERIC, value.toString(), offset, offset + value.length()));
73                                 // Adds COMMENT token if two dashes (or SYMBOL (dash) if only one dash);
74                                 } else if (c == CHAR_DASH) {
75                                         if (p.isDone()) {
76                                                 tokens.addElement(new Token(Token.SYMBOL, new Character(CHAR_DASH).toString(), offset, offset + 1));
77                                         } else {
78                                                 char next = p.peek();
79                                                 if (next == CHAR_DASH) {
80                                                         StringBuffer value = AddTokenUntil(p, CHAR_DASH, CONDITION_EOL);
81                                                         tokens.addElement(new Token(Token.COMMENT, value.toString(), offset, offset + value.length()));
82                                                 } else {
83                                                         tokens.addElement(new Token(Token.SYMBOL, new Character(CHAR_DASH).toString(), offset, offset + 1));
84                                                 }
85                                         }       
86                  //     Determine if the ';' is escaped or not
87                                 } else if (c == CHAR_ESCAPE) {
88                                         if (p.peek() == CHAR_SEPARATOR) {
89                                                 p.getNext();    // We advance the pointer so the separator is not marked again
90                                                 // We DON´T SAVE the scape character in the tokens. 
91                                                 // For correct sintax highlighting we set the offset to +2
92                                                 // This is so far the only case when a character is eliminated and not saved to the tokens.
93                                                 // That means it won´t be sent to the database when executed.
94                                                 // This is to allow definitions of procedures with ';' as an end-of-sentence, 
95                                                 //  not as an execution symbol for SQL.
96                                                 tokens.addElement(new Token(Token.SYMBOL, new Character(CHAR_SEPARATOR).toString() , offset, offset + 2));
97                                         }       else {
98                                                 tokens.addElement(new Token(Token.SYMBOL, new Character(CHAR_ESCAPE).toString() , offset, offset + 1));
99                                         }
100                                 // Adds SEPARATOR token (;),  considers the rest of the line as COMMENT token;
101                                 } else if (c == CHAR_SEPARATOR) {
102                                         tokens.addElement(new Token(Token.SEPARATOR, new Character(CHAR_SEPARATOR).toString(), offset, offset + 1));
103                                         // The rest of the line will be a comment
104                                         if (!p.isDone()) {
105                                                 StringBuffer value = AddTokenUntil(p, "", CONDITION_EOL);
106                                                 //      We add to the offset so as to skip the initial ';'
107                                                 offset++;
108                                                 tokens.addElement(new Token(Token.COMMENT, value.toString(), offset, offset + value.length()));
109                                         }
110                                 // Adds COMMENT token, for several lines;
111                                 } else if (c == '/') {
112                                         // If we have '/*', it's a comment till '*/' found or eof
113                                         if (p.peek() == '*') {
114                                                 tokens.addElement(tokenizeComment(p, offset));
115                                         } else {
116                                                 tokens.addElement(new Token(Token.SYMBOL, new String(new char[] {c}) , offset, offset + 1));
117                                         }
118                                 // Adds SYMBOL token;
119                                 } else {
120                                         tokens.addElement(new Token(Token.SYMBOL, new String(new char[] {c}), offset, offset + 1));
121                                 }
122                         }
123                 } catch (RuntimeException e) {
124                         e.printStackTrace();
125                 }
126                 
127 //              System.out.println("-------------------");
128 //              for (int i = 0; i < tokens.size(); i++) {
129 //                      System.out.println((Token) tokens.elementAt(i));
130 //              }
131                 return tokens;
132         }
133         /**
134          * Searchs for a token end, UNTIL the condition is true, or a newline, or the end of the StringPointer
135          * The end character is also addedd to the StringBuffer
136          * @param p
137          * @param s A string with the first character from the token, already extracted from the StringPointer
138          * @param condition
139          * @return a StringBuffer with the complete token
140          */
141         private static StringBuffer AddTokenUntil(StringPointer p, String s, int condition) {
142                 StringBuffer value = new StringBuffer(s);
143                 if (p.isDone()) return value;
144                 for(;;) {
145                         char c = p.getNext();
146                         if (c != CHAR_EOL) value.append(c);
147                         if (CheckCondition (c, condition) || c == CHAR_EOL || p.isDone()) {
148                                 break; 
149                         } 
150                 }       
151                 return value;
152         }
153         private static StringBuffer AddTokenUntil(StringPointer p, char c, int condition) {
154                 return AddTokenUntil(p, new Character(c).toString(), condition);
155         }
156         /**
157          * Searchs for a token end, WHILE the condition is true, or the end or the StringPointer.
158          * @param p             The StringPointer where the original stream is
159          * @param s             A string with the first character from the token, already extracted from the StringPointer
160          * @param condition     The condition to end the token
161          * @return a StringBuffer with the complete token
162          */
163         private static StringBuffer AddTokenWhile(StringPointer p, String s, int condition) {
164                 StringBuffer value = new StringBuffer(s);
165                 if (p.isDone()) return value;
166                 for(;;) {
167                         char c = p.getNext();
168                         if (CheckCondition (c, condition)) {
169                                 value.append(c);
170                                 if (p.isDone()) break; 
171                         } 
172                         else
173                         {
174                                 p.back();
175                                 break;
176                         }
177                 }       
178                 return value;
179         }
180         private static StringBuffer AddTokenWhile(StringPointer p, char c, int condition) {
181                 return AddTokenWhile(p, new Character(c).toString(), condition);
182         }
183         /**
184          * Returns true if the character meets the condition, and false if not. 
185          * New conditions should be defined in this function
186          * @param c     The character to check the condition
187          * @param condition The condition to check
188          * @return
189          */
190         private static boolean CheckCondition(char c, int condition) {
191                 switch (condition) {
192                 case CONDITION_WHITESPACE:
193                         return Character.isWhitespace(c);
194                 case CONDITION_IDENTIFIER_INITIAL:
195                         return (Character.isLetter(c) || c == '$' || c == '#'); 
196                 case CONDITION_IDENTIFIER:
197                         return (Character.isLetter(c) || Character.isDigit(c) || c == '_' || c == '$' || c == '#'); 
198                 case CONDITION_LITERAL_SIMPLE_QUOTE:
199                         return (c == '\''); 
200                 case CONDITION_LITERAL_DOUBLE_QUOTE:
201                         return (c == '\"');
202                 case CONDITION_NUMERIC:
203                         return (Character.isDigit(c) || c == '.'); 
204                 case CONDITION_EOL:
205                         return (c == CHAR_EOL); 
206                 default:
207                         break;
208                 }
209                 return false;
210         }
211         /**
212          * @param tokens
213          * @param p
214          * @param offset
215          */
216         private static Token tokenizeComment(StringPointer p, int offset) {
217                 char c;
218                 StringBuffer value = new StringBuffer();
219                 c = p.getNext();
220                 value.append('/');
221                 while (!( c == '*' && p.peek() == '/' ) && !p.isDone()) {
222                         value.append(c);
223                         c = p.getNext();
224                 }
225                 if (!p.isDone()){
226                         value.append(c);
227                         c = p.getNext();
228                         value.append(c);        
229                 }
230                 return new Token(Token.COMMENT, value.toString(), offset, offset + value.length());
231         }
232 }