1 package com.quantum.sql.parser;
3 import java.util.Vector;
6 * <p>An SQL Lexer. From
7 * <a href="http://www.dictionary.com/">dictionary.com</a>:
12 * <p>/lek'sr/ n. Common hacker shorthand for 'lexical
13 * analyzer', the input-tokenizing stage in the parser for a language
14 * (the part that breaks it into word-like pieces).
17 * <p>Note that this class has nothing to do with the Sci-fi channel's
18 * <a href="http://www.scifi.com/lexx/">Lexx</a> TV series.
20 public class SQLLexx {
// Characters that get dedicated handling in parse().
// End of line: produces an END_OF_LINE token and also terminates "until" scans.
21 private final static char CHAR_EOL = '\n';
// A single dash is a SYMBOL; two consecutive dashes start a COMMENT.
22 private final static char CHAR_DASH = '-';
// Backslash: when followed by the separator, the separator is emitted as a plain SYMBOL.
23 private final static char CHAR_ESCAPE = '\\';
// Statement separator: produces a SEPARATOR token.
24 private final static char CHAR_SEPARATOR = ';';
// Selectors for the character tests implemented by CheckCondition.
26 private final static int CONDITION_WHITESPACE = 1;
// Characters allowed after the first character of an identifier.
27 private final static int CONDITION_IDENTIFIER = 2;
// Characters allowed as the first character of an identifier.
28 private final static int CONDITION_IDENTIFIER_INITIAL = 3;
29 private final static int CONDITION_LITERAL_SIMPLE_QUOTE = 4;
30 private final static int CONDITION_LITERAL_DOUBLE_QUOTE = 5;
31 private final static int CONDITION_NUMERIC = 6;
32 private final static int CONDITION_EOL = 7;
36 * Parses a SQL text into tokens.
38 * @return a vector of Token objects.
40 public static Vector parse(String text) {
41 Vector tokens = new Vector();
42 StringPointer p = new StringPointer(text);
45 int offset = p.getOffset();
47 // Adds END_OF_LINE token
49 tokens.addElement(new Token(Token.END_OF_LINE, CHAR_EOL, offset));
51 // Adds WHITESPACE token;
52 else if (CheckCondition( c, CONDITION_WHITESPACE))
54 StringBuffer value = AddTokenWhile(p, c, CONDITION_WHITESPACE);
55 tokens.addElement(new Token(Token.WHITESPACE, value.toString(), offset, offset + value.length()));
56 // Adds IDENTIFIER token (can be reserved SQL word or not);
57 } else if (CheckCondition( c , CONDITION_IDENTIFIER_INITIAL))
59 StringBuffer value = AddTokenWhile(p, c, CONDITION_IDENTIFIER);
60 tokens.addElement(new Token(Token.IDENTIFIER, value.toString(), offset, offset + value.length()));
61 // Adds LITERAL token;
62 } else if (CheckCondition(c, CONDITION_LITERAL_SIMPLE_QUOTE)) {
63 StringBuffer value = AddTokenUntil(p, c, CONDITION_LITERAL_SIMPLE_QUOTE);
64 tokens.addElement(new Token(Token.LITERAL, value.toString(), offset, offset + value.length()));
65 // Adds LITERAL token;
66 } else if (CheckCondition(c, CONDITION_LITERAL_DOUBLE_QUOTE)) {
67 StringBuffer value = AddTokenUntil(p, c, CONDITION_LITERAL_SIMPLE_QUOTE);
68 tokens.addElement(new Token(Token.LITERAL, value.toString(), offset, offset + value.length()));
69 // Adds NUMERIC token;
70 } else if (Character.isDigit(c)) {
71 StringBuffer value = AddTokenWhile(p, c, CONDITION_NUMERIC);
72 tokens.addElement(new Token(Token.NUMERIC, value.toString(), offset, offset + value.length()));
73 // Adds COMMENT token if two dashes (or SYMBOL (dash) if only one dash);
74 } else if (c == CHAR_DASH) {
76 tokens.addElement(new Token(Token.SYMBOL, new Character(CHAR_DASH).toString(), offset, offset + 1));
79 if (next == CHAR_DASH) {
80 StringBuffer value = AddTokenUntil(p, CHAR_DASH, CONDITION_EOL);
81 tokens.addElement(new Token(Token.COMMENT, value.toString(), offset, offset + value.length()));
83 tokens.addElement(new Token(Token.SYMBOL, new Character(CHAR_DASH).toString(), offset, offset + 1));
86 // Determine if the ';' is escaped or not
87 } else if (c == CHAR_ESCAPE) {
88 if (p.peek() == CHAR_SEPARATOR) {
89 p.getNext(); // We advance the pointer so the separator is not marked again
90 // We DON´T SAVE the scape character in the tokens.
91 // For correct sintax highlighting we set the offset to +2
92 // This is so far the only case when a character is eliminated and not saved to the tokens.
93 // That means it won´t be sent to the database when executed.
94 // This is to allow definitions of procedures with ';' as an end-of-sentence,
95 // not as an execution symbol for SQL.
96 tokens.addElement(new Token(Token.SYMBOL, new Character(CHAR_SEPARATOR).toString() , offset, offset + 2));
98 tokens.addElement(new Token(Token.SYMBOL, new Character(CHAR_ESCAPE).toString() , offset, offset + 1));
100 // Adds SEPARATOR token (;), considers the rest of the line as COMMENT token;
101 } else if (c == CHAR_SEPARATOR) {
102 tokens.addElement(new Token(Token.SEPARATOR, new Character(CHAR_SEPARATOR).toString(), offset, offset + 1));
103 // The rest of the line will be a comment
105 StringBuffer value = AddTokenUntil(p, "", CONDITION_EOL);
106 // We add to the offset so as to skip the initial ';'
108 tokens.addElement(new Token(Token.COMMENT, value.toString(), offset, offset + value.length()));
110 // Adds COMMENT token, for several lines;
111 } else if (c == '/') {
112 // If we have '/*', it's a comment till '*/' found or eof
113 if (p.peek() == '*') {
114 tokens.addElement(tokenizeComment(p, offset));
116 tokens.addElement(new Token(Token.SYMBOL, new String(new char[] {c}) , offset, offset + 1));
118 // Adds SYMBOL token;
120 tokens.addElement(new Token(Token.SYMBOL, new String(new char[] {c}), offset, offset + 1));
123 } catch (RuntimeException e) {
127 // System.out.println("-------------------");
128 // for (int i = 0; i < tokens.size(); i++) {
129 // System.out.println((Token) tokens.elementAt(i));
134 * Searches for a token end, UNTIL the condition is true, or a newline, or the end of the StringPointer.
135 * The end character is also added to the StringBuffer (a newline is consumed but not added).
137 * @param s A string with the first character from the token, already extracted from the StringPointer
139 * @return a StringBuffer with the complete token
141 private static StringBuffer AddTokenUntil(StringPointer p, String s, int condition) {
142 StringBuffer value = new StringBuffer(s);
143 if (p.isDone()) return value;
145 char c = p.getNext();
146 if (c != CHAR_EOL) value.append(c);
147 if (CheckCondition (c, condition) || c == CHAR_EOL || p.isDone()) {
153 private static StringBuffer AddTokenUntil(StringPointer p, char c, int condition) {
154 return AddTokenUntil(p, new Character(c).toString(), condition);
157 * Searches for a token end, WHILE the condition is true, or the end of the StringPointer.
158 * @param p The StringPointer where the original stream is
159 * @param s A string with the first character from the token, already extracted from the StringPointer
160 * @param condition The condition to end the token
161 * @return a StringBuffer with the complete token
// Collects a token by reading characters from p WHILE they satisfy the given condition.
// s holds the text of the token that was already extracted from p.
163 private static StringBuffer AddTokenWhile(StringPointer p, String s, int condition) {
164 StringBuffer value = new StringBuffer(s);
// Nothing left to read: the token is just the text supplied by the caller.
165 if (p.isDone()) return value;
167 char c = p.getNext();
// NOTE(review): the statements inside and after this test are not all visible here.
// Presumably a matching character is appended to value and a non-matching one ends
// the token - confirm against the complete file.
168 if (CheckCondition (c, condition)) {
// Leave the scan as soon as the input is exhausted.
170 if (p.isDone()) break;
180 private static StringBuffer AddTokenWhile(StringPointer p, char c, int condition) {
181 return AddTokenWhile(p, new Character(c).toString(), condition);
184 * Returns true if the character meets the condition, and false if not.
185 * New conditions should be defined in this function
186 * @param c The character to check the condition
187 * @param condition The condition to check
// Tests a single character against one of the CONDITION_* selectors.
190 private static boolean CheckCondition(char c, int condition) {
192 case CONDITION_WHITESPACE:
193 return Character.isWhitespace(c);
// An identifier may start with a letter, a dollar sign or a hash sign ...
194 case CONDITION_IDENTIFIER_INITIAL:
195 return (Character.isLetter(c) || c == '$' || c == '#');
// ... and may continue with those characters plus digits and underscores.
196 case CONDITION_IDENTIFIER:
197 return (Character.isLetter(c) || Character.isDigit(c) || c == '_' || c == '$' || c == '#');
// NOTE(review): the return statements of the two quote conditions are not visible here;
// presumably they compare c with the single-quote and the double-quote character - confirm.
198 case CONDITION_LITERAL_SIMPLE_QUOTE:
200 case CONDITION_LITERAL_DOUBLE_QUOTE:
// A dot counts as numeric, so parse() keeps it inside the NUMERIC token.
202 case CONDITION_NUMERIC:
203 return (Character.isDigit(c) || c == '.');
// True only for the end-of-line character.
205 return (c == CHAR_EOL);
// Builds the COMMENT token for a comment opened by a slash followed by an asterisk.
// parse() calls this right after reading the slash, passing the offset of that slash.
216 private static Token tokenizeComment(StringPointer p, int offset) {
// Holds the text from which the token is built.
218 StringBuffer value = new StringBuffer();
// Keep scanning until the current character is an asterisk with a slash next, or the input ends.
// NOTE(review): the declaration of c and the loop body are not visible here.
221 while (!( c == '*' && p.peek() == '/' ) && !p.isDone()) {
// The token starts at the supplied offset; its end is derived from the length of the collected text.
230 return new Token(Token.COMMENT, value.toString(), offset, offset + value.length());