import java.util.Vector;
+/**
+ * <p>An SQL Lexer. From
+ * <a href="http://www.dictionary.com/">dictionary.com</a>:
+ *
+ * <blockquote>
+ * <p><b>lexer</b>
+ *
+ * <p>/lek'sr/ n. Common hacker shorthand for 'lexical
+ * analyzer', the input-tokenizing stage in the parser for a language
+ * (the part that breaks it into word-like pieces).
+ * </blockquote>
+ *
+ * <p>Note that this class has nothing to do with the Sci-fi channel's
+ * <a href="http://www.scifi.com/lexx/">Lexx</a> TV series.
+ */
public class SQLLexx {
- private static String endline = ";"; //$NON-NLS-1$
- private static String dash = "-"; //$NON-NLS-1$
- private static String group = "/"; //$NON-NLS-1$
+ private final static char CHAR_EOL = '\n';
+ private final static char CHAR_DASH = '-';
+ private final static char CHAR_ESCAPE = '\\';
+ private final static char CHAR_SEPARATOR = ';';
+
+ private final static int CONDITION_WHITESPACE = 1;
+ private final static int CONDITION_IDENTIFIER = 2;
+ private final static int CONDITION_IDENTIFIER_INITIAL = 3;
+ private final static int CONDITION_LITERAL_SIMPLE_QUOTE = 4;
+ private final static int CONDITION_LITERAL_DOUBLE_QUOTE = 5;
+ private final static int CONDITION_NUMERIC = 6;
+ private final static int CONDITION_EOL = 7;
+
+
/**
* Parses a SQL text into tokens.
* @param text
int offset = p.getOffset();
char c = p.getNext();
// Adds END_OF_LINE token
- if (c == '\n') {
- tokens.addElement(new Token(Token.END_OF_LINE, "\n", offset, offset + 1));
+ if (c == CHAR_EOL) {
+ tokens.addElement(new Token(Token.END_OF_LINE, CHAR_EOL, offset));
}
// Adds WHITESPACE token;
- else if (Character.isWhitespace(c)) {
- StringBuffer value = new StringBuffer();
- while (Character.isWhitespace(c) && !p.isDone()) {
- value.append(c);
- c = p.getNext();
- }
- // done because of is done
- if (Character.isWhitespace(c)) {
- value.append(c);
- } else if (!p.isDone()){
- p.back();
- }
+ else if (CheckCondition( c, CONDITION_WHITESPACE))
+ {
+ StringBuffer value = AddTokenWhile(p, c, CONDITION_WHITESPACE);
tokens.addElement(new Token(Token.WHITESPACE, value.toString(), offset, offset + value.length()));
// Adds IDENTIFIER token (can be reserved SQL word or not);
- } else if (Character.isLetter(c) || c == '_' || c == '$') {
- StringBuffer value = new StringBuffer();
- while ((Character.isLetterOrDigit(c) || c == '_' || c == '$') && !p.isDone()) {
- value.append(c);
- c = p.getNext();
- }
- if ((Character.isLetterOrDigit(c) || c == '_')) {
- value.append(c);
- } else if (!p.isDone()){
- p.back();
- }
+ } else if (CheckCondition( c , CONDITION_IDENTIFIER_INITIAL))
+ {
+ StringBuffer value = AddTokenWhile(p, c, CONDITION_IDENTIFIER);
tokens.addElement(new Token(Token.IDENTIFIER, value.toString(), offset, offset + value.length()));
// Adds LITERAL token;
- } else if (c == '\'') {
- StringBuffer value = new StringBuffer();
- value.append(c);
- if (!p.isDone()) {
- c = p.getNext();
- while (c != '\'' && c != '\n' && !p.isDone()) {
- value.append(c);
- c = p.getNext();
- }
- if (c == '\'' || p.isDone()) {
- value.append(c);
- } else if (!p.isDone()){
- p.back();
- }
- }
+ } else if (CheckCondition(c, CONDITION_LITERAL_SIMPLE_QUOTE)) {
+ StringBuffer value = AddTokenUntil(p, c, CONDITION_LITERAL_SIMPLE_QUOTE);
+ tokens.addElement(new Token(Token.LITERAL, value.toString(), offset, offset + value.length()));
+ // Adds LITERAL token;
+ } else if (CheckCondition(c, CONDITION_LITERAL_DOUBLE_QUOTE)) {
+ StringBuffer value = AddTokenUntil(p, c, CONDITION_LITERAL_SIMPLE_QUOTE);
tokens.addElement(new Token(Token.LITERAL, value.toString(), offset, offset + value.length()));
- // Adds COMMENT token (or SYMBOL (dash) if only one dash);
- } else if (c == '-') {
- p.mark();
+ // Adds NUMERIC token;
+ } else if (Character.isDigit(c)) {
+ StringBuffer value = AddTokenWhile(p, c, CONDITION_NUMERIC);
+ tokens.addElement(new Token(Token.NUMERIC, value.toString(), offset, offset + value.length()));
+ // Adds COMMENT token if two dashes (or SYMBOL (dash) if only one dash);
+ } else if (c == CHAR_DASH) {
if (p.isDone()) {
- tokens.addElement(new Token(Token.SYMBOL, dash, offset, offset + 1));
+ tokens.addElement(new Token(Token.SYMBOL, new Character(CHAR_DASH).toString(), offset, offset + 1));
} else {
- char next = p.getNext();
- if (next == '-') {
- StringBuffer value = new StringBuffer("--"); //$NON-NLS-1$
- if (!p.isDone()) {
- c = p.getNext();
- while (c != '\n' && !p.isDone()) {
- value.append(c);
- c = p.getNext();
- }
- if (p.isDone()) {
- value.append(c);
- } else {
- p.back();
- }
- }
+ char next = p.peek();
+ if (next == CHAR_DASH) {
+ StringBuffer value = AddTokenUntil(p, CHAR_DASH, CONDITION_EOL);
tokens.addElement(new Token(Token.COMMENT, value.toString(), offset, offset + value.length()));
} else {
- tokens.addElement(new Token(Token.SYMBOL, dash, offset, offset + 1));
- p.reset();
+ tokens.addElement(new Token(Token.SYMBOL, new Character(CHAR_DASH).toString(), offset, offset + 1));
}
+ }
+ // Determine if the ';' is escaped or not
+ } else if (c == CHAR_ESCAPE) {
+ if (p.peek() == CHAR_SEPARATOR) {
+ p.getNext(); // We advance the pointer so the separator is not marked again
+ // We DON'T SAVE the escape character in the tokens.
+ // For correct syntax highlighting the token spans two characters (offset to offset + 2).
+ // This is so far the only case where a character is eliminated and not saved to the tokens.
+ // That means it won't be sent to the database when executed.
+ // This is to allow definitions of procedures with ';' as an end-of-sentence,
+ // not as an execution symbol for SQL.
+ tokens.addElement(new Token(Token.SYMBOL, new Character(CHAR_SEPARATOR).toString() , offset, offset + 2));
+ } else {
+ tokens.addElement(new Token(Token.SYMBOL, new Character(CHAR_ESCAPE).toString() , offset, offset + 1));
}
// Adds SEPARATOR token (;), considers the rest of the line as COMMENT token;
- } else if (c == ';') {
- tokens.addElement(new Token(Token.SEPARATOR, endline, offset, offset + 1));
- StringBuffer value = new StringBuffer();
+ } else if (c == CHAR_SEPARATOR) {
+ tokens.addElement(new Token(Token.SEPARATOR, new Character(CHAR_SEPARATOR).toString(), offset, offset + 1));
+ // The rest of the line will be a comment
if (!p.isDone()) {
- c = p.getNext();
- while (c != '\n' && !p.isDone()) {
- value.append(c);
- c = p.getNext();
- }
- if (p.isDone()) {
- value.append(c);
- } else {
- p.back();
- }
- // We add to the offset so as to skip the initial ';'
+ StringBuffer value = AddTokenUntil(p, "", CONDITION_EOL);
+ // We add to the offset so as to skip the initial ';'
offset++;
tokens.addElement(new Token(Token.COMMENT, value.toString(), offset, offset + value.length()));
}
- // Adds NUMERIC token;
- } else if (Character.isDigit(c)) {
- StringBuffer value = new StringBuffer();
- while ((Character.isDigit(c) || c == '.') && !p.isDone()) {
- value.append(c);
- c = p.getNext();
- }
- if ((Character.isDigit(c) || c == '.')) {
- value.append(c);
- } else {
- p.back();
- }
- tokens.addElement(new Token(Token.NUMERIC, value.toString(), offset, offset + value.length()));
- // Adds COMMENT token (or GROUP (slash) if only one slash);
+ // Adds COMMENT token, for several lines;
} else if (c == '/') {
- p.mark();
// If we have '/*', it's a comment till '*/' found or eof
if (p.peek() == '*') {
tokens.addElement(tokenizeComment(p, offset));
} else {
- // It's not '/*' , so it's a group token
- // BCH ??? what's this business about groups?
- // Shouldn't '/' be a divide operator?
- tokens.addElement(new Token(Token.SYMBOL, new String(new char[] {c}) /*group*/, offset, offset + 1));
- p.reset();
+ tokens.addElement(new Token(Token.SYMBOL, new String(new char[] {c}) , offset, offset + 1));
}
// Adds SYMBOL token;
} else {
return tokens;
}
/**
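+ * Searches for the end of a token, reading UNTIL the condition is true, a newline is found, or the end of the StringPointer is reached.
+ * The terminating character is also added to the StringBuffer, unless it is a newline (which is consumed but not added).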
+ * @param p
+ * @param s A string with the first character from the token, already extracted from the StringPointer
+ * @param condition
+ * @return a StringBuffer with the complete token
+ */
+ private static StringBuffer AddTokenUntil(StringPointer p, String s, int condition) {
+     // Start from the character(s) the caller already consumed.
+     StringBuffer token = new StringBuffer(s);
+     boolean finished = p.isDone();
+     while (!finished) {
+         char next = p.getNext();
+         if (next == CHAR_EOL) {
+             // A newline always terminates the token; it is consumed but not stored.
+             finished = true;
+         } else {
+             token.append(next);
+             // Stop once the terminating character was stored or the input is exhausted.
+             finished = CheckCondition(next, condition) || p.isDone();
+         }
+     }
+     return token;
+ }
+ /**
+ * Convenience overload: starts the token with a single, already extracted character.
+ * @param p The StringPointer to read from
+ * @param c The first character of the token
+ * @param condition The condition that ends the token
+ * @return a StringBuffer with the complete token
+ */
+ private static StringBuffer AddTokenUntil(StringPointer p, char c, int condition) {
+     String firstChar = String.valueOf(c);
+     return AddTokenUntil(p, firstChar, condition);
+ }
+ /**
+ * Searches for the end of a token, reading WHILE the condition is true or until the end of the StringPointer is reached.
+ * @param p The StringPointer where the original stream is
+ * @param s A string with the first character from the token, already extracted from the StringPointer
+ * @param condition The condition each further character must meet to belong to the token
+ * @return a StringBuffer with the complete token
+ */
+ private static StringBuffer AddTokenWhile(StringPointer p, String s, int condition) {
+     // Start from the character(s) the caller already consumed.
+     StringBuffer token = new StringBuffer(s);
+     while (!p.isDone()) {
+         char next = p.getNext();
+         if (!CheckCondition(next, condition)) {
+             // The character belongs to the following token: push it back and stop.
+             p.back();
+             break;
+         }
+         token.append(next);
+     }
+     return token;
+ }
+ /**
+ * Convenience overload: starts the token with a single, already extracted character.
+ * @param p The StringPointer to read from
+ * @param c The first character of the token
+ * @param condition The condition each further character must meet
+ * @return a StringBuffer with the complete token
+ */
+ private static StringBuffer AddTokenWhile(StringPointer p, char c, int condition) {
+     String firstChar = String.valueOf(c);
+     return AddTokenWhile(p, firstChar, condition);
+ }
+ /**
+ * Returns true if the character meets the condition, and false if not.
+ * New conditions should be defined in this function
+ * @param c The character to check the condition
+ * @param condition The condition to check
+ * @return true if the character meets the condition; false otherwise, or if the condition is unknown
+ */
+ private static boolean CheckCondition(char c, int condition) {
+     // Single-exit form: every known condition sets the result, unknown ones yield false.
+     boolean matches;
+     switch (condition) {
+         case CONDITION_WHITESPACE:
+             matches = Character.isWhitespace(c);
+             break;
+         case CONDITION_IDENTIFIER_INITIAL:
+             // NOTE(review): '_' is accepted inside an identifier but not as its first
+             // character - confirm this is intended.
+             matches = Character.isLetter(c) || c == '$' || c == '#';
+             break;
+         case CONDITION_IDENTIFIER:
+             matches = Character.isLetterOrDigit(c) || c == '_' || c == '$' || c == '#';
+             break;
+         case CONDITION_LITERAL_SIMPLE_QUOTE:
+             matches = (c == '\'');
+             break;
+         case CONDITION_LITERAL_DOUBLE_QUOTE:
+             matches = (c == '\"');
+             break;
+         case CONDITION_NUMERIC:
+             // A dot is accepted so decimal numbers stay in one token.
+             matches = Character.isDigit(c) || c == '.';
+             break;
+         case CONDITION_EOL:
+             matches = (c == CHAR_EOL);
+             break;
+         default:
+             matches = false;
+             break;
+     }
+     return matches;
+ }
+ /**
* @param tokens
* @param p
* @param offset