package com.quantum.sql.parser; import java.util.Vector; /** *

An SQL Lexer. From * dictionary.com: * *

*

lexer * *

/lek'sr/ n. Common hacker shorthand for 'lexical * analyzer', the input-tokenizing stage in the parser for a language * (the part that breaks it into word-like pieces). *

* *

Note that this class has nothing to do with the Sci-fi channel's * Lexx TV series. */ public class SQLLexx { private final static char CHAR_EOL = '\n'; private final static char CHAR_DASH = '-'; private final static char CHAR_ESCAPE = '\\'; private final static char CHAR_SEPARATOR = ';'; private final static int CONDITION_WHITESPACE = 1; private final static int CONDITION_IDENTIFIER = 2; private final static int CONDITION_IDENTIFIER_INITIAL = 3; private final static int CONDITION_LITERAL_SIMPLE_QUOTE = 4; private final static int CONDITION_LITERAL_DOUBLE_QUOTE = 5; private final static int CONDITION_NUMERIC = 6; private final static int CONDITION_EOL = 7; /** * Parses a SQL text into tokens. * @param text * @return a vector of Token objects. */ public static Vector parse(String text) { Vector tokens = new Vector(); StringPointer p = new StringPointer(text); try { while (!p.isDone()) { int offset = p.getOffset(); char c = p.getNext(); // Adds END_OF_LINE token if (c == CHAR_EOL) { tokens.addElement(new Token(Token.END_OF_LINE, CHAR_EOL, offset)); } // Adds WHITESPACE token; else if (CheckCondition( c, CONDITION_WHITESPACE)) { StringBuffer value = AddTokenWhile(p, c, CONDITION_WHITESPACE); tokens.addElement(new Token(Token.WHITESPACE, value.toString(), offset, offset + value.length())); // Adds IDENTIFIER token (can be reserved SQL word or not); } else if (CheckCondition( c , CONDITION_IDENTIFIER_INITIAL)) { StringBuffer value = AddTokenWhile(p, c, CONDITION_IDENTIFIER); tokens.addElement(new Token(Token.IDENTIFIER, value.toString(), offset, offset + value.length())); // Adds LITERAL token; } else if (CheckCondition(c, CONDITION_LITERAL_SIMPLE_QUOTE)) { StringBuffer value = AddTokenUntil(p, c, CONDITION_LITERAL_SIMPLE_QUOTE); tokens.addElement(new Token(Token.LITERAL, value.toString(), offset, offset + value.length())); // Adds LITERAL token; } else if (CheckCondition(c, CONDITION_LITERAL_DOUBLE_QUOTE)) { StringBuffer value = AddTokenUntil(p, c, CONDITION_LITERAL_SIMPLE_QUOTE); tokens.addElement(new Token(Token.LITERAL, value.toString(), offset, offset + value.length())); // Adds NUMERIC token; } else if (Character.isDigit(c)) { StringBuffer value = AddTokenWhile(p, c, CONDITION_NUMERIC); tokens.addElement(new Token(Token.NUMERIC, value.toString(), offset, offset + value.length())); // Adds COMMENT token if two dashes (or SYMBOL (dash) if only one dash); } else if (c == CHAR_DASH) { if (p.isDone()) { tokens.addElement(new Token(Token.SYMBOL, new Character(CHAR_DASH).toString(), offset, offset + 1)); } else { char next = p.peek(); if (next == CHAR_DASH) { StringBuffer value = AddTokenUntil(p, CHAR_DASH, CONDITION_EOL); tokens.addElement(new Token(Token.COMMENT, value.toString(), offset, offset + value.length())); } else { tokens.addElement(new Token(Token.SYMBOL, new Character(CHAR_DASH).toString(), offset, offset + 1)); } } // Determine if the ';' is escaped or not } else if (c == CHAR_ESCAPE) { if (p.peek() == CHAR_SEPARATOR) { p.getNext(); // We advance the pointer so the separator is not marked again // We DONīT SAVE the scape character in the tokens. // For correct sintax highlighting we set the offset to +2 // This is so far the only case when a character is eliminated and not saved to the tokens. // That means it wonīt be sent to the database when executed. // This is to allow definitions of procedures with ';' as an end-of-sentence, // not as an execution symbol for SQL. tokens.addElement(new Token(Token.SYMBOL, new Character(CHAR_SEPARATOR).toString() , offset, offset + 2)); } else { tokens.addElement(new Token(Token.SYMBOL, new Character(CHAR_ESCAPE).toString() , offset, offset + 1)); } // Adds SEPARATOR token (;), considers the rest of the line as COMMENT token; } else if (c == CHAR_SEPARATOR) { tokens.addElement(new Token(Token.SEPARATOR, new Character(CHAR_SEPARATOR).toString(), offset, offset + 1)); // The rest of the line will be a comment if (!p.isDone()) { StringBuffer value = AddTokenUntil(p, "", CONDITION_EOL); // We add to the offset so as to skip the initial ';' offset++; tokens.addElement(new Token(Token.COMMENT, value.toString(), offset, offset + value.length())); } // Adds COMMENT token, for several lines; } else if (c == '/') { // If we have '/*', it's a comment till '*/' found or eof if (p.peek() == '*') { tokens.addElement(tokenizeComment(p, offset)); } else { tokens.addElement(new Token(Token.SYMBOL, new String(new char[] {c}) , offset, offset + 1)); } // Adds SYMBOL token; } else { tokens.addElement(new Token(Token.SYMBOL, new String(new char[] {c}), offset, offset + 1)); } } } catch (RuntimeException e) { e.printStackTrace(); } // System.out.println("-------------------"); // for (int i = 0; i < tokens.size(); i++) { // System.out.println((Token) tokens.elementAt(i)); // } return tokens; } /** * Searchs for a token end, UNTIL the condition is true, or a newline, or the end of the StringPointer * The end character is also addedd to the StringBuffer * @param p * @param s A string with the first character from the token, already extracted from the StringPointer * @param condition * @return a StringBuffer with the complete token */ private static StringBuffer AddTokenUntil(StringPointer p, String s, int condition) { StringBuffer value = new StringBuffer(s); if (p.isDone()) return value; for(;;) { char c = p.getNext(); if (c != CHAR_EOL) value.append(c); if (CheckCondition (c, condition) || c == CHAR_EOL || p.isDone()) { break; } } return value; } private static StringBuffer AddTokenUntil(StringPointer p, char c, int condition) { return AddTokenUntil(p, new Character(c).toString(), condition); } /** * Searchs for a token end, WHILE the condition is true, or the end or the StringPointer. * @param p The StringPointer where the original stream is * @param s A string with the first character from the token, already extracted from the StringPointer * @param condition The condition to end the token * @return a StringBuffer with the complete token */ private static StringBuffer AddTokenWhile(StringPointer p, String s, int condition) { StringBuffer value = new StringBuffer(s); if (p.isDone()) return value; for(;;) { char c = p.getNext(); if (CheckCondition (c, condition)) { value.append(c); if (p.isDone()) break; } else { p.back(); break; } } return value; } private static StringBuffer AddTokenWhile(StringPointer p, char c, int condition) { return AddTokenWhile(p, new Character(c).toString(), condition); } /** * Returns true if the character meets the condition, and false if not. * New conditions should be defined in this function * @param c The character to check the condition * @param condition The condition to check * @return */ private static boolean CheckCondition(char c, int condition) { switch (condition) { case CONDITION_WHITESPACE: return Character.isWhitespace(c); case CONDITION_IDENTIFIER_INITIAL: return (Character.isLetter(c) || c == '$' || c == '#'); case CONDITION_IDENTIFIER: return (Character.isLetter(c) || Character.isDigit(c) || c == '_' || c == '$' || c == '#'); case CONDITION_LITERAL_SIMPLE_QUOTE: return (c == '\''); case CONDITION_LITERAL_DOUBLE_QUOTE: return (c == '\"'); case CONDITION_NUMERIC: return (Character.isDigit(c) || c == '.'); case CONDITION_EOL: return (c == CHAR_EOL); default: break; } return false; } /** * @param tokens * @param p * @param offset */ private static Token tokenizeComment(StringPointer p, int offset) { char c; StringBuffer value = new StringBuffer(); c = p.getNext(); value.append('/'); while (!( c == '*' && p.peek() == '/' ) && !p.isDone()) { value.append(c); c = p.getNext(); } if (!p.isDone()){ value.append(c); c = p.getNext(); value.append(c); } return new Token(Token.COMMENT, value.toString(), offset, offset + value.length()); } }