zoukankan      html  css  js  c++  java
  • SQL分词器1.10版

    处理SQL及分词效果:

    select * from ( select rownum as rn,tb1.stuid,tb1.summary from ( select stuid,sum(score) as summary from gk_score group by stuid order by summary desc ) tb1 order by tb1.summary desc ) tb2 where rn<11
    Index                Type No              Text                 Type Desc            
    ------------------------------------------------------------------------------------
    0                    1                    select               KW:select            
    1                    2                    *                    Text                 
    2                    4                    from                 KW:from              
    3                    18                   (                    (                    
    4                    1                    select               KW:select            
    5                    2                    rownum               Text                 
    6                    13                   as                   KW:as                
    7                    2                    rn                   Text                 
    8                    3                    ,                    Comma                
    9                    2                    tb1.stuid            Text                 
    10                   3                    ,                    Comma                
    11                   2                    tb1.summary          Text                 
    12                   4                    from                 KW:from              
    13                   18                   (                    (                    
    14                   1                    select               KW:select            
    15                   2                    stuid                Text                 
    16                   3                    ,                    Comma                
    17                   2                    sum                  Text                 
    18                   18                   (                    (                    
    19                   2                    score                Text                 
    20                   19                   )                    )                    
    21                   13                   as                   KW:as                
    22                   2                    summary              Text                 
    23                   4                    from                 KW:from              
    24                   2                    gk_score             Text                 
    25                   14                   group                KW:group             
    26                   10                   by                   KW:by                
    27                   2                    stuid                Text                 
    28                   9                    order                KW:order             
    29                   10                   by                   KW:by                
    30                   2                    summary              Text                 
    31                   11                   desc                 KW:asc               
    32                   19                   )                    )                    
    33                   2                    tb1                  Text                 
    34                   9                    order                KW:order             
    35                   10                   by                   KW:by                
    36                   2                    tb1.summary          Text                 
    37                   11                   desc                 KW:asc               
    38                   19                   )                    )                    
    39                   2                    tb2                  Text                 
    40                   5                    where                KW:where             
    41                   2                    rn                   Text                 
    42                   16                   <                    <                    
    43                   2                    11                   Text                 

    程序:

    package com.heyang.easysql.lex10;
    
    import java.io.BufferedReader;
    import java.io.FileInputStream;
    import java.io.InputStreamReader;
    import java.util.ArrayList;
    import java.util.Collections;
    import java.util.List;
    
    class Token{
        static final int TYPE_SELECT=1;
        static final int TYPE_TEXT=2;
        static final int TYPE_COMMA=3;
        static final int TYPE_FROM=4;
        static final int TYPE_WHERE=5;
        static final int TYPE_AND=6;
        static final int TYPE_EQUAL=7;
        static final int TYPE_OR=8;
        static final int TYPE_ORDER=9;
        static final int TYPE_BY=10;
        static final int TYPE_ASC=11;
        static final int TYPE_DESC=12;
        static final int TYPE_AS=13;
        static final int TYPE_GROUP=14;
        static final int TYPE_HAVING=15;
        static final int TYPE_LESSTHAN=16;
        static final int TYPE_GREATERTHAN=17;
        static final int TYPE_OPEN_PARENTHESIS=18;
        static final int TYPE_CLOSE_PARENTHESIS=19;
        static final int TYPE_CONNECT=20;
        static final int TYPE_LESSTHAN_OR_EQUAL=21;
        static final int TYPE_GREATERTHAN_OR_EQUAL=22;
        static final int TYPE_LESSTHAN_OR_GREATERTHAN=23;
        static final int TYPE_CASE=24;
        static final int TYPE_WHEN=25;
        static final int TYPE_THEN=26;
        static final int TYPE_ELSE=27;
        static final int TYPE_END=28;
        static final int TYPE_IS=29;
        static final int TYPE_NULL=30;
        static final int TYPE_TRUE=31;
        static final int TYPE_FALSE=32;
        static final int TYPE_PLUS=33;
        static final int TYPE_MINUS=34;
        //static final int TYPE_MULTI=35;
        static final int TYPE_DEVIDE=36;
        static final int TYPE_DISTINCT=37;
        static final int TYPE_OVER=38;
        static final int TYPE_STRING_CONCAT=39;
        static final int TYPE_ON=40;
        static final int TYPE_JOIN=41;
        static final int TYPE_INNER=42;
        static final int TYPE_LEFT=43;
        static final int TYPE_RIGHT=44;
        static final int TYPE_OUTER=45;
        static final int TYPE_FULL=46;
        static final int TYPE_WITHIN=47;
        static final int TYPE_PARTITION=48;
        
        int type;
        String text;
        
        public Token(char c,int type) {
            this.text=String.valueOf(c);
            this.type=type;
        }
        
        public Token(String word,int type) {
            this.text=word;
            this.type=type;
        }
        
        public String getTypeStr() {
            if(type==TYPE_SELECT) {
                return "KW:select";
            }else if(type==TYPE_FROM) {
                return "KW:from";
            }else if(type==TYPE_COMMA) {
                return "Comma";
            }else if(type==TYPE_TEXT) {
                return "Text";
            }else if(type==TYPE_WHERE) {
                return "KW:where";
            }else if(type==TYPE_AND) {
                return "KW:and";
            }else if(type==TYPE_EQUAL) {
                return "=";
            }else if(type==TYPE_OR) {
                return "KW:or";
            }else if(type==TYPE_ORDER) {
                return "KW:order";
            }else if(type==TYPE_BY) {
                return "KW:by";
            }else if(type==TYPE_ASC) {
                return "KW:asc";
            }else if(type==TYPE_DESC) {
                return "KW:desc";
            }else if(type==TYPE_AS) {
                return "KW:as";
            }else if(type==TYPE_GROUP) {
                return "KW:group";
            }else if(type==TYPE_HAVING) {
                return "KW:having";
            }else if(type==TYPE_LESSTHAN) {
                return "<";
            }else if(type==TYPE_GREATERTHAN) {
                return ">";
            }else if(type==TYPE_OPEN_PARENTHESIS) {
                return "(";
            }else if(type==TYPE_CLOSE_PARENTHESIS) {
                return ")";
            }else if(type==TYPE_CONNECT) {
                return "KW:connect";
            }else if(type==TYPE_LESSTHAN_OR_EQUAL) {
                return "<=";
            }else if(type==TYPE_GREATERTHAN_OR_EQUAL) {
                return ">=";
            }else if(type==TYPE_LESSTHAN_OR_GREATERTHAN) {
                return "<>";
            }else if(type==TYPE_CASE) {
                return "KW:case";
            }else if(type==TYPE_WHEN) {
                return "KW:when";
            }else if(type==TYPE_THEN) {
                return "KW:then";
            }else if(type==TYPE_ELSE) {
                return "KW:else";
            }else if(type==TYPE_END) {
                return "KW:end";
            }else if(type==TYPE_IS) {
                return "KW:is";
            }else if(type==TYPE_NULL) {
                return "KW:null";
            }else if(type==TYPE_TRUE) {
                return "KW:true";
            }else if(type==TYPE_FALSE) {
                return "KW:false";
            }else if(type==TYPE_PLUS) {
                return "+";
            }else if(type==TYPE_MINUS) {
                return "-";
            }else if(type==TYPE_DEVIDE) {
                return "/";
            }else if(type==TYPE_DISTINCT) {
                return "KW:distinct";
            }else if(type==TYPE_OVER) {
                return "KW:over";
            }else if(type==TYPE_STRING_CONCAT) {
                return "||";
            }else if(type==TYPE_ON) {
                return "KW:on";
            }else if(type==TYPE_JOIN) {
                return "KW:join";
            }else if(type==TYPE_INNER) {
                return "KW:inner";
            }else if(type==TYPE_LEFT) {
                return "KW:left";
            }else if(type==TYPE_RIGHT) {
                return "KW:right";
            }else if(type==TYPE_OUTER) {
                return "KW:outer";
            }else if(type==TYPE_FULL) {
                return "KW:full";
            }else if(type==TYPE_WITHIN) {
                return "KW:within";
            }else if(type==TYPE_PARTITION) {
                return "KW:partition";
            }
            
            return null;
        }
    }
    
    public class Lexer {
        private List<Token> tokenList;
        
        public Lexer(String inputSql) {
            String sql=pretreat(inputSql);
            String swallowed="";
            
            tokenList=new ArrayList<Token>();
            for(int i=0;i<sql.length();i++){
                char c=sql.charAt(i);
                
                if(Character.isWhitespace(c)){
                    addTextToList(swallowed);
                    swallowed="";
                }else if(c==','){
                    addTextToList(swallowed);
                    swallowed="";
                    tokenList.add(new Token(c,Token.TYPE_COMMA));
                }else if(c=='='){
                    addTextToList(swallowed);
                    swallowed="";
                    tokenList.add(new Token(c,Token.TYPE_EQUAL));
                }else if(c=='<'){
                    int next=i+1;
                    if(next<sql.length() && sql.charAt(next)=='=') {
                        addTextToList(swallowed);
                        swallowed="";
                        tokenList.add(new Token("<=",Token.TYPE_LESSTHAN_OR_EQUAL));
                        i++;
                    }else if(next<sql.length() && sql.charAt(next)=='>') {
                        addTextToList(swallowed);
                        swallowed="";
                        tokenList.add(new Token("<>",Token.TYPE_LESSTHAN_OR_GREATERTHAN));
                        i++;
                    }else {
                        addTextToList(swallowed);
                        swallowed="";
                        tokenList.add(new Token(c,Token.TYPE_LESSTHAN));
                    }
                }else if(c=='>'){
                    int next=i+1;
                    if(next<sql.length() && sql.charAt(next)=='=') {
                        addTextToList(swallowed);
                        swallowed="";
                        tokenList.add(new Token(">=",Token.TYPE_GREATERTHAN_OR_EQUAL));
                        i++;
                    }else {
                        addTextToList(swallowed);
                        swallowed="";
                        tokenList.add(new Token(c,Token.TYPE_GREATERTHAN));
                    }
                }else if(c=='|'){
                    int next=i+1;
                    if(next<sql.length() && sql.charAt(next)=='|') {
                        addTextToList(swallowed);
                        swallowed="";
                        tokenList.add(new Token("||",Token.TYPE_STRING_CONCAT));
                        i++;
                    }
                }else if(c=='('){
                    addTextToList(swallowed);
                    swallowed="";
                    tokenList.add(new Token(c,Token.TYPE_OPEN_PARENTHESIS));
                }else if(c==')'){
                    addTextToList(swallowed);
                    swallowed="";
                    tokenList.add(new Token(c,Token.TYPE_CLOSE_PARENTHESIS));
                }else if(c=='+'){
                    addTextToList(swallowed);
                    swallowed="";
                    tokenList.add(new Token(c,Token.TYPE_PLUS));
                }else if(c=='-'){
                    addTextToList(swallowed);
                    swallowed="";
                    tokenList.add(new Token(c,Token.TYPE_MINUS));
                }else if(c=='/'){
                    addTextToList(swallowed);
                    swallowed="";
                    tokenList.add(new Token(c,Token.TYPE_DEVIDE));
                }else {
                    swallowed+=c;
                }
            }
        }
        
        private int findTypeByText(String text) {
            Object[][] arr= {
                                {"select", Token.TYPE_SELECT},
                                {"from",   Token.TYPE_FROM},
                                {"where",  Token.TYPE_WHERE},
                                {"and",    Token.TYPE_AND},
                                {"or",     Token.TYPE_OR},
                                {"order",  Token.TYPE_ORDER},
                                {"by",     Token.TYPE_BY},
                                {"asc",    Token.TYPE_ASC},
                                {"desc",   Token.TYPE_ASC},
                                {"asc",    Token.TYPE_DESC},
                                {"as",     Token.TYPE_AS},
                                {"group",  Token.TYPE_GROUP},
                                {"having", Token.TYPE_HAVING},
                                {"connect", Token.TYPE_CONNECT},
                                {"case",   Token.TYPE_CASE},
                                {"when",   Token.TYPE_WHEN},
                                {"then",   Token.TYPE_THEN},
                                {"else",   Token.TYPE_ELSE},
                                {"end",   Token.TYPE_END},
                                {"is",   Token.TYPE_IS},
                                {"null",   Token.TYPE_NULL},
                                {"true",   Token.TYPE_TRUE},
                                {"false",   Token.TYPE_FALSE},
                                {"distinct",   Token.TYPE_DISTINCT},
                                {"over",   Token.TYPE_OVER},
                                {"on",   Token.TYPE_ON},
                                {"join",   Token.TYPE_JOIN},
                                {"inner",   Token.TYPE_INNER},
                                {"left",   Token.TYPE_LEFT},
                                {"right",   Token.TYPE_RIGHT},
                                {"outer",   Token.TYPE_OUTER},
                                {"full",   Token.TYPE_FULL},
                                {"within",   Token.TYPE_WITHIN},
                                {"partition",   Token.TYPE_PARTITION},
                            };
            
            for(Object[] arrInner:arr) {
                String keyword=String.valueOf(arrInner[0]);
                if(keyword.equalsIgnoreCase(text)) {
                    return Integer.parseInt(arrInner[1].toString());
                }
            }
            
            return Token.TYPE_TEXT;
        }
        
        private void addTextToList(String text) {
            int type=findTypeByText(text);
            addToken2List(text,type);
        }
        
        private void addToken2List(String text,int type) {
            if(text.trim().length()>0) {
                tokenList.add(new Token(text,type));
            }
        }
        
        public void printTokenList() {
            final String continuousStar = createRepeatedStr("-", 84);
            final String layout = "%-20s %-20s %-20s %-20s %s";
            StringBuilder sb = new StringBuilder();
    
            sb.append(String.format(layout, "Index", "Type No","Text","Type Desc","
    "));
            sb.append(continuousStar + "
    ");
            int index=0;
            for(Token token:tokenList) {
                sb.append(String.format(layout, String.valueOf(index),String.valueOf(token.type), token.text,token.getTypeStr(),"
    "));
                index++;
            }
            
            System.out.println(sb.toString());
        }
        
        private static String createRepeatedStr(String seed, int n) {
            return String.join("", Collections.nCopies(n, seed));
        }
        
        private String pretreat(String raw) {
            return raw.trim()+" ";
        }
        
        public static void main(String[] args)  throws Exception{
            String sql=removeExtraSpace(readSqlFromFile("c:\temp\12.sql"));
            System.out.println(sql);
            new Lexer(sql).printTokenList();;
        }
        
        private static String readSqlFromFile(String filePath) throws Exception{
            StringBuilder sb=new StringBuilder();
            BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(filePath), "UTF-8"));  
            String line = null;  
            while( ( line = br.readLine() ) != null ) {
                sb.append(line);
            }
            br.close();  
            return sb.toString();
        }
        
        private static String removeExtraSpace(String raw) {
            return raw.replaceAll("\s{2,}", " ");
        }
    }

    --2020年5月13日 16点07分--

  • 相关阅读:
    数据库基本概念(一)
    Java语言概述
    Linx
    小菜鸟之Cisco
    小菜鸟之网络基础1
    小菜鸟之JAVA输入输出
    小菜鸟之java JDBC编程
    小菜鸟之JAVA面试题库1
    小菜鸟之java异常
    小菜鸟之正则表达式
  • 原文地址:https://www.cnblogs.com/heyang78/p/12881770.html
Copyright © 2011-2022 走看看