zoukankan      html  css  js  c++  java
  • 编译impala、拓展impala语法解析模块

    以前也编译过,但是每次编译都忘记怎么做,然后都得重新找需要下载的文件。

    编译文件:buildall.sh

    如果想只编译前端可以这样运行:

    buildall.sh -fe_only
    

    编译时会去S3下载一些文件,由于服务器在国外、下载很慢,所以可以先在本地通过代理(ss)把文件下载好,再上传到编译服务器上

    那么会下载哪些东西呢?
    编辑bin/bootstrap_toolchain.py
    找到下面这几句话

    def wget_and_unpack_package(download_path, file_name, destination, wget_no_clobber):
      print "URL {0}".format(download_path)
      print "Downloading {0} to {1}".format(file_name, destination)
      # --no-clobber avoids downloading the file if a file with the name already exists
    
      sh.wget(download_path, directory_prefix=destination, no_clobber=wget_no_clobber)
      print "Extracting {0}".format(file_name)
      sh.tar(z=True, x=True, f=os.path.join(destination, file_name), directory=destination)
      sh.rm(os.path.join(destination, file_name))
    

    把后面4行注释掉,就不会去真正下载了:

    def wget_and_unpack_package(download_path, file_name, destination, wget_no_clobber):
      print "URL {0}".format(download_path)
      print "Downloading {0} to {1}".format(file_name, destination)
      # --no-clobber avoids downloading the file if a file with the name already exists
      """
      sh.wget(download_path, directory_prefix=destination, no_clobber=wget_no_clobber)
      print "Extracting {0}".format(file_name)
      sh.tar(z=True, x=True, f=os.path.join(destination, file_name), directory=destination)
      sh.rm(os.path.join(destination, file_name))
      """
    

    然后找到这段话:

    def bootstrap(toolchain_root, packages):
      """Downloads and unpacks each package in the list `packages` into `toolchain_root` if it
      doesn't exist already.
      """
      if not try_get_platform_release_label():
        check_custom_toolchain(toolchain_root, packages)
        return
    
      # Detect the compiler
      compiler = "gcc-{0}".format(os.environ["IMPALA_GCC_VERSION"])
    
      for p in packages:
        pkg_name, pkg_version = unpack_name_and_version(p)
        if check_for_existing_package(toolchain_root, pkg_name, pkg_version, compiler):
          continue
        if pkg_name != "kudu" or os.environ["KUDU_IS_SUPPORTED"] == "true":
          download_package(toolchain_root, pkg_name, pkg_version, compiler)
        else:
          build_kudu_stub(toolchain_root, pkg_version, compiler)
        write_version_file(toolchain_root, pkg_name, pkg_version, compiler,
            get_platform_release_label())
    

    把最后一句话注释掉:

    def bootstrap(toolchain_root, packages):
      """Downloads and unpacks each package in the list `packages` into `toolchain_root` if it
      doesn't exist already.
      """
      if not try_get_platform_release_label():
        check_custom_toolchain(toolchain_root, packages)
        return
    
      # Detect the compiler
      compiler = "gcc-{0}".format(os.environ["IMPALA_GCC_VERSION"])
    
      for p in packages:
        pkg_name, pkg_version = unpack_name_and_version(p)
        if check_for_existing_package(toolchain_root, pkg_name, pkg_version, compiler):
          continue
        if pkg_name != "kudu" or os.environ["KUDU_IS_SUPPORTED"] == "true":
          download_package(toolchain_root, pkg_name, pkg_version, compiler)
        else:
          build_kudu_stub(toolchain_root, pkg_version, compiler)
        """
        write_version_file(toolchain_root, pkg_name, pkg_version, compiler,
            get_platform_release_label())
        """
    

    运行buildall.sh后屏幕就会打印出需要下载的文件,在本地下载好后上传到toolchain文件夹就行。上传结束后再把刚才注释掉的代码恢复就好

    加速编译前端

    运行下面的命令,去掉测试,只编译前端代码

    ./buildall.sh -skiptests -fe_only
    

    上面命令运行成功后,找到infra/python/deps/pip_download.py的下段代码

    def download_package(pkg_name, pkg_version):
      '''Download the required package. Sometimes the download can be flaky, so we use the
      retry decorator.'''
      pkg_type = 'sdist' # Don't download wheel archives for now
      # This JSON endpoint is not provided by PyPI mirrors so we always need to get this
      # from pypi.python.org.
      pkg_info = json.loads(urlopen('https://pypi.python.org/pypi/%s/json' % pkg_name).read())
    
      downloader = URLopener()
      for pkg in pkg_info['releases'][pkg_version]:
        if pkg['packagetype'] == pkg_type:
          filename = pkg['filename']
          expected_md5 = pkg['md5_digest']
          if os.path.isfile(filename) and check_md5sum(filename, expected_md5):
            print "File with matching md5sum already exists, skipping %s" % filename
            return True
          pkg_url = "{0}/packages/{1}".format(PYPI_MIRROR, pkg['path'])
          print "Downloading %s from %s" % (filename, pkg_url)
          downloader.retrieve(pkg_url, filename)
          actual_md5 = md5(open(filename).read()).hexdigest()
          if check_md5sum(filename, expected_md5):
            return True
          else:
            print "MD5 mismatch in file %s." % filename
            return False
      print "Could not find archive to download for %s %s %s" % (
          pkg_name, pkg_version, pkg_type)
      sys.exit(1)
    

    这段代码会打开url链接下载第三方软件,然后检查md5值,非常慢,所以把整个函数体注释掉,直接返回True:

    def download_package(pkg_name, pkg_version):
      return True
      '''Download the required package. Sometimes the download can be flaky, so we use the
      retry decorator.'''
      """
      pkg_type = 'sdist' # Don't download wheel archives for now
      # This JSON endpoint is not provided by PyPI mirrors so we always need to get this
      # from pypi.python.org.
      pkg_info = json.loads(urlopen('https://pypi.python.org/pypi/%s/json' % pkg_name).read())
    
      downloader = URLopener()
      for pkg in pkg_info['releases'][pkg_version]:
        if pkg['packagetype'] == pkg_type:
          filename = pkg['filename']
          expected_md5 = pkg['md5_digest']
          if os.path.isfile(filename) and check_md5sum(filename, expected_md5):
            print "File with matching md5sum already exists, skipping %s" % filename
            return True
          pkg_url = "{0}/packages/{1}".format(PYPI_MIRROR, pkg['path'])
          print "Downloading %s from %s" % (filename, pkg_url)
          downloader.retrieve(pkg_url, filename)
          actual_md5 = md5(open(filename).read()).hexdigest()
          if check_md5sum(filename, expected_md5):
            return True
          else:
            print "MD5 mismatch in file %s." % filename
            return False
      print "Could not find archive to download for %s %s %s" % (
          pkg_name, pkg_version, pkg_type)
      sys.exit(1)
      """
    

    修改前端词法、语法解析源码

    impala使用了jflex做词法解析,java_cup去做语法解析。
    java_cup有个java_cup.runtime.Symbol类用来表示解析到的每个词,其中left属性代表词的行号,right属性代表词的列号,但是用行列来代表词在sql中的位置很不方便,我想要修改成可以获取当前词的开始、末尾在字符串的下标。
    因此编辑fe/src/main/jflex/sql-scanner.flex文件,增加一个%char指令,让jflex把当前词的字符偏移量记录到变量yychar中
    然后把newToken改成这样:

    /**
     * Builds the token for the text the scanner has just matched.
     *
     * <p>The symbol receives the 1-based line and column ({@code yyline + 1},
     * {@code yycolumn + 1}), the caller's value, the matched text, and two
     * character offsets: {@code yychar} as the start and
     * {@code yychar + matched length} as the end.
     *
     * @param id    terminal id of the token
     * @param value semantic value to attach to the token
     * @return the newly created symbol
     */
    private ExtendSymbol newToken(int id, Object value) {
        final String matched = yytext();
        final int begin = this.yychar;
        final int finish = this.yychar + matched.length();
        return new ExtendSymbol(id, yyline+1, yycolumn+1, value, begin, finish, matched);
    }
    

    这样就通过SqlScanner拿到当前词在Reader中的位置了,另外我们希望Symbol还能提供位置信息,所以增加一个子类:

    package java_cup.runtime;
    
    import java_cup.runtime.Symbol;
    
    /**
     * A {@link Symbol} that additionally carries a start offset, an end offset and
     * the matched text.
     *
     * <p>Constructors that are not given offsets leave {@code start} and
     * {@code end} at {@code -1} and {@code text} at {@code null}.
     */
    public class ExtendSymbol extends Symbol {
        public int start = -1;
        public int end = -1;
        public String text;

        /** Package-private: forwards the symbol number and state to {@link Symbol}. */
        ExtendSymbol(int sym_num, int state) {
            super(sym_num, state);
        }

        /** Creates a symbol with only a symbol number; the state passed on is {@code -1}. */
        public ExtendSymbol(int sym_num) {
            this(sym_num, -1);
        }

        /** Creates a symbol with a value but no position information at all. */
        public ExtendSymbol(int id, Object o) {
            this(id, -1, -1, o, -1, -1, null);
        }

        /** Creates a symbol with left/right positions, no value and no offsets. */
        public ExtendSymbol(int id, int left, int right) {
            this(id, left, right, null, -1, -1, null);
        }

        /** Creates a symbol with left/right positions and a value, but no offsets. */
        public ExtendSymbol(int id, int left, int right, Object value) {
            this(id, left, right, value, -1, -1, null);
        }

        /**
         * Creates a value-less symbol spanning two existing symbols: {@code left}
         * and {@code start} are taken from the first one, {@code right} and
         * {@code end} from the second one.
         */
        public ExtendSymbol(int id, ExtendSymbol left, ExtendSymbol right) {
            this(id, left.left, right.right, null, left.start, right.end, null);
        }

        /**
         * Same as the spanning constructor without a value, but also attaches
         * {@code value} to the new symbol.
         */
        public ExtendSymbol(int id, ExtendSymbol left, ExtendSymbol right, Object value) {
            this(id, left.left, right.right, value, left.start, right.end, null);
        }

        /**
         * Canonical constructor: every convenience constructor that records a
         * position or value ends up here.
         */
        public ExtendSymbol(int id, int left, int right, Object value,
                            int start, int end, String text) {
            super(id, left, right, value);
            this.text = text;
            this.end = end;
            this.start = start;
        }
    }
    
    

    增加一个符号工厂类:

    package java_cup.runtime;
    
    import java_cup.runtime.Symbol;
    import java_cup.runtime.SymbolFactory;
    
    /**
     * {@link SymbolFactory} that produces {@link ExtendSymbol} instances.
     *
     * <p>The {@code name} argument of every method is ignored. The methods that
     * receive neighbouring symbols cast them to {@link ExtendSymbol}, so they
     * throw {@link ClassCastException} if handed any other {@link Symbol} type.
     */
    public class ExtendSymbolFactory implements SymbolFactory {
        @Override
        public ExtendSymbol newSymbol(String name, int id, Symbol left, Symbol right, Object value) {
            final ExtendSymbol first = (ExtendSymbol) left;
            final ExtendSymbol last = (ExtendSymbol) right;
            return new ExtendSymbol(id, first, last, value);
        }

        @Override
        public ExtendSymbol newSymbol(String name, int id, Symbol left, Symbol right) {
            final ExtendSymbol first = (ExtendSymbol) left;
            final ExtendSymbol last = (ExtendSymbol) right;
            return new ExtendSymbol(id, first, last);
        }

        @Override
        public ExtendSymbol newSymbol(String name, int id, Object o) {
            final ExtendSymbol created = new ExtendSymbol(id, o);
            return created;
        }

        @Override
        public ExtendSymbol newSymbol(String name, int id) {
            final ExtendSymbol created = new ExtendSymbol(id);
            return created;
        }

        @Override
        public ExtendSymbol startSymbol(String name, int id, int state) {
            final ExtendSymbol created = new ExtendSymbol(id, state);
            return created;
        }
    }
    
    

    为语法块增加位置信息就比较复杂了,主要是在org/apache/impala/analysis包中增加一个带位置信息和子语法块的基类:

    package org.apache.impala.analysis;
    
    import java.util.List;
    
    public class SyntaxBlock {
        public int startPosition = -1;
        public int endPosition = -1;
        public List<SyntaxBlock> subBlocks;
    
        public SyntaxBlock() {
        }
    
        public SyntaxBlock(int startPosition, int endPosition) {
            this.startPosition = startPosition;
            this.endPosition = endPosition;
        }
    
        public SyntaxBlock(int startPosition, int endPosition, List<SyntaxBlock> subBlocks) {
            this.startPosition = startPosition;
            this.endPosition = endPosition;
            this.subBlocks = subBlocks;
        }
    }
    

    然后加上一个子类ObjectSyntaxBlock,用来存放String、Object、HashMap、ArrayList、Pair这些类型的语法块

    package org.apache.impala.analysis;
    
    import java.util.List;
    
    /**
     * A {@link SyntaxBlock} that wraps an arbitrary value of type {@code T}
     * alongside the block's positions and children.
     *
     * @param <T> type of the wrapped value
     */
    public class ObjectSyntaxBlock<T> extends SyntaxBlock {
        public T objectValue;

        /** Creates a block with no value, no position and no children. */
        public ObjectSyntaxBlock() {
        }

        /** Creates a block that only wraps {@code objectValue}. */
        public ObjectSyntaxBlock(T objectValue) {
            this.objectValue = objectValue;
        }

        /** Creates a block with positions and a wrapped value, but no children. */
        public ObjectSyntaxBlock(int startPosition, int endPosition, T objectValue) {
            this(startPosition, endPosition, null, objectValue);
        }

        /** Creates a block with positions, children and a wrapped value. */
        public ObjectSyntaxBlock(int startPosition, int endPosition, List<SyntaxBlock> subBlocks, T objectValue) {
            super(startPosition, endPosition, subBlocks);
            this.objectValue = objectValue;
        }

        /** Returns the wrapped value (the same reference as {@code objectValue}). */
        public T getObjectValue() {
            return this.objectValue;
        }
    }
    
    

    最难的一步是修改sql-parser.cup文件,需要把所有语法信息都修改了,非常耗时间
    和原版的区别是非终结符的类型如果是String、Object、HashMap、ArrayList、Pair、enum和非org.apache.impala.analysis包的类的话,需要用ObjectSyntaxBlock包装一层
    比如

    nonterminal List<UnionOperand> values_operand_list;
    nonterminal TDescribeOutputStyle describe_output_style;
    

    需要修改成

    nonterminal ObjectSyntaxBlock<List<UnionOperand>> values_operand_list;
    nonterminal ObjectSyntaxBlock<TDescribeOutputStyle> describe_output_style;
    

    语法块的定义比如

    table_ref ::=
      dotted_path:path
      {: RESULT = new TableRef(path, null); :}
      | dotted_path:path alias_clause:alias
      {: RESULT = new TableRef(path, alias); :}
      | LPAREN query_stmt:query RPAREN alias_clause:alias
      {: RESULT = new InlineViewRef(alias, query); :}
      ;
    

    需要修改为:

    table_ref ::=
      dotted_path:path
      {:
        ExtendSymbol _0_symbol = (ExtendSymbol) CUP$SqlParser$stack.peek();
    
        RESULT = new TableRef(path.objectValue, null);
        RESULT.startPosition = _0_symbol.start;
        RESULT.endPosition = _0_symbol.end;
        RESULT.subBlocks = Lists.newArrayList(
            (SyntaxBlock) _0_symbol.value
        );
      :}
      | dotted_path:path alias_clause:alias
      {:
        ExtendSymbol _1_symbol = (ExtendSymbol) CUP$SqlParser$stack.elementAt(CUP$SqlParser$top - 1);
        ExtendSymbol _0_symbol = (ExtendSymbol) CUP$SqlParser$stack.peek();
    
        RESULT = new TableRef(path.objectValue, alias.objectValue);
        RESULT.startPosition = _1_symbol.start;
        RESULT.endPosition = _0_symbol.end;
        RESULT.subBlocks = Lists.newArrayList(
            (SyntaxBlock) _1_symbol.value,
            (SyntaxBlock) _0_symbol.value
        );
      :}
      | LPAREN query_stmt:query RPAREN alias_clause:alias
      {:
        ExtendSymbol _3_symbol = (ExtendSymbol) CUP$SqlParser$stack.elementAt(CUP$SqlParser$top - 3);
        ExtendSymbol _2_symbol = (ExtendSymbol) CUP$SqlParser$stack.elementAt(CUP$SqlParser$top - 2);
        ExtendSymbol _1_symbol = (ExtendSymbol) CUP$SqlParser$stack.elementAt(CUP$SqlParser$top - 1);
        ExtendSymbol _0_symbol = (ExtendSymbol) CUP$SqlParser$stack.peek();
    
        RESULT = new InlineViewRef(alias.objectValue, query);
        RESULT.startPosition = _3_symbol.start;
        RESULT.endPosition = _0_symbol.end;
        RESULT.subBlocks = Lists.newArrayList(
            (SyntaxBlock) _3_symbol.value,
            (SyntaxBlock) _2_symbol.value,
            (SyntaxBlock) _1_symbol.value,
            (SyntaxBlock) _0_symbol.value
        );
      :}
      ;
    

    像SelectStmt这种类,我们让他直接或间接继承SyntaxBlock,所以可以直接设置位置信息,不用包装

    select_stmt ::=
        select_clause:selectList
      {:
        RESULT = new SelectStmt(selectList, null, null, null, null, null, null);
      :}
      |
        select_clause:selectList
        from_clause:fromClause
        where_clause:wherePredicate
        group_by_clause:groupingExprs
        having_clause:havingPredicate
        opt_order_by_clause:orderByClause
        opt_limit_offset_clause:limitOffsetClause
      {:
        RESULT = new SelectStmt(selectList, fromClause, wherePredicate, groupingExprs,
                                havingPredicate, orderByClause, limitOffsetClause);
      :}
      ;
    

    改成:

    select_stmt ::=
        select_clause:selectList
      {:
        ExtendSymbol _0_symbol = (ExtendSymbol) CUP$SqlParser$stack.peek();
    
        RESULT = new SelectStmt(selectList, null, null, null, null, null, null);
        RESULT.startPosition = _0_symbol.start;
        RESULT.endPosition = _0_symbol.end;
        RESULT.subBlocks = Lists.newArrayList(
            (SyntaxBlock) _0_symbol.value
        );
      :}
      |
        select_clause:selectList
        from_clause:fromClause
        where_clause:wherePredicate
        group_by_clause:groupingExprs
        having_clause:havingPredicate
        opt_order_by_clause:orderByClause
        opt_limit_offset_clause:limitOffsetClause
      {:
        ExtendSymbol _6_symbol = (ExtendSymbol) CUP$SqlParser$stack.elementAt(CUP$SqlParser$top - 6);
        ExtendSymbol _5_symbol = (ExtendSymbol) CUP$SqlParser$stack.elementAt(CUP$SqlParser$top - 5);
        ExtendSymbol _4_symbol = (ExtendSymbol) CUP$SqlParser$stack.elementAt(CUP$SqlParser$top - 4);
        ExtendSymbol _3_symbol = (ExtendSymbol) CUP$SqlParser$stack.elementAt(CUP$SqlParser$top - 3);
        ExtendSymbol _2_symbol = (ExtendSymbol) CUP$SqlParser$stack.elementAt(CUP$SqlParser$top - 2);
        ExtendSymbol _1_symbol = (ExtendSymbol) CUP$SqlParser$stack.elementAt(CUP$SqlParser$top - 1);
        ExtendSymbol _0_symbol = (ExtendSymbol) CUP$SqlParser$stack.peek();
    
        RESULT = new SelectStmt(selectList, fromClause, wherePredicate, groupingExprs.objectValue,
                                havingPredicate, orderByClause.objectValue, limitOffsetClause);
        RESULT.startPosition = _6_symbol.start;
        RESULT.endPosition = _0_symbol.end;
        RESULT.subBlocks = Lists.newArrayList(
            (SyntaxBlock) _6_symbol.value,
            (SyntaxBlock) _5_symbol.value,
            (SyntaxBlock) _4_symbol.value,
            (SyntaxBlock) _3_symbol.value,
            (SyntaxBlock) _2_symbol.value,
            (SyntaxBlock) _1_symbol.value,
            (SyntaxBlock) _0_symbol.value
        );
      :}
      ;
    

    其中用到groupingExprs.objectValue和orderByClause.objectValue,是因为groupingExprs、orderByClause分别是group_by_clause、opt_order_by_clause语法块的对象,而这两个非终结符现在都是ObjectSyntaxBlock类型的,所以原本可以直接引用的地方,现在需要加上objectValue才能访问到包装里面的对象

    github地址:impala增加语法块位置库

  • 相关阅读:
    图文详解 Android Binder跨进程通信机制 原理
    支链氨基酸怎么吃
    C#泛型约束
    树状结构 Tree data structure in C#
    wrap ConcurrentDictionary in BlockingCollection
    ConcurrentBag扩展 批量加入
    Dictionary GetOrAdd
    ConcurrentDictionary AddOrUpdate
    object pool
    C# 结构体定义 转换字节数组 z
  • 原文地址:https://www.cnblogs.com/lanhj/p/6693617.html
Copyright © 2011-2022 走看看