zoukankan      html  css  js  c++  java
  • 算子扫描与递归核

    算子扫描与递归核

    这是关于如何在TVM中进行循环计算的介绍资料。递归计算是神经网络的一种典型模式。

    from __future__ import absolute_import, print_function

     

    import tvm

    import tvm.testing

    from tvm import te

    import numpy as np

    TVM支持扫描运算符来描述符号循环。下面的扫描操作计算X列上的累计值。              扫描在张量的最高维上进行。s_state是一个占位符,用于描述扫描的转换状态。s_init描述了如何初始化前k个时间步。在这里,由于s_init’s的第一个维度是1,它描述了如何在第一时间步初始化状态。             

    s_update描述如何在时间步t更新值。更新值可以通过状态占位符引用上一个时间步的值。虽然在当前或以后的时间步骤中引用s_state是无效的。             

    扫描采用状态占位符、初始值和更新描述。还建议(尽管不是必需的)列出扫描单元的输入。扫描的结果是一个张量,给出在时域上更新后的s_state的结果。

    m = te.var("m")

    n = te.var("n")

    X = te.placeholder((m, n), name="X")

    s_state = te.placeholder((m, n))

    s_init = te.compute((1, n), lambda _, i: X[0, i])

    s_update = te.compute((m, n), lambda t, i: s_state[t - 1, i] + X[t, i])

    s_scan = tvm.te.scan(s_init, s_update, s_state, inputs=[X])

    Schedule the Scan Cell

    可以通过分别调度更新和初始化部分来调度扫描主体。调度更新部件的第一个迭代维度是无效的。要在时间上拆分迭代,用户可以调度scan_op.scan_axis。

    s = te.create_schedule(s_scan.op)
    num_thread = 256
    block_x = te.thread_axis("blockIdx.x")
    thread_x = te.thread_axis("threadIdx.x")
    xo, xi = s[s_init].split(s_init.op.axis[1], factor=num_thread)
    s[s_init].bind(xo, block_x)
    s[s_init].bind(xi, thread_x)
    xo, xi = s[s_update].split(s_update.op.axis[1], factor=num_thread)
    s[s_update].bind(xo, block_x)
    s[s_update].bind(xi, thread_x)
    print(tvm.lower(s, [X, s_scan], simple_mode=True))

    Out:

    primfn(X_1: handle, scan_1: handle) -> ()

      attr = {"global_symbol": "main", "tir.noalias": True}

      buffers = {scan: Buffer(scan_2: Pointer(float32), float32, [m: int32, n: int32], [stride: int32, stride_1: int32], type="auto"),

                 X: Buffer(X_2: Pointer(float32), float32, [m, n], [stride_2: int32, stride_3: int32], type="auto")}

      buffer_map = {X_1: X, scan_1: scan} {

      attr [IterVar(blockIdx.x: int32, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = floordiv((n + 255), 256);

      attr [IterVar(threadIdx.x: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 256;

      if @tir.likely((((blockIdx.x*256) + threadIdx.x) < n), dtype=bool) {

        scan_2[(((blockIdx.x*256) + threadIdx.x)*stride_1)] = (float32*)X_2[(((blockIdx.x*256) + threadIdx.x)*stride_3)]

      }

      for (scan.idx: int32, 0, (m - 1)) {

        attr [IterVar(blockIdx.x, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = floordiv((n + 255), 256);

        attr [IterVar(threadIdx.x, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 256;

        if @tir.likely((((blockIdx.x*256) + threadIdx.x) < n), dtype=bool) {

          scan_2[(((scan.idx + 1)*stride) + (((blockIdx.x*256) + threadIdx.x)*stride_1))] = ((float32*)scan_2[((scan.idx*stride) + (((blockIdx.x*256) + threadIdx.x)*stride_1))] + (float32*)X_2[(((scan.idx + 1)*stride_2) + (((blockIdx.x*256) + threadIdx.x)*stride_3))])

        }

      }

    }

    Build and Verify

    可以像其他TVM内核一样构建扫描内核,使用numpy来验证结果的正确性。

    fscan = tvm.build(s, [X, s_scan], "cuda", name="myscan")
    ctx = tvm.gpu(0)
    n = 1024
    m = 10
    a_np = np.random.uniform(size=(m, n)).astype(s_scan.dtype)
    a = tvm.nd.array(a_np, ctx)
    b = tvm.nd.array(np.zeros((m, n), dtype=s_scan.dtype), ctx)
    fscan(a, b)
    tvm.testing.assert_allclose(b.asnumpy(), np.cumsum(a_np, axis=0))

    Multi-Stage Scan Cell

    在上面的例子中,描述了扫描单元使用一个张量计算阶段在s_update。可以在扫描单元中使用多个张量级。             

    以下几行显示扫描单元中有两个阶段操作的扫描。

    m = te.var("m")

    n = te.var("n")

    X = te.placeholder((m, n), name="X")

    s_state = te.placeholder((m, n))

    s_init = te.compute((1, n), lambda _, i: X[0, i])

    s_update_s1 = te.compute((m, n), lambda t, i: s_state[t - 1, i] * 2, name="s1")

    s_update_s2 = te.compute((m, n), lambda t, i: s_update_s1[t, i] + X[t, i], name="s2")

    s_scan = tvm.te.scan(s_init, s_update_s2, s_state, inputs=[X])

    这些中间张量也可以正常调度。为了确保正确性,TVM创建了一个组约束,禁止在扫描循环之外的位置compute_at扫描体。

    s = te.create_schedule(s_scan.op)
    xo, xi = s[s_update_s2].split(s_update_s2.op.axis[1], factor=32)
    s[s_update_s1].compute_at(s[s_update_s2], xo)
    print(tvm.lower(s, [X, s_scan], simple_mode=True))

    Out:

    primfn(X_1: handle, scan_1: handle) -> ()

      attr = {"global_symbol": "main", "tir.noalias": True}

      buffers = {scan: Buffer(scan_2: Pointer(float32), float32, [m: int32, n: int32], [stride: int32, stride_1: int32], type="auto"),

                 X: Buffer(X_2: Pointer(float32), float32, [m, n], [stride_2: int32, stride_3: int32], type="auto")}

      buffer_map = {X_1: X, scan_1: scan} {

      attr [s1: Pointer(float32)] "storage_scope" = "global";

      allocate(s1, float32, [32]) {

        for (i: int32, 0, n) {

          scan_2[(i*stride_1)] = (float32*)X_2[(i*stride_3)]

        }

        for (scan.idx: int32, 0, (m - 1)) {

          for (i.outer: int32, 0, floordiv((n + 31), 32)) {

            for (i_1: int32, 0, 32) {

              if @tir.likely((((i.outer*32) + i_1) < n), dtype=bool) {

                s1[i_1] = ((float32*)scan_2[((scan.idx*stride) + (((i.outer*32) + i_1)*stride_1))]*2f32)

              }

            }

            for (i.inner: int32, 0, 32) {

              if @tir.likely((((i.outer*32) + i.inner) < n), dtype=bool) {

                scan_2[(((scan.idx + 1)*stride) + (((i.outer*32) + i.inner)*stride_1))] = ((float32*)s1[i.inner] + (float32*)X_2[(((scan.idx + 1)*stride_2) + (((i.outer*32) + i.inner)*stride_3))])

              }

            }

          }

        }

      }

    }

    Multiple States

    对于像RNN这样的复杂应用程序,可能需要不止一个递归状态。扫描支持多种重复状态。下面的示例演示了如何使用两种状态构建递归。

    m = te.var("m")

    n = te.var("n")

    l = te.var("l")

    X = te.placeholder((m, n), name="X")

    s_state1 = te.placeholder((m, n))

    s_state2 = te.placeholder((m, l))

    s_init1 = te.compute((1, n), lambda _, i: X[0, i])

    s_init2 = te.compute((1, l), lambda _, i: 0.0)

    s_update1 = te.compute((m, n), lambda t, i: s_state1[t - 1, i] + X[t, i])

    s_update2 = te.compute((m, l), lambda t, i: s_state2[t - 1, i] + s_state1[t - 1, 0])

    s_scan1, s_scan2 = tvm.te.scan(

        [s_init1, s_init2], [s_update1, s_update2], [s_state1, s_state2], inputs=[X]

    )

    s = te.create_schedule(s_scan1.op)

    print(tvm.lower(s, [X, s_scan1, s_scan2], simple_mode=True))

    Out:

    primfn(X_1: handle, scan.v0_1: handle, scan.v1_1: handle) -> ()

      attr = {"global_symbol": "main", "tir.noalias": True}

      buffers = {scan.v1: Buffer(scan.v1_2: Pointer(float32), float32, [m: int32, l: int32], [stride: int32, stride_1: int32], type="auto"),

                 scan.v0: Buffer(scan.v0_2: Pointer(float32), float32, [m, n: int32], [stride_2: int32, stride_3: int32], type="auto"),

                 X: Buffer(X_2: Pointer(float32), float32, [m, n], [stride_4: int32, stride_5: int32], type="auto")}

      buffer_map = {X_1: X, scan.v0_1: scan.v0, scan.v1_1: scan.v1} {

      for (i: int32, 0, n) {

        scan.v0_2[(i*stride_3)] = (float32*)X_2[(i*stride_5)]

      }

      for (i_1: int32, 0, l) {

        scan.v1_2[(i_1*stride_1)] = 0f32

      }

      for (scan.idx: int32, 0, (m - 1)) {

        for (i_2: int32, 0, n) {

          scan.v0_2[(((scan.idx + 1)*stride_2) + (i_2*stride_3))] = ((float32*)scan.v0_2[((scan.idx*stride_2) + (i_2*stride_3))] + (float32*)X_2[(((scan.idx + 1)*stride_4) + (i_2*stride_5))])

        }

        for (i_3: int32, 0, l) {

          scan.v1_2[(((scan.idx + 1)*stride) + (i_3*stride_1))] = ((float32*)scan.v1_2[((scan.idx*stride) + (i_3*stride_1))] + (float32*)scan.v0_2[(scan.idx*stride_2)])

        }

      }

    }

    Summary

    本文提供扫描原语的概况。             

    用init和update描述扫描。             

    按正常计划安排扫描单元。             

    对于复杂的工作负载,在扫描单元中使用多个状态和步骤。

    https://tvm.apache.org/docs/tutorials/language/scan.html             

    下载Python源代码:scan.py             

    下载Jupyter笔记:scan.ipynb

    人工智能芯片与自动驾驶
  • 相关阅读:
    mongodb 记录
    php保存文件
    调用AngularJS的API
    angular修改数据
    大小写转换
    使用Properties类动态加载配置文件里的内容
    org.apache.commons.cli.Options
    Google guava和Apache commons
    orc格式文件
    shell的awk命令使用
  • 原文地址:https://www.cnblogs.com/wujianming-110117/p/14132088.html
Copyright © 2011-2022 走看看