zoukankan      html  css  js  c++  java
  • Tars在鲲鹏ARM64服务器上的移植分享

    Tars介绍

    Tars是将腾讯内部使用的微服务架构TAF(Total Application Framework)多年的实践成果总结而成的开源项目。是基于名字服务使用Tars协议的高性能RPC开发框架,同时配套一体化的服务治理平台,帮助个人或者企业快速的以微服务的方式构建自己稳定可靠的分布式应用。

    源码获取

    git clone https://github.com/TarsCloud/TarsFramework.git
    cd TarsFramework
    git submodule update --init --recursive

    移植过程

    1 原子操作实现

    在tarscpp/util/include/util/tc_atomic.h下的几个原子操作函数(inc_fast,dec_and_test,add_and_return)都是基于x86汇编实现,在ARM64平台下,使用gcc内置函数实现,示例如下:
    原x86嵌汇编实现:

    /*
     * Atomically adds i to _value.counter and returns the updated counter
     * value. This variant is x86-only: it is built on the xadd instruction.
     */
    int add_and_return(int i) 
    {
        /* Modern 486+ processor */
            int __i = i;   /* keep the addend: the asm below overwrites i */
            /*
             * xaddl exchanges and adds: afterwards i holds the value the
             * counter had before the addition and the counter holds the sum.
             * TARS_LOCK is defined elsewhere (presumably the x86 "lock"
             * prefix -- confirm).
             * NOTE(review): _value.counter is modified by xaddl but is listed
             * as an input-only "m" operand and there is no "memory" clobber;
             * confirm the compiler cannot cache the counter across this
             * statement.
             */
            __asm__ __volatile__(
                TARS_LOCK "xaddl %0, %1;"
                :"=r"(i)
                :"m"(_value.counter), "0"(i));
    
         /* previous counter value + addend == new counter value */
         return i + __i;
    }

    支持ARM64平台后的实现:

    /*
     * Atomically adds i to _value.counter and returns the updated counter
     * value. AArch64 builds use the GCC __atomic builtin; every other target
     * falls through to the x86 xadd implementation.
     */
    int add_and_return(int i)
    {
    #if defined(__aarch64__)
            /*
             * __atomic_add_fetch returns the value after the addition.
             * NOTE(review): acquire-release ordering is requested here while
             * the x86 branch uses a locked instruction; confirm callers do not
             * depend on stronger (sequentially consistent) ordering.
             */
            return __atomic_add_fetch(&_value.counter,i,__ATOMIC_ACQ_REL);
    #else
        /* Modern 486+ processor */
            int __i = i;   /* keep the addend: the asm below overwrites i */
            /*
             * xaddl exchanges and adds: afterwards i holds the value the
             * counter had before the addition and the counter holds the sum.
             * TARS_LOCK is defined elsewhere (presumably the x86 "lock"
             * prefix -- confirm).
             */
            __asm__ __volatile__(
                TARS_LOCK "xaddl %0, %1;"
                :"=r"(i)
                :"m"(_value.counter), "0"(i));
    
         /* previous counter value + addend == new counter value */
         return i + __i;
    #endif
    }

    2 高精度计时器实现

    在tarscpp/util/include/util/tc_timeprovider.h下实现了基于x86汇编的高精度计时器,其中rdtsc是x86下读取TSC(时间戳计数器)的指令。
    在ARM64平台下,我们可以通过mrs指令读取CNTVCT_EL0(虚拟计数器)寄存器来实现同样的功能,具体实现如下。
    原x86嵌汇编实现:

    /*
     * rdtsc(low, high): read the x86 time-stamp counter.
     * The rdtsc instruction returns the counter in EDX:EAX, so `low` receives
     * the low 32 bits (EAX) and `high` the high 32 bits (EDX). Both arguments
     * must be assignable lvalues.
     * The trailing backslash is required: without it the macro body is empty
     * and the asm statement becomes stray text at file scope.
     */
    #define rdtsc(low,high) \
           __asm__ __volatile__("rdtsc" : "=a" (low), "=d" (high))

    支持ARM64平台后的实现:

    /*
     * rdtsc: read a free-running hardware counter.
     *   AArch64: rdtsc(var)       - var receives the 64-bit CNTVCT_EL0 value
     *                               read with mrs.
     *   x86_64 : rdtsc(low, high) - low/high receive EAX/EDX of the rdtsc
     *                               instruction (low and high 32 bits of TSC).
     * The two variants take a different number of arguments, so call sites
     * need the same architecture guards.
     * Each #define needs its trailing backslash; without it the macro body is
     * empty and the asm statement becomes stray text at file scope.
     */
    #if defined(__aarch64__)
    #define rdtsc(var) \
           asm volatile("mrs %0, CNTVCT_EL0" : "=r"(var))
    #elif defined(__x86_64__)
    #define rdtsc(low,high) \
           __asm__ __volatile__("rdtsc" : "=a" (low), "=d" (high))
    #endif

    3 协程实现

    协程是一种用户态的轻量级线程,其调度完全由用户控制。因此,协程调度切换时需要用户自己将寄存器和栈保存到其他地方,再切回来的时候,恢复先前保存的寄存器上下文和栈。在tarscpp/util/src/下的tc_make_x86_64_sysv_elf_gas.s和tc_jump_x86_64_sysv_elf_gas.s实现了基于x86_64架构的协程堆栈初始化和寄存器上下文切换操作。具体实现如下:

    /*
     * make_fcontext -- x86-64, AT&T syntax.
     * Carves an fcontext_t out of the top of a caller-supplied stack and
     * primes it so that the first jump into it enters the context function.
     *
     * In:   %rdi = address of the context stack (base; the code reserves
     *              space below it)
     *       %rsi = context stack size
     *       %rdx = address of the context function
     * Out:  %rax = pointer to the fcontext_t placed on the context stack
     *
     * fcontext_t offsets written here:
     *   0x30 stack pointer   0x38 context function   0x40 stack base
     *   0x48 stack size      0x50 MXCSR              0x54 x87 control word
     * Offsets 0x00-0x28 are not initialised by this routine.
     */
    make_fcontext:
        leaq   -0x58(%rdi),    %rax        /* reserve space for fcontext_t at top of context stack */
    
        /* shift address in RAX to lower 16 byte boundary */
        /* == pointer to fcontext_t and address of context stack */
        andq   $-16,           %rax
    
        movq   %rdi,           0x40(%rax) /* save address of context stack pointer (base) in fcontext_t */
        movq   %rsi,           0x48(%rax) /* save context stack size in fcontext_t */
        movq   %rdx,           0x38(%rax) /* save address of context function in fcontext_t */
    
        stmxcsr  0x50(%rax)                /* save MXCSR (SSE control and status register) */
        fnstcw   0x54(%rax)                /* save x87 control word */
    
        leaq   -0x8(%rax),      %rdx       /* reserve space for the return address on context stack, (RSP - 0x8) % 16 == 0 */
        movq   %rdx,            0x30(%rax) /* save address in RDX as stack pointer for context function */
    
        leaq   finish(%rip),    %rcx       /* compute abs address of label finish */
        movq   %rcx,            (%rdx)     /* save address of finish as return address for context function */
                                           /* entered after context function returns */
    
        ret                                /* return pointer to fcontext_t placed on context stack */
    
    /* Reached when the context function returns: terminates the process with status 0. */
    finish:
        /* RSP points to same address as RSP on entry of context function + 0x8 */
        xorq    %rdi,           %rdi       /* exit code is zero */
        call   _exit@PLT                   /* exit application */
        hlt                                /* not expected to be reached: _exit should not return */
    /*
     * jump_fcontext -- x86-64, AT&T syntax.
     * Saves the current execution context and resumes another one.
     *
     * In:   %rdi = fcontext_t that receives the current context
     *       %rsi = fcontext_t of the context to resume
     *       %rdx = value handed to the resumed context: it arrives in %rax
     *              (return value of its own jump_fcontext call) and in %rdi
     *              (first argument when a context function is entered)
     *       %rcx = non-zero: also switch MXCSR and the x87 control word
     *
     * fcontext_t offsets used here:
     *   0x00 rbx  0x08 r12  0x10 r13  0x18 r14  0x20 r15  0x28 rbp
     *   0x30 rsp  0x38 rip  0x50 MXCSR  0x54 x87 control word
     */
    jump_fcontext:
        movq     %rbx,       (%rdi)         /* save RBX */
        movq     %r12,       0x8(%rdi)      /* save R12 */
        movq     %r13,       0x10(%rdi)     /* save R13 */
        movq     %r14,       0x18(%rdi)     /* save R14 */
        movq     %r15,       0x20(%rdi)     /* save R15 */
        movq     %rbp,       0x28(%rdi)     /* save RBP */
    
        testq    %rcx,       %rcx           /* FPU state requested? */
        jz       1f                         /* no: skip the control-word switch */
    
        stmxcsr  0x50(%rdi)             /* save MXCSR (SSE control and status register) */
        fnstcw   0x54(%rdi)             /* save x87 control word */
    
        ldmxcsr  0x50(%rsi)             /* restore MXCSR */
        fldcw    0x54(%rsi)             /* restore x87 control word */
    1:
    
        leaq     0x8(%rsp),  %rax       /* stack pointer as it will be after our return address is popped */
        movq     %rax,       0x30(%rdi) /* save as stack pointer */
        movq     (%rsp),     %rax       /* fetch our return address */
        movq     %rax,       0x38(%rdi) /* save return address as RIP */
    
        movq     (%rsi),      %rbx      /* restore RBX */
        movq     0x8(%rsi),   %r12      /* restore R12 */
        movq     0x10(%rsi),  %r13      /* restore R13 */
        movq     0x18(%rsi),  %r14      /* restore R14 */
        movq     0x20(%rsi),  %r15      /* restore R15 */
        movq     0x28(%rsi),  %rbp      /* restore RBP */
    
        movq     0x30(%rsi),  %rsp      /* restore RSP */
        movq     0x38(%rsi),  %rcx      /* fetch the address to continue at */
    
        movq     %rdx,        %rax      /* use third arg as return value after jump */
        movq     %rdx,        %rdi      /* use third arg as first arg in context function */
    
        jmp      *%rcx                  /* indirect jump to context; AT&T syntax requires the '*' */

    通过理解以上x86实现,并结合Procedure Call Standard for the Arm 64-bit Architecture(AAPCS64)规范,我们做了如下支持ARM64平台的实现:

    /*
     * Coroutine context primitives -- AArch64, AAPCS64, GNU as syntax.
     *
     * fcontext_t layout (byte offsets, 0x88 bytes in total):
     *   0x00-0x58  x18..x29      0x60  x30 (LR)       0x68  sp
     *   0x70       stack base    0x78  stack size     0x80  pc to resume at
     *
     * AAPCS64 also makes d8-d15 callee-saved. They have no slot in
     * fcontext_t, so jump_fcontext keeps them in a 0x40-byte frame on the
     * stack of the context being suspended; the saved sp points at that
     * frame. make_fcontext reserves a matching (uninitialised) frame so the
     * first jump into a new context can pop it like any other.
     */

    /*
     * make_fcontext
     * In:   x0 = address of the context stack (space is reserved below it)
     *       x1 = context stack size
     *       x2 = address of the context function
     * Out:  x0 = pointer to the fcontext_t placed on the context stack
     * Clobbers: x5, x6, x7
     */
    make_fcontext:
        sub     x5, x0, #0x88           /* reserve room for fcontext_t */
        and     x5, x5, #-16            /* keep it, and the stack below it, 16-byte aligned */

        str     x0, [x5, #0x70]         /* stack base */
        str     x1, [x5, #0x78]         /* stack size */
        str     x2, [x5, #0x80]         /* first pc = context function */

        sub     x6, x5, #0x40           /* frame for d8-d15 popped by the first jump */
        str     x6, [x5, #0x68]         /* initial sp; becomes x5 once the frame is popped */

        adr     x7, finish              /* PC-relative, so no absolute relocation is needed */
        str     x7, [x5, #0x60]         /* LR = finish: entered when the context function returns */

        mov     x0, x5                  /* return the fcontext_t pointer */
        ret

    /*
     * Reached when a context function returns.
     * NOTE(review): the x86-64 version calls _exit here; confirm that exit
     * (which runs atexit handlers) is what is intended on this platform.
     */
    finish:
        mov     x0, #0                  /* exit code is zero */
        bl      exit                    /* exit application */

    /*
     * jump_fcontext
     * In:   x0 = fcontext_t that receives the current context
     *       x1 = fcontext_t of the context to resume
     *       x2 = value handed to the resumed context (arrives in x0, i.e. as
     *            return value of its jump_fcontext call or as first argument
     *            of a freshly entered context function)
     * The fourth argument of the x86-64 version (preserve FPU flag) is not
     * consulted: d8-d15 are always switched.
     * Clobbers: x5, x6
     * NOTE(review): x18 is saved and restored as in the original port; x18 is
     * reserved on some platforms -- confirm this is only built for Linux.
     */
    jump_fcontext:
        stp     d8,  d9,  [sp, #-0x40]! /* push callee-saved FP registers */
        stp     d10, d11, [sp, #0x10]
        stp     d12, d13, [sp, #0x20]
        stp     d14, d15, [sp, #0x30]

        stp     x18, x19, [x0]
        stp     x20, x21, [x0, #0x10]
        stp     x22, x23, [x0, #0x20]
        stp     x24, x25, [x0, #0x30]
        stp     x26, x27, [x0, #0x40]
        stp     x28, x29, [x0, #0x50]
        str     x30, [x0, #0x60]        /* LR */
        mov     x5, sp                  /* sp cannot be stored directly */
        str     x5, [x0, #0x68]         /* sp, pointing at the d8-d15 frame */
        str     x30, [x0, #0x80]        /* resume at our return address */

        ldp     x18, x19, [x1]
        ldp     x20, x21, [x1, #0x10]
        ldp     x22, x23, [x1, #0x20]
        ldp     x24, x25, [x1, #0x30]
        ldp     x26, x27, [x1, #0x40]
        ldp     x28, x29, [x1, #0x50]
        ldr     x30, [x1, #0x60]
        ldr     x5, [x1, #0x68]
        mov     sp, x5                  /* switch stacks */

        ldp     d10, d11, [sp, #0x10]   /* pop the target's FP registers */
        ldp     d12, d13, [sp, #0x20]
        ldp     d14, d15, [sp, #0x30]
        ldp     d8,  d9,  [sp], #0x40

        ldr     x6, [x1, #0x80]         /* address to continue at */

        mov     x0, x2                  /* hand over the third argument */

        br      x6                      /* indirect jump to context */

    如想详细了解Procedure Call Standard for the Arm规范请参考如下链接:https://developer.arm.com/doc...

    总结

    Tars的移植没有牵扯到太多跟CPU架构相关的代码,因此工作量不大,但需要我们对ARM64架构有一定的了解,才能确保移植的准确性。该文的几个移植点具有通用性,对于其他应用移植到ARM64平台具有借鉴意义。

  • 相关阅读:
    元学习Meta Learning/Learning to learn
    TRAINING A CLASSIFIER训练分类器(pytorch官网60分钟闪电战第四节)
    NEURAL NETWORKS神经网络(pytorch官网60分钟闪电战第三节)
    AUTOGRAD: 自动分化(pytorch官网60分钟闪电战第二节)
    WHAT IS PYTORCH?(pytorch官网60分钟闪电战第一节)
    windows找不到gpedit.msc
    The “freeze_support()“ line can be omitted if the program is not going to be frozen to produ
    torch.mul() 和 torch.mm() 的区别
    vue面试题(2)
    JS输出题练习
  • 原文地址:https://www.cnblogs.com/dream397/p/14600133.html
Copyright © 2011-2022 走看看