zoukankan      html  css  js  c++  java
  • Volatile & Memory Barrier

    global_timer_upper和global_timer_low指向的存储空间是 global timer 的高32位和低32位,它们会不停变化,下面的程序功能是读取当前的global timer值。这里会出现一个常见的编译器优化带来的错误。

    源程序如下:

    #0  [volatile] unsigned long *global_timer_upper, *global_timer_low;
    inline unsigned long long read_cycle(unsigned int *re_low, unsigned int *re_upper)
    {
        #1 [volatile] unsigned int __low,__upper_old,__upper_new;
        unsigned long long value;

        __upper_new = *global_timer_upper;
        do {
            __upper_old = __upper_new;
            #2 [asm volatile("":::"memory");]
            __low = *global_timer_low;
            #3 [asm volatile("":::"memory");]
            __upper_new = *global_timer_upper;
            #4 [asm volatile("":::"memory");]
        } while(__upper_new != __upper_old);

        *re_low = __low;
        *re_upper = __upper_new;
        value = (unsigned long long)__low | (((unsigned long long)__upper_new) << 32);
        return value;
    }

    源程序编译后得到的结果如下,可以看出经过编译器的优化,循环消失了,因为编译器认为执行流中没有改变*global_timer_upper,所以while循环中的比较是多余的。

    00000000 <read_cycle>:
       0:   b470            push    {r4, r5, r6}
       2:   f240 0400       movw    r4, #0
       6:   f2c0 0400       movt    r4, #0
       a:   2200            movs    r2, #0
       c:   6865            ldr     r5, [r4, #4]
       e:   6826            ldr     r6, [r4, #0]
      10:   682d            ldr     r5, [r5, #0]
      12:   6834            ldr     r4, [r6, #0]
      14:   432a            orrs    r2, r5
      16:   6005            str     r5, [r0, #0]
      18:   4610            mov     r0, r2
      1a:   600c            str     r4, [r1, #0]
      1c:   4621            mov     r1, r4
      1e:   bc70            pop     {r4, r5, r6}
      20:   4770            bx      lr
      22:   bf00            nop

    只在#1处添加volatile修饰,编译结果如下。此时可以明显看到循环的出现,但是程序依然是错误的:编译器虽然没有对__upper_new/__upper_old等值的读写进行优化,使用了堆栈进行存储,但是它认为*global_timer_upper的值没有改变过,所以将其缓存在寄存器r4中:

    00000000 <read_cycle>:
       0:   f240 0300       movw    r3, #0
       4:   f2c0 0300       movt    r3, #0
       8:   b4f0            push    {r4, r5, r6, r7}
       a:   b084            sub     sp, #16
       c:   cb0c            ldmia   r3, {r2, r3}  @ 读取upper和low的内存地址放到r2,r3中
       e:   6814            ldr     r4, [r2, #0]  @ 从对应内存读取upper至r4 (__upper_new = * global_timer_upper)
      10:   9401            str     r4, [sp, #4]  @ 将upper放在栈上[sp,#4]
      12:   681d            ldr     r5, [r3, #0]  @ 从对应内存读取lower至r5 
      14:   9b01            ldr     r3, [sp, #4]  @ 循环开始 相当于 __upper_old = __upper_new
      16:   9302            str     r3, [sp, #8]  @ 将__upper_old放在栈上[sp,#8] 
      18:   9503            str     r5, [sp, #12] @ 将lower保存在栈上[sp,#12]
      1a:   9401            str     r4, [sp, #4]  @ 将r4中的__upper_new放在栈上[sp,#4], r4是最开始e处获得的upper值,一直未改变。
      1c:   9a01            ldr     r2, [sp, #4]  @ 从栈上读出__upper_new
      1e:   9b02            ldr     r3, [sp, #8]  @ 从栈上读出__upper_old
      20:   429a            cmp     r2, r3
      22:   d1f7            bne.n   14 <read_cycle+0x14>
      24:   9f03            ldr     r7, [sp, #12]
      26:   2200            movs    r2, #0
      28:   9d01            ldr     r5, [sp, #4]
      2a:   9c03            ldr     r4, [sp, #12]
      2c:   9e01            ldr     r6, [sp, #4]
      2e:   4322            orrs    r2, r4
      30:   6007            str     r7, [r0, #0]
      32:   600d            str     r5, [r1, #0]
      34:   4610            mov     r0, r2
      36:   4631            mov     r1, r6
      38:   b004            add     sp, #16
      3a:   bcf0            pop     {r4, r5, r6, r7}
      3c:   4770            bx      lr
      3e:   bf00            nop

    只在#2处添加memory barrier,编译的结果如下。此时虽然每次都是从global timer的对应内存中读取数据,但是仍然存在指令乱序的问题:我们的设计应该是先读low,再读upper,但是编译后的结果是先读upper,再读low,导致结果错误,所以考虑在这两条语句间加memory barrier。

    00000000 <read_cycle>:
       0:   f240 0300       movw    r3, #0
       4:   f2c0 0300       movt    r3, #0
       8:   b430            push    {r4, r5}
       a:   681a            ldr     r2, [r3, #0]  @ 读取upper的内存地址
       c:   6814            ldr     r4, [r2, #0]  @ 从对应内存读取upper ,__upper_new = * global_timer_upper
       e:   e000            b.n     12 <read_cycle+0x12>
      10:   4614            mov     r4, r2     @ __upper_old = __upper_new
      12:   e893 0024       ldmia.w r3, {r2, r5}  @ 读取upper和lower的内存地址
      16:   6812            ldr     r2, [r2, #0]  @ 从对应内存读取 upper , __upper_new = * global_timer_upper
      18:   682d            ldr     r5, [r5, #0]  @ 从对应内存读取 low
      1a:   42a2            cmp     r2, r4        @ 比较
      1c:   d1f8            bne.n   10 <read_cycle+0x10>
      1e:   2200            movs    r2, #0
      20:   6005            str     r5, [r0, #0]
      22:   432a            orrs    r2, r5
      24:   600c            str     r4, [r1, #0]
      26:   4610            mov     r0, r2
      28:   4621            mov     r1, r4
      2a:   bc30            pop     {r4, r5}
      2c:   4770            bx      lr
      2e:   bf00            nop

    只在#4处添加memory barrier,编译的结果如下,效果和不加一样。

    00000000 <read_cycle>:
       0:   f240 0300       movw    r3, #0
       4:   f2c0 0300       movt    r3, #0
       8:   b430            push    {r4, r5}
       a:   cb0c            ldmia   r3, {r2, r3}
       c:   6815            ldr     r5, [r2, #0]
       e:   681c            ldr     r4, [r3, #0]
      10:   2200            movs    r2, #0
      12:   6004            str     r4, [r0, #0]
      14:   4322            orrs    r2, r4
      16:   600d            str     r5, [r1, #0]
      18:   4610            mov     r0, r2
      1a:   4629            mov     r1, r5
      1c:   bc30            pop     {r4, r5}
      1e:   4770            bx      lr

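    只在#3处添加memory barrier,编译的结果正确:指令未乱序,而且对low的读取也未被优化掉。这个结果和memory barrier的作用有关:asm volatile("":::"memory")是一个编译器屏障,编译器必须假设所有内存都可能在该处被修改,因此不能把屏障之前读到寄存器中的内存值沿用到屏障之后,也不能把内存访问移过屏障。屏障放在#3处时,对low的读取被限制在屏障之前,对upper的读取被限制在屏障之后;又由于循环回跳,下一轮对low的读取同样位于上一轮屏障之后,所以两者在每一轮中都会重新从内存读取。注意它只约束编译器,并不产生任何硬件屏障指令: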

    00000000 <read_cycle>:
       0:   f240 0300       movw    r3, #0
       4:   f2c0 0300       movt    r3, #0
       8:   b430            push    {r4, r5}
       a:   681a            ldr     r2, [r3, #0] @ 读取upper的内存地址
       c:   6814            ldr     r4, [r2, #0] @ 从对应内存中读取upper , __upper_new = * global_timer_upper;
       e:   e000            b.n     12 <read_cycle+0x12>
      10:   4614            mov     r4, r2       @ __upper_old = __upper_new
      12:   685a            ldr     r2, [r3, #4] @ 读取low的内存地址
      14:   6815            ldr     r5, [r2, #0] @ 从对应内存中读取low 
      16:   681a            ldr     r2, [r3, #0] @ 读取upper的内存地址
      18:   6812            ldr     r2, [r2, #0] @ 从对应内存中读取upper
      1a:   42a2            cmp     r2, r4       @ 比较 
      1c:   d1f8            bne.n   10 <read_cycle+0x10>
      1e:   2200            movs    r2, #0
      20:   6005            str     r5, [r0, #0]
      22:   432a            orrs    r2, r5
      24:   600c            str     r4, [r1, #0]
      26:   4610            mov     r0, r2
      28:   4621            mov     r1, r4
      2a:   bc30            pop     {r4, r5}
      2c:   4770            bx      lr
      2e:   bf00            nop

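     只在#0处添加volatile,结果也是正确的。乍一看这样似乎只能防止对global_timer_upper/low这两个指针本身读写的优化,而我们要防止的是对这两个指针所指向的存储空间(即*global_timer_upper/low)读写的优化。实际上,volatile unsigned long *p声明的是“指向volatile unsigned long的指针”,volatile修饰的正是指针所指向的对象;若要修饰指针本身,应写作unsigned long * volatile p。因此#0处的写法恰好使每次对*global_timer_upper/low的访问都真正从内存读取,并且这些volatile访问的先后顺序不会被编译器改变(从下面的结果也可以看到,指针本身只在循环外读取了一次):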

    00000000 <read_cycle>:
       0:   f240 0300       movw    r3, #0
       4:   f2c0 0300       movt    r3, #0
       8:   b470            push    {r4, r5, r6}
       a:   e893 0044       ldmia.w r3, {r2, r6}
       e:   6814            ldr     r4, [r2, #0]
      10:   e000            b.n     14 <read_cycle+0x14>
      12:   461c            mov     r4, r3
      14:   6835            ldr     r5, [r6, #0]
      16:   6813            ldr     r3, [r2, #0]
      18:   42a3            cmp     r3, r4
      1a:   d1fa            bne.n   12 <read_cycle+0x12>
      1c:   2200            movs    r2, #0
      1e:   6005            str     r5, [r0, #0]
      20:   432a            orrs    r2, r5
      22:   600c            str     r4, [r1, #0]
      24:   4610            mov     r0, r2
      26:   4621            mov     r1, r4
      28:   bc70            pop     {r4, r5, r6}
      2a:   4770            bx      lr

     还有一个需要注意的现象:如果函数的修饰同时有inline和static,而且此文件中没有其他函数调用此函数,那么obj文件中可能没有此函数的符号,可以用nm或者objdump来查看。这是因为static函数具有内部链接,其他文件无法引用它,当它在本文件中未被使用时,编译器可以不为它生成任何代码。

  • 相关阅读:
    Leetcode 538. Convert BST to Greater Tree
    Leetcode 530. Minimum Absolute Difference in BST
    Leetcode 501. Find Mode in Binary Search Tree
    Leetcode 437. Path Sum III
    Leetcode 404. Sum of Left Leaves
    Leetcode 257. Binary Tree Paths
    Leetcode 235. Lowest Common Ancestor of a Binary Search Tree
    Leetcode 226. Invert Binary Tree
    Leetcode 112. Path Sum
    Leetcode 111. Minimum Depth of Binary Tree
  • 原文地址:https://www.cnblogs.com/godjesse/p/3252053.html
Copyright © 2011-2022 走看看