zoukankan      html  css  js  c++  java
  • Volatile & Memory Barrier

    global_timer_upper和global_timer_low指向的存储空间是 global timer 的高32位和低32位,它们会不停变化,下面的程序功能是读取当前的global timer值。这里会出现一个常见的编译器优化带来的错误。

    源程序如下:

    #0  [volatile] unsigned long *global_timer_upper, *global_timer_low;
    inline unsigned long long read_cycle(unsigned int *re_low, unsigned int *re_upper) { #1 [volatile] unsigned int __low,__upper_old,__upper_new; unsigned long long value; __upper_new = *global_timer_upper; do { __upper_old = __upper_new; #2 [asm volatile("":::"memory");] __low = *global_timer_low;
           #3 [asm volatile("":::"memory");] __upper_new
    = *global_timer_upper;
           #4 [asm volatile("":::"memory");] }
    while(__upper_new != __upper_old); *re_low = __low; *re_upper = __upper_new; value = (unsigned long long)__low | (((unsigned long long)__upper_new) << 32); return value; }

    源程序编译后得到的结果如下,可以看出经过编译器的优化,循环消失了,因为编译器认为执行流中没有改变*global_timer_upper,所以while循环中的比较是多余的。

    00000000 <read_cycle>:
       0:   b470            push    {r4, r5, r6}
       2:   f240 0400       movw    r4, #0
       6:   f2c0 0400       movt    r4, #0
       a:   2200            movs    r2, #0
       c:   6865            ldr     r5, [r4, #4]
       e:   6826            ldr     r6, [r4, #0]
      10:   682d            ldr     r5, [r5, #0]
      12:   6834            ldr     r4, [r6, #0]
      14:   432a            orrs    r2, r5
      16:   6005            str     r5, [r0, #0]
      18:   4610            mov     r0, r2
      1a:   600c            str     r4, [r1, #0]
      1c:   4621            mov     r1, r4
      1e:   bc70            pop     {r4, r5, r6}
      20:   4770            bx      lr
      22:   bf00            nop

    只在#1处添加volatile修饰,编译结果如下。此时可以明显看到循环的出现,但是程序依然是错误的:编译器虽然没有对__upper_new、__upper_old等值的读写进行优化,而是使用了堆栈进行存储,但是它认为*global_timer_upper的值没有改变过,所以只读取一次并将其保存在寄存器r4中:

    00000000 <read_cycle>:
       0:   f240 0300       movw    r3, #0
       4:   f2c0 0300       movt    r3, #0
       8:   b4f0            push    {r4, r5, r6, r7}
       a:   b084            sub     sp, #16
       c:   cb0c            ldmia   r3, {r2, r3}  @ 读取upper和low的内存地址放到r2,r3中
       e:   6814            ldr     r4, [r2, #0]  @ 从对应内存读取upper至r4 (__upper_new = * global_timer_upper)
      10:   9401            str     r4, [sp, #4]  @ 将upper放在栈上[sp,#4]
      12:   681d            ldr     r5, [r3, #0]  @ 从对应内存读取lower至r5 
      14:   9b01            ldr     r3, [sp, #4]  @ 循环开始 相当于 __upper_old = __upper_new
      16:   9302            str     r3, [sp, #8]  @ 将__upper_old放在栈上[sp,#8] 
      18:   9503            str     r5, [sp, #12] @ 将lower保存在栈上[sp,#12]
      1a:   9401            str     r4, [sp, #4]  @ 将r4中的__upper_new放在栈上[sp,#4], r4是最开始e处获得的upper值,一直未改变。
      1c:   9a01            ldr     r2, [sp, #4]  @ 从栈上读出__upper_new
      1e:   9b02            ldr     r3, [sp, #8]  @ 从栈上读出__upper_old
      20:   429a            cmp     r2, r3
      22:   d1f7            bne.n   14 <read_cycle+0x14>
      24:   9f03            ldr     r7, [sp, #12]
      26:   2200            movs    r2, #0
      28:   9d01            ldr     r5, [sp, #4]
      2a:   9c03            ldr     r4, [sp, #12]
      2c:   9e01            ldr     r6, [sp, #4]
      2e:   4322            orrs    r2, r4
      30:   6007            str     r7, [r0, #0]
      32:   600d            str     r5, [r1, #0]
      34:   4610            mov     r0, r2
      36:   4631            mov     r1, r6
      38:   b004            add     sp, #16
      3a:   bcf0            pop     {r4, r5, r6, r7}
      3c:   4770            bx      lr
      3e:   bf00            nop

    只在#2处,添加memory barrier,编译的结果如下,此时虽然是从global timer 的对应内存中读取数据,但是仍然存在问题,指令的乱序问题。我们的设计,应该是先读low,再读upper,但是编译后的结果是先读upper,再读low,导致结果错误,所以考虑在这两条语句间加memory barrier。

    00000000 <read_cycle>:
       0:   f240 0300       movw    r3, #0
       4:   f2c0 0300       movt    r3, #0
       8:   b430            push    {r4, r5}
       a:   681a            ldr     r2, [r3, #0]  @ 读取upper的内存地址
       c:   6814            ldr     r4, [r2, #0]  @ 从对应内存读取upper ,__upper_new = * global_timer_upper
       e:   e000            b.n     12 <read_cycle+0x12>
      10:   4614            mov     r4, r2     @ __upper_old = __upper_new
      12:   e893 0024       ldmia.w r3, {r2, r5}  @ 读取upper和lower的内存地址
      16:   6812            ldr     r2, [r2, #0]  @ 从对应内存读取 upper , __upper_new = * global_timer_upper
      18:   682d            ldr     r5, [r5, #0]  @ 从对应内存读取 low
      1a:   42a2            cmp     r2, r4        @ 比较
      1c:   d1f8            bne.n   10 <read_cycle+0x10>
      1e:   2200            movs    r2, #0
      20:   6005            str     r5, [r0, #0]
      22:   432a            orrs    r2, r5
      24:   600c            str     r4, [r1, #0]
      26:   4610            mov     r0, r2
      28:   4621            mov     r1, r4
      2a:   bc30            pop     {r4, r5}
      2c:   4770            bx      lr
      2e:   bf00            nop

    只在#4处添加memory barrier,编译的结果如下,效果和不加一样。

    00000000 <read_cycle>:
       0:   f240 0300       movw    r3, #0
       4:   f2c0 0300       movt    r3, #0
       8:   b430            push    {r4, r5}
       a:   cb0c            ldmia   r3, {r2, r3}
       c:   6815            ldr     r5, [r2, #0]
       e:   681c            ldr     r4, [r3, #0]
      10:   2200            movs    r2, #0
      12:   6004            str     r4, [r0, #0]
      14:   4322            orrs    r2, r4
      16:   600d            str     r5, [r1, #0]
      18:   4610            mov     r0, r2
      1a:   4629            mov     r1, r5
      1c:   bc30            pop     {r4, r5}
      1e:   4770            bx      lr

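    只在#3处添加memory barrier,编译的结果正确,指令未乱序,而且对low的读取也未被优化。这个结果和memory barrier的作用有关:asm volatile("":::"memory")是一个编译器屏障,它本身不生成任何指令,但会告诉编译器内存内容可能已被修改,因此屏障之前缓存在寄存器中的内存值在屏障之后必须重新读取,并且内存访问不能跨越屏障重排。#3位于循环体内读low和读upper之间,所以这两次读取的先后顺序被固定;又因为每次循环都会经过该屏障,下一次循环中对low的读取也必须重新从内存进行: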

    00000000 <read_cycle>:
       0:   f240 0300       movw    r3, #0
       4:   f2c0 0300       movt    r3, #0
       8:   b430            push    {r4, r5}
       a:   681a            ldr     r2, [r3, #0] @ 读取upper的内存地址
       c:   6814            ldr     r4, [r2, #0] @ 从对应内存中读取upper , __upper_new = * global_timer_upper;
       e:   e000            b.n     12 <read_cycle+0x12>
      10:   4614            mov     r4, r2       @ __upper_old = __upper_new
      12:   685a            ldr     r2, [r3, #4] @ 读取low的内存地址
      14:   6815            ldr     r5, [r2, #0] @ 从对应内存中读取low 
      16:   681a            ldr     r2, [r3, #0] @ 读取upper的内存地址
      18:   6812            ldr     r2, [r2, #0] @ 从对应内存中读取upper
      1a:   42a2            cmp     r2, r4       @ 比较 
      1c:   d1f8            bne.n   10 <read_cycle+0x10>
      1e:   2200            movs    r2, #0
      20:   6005            str     r5, [r0, #0]
      22:   432a            orrs    r2, r5
      24:   600c            str     r4, [r1, #0]
      26:   4610            mov     r0, r2
      28:   4621            mov     r1, r4
      2a:   bc30            pop     {r4, r5}
      2c:   4770            bx      lr
      2e:   bf00            nop

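     只在#0处添加volatile,结果也是正确的。乍看之下,这样似乎只能防止对global_timer_upper/low这两个指针本身读写的优化,而我们要防止的是对这两个指针所指向的存储空间(即*global_timer_upper/low)读写的优化。但实际上,在声明 volatile unsigned long *p 中,volatile修饰的是指针所指向的对象,而不是指针本身(若要修饰指针本身,应写作 unsigned long * volatile p),所以它防止的正是对*global_timer_upper/low读写的优化。从下面的汇编也可以看出,两个指针只在循环外读取了一次,而它们所指向的内存在每次循环中都会重新读取: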

    00000000 <read_cycle>:
       0:   f240 0300       movw    r3, #0
       4:   f2c0 0300       movt    r3, #0
       8:   b470            push    {r4, r5, r6}
       a:   e893 0044       ldmia.w r3, {r2, r6}
       e:   6814            ldr     r4, [r2, #0]
      10:   e000            b.n     14 <read_cycle+0x14>
      12:   461c            mov     r4, r3
      14:   6835            ldr     r5, [r6, #0]
      16:   6813            ldr     r3, [r2, #0]
      18:   42a3            cmp     r3, r4
      1a:   d1fa            bne.n   12 <read_cycle+0x12>
      1c:   2200            movs    r2, #0
      1e:   6005            str     r5, [r0, #0]
      20:   432a            orrs    r2, r5
      22:   600c            str     r4, [r1, #0]
      24:   4610            mov     r0, r2
      26:   4621            mov     r1, r4
      28:   bc70            pop     {r4, r5, r6}
      2a:   4770            bx      lr

     还有一个需要注意的现象:如果函数同时用inline和static修饰,而且此文件中没有其他函数调用此函数,那么obj文件中可能没有此函数的符号。这是正常的:static函数具有内部链接,其他文件无法引用它,所以编译器可以直接丢弃未被使用的函数。可以用nm或者objdump来查看。

  • 相关阅读:
    网络编程学习小结
    我的学习笔记_Windows_HOOK编程 2009-12-03 11:19
    void及void指针含义的深刻解析
    Android开发之自己定义TabHost文字及背景(源码分享)
    ActionBar自己定义改动无效解决方法
    一位Erlang程序猿的自白
    Xcode 5.1安装插件:规范注释生成器VVDocumenter
    Socket程序中的Error#10054错误
    CSDN博客清理缓存
    ACM 位运算
  • 原文地址:https://www.cnblogs.com/godjesse/p/3252053.html
Copyright © 2011-2022 走看看