zoukankan      html  css  js  c++  java
  • A53的mmu配置说明

    1 MMU简介

    1.1 为什么需要MMU

    • MMU的作用是将core发出的虚拟地址VA转换为物理地址PA。在操作系统里,进程的VA可以很大且连续,进程不用考虑PA的实际情况,由MMU完成VA到PA的转换
    • 裸跑程序里,一般不像OS这么复杂,MMU会做平坦映射,即VA=PA

    1.2 裸跑不开MMU行不行

    • 行,但效率很低。
    • 现在的CPU,要想使用Cache,必须使能MMU,MMU页表里有cache访问属性配置。
    • 在ARM里,如果不开MMU,不仅不能开启cache,连内存属性都不是normal,而是device,device属性不允许硬件对AXI总线的信号进行合并、乱序等,效率较低。
    • 所以,一般的CPU启动代码,会较早开启MMU和cache,ATF的BL1(bootrom)就是这样。

    1.3 架构

    如上图

    • MMU集成在每个core里,每个core有1个MMU
    • MMU做VA到PA的转换时,需要转换规则,这个规则叫==页表==
    • 页表保存在memory里,通过寄存器告诉MMU单元页表的存放位置
    • TLB是MMU里的小cache,用于保存已经读取过的页表,这样可以大大提高效率
    • 如果MMU要读取的页表不在TLB里,MMU就要去memory里找页表,这叫Table Walk,类似cache miss,效率较低
    • MMU在读memory时,可以过cache,也可以不过。有寄存器可以配置

    2 A53的MMU细节

    2.1 TTBR0_EL1/2/3和TTBR1_EL1

    • TTBR(Translation Table Base Register)存页表的基地址,MMU据此找页表
    • 为什么EL1有两个,TTBR0_EL1和TTBR1_EL1



      在有OS的系统里,OS的地址映射几乎是不变的,为了提高效率,专门把TTBR1_EL1给kernel用,user用TTBR0_EL1.
    • linux的kernel空间地址以FFFF开头, user空间地址以0000开头,why?
      VA的高bit用于选择TTBR:kernel地址高位为FFFF,MMU会自动用TTBR1_EL1,反之用TTBR0_EL1

    2.2 TCR_EL*

    主要控制:

    • IPS, PA的位宽
    • T1SZ,T0SZ: 决定VA位宽(VA位宽 = 64 - TxSZ)
    • TG0,TG1:最小颗粒度,A53支持4KB和64KB,不支持16KB
    • 还有MMU table walk时的cache属性配置

    2.3 MAIR_EL*


    在ARMv8架构里,页表不再包含具体的cache属性配置,而只包含1个3bit的index,这个index就是指向MAIR_EL*的某一个Attr,每一个Attr可以配置成不同的cache属性。

    2.4 MMU页表


    以4KB颗粒度进行说明:

    • 当VA位宽不超过39bit时,Level0就可以省略,TTBR直接指向level1即可。以此类推
    • block指块,到此结束,不会再有下级。例如l1的block为1G,则只用1个描述符就可以描述1G空间
    • Point to下一级时,为Table描述符,指向下一级的基址

    3 配置实例

    以某处理器裸跑为例:

    • VA = 32bit
    • l1: bit31 - bit30, 4 entries
    • l2: 29 - 21, 512 entries
    • l3: 20 - 12, 512 entries

    • level 1在memory的值:

    level 2在memory的值:

    • level 3在memory的值,E8000000对应的80004200:

    code:

    //----------------------------------------------------------------
    // setup tx511 translation table
    //
    //----------------------------------------------------------------
    #include "v8_mmu.h"
    
        .text
        .cfi_sections .debug_frame  // put stack frame info into .debug_frame instead of .eh_frame
    
    	.global setup_ttb
    	.global ZeroBlock
    
        .global __ttb0_l1
        .global __ttb0_l2_ram
        .global __ttb0_l3_ram_e80
        .global __ttb0_l3_ram_e82
        .global __ttb0_l3_ram_e84
        .global __ttb0_l3_ram_e86
    
    
    //----------------------------------------------------------------
    // setup tx511 translation table
    // level 1 table, 4 entries:
    // 0000 0000 - 3FFF FFFF, 1GB block, DDR
    // 4000 0000 - 7FFF FFFF, 1GB block, DDR
    // 8000 0000 - BFFF FFFF, 1GB block, DDR
    // C000 0000 - FFFF FFFF, point to level2 table
    //
    // level 2 table, 512 entries:
    // C000 0000 - DFFF FFFF, 256 entries, 512MB DDR, 2MB block
    // E000 0000 - E3FF FFFF, 32  entries, 64MB OSPI0 flash, 2MB block
    // E400 0000 - E7FF FFFF, 32  entries, 64MB OSPI1 flash, 2MB block
    //
    // E800 0000 - E81F FFFF, 1 entry, point to level 3_1
    // E820 0000 - E83F FFFF, 1 entry, point to level 3_2
    // E840 0000 - E85F FFFF, 1 entry, point to level 3_3
    // E860 0000 - E87F FFFF, 1 entry, point to level 3_4
    //
    //----------------------------------------------------------------
    
    
        .type setup_ttb, "function"
        .cfi_startproc
    //----------------------------------------------------------------
    // setup_ttb
    //   Clears the six translation tables, then writes the level 1,
    //   level 2 and level 3 descriptors described step by step below.
    //
    //   In:    none
    //   Out:   the following registers are left holding table addresses
    //          x21 = L1 table            (__ttb0_l1)
    //          x22 = L2 table            (__ttb0_l2_ram)
    //          x23 = L3 table, E80xxxxx  (__ttb0_l3_ram_e80)
    //          x24 = L3 table, E82xxxxx  (__ttb0_l3_ram_e82)
    //          x25 = L3 table, E84xxxxx  (__ttb0_l3_ram_e84)
    //          x26 = L3 table, E86xxxxx  (__ttb0_l3_ram_e86)
    //   Clobb: x0-x6, x21-x26, condition flags.
    //          x21-x26 are callee-saved under AAPCS64 and are NOT
    //          preserved here.
    //   Stack: not used. No calls are made (leaf routine).
    //   NOTE(review): no barrier is executed in this routine; confirm
    //   the caller issues a DSB before the MMU is enabled.
    //----------------------------------------------------------------
    setup_ttb:

        //
        // Clear every table before it is populated, so that each entry
        // not written further down reads as zero (an invalid descriptor).
        //
        // ZeroBlock is not called from here: per the original author,
        // nested calls are not supported at this point.
        //
        // Loop shape (same for all six tables):
        //   x0 = write pointer, starts at the table base and moves UP
        //        (post-index store, so the table itself is cleared)
        //   x1 = bytes left, always a multiple of 16
        //
        ldr x21, =__ttb0_l1
        mov x0, x21
        mov x1, #(4 << 3)                   // 4 entries * 8 bytes
    loop_zero_0:
        subs x1, x1, #16
        stp  xzr, xzr, [x0], #16
        b.ne loop_zero_0

        ldr x22, =__ttb0_l2_ram
        mov x1, #(512 << 3)                 // 512 entries * 8 bytes
        mov x0, x22
    loop_zero_1:
        subs x1, x1, #16
        stp  xzr, xzr, [x0], #16
        b.ne loop_zero_1

        ldr x23, =__ttb0_l3_ram_e80
        mov x1, #(512 << 3)
        mov x0, x23
    loop_zero_2:
        subs x1, x1, #16
        stp  xzr, xzr, [x0], #16
        b.ne loop_zero_2

        ldr x24, =__ttb0_l3_ram_e82
        mov x1, #(512 << 3)
        mov x0, x24
    loop_zero_3:
        subs x1, x1, #16
        stp  xzr, xzr, [x0], #16
        b.ne loop_zero_3

        ldr x25, =__ttb0_l3_ram_e84
        mov x1, #(512 << 3)
        mov x0, x25
    loop_zero_4:
        subs x1, x1, #16
        stp  xzr, xzr, [x0], #16
        b.ne loop_zero_4

        ldr x26, =__ttb0_l3_ram_e86
        mov x1, #(512 << 3)
        mov x0, x26
    loop_zero_5:
        subs x1, x1, #16
        stp  xzr, xzr, [x0], #16
        b.ne loop_zero_5

        //
        // L1 entries 0..2: three 1GB block descriptors, DDR
        //   0000 0000 - 3FFF FFFF
        //   4000 0000 - 7FFF FFFF
        //   8000 0000 - BFFF FFFF
        //
        //   x1 = entries left
        //   x2 = address step between consecutive blocks (1GB)
        //   x3 = descriptor to store (output address | attributes)
        //   x4 = write pointer into the L1 table
        //
        // The attribute word uses memory attribute index 1; what that
        // index means is defined by MAIR, which is programmed elsewhere.
        //
        ldr x1, =3
        ldr x2, =0x40000000
        ldr x3, =(0x00000000 | \
                  TT_S1_ATTR_BLOCK | \
                  (1 << TT_S1_ATTR_MATTR_LSB) | \
                  TT_S1_ATTR_NS | \
                  TT_S1_ATTR_AP_RW_PL1 | \
                  TT_S1_ATTR_SH_INNER | \
                  TT_S1_ATTR_AF | \
                  TT_S1_ATTR_nG)
        mov x4, x21

    loop1:
        str x3, [x4], #8
        add x3, x3, x2
        subs x1, x1, #1
        bne loop1

        //
        // L1 entry 3: C000 0000 - FFFF FFFF, pointer to the L2 table.
        // x4 already points at entry 3 after the loop above.
        // An entry that points at a next-level table is a table
        // descriptor, so the TABLE name is used here exactly as for the
        // L2 -> L3 pointers further down.
        //
        orr x1, x22, #TT_S1_ATTR_TABLE
        str x1, [x4]


        //
        // L2 entries 0..255: C000 0000 - DFFF FFFF, 512MB DDR, 2MB blocks
        // x4 = write pointer into the L2 table; it is NOT reset between
        // the three L2 block loops, so the regions are laid out back to
        // back in the table.
        //
        ldr x1, =256
        ldr x2, =0x200000
        ldr x3, =(0xC0000000 | \
                  TT_S1_ATTR_BLOCK | \
                  (1 << TT_S1_ATTR_MATTR_LSB) | \
                  TT_S1_ATTR_NS | \
                  TT_S1_ATTR_AP_RW_PL1 | \
                  TT_S1_ATTR_SH_INNER | \
                  TT_S1_ATTR_AF | \
                  TT_S1_ATTR_nG)
        mov x4, x22
    loop2_ddr:
        str x3, [x4], #8
        add x3, x3, x2
        subs x1, x1, #1
        bne loop2_ddr

        // L2 entries 256..287: E000 0000 - E3FF FFFF, 64MB OSPI0 flash, 2MB blocks
        ldr x1, =32
        ldr x2, =0x200000
        ldr x3, =(0xE0000000 | \
                  TT_S1_ATTR_BLOCK | \
                  (1 << TT_S1_ATTR_MATTR_LSB) | \
                  TT_S1_ATTR_NS | \
                  TT_S1_ATTR_AP_RW_PL1 | \
                  TT_S1_ATTR_SH_INNER | \
                  TT_S1_ATTR_AF | \
                  TT_S1_ATTR_nG)

    loop2_ospi0:
        str x3, [x4], #8
        add x3, x3, x2
        subs x1, x1, #1
        bne loop2_ospi0

        // L2 entries 288..319: E400 0000 - E7FF FFFF, 64MB OSPI1 flash, 2MB blocks
        ldr x1, =32
        ldr x2, =0x200000
        ldr x3, =(0xE4000000 | \
                  TT_S1_ATTR_BLOCK | \
                  (1 << TT_S1_ATTR_MATTR_LSB) | \
                  TT_S1_ATTR_NS | \
                  TT_S1_ATTR_AP_RW_PL1 | \
                  TT_S1_ATTR_SH_INNER | \
                  TT_S1_ATTR_AF | \
                  TT_S1_ATTR_nG)
    loop2_ospi1:
        str x3, [x4], #8
        add x3, x3, x2
        subs x1, x1, #1
        bne loop2_ospi1

        //
        // L2 -> L3 pointers. For each 2MB window:
        //   x1 = table descriptor (L3 table address | TABLE)
        //   x2 = first virtual address of the window
        //   x3 = L2 index = VA[29:21]
        //

        // E800 0000 - E81F FFFF, 1 entry, point to level 3_1 (x23)
        orr x1, x23, #TT_S1_ATTR_TABLE
        ldr x2, =0xE8000000
        ubfx x3, x2, #21, #9
        str x1, [x22, x3, lsl #3]

        // E820 0000 - E83F FFFF, 1 entry, point to level 3_2 (x24)
        orr x1, x24, #TT_S1_ATTR_TABLE
        ldr x2, =0xE8200000
        ubfx x3, x2, #21, #9
        str x1, [x22, x3, lsl #3]

        // E840 0000 - E85F FFFF, 1 entry, point to level 3_3 (x25)
        orr x1, x25, #TT_S1_ATTR_TABLE
        ldr x2, =0xE8400000
        ubfx x3, x2, #21, #9
        str x1, [x22, x3, lsl #3]

        // E860 0000 - E87F FFFF, 1 entry, point to level 3_4 (x26)
        orr x1, x26, #TT_S1_ATTR_TABLE
        ldr x2, =0xE8600000
        ubfx x3, x2, #21, #9
        str x1, [x22, x3, lsl #3]


        //
        // L3 page loops. Register roles (same in all four loops):
        //   x1 = page size step (4KB)
        //   x2 = pages left
        //   x3 = current address (used as both VA and output address)
        //   x4 = page attribute bits
        //   x5 = L3 index = VA[20:12]
        //   x6 = descriptor = x3 | x4
        //

        // level 3 table: E800 0000 - E81F FFFF, 512 entry, x23
        // E800 0000 - E803 FFFF, 256kB on-chip-sram
        // NOTE(review): the count below maps 4 pages (16KB), which does
        // not match the 256kB range stated above - confirm which one is
        // intended.
        ldr x1, =0x1000
        ldr x2, =4
        ldr x3, =0xE8000000
        ldr x4, =(TT_S1_ATTR_PAGE | \
                  (1 << TT_S1_ATTR_MATTR_LSB) | \
                  TT_S1_ATTR_NS | \
                  TT_S1_ATTR_AP_RW_PL1 | \
                  TT_S1_ATTR_SH_INNER | \
                  TT_S1_ATTR_AF | \
                  TT_S1_ATTR_nG)
    loop3_sram:
        ubfx x5, x3, #12, #9
        orr x6, x3, x4
        str x6, [x23, x5, lsl #3]
        add x3, x3, x1
        subs x2, x2, #1
        bne loop3_sram

        // level 3 table: E820 0000 - E83F FFFF, 512 entry, x24
        // valid addr  E820 0000 - E838 6FFF
        // Uses memory attribute index 2 (meaning defined by MAIR, set elsewhere).
        ldr x1, =0x1000
        ldr x2, =((0xE8386FFF + 1 - 0xE8200000) >> 12)
        ldr x3, =0xE8200000
        ldr x4, =(TT_S1_ATTR_PAGE | \
                  (2 << TT_S1_ATTR_MATTR_LSB) | \
                  TT_S1_ATTR_NS | \
                  TT_S1_ATTR_AP_RW_PL1 | \
                  TT_S1_ATTR_SH_INNER | \
                  TT_S1_ATTR_AF | \
                  TT_S1_ATTR_nG)
    loop3_Top_ahb:
        ubfx x5, x3, #12, #9
        orr x6, x3, x4
        str x6, [x24, x5, lsl #3]
        add x3, x3, x1
        subs x2, x2, #1
        bne loop3_Top_ahb

        // level 3 table: E840 0000 - E85F FFFF, 512 entry, x25
        // valid addr  E840 0000 - E850 FFFF
        ldr x1, =0x1000
        ldr x2, =((0xE850FFFF + 1 - 0xE8400000) >> 12)
        ldr x3, =0xE8400000
        ldr x4, =(TT_S1_ATTR_PAGE | \
                  (2 << TT_S1_ATTR_MATTR_LSB) | \
                  TT_S1_ATTR_NS | \
                  TT_S1_ATTR_AP_RW_PL1 | \
                  TT_S1_ATTR_SH_INNER | \
                  TT_S1_ATTR_AF | \
                  TT_S1_ATTR_nG)
    loop3_lp_hp_gic_ddr_ahb:
        ubfx x5, x3, #12, #9
        orr x6, x3, x4
        str x6, [x25, x5, lsl #3]
        add x3, x3, x1
        subs x2, x2, #1
        bne loop3_lp_hp_gic_ddr_ahb

        // level 3 table: E860 0000 - E87F FFFF, 512 entry, x26
        // valid addr  E860 0000 - E869 4FFF
        ldr x1, =0x1000
        ldr x2, =((0xE8694FFF + 1 - 0xE8600000) >> 12)
        ldr x3, =0xE8600000
        ldr x4, =(TT_S1_ATTR_PAGE | \
                  (2 << TT_S1_ATTR_MATTR_LSB) | \
                  TT_S1_ATTR_NS | \
                  TT_S1_ATTR_AP_RW_PL1 | \
                  TT_S1_ATTR_SH_INNER | \
                  TT_S1_ATTR_AF | \
                  TT_S1_ATTR_nG)
    loop3_vo_vi_ahb:
        ubfx x5, x3, #12, #9
        orr x6, x3, x4
        str x6, [x26, x5, lsl #3]
        add x3, x3, x1
        subs x2, x2, #1
        bne loop3_vo_vi_ahb


        ret
        .cfi_endproc
        .size setup_ttb, . - setup_ttb
  • 相关阅读:
    [转]sql语句优化原则
    [Effective C++]构造/析构/赋值运算
    [Effective C++]定制new和delete
    [Effective C++]让自己习惯C++
    [C++ primer]联合:节省空间的类
    [C++ primer]类成员指针
    [C++ primer]运行时类型识别(RTTI)
    [APUE]fork后子进程的运行情况
    [C++ primer]优化内存分配
    [C++ primer]虚函数和纯虚函数
  • 原文地址:https://www.cnblogs.com/liuwanpeng/p/14372976.html
Copyright © 2011-2022 走看看