物理布局探测

内存管理是 Linux 非常重要也非常复杂的一个环节, 本节我们从最基本的物理内存布局探测, 到 memblock NUMA slab 等复杂的内存管理方案做一些深入的分析

对于一个操作系统, 在启动之初有两个非常关键的问题

获取设备总内存

内存总大小等信息作为设备的关键信息, 在硬件启动初期就由固件(BIOS)探测并保存, 操作系统只需要通过约定的接口读取即可, 这个接口就是 BIOS 中断

在 x86 平台上, 探测物理内存布局使用的 BIOS 中断向量是 0x15, 根据 ax 寄存器(0x88 方式使用 ah)中功能号的不同, 有三种常见的方式: 0xe820, 0xe801 和 0x88.

// arch/x86/boot/main.c
/*
 * Excerpt of main() from arch/x86/boot/main.c. Only the memory-detection
 * step is shown; the elided parts are marked with "// ...".
 *
 * The parameter list is spelled (void): in C11 an empty "()" declares a
 * function with unspecified parameters, and every other function in these
 * notes uses the explicit (void) form.
 */
void main(void)
{
    // ...
    /* Detect memory layout */
    detect_memory();
    // ...
}

detect_memory 中依次调用三种 BIOS 中断方式, 尝试从 BIOS 获取物理内存布局, 这三个函数内部都通过 boot_params 将内存信息带出来

boot_params 由 boot.h 引入, extern struct boot_params boot_params

struct boot_params 定义很长, 位于 arch/x86/include/uapi/asm/bootparam.h

// arch/x86/boot/memory.c
/*
 * Probe the physical memory layout through the BIOS.
 *
 * The three helpers are always called once each, in this fixed order; no
 * helper is skipped based on the outcome of an earlier one.
 */
void detect_memory(void)
{
    /* INT 0x15, AX=0xe820 */
    detect_memory_e820();

    /* INT 0x15, AX=0xe801 */
    detect_memory_e801();

    /* INT 0x15, AH=0x88 */
    detect_memory_88();
}

其中 e820 方式需要将 AX 寄存器设置为功能号 0xe820, 其输入/输出寄存器约定见下面代码中的注释

// arch/x86/boot/memory.c
#define SMAP    0x534d4150  /* ASCII "SMAP" */

/*
 * detect_memory_e820() - fill boot_params.e820_table via INT 0x15, AX=0xE820.
 *
 * Calls the BIOS in a loop. Each successful call leaves one entry in a
 * scratch buffer, which is then copied into the next slot of
 * boot_params.e820_table. On exit the number of copied entries is stored in
 * boot_params.e820_entries; it is 0 when the first call fails or when the
 * BIOS stops returning the SMAP signature part-way through.
 *
 * BIOS call input registers:
 *   INT 15h, AX = E820h
 *   EAX = 0000E820h
 *   EDX = 534D4150h ('SMAP')
 *   EBX = continuation value or 00000000h to start at beginning of map
 *   ECX = size of buffer for result, in bytes (should be >= 20 bytes)
 *   ES:DI -> buffer for result (see #00581)
 */
static void detect_memory_e820(void)
{
    int count = 0;
    struct biosregs ireg, oreg;
    struct boot_e820_entry *desc = boot_params.e820_table;
    static struct boot_e820_entry buf; /* static so it is zeroed */

    /* Build the request; EBX is not set here, only updated inside the loop. */
    initregs(&ireg);
    ireg.ax  = 0xe820;
    ireg.cx  = sizeof(buf);
    ireg.edx = SMAP;
    ireg.di  = (size_t)&buf;

    /*
     * Note: at least one BIOS is known which assumes that the
     * buffer pointed to by one e820 call is the same one as
     * the previous call, and only changes modified fields.  Therefore,
     * we use a temporary buffer and copy the results entry by entry.
     *
     * This routine deliberately does not try to account for
     * ACPI 3+ extended attributes.  This is because there are
     * BIOSes in the field which report zero for the valid bit for
     * all ranges, and we don't currently make any use of the
     * other attribute bits.  Revisit this if we see the extended
     * attribute bits deployed in a meaningful way in the future.
     */


    // Output:
    // CF clear if successful
    // EAX = 534D4150h ('SMAP')
    // ES:DI buffer filled
    // EBX = next offset from which to copy or 00000000h if all done
    // ECX = actual length returned in bytes
    // CF set on error
    // AH = error code (86h) (see #00496 at INT 15/AH=80h)
    do {
        intcall(0x15, &ireg, &oreg);
        ireg.ebx = oreg.ebx; /* for next iteration... */

        /* BIOSes which terminate the chain with CF = 1 as opposed
           to %ebx = 0 don't always report the SMAP signature on
           the final, failing, probe. */
        /* Hence CF is tested before the signature; entries copied so far are kept. */
        if (oreg.eflags & X86_EFLAGS_CF)
            break;

        /* Some BIOSes stop returning SMAP in the middle of
           the search loop.  We don't know exactly how the BIOS
           screwed up the map at that point, we might have a
           partial map, the full map, or complete garbage, so
           just return failure. */
        if (oreg.eax != SMAP) {
            count = 0; /* report zero entries: everything copied so far is discarded */
            break;
        }

        /* Copy the scratch entry into the table and advance to the next slot. */
        *desc++ = buf;
        count++;
    /* Continue while the BIOS returned a non-zero EBX and the table has room. */
    } while (ireg.ebx && count < ARRAY_SIZE(boot_params.e820_table));

    boot_params.e820_entries = count;
}

/*
 * Query INT 0x15, AX=0xe801 and, when the answer looks sane, record it in
 * boot_params.alt_mem_k.
 *
 * alt_mem_k is left untouched when the BIOS signals an error through the
 * carry flag or when the low count exceeds 15*1024.
 */
static void detect_memory_e801(void)
{
    struct biosregs oreg;
    struct biosregs ireg;

    initregs(&ireg);
    ireg.ax = 0xe801;
    intcall(0x15, &ireg, &oreg);

    /* Carry set: the call failed, nothing to record. */
    if (oreg.eflags & X86_EFLAGS_CF)
        return;

    /*
     * When either CX or DX is non-zero, that pair replaces AX/BX.
     * (The upstream comment questions whether this is really needed.)
     */
    if (oreg.cx || oreg.dx) {
        oreg.ax = oreg.cx;
        oreg.bx = oreg.dx;
    }

    /* A low count above 15*1024 is bogus. */
    if (oreg.ax > 15*1024)
        return;

    if (oreg.ax != 15*1024) {
        /*
         * This ignores memory above 16MB if we have a memory
         * hole there.  If someone actually finds a machine
         * with a memory hole at 16MB and no support for
         * 0E820h they should probably generate a fake e820
         * map.
         */
        boot_params.alt_mem_k = oreg.ax;
        return;
    }

    /* Low range is full: add the high count, scaled by 64, on top of it. */
    boot_params.alt_mem_k = (oreg.bx << 6) + oreg.ax;
}

/*
 * Query INT 0x15, AH=0x88 and store the AX value returned by the BIOS in
 * boot_params.screen_info.ext_mem_k. The carry flag is not examined, so the
 * field is written unconditionally.
 */
static void detect_memory_88(void)
{
    struct biosregs oreg;
    struct biosregs ireg;

    initregs(&ireg);
    ireg.ah = 0x88;

    intcall(0x15, &ireg, &oreg);
    boot_params.screen_info.ext_mem_k = oreg.ax;
}

20230628161738

memblock

memblock 子系统主要用于引导过程中的物理内存管理,特别是在早期的启动阶段,当内核尚未完全初始化和建立内存管理器时.一旦内核初始化完成,memblock 子系统的功能通常会被更高级的内存管理机制所取代,如 buddy allocator(伙伴系统)或 slab allocator(SLAB 系统)

20230628182631

memblock 的功能主要包括

/**
 * struct memblock - memblock allocator metadata
 * @bottom_up: is bottom up direction? Tells whether the recorded memory grows
 *             from the bottom towards the top.
 * @current_limit: physical address of the current allocation limit, i.e. the
 *                 upper bound of the physical addresses this allocator
 *                 currently manages.
 * @memory: usable memory regions: memory available to the operating system,
 *          i.e. the regions flagged as usable by the E820 layout probe.
 * @reserved: reserved memory regions: memory set aside during boot, covering
 *            both the regions flagged as reserved by the E820 layout probe and
 *            the regions already handed out during the boot stage.
 */
struct memblock {
    bool bottom_up;  /* is bottom up direction? */
    phys_addr_t current_limit;
    struct memblock_type memory;
    struct memblock_type reserved;
};
/**
 * struct memblock_type - collection of memory regions of certain type
 * @cnt: number of regions, i.e. how many memblock_region entries are recorded
 * @max: size of the allocated array: the maximum number of regions that can
 *       be used; the allocator expands the array when the slots run out
 * @total_size: size of all regions, i.e. the sum of the sizes of every region
 * @regions: array of regions; each entry describes one usable or reserved
 *           memory region
 * @name: the memory type symbolic name, e.g. "memory" or "reserved"
 */
struct memblock_type {
    unsigned long cnt;
    unsigned long max;
    phys_addr_t total_size;
    struct memblock_region *regions;
    char *name;
};
/**
 * struct memblock_region - represents a memory region
 * @base: base address of the region (start address). The notes describe the
 *        underlying type as u64 or u32, i.e. the widest address supported on
 *        64-bit / 32-bit architectures.
 * @size: size of the region
 * @flags: memory region attributes, taken from enum memblock_flags, e.g.
 *         MEMBLOCK_NONE (ordinary memory), MEMBLOCK_HOTPLUG (hot-pluggable
 *         memory), MEMBLOCK_MIRROR (mirrored memory), MEMBLOCK_NOMAP (memory
 *         not added to the kernel direct mapping). Adjacent regions of the
 *         same type can be merged when conditions allow.
 * @nid: NUMA node id (NUMA-related details are left out of these notes for now)
 */
struct memblock_region {
    phys_addr_t base;
    phys_addr_t size;
    enum memblock_flags flags;
#ifdef CONFIG_NUMA
    int nid; /* only present when CONFIG_NUMA is defined */
#endif
};


/**
 * enum memblock_flags - attribute bits describing a memory region
 * @MEMBLOCK_NONE: no attribute requested
 * @MEMBLOCK_HOTPLUG: region that the firmware-provided memory map reported,
 * during early boot, as hot(un)pluggable system RAM (for instance a range
 * that may be hot-unplugged later). When "movable_node" is given on the
 * kernel command line, the kernel tries to keep such a region
 * hot-unpluggable. Not used for memblocks added ("hotplugged") after early
 * boot.
 * @MEMBLOCK_MIRROR: region backed by mirrored memory
 * @MEMBLOCK_NOMAP: region that is kept out of the kernel direct mapping and
 * handled as reserved in the memory map; the memblock_mark_nomap()
 * description has the details
 * @MEMBLOCK_DRIVER_MANAGED: region that is always discovered and added by a
 * driver and never shows up as system RAM in the firmware-provided memory
 * map. Counterpart of IORESOURCE_SYSRAM_DRIVER_MANAGED in the kernel
 * resource tree.
 */
enum memblock_flags {
    MEMBLOCK_NONE           = 0,        /* no attribute bits set */
    MEMBLOCK_HOTPLUG        = 1 << 0,   /* 0x1: hotpluggable region */
    MEMBLOCK_MIRROR         = 1 << 1,   /* 0x2: mirrored region */
    MEMBLOCK_NOMAP          = 1 << 2,   /* 0x4: kept out of the kernel direct mapping */
    MEMBLOCK_DRIVER_MANAGED = 1 << 3,   /* 0x8: always detected via a driver */
};

总结

操作系统通过 BIOS 0x15 中断获取设备的物理内存布局, 常见的功能号有 0xe820、0xe801 和 0x88 三种; 其中只有类型为 usable 的内存才能被操作系统所使用

memblock 子系统主要用于引导过程中的物理内存管理

参考

zood