内存管理是 Linux 中非常重要也非常复杂的一个环节。本节我们从最基本的物理内存布局探测开始, 到 memblock、NUMA、slab 等较复杂的内存管理方案, 逐一做一些深入的分析。
对于一个操作系统, 在启动之初有两个非常关键的问题: 一是如何获知设备的物理内存总大小与布局, 二是设备上的所有内存是否都能被操作系统使用。
内存总大小等信息作为设备的关键信息, 应该在硬件启动初期就由固件(BIOS)探测并保存, 操作系统只需要按照约定的接口读取即可, 这个接口就是 BIOS 中断。
在 x86 芯片中, 探测物理内存布局用的 BIOS 中断向量是 0x15, 根据 ax 寄存器值的不同, 有三种常见的方式: 0xe820、0xe801 和 0x88。
// arch/x86/boot/main.c
/*
 * main() from arch/x86/boot/main.c, abridged: steps that are not relevant
 * here are elided and marked with "// ...". The only step shown is the
 * memory-layout probe.
 *
 * Takes no arguments and returns nothing.
 */
void main(void)
{
	// ...

	/* Detect memory layout */
	detect_memory();

	// ...
}
detect_memory 中依次发起三种 BIOS 中断调用, 尝试向 BIOS 获取物理内存布局; 这三个函数内部都通过 boot_params 将内存信息带出来。
boot_params 由 boot.h 引入:
/* Global boot parameter block that the memory probes write their results into. */
extern struct boot_params boot_params;
struct boot_params 定义很长, 位于 arch/x86/include/uapi/asm/bootparam.h
// arch/x86/boot/memory.c
/*
 * Run every supported BIOS memory probe, one after another. Each helper
 * is called unconditionally, in the order E820h, E801h, 88h.
 */
void detect_memory(void)
{
	/* Physical memory layout via the e820 BIOS interrupt. */
	detect_memory_e820();

	/* Physical memory layout via the e801 BIOS interrupt. */
	detect_memory_e801();

	/* Physical memory layout via the 88 BIOS interrupt. */
	detect_memory_88();
}
其中 e820 方式需要将 AX 寄存器设置为功能号 0xe820:
// arch/x86/boot/memory.c
#define SMAP 0x534d4150 /* ASCII "SMAP": 'S'=0x53, 'M'=0x4d, 'A'=0x41, 'P'=0x50 */
/*
 * detect_memory_e820() - collect the memory map through INT 15h, AX=E820h.
 *
 * Calls the BIOS repeatedly, copying one entry per successful call from a
 * bounce buffer into boot_params.e820_table, and finally stores the number
 * of collected entries in boot_params.e820_entries. The count is reset to 0
 * if a call returns without the SMAP signature in EAX.
 *
 * Register contract of the BIOS call, input side:
 */
// Input:
// AX = E820h
// EAX = 0000E820h
// EDX = 534D4150h ('SMAP')
// EBX = continuation value or 00000000h to start at beginning of map
// ECX = size of buffer for result, in bytes (should be >= 20 bytes)
// ES:DI -> buffer for result (see #00581)
// int 0x15
static void detect_memory_e820(void)
{
int count = 0;
struct biosregs ireg, oreg;
struct boot_e820_entry *desc = boot_params.e820_table;
static struct boot_e820_entry buf; /* static so it is zeroed */
/* Set up the input registers once; only EBX changes between calls. */
initregs(&ireg);
ireg.ax = 0xe820;
ireg.cx = sizeof(buf);
ireg.edx = SMAP;
/* DI = address of the bounce buffer (ES is presumably prepared by initregs() - TODO confirm). */
ireg.di = (size_t)&buf;
/*
* Note: at least one BIOS is known which assumes that the
* buffer pointed to by one e820 call is the same one as
* the previous call, and only changes modified fields. Therefore,
* we use a temporary buffer and copy the results entry by entry.
*
* This routine deliberately does not try to account for
* ACPI 3+ extended attributes. This is because there are
* BIOSes in the field which report zero for the valid bit for
* all ranges, and we don't currently make any use of the
* other attribute bits. Revisit this if we see the extended
* attribute bits deployed in a meaningful way in the future.
*/
// Output:
// CF clear if successful
// EAX = 534D4150h ('SMAP')
// ES:DI buffer filled
// EBX = next offset from which to copy or 00000000h if all done
// ECX = actual length returned in bytes
// CF set on error
// AH = error code (86h) (see #00496 at INT 15/AH=80h)
do {
intcall(0x15, &ireg, &oreg);
ireg.ebx = oreg.ebx; /* for next iteration... */
/* BIOSes which terminate the chain with CF = 1 as opposed
to %ebx = 0 don't always report the SMAP signature on
the final, failing, probe. */
/* Hence CF is tested before the signature; entries gathered so far are kept. */
if (oreg.eflags & X86_EFLAGS_CF)
break;
/* Some BIOSes stop returning SMAP in the middle of
the search loop. We don't know exactly how the BIOS
screwed up the map at that point, we might have a
partial map, the full map, or complete garbage, so
just return failure. */
if (oreg.eax != SMAP) {
count = 0;
break;
}
/* Copy the bounce buffer into the next table slot and advance. */
*desc++ = buf;
count++;
/* Stop when the BIOS returns EBX == 0 or the table is full. */
} while (ireg.ebx && count < ARRAY_SIZE(boot_params.e820_table));
boot_params.e820_entries = count;
}
static void detect_memory_e801(void)
{
struct biosregs ireg, oreg;
initregs(&ireg);
ireg.ax = 0xe801;
intcall(0x15, &ireg, &oreg);
if (oreg.eflags & X86_EFLAGS_CF)
return;
/* Do we really need to do this? */
if (oreg.cx || oreg.dx) {
oreg.ax = oreg.cx;
oreg.bx = oreg.dx;
}
if (oreg.ax > 15*1024) {
return; /* Bogus! */
} else if (oreg.ax == 15*1024) {
boot_params.alt_mem_k = (oreg.bx << 6) + oreg.ax;
} else {
/*
* This ignores memory above 16MB if we have a memory
* hole there. If someone actually finds a machine
* with a memory hole at 16MB and no support for
* 0E820h they should probably generate a fake e820
* map.
*/
boot_params.alt_mem_k = oreg.ax;
}
}
static void detect_memory_88(void)
{
struct biosregs ireg, oreg;
initregs(&ireg);
ireg.ah = 0x88;
intcall(0x15, &ireg, &oreg);
boot_params.screen_info.ext_mem_k = oreg.ax;
}
答: 通过 BIOS 0x15 中断, 常见的功能号有 0xE820、0xE801 和 0x88。
答: 不是的, 只有内存类型为 usable 的才能被操作系统所使用。
memblock 子系统主要用于引导过程中的物理内存管理,特别是在早期的启动阶段,当内核尚未完全初始化和建立内存管理器时.一旦内核初始化完成,memblock 子系统的功能通常会被更高级的内存管理机制所取代,如 buddy allocator(伙伴系统)或 slab allocator(SLAB 系统)
memblock 的功能主要包括
/**
 * struct memblock - memblock allocator metadata
 * @bottom_up: is bottom up direction? NOTE(review): taken to mean that
 *	memory is handed out from low addresses upward when set - confirm
 *	against the allocator code.
 * @current_limit: physical address of the current allocation limit, i.e.
 *	the upper bound of physical addresses this allocator manages
 * @memory: usable memory regions. NOTE(review): understood to be the
 *	ranges an E820 probe reports as usable - confirm.
 * @reserved: reserved memory regions. NOTE(review): understood to cover
 *	both ranges an E820 probe reports as reserved and memory already
 *	handed out during boot - confirm.
 */
struct memblock {
bool bottom_up; /* is bottom up direction? */
phys_addr_t current_limit;
struct memblock_type memory;
struct memblock_type reserved;
};
/**
 * struct memblock_type - collection of memory regions of certain type
 * @cnt: number of regions (struct memblock_region entries) currently recorded
 * @max: size of the allocated array, i.e. how many regions fit at most.
 *	NOTE(review): the array is reportedly grown by the allocator when it
 *	runs out of slots - confirm.
 * @total_size: size of all regions, summed
 * @regions: array of regions; each element describes one usable or
 *	reserved memory range
 * @name: the memory type symbolic name, e.g. "memory" or "reserved"
 */
struct memblock_type {
unsigned long cnt;
unsigned long max;
phys_addr_t total_size;
struct memblock_region *regions;
char *name;
};
/**
 * struct memblock_region - represents a memory region
 * @base: base address of the region. The type is phys_addr_t, presumably
 *	32 or 64 bits wide depending on the architecture - TODO confirm.
 * @size: size of the region
 * @flags: memory region attributes, one of enum memblock_flags:
 *	MEMBLOCK_NONE (ordinary memory), MEMBLOCK_HOTPLUG (hot-pluggable
 *	memory), MEMBLOCK_MIRROR (mirrored memory), MEMBLOCK_NOMAP (not part
 *	of the kernel direct mapping) and MEMBLOCK_DRIVER_MANAGED (added via
 *	a driver). NOTE(review): adjacent regions with the same attributes
 *	are reportedly merged when conditions allow - confirm.
 * @nid: NUMA node id; only present when CONFIG_NUMA is defined
 */
struct memblock_region {
phys_addr_t base;
phys_addr_t size;
enum memblock_flags flags;
#ifdef CONFIG_NUMA
int nid;
#endif
};
/**
 * enum memblock_flags - attribute bits of a memory region
 * @MEMBLOCK_NONE: no attribute set, nothing special requested
 * @MEMBLOCK_HOTPLUG: the firmware-provided memory map marked this region
 * during early boot as hot(un)pluggable system RAM (e.g., a range that
 * might get hotunplugged later). With "movable_node" on the kernel
 * commandline, try keeping the region hotunpluggable. Does not apply to
 * memblocks added ("hotplugged") after early boot.
 * @MEMBLOCK_MIRROR: the region is mirrored
 * @MEMBLOCK_NOMAP: keep the region out of the kernel direct mapping and
 * treat it as reserved in the memory map; see the description of
 * memblock_mark_nomap() for further details
 * @MEMBLOCK_DRIVER_MANAGED: the region is always detected and added via a
 * driver and never shows up as system RAM in the firmware-provided memory
 * map. Corresponds to IORESOURCE_SYSRAM_DRIVER_MANAGED in the kernel
 * resource tree.
 *
 * Every non-zero enumerator occupies its own bit.
 */
enum memblock_flags {
	MEMBLOCK_NONE		= 0,		/* no special request */
	MEMBLOCK_HOTPLUG	= 1 << 0,	/* hotpluggable region */
	MEMBLOCK_MIRROR		= 1 << 1,	/* mirrored region */
	MEMBLOCK_NOMAP		= 1 << 2,	/* don't add to kernel direct mapping */
	MEMBLOCK_DRIVER_MANAGED	= 1 << 3,	/* always detected via a driver */
};
操作系统通过 BIOS 0x15 中断(常见功能号为 0xE820、0xE801 和 0x88)获取物理内存的布局与总大小; 其中只有类型为 usable 的内存才能被操作系统所使用。
memblock 子系统主要用于引导过程中的物理内存管理