linux 内存地址空间管理 mm_struct

215 atomic_t mm_users;
/* How many users with user space? */
216 atomic_t mm_count;
/* How many references to "struct mm_struct" (users count as 1) */

这两个 counter 乍看好像差不多，那 Linux 使用中有什么区别呢？看代码就是最好的解释了。

 681static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
 682{
 683        struct mm_struct * mm, *oldmm;
 684        int retval;

 692        tsk->mm = NULL;
 693        tsk->active_mm = NULL;
 694
 695        /*
 696         * Are we cloning a kernel thread?
 697         *
 698         * We need to steal a active VM for that..
 699         */
 700        oldmm = current->mm;
 701        if (!oldmm)
 702                return 0;
 703
 704        if (clone_flags & CLONE_VM) {
 705                atomic_inc(&oldmm->mm_users);
 706                mm = oldmm;
 707                goto good_mm;
 708        }

无论我们调用 fork、vfork 还是 clone，最终都会调用 do_fork 函数，区别在于 vfork 和（创建线程时的）clone 会给 copy_mm 传入一个 CLONE_VM 的 flag，这个标识表示父子进程都运行在同一个'虚拟地址空间'上面（在 Linux 中称之为 lightweight process 或者线程），当然也就共享同样的物理地址空间（Page Frames）。

copy_mm 函数中，如果创建线程中有 CLONE_VM 标识，则表示父子进程共享地址空间和同一个内存描述符，并且只需要将 mm_users 值 + 1，也就是说 mm_users 表示正在引用该地址空间的 thread 数目，是一个 thread level 的 counter。

mm_count 呢？mm_count 的理解有点复杂。

对 Linux 来说，用户进程和内核线程（kernel thread）都是 task_struct 的实例，唯一的区别是 kernel thread 没有进程地址空间，也没有 mm 描述符，所以内核线程的 tsk->mm 域是空（NULL）。内核 scheduler 在进程 context switching 的时候，会根据 tsk->mm 判断即将调度的进程是用户进程还是内核线程。虽然 kernel thread 不用访问用户进程地址空间，但是仍然需要 page table 来访问 kernel 自己的空间。幸运的是，对于任何用户进程来说，它们的内核空间都是 100% 相同的，所以内核可以'borrow'上一个被调度的用户进程的 mm 中的页表来访问内核地址，这个 mm 就记录在 active_mm 中。

简而言之就是，对于 kernel thread,tsk->mm == NULL 表示自己内核线程的身份，而 tsk->active_mm 是借用上一个用户进程的 mm，用 mm 的 page table 来访问内核空间。对于用户进程，tsk->mm == tsk->active_mm。

为了支持这个特性，mm_struct 里面引入了另外一个 counter，mm_count。刚才说过 mm_users 表示这个进程地址空间被多少线程共享或者引用，而 mm_count 则表示这个地址空间被内核线程引用的次数 + 1。

比如一个进程 A 有 3 个线程，那么这个 A 的 mm_struct 的 mm_users 值为 3，但是 mm_count 为 1，所以 mm_count 是 process level 的 counter。维护 2 个 counter 有何用处呢？考虑这样的 scenario，内核调度完 A 以后，切换到内核线程 B，B 'borrow' A 的 mm 描述符以访问内核空间，这时 mm_count 变成了 2，同时另外一个 cpu core 调度了 A 并且进程 A 退出（exit），这个时候 mm_users 变为了 0，mm_count 变为了 1，但是内核不会因为 mm_users==0 而销毁这个 mm_struct，内核只会当 mm_count==0 的时候才会释放 mm_struct，因为这个时候既没有用户进程使用这个地址空间，也没有内核线程引用这个地址空间。

 449static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
 450{
 451        atomic_set(&mm->mm_users, 1);
 452        atomic_set(&mm->mm_count, 1);

在初始化一个 mm 实例的时候，mm_users 和 mm_count 都被初始化为 1。

2994/*
2995 * context_switch - switch to the new MM and the new
2996 * thread's register state.
2997 */
2998static inline void
2999context_switch(struct rq *rq, struct task_struct *prev,
3000               struct task_struct *next)
3001{
3002        struct mm_struct *mm, *oldmm;
3003
3004        prepare_task_switch(rq, prev, next);
3005        trace_sched_switch(rq, prev, next);
3006        mm = next->mm;
3007        oldmm = prev->active_mm;
3014
3015        if (unlikely(!mm)) {
3016                next->active_mm = oldmm;
3017                atomic_inc(&oldmm->mm_count);
3018                enter_lazy_tlb(oldmm, next);
3019        } else
3020                switch_mm(oldmm, mm, next);
3021

上面的代码是 Linux scheduler 进行 context switch 的一小段，从 unlikely(!mm) 开始，next->active_mm = oldmm 表示如果将要切换到内核线程，则'借用'前一个用户进程的 mm 描述符，并把它赋给 active_mm，重点是将'借用'的 mm 描述符的 mm_count 加 1。

下面我们看看在 fork 一个进程的时候，是怎样处理 mm_struct 的。

1362/*
1363 *  Ok, this is the main fork-routine.
1364 *
1365 * It copies the process, and if successful kick-starts
1366 * it and waits for it to finish using the VM if required.
1367 */
1368long do_fork(unsigned long clone_flags,
1369              unsigned long stack_start,
1370              struct pt_regs *regs,
1371              unsigned long stack_size,
1372              int __user *parent_tidptr,
1373              int __user *child_tidptr)
1374{
1417        p = copy_process(clone_flags, stack_start, regs, stack_size,
1418                         child_tidptr, NULL, trace);

do_fork 调用 copy_process。

 973/*
 974 * This creates a new process as a copy of the old one,
 975 * but does not actually start it yet.
 976 *
 977 * It copies the registers, and all the appropriate
 978 * parts of the process environment (as per the clone
 979 * flags). The actual kick-off is left to the caller.
 980 */
 981static struct task_struct *copy_process(unsigned long clone_flags,
 982                                        unsigned long stack_start,
 983                                        struct pt_regs *regs,
 984                                        unsigned long stack_size,
 985                                        int __user *child_tidptr,
 986                                        struct pid *pid,
 987                                        int trace)
 988{
1155        if ((retval = copy_mm(clone_flags, p)))
1156                goto bad_fork_cleanup_signal;

copy_process 调用 copy_mm，下面来分析 copy_mm。

 681static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
 682{
 683        struct mm_struct * mm, *oldmm;
 684        int retval;
 685
 686        tsk->min_flt = tsk->maj_flt = 0;
 687        tsk->nvcsw = tsk->nivcsw = 0;
 688#ifdef CONFIG_DETECT_HUNG_TASK
 689        tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw;
 690#endif
 691
 692        tsk->mm = NULL;
 693        tsk->active_mm = NULL;
 694
 695        /*
 696         * Are we cloning a kernel thread?
 697         *
 698         * We need to steal a active VM for that..
 699         */
 700        oldmm = current->mm;
 701        if (!oldmm)
 702                return 0;
 703
 704        if (clone_flags & CLONE_VM) {
 705                atomic_inc(&oldmm->mm_users);
 706                mm = oldmm;
 707                goto good_mm;
 708        }
 709
 710        retval = -ENOMEM;
 711        mm = dup_mm(tsk);
 712        if (!mm)
 713                goto fail_nomem;
 714
 715good_mm:
 716        /* Initializing for Swap token stuff */
 717        mm->token_priority = 0;
 718        mm->last_interval = 0;
 719
 720        tsk->mm = mm;
 721        tsk->active_mm = mm;
 722        return 0;
 723
 724fail_nomem:
 725        return retval;
 726}

692，693 行，对子进程或者线程的 mm 和 active_mm 初始化（NULL）。

700 - 708 行，就是我们上面说的如果是创建线程，则新线程共享创建进程的 mm，所以不需要进行下面的 copy 操作。

重点就是 711 行的 dup_mm(tsk)。

 621/*
 622 * Allocate a new mm structure and copy contents from the
 623 * mm structure of the passed in task structure.
 624 */
 625struct mm_struct *dup_mm(struct task_struct *tsk)
 626{
 627        struct mm_struct *mm, *oldmm = current->mm;
 628        int err;
 629
 630        if (!oldmm)
 631                return NULL;
 632
 633        mm = allocate_mm();
 634        if (!mm)
 635                goto fail_nomem;
 636
 637        memcpy(mm, oldmm, sizeof(*mm));
 638
 639        /* Initializing for Swap token stuff */
 640        mm->token_priority = 0;
 641        mm->last_interval = 0;
 642
 643        if (!mm_init(mm, tsk))
 644                goto fail_nomem;
 645
 646        if (init_new_context(tsk, mm))
 647                goto fail_nocontext;
 648
 649        dup_mm_exe_file(oldmm, mm);
 650
 651        err = dup_mmap(mm, oldmm);
 652        if (err)
 653                goto free_pt;
 654
 655        mm->hiwater_rss = get_mm_rss(mm);
 656        mm->hiwater_vm = mm->total_vm;
 657
 658        if (mm->binfmt && !try_module_get(mm->binfmt->module))
 659                goto free_pt;
 660
 661        return mm;

633 行，用 slab 分配了 mm_struct 的内存对象。

637 行，对子进程的 mm_struct 进行赋值，使其等于父进程，这样子进程 mm 和父进程 mm 的每一个域的值都相同。

在 copy_mm 的实现中，主要是为了实现 unix COW 的语义，所以理论上我们只需要父子进程 mm 中的 start_x 和 end_x 之类的域（像 start_data、end_data）相等，而对其余的域（像 mm_users）则需要 re-init，这个操作主要在 mm_init 中完成。

 449static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
 450{
 451        atomic_set(&mm->mm_users, 1);
 452        atomic_set(&mm->mm_count, 1);
 453        init_rwsem(&mm->mmap_sem);
 454        INIT_LIST_HEAD(&mm->mmlist);
 455        mm->flags = (current->mm) ?
 456                (current->mm->flags & MMF_INIT_MASK) : default_dump_filter;
 457        mm->core_state = NULL;
 458        mm->nr_ptes = 0;
 459        set_mm_counter(mm, file_rss, 0);
 460        set_mm_counter(mm, anon_rss, 0);
 461        spin_lock_init(&mm->page_table_lock);
 462        mm->free_area_cache = TASK_UNMAPPED_BASE;
 463        mm->cached_hole_size = ~0UL;
 464        mm_init_aio(mm);
 465        mm_init_owner(mm, p);
 466
 467        if (likely(!mm_alloc_pgd(mm))) {
 468                mm->def_flags = 0;
 469                mmu_notifier_mm_init(mm);
 470                return mm;
 471        }
 472
 473        free_mm(mm);
 474        return NULL;
 475}

其中特别要关注的是 467 - 471 行的 mm_alloc_pgd，它为新的 mm 分配页全局目录（pgd），是 page table 拷贝的第一步（页表项本身的拷贝在随后 dup_mm 第 651 行调用的 dup_mmap 中完成）。page table 负责 logical address 到 physical address 的转换。

拷贝的结果就是父子进程有独立的 page table，但是 page table 里面的每个 entries 值都是相同的，也就是说父子进程独立地址空间中相同 logical address 都对应于相同的 physical address，这样也就是实现了父子进程的 COW(copy on write) 语义。

事实上，vfork 和 fork 相比，最大的开销节省就是对 page table 的拷贝。

而在内核 2.6 中，由于 page table 的拷贝，fork 在性能上是有所损耗的，所以内核社区里面讨论过 shared page table 的实现（http://lwn.net/Articles/149888/）。

来源: http://www.bubuko.com/infodetail-2080119.html

与本文相关文章

暂无,快来抢沙发吧！