Docker 基础技术之 Linux namespace 源码分析

日期：2021-09-16 栏目：程序人生浏览：次

上篇我们从进程 clone 的角度，结合代码简单分析了 Linux 提供的 6 种 namespace，本篇从源码上进一步分析 Linux namespace，让你对 Docker namespace 的隔离机制有更深的认识。我用的是 Linux-4.1.19 的版本，由于 namespace 模块更新都比较少，所以，只要 3.0 以上的版本都是差不多的。

从内核进程描述符 task_struct 开始切入

由于 Linux namespace 是用来做进程资源隔离的，所以在进程描述符中，一定有 namespace 所对应的信息，我们可以从这里开始切入代码。

首先找到描述进程信息 task_struct，找到指向 namespace 的结构 struct *nsproxy（sched.h）：

struct task_struct { ...... /* namespaces */ struct nsproxy *nsproxy; ...... }

其中 nsproxy 结构体定义在 nsproxy.h 中：

/* * A structure to contain pointers to all per-process * namespaces - fs (mount), uts, network, sysvipc, etc. * * 'count' is the number of tasks holding a reference. * The count for each namespace, then, will be the number * of nsproxies pointing to it, not the number of tasks. * * The nsproxy is shared by tasks which share all namespaces. * As soon as a single namespace is cloned or unshared, the * nsproxy is copied. */ struct nsproxy { atomic_t count; struct uts_namespace *uts_ns; struct ipc_namespace *ipc_ns; struct mnt_namespace *mnt_ns; struct pid_namespace *pid_ns; struct net *net_ns; }; extern struct nsproxy init_nsproxy;

这个结构是被所有 namespace 所共享的，只要一个 namespace 被 clone 了，nsproxy 也会被 clone。注意到，由于 user namespace 是和其他 namespace 耦合在一起的，所以没出现在上述结构中。

同时，nsproxy.h 中还定义了一些对 namespace 的操作，包括 copy_namespaces 等。

int copy_namespaces(unsigned long flags, struct task_struct *tsk); void exit_task_namespaces(struct task_struct *tsk); void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new); void free_nsproxy(struct nsproxy *ns); int unshare_nsproxy_namespaces(unsigned long, struct nsproxy **, struct fs_struct *);

task_struct，nsproxy，几种 namespace 之间的关系如下所示：

Docker 基础技术之 Linux namespace 源码分析

各个 namespace 的初始化

在各个 namespace 结构定义下都有个 init 函数，nsproxy 也有个 init_nsproxy 函数，init_nsproxy 在 task 初始化的时候会被初始化，附带的，init_nsproxy 中定义了各个 namespace 的 init 函数，如下：

在 init_task 函数中（init_task.h）:

/* * INIT_TASK is used to set up the first task table, touch at * your own risk!. Base=0, limit=0x1fffff (=2MB) */ #define INIT_TASK(tsk) \ { ...... .nsproxy = &init_nsproxy, ...... }

继续跟进 init_nsproxy，在 nsproxy.c 中：

struct nsproxy init_nsproxy = { .count = ATOMIC_INIT(1), .uts_ns = &init_uts_ns, #if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC) .ipc_ns = &init_ipc_ns, #endif .mnt_ns = NULL, .pid_ns_for_children = &init_pid_ns, #ifdef CONFIG_NET .net_ns = &init_net, #endif };

可见，init_nsproxy 中，对 uts, ipc, pid, net 都进行了初始化，但 mount 却没有。

创建新的 namespace

初始化完之后，下面看看如何创建一个新的 namespace，通过前面的文章，我们知道是通过 clone 函数来完成的，在 Linux kernel 中，fork/vfork() 对 clone 进行了封装。如下：

#ifdef __ARCH_WANT_SYS_FORK SYSCALL_DEFINE0(fork) { #ifdef CONFIG_MMU return do_fork(SIGCHLD, 0, 0, NULL, NULL); #else /* can not support in nommu mode */ return -EINVAL; #endif } #endif #ifdef __ARCH_WANT_SYS_VFORK SYSCALL_DEFINE0(vfork) { return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0, 0, NULL, NULL); } #endif #ifdef __ARCH_WANT_SYS_CLONE #ifdef CONFIG_CLONE_BACKWARDS SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, int __user *, parent_tidptr, int, tls_val, int __user *, child_tidptr) #elif defined(CONFIG_CLONE_BACKWARDS2) SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags, int __user *, parent_tidptr, int __user *, child_tidptr, int, tls_val) #elif defined(CONFIG_CLONE_BACKWARDS3) SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp, int, stack_size, int __user *, parent_tidptr, int __user *, child_tidptr, int, tls_val) #else SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, int __user *, parent_tidptr, int __user *, child_tidptr, int, tls_val) #endif { return do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr); } #endif

可以看到，无论是 fork() 还是 vfork()，最终都会调用到 do_fork() 函数：

/* * Ok, this is the main fork-routine. * * It copies the process, and if successful kick-starts * it and waits for it to finish using the VM if required. */ long do_fork(unsigned long clone_flags, unsigned long stack_start, unsigned long stack_size, int __user *parent_tidptr, int __user *child_tidptr) { // 创建进程描述符指针 struct task_struct *p; int trace = 0; long nr; /* * Determine whether and which event to report to ptracer. When * called from kernel_thread or CLONE_UNTRACED is explicitly * requested, no event is reported; otherwise, report if the event * for the type of forking is enabled. */ if (!(clone_flags & CLONE_UNTRACED)) { if (clone_flags & CLONE_VFORK) trace = PTRACE_EVENT_VFORK; else if ((clone_flags & CSIGNAL) != SIGCHLD) trace = PTRACE_EVENT_CLONE; else trace = PTRACE_EVENT_FORK; if (likely(!ptrace_event_enabled(current, trace))) trace = 0; } // 复制进程描述符，返回值是 task_struct p = copy_process(clone_flags, stack_start, stack_size, child_tidptr, NULL, trace); /* * Do this prior waking up the new thread - the thread pointer * might get invalid after that point, if the thread exits quickly. */ if (!IS_ERR(p)) { struct completion vfork; struct pid *pid; trace_sched_process_fork(current, p); // 得到新进程描述符的 pid pid = get_task_pid(p, PIDTYPE_PID); nr = pid_vnr(pid); if (clone_flags & CLONE_PARENT_SETTID) put_user(nr, parent_tidptr); // 调用 vfork() 方法，完成相关的初始化工作 if (clone_flags & CLONE_VFORK) { p->vfork_done = &vfork; init_completion(&vfork); get_task_struct(p); } // 将新进程加入到调度器中，为其分配 CPU，准备执行 wake_up_new_task(p); // fork() 完成，子进程开始运行，并让 ptrace 跟踪 /* forking complete and child started to run, tell ptracer */ if (unlikely(trace)) ptrace_event_pid(trace, pid); // 如果是 vfork()，将父进程加入等待队列，等待子进程完成 if (clone_flags & CLONE_VFORK) { if (!wait_for_vfork_done(p, &vfork)) ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid); } put_pid(pid); } else { nr = PTR_ERR(p); } return nr; }

转载注明出处：https://www.heiqu.com/zysszy.html

Docker 基础技术之 Linux namespace 源码分析

相关推荐