#define _GNU_SOURCE #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* Get all of the CLONE_NEW* flags. */ #include "namespace.h" extern char *escape_json_string(char *str); /* Synchronisation values. */ enum sync_t { SYNC_USERMAP_PLS = 0x40, /* Request parent to map our users. */ SYNC_USERMAP_ACK = 0x41, /* Mapping finished by the parent. */ SYNC_RECVPID_PLS = 0x42, /* Tell parent we're sending the PID. */ SYNC_RECVPID_ACK = 0x43, /* PID was correctly received by parent. */ SYNC_GRANDCHILD = 0x44, /* The grandchild is ready to run. */ SYNC_CHILD_FINISH = 0x45, /* The child or grandchild has finished. */ }; /* * Synchronisation value for cgroup namespace setup. * The same constant is defined in process_linux.go as "createCgroupns". */ #define CREATECGROUPNS 0x80 #define STAGE_SETUP -1 /* longjmp() arguments. */ #define STAGE_PARENT 0 #define STAGE_CHILD 1 #define STAGE_INIT 2 /* Stores the current stage of nsexec. */ int current_stage = STAGE_SETUP; /* Assume the stack grows down, so arguments should be above it. */ struct clone_t { /* * Reserve some space for clone() to locate arguments * and retcode in this place */ char stack[4096] __attribute__((aligned(16))); char stack_ptr[0]; /* There's two children. This is used to execute the different code. */ jmp_buf *env; int jmpval; }; struct nlconfig_t { char *data; /* Process settings. */ uint32_t cloneflags; char *oom_score_adj; size_t oom_score_adj_len; /* User namespace settings. */ char *uidmap; size_t uidmap_len; char *gidmap; size_t gidmap_len; char *namespaces; size_t namespaces_len; uint8_t is_setgroup; /* Rootless container settings. */ uint8_t is_rootless_euid; /* boolean */ char *uidmappath; size_t uidmappath_len; char *gidmappath; size_t gidmappath_len; }; #define PANIC "panic" #define FATAL "fatal" #define ERROR "error" #define WARNING "warning" #define INFO "info" #define DEBUG "debug" static int logfd = -1; /* * List of netlink message types sent to us as part of bootstrapping the init. * These constants are defined in libcontainer/message_linux.go. */ #define INIT_MSG 62000 #define CLONE_FLAGS_ATTR 27281 #define NS_PATHS_ATTR 27282 #define UIDMAP_ATTR 27283 #define GIDMAP_ATTR 27284 #define SETGROUP_ATTR 27285 #define OOM_SCORE_ADJ_ATTR 27286 #define ROOTLESS_EUID_ATTR 27287 #define UIDMAPPATH_ATTR 27288 #define GIDMAPPATH_ATTR 27289 /* * Use the raw syscall for versions of glibc which don't include a function for * it, namely (glibc 2.12). */ #if __GLIBC__ == 2 && __GLIBC_MINOR__ < 14 # define _GNU_SOURCE # include "syscall.h" # if !defined(SYS_setns) && defined(__NR_setns) # define SYS_setns __NR_setns # endif # ifndef SYS_setns # error "setns(2) syscall not supported by glibc version" # endif int setns(int fd, int nstype) { return syscall(SYS_setns, fd, nstype); } #endif static void write_log(const char *level, const char *format, ...) { char *message = NULL, *stage = NULL; va_list args; int ret; if (logfd < 0 || level == NULL) goto out; va_start(args, format); ret = vasprintf(&message, format, args); va_end(args); if (ret < 0) goto out; message = escape_json_string(message); if (current_stage == STAGE_SETUP) stage = strdup("nsexec"); else ret = asprintf(&stage, "nsexec-%d", current_stage); if (ret < 0) goto out; dprintf(logfd, "{\"level\":\"%s\", \"msg\": \"%s[%d]: %s\"}\n", level, stage, getpid(), message); out: free(message); free(stage); } /* XXX: This is ugly. */ static int syncfd = -1; #define bail(fmt, ...) \ do { \ write_log(FATAL, fmt ": %m", ##__VA_ARGS__); \ exit(1); \ } while(0) static int write_file(char *data, size_t data_len, char *pathfmt, ...) { int fd, len, ret = 0; char path[PATH_MAX]; va_list ap; va_start(ap, pathfmt); len = vsnprintf(path, PATH_MAX, pathfmt, ap); va_end(ap); if (len < 0) return -1; fd = open(path, O_RDWR); if (fd < 0) { return -1; } len = write(fd, data, data_len); if (len != data_len) { ret = -1; goto out; } out: close(fd); return ret; } enum policy_t { SETGROUPS_DEFAULT = 0, SETGROUPS_ALLOW, SETGROUPS_DENY, }; /* This *must* be called before we touch gid_map. */ static void update_setgroups(int pid, enum policy_t setgroup) { char *policy; switch (setgroup) { case SETGROUPS_ALLOW: policy = "allow"; break; case SETGROUPS_DENY: policy = "deny"; break; case SETGROUPS_DEFAULT: default: /* Nothing to do. */ return; } if (write_file(policy, strlen(policy), "/proc/%d/setgroups", pid) < 0) { /* * If the kernel is too old to support /proc/pid/setgroups, * open(2) or write(2) will return ENOENT. This is fine. */ if (errno != ENOENT) bail("failed to write '%s' to /proc/%d/setgroups", policy, pid); } } static int try_mapping_tool(const char *app, int pid, char *map, size_t map_len) { int child; /* * If @app is NULL, execve will segfault. Just check it here and bail (if * we're in this path, the caller is already getting desperate and there * isn't a backup to this failing). This usually would be a configuration * or programming issue. */ if (!app) bail("mapping tool not present"); child = fork(); if (child < 0) bail("failed to fork"); if (!child) { #define MAX_ARGV 20 char *argv[MAX_ARGV]; char *envp[] = { NULL }; char pid_fmt[16]; int argc = 0; char *next; snprintf(pid_fmt, 16, "%d", pid); argv[argc++] = (char *)app; argv[argc++] = pid_fmt; /* * Convert the map string into a list of argument that * newuidmap/newgidmap can understand. */ while (argc < MAX_ARGV) { if (*map == '\0') { argv[argc++] = NULL; break; } argv[argc++] = map; next = strpbrk(map, "\n "); if (next == NULL) break; *next++ = '\0'; map = next + strspn(next, "\n "); } execve(app, argv, envp); bail("failed to execv"); } else { int status; while (true) { if (waitpid(child, &status, 0) < 0) { if (errno == EINTR) continue; bail("failed to waitpid"); } if (WIFEXITED(status) || WIFSIGNALED(status)) return WEXITSTATUS(status); } } return -1; } static void update_uidmap(const char *path, int pid, char *map, size_t map_len) { if (map == NULL || map_len <= 0) return; write_log(DEBUG, "update /proc/%d/uid_map to '%s'", pid, map); if (write_file(map, map_len, "/proc/%d/uid_map", pid) < 0) { if (errno != EPERM) bail("failed to update /proc/%d/uid_map", pid); write_log(DEBUG, "update /proc/%d/uid_map got -EPERM (trying %s)", pid, path); if (try_mapping_tool(path, pid, map, map_len)) bail("failed to use newuid map on %d", pid); } } static void update_gidmap(const char *path, int pid, char *map, size_t map_len) { if (map == NULL || map_len <= 0) return; write_log(DEBUG, "update /proc/%d/gid_map to '%s'", pid, map); if (write_file(map, map_len, "/proc/%d/gid_map", pid) < 0) { if (errno != EPERM) bail("failed to update /proc/%d/gid_map", pid); write_log(DEBUG, "update /proc/%d/gid_map got -EPERM (trying %s)", pid, path); if (try_mapping_tool(path, pid, map, map_len)) bail("failed to use newgid map on %d", pid); } } static void update_oom_score_adj(char *data, size_t len) { if (data == NULL || len <= 0) return; write_log(DEBUG, "update /proc/self/oom_score_adj to '%s'", data); if (write_file(data, len, "/proc/self/oom_score_adj") < 0) bail("failed to update /proc/self/oom_score_adj"); } /* A dummy function that just jumps to the given jumpval. */ static int child_func(void *arg) __attribute__((noinline)); static int child_func(void *arg) { struct clone_t *ca = (struct clone_t *)arg; longjmp(*ca->env, ca->jmpval); } static int clone_parent(jmp_buf *env, int jmpval) __attribute__((noinline)); static int clone_parent(jmp_buf *env, int jmpval) { struct clone_t ca = { .env = env, .jmpval = jmpval, }; return clone(child_func, ca.stack_ptr, CLONE_PARENT | SIGCHLD, &ca); } /* * Gets the init pipe fd from the environment, which is used to read the * bootstrap data and tell the parent what the new pid is after we finish * setting up the environment. */ static int initpipe(void) { int pipenum; char *initpipe, *endptr; initpipe = getenv("_LIBCONTAINER_INITPIPE"); if (initpipe == NULL || *initpipe == '\0') return -1; pipenum = strtol(initpipe, &endptr, 10); if (*endptr != '\0') bail("unable to parse _LIBCONTAINER_INITPIPE"); return pipenum; } static void setup_logpipe(void) { char *logpipe, *endptr; logpipe = getenv("_LIBCONTAINER_LOGPIPE"); if (logpipe == NULL || *logpipe == '\0') { return; } logfd = strtol(logpipe, &endptr, 10); if (logpipe == endptr || *endptr != '\0') { fprintf(stderr, "unable to parse _LIBCONTAINER_LOGPIPE, value: %s\n", logpipe); /* It is too early to use bail */ exit(1); } } /* Returns the clone(2) flag for a namespace, given the name of a namespace. */ static int nsflag(char *name) { if (!strcmp(name, "cgroup")) return CLONE_NEWCGROUP; else if (!strcmp(name, "ipc")) return CLONE_NEWIPC; else if (!strcmp(name, "mnt")) return CLONE_NEWNS; else if (!strcmp(name, "net")) return CLONE_NEWNET; else if (!strcmp(name, "pid")) return CLONE_NEWPID; else if (!strcmp(name, "user")) return CLONE_NEWUSER; else if (!strcmp(name, "uts")) return CLONE_NEWUTS; /* If we don't recognise a name, fallback to 0. */ return 0; } static uint32_t readint32(char *buf) { return *(uint32_t *) buf; } static uint8_t readint8(char *buf) { return *(uint8_t *) buf; } static void nl_parse(int fd, struct nlconfig_t *config) { size_t len, size; struct nlmsghdr hdr; char *data, *current; /* Retrieve the netlink header. */ len = read(fd, &hdr, NLMSG_HDRLEN); if (len != NLMSG_HDRLEN) bail("invalid netlink header length %zu", len); if (hdr.nlmsg_type == NLMSG_ERROR) bail("failed to read netlink message"); if (hdr.nlmsg_type != INIT_MSG) bail("unexpected msg type %d", hdr.nlmsg_type); /* Retrieve data. */ size = NLMSG_PAYLOAD(&hdr, 0); current = data = malloc(size); if (!data) bail("failed to allocate %zu bytes of memory for nl_payload", size); len = read(fd, data, size); if (len != size) bail("failed to read netlink payload, %zu != %zu", len, size); /* Parse the netlink payload. */ config->data = data; while (current < data + size) { struct nlattr *nlattr = (struct nlattr *)current; size_t payload_len = nlattr->nla_len - NLA_HDRLEN; /* Advance to payload. */ current += NLA_HDRLEN; /* Handle payload. */ switch (nlattr->nla_type) { case CLONE_FLAGS_ATTR: config->cloneflags = readint32(current); break; case ROOTLESS_EUID_ATTR: config->is_rootless_euid = readint8(current); /* boolean */ break; case OOM_SCORE_ADJ_ATTR: config->oom_score_adj = current; config->oom_score_adj_len = payload_len; break; case NS_PATHS_ATTR: config->namespaces = current; config->namespaces_len = payload_len; break; case UIDMAP_ATTR: config->uidmap = current; config->uidmap_len = payload_len; break; case GIDMAP_ATTR: config->gidmap = current; config->gidmap_len = payload_len; break; case UIDMAPPATH_ATTR: config->uidmappath = current; config->uidmappath_len = payload_len; break; case GIDMAPPATH_ATTR: config->gidmappath = current; config->gidmappath_len = payload_len; break; case SETGROUP_ATTR: config->is_setgroup = readint8(current); break; default: bail("unknown netlink message type %d", nlattr->nla_type); } current += NLA_ALIGN(payload_len); } } void nl_free(struct nlconfig_t *config) { free(config->data); } void join_namespaces(char *nslist) { int num = 0, i; char *saveptr = NULL; char *namespace = strtok_r(nslist, ",", &saveptr); struct namespace_t { int fd; char type[PATH_MAX]; char path[PATH_MAX]; } *namespaces = NULL; if (!namespace || !strlen(namespace) || !strlen(nslist)) bail("ns paths are empty"); /* * We have to open the file descriptors first, since after * we join the mnt namespace we might no longer be able to * access the paths. */ do { int fd; char *path; struct namespace_t *ns; /* Resize the namespace array. */ namespaces = realloc(namespaces, ++num * sizeof(struct namespace_t)); if (!namespaces) bail("failed to reallocate namespace array"); ns = &namespaces[num - 1]; /* Split 'ns:path'. */ path = strstr(namespace, ":"); if (!path) bail("failed to parse %s", namespace); *path++ = '\0'; fd = open(path, O_RDONLY); if (fd < 0) bail("failed to open %s", path); ns->fd = fd; strncpy(ns->type, namespace, PATH_MAX - 1); strncpy(ns->path, path, PATH_MAX - 1); ns->path[PATH_MAX - 1] = '\0'; } while ((namespace = strtok_r(NULL, ",", &saveptr)) != NULL); /* * The ordering in which we join namespaces is important. We should * always join the user namespace *first*. This is all guaranteed * from the container_linux.go side of this, so we're just going to * follow the order given to us. */ for (i = 0; i < num; i++) { struct namespace_t *ns = &namespaces[i]; int flag = nsflag(ns->type); write_log(DEBUG, "setns(%#x) into %s namespace (with path %s)", flag, ns->type, ns->path); if (setns(ns->fd, flag) < 0) bail("failed to setns into %s namespace", ns->type); close(ns->fd); } free(namespaces); } /* Defined in cloned_binary.c. */ extern int ensure_cloned_binary(void); static inline int sane_kill(pid_t pid, int signum) { if (pid > 0) return kill(pid, signum); else return 0; } void nsexec(void) { int pipenum; jmp_buf env; int sync_child_pipe[2], sync_grandchild_pipe[2]; struct nlconfig_t config = { 0 }; /* * Setup a pipe to send logs to the parent. This should happen * first, because bail will use that pipe. */ setup_logpipe(); /* * If we don't have an init pipe, just return to the go routine. * We'll only get an init pipe for start or exec. */ pipenum = initpipe(); if (pipenum == -1) return; /* * We need to re-exec if we are not in a cloned binary. This is necessary * to ensure that containers won't be able to access the host binary * through /proc/self/exe. See CVE-2019-5736. */ if (ensure_cloned_binary() < 0) bail("could not ensure we are a cloned binary"); /* * Inform the parent we're past initial setup. * For the other side of this, see initWaiter. */ if (write(pipenum, "", 1) != 1) bail("could not inform the parent we are past initial setup"); write_log(DEBUG, "=> nsexec container setup"); /* Parse all of the netlink configuration. */ nl_parse(pipenum, &config); /* Set oom_score_adj. This has to be done before !dumpable because * /proc/self/oom_score_adj is not writeable unless you're an privileged * user (if !dumpable is set). All children inherit their parent's * oom_score_adj value on fork(2) so this will always be propagated * properly. */ update_oom_score_adj(config.oom_score_adj, config.oom_score_adj_len); /* * Make the process non-dumpable, to avoid various race conditions that * could cause processes in namespaces we're joining to access host * resources (or potentially execute code). * * However, if the number of namespaces we are joining is 0, we are not * going to be switching to a different security context. Thus setting * ourselves to be non-dumpable only breaks things (like rootless * containers), which is the recommendation from the kernel folks. */ if (config.namespaces) { write_log(DEBUG, "set process as non-dumpable"); if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0) bail("failed to set process as non-dumpable"); } /* Pipe so we can tell the child when we've finished setting up. */ if (socketpair(AF_LOCAL, SOCK_STREAM, 0, sync_child_pipe) < 0) bail("failed to setup sync pipe between parent and child"); /* * We need a new socketpair to sync with grandchild so we don't have * race condition with child. */ if (socketpair(AF_LOCAL, SOCK_STREAM, 0, sync_grandchild_pipe) < 0) bail("failed to setup sync pipe between parent and grandchild"); /* TODO: Currently we aren't dealing with child deaths properly. */ /* * Okay, so this is quite annoying. * * In order for this unsharing code to be more extensible we need to split * up unshare(CLONE_NEWUSER) and clone() in various ways. The ideal case * would be if we did clone(CLONE_NEWUSER) and the other namespaces * separately, but because of SELinux issues we cannot really do that. But * we cannot just dump the namespace flags into clone(...) because several * usecases (such as rootless containers) require more granularity around * the namespace setup. In addition, some older kernels had issues where * CLONE_NEWUSER wasn't handled before other namespaces (but we cannot * handle this while also dealing with SELinux so we choose SELinux support * over broken kernel support). * * However, if we unshare(2) the user namespace *before* we clone(2), then * all hell breaks loose. * * The parent no longer has permissions to do many things (unshare(2) drops * all capabilities in your old namespace), and the container cannot be set * up to have more than one {uid,gid} mapping. This is obviously less than * ideal. In order to fix this, we have to first clone(2) and then unshare. * * Unfortunately, it's not as simple as that. We have to fork to enter the * PID namespace (the PID namespace only applies to children). Since we'll * have to double-fork, this clone_parent() call won't be able to get the * PID of the _actual_ init process (without doing more synchronisation than * I can deal with at the moment). So we'll just get the parent to send it * for us, the only job of this process is to update * /proc/pid/{setgroups,uid_map,gid_map}. * * And as a result of the above, we also need to setns(2) in the first child * because if we join a PID namespace in the topmost parent then our child * will be in that namespace (and it will not be able to give us a PID value * that makes sense without resorting to sending things with cmsg). * * This also deals with an older issue caused by dumping cloneflags into * clone(2): On old kernels, CLONE_PARENT didn't work with CLONE_NEWPID, so * we have to unshare(2) before clone(2) in order to do this. This was fixed * in upstream commit 1f7f4dde5c945f41a7abc2285be43d918029ecc5, and was * introduced by 40a0d32d1eaffe6aac7324ca92604b6b3977eb0e. As far as we're * aware, the last mainline kernel which had this bug was Linux 3.12. * However, we cannot comment on which kernels the broken patch was * backported to. * * -- Aleksa "what has my life come to?" Sarai */ current_stage = setjmp(env); switch (current_stage) { /* * Stage 0: We're in the parent. Our job is just to create a new child * (stage 1: STAGE_CHILD) process and write its uid_map and * gid_map. That process will go on to create a new process, then * it will send us its PID which we will send to the bootstrap * process. */ case STAGE_PARENT:{ int len; pid_t stage1_pid = -1, stage2_pid = -1; bool stage1_complete, stage2_complete; /* For debugging. */ prctl(PR_SET_NAME, (unsigned long)"runc:[0:PARENT]", 0, 0, 0); write_log(DEBUG, "~> nsexec stage-0"); /* Start the process of getting a container. */ write_log(DEBUG, "spawn stage-1"); stage1_pid = clone_parent(&env, STAGE_CHILD); if (stage1_pid < 0) bail("unable to spawn stage-1"); syncfd = sync_child_pipe[1]; close(sync_child_pipe[0]); /* * State machine for synchronisation with the children. We only * return once both the child and grandchild are ready. */ write_log(DEBUG, "-> stage-1 synchronisation loop"); stage1_complete = false; while (!stage1_complete) { enum sync_t s; if (read(syncfd, &s, sizeof(s)) != sizeof(s)) bail("failed to sync with stage-1: next state"); switch (s) { case SYNC_USERMAP_PLS: write_log(DEBUG, "stage-1 requested userns mappings"); /* * Enable setgroups(2) if we've been asked to. But we also * have to explicitly disable setgroups(2) if we're * creating a rootless container for single-entry mapping. * i.e. config.is_setgroup == false. * (this is required since Linux 3.19). * * For rootless multi-entry mapping, config.is_setgroup shall be true and * newuidmap/newgidmap shall be used. */ if (config.is_rootless_euid && !config.is_setgroup) update_setgroups(stage1_pid, SETGROUPS_DENY); /* Set up mappings. */ update_uidmap(config.uidmappath, stage1_pid, config.uidmap, config.uidmap_len); update_gidmap(config.gidmappath, stage1_pid, config.gidmap, config.gidmap_len); s = SYNC_USERMAP_ACK; if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { sane_kill(stage1_pid, SIGKILL); sane_kill(stage2_pid, SIGKILL); bail("failed to sync with stage-1: write(SYNC_USERMAP_ACK)"); } break; case SYNC_RECVPID_PLS: write_log(DEBUG, "stage-1 requested pid to be forwarded"); /* Get the stage-2 pid. */ if (read(syncfd, &stage2_pid, sizeof(stage2_pid)) != sizeof(stage2_pid)) { sane_kill(stage1_pid, SIGKILL); sane_kill(stage2_pid, SIGKILL); bail("failed to sync with stage-1: read(stage2_pid)"); } /* Send ACK. */ s = SYNC_RECVPID_ACK; if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { sane_kill(stage1_pid, SIGKILL); sane_kill(stage2_pid, SIGKILL); bail("failed to sync with stage-1: write(SYNC_RECVPID_ACK)"); } /* * Send both the stage-1 and stage-2 pids back to runc. * runc needs the stage-2 to continue process management, * but because stage-1 was spawned with CLONE_PARENT we * cannot reap it within stage-0 and thus we need to ask * runc to reap the zombie for us. */ write_log(DEBUG, "forward stage-1 (%d) and stage-2 (%d) pids to runc", stage1_pid, stage2_pid); len = dprintf(pipenum, "{\"stage1_pid\":%d,\"stage2_pid\":%d}\n", stage1_pid, stage2_pid); if (len < 0) { sane_kill(stage1_pid, SIGKILL); sane_kill(stage2_pid, SIGKILL); bail("failed to sync with runc: write(pid-JSON)"); } break; case SYNC_CHILD_FINISH: write_log(DEBUG, "stage-1 complete"); stage1_complete = true; break; default: bail("unexpected sync value: %u", s); } } write_log(DEBUG, "<- stage-1 synchronisation loop"); /* Now sync with grandchild. */ syncfd = sync_grandchild_pipe[1]; close(sync_grandchild_pipe[0]); write_log(DEBUG, "-> stage-2 synchronisation loop"); stage2_complete = false; while (!stage2_complete) { enum sync_t s; write_log(DEBUG, "signalling stage-2 to run"); s = SYNC_GRANDCHILD; if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { sane_kill(stage2_pid, SIGKILL); bail("failed to sync with child: write(SYNC_GRANDCHILD)"); } if (read(syncfd, &s, sizeof(s)) != sizeof(s)) bail("failed to sync with child: next state"); switch (s) { case SYNC_CHILD_FINISH: write_log(DEBUG, "stage-2 complete"); stage2_complete = true; break; default: bail("unexpected sync value: %u", s); } } write_log(DEBUG, "<- stage-2 synchronisation loop"); write_log(DEBUG, "<~ nsexec stage-0"); exit(0); } break; /* * Stage 1: We're in the first child process. Our job is to join any * provided namespaces in the netlink payload and unshare all of * the requested namespaces. If we've been asked to CLONE_NEWUSER, * we will ask our parent (stage 0) to set up our user mappings * for us. Then, we create a new child (stage 2: STAGE_INIT) for * PID namespace. We then send the child's PID to our parent * (stage 0). */ case STAGE_CHILD:{ pid_t stage2_pid = -1; enum sync_t s; /* We're in a child and thus need to tell the parent if we die. */ syncfd = sync_child_pipe[0]; close(sync_child_pipe[1]); /* For debugging. */ prctl(PR_SET_NAME, (unsigned long)"runc:[1:CHILD]", 0, 0, 0); write_log(DEBUG, "~> nsexec stage-1"); /* * We need to setns first. We cannot do this earlier (in stage 0) * because of the fact that we forked to get here (the PID of * [stage 2: STAGE_INIT]) would be meaningless). We could send it * using cmsg(3) but that's just annoying. */ if (config.namespaces) join_namespaces(config.namespaces); /* * Deal with user namespaces first. They are quite special, as they * affect our ability to unshare other namespaces and are used as * context for privilege checks. * * We don't unshare all namespaces in one go. The reason for this * is that, while the kernel documentation may claim otherwise, * there are certain cases where unsharing all namespaces at once * will result in namespace objects being owned incorrectly. * Ideally we should just fix these kernel bugs, but it's better to * be safe than sorry, and fix them separately. * * A specific case of this is that the SELinux label of the * internal kern-mount that mqueue uses will be incorrect if the * UTS namespace is cloned before the USER namespace is mapped. * I've also heard of similar problems with the network namespace * in some scenarios. This also mirrors how LXC deals with this * problem. */ if (config.cloneflags & CLONE_NEWUSER) { write_log(DEBUG, "unshare user namespace"); if (unshare(CLONE_NEWUSER) < 0) bail("failed to unshare user namespace"); config.cloneflags &= ~CLONE_NEWUSER; /* * We need to set ourselves as dumpable temporarily so that the * parent process can write to our procfs files. */ if (config.namespaces) { write_log(DEBUG, "temporarily set process as dumpable"); if (prctl(PR_SET_DUMPABLE, 1, 0, 0, 0) < 0) bail("failed to temporarily set process as dumpable"); } /* * We don't have the privileges to do any mapping here (see the * clone_parent rant). So signal stage-0 to do the mapping for * us. */ write_log(DEBUG, "request stage-0 to map user namespace"); s = SYNC_USERMAP_PLS; if (write(syncfd, &s, sizeof(s)) != sizeof(s)) bail("failed to sync with parent: write(SYNC_USERMAP_PLS)"); /* ... wait for mapping ... */ write_log(DEBUG, "request stage-0 to map user namespace"); if (read(syncfd, &s, sizeof(s)) != sizeof(s)) bail("failed to sync with parent: read(SYNC_USERMAP_ACK)"); if (s != SYNC_USERMAP_ACK) bail("failed to sync with parent: SYNC_USERMAP_ACK: got %u", s); /* Revert temporary re-dumpable setting. */ if (config.namespaces) { write_log(DEBUG, "re-set process as non-dumpable"); if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0) bail("failed to re-set process as non-dumpable"); } /* Become root in the namespace proper. */ if (setresuid(0, 0, 0) < 0) bail("failed to become root in user namespace"); } /* * Unshare all of the namespaces. Now, it should be noted that this * ordering might break in the future (especially with rootless * containers). But for now, it's not possible to split this into * CLONE_NEWUSER + [the rest] because of some RHEL SELinux issues. * * Note that we don't merge this with clone() because there were * some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID) * was broken, so we'll just do it the long way anyway. */ write_log(DEBUG, "unshare remaining namespace (except cgroupns)"); if (unshare(config.cloneflags & ~CLONE_NEWCGROUP) < 0) bail("failed to unshare remaining namespaces (except cgroupns)"); /* * TODO: What about non-namespace clone flags that we're dropping here? * * We fork again because of PID namespace, setns(2) or unshare(2) don't * change the PID namespace of the calling process, because doing so * would change the caller's idea of its own PID (as reported by getpid()), * which would break many applications and libraries, so we must fork * to actually enter the new PID namespace. */ write_log(DEBUG, "spawn stage-2"); stage2_pid = clone_parent(&env, STAGE_INIT); if (stage2_pid < 0) bail("unable to spawn stage-2"); /* Send the child to our parent, which knows what it's doing. */ write_log(DEBUG, "request stage-0 to forward stage-2 pid (%d)", stage2_pid); s = SYNC_RECVPID_PLS; if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { sane_kill(stage2_pid, SIGKILL); bail("failed to sync with parent: write(SYNC_RECVPID_PLS)"); } if (write(syncfd, &stage2_pid, sizeof(stage2_pid)) != sizeof(stage2_pid)) { sane_kill(stage2_pid, SIGKILL); bail("failed to sync with parent: write(stage2_pid)"); } /* ... wait for parent to get the pid ... */ if (read(syncfd, &s, sizeof(s)) != sizeof(s)) { sane_kill(stage2_pid, SIGKILL); bail("failed to sync with parent: read(SYNC_RECVPID_ACK)"); } if (s != SYNC_RECVPID_ACK) { sane_kill(stage2_pid, SIGKILL); bail("failed to sync with parent: SYNC_RECVPID_ACK: got %u", s); } write_log(DEBUG, "signal completion to stage-0"); s = SYNC_CHILD_FINISH; if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { sane_kill(stage2_pid, SIGKILL); bail("failed to sync with parent: write(SYNC_CHILD_FINISH)"); } /* Our work is done. [Stage 2: STAGE_INIT] is doing the rest of the work. */ write_log(DEBUG, "<~ nsexec stage-1"); exit(0); } break; /* * Stage 2: We're the final child process, and the only process that will * actually return to the Go runtime. Our job is to just do the * final cleanup steps and then return to the Go runtime to allow * init_linux.go to run. */ case STAGE_INIT:{ /* * We're inside the child now, having jumped from the * start_child() code after forking in the parent. */ enum sync_t s; /* We're in a child and thus need to tell the parent if we die. */ syncfd = sync_grandchild_pipe[0]; close(sync_grandchild_pipe[1]); close(sync_child_pipe[0]); close(sync_child_pipe[1]); /* For debugging. */ prctl(PR_SET_NAME, (unsigned long)"runc:[2:INIT]", 0, 0, 0); write_log(DEBUG, "~> nsexec stage-2"); if (read(syncfd, &s, sizeof(s)) != sizeof(s)) bail("failed to sync with parent: read(SYNC_GRANDCHILD)"); if (s != SYNC_GRANDCHILD) bail("failed to sync with parent: SYNC_GRANDCHILD: got %u", s); if (setsid() < 0) bail("setsid failed"); if (setuid(0) < 0) bail("setuid failed"); if (setgid(0) < 0) bail("setgid failed"); if (!config.is_rootless_euid && config.is_setgroup) { if (setgroups(0, NULL) < 0) bail("setgroups failed"); } /* * Wait until our topmost parent has finished cgroup setup in * p.manager.Apply(). * * TODO(cyphar): Check if this code is actually needed because we * should be in the cgroup even from stage-0, so * waiting until now might not make sense. */ if (config.cloneflags & CLONE_NEWCGROUP) { uint8_t value; if (read(pipenum, &value, sizeof(value)) != sizeof(value)) bail("read synchronisation value failed"); if (value == CREATECGROUPNS) { write_log(DEBUG, "unshare cgroup namespace"); if (unshare(CLONE_NEWCGROUP) < 0) bail("failed to unshare cgroup namespace"); } else bail("received unknown synchronisation value"); } write_log(DEBUG, "signal completion to stage-0"); s = SYNC_CHILD_FINISH; if (write(syncfd, &s, sizeof(s)) != sizeof(s)) bail("failed to sync with patent: write(SYNC_CHILD_FINISH)"); /* Close sync pipes. */ close(sync_grandchild_pipe[0]); /* Free netlink data. */ nl_free(&config); /* Finish executing, let the Go runtime take over. */ write_log(DEBUG, "<= nsexec container setup"); write_log(DEBUG, "booting up go runtime ..."); return; } break; default: bail("unknown stage '%d' for jump value", current_stage); } /* Should never be reached. */ bail("should never be reached"); }