vendor: github.com/opencontainers/runc v1.0.0-rc95

Signed-off-by: Sebastiaan van Stijn <github@gone.nl>
This commit is contained in:
Sebastiaan van Stijn 2021-06-21 12:54:15 +02:00
parent 579279ce09
commit 79a9fd61fd
No known key found for this signature in database
GPG Key ID: 76698F39D527CE8C
14 changed files with 662 additions and 402 deletions

View File

@ -55,7 +55,7 @@ github.com/modern-go/reflect2 94122c33edd36123c84d5368cfb2
github.com/morikuni/aec 39771216ff4c63d11f5e604076f9c45e8be1067b # v1.0.0 github.com/morikuni/aec 39771216ff4c63d11f5e604076f9c45e8be1067b # v1.0.0
github.com/opencontainers/go-digest ea51bea511f75cfa3ef6098cc253c5c3609b037a # v1.0.0 github.com/opencontainers/go-digest ea51bea511f75cfa3ef6098cc253c5c3609b037a # v1.0.0
github.com/opencontainers/image-spec d60099175f88c47cd379c4738d158884749ed235 # v1.0.1 github.com/opencontainers/image-spec d60099175f88c47cd379c4738d158884749ed235 # v1.0.1
github.com/opencontainers/runc ff819c7e9184c13b7c2607fe6c30ae19403a7aff # v1.0.0-rc92 github.com/opencontainers/runc b9ee9c6314599f1b4a7f497e1f1f856fe433d3b7 # v1.0.0-rc95
github.com/opentracing/opentracing-go d34af3eaa63c4d08ab54863a4bdd0daa45212e12 # v1.2.0 github.com/opentracing/opentracing-go d34af3eaa63c4d08ab54863a4bdd0daa45212e12 # v1.2.0
github.com/pkg/errors 614d223910a179a466c1767a985424175c39b465 # v0.9.1 github.com/pkg/errors 614d223910a179a466c1767a985424175c39b465 # v0.9.1
github.com/prometheus/client_golang 6edbbd9e560190e318cdc5b4d3e630b442858380 # v1.6.0 github.com/prometheus/client_golang 6edbbd9e560190e318cdc5b4d3e630b442858380 # v1.6.0

View File

@ -1,9 +1,10 @@
# runc # runc
[![Build Status](https://travis-ci.org/opencontainers/runc.svg?branch=master)](https://travis-ci.org/opencontainers/runc)
[![Go Report Card](https://goreportcard.com/badge/github.com/opencontainers/runc)](https://goreportcard.com/report/github.com/opencontainers/runc) [![Go Report Card](https://goreportcard.com/badge/github.com/opencontainers/runc)](https://goreportcard.com/report/github.com/opencontainers/runc)
[![GoDoc](https://godoc.org/github.com/opencontainers/runc?status.svg)](https://godoc.org/github.com/opencontainers/runc) [![GoDoc](https://godoc.org/github.com/opencontainers/runc?status.svg)](https://godoc.org/github.com/opencontainers/runc)
[![CII Best Practices](https://bestpractices.coreinfrastructure.org/projects/588/badge)](https://bestpractices.coreinfrastructure.org/projects/588) [![CII Best Practices](https://bestpractices.coreinfrastructure.org/projects/588/badge)](https://bestpractices.coreinfrastructure.org/projects/588)
[![gha/validate](https://github.com/opencontainers/runc/workflows/validate/badge.svg)](https://github.com/opencontainers/runc/actions?query=workflow%3Avalidate)
[![gha/ci](https://github.com/opencontainers/runc/workflows/ci/badge.svg)](https://github.com/opencontainers/runc/actions?query=workflow%3Aci)
## Introduction ## Introduction
@ -17,10 +18,6 @@ This means that `runc` 1.0.0 should implement the 1.0 version of the specificati
You can find official releases of `runc` on the [release](https://github.com/opencontainers/runc/releases) page. You can find official releases of `runc` on the [release](https://github.com/opencontainers/runc/releases) page.
Currently, the following features are not considered to be production-ready:
* [Support for cgroup v2](./docs/cgroup-v2.md)
## Security ## Security
The reporting process and disclosure communications are outlined [here](https://github.com/opencontainers/org/blob/master/SECURITY.md). The reporting process and disclosure communications are outlined [here](https://github.com/opencontainers/org/blob/master/SECURITY.md).
@ -64,19 +61,20 @@ sudo make install
with some of them enabled by default (see `BUILDTAGS` in top-level `Makefile`). with some of them enabled by default (see `BUILDTAGS` in top-level `Makefile`).
To change build tags from the default, set the `BUILDTAGS` variable for make, To change build tags from the default, set the `BUILDTAGS` variable for make,
e.g. e.g. to disable seccomp:
```bash ```bash
make BUILDTAGS='seccomp apparmor' make BUILDTAGS=""
``` ```
| Build Tag | Feature | Enabled by default | Dependency | | Build Tag | Feature | Enabled by default | Dependency |
|-----------|------------------------------------|--------------------|------------| |-----------|------------------------------------|--------------------|------------|
| seccomp | Syscall filtering | yes | libseccomp | | seccomp | Syscall filtering | yes | libseccomp |
| selinux | selinux process and mount labeling | yes | <none> |
| apparmor | apparmor profile support | yes | <none> |
| nokmem | disable kernel memory accounting | no | <none> |
The following build tags were used earlier, but are now obsoleted:
- **nokmem** (since runc v1.0.0-rc94 kernel memory settings are ignored)
- **apparmor** (since runc v1.0.0-rc93 the feature is always enabled)
- **selinux** (since runc v1.0.0-rc93 the feature is always enabled)
### Running the test suite ### Running the test suite
@ -128,6 +126,14 @@ make verify-dependencies
## Using runc ## Using runc
Please note that runc is a low-level tool not designed with an end user
in mind. It is mostly employed by other higher-level container software.
Therefore, unless there is some specific use case that prevents the use
of tools like Docker or Podman, it is not recommended to use runc directly.
If you still want to use runc, here's how.
### Creating an OCI Bundle ### Creating an OCI Bundle
In order to use runc you must have your container in the format of an OCI bundle. In order to use runc you must have your container in the format of an OCI bundle.
@ -169,7 +175,9 @@ If you used the unmodified `runc spec` template this should give you a `sh` sess
The second way to start a container is using the specs lifecycle operations. The second way to start a container is using the specs lifecycle operations.
This gives you more power over how the container is created and managed while it is running. This gives you more power over how the container is created and managed while it is running.
This will also launch the container in the background so you will have to edit the `config.json` to remove the `terminal` setting for the simple examples here. This will also launch the container in the background so you will have to edit
the `config.json` to remove the `terminal` setting for the simple examples
below (see more details about [runc terminal handling](docs/terminals.md)).
Your process field in the `config.json` should look like this below with `"terminal": false` and `"args": ["sleep", "5"]`. Your process field in the `config.json` should look like this below with `"terminal": false` and `"args": ["sleep", "5"]`.
@ -292,8 +300,12 @@ PIDFile=/run/mycontainerid.pid
WantedBy=multi-user.target WantedBy=multi-user.target
``` ```
#### cgroup v2 ## More documentation
See [`./docs/cgroup-v2.md`](./docs/cgroup-v2.md).
* [cgroup v2](./docs/cgroup-v2.md)
* [Checkpoint and restore](./docs/checkpoint-restore.md)
* [systemd cgroup driver](./docs/systemd.md)
* [Terminals and standard IO](./docs/terminals.md)
## License ## License

View File

@ -1,26 +1,28 @@
module github.com/opencontainers/runc module github.com/opencontainers/runc
go 1.14 go 1.13
require ( require (
github.com/checkpoint-restore/go-criu/v4 v4.1.0 github.com/checkpoint-restore/go-criu/v5 v5.0.0
github.com/cilium/ebpf v0.0.0-20200702112145-1c8d4c9ef775 github.com/cilium/ebpf v0.5.0
github.com/containerd/console v1.0.0 github.com/containerd/console v1.0.2
github.com/coreos/go-systemd/v22 v22.1.0 github.com/coreos/go-systemd/v22 v22.3.1
github.com/cyphar/filepath-securejoin v0.2.2 github.com/cyphar/filepath-securejoin v0.2.2
github.com/docker/go-units v0.4.0 github.com/docker/go-units v0.4.0
github.com/godbus/dbus/v5 v5.0.3 github.com/godbus/dbus/v5 v5.0.4
github.com/golang/protobuf v1.4.2 github.com/moby/sys/mountinfo v0.4.1
github.com/moby/sys/mountinfo v0.1.3 github.com/mrunalp/fileutils v0.5.0
github.com/mrunalp/fileutils v0.0.0-20200520151820-abd8a0e76976 github.com/opencontainers/runtime-spec v1.0.3-0.20210326190908-1c3f411f0417
github.com/opencontainers/runtime-spec v1.0.3-0.20200728170252-4d89ac9fbff6 github.com/opencontainers/selinux v1.8.0
github.com/opencontainers/selinux v1.6.0
github.com/pkg/errors v0.9.1 github.com/pkg/errors v0.9.1
github.com/seccomp/libseccomp-golang v0.9.1 github.com/seccomp/libseccomp-golang v0.9.1
github.com/sirupsen/logrus v1.6.0 github.com/sirupsen/logrus v1.7.0
github.com/syndtr/gocapability v0.0.0-20180916011248-d98352740cb2 github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635
// NOTE: urfave/cli must be <= v1.22.1 due to a regression: https://github.com/urfave/cli/issues/1092 // NOTE: urfave/cli must be <= v1.22.1 due to a regression: https://github.com/urfave/cli/issues/1092
github.com/urfave/cli v1.22.1 github.com/urfave/cli v1.22.1
github.com/vishvananda/netlink v1.1.0 github.com/vishvananda/netlink v1.1.0
golang.org/x/sys v0.0.0-20200728102440-3e129f6d46b1 github.com/willf/bitset v1.1.11
golang.org/x/net v0.0.0-20201224014010-6772e930b67b
golang.org/x/sys v0.0.0-20210426230700-d19ff857e887
google.golang.org/protobuf v1.25.0
) )

View File

@ -57,90 +57,94 @@ struct describing how the container is to be created. A sample would look simila
```go ```go
defaultMountFlags := unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV defaultMountFlags := unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV
var devices []*configs.DeviceRule
for _, device := range specconv.AllowedDevices {
devices = append(devices, &device.Rule)
}
config := &configs.Config{ config := &configs.Config{
Rootfs: "/your/path/to/rootfs", Rootfs: "/your/path/to/rootfs",
Capabilities: &configs.Capabilities{ Capabilities: &configs.Capabilities{
Bounding: []string{ Bounding: []string{
"CAP_CHOWN", "CAP_CHOWN",
"CAP_DAC_OVERRIDE", "CAP_DAC_OVERRIDE",
"CAP_FSETID", "CAP_FSETID",
"CAP_FOWNER", "CAP_FOWNER",
"CAP_MKNOD", "CAP_MKNOD",
"CAP_NET_RAW", "CAP_NET_RAW",
"CAP_SETGID", "CAP_SETGID",
"CAP_SETUID", "CAP_SETUID",
"CAP_SETFCAP", "CAP_SETFCAP",
"CAP_SETPCAP", "CAP_SETPCAP",
"CAP_NET_BIND_SERVICE", "CAP_NET_BIND_SERVICE",
"CAP_SYS_CHROOT", "CAP_SYS_CHROOT",
"CAP_KILL", "CAP_KILL",
"CAP_AUDIT_WRITE", "CAP_AUDIT_WRITE",
}, },
Effective: []string{ Effective: []string{
"CAP_CHOWN", "CAP_CHOWN",
"CAP_DAC_OVERRIDE", "CAP_DAC_OVERRIDE",
"CAP_FSETID", "CAP_FSETID",
"CAP_FOWNER", "CAP_FOWNER",
"CAP_MKNOD", "CAP_MKNOD",
"CAP_NET_RAW", "CAP_NET_RAW",
"CAP_SETGID", "CAP_SETGID",
"CAP_SETUID", "CAP_SETUID",
"CAP_SETFCAP", "CAP_SETFCAP",
"CAP_SETPCAP", "CAP_SETPCAP",
"CAP_NET_BIND_SERVICE", "CAP_NET_BIND_SERVICE",
"CAP_SYS_CHROOT", "CAP_SYS_CHROOT",
"CAP_KILL", "CAP_KILL",
"CAP_AUDIT_WRITE", "CAP_AUDIT_WRITE",
}, },
Inheritable: []string{ Inheritable: []string{
"CAP_CHOWN", "CAP_CHOWN",
"CAP_DAC_OVERRIDE", "CAP_DAC_OVERRIDE",
"CAP_FSETID", "CAP_FSETID",
"CAP_FOWNER", "CAP_FOWNER",
"CAP_MKNOD", "CAP_MKNOD",
"CAP_NET_RAW", "CAP_NET_RAW",
"CAP_SETGID", "CAP_SETGID",
"CAP_SETUID", "CAP_SETUID",
"CAP_SETFCAP", "CAP_SETFCAP",
"CAP_SETPCAP", "CAP_SETPCAP",
"CAP_NET_BIND_SERVICE", "CAP_NET_BIND_SERVICE",
"CAP_SYS_CHROOT", "CAP_SYS_CHROOT",
"CAP_KILL", "CAP_KILL",
"CAP_AUDIT_WRITE", "CAP_AUDIT_WRITE",
}, },
Permitted: []string{ Permitted: []string{
"CAP_CHOWN", "CAP_CHOWN",
"CAP_DAC_OVERRIDE", "CAP_DAC_OVERRIDE",
"CAP_FSETID", "CAP_FSETID",
"CAP_FOWNER", "CAP_FOWNER",
"CAP_MKNOD", "CAP_MKNOD",
"CAP_NET_RAW", "CAP_NET_RAW",
"CAP_SETGID", "CAP_SETGID",
"CAP_SETUID", "CAP_SETUID",
"CAP_SETFCAP", "CAP_SETFCAP",
"CAP_SETPCAP", "CAP_SETPCAP",
"CAP_NET_BIND_SERVICE", "CAP_NET_BIND_SERVICE",
"CAP_SYS_CHROOT", "CAP_SYS_CHROOT",
"CAP_KILL", "CAP_KILL",
"CAP_AUDIT_WRITE", "CAP_AUDIT_WRITE",
}, },
Ambient: []string{ Ambient: []string{
"CAP_CHOWN", "CAP_CHOWN",
"CAP_DAC_OVERRIDE", "CAP_DAC_OVERRIDE",
"CAP_FSETID", "CAP_FSETID",
"CAP_FOWNER", "CAP_FOWNER",
"CAP_MKNOD", "CAP_MKNOD",
"CAP_NET_RAW", "CAP_NET_RAW",
"CAP_SETGID", "CAP_SETGID",
"CAP_SETUID", "CAP_SETUID",
"CAP_SETFCAP", "CAP_SETFCAP",
"CAP_SETPCAP", "CAP_SETPCAP",
"CAP_NET_BIND_SERVICE", "CAP_NET_BIND_SERVICE",
"CAP_SYS_CHROOT", "CAP_SYS_CHROOT",
"CAP_KILL", "CAP_KILL",
"CAP_AUDIT_WRITE", "CAP_AUDIT_WRITE",
}, },
}, },
Namespaces: configs.Namespaces([]configs.Namespace{ Namespaces: configs.Namespaces([]configs.Namespace{
{Type: configs.NEWNS}, {Type: configs.NEWNS},
{Type: configs.NEWUTS}, {Type: configs.NEWUTS},
@ -155,7 +159,7 @@ config := &configs.Config{
Parent: "system", Parent: "system",
Resources: &configs.Resources{ Resources: &configs.Resources{
MemorySwappiness: nil, MemorySwappiness: nil,
Devices: specconv.AllowedDevices, Devices: devices,
}, },
}, },
MaskPaths: []string{ MaskPaths: []string{
@ -313,7 +317,7 @@ state, err := container.State()
#### Checkpoint & Restore #### Checkpoint & Restore
libcontainer now integrates [CRIU](http://criu.org/) for checkpointing and restoring containers. libcontainer now integrates [CRIU](http://criu.org/) for checkpointing and restoring containers.
This let's you save the state of a process running inside a container to disk, and then restore This lets you save the state of a process running inside a container to disk, and then restore
that state into a new process, on the same machine or on another machine. that state into a new process, on the same machine or on another machine.
`criu` version 1.5.2 or higher is required to use checkpoint and restore. `criu` version 1.5.2 or higher is required to use checkpoint and restore.

View File

@ -59,14 +59,38 @@
#include <sys/syscall.h> #include <sys/syscall.h>
/* Use our own wrapper for memfd_create. */ /* Use our own wrapper for memfd_create. */
#if !defined(SYS_memfd_create) && defined(__NR_memfd_create) #ifndef SYS_memfd_create
# define SYS_memfd_create __NR_memfd_create # ifdef __NR_memfd_create
# define SYS_memfd_create __NR_memfd_create
# else
/* These values come from <https://fedora.juszkiewicz.com.pl/syscalls.html>. */
# warning "libc is outdated -- using hard-coded SYS_memfd_create"
# if defined(__x86_64__)
# define SYS_memfd_create 319
# elif defined(__i386__)
# define SYS_memfd_create 356
# elif defined(__ia64__)
# define SYS_memfd_create 1340
# elif defined(__arm__)
# define SYS_memfd_create 385
# elif defined(__aarch64__)
# define SYS_memfd_create 279
# elif defined(__ppc__) || defined(__PPC64__) || defined(__powerpc64__)
# define SYS_memfd_create 360
# elif defined(__s390__) || defined(__s390x__)
# define SYS_memfd_create 350
# else
# warning "unknown architecture -- cannot hard-code SYS_memfd_create"
# endif
# endif
#endif #endif
/* memfd_create(2) flags -- copied from <linux/memfd.h>. */ /* memfd_create(2) flags -- copied from <linux/memfd.h>. */
#ifndef MFD_CLOEXEC #ifndef MFD_CLOEXEC
# define MFD_CLOEXEC 0x0001U # define MFD_CLOEXEC 0x0001U
# define MFD_ALLOW_SEALING 0x0002U # define MFD_ALLOW_SEALING 0x0002U
#endif #endif
int memfd_create(const char *name, unsigned int flags) int memfd_create(const char *name, unsigned int flags)
{ {
#ifdef SYS_memfd_create #ifdef SYS_memfd_create
@ -77,7 +101,6 @@ int memfd_create(const char *name, unsigned int flags)
#endif #endif
} }
/* This comes directly from <linux/fcntl.h>. */ /* This comes directly from <linux/fcntl.h>. */
#ifndef F_LINUX_SPECIFIC_BASE #ifndef F_LINUX_SPECIFIC_BASE
# define F_LINUX_SPECIFIC_BASE 1024 # define F_LINUX_SPECIFIC_BASE 1024
@ -103,7 +126,7 @@ static void *must_realloc(void *ptr, size_t size)
void *old = ptr; void *old = ptr;
do { do {
ptr = realloc(old, size); ptr = realloc(old, size);
} while(!ptr); } while (!ptr);
return ptr; return ptr;
} }
@ -115,10 +138,10 @@ static void *must_realloc(void *ptr, size_t size)
static int is_self_cloned(void) static int is_self_cloned(void)
{ {
int fd, ret, is_cloned = 0; int fd, ret, is_cloned = 0;
struct stat statbuf = {}; struct stat statbuf = { };
struct statfs fsbuf = {}; struct statfs fsbuf = { };
fd = open("/proc/self/exe", O_RDONLY|O_CLOEXEC); fd = open("/proc/self/exe", O_RDONLY | O_CLOEXEC);
if (fd < 0) { if (fd < 0) {
fprintf(stderr, "you have no read access to runc binary file\n"); fprintf(stderr, "you have no read access to runc binary file\n");
return -ENOTRECOVERABLE; return -ENOTRECOVERABLE;
@ -274,7 +297,7 @@ enum {
static int make_execfd(int *fdtype) static int make_execfd(int *fdtype)
{ {
int fd = -1; int fd = -1;
char template[PATH_MAX] = {0}; char template[PATH_MAX] = { 0 };
char *prefix = getenv("_LIBCONTAINER_STATEDIR"); char *prefix = getenv("_LIBCONTAINER_STATEDIR");
if (!prefix || *prefix != '/') if (!prefix || *prefix != '/')
@ -303,7 +326,7 @@ static int make_execfd(int *fdtype)
*fdtype = EFD_FILE; *fdtype = EFD_FILE;
fd = open(prefix, O_TMPFILE | O_EXCL | O_RDWR | O_CLOEXEC, 0700); fd = open(prefix, O_TMPFILE | O_EXCL | O_RDWR | O_CLOEXEC, 0700);
if (fd >= 0) { if (fd >= 0) {
struct stat statbuf = {}; struct stat statbuf = { };
bool working_otmpfile = false; bool working_otmpfile = false;
/* /*
@ -348,27 +371,27 @@ static int seal_execfd(int *fd, int fdtype)
switch (fdtype) { switch (fdtype) {
case EFD_MEMFD: case EFD_MEMFD:
return fcntl(*fd, F_ADD_SEALS, RUNC_MEMFD_SEALS); return fcntl(*fd, F_ADD_SEALS, RUNC_MEMFD_SEALS);
case EFD_FILE: { case EFD_FILE:{
/* Need to re-open our pseudo-memfd as an O_PATH to avoid execve(2) giving -ETXTBSY. */ /* Need to re-open our pseudo-memfd as an O_PATH to avoid execve(2) giving -ETXTBSY. */
int newfd; int newfd;
char fdpath[PATH_MAX] = {0}; char fdpath[PATH_MAX] = { 0 };
if (fchmod(*fd, 0100) < 0) if (fchmod(*fd, 0100) < 0)
return -1; return -1;
if (snprintf(fdpath, sizeof(fdpath), "/proc/self/fd/%d", *fd) < 0) if (snprintf(fdpath, sizeof(fdpath), "/proc/self/fd/%d", *fd) < 0)
return -1; return -1;
newfd = open(fdpath, O_PATH | O_CLOEXEC); newfd = open(fdpath, O_PATH | O_CLOEXEC);
if (newfd < 0) if (newfd < 0)
return -1; return -1;
close(*fd); close(*fd);
*fd = newfd; *fd = newfd;
return 0; return 0;
} }
default: default:
break; break;
} }
return -1; return -1;
} }
@ -376,7 +399,7 @@ static int seal_execfd(int *fd, int fdtype)
static int try_bindfd(void) static int try_bindfd(void)
{ {
int fd, ret = -1; int fd, ret = -1;
char template[PATH_MAX] = {0}; char template[PATH_MAX] = { 0 };
char *prefix = getenv("_LIBCONTAINER_STATEDIR"); char *prefix = getenv("_LIBCONTAINER_STATEDIR");
if (!prefix || *prefix != '/') if (!prefix || *prefix != '/')
@ -404,7 +427,6 @@ static int try_bindfd(void)
if (mount("", template, "", MS_REMOUNT | MS_BIND | MS_RDONLY, "") < 0) if (mount("", template, "", MS_REMOUNT | MS_BIND | MS_RDONLY, "") < 0)
goto out_umount; goto out_umount;
/* Get read-only handle that we're sure can't be made read-write. */ /* Get read-only handle that we're sure can't be made read-write. */
ret = open(template, O_PATH | O_CLOEXEC); ret = open(template, O_PATH | O_CLOEXEC);
@ -448,7 +470,7 @@ static ssize_t fd_to_fd(int outfd, int infd)
if (n < 0) if (n < 0)
return -1; return -1;
nwritten += n; nwritten += n;
} while(nwritten < nread); } while (nwritten < nread);
total += nwritten; total += nwritten;
} }
@ -459,7 +481,7 @@ static ssize_t fd_to_fd(int outfd, int infd)
static int clone_binary(void) static int clone_binary(void)
{ {
int binfd, execfd; int binfd, execfd;
struct stat statbuf = {}; struct stat statbuf = { };
size_t sent = 0; size_t sent = 0;
int fdtype = EFD_NONE; int fdtype = EFD_NONE;

View File

@ -0,0 +1,142 @@
#include <stdlib.h>
#include <string.h>
#ifdef ESCAPE_TEST
# include <assert.h>
# define test_assert(arg) assert(arg)
#else
# define test_assert(arg)
#endif
#define DEL '\x7f'
/*
 * Map a nibble value (0 to 15, carried in a char) to the matching
 * lowercase hexadecimal digit ('0' to 'f'); effectively a one-digit
 * base-16 itoa. Out-of-range input yields '?' (and is additionally
 * caught by test_assert() when that macro is active).
 */
static char hex(char i)
{
	test_assert(i >= 0 && i < 16);

	if (i < 0 || i >= 16) {
		return '?';
	}
	return (i < 10) ? '0' + i : 'a' + i - 10;
}
/*
 * Report how many _additional_ bytes are required to represent the
 * character c inside a JSON string. A result of 0 means the character
 * can be copied verbatim.
 */
static int need_escape(char c)
{
	/* Short escapes (\\, \", \b, \n, \r, \t, \f): one extra byte. */
	if (c == '\\' || c == '"' || c == '\b' || c == '\n' ||
	    c == '\r' || c == '\t' || c == '\f') {
		return 1;
	}
	/* DEL and the remaining ASCII 01..31 become \u00xx: five extra bytes. */
	if (c == DEL || (c > 0 && c < ' ')) {
		return 5;
	}
	return 0;
}
/*
 * JSON-escape a string (the minimal requirements of RFC4627 section 2.5,
 * plus the DEL (0x7f) character).
 *
 * The argument is expected to have been allocated via malloc. If nothing
 * needs escaping, it is returned unchanged; otherwise it is freed and a
 * newly allocated, escaped copy is returned. Either way, the returned
 * pointer needs to be freed by the caller.
 *
 * If the output buffer cannot be allocated, the argument is freed and a
 * strdup'd error message is returned instead (which can itself be NULL).
 */
char *escape_json_string(char *s)
{
	int srclen = 0, extra = 0, pos = 0, len;
	char *in, *out;

	/*
	 * Pass 1: measure the input and add up the extra bytes escaping
	 * would need. If there are none, skip the allocation entirely and
	 * hand the argument straight back.
	 *
	 * XXX: this count must stay in sync with the escaping pass below
	 * (the test_assert()s verify that).
	 */
	while (s[srclen] != '\0') {
		extra += need_escape(s[srclen]);
		srclen++;
	}
	if (extra == 0) {
		return s;
	}

	len = srclen + extra + 1;
	out = malloc(len);
	if (out == NULL) {
		free(s);
		/*
		 * malloc just failed, so strdup may fail too; in the worst
		 * case NULL is returned from here.
		 */
		return strdup("escape_json_string: out of memory");
	}

	/* Pass 2: copy the input into out, escaping as we go. */
	for (in = s; *in != '\0'; in++) {
		char ch = *in;

		if (ch == '"' || ch == '\\') {
			/* Backslash followed by the character itself. */
			test_assert(need_escape(ch) == 1);
			out[pos++] = '\\';
			out[pos++] = ch;
		} else if ((ch < 0 || ch >= ' ') && ch != DEL) {
			/* No escape needed. */
			test_assert(need_escape(ch) == 0);
			out[pos++] = ch;
		} else {
			/* Control character or DEL: short escape or \u00xx. */
			out[pos++] = '\\';
			switch (ch) {
			case '\b':
				out[pos++] = 'b';
				break;
			case '\n':
				out[pos++] = 'n';
				break;
			case '\r':
				out[pos++] = 'r';
				break;
			case '\t':
				out[pos++] = 't';
				break;
			case '\f':
				out[pos++] = 'f';
				break;
			default:
				test_assert(need_escape(ch) == 5);
				out[pos++] = 'u';
				out[pos++] = '0';
				out[pos++] = '0';
				out[pos++] = hex(ch >> 4);
				out[pos++] = hex(ch & 0x0f);
			}
		}
	}

	/* The bytes written plus the terminator must match the estimate. */
	test_assert(pos + 1 == len);
	out[pos] = '\0';

	free(s);
	return out;
}

View File

@ -29,6 +29,8 @@
/* Get all of the CLONE_NEW* flags. */ /* Get all of the CLONE_NEW* flags. */
#include "namespace.h" #include "namespace.h"
extern char *escape_json_string(char *str);
/* Synchronisation values. */ /* Synchronisation values. */
enum sync_t { enum sync_t {
SYNC_USERMAP_PLS = 0x40, /* Request parent to map our users. */ SYNC_USERMAP_PLS = 0x40, /* Request parent to map our users. */
@ -36,7 +38,7 @@ enum sync_t {
SYNC_RECVPID_PLS = 0x42, /* Tell parent we're sending the PID. */ SYNC_RECVPID_PLS = 0x42, /* Tell parent we're sending the PID. */
SYNC_RECVPID_ACK = 0x43, /* PID was correctly received by parent. */ SYNC_RECVPID_ACK = 0x43, /* PID was correctly received by parent. */
SYNC_GRANDCHILD = 0x44, /* The grandchild is ready to run. */ SYNC_GRANDCHILD = 0x44, /* The grandchild is ready to run. */
SYNC_CHILD_READY = 0x45, /* The child or grandchild is ready to return. */ SYNC_CHILD_FINISH = 0x45, /* The child or grandchild has finished. */
}; };
/* /*
@ -45,10 +47,14 @@ enum sync_t {
*/ */
#define CREATECGROUPNS 0x80 #define CREATECGROUPNS 0x80
#define STAGE_SETUP -1
/* longjmp() arguments. */ /* longjmp() arguments. */
#define JUMP_PARENT 0x00 #define STAGE_PARENT 0
#define JUMP_CHILD 0xA0 #define STAGE_CHILD 1
#define JUMP_INIT 0xA1 #define STAGE_INIT 2
/* Stores the current stage of nsexec. */
int current_stage = STAGE_SETUP;
/* Assume the stack grows down, so arguments should be above it. */ /* Assume the stack grows down, so arguments should be above it. */
struct clone_t { struct clone_t {
@ -56,7 +62,7 @@ struct clone_t {
* Reserve some space for clone() to locate arguments * Reserve some space for clone() to locate arguments
* and retcode in this place * and retcode in this place
*/ */
char stack[4096] __attribute__ ((aligned(16))); char stack[4096] __attribute__((aligned(16)));
char stack_ptr[0]; char stack_ptr[0];
/* There's two children. This is used to execute the different code. */ /* There's two children. This is used to execute the different code. */
@ -102,31 +108,31 @@ static int logfd = -1;
* List of netlink message types sent to us as part of bootstrapping the init. * List of netlink message types sent to us as part of bootstrapping the init.
* These constants are defined in libcontainer/message_linux.go. * These constants are defined in libcontainer/message_linux.go.
*/ */
#define INIT_MSG 62000 #define INIT_MSG 62000
#define CLONE_FLAGS_ATTR 27281 #define CLONE_FLAGS_ATTR 27281
#define NS_PATHS_ATTR 27282 #define NS_PATHS_ATTR 27282
#define UIDMAP_ATTR 27283 #define UIDMAP_ATTR 27283
#define GIDMAP_ATTR 27284 #define GIDMAP_ATTR 27284
#define SETGROUP_ATTR 27285 #define SETGROUP_ATTR 27285
#define OOM_SCORE_ADJ_ATTR 27286 #define OOM_SCORE_ADJ_ATTR 27286
#define ROOTLESS_EUID_ATTR 27287 #define ROOTLESS_EUID_ATTR 27287
#define UIDMAPPATH_ATTR 27288 #define UIDMAPPATH_ATTR 27288
#define GIDMAPPATH_ATTR 27289 #define GIDMAPPATH_ATTR 27289
/* /*
* Use the raw syscall for versions of glibc which don't include a function for * Use the raw syscall for versions of glibc which don't include a function for
* it, namely (glibc 2.12). * it, namely (glibc 2.12).
*/ */
#if __GLIBC__ == 2 && __GLIBC_MINOR__ < 14 #if __GLIBC__ == 2 && __GLIBC_MINOR__ < 14
# define _GNU_SOURCE # define _GNU_SOURCE
# include "syscall.h" # include "syscall.h"
# if !defined(SYS_setns) && defined(__NR_setns) # if !defined(SYS_setns) && defined(__NR_setns)
# define SYS_setns __NR_setns # define SYS_setns __NR_setns
# endif # endif
#ifndef SYS_setns # ifndef SYS_setns
# error "setns(2) syscall not supported by glibc version" # error "setns(2) syscall not supported by glibc version"
#endif # endif
int setns(int fd, int nstype) int setns(int fd, int nstype)
{ {
@ -134,33 +140,43 @@ int setns(int fd, int nstype)
} }
#endif #endif
static void write_log_with_info(const char *level, const char *function, int line, const char *format, ...) static void write_log(const char *level, const char *format, ...)
{ {
char message[1024] = {}; char *message = NULL, *stage = NULL;
va_list args; va_list args;
int ret;
if (logfd < 0 || level == NULL) if (logfd < 0 || level == NULL)
return; goto out;
va_start(args, format); va_start(args, format);
if (vsnprintf(message, sizeof(message), format, args) < 0) ret = vasprintf(&message, format, args);
goto done;
dprintf(logfd, "{\"level\":\"%s\", \"msg\": \"%s:%d %s\"}\n", level, function, line, message);
done:
va_end(args); va_end(args);
} if (ret < 0)
goto out;
#define write_log(level, fmt, ...) \ message = escape_json_string(message);
write_log_with_info((level), __FUNCTION__, __LINE__, (fmt), ##__VA_ARGS__)
if (current_stage == STAGE_SETUP)
stage = strdup("nsexec");
else
ret = asprintf(&stage, "nsexec-%d", current_stage);
if (ret < 0)
goto out;
dprintf(logfd, "{\"level\":\"%s\", \"msg\": \"%s[%d]: %s\"}\n", level, stage, getpid(), message);
out:
free(message);
free(stage);
}
/* XXX: This is ugly. */ /* XXX: This is ugly. */
static int syncfd = -1; static int syncfd = -1;
#define bail(fmt, ...) \ #define bail(fmt, ...) \
do { \ do { \
write_log(FATAL, "nsenter: " fmt ": %m", ##__VA_ARGS__); \ write_log(FATAL, fmt ": %m", ##__VA_ARGS__); \
exit(1); \ exit(1); \
} while(0) } while(0)
@ -187,7 +203,7 @@ static int write_file(char *data, size_t data_len, char *pathfmt, ...)
goto out; goto out;
} }
out: out:
close(fd); close(fd);
return ret; return ret;
} }
@ -297,9 +313,11 @@ static void update_uidmap(const char *path, int pid, char *map, size_t map_len)
if (map == NULL || map_len <= 0) if (map == NULL || map_len <= 0)
return; return;
write_log(DEBUG, "update /proc/%d/uid_map to '%s'", pid, map);
if (write_file(map, map_len, "/proc/%d/uid_map", pid) < 0) { if (write_file(map, map_len, "/proc/%d/uid_map", pid) < 0) {
if (errno != EPERM) if (errno != EPERM)
bail("failed to update /proc/%d/uid_map", pid); bail("failed to update /proc/%d/uid_map", pid);
write_log(DEBUG, "update /proc/%d/uid_map got -EPERM (trying %s)", pid, path);
if (try_mapping_tool(path, pid, map, map_len)) if (try_mapping_tool(path, pid, map, map_len))
bail("failed to use newuid map on %d", pid); bail("failed to use newuid map on %d", pid);
} }
@ -310,9 +328,11 @@ static void update_gidmap(const char *path, int pid, char *map, size_t map_len)
if (map == NULL || map_len <= 0) if (map == NULL || map_len <= 0)
return; return;
write_log(DEBUG, "update /proc/%d/gid_map to '%s'", pid, map);
if (write_file(map, map_len, "/proc/%d/gid_map", pid) < 0) { if (write_file(map, map_len, "/proc/%d/gid_map", pid) < 0) {
if (errno != EPERM) if (errno != EPERM)
bail("failed to update /proc/%d/gid_map", pid); bail("failed to update /proc/%d/gid_map", pid);
write_log(DEBUG, "update /proc/%d/gid_map got -EPERM (trying %s)", pid, path);
if (try_mapping_tool(path, pid, map, map_len)) if (try_mapping_tool(path, pid, map, map_len))
bail("failed to use newgid map on %d", pid); bail("failed to use newgid map on %d", pid);
} }
@ -323,19 +343,20 @@ static void update_oom_score_adj(char *data, size_t len)
if (data == NULL || len <= 0) if (data == NULL || len <= 0)
return; return;
write_log(DEBUG, "update /proc/self/oom_score_adj to '%s'", data);
if (write_file(data, len, "/proc/self/oom_score_adj") < 0) if (write_file(data, len, "/proc/self/oom_score_adj") < 0)
bail("failed to update /proc/self/oom_score_adj"); bail("failed to update /proc/self/oom_score_adj");
} }
/* A dummy function that just jumps to the given jumpval. */ /* A dummy function that just jumps to the given jumpval. */
static int child_func(void *arg) __attribute__ ((noinline)); static int child_func(void *arg) __attribute__((noinline));
static int child_func(void *arg) static int child_func(void *arg)
{ {
struct clone_t *ca = (struct clone_t *)arg; struct clone_t *ca = (struct clone_t *)arg;
longjmp(*ca->env, ca->jmpval); longjmp(*ca->env, ca->jmpval);
} }
static int clone_parent(jmp_buf *env, int jmpval) __attribute__ ((noinline)); static int clone_parent(jmp_buf *env, int jmpval) __attribute__((noinline));
static int clone_parent(jmp_buf *env, int jmpval) static int clone_parent(jmp_buf *env, int jmpval)
{ {
struct clone_t ca = { struct clone_t ca = {
@ -507,7 +528,6 @@ void join_namespaces(char *nslist)
char *namespace = strtok_r(nslist, ",", &saveptr); char *namespace = strtok_r(nslist, ",", &saveptr);
struct namespace_t { struct namespace_t {
int fd; int fd;
int ns;
char type[PATH_MAX]; char type[PATH_MAX];
char path[PATH_MAX]; char path[PATH_MAX];
} *namespaces = NULL; } *namespaces = NULL;
@ -542,7 +562,7 @@ void join_namespaces(char *nslist)
bail("failed to open %s", path); bail("failed to open %s", path);
ns->fd = fd; ns->fd = fd;
ns->ns = nsflag(namespace); strncpy(ns->type, namespace, PATH_MAX - 1);
strncpy(ns->path, path, PATH_MAX - 1); strncpy(ns->path, path, PATH_MAX - 1);
ns->path[PATH_MAX - 1] = '\0'; ns->path[PATH_MAX - 1] = '\0';
} while ((namespace = strtok_r(NULL, ",", &saveptr)) != NULL); } while ((namespace = strtok_r(NULL, ",", &saveptr)) != NULL);
@ -555,12 +575,14 @@ void join_namespaces(char *nslist)
*/ */
for (i = 0; i < num; i++) { for (i = 0; i < num; i++) {
struct namespace_t ns = namespaces[i]; struct namespace_t *ns = &namespaces[i];
int flag = nsflag(ns->type);
if (setns(ns.fd, ns.ns) < 0) write_log(DEBUG, "setns(%#x) into %s namespace (with path %s)", flag, ns->type, ns->path);
bail("failed to setns to %s", ns.path); if (setns(ns->fd, flag) < 0)
bail("failed to setns into %s namespace", ns->type);
close(ns.fd); close(ns->fd);
} }
free(namespaces); free(namespaces);
@ -569,6 +591,14 @@ void join_namespaces(char *nslist)
/* Defined in cloned_binary.c. */ /* Defined in cloned_binary.c. */
extern int ensure_cloned_binary(void); extern int ensure_cloned_binary(void);
/*
 * kill(2) wrapper that only ever signals a single, specific process:
 * for pid <= 0 nothing is sent and 0 (success) is returned; otherwise
 * the result of kill(pid, signum) is passed through.
 */
static inline int sane_kill(pid_t pid, int signum)
{
	if (pid <= 0)
		return 0;
	return kill(pid, signum);
}
void nsexec(void) void nsexec(void)
{ {
int pipenum; int pipenum;
@ -598,7 +628,14 @@ void nsexec(void)
if (ensure_cloned_binary() < 0) if (ensure_cloned_binary() < 0)
bail("could not ensure we are a cloned binary"); bail("could not ensure we are a cloned binary");
write_log(DEBUG, "nsexec started"); /*
* Inform the parent we're past initial setup.
* For the other side of this, see initWaiter.
*/
if (write(pipenum, "", 1) != 1)
bail("could not inform the parent we are past initial setup");
write_log(DEBUG, "=> nsexec container setup");
/* Parse all of the netlink configuration. */ /* Parse all of the netlink configuration. */
nl_parse(pipenum, &config); nl_parse(pipenum, &config);
@ -622,6 +659,7 @@ void nsexec(void)
* containers), which is the recommendation from the kernel folks. * containers), which is the recommendation from the kernel folks.
*/ */
if (config.namespaces) { if (config.namespaces) {
write_log(DEBUG, "set process as non-dumpable");
if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0) if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0)
bail("failed to set process as non-dumpable"); bail("failed to set process as non-dumpable");
} }
@ -686,45 +724,49 @@ void nsexec(void)
* -- Aleksa "what has my life come to?" Sarai * -- Aleksa "what has my life come to?" Sarai
*/ */
switch (setjmp(env)) { current_stage = setjmp(env);
switch (current_stage) {
/* /*
* Stage 0: We're in the parent. Our job is just to create a new child * Stage 0: We're in the parent. Our job is just to create a new child
* (stage 1: JUMP_CHILD) process and write its uid_map and * (stage 1: STAGE_CHILD) process and write its uid_map and
* gid_map. That process will go on to create a new process, then * gid_map. That process will go on to create a new process, then
* it will send us its PID which we will send to the bootstrap * it will send us its PID which we will send to the bootstrap
* process. * process.
*/ */
case JUMP_PARENT:{ case STAGE_PARENT:{
int len; int len;
pid_t child, first_child = -1; pid_t stage1_pid = -1, stage2_pid = -1;
bool ready = false; bool stage1_complete, stage2_complete;
/* For debugging. */ /* For debugging. */
prctl(PR_SET_NAME, (unsigned long)"runc:[0:PARENT]", 0, 0, 0); prctl(PR_SET_NAME, (unsigned long)"runc:[0:PARENT]", 0, 0, 0);
write_log(DEBUG, "~> nsexec stage-0");
/* Start the process of getting a container. */ /* Start the process of getting a container. */
child = clone_parent(&env, JUMP_CHILD); write_log(DEBUG, "spawn stage-1");
if (child < 0) stage1_pid = clone_parent(&env, STAGE_CHILD);
bail("unable to fork: child_func"); if (stage1_pid < 0)
bail("unable to spawn stage-1");
/*
* State machine for synchronisation with the children.
*
* Father only return when both child and grandchild are
* ready, so we can receive all possible error codes
* generated by children.
*/
syncfd = sync_child_pipe[1]; syncfd = sync_child_pipe[1];
close(sync_child_pipe[0]); close(sync_child_pipe[0]);
while (!ready) { /*
* State machine for synchronisation with the children. We only
* return once both the child and grandchild are ready.
*/
write_log(DEBUG, "-> stage-1 synchronisation loop");
stage1_complete = false;
while (!stage1_complete) {
enum sync_t s; enum sync_t s;
if (read(syncfd, &s, sizeof(s)) != sizeof(s)) if (read(syncfd, &s, sizeof(s)) != sizeof(s))
bail("failed to sync with child: next state"); bail("failed to sync with stage-1: next state");
switch (s) { switch (s) {
case SYNC_USERMAP_PLS: case SYNC_USERMAP_PLS:
write_log(DEBUG, "stage-1 requested userns mappings");
/* /*
* Enable setgroups(2) if we've been asked to. But we also * Enable setgroups(2) if we've been asked to. But we also
* have to explicitly disable setgroups(2) if we're * have to explicitly disable setgroups(2) if we're
@ -735,70 +777,78 @@ void nsexec(void)
* For rootless multi-entry mapping, config.is_setgroup shall be true and * For rootless multi-entry mapping, config.is_setgroup shall be true and
* newuidmap/newgidmap shall be used. * newuidmap/newgidmap shall be used.
*/ */
if (config.is_rootless_euid && !config.is_setgroup) if (config.is_rootless_euid && !config.is_setgroup)
update_setgroups(child, SETGROUPS_DENY); update_setgroups(stage1_pid, SETGROUPS_DENY);
/* Set up mappings. */ /* Set up mappings. */
update_uidmap(config.uidmappath, child, config.uidmap, config.uidmap_len); update_uidmap(config.uidmappath, stage1_pid, config.uidmap, config.uidmap_len);
update_gidmap(config.gidmappath, child, config.gidmap, config.gidmap_len); update_gidmap(config.gidmappath, stage1_pid, config.gidmap, config.gidmap_len);
s = SYNC_USERMAP_ACK; s = SYNC_USERMAP_ACK;
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
kill(child, SIGKILL); sane_kill(stage1_pid, SIGKILL);
bail("failed to sync with child: write(SYNC_USERMAP_ACK)"); sane_kill(stage2_pid, SIGKILL);
bail("failed to sync with stage-1: write(SYNC_USERMAP_ACK)");
} }
break; break;
case SYNC_RECVPID_PLS:{ case SYNC_RECVPID_PLS:
first_child = child; write_log(DEBUG, "stage-1 requested pid to be forwarded");
/* Get the init_func pid. */ /* Get the stage-2 pid. */
if (read(syncfd, &child, sizeof(child)) != sizeof(child)) { if (read(syncfd, &stage2_pid, sizeof(stage2_pid)) != sizeof(stage2_pid)) {
kill(first_child, SIGKILL); sane_kill(stage1_pid, SIGKILL);
bail("failed to sync with child: read(childpid)"); sane_kill(stage2_pid, SIGKILL);
} bail("failed to sync with stage-1: read(stage2_pid)");
}
/* Send ACK. */ /* Send ACK. */
s = SYNC_RECVPID_ACK; s = SYNC_RECVPID_ACK;
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
kill(first_child, SIGKILL); sane_kill(stage1_pid, SIGKILL);
kill(child, SIGKILL); sane_kill(stage2_pid, SIGKILL);
bail("failed to sync with child: write(SYNC_RECVPID_ACK)"); bail("failed to sync with stage-1: write(SYNC_RECVPID_ACK)");
} }
/* Send the init_func pid back to our parent. /*
* * Send both the stage-1 and stage-2 pids back to runc.
* Send the init_func pid and the pid of the first child back to our parent. * runc needs the stage-2 to continue process management,
* We need to send both back because we can't reap the first child we created (CLONE_PARENT). * but because stage-1 was spawned with CLONE_PARENT we
* It becomes the responsibility of our parent to reap the first child. * cannot reap it within stage-0 and thus we need to ask
*/ * runc to reap the zombie for us.
len = dprintf(pipenum, "{\"pid\": %d, \"pid_first\": %d}\n", child, first_child); */
if (len < 0) { write_log(DEBUG, "forward stage-1 (%d) and stage-2 (%d) pids to runc",
kill(child, SIGKILL); stage1_pid, stage2_pid);
bail("unable to generate JSON for child pid"); len =
} dprintf(pipenum, "{\"stage1_pid\":%d,\"stage2_pid\":%d}\n", stage1_pid,
stage2_pid);
if (len < 0) {
sane_kill(stage1_pid, SIGKILL);
sane_kill(stage2_pid, SIGKILL);
bail("failed to sync with runc: write(pid-JSON)");
} }
break; break;
case SYNC_CHILD_READY: case SYNC_CHILD_FINISH:
ready = true; write_log(DEBUG, "stage-1 complete");
stage1_complete = true;
break; break;
default: default:
bail("unexpected sync value: %u", s); bail("unexpected sync value: %u", s);
} }
} }
write_log(DEBUG, "<- stage-1 synchronisation loop");
/* Now sync with grandchild. */ /* Now sync with grandchild. */
syncfd = sync_grandchild_pipe[1]; syncfd = sync_grandchild_pipe[1];
close(sync_grandchild_pipe[0]); close(sync_grandchild_pipe[0]);
write_log(DEBUG, "-> stage-2 synchronisation loop");
ready = false; stage2_complete = false;
while (!ready) { while (!stage2_complete) {
enum sync_t s; enum sync_t s;
write_log(DEBUG, "signalling stage-2 to run");
s = SYNC_GRANDCHILD; s = SYNC_GRANDCHILD;
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
kill(child, SIGKILL); sane_kill(stage2_pid, SIGKILL);
bail("failed to sync with child: write(SYNC_GRANDCHILD)"); bail("failed to sync with child: write(SYNC_GRANDCHILD)");
} }
@ -806,27 +856,31 @@ void nsexec(void)
bail("failed to sync with child: next state"); bail("failed to sync with child: next state");
switch (s) { switch (s) {
case SYNC_CHILD_READY: case SYNC_CHILD_FINISH:
ready = true; write_log(DEBUG, "stage-2 complete");
stage2_complete = true;
break; break;
default: default:
bail("unexpected sync value: %u", s); bail("unexpected sync value: %u", s);
} }
} }
write_log(DEBUG, "<- stage-2 synchronisation loop");
write_log(DEBUG, "<~ nsexec stage-0");
exit(0); exit(0);
} }
break;
/* /*
* Stage 1: We're in the first child process. Our job is to join any * Stage 1: We're in the first child process. Our job is to join any
* provided namespaces in the netlink payload and unshare all * provided namespaces in the netlink payload and unshare all of
* of the requested namespaces. If we've been asked to * the requested namespaces. If we've been asked to CLONE_NEWUSER,
* CLONE_NEWUSER, we will ask our parent (stage 0) to set up * we will ask our parent (stage 0) to set up our user mappings
* our user mappings for us. Then, we create a new child * for us. Then, we create a new child (stage 2: STAGE_INIT) for
* (stage 2: JUMP_INIT) for PID namespace. We then send the * PID namespace. We then send the child's PID to our parent
* child's PID to our parent (stage 0). * (stage 0).
*/ */
case JUMP_CHILD:{ case STAGE_CHILD:{
pid_t child; pid_t stage2_pid = -1;
enum sync_t s; enum sync_t s;
/* We're in a child and thus need to tell the parent if we die. */ /* We're in a child and thus need to tell the parent if we die. */
@ -835,11 +889,12 @@ void nsexec(void)
/* For debugging. */ /* For debugging. */
prctl(PR_SET_NAME, (unsigned long)"runc:[1:CHILD]", 0, 0, 0); prctl(PR_SET_NAME, (unsigned long)"runc:[1:CHILD]", 0, 0, 0);
write_log(DEBUG, "~> nsexec stage-1");
/* /*
* We need to setns first. We cannot do this earlier (in stage 0) * We need to setns first. We cannot do this earlier (in stage 0)
* because of the fact that we forked to get here (the PID of * because of the fact that we forked to get here (the PID of
* [stage 2: JUMP_INIT]) would be meaningless). We could send it * [stage 2: STAGE_INIT]) would be meaningless). We could send it
* using cmsg(3) but that's just annoying. * using cmsg(3) but that's just annoying.
*/ */
if (config.namespaces) if (config.namespaces)
@ -865,40 +920,50 @@ void nsexec(void)
* problem. * problem.
*/ */
if (config.cloneflags & CLONE_NEWUSER) { if (config.cloneflags & CLONE_NEWUSER) {
write_log(DEBUG, "unshare user namespace");
if (unshare(CLONE_NEWUSER) < 0) if (unshare(CLONE_NEWUSER) < 0)
bail("failed to unshare user namespace"); bail("failed to unshare user namespace");
config.cloneflags &= ~CLONE_NEWUSER; config.cloneflags &= ~CLONE_NEWUSER;
/* /*
* We don't have the privileges to do any mapping here (see the * We need to set ourselves as dumpable temporarily so that the
* clone_parent rant). So signal our parent to hook us up. * parent process can write to our procfs files.
*/ */
/* Switching is only necessary if we joined namespaces. */
if (config.namespaces) { if (config.namespaces) {
write_log(DEBUG, "temporarily set process as dumpable");
if (prctl(PR_SET_DUMPABLE, 1, 0, 0, 0) < 0) if (prctl(PR_SET_DUMPABLE, 1, 0, 0, 0) < 0)
bail("failed to set process as dumpable"); bail("failed to temporarily set process as dumpable");
} }
/*
* We don't have the privileges to do any mapping here (see the
* clone_parent rant). So signal stage-0 to do the mapping for
* us.
*/
write_log(DEBUG, "request stage-0 to map user namespace");
s = SYNC_USERMAP_PLS; s = SYNC_USERMAP_PLS;
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) if (write(syncfd, &s, sizeof(s)) != sizeof(s))
bail("failed to sync with parent: write(SYNC_USERMAP_PLS)"); bail("failed to sync with parent: write(SYNC_USERMAP_PLS)");
/* ... wait for mapping ... */ /* ... wait for mapping ... */
write_log(DEBUG, "request stage-0 to map user namespace");
if (read(syncfd, &s, sizeof(s)) != sizeof(s)) if (read(syncfd, &s, sizeof(s)) != sizeof(s))
bail("failed to sync with parent: read(SYNC_USERMAP_ACK)"); bail("failed to sync with parent: read(SYNC_USERMAP_ACK)");
if (s != SYNC_USERMAP_ACK) if (s != SYNC_USERMAP_ACK)
bail("failed to sync with parent: SYNC_USERMAP_ACK: got %u", s); bail("failed to sync with parent: SYNC_USERMAP_ACK: got %u", s);
/* Switching is only necessary if we joined namespaces. */
/* Revert temporary re-dumpable setting. */
if (config.namespaces) { if (config.namespaces) {
write_log(DEBUG, "re-set process as non-dumpable");
if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0) if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0)
bail("failed to set process as dumpable"); bail("failed to re-set process as non-dumpable");
} }
/* Become root in the namespace proper. */ /* Become root in the namespace proper. */
if (setresuid(0, 0, 0) < 0) if (setresuid(0, 0, 0) < 0)
bail("failed to become root in user namespace"); bail("failed to become root in user namespace");
} }
/* /*
* Unshare all of the namespaces. Now, it should be noted that this * Unshare all of the namespaces. Now, it should be noted that this
* ordering might break in the future (especially with rootless * ordering might break in the future (especially with rootless
@ -909,8 +974,9 @@ void nsexec(void)
* some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID) * some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID)
* was broken, so we'll just do it the long way anyway. * was broken, so we'll just do it the long way anyway.
*/ */
write_log(DEBUG, "unshare remaining namespace (except cgroupns)");
if (unshare(config.cloneflags & ~CLONE_NEWCGROUP) < 0) if (unshare(config.cloneflags & ~CLONE_NEWCGROUP) < 0)
bail("failed to unshare namespaces"); bail("failed to unshare remaining namespaces (except cgroupns)");
/* /*
* TODO: What about non-namespace clone flags that we're dropping here? * TODO: What about non-namespace clone flags that we're dropping here?
@ -921,41 +987,45 @@ void nsexec(void)
* which would break many applications and libraries, so we must fork * which would break many applications and libraries, so we must fork
* to actually enter the new PID namespace. * to actually enter the new PID namespace.
*/ */
child = clone_parent(&env, JUMP_INIT); write_log(DEBUG, "spawn stage-2");
if (child < 0) stage2_pid = clone_parent(&env, STAGE_INIT);
bail("unable to fork: init_func"); if (stage2_pid < 0)
bail("unable to spawn stage-2");
/* Send the child to our parent, which knows what it's doing. */ /* Send the child to our parent, which knows what it's doing. */
write_log(DEBUG, "request stage-0 to forward stage-2 pid (%d)", stage2_pid);
s = SYNC_RECVPID_PLS; s = SYNC_RECVPID_PLS;
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
kill(child, SIGKILL); sane_kill(stage2_pid, SIGKILL);
bail("failed to sync with parent: write(SYNC_RECVPID_PLS)"); bail("failed to sync with parent: write(SYNC_RECVPID_PLS)");
} }
if (write(syncfd, &child, sizeof(child)) != sizeof(child)) { if (write(syncfd, &stage2_pid, sizeof(stage2_pid)) != sizeof(stage2_pid)) {
kill(child, SIGKILL); sane_kill(stage2_pid, SIGKILL);
bail("failed to sync with parent: write(childpid)"); bail("failed to sync with parent: write(stage2_pid)");
} }
/* ... wait for parent to get the pid ... */ /* ... wait for parent to get the pid ... */
if (read(syncfd, &s, sizeof(s)) != sizeof(s)) { if (read(syncfd, &s, sizeof(s)) != sizeof(s)) {
kill(child, SIGKILL); sane_kill(stage2_pid, SIGKILL);
bail("failed to sync with parent: read(SYNC_RECVPID_ACK)"); bail("failed to sync with parent: read(SYNC_RECVPID_ACK)");
} }
if (s != SYNC_RECVPID_ACK) { if (s != SYNC_RECVPID_ACK) {
kill(child, SIGKILL); sane_kill(stage2_pid, SIGKILL);
bail("failed to sync with parent: SYNC_RECVPID_ACK: got %u", s); bail("failed to sync with parent: SYNC_RECVPID_ACK: got %u", s);
} }
s = SYNC_CHILD_READY; write_log(DEBUG, "signal completion to stage-0");
s = SYNC_CHILD_FINISH;
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
kill(child, SIGKILL); sane_kill(stage2_pid, SIGKILL);
bail("failed to sync with parent: write(SYNC_CHILD_READY)"); bail("failed to sync with parent: write(SYNC_CHILD_FINISH)");
} }
/* Our work is done. [Stage 2: JUMP_INIT] is doing the rest of the work. */ /* Our work is done. [Stage 2: STAGE_INIT] is doing the rest of the work. */
write_log(DEBUG, "<~ nsexec stage-1");
exit(0); exit(0);
} }
break;
/* /*
* Stage 2: We're the final child process, and the only process that will * Stage 2: We're the final child process, and the only process that will
@ -963,7 +1033,7 @@ void nsexec(void)
* final cleanup steps and then return to the Go runtime to allow * final cleanup steps and then return to the Go runtime to allow
* init_linux.go to run. * init_linux.go to run.
*/ */
case JUMP_INIT:{ case STAGE_INIT:{
/* /*
* We're inside the child now, having jumped from the * We're inside the child now, having jumped from the
* start_child() code after forking in the parent. * start_child() code after forking in the parent.
@ -978,6 +1048,7 @@ void nsexec(void)
/* For debugging. */ /* For debugging. */
prctl(PR_SET_NAME, (unsigned long)"runc:[2:INIT]", 0, 0, 0); prctl(PR_SET_NAME, (unsigned long)"runc:[2:INIT]", 0, 0, 0);
write_log(DEBUG, "~> nsexec stage-2");
if (read(syncfd, &s, sizeof(s)) != sizeof(s)) if (read(syncfd, &s, sizeof(s)) != sizeof(s))
bail("failed to sync with parent: read(SYNC_GRANDCHILD)"); bail("failed to sync with parent: read(SYNC_GRANDCHILD)");
@ -998,21 +1069,30 @@ void nsexec(void)
bail("setgroups failed"); bail("setgroups failed");
} }
/* ... wait until our topmost parent has finished cgroup setup in p.manager.Apply() ... */ /*
* Wait until our topmost parent has finished cgroup setup in
* p.manager.Apply().
*
* TODO(cyphar): Check if this code is actually needed because we
* should be in the cgroup even from stage-0, so
* waiting until now might not make sense.
*/
if (config.cloneflags & CLONE_NEWCGROUP) { if (config.cloneflags & CLONE_NEWCGROUP) {
uint8_t value; uint8_t value;
if (read(pipenum, &value, sizeof(value)) != sizeof(value)) if (read(pipenum, &value, sizeof(value)) != sizeof(value))
bail("read synchronisation value failed"); bail("read synchronisation value failed");
if (value == CREATECGROUPNS) { if (value == CREATECGROUPNS) {
write_log(DEBUG, "unshare cgroup namespace");
if (unshare(CLONE_NEWCGROUP) < 0) if (unshare(CLONE_NEWCGROUP) < 0)
bail("failed to unshare cgroup namespace"); bail("failed to unshare cgroup namespace");
} else } else
bail("received unknown synchronisation value"); bail("received unknown synchronisation value");
} }
s = SYNC_CHILD_READY; write_log(DEBUG, "signal completion to stage-0");
s = SYNC_CHILD_FINISH;
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) if (write(syncfd, &s, sizeof(s)) != sizeof(s))
bail("failed to sync with patent: write(SYNC_CHILD_READY)"); bail("failed to sync with patent: write(SYNC_CHILD_FINISH)");
/* Close sync pipes. */ /* Close sync pipes. */
close(sync_grandchild_pipe[0]); close(sync_grandchild_pipe[0]);
@ -1021,10 +1101,13 @@ void nsexec(void)
nl_free(&config); nl_free(&config);
/* Finish executing, let the Go runtime take over. */ /* Finish executing, let the Go runtime take over. */
write_log(DEBUG, "<= nsexec container setup");
write_log(DEBUG, "booting up go runtime ...");
return; return;
} }
break;
default: default:
bail("unexpected jump value"); bail("unknown stage '%d' for jump value", current_stage);
} }
/* Should never be reached. */ /* Should never be reached. */

View File

@ -0,0 +1 @@
../escape.c

View File

@ -0,0 +1,53 @@
package escapetest
// This file is part of escape_json_string unit test.
// It is in a separate package so cgo can be used together
// with go test.
// #include <stdlib.h>
// extern char *escape_json_string(char *str);
// #cgo CFLAGS: -DESCAPE_TEST=1
import "C"
import (
"testing"
"unsafe"
)
// testEscapeJsonString passes input through the C escape_json_string function
// and reports a test error when the escaped result differs from want.
//
// NOTE(review): the buffer created by C.CString is not freed here; presumably
// escape_json_string takes ownership of its argument — confirm against escape.c
// before adding a free.
func testEscapeJsonString(t *testing.T, input, want string) {
	escaped := C.escape_json_string(C.CString(input))
	got := C.GoString(escaped)
	// The Go copy has been taken, so the C result can be released.
	C.free(unsafe.Pointer(escaped))

	t.Logf("input: %q, output: %q", input, got)
	if got == want {
		return
	}
	t.Errorf("Failed on input: %q, want %q, got %q", input, want, got)
}
// testEscapeJson feeds a fixed table of inputs (empty string, backslashes,
// quotes, control characters, DEL, NUL and non-ASCII bytes) through
// testEscapeJsonString, which checks each against its expected escaped form.
func testEscapeJson(t *testing.T) {
	type escapeCase struct {
		input, output string
	}

	cases := []escapeCase{
		{"", ""},
		{"abcdef", "abcdef"},
		{`\\\\\\`, `\\\\\\\\\\\\`},
		{`with"quote`, `with\"quote`},
		{"\n\r\b\t\f\\", `\n\r\b\t\f\\`},
		{"\007", "\\u0007"},
		{"\017 \020 \037", "\\u000f \\u0010 \\u001f"},
		{"\033", "\\u001b"},
		{`<->`, `<->`},
		{"\176\177\200", "~\\u007f\200"},
		{"\000", ""},
		{"a\x7fxc", "a\\u007fxc"},
		{"a\033xc", "a\\u001bxc"},
		{"a\nxc", "a\\nxc"},
		{"a\\xc", "a\\\\xc"},
		{"Barney B\303\244r", "Barney B\303\244r"},
	}

	for i := range cases {
		testEscapeJsonString(t, cases[i].input, cases[i].output)
	}
}

View File

@ -1,41 +0,0 @@
package user
import (
"errors"
)
// Sentinel errors for the user and group lookup helpers in this package.
var (
// ErrUnsupported reports that the current operating system does not provide
// the required (passwd-formatted) data for user lookups.
ErrUnsupported = errors.New("user lookup: operating system does not provide passwd-formatted data")
// ErrNoPasswdEntries reports that no matching entries were found in the
// passwd file.
ErrNoPasswdEntries = errors.New("no matching entries in passwd file")
// ErrNoGroupEntries reports that no matching entries were found in the
// group file.
ErrNoGroupEntries = errors.New("no matching entries in group file")
)
// LookupUser looks up a user by their username in /etc/passwd. If the user
// cannot be found (or there is no /etc/passwd file on the filesystem), then
// LookupUser returns an error.
//
// It is a thin exported wrapper that delegates to the unexported lookupUser.
func LookupUser(username string) (User, error) {
return lookupUser(username)
}
// LookupUid looks up a user by their user id in /etc/passwd. If the user cannot
// be found (or there is no /etc/passwd file on the filesystem), then LookupUid
// returns an error.
//
// It is a thin exported wrapper that delegates to the unexported lookupUid.
func LookupUid(uid int) (User, error) {
return lookupUid(uid)
}
// LookupGroup looks up a group by its name in /etc/group. If the group cannot
// be found (or there is no /etc/group file on the filesystem), then LookupGroup
// returns an error.
//
// It is a thin exported wrapper that delegates to the unexported lookupGroup.
func LookupGroup(groupname string) (Group, error) {
return lookupGroup(groupname)
}
// LookupGid looks up a group by its group id in /etc/group. If the group cannot
// be found (or there is no /etc/group file on the filesystem), then LookupGid
// returns an error.
//
// It is a thin exported wrapper that delegates to the unexported lookupGid.
func LookupGid(gid int) (Group, error) {
return lookupGid(gid)
}

View File

@ -16,13 +16,19 @@ const (
unixGroupPath = "/etc/group" unixGroupPath = "/etc/group"
) )
func lookupUser(username string) (User, error) { // LookupUser looks up a user by their username in /etc/passwd. If the user
// cannot be found (or there is no /etc/passwd file on the filesystem), then
// LookupUser returns an error.
func LookupUser(username string) (User, error) {
return lookupUserFunc(func(u User) bool { return lookupUserFunc(func(u User) bool {
return u.Name == username return u.Name == username
}) })
} }
func lookupUid(uid int) (User, error) { // LookupUid looks up a user by their user id in /etc/passwd. If the user cannot
// be found (or there is no /etc/passwd file on the filesystem), then LookupUid
// returns an error.
func LookupUid(uid int) (User, error) {
return lookupUserFunc(func(u User) bool { return lookupUserFunc(func(u User) bool {
return u.Uid == uid return u.Uid == uid
}) })
@ -51,13 +57,19 @@ func lookupUserFunc(filter func(u User) bool) (User, error) {
return users[0], nil return users[0], nil
} }
func lookupGroup(groupname string) (Group, error) { // LookupGroup looks up a group by its name in /etc/group. If the group cannot
// be found (or there is no /etc/group file on the filesystem), then LookupGroup
// returns an error.
func LookupGroup(groupname string) (Group, error) {
return lookupGroupFunc(func(g Group) bool { return lookupGroupFunc(func(g Group) bool {
return g.Name == groupname return g.Name == groupname
}) })
} }
func lookupGid(gid int) (Group, error) { // LookupGid looks up a group by its group id in /etc/group. If the group cannot
// be found (or there is no /etc/group file on the filesystem), then LookupGid
// returns an error.
func LookupGid(gid int) (Group, error) {
return lookupGroupFunc(func(g Group) bool { return lookupGroupFunc(func(g Group) bool {
return g.Gid == gid return g.Gid == gid
}) })

View File

@ -1,40 +0,0 @@
// +build windows
package user
import (
"fmt"
"os/user"
)
// lookupUser resolves username through the standard library's os/user package
// and converts the result to the local User type. A failed lookup yields the
// zero User together with the error from os/user.
func lookupUser(username string) (User, error) {
	osUser, err := user.Lookup(username)
	if err == nil {
		return userFromOS(osUser)
	}
	return User{}, err
}
// lookupUid resolves a numeric user id through os/user, which takes the id in
// decimal string form, and converts the result to the local User type. A
// failed lookup yields the zero User together with the error from os/user.
func lookupUid(uid int) (User, error) {
	id := fmt.Sprintf("%d", uid)
	osUser, err := user.LookupId(id)
	if err == nil {
		return userFromOS(osUser)
	}
	return User{}, err
}
// lookupGroup resolves groupname through the standard library's os/user
// package and converts the result to the local Group type. A failed lookup
// yields the zero Group together with the error from os/user.
func lookupGroup(groupname string) (Group, error) {
	osGroup, err := user.LookupGroup(groupname)
	if err == nil {
		return groupFromOS(osGroup)
	}
	return Group{}, err
}
// lookupGid resolves a numeric group id through os/user, which takes the id in
// decimal string form, and converts the result to the local Group type. A
// failed lookup yields the zero Group together with the error from os/user.
func lookupGid(gid int) (Group, error) {
	id := fmt.Sprintf("%d", gid)
	osGroup, err := user.LookupGroupId(id)
	if err == nil {
		return groupFromOS(osGroup)
	}
	return Group{}, err
}

View File

@ -2,10 +2,10 @@ package user
import ( import (
"bufio" "bufio"
"errors"
"fmt" "fmt"
"io" "io"
"os" "os"
"os/user"
"strconv" "strconv"
"strings" "strings"
) )
@ -16,6 +16,13 @@ const (
) )
var ( var (
// The current operating system does not provide the required data for user lookups.
ErrUnsupported = errors.New("user lookup: operating system does not provide passwd-formatted data")
// No matching entries found in file.
ErrNoPasswdEntries = errors.New("no matching entries in passwd file")
ErrNoGroupEntries = errors.New("no matching entries in group file")
ErrRange = fmt.Errorf("uids and gids must be in range %d-%d", minId, maxId) ErrRange = fmt.Errorf("uids and gids must be in range %d-%d", minId, maxId)
) )
@ -29,28 +36,6 @@ type User struct {
Shell string Shell string
} }
// userFromOS builds a local User from an os/user.(*User).
//
// Only Name, Home, Uid and Gid are filled in; Pass, Shell and Gecos are not
// included. The Uid and Gid strings must parse as decimal integers; on a
// parse failure the User populated so far is returned alongside the error.
func userFromOS(u *user.User) (User, error) {
	converted := User{
		Name: u.Username,
		Home: u.HomeDir,
	}

	uid, err := strconv.Atoi(u.Uid)
	if err != nil {
		return converted, err
	}
	converted.Uid = uid

	gid, err := strconv.Atoi(u.Gid)
	if err != nil {
		return converted, err
	}
	converted.Gid = gid

	return converted, nil
}
type Group struct { type Group struct {
Name string Name string
Pass string Pass string
@ -58,23 +43,6 @@ type Group struct {
List []string List []string
} }
// groupFromOS builds a local Group from an os/user.(*Group).
//
// Only Name and Gid are filled in; Pass and List are not included. The Gid
// string must parse as a decimal integer; on a parse failure the Group
// populated so far is returned alongside the error.
func groupFromOS(g *user.Group) (Group, error) {
	converted := Group{Name: g.Name}

	gid, err := strconv.Atoi(g.Gid)
	if err != nil {
		return converted, err
	}
	converted.Gid = gid

	return converted, nil
}
// SubID represents an entry in /etc/sub{u,g}id // SubID represents an entry in /etc/sub{u,g}id
type SubID struct { type SubID struct {
Name string Name string
@ -466,7 +434,7 @@ func GetAdditionalGroups(additionalGroups []string, group io.Reader) ([]int, err
// we asked for a group but didn't find it. let's check to see // we asked for a group but didn't find it. let's check to see
// if we wanted a numeric group // if we wanted a numeric group
if !found { if !found {
gid, err := strconv.Atoi(ag) gid, err := strconv.ParseInt(ag, 10, 64)
if err != nil { if err != nil {
return nil, fmt.Errorf("Unable to find group %s", ag) return nil, fmt.Errorf("Unable to find group %s", ag)
} }
@ -474,7 +442,7 @@ func GetAdditionalGroups(additionalGroups []string, group io.Reader) ([]int, err
if gid < minId || gid > maxId { if gid < minId || gid > maxId {
return nil, ErrRange return nil, ErrRange
} }
gidMap[gid] = struct{}{} gidMap[int(gid)] = struct{}{}
} }
} }
gids := []int{} gids := []int{}

View File

@ -0,0 +1,42 @@
// +build gofuzz
package user
import (
"io"
"strings"
)
// IsDivisbleBy reports whether n is an exact multiple of divisibleby.
//
// A zero divisor is reported as false instead of triggering Go's
// integer-divide-by-zero runtime panic.
func IsDivisbleBy(n int, divisibleby int) bool {
	if divisibleby == 0 {
		return false
	}
	return n%divisibleby == 0
}
// FuzzUser is a fuzzing entry point for the passwd/group parsers in this
// package.
//
// data is split into five equally sized chunks which are used, in order, as:
// passwd content for ParsePasswdFilter, group content, an additional-group
// name for GetAdditionalGroups, passwd content for GetExecUser, and the user
// spec for GetExecUser.
//
// It returns -1 when data is empty or its length is not a multiple of five,
// and 1 once all parsers have been exercised (presumably following the
// go-fuzz return-value convention — confirm against the fuzzer in use).
// Parser errors are deliberately ignored: only crashes are of interest here.
func FuzzUser(data []byte) int {
	const chunks = 5

	if len(data) == 0 || !IsDivisbleBy(len(data), chunks) {
		return -1
	}

	chunkSize := len(data) / chunks
	divided := make([][]byte, 0, chunks)
	for start := 0; start < len(data); start += chunkSize {
		divided = append(divided, data[start:start+chunkSize])
	}

	_, _ = ParsePasswdFilter(strings.NewReader(string(divided[0])), nil)

	var passwd, group io.Reader

	groupData := string(divided[1])
	group = strings.NewReader(groupData)
	_, _ = GetAdditionalGroups([]string{string(divided[2])}, group)

	passwd = strings.NewReader(string(divided[3]))
	// Hand GetExecUser a fresh reader: the one passed to GetAdditionalGroups
	// has presumably been read to EOF, which would leave GetExecUser with no
	// group data to parse.
	group = strings.NewReader(groupData)
	_, _ = GetExecUser(string(divided[4]), nil, passwd, group)
	return 1
}