Skip to content

Commit

Permalink
kubelet: Implement support for userns
Browse files Browse the repository at this point in the history
This commit implements support for user ns in the kubelet. The kubelet uses the
GetRuntimeInfoConfig function of the runtime to query for the uid/gid configured
mapping.

Kubelet tries to use POD mode for the user namespace when possible, NODE is used
when:
- The feature is not supported or not enabled in the runtime
- The value of the "alpha.kinvolk.io/userns" annotation is "node"
- The pod specification is incompatible with it
-- Any host namespace is used (IPC, PID, NET)
-- There is any host-path volume
-- There is any non namespaced capability (MKNOD, SYS_TIME, SYS_MODULE)
-- There is any privileged container
-- The pod has PVC mounts

Files under the pod volumes dir (/var/lib/kubelet/pods/xxxx/volumes) are chowned
to the mapped user in the host if the user namespace is used.
  • Loading branch information
mauriciovasquezbernal committed Jul 3, 2020
1 parent d54cddc commit dc7798d
Show file tree
Hide file tree
Showing 16 changed files with 536 additions and 142 deletions.
9 changes: 0 additions & 9 deletions pkg/features/kube_features.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,14 +40,6 @@ const (
// beta: v1.11
DynamicKubeletConfig featuregate.Feature = "DynamicKubeletConfig"

// owner: @pweil-
// alpha: v1.5
//
// Default userns=host for containers that are using other host namespaces, host mounts, the pod
// contains a privileged container, or specific non-namespaced capabilities (MKNOD, SYS_MODULE,
// SYS_TIME). This should only be enabled if user namespace remapping is enabled in the docker daemon.
ExperimentalHostUserNamespaceDefaultingGate featuregate.Feature = "ExperimentalHostUserNamespaceDefaulting"

// owner: @jiayingz
// beta: v1.10
//
Expand Down Expand Up @@ -559,7 +551,6 @@ func init() {
var defaultKubernetesFeatureGates = map[featuregate.Feature]featuregate.FeatureSpec{
AppArmor: {Default: true, PreRelease: featuregate.Beta},
DynamicKubeletConfig: {Default: true, PreRelease: featuregate.Beta},
ExperimentalHostUserNamespaceDefaultingGate: {Default: false, PreRelease: featuregate.Beta},
DevicePlugins: {Default: true, PreRelease: featuregate.Beta},
TaintBasedEvictions: {Default: true, PreRelease: featuregate.Beta},
RotateKubeletServerCertificate: {Default: true, PreRelease: featuregate.Beta},
Expand Down
2 changes: 2 additions & 0 deletions pkg/kubelet/container/helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,8 @@ type RuntimeHelper interface {
// supplemental groups for the Pod. These extra supplemental groups come
// from annotations on persistent volumes that the pod depends on.
GetExtraSupplementalGroupsForPod(pod *v1.Pod) []int64
// UserNamespaceForPod returns the mode for the user namespace of the pod passed as argument.
UserNamespaceForPod(pod *v1.Pod) (runtimeapi.NamespaceMode, error)
}

// ShouldContainerBeRestarted checks whether a container needs to be restarted.
Expand Down
6 changes: 0 additions & 6 deletions pkg/kubelet/container/runtime.go
Original file line number Diff line number Diff line change
Expand Up @@ -424,12 +424,6 @@ type RunContainerOptions struct {
ReadOnly bool
// hostname for pod containers
Hostname string
// EnableHostUserNamespace sets userns=host when users request host namespaces (pid, ipc, net),
// are using non-namespaced capabilities (mknod, sys_time, sys_module), the pod contains a privileged container,
// or using host path volumes.
// This should only be enabled when the container runtime is performing user remapping AND if the
// experimental behavior is desired.
EnableHostUserNamespace bool
}

// VolumeInfo contains information about the volume.
Expand Down
4 changes: 4 additions & 0 deletions pkg/kubelet/container/testing/fake_runtime_helper.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,10 @@ func (f *FakeRuntimeHelper) GetPodVolumesDir(podUID kubetypes.UID) string {
return f.GetPodDir(podUID) + "/volumes/"
}

// UserNamespaceForPod is the fake implementation of the runtime helper method
// of the same name. It ignores the pod argument and always returns
// runtimeapi.NamespaceMode_NODE with a nil error.
func (f *FakeRuntimeHelper) UserNamespaceForPod(pod *v1.Pod) (runtimeapi.NamespaceMode, error) {
return runtimeapi.NamespaceMode_NODE, nil
}

// GetExtraSupplementalGroupsForPod is the fake implementation of the runtime
// helper method of the same name. It ignores the pod argument and always
// returns nil, i.e. no extra supplemental groups.
func (f *FakeRuntimeHelper) GetExtraSupplementalGroupsForPod(pod *v1.Pod) []int64 {
return nil
}
12 changes: 12 additions & 0 deletions pkg/kubelet/dockershim/docker_sandbox.go
Original file line number Diff line number Diff line change
Expand Up @@ -464,6 +464,7 @@ func (ds *dockerService) PodSandboxStatus(ctx context.Context, req *runtimeapi.P
Network: networkNamespaceMode(r),
Pid: pidNamespaceMode(r),
Ipc: ipcNamespaceMode(r),
User: userNamespaceMode(r),
},
},
},
Expand Down Expand Up @@ -690,6 +691,17 @@ func ipcNamespaceMode(container *dockertypes.ContainerJSON) runtimeapi.Namespace
return runtimeapi.NamespaceMode_POD
}

// userNamespaceMode maps the inspected container's Docker user-namespace
// setting onto a runtimeapi.NamespaceMode.
//
// NODE is reported only when the container's HostConfig.UsernsMode equals
// namespaceModeHost. Every other case, including a nil container or a nil
// HostConfig, is reported as POD.
func userNamespaceMode(container *dockertypes.ContainerJSON) runtimeapi.NamespaceMode {
	// Without a host configuration there is nothing to inspect; fall back to POD.
	if container == nil || container.HostConfig == nil {
		return runtimeapi.NamespaceMode_POD
	}
	if string(container.HostConfig.UsernsMode) != namespaceModeHost {
		return runtimeapi.NamespaceMode_POD
	}
	return runtimeapi.NamespaceMode_NODE
}

func constructPodSandboxCheckpoint(config *runtimeapi.PodSandboxConfig) checkpointmanager.Checkpoint {
data := CheckpointData{}
for _, pm := range config.GetPortMappings() {
Expand Down
133 changes: 81 additions & 52 deletions pkg/kubelet/dockershim/docker_sandbox_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,65 +93,94 @@ func TestListSandboxes(t *testing.T) {
// TestSandboxStatus tests the basic lifecycle operations and verify that
// the status returned reflects the operations performed.
func TestSandboxStatus(t *testing.T) {
ds, fDocker, fClock := newTestDockerService()
labels := map[string]string{"label": "foobar1"}
annotations := map[string]string{"annotation": "abc"}
config := makeSandboxConfigWithLabelsAndAnnotations("foo", "bar", "1", 0, labels, annotations)
r := rand.New(rand.NewSource(0)).Uint32()
podIP := fmt.Sprintf("10.%d.%d.%d", byte(r>>16), byte(r>>8), byte(r))

state := runtimeapi.PodSandboxState_SANDBOX_READY
ct := int64(0)
expected := &runtimeapi.PodSandboxStatus{
State: state,
CreatedAt: ct,
Metadata: config.Metadata,
Network: &runtimeapi.PodSandboxNetworkStatus{Ip: podIP, AdditionalIps: []*runtimeapi.PodIP{}},
Linux: &runtimeapi.LinuxPodSandboxStatus{
Namespaces: &runtimeapi.Namespace{
Options: &runtimeapi.NamespaceOption{
Pid: runtimeapi.NamespaceMode_CONTAINER,
},
},
testCases := []struct {
description string
labels map[string]string
annotations map[string]string
linux *runtimeapi.LinuxPodSandboxStatus
}{
{
description: "Tests that default value of User namespace option is NamespaceMode_POD",
labels: map[string]string{"label": "foobar1"},
annotations: map[string]string{"annotation": "abc"},
linux: &runtimeapi.LinuxPodSandboxStatus{
Namespaces: &runtimeapi.Namespace{
Options: &runtimeapi.NamespaceOption{
Pid: runtimeapi.NamespaceMode_CONTAINER,
User: runtimeapi.NamespaceMode_POD,
},
}},
},
{
description: "Tests that if User namespace option is NamespaceMode_NODE, same is received back in status",
labels: map[string]string{"label": "foobar1"},
annotations: map[string]string{"annotation": "abc"},
linux: &runtimeapi.LinuxPodSandboxStatus{
Namespaces: &runtimeapi.Namespace{
Options: &runtimeapi.NamespaceOption{
Pid: runtimeapi.NamespaceMode_CONTAINER,
User: runtimeapi.NamespaceMode_NODE,
},
}},
},
Labels: labels,
Annotations: annotations,
}
for _, test := range testCases {
ds, fDocker, fClock := newTestDockerService()
config := makeSandboxConfigWithLabelsAndAnnotations("foo", "bar", "1", 0, test.labels, test.annotations)
config.Linux = &runtimeapi.LinuxPodSandboxConfig{
SecurityContext: &runtimeapi.LinuxSandboxSecurityContext{
NamespaceOptions: test.linux.Namespaces.Options,
},
}
r := rand.New(rand.NewSource(0)).Uint32()
podIP := fmt.Sprintf("10.%d.%d.%d", byte(r>>16), byte(r>>8), byte(r))

// Create the sandbox.
fClock.SetTime(time.Now())
expected.CreatedAt = fClock.Now().UnixNano()
runResp, err := ds.RunPodSandbox(getTestCTX(), &runtimeapi.RunPodSandboxRequest{Config: config})
require.NoError(t, err)
id := runResp.PodSandboxId
state := runtimeapi.PodSandboxState_SANDBOX_READY
ct := int64(0)
expected := &runtimeapi.PodSandboxStatus{
State: state,
CreatedAt: ct,
Metadata: config.Metadata,
Network: &runtimeapi.PodSandboxNetworkStatus{Ip: podIP, AdditionalIps: []*runtimeapi.PodIP{}},
Linux: test.linux,
Labels: test.labels,
Annotations: test.annotations,
}

// Check internal labels
c, err := fDocker.InspectContainer(id)
assert.NoError(t, err)
assert.Equal(t, c.Config.Labels[containerTypeLabelKey], containerTypeLabelSandbox)
assert.Equal(t, c.Config.Labels[types.KubernetesContainerNameLabel], sandboxContainerName)
// Create the sandbox.
fClock.SetTime(time.Now())
expected.CreatedAt = fClock.Now().UnixNano()
runResp, err := ds.RunPodSandbox(getTestCTX(), &runtimeapi.RunPodSandboxRequest{Config: config})
require.NoError(t, err)
id := runResp.PodSandboxId

expected.Id = id // ID is only known after the creation.
statusResp, err := ds.PodSandboxStatus(getTestCTX(), &runtimeapi.PodSandboxStatusRequest{PodSandboxId: id})
require.NoError(t, err)
assert.Equal(t, expected, statusResp.Status)
// Check internal labels
c, err := fDocker.InspectContainer(id)
assert.NoError(t, err)
assert.Equal(t, c.Config.Labels[containerTypeLabelKey], containerTypeLabelSandbox)
assert.Equal(t, c.Config.Labels[types.KubernetesContainerNameLabel], sandboxContainerName)

// Stop the sandbox.
expected.State = runtimeapi.PodSandboxState_SANDBOX_NOTREADY
_, err = ds.StopPodSandbox(getTestCTX(), &runtimeapi.StopPodSandboxRequest{PodSandboxId: id})
require.NoError(t, err)
// IP not valid after sandbox stop
expected.Network.Ip = ""
expected.Network.AdditionalIps = []*runtimeapi.PodIP{}
statusResp, err = ds.PodSandboxStatus(getTestCTX(), &runtimeapi.PodSandboxStatusRequest{PodSandboxId: id})
require.NoError(t, err)
assert.Equal(t, expected, statusResp.Status)
expected.Id = id // ID is only known after the creation.
statusResp, err := ds.PodSandboxStatus(getTestCTX(), &runtimeapi.PodSandboxStatusRequest{PodSandboxId: id})
require.NoError(t, err)
assert.Equal(t, expected, statusResp.Status)

// Remove the container.
_, err = ds.RemovePodSandbox(getTestCTX(), &runtimeapi.RemovePodSandboxRequest{PodSandboxId: id})
require.NoError(t, err)
statusResp, err = ds.PodSandboxStatus(getTestCTX(), &runtimeapi.PodSandboxStatusRequest{PodSandboxId: id})
assert.Error(t, err, fmt.Sprintf("status of sandbox: %+v", statusResp))
// Stop the sandbox.
expected.State = runtimeapi.PodSandboxState_SANDBOX_NOTREADY
_, err = ds.StopPodSandbox(getTestCTX(), &runtimeapi.StopPodSandboxRequest{PodSandboxId: id})
require.NoError(t, err)
// IP not valid after sandbox stop
expected.Network.Ip = ""
statusResp, err = ds.PodSandboxStatus(getTestCTX(), &runtimeapi.PodSandboxStatusRequest{PodSandboxId: id})
require.NoError(t, err)
assert.Equal(t, expected, statusResp.Status)

// Remove the container.
_, err = ds.RemovePodSandbox(getTestCTX(), &runtimeapi.RemovePodSandboxRequest{PodSandboxId: id})
require.NoError(t, err)
statusResp, err = ds.PodSandboxStatus(getTestCTX(), &runtimeapi.PodSandboxStatusRequest{PodSandboxId: id})
assert.Error(t, err, fmt.Sprintf("status of sandbox: %+v", statusResp))
}
}

// TestSandboxStatusAfterRestart tests that retrieving sandbox status returns
Expand Down
6 changes: 6 additions & 0 deletions pkg/kubelet/dockershim/security_context.go
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,9 @@ func modifyCommonNamespaceOptions(nsOpts *runtimeapi.NamespaceOption, hostConfig

// modifyHostOptionsForSandbox applies NetworkMode/UTSMode to sandbox's dockercontainer.HostConfig.
func modifyHostOptionsForSandbox(nsOpts *runtimeapi.NamespaceOption, network *knetwork.PluginManager, hc *dockercontainer.HostConfig) {
if nsOpts.GetUser() == runtimeapi.NamespaceMode_NODE {
hc.UsernsMode = namespaceModeHost
}
if nsOpts.GetIpc() == runtimeapi.NamespaceMode_NODE {
hc.IpcMode = namespaceModeHost
}
Expand Down Expand Up @@ -199,6 +202,9 @@ func modifyHostOptionsForContainer(nsOpts *runtimeapi.NamespaceOption, podSandbo
hc.NetworkMode = dockercontainer.NetworkMode(sandboxNSMode)
hc.IpcMode = dockercontainer.IpcMode(sandboxNSMode)
hc.UTSMode = ""
if nsOpts.GetUser() == runtimeapi.NamespaceMode_NODE {
hc.UsernsMode = namespaceModeHost
}

if nsOpts.GetNetwork() == runtimeapi.NamespaceMode_NODE {
hc.UTSMode = namespaceModeHost
Expand Down
22 changes: 22 additions & 0 deletions pkg/kubelet/dockershim/security_context_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -346,6 +346,16 @@ func TestModifySandboxNamespaceOptions(t *testing.T) {
NetworkMode: "default",
},
},
{
name: "Host User NamespaceOption",
nsOpt: &runtimeapi.NamespaceOption{
User: runtimeapi.NamespaceMode_NODE,
},
expected: &dockercontainer.HostConfig{
NetworkMode: "default",
UsernsMode: namespaceModeHost,
},
},
}
for _, tc := range cases {
dockerCfg := &dockercontainer.HostConfig{}
Expand Down Expand Up @@ -396,6 +406,18 @@ func TestModifyContainerNamespaceOptions(t *testing.T) {
PidMode: namespaceModeHost,
},
},
{
name: "Host User NamespaceOption",
nsOpt: &runtimeapi.NamespaceOption{
User: runtimeapi.NamespaceMode_NODE,
},
expected: &dockercontainer.HostConfig{
NetworkMode: dockercontainer.NetworkMode(sandboxNSMode),
IpcMode: dockercontainer.IpcMode(sandboxNSMode),
PidMode: dockercontainer.PidMode(sandboxNSMode),
UsernsMode: namespaceModeHost,
},
},
}
for _, tc := range cases {
dockerCfg := &dockercontainer.HostConfig{}
Expand Down
Loading

0 comments on commit dc7798d

Please sign in to comment.