diff --git a/Gopkg.lock b/Gopkg.lock index b16f86df47..663907c013 100644 --- a/Gopkg.lock +++ b/Gopkg.lock @@ -1,6 +1,14 @@ # This file is autogenerated, do not edit; changes may be undone by the next 'dep ensure'. +[[projects]] + digest = "1:b54ea408c3a0a82d6d644fbb4468d00b50b1368482abc337aec72b60a3ea7c12" + name = "github.com/checkpoint-restore/go-criu" + packages = ["rpc"] + pruneopts = "NUT" + revision = "17b0214f6c48980c45dc47ecb0cfd6d9e02df723" + version = "v3.11" + [[projects]] branch = "master" digest = "1:8ecb89af7dfe3ac401bdb0c9390b134ef96a97e85f732d2b0604fb7b3977839f" @@ -133,7 +141,7 @@ revision = "7d4729fb36185a7c1719923406c9d40e54fb93c7" [[projects]] - digest = "1:e9efda6418044c653a9d6051719196214dbdbe4fa50ebcc7975dde8932b6d2f5" + digest = "1:b2eb2cc22a636b636ed6f57847c37ca065bddbe456303930193083edd36f1203" name = "github.com/opencontainers/runc" packages = [ "libcontainer", @@ -143,7 +151,6 @@ "libcontainer/cgroups/systemd", "libcontainer/configs", "libcontainer/configs/validate", - "libcontainer/criurpc", "libcontainer/intelrdt", "libcontainer/keys", "libcontainer/mount", @@ -156,7 +163,7 @@ "libcontainer/utils", ] pruneopts = "NUT" - revision = "cc4307ab6643668ce5abc6b524e1764a54c32550" + revision = "f56b4cbeadc407e715d9b2ba49e62185bd81cef4" [[projects]] digest = "1:0d447d4961f4f9270457fbc20d0261bba8d3056f395efd2e2480e2dfa4487a60" diff --git a/Gopkg.toml b/Gopkg.toml index cf910debab..84167b528c 100644 --- a/Gopkg.toml +++ b/Gopkg.toml @@ -12,7 +12,7 @@ [[constraint]] name = "github.com/opencontainers/runc" - revision = "cc4307ab6643668ce5abc6b524e1764a54c32550" + revision = "f56b4cbeadc407e715d9b2ba49e62185bd81cef4" [[constraint]] name = "github.com/opencontainers/runtime-spec" diff --git a/vendor/github.com/checkpoint-restore/go-criu/LICENSE b/vendor/github.com/checkpoint-restore/go-criu/LICENSE new file mode 100644 index 0000000000..8dada3edaf --- /dev/null +++ b/vendor/github.com/checkpoint-restore/go-criu/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "{}" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright {yyyy} {name of copyright owner} + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/vendor/github.com/opencontainers/runc/libcontainer/criurpc/criurpc.pb.go b/vendor/github.com/checkpoint-restore/go-criu/rpc/rpc.pb.go similarity index 70% rename from vendor/github.com/opencontainers/runc/libcontainer/criurpc/criurpc.pb.go rename to vendor/github.com/checkpoint-restore/go-criu/rpc/rpc.pb.go index 21af9db971..230faace55 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/criurpc/criurpc.pb.go +++ b/vendor/github.com/checkpoint-restore/go-criu/rpc/rpc.pb.go @@ -1,12 +1,11 @@ -// Code generated by protoc-gen-go. -// source: criurpc.proto -// DO NOT EDIT! +// Code generated by protoc-gen-go. DO NOT EDIT. +// source: rpc/rpc.proto /* -Package criurpc is a generated protocol buffer package. +Package rpc is a generated protocol buffer package. It is generated from these files: - criurpc.proto + rpc/rpc.proto It has these top-level messages: CriuPageServerInfo @@ -25,7 +24,7 @@ It has these top-level messages: CriuResp CriuVersion */ -package criurpc +package rpc import proto "github.com/golang/protobuf/proto" import fmt "fmt" @@ -94,17 +93,19 @@ func (CriuCgMode) EnumDescriptor() ([]byte, []int) { return fileDescriptor0, []i type CriuReqType int32 const ( - CriuReqType_EMPTY CriuReqType = 0 - CriuReqType_DUMP CriuReqType = 1 - CriuReqType_RESTORE CriuReqType = 2 - CriuReqType_CHECK CriuReqType = 3 - CriuReqType_PRE_DUMP CriuReqType = 4 - CriuReqType_PAGE_SERVER CriuReqType = 5 - CriuReqType_NOTIFY CriuReqType = 6 - CriuReqType_CPUINFO_DUMP CriuReqType = 7 - CriuReqType_CPUINFO_CHECK CriuReqType = 8 - CriuReqType_FEATURE_CHECK CriuReqType = 9 - CriuReqType_VERSION CriuReqType = 10 + CriuReqType_EMPTY CriuReqType = 0 + CriuReqType_DUMP CriuReqType = 1 + CriuReqType_RESTORE CriuReqType = 2 + CriuReqType_CHECK CriuReqType = 3 + CriuReqType_PRE_DUMP CriuReqType = 4 + CriuReqType_PAGE_SERVER CriuReqType = 5 + CriuReqType_NOTIFY CriuReqType = 6 + CriuReqType_CPUINFO_DUMP CriuReqType = 7 + CriuReqType_CPUINFO_CHECK CriuReqType = 8 + CriuReqType_FEATURE_CHECK CriuReqType = 9 + CriuReqType_VERSION CriuReqType = 10 + CriuReqType_WAIT_PID CriuReqType = 11 + CriuReqType_PAGE_SERVER_CHLD CriuReqType = 12 ) var CriuReqType_name = map[int32]string{ @@ -119,19 +120,23 @@ var CriuReqType_name = map[int32]string{ 8: "CPUINFO_CHECK", 9: "FEATURE_CHECK", 10: "VERSION", + 11: "WAIT_PID", + 12: "PAGE_SERVER_CHLD", } var CriuReqType_value = map[string]int32{ - "EMPTY": 0, - "DUMP": 1, - "RESTORE": 2, - "CHECK": 3, - "PRE_DUMP": 4, - "PAGE_SERVER": 5, - "NOTIFY": 6, - "CPUINFO_DUMP": 7, - "CPUINFO_CHECK": 8, - "FEATURE_CHECK": 9, - "VERSION": 10, + "EMPTY": 0, + "DUMP": 1, + "RESTORE": 2, + "CHECK": 3, + "PRE_DUMP": 4, + "PAGE_SERVER": 5, + "NOTIFY": 6, + "CPUINFO_DUMP": 7, + "CPUINFO_CHECK": 8, + "FEATURE_CHECK": 9, + "VERSION": 10, + "WAIT_PID": 11, + "PAGE_SERVER_CHLD": 12, } func (x CriuReqType) Enum() *CriuReqType { @@ -393,6 +398,7 @@ type CriuOpts struct { LazyPages *bool `protobuf:"varint,48,opt,name=lazy_pages,json=lazyPages" json:"lazy_pages,omitempty"` StatusFd *int32 `protobuf:"varint,49,opt,name=status_fd,json=statusFd" json:"status_fd,omitempty"` OrphanPtsMaster *bool `protobuf:"varint,50,opt,name=orphan_pts_master,json=orphanPtsMaster" json:"orphan_pts_master,omitempty"` + ConfigFile *string `protobuf:"bytes,51,opt,name=config_file,json=configFile" json:"config_file,omitempty"` XXX_unrecognized []byte `json:"-"` } @@ -748,6 +754,13 @@ func (m *CriuOpts) GetOrphanPtsMaster() bool { return false } +func (m *CriuOpts) GetConfigFile() string { + if m != nil && m.ConfigFile != nil { + return *m.ConfigFile + } + return "" +} + type CriuDumpResp struct { Restored *bool `protobuf:"varint,1,opt,name=restored" json:"restored,omitempty"` XXX_unrecognized []byte `json:"-"` @@ -848,8 +861,10 @@ type CriuReq struct { // 'features' can be used to query which features // are supported by the installed criu/kernel // via RPC. - Features *CriuFeatures `protobuf:"bytes,5,opt,name=features" json:"features,omitempty"` - XXX_unrecognized []byte `json:"-"` + Features *CriuFeatures `protobuf:"bytes,5,opt,name=features" json:"features,omitempty"` + // 'pid' is used for WAIT_PID + Pid *uint32 `protobuf:"varint,6,opt,name=pid" json:"pid,omitempty"` + XXX_unrecognized []byte `json:"-"` } func (m *CriuReq) Reset() { *m = CriuReq{} } @@ -892,6 +907,13 @@ func (m *CriuReq) GetFeatures() *CriuFeatures { return nil } +func (m *CriuReq) GetPid() uint32 { + if m != nil && m.Pid != nil { + return *m.Pid + } + return 0 +} + type CriuResp struct { Type *CriuReqType `protobuf:"varint,1,req,name=type,enum=CriuReqType" json:"type,omitempty"` Success *bool `protobuf:"varint,2,req,name=success" json:"success,omitempty"` @@ -903,6 +925,7 @@ type CriuResp struct { Features *CriuFeatures `protobuf:"bytes,8,opt,name=features" json:"features,omitempty"` CrErrmsg *string `protobuf:"bytes,9,opt,name=cr_errmsg,json=crErrmsg" json:"cr_errmsg,omitempty"` Version *CriuVersion `protobuf:"bytes,10,opt,name=version" json:"version,omitempty"` + Status *int32 `protobuf:"varint,11,opt,name=status" json:"status,omitempty"` XXX_unrecognized []byte `json:"-"` } @@ -981,6 +1004,13 @@ func (m *CriuResp) GetVersion() *CriuVersion { return nil } +func (m *CriuResp) GetStatus() int32 { + if m != nil && m.Status != nil { + return *m.Status + } + return 0 +} + // Answer for criu_req_type.VERSION requests type CriuVersion struct { Major *int32 `protobuf:"varint,1,req,name=major" json:"major,omitempty"` @@ -1059,120 +1089,123 @@ func init() { proto.RegisterEnum("CriuReqType", CriuReqType_name, CriuReqType_value) } -func init() { proto.RegisterFile("criurpc.proto", fileDescriptor0) } +func init() { proto.RegisterFile("rpc/rpc.proto", fileDescriptor0) } var fileDescriptor0 = []byte{ - // 1781 bytes of a gzipped FileDescriptorProto - 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0x8c, 0x56, 0xdd, 0x72, 0x5b, 0xb7, - 0x11, 0x0e, 0x29, 0xfe, 0x1c, 0x82, 0x3f, 0xa6, 0x10, 0xdb, 0x81, 0x93, 0xda, 0x62, 0xe8, 0x28, - 0x51, 0x15, 0x97, 0x4d, 0x58, 0x3b, 0xae, 0x33, 0xed, 0x85, 0x47, 0x22, 0x5d, 0x36, 0x92, 0xc8, - 0x01, 0x25, 0xcf, 0xe4, 0x0a, 0x73, 0x74, 0x0e, 0x48, 0xc1, 0x3c, 0x7f, 0x05, 0x40, 0x45, 0xf2, - 0x83, 0xf4, 0x29, 0xfa, 0x0c, 0x7d, 0x84, 0xbe, 0x4e, 0x6f, 0x3b, 0xbb, 0x00, 0x65, 0x29, 0xc9, - 0xb4, 0xbd, 0xc3, 0x7e, 0x58, 0x00, 0xbb, 0xfb, 0xed, 0x0f, 0x48, 0x3b, 0xd2, 0x6a, 0xad, 0x8b, - 0x68, 0x50, 0xe8, 0xdc, 0xe6, 0xfd, 0x25, 0x79, 0x00, 0x80, 0x28, 0xc2, 0xa5, 0x14, 0x46, 0xea, - 0x4b, 0xa9, 0x85, 0xca, 0x16, 0x39, 0x65, 0xa4, 0x1e, 0xc6, 0xb1, 0x96, 0xc6, 0xb0, 0x52, 0xaf, - 0xb4, 0xd7, 0xe0, 0x1b, 0x91, 0x52, 0x52, 0x29, 0x72, 0x6d, 0x59, 0xb9, 0x57, 0xda, 0xab, 0x72, - 0x5c, 0xd3, 0x2e, 0xd9, 0x2a, 0x54, 0xcc, 0xb6, 0x10, 0x82, 0x25, 0xed, 0x90, 0xf2, 0x22, 0x66, - 0x15, 0x04, 0xca, 0x8b, 0xb8, 0xff, 0x27, 0xd2, 0xc1, 0x87, 0x2e, 0xa5, 0xbd, 0x10, 0x45, 0xa8, - 0x34, 0xfd, 0x98, 0x54, 0xd5, 0x42, 0xa8, 0x8c, 0x95, 0x7a, 0xe5, 0xbd, 0x06, 0xaf, 0xa8, 0xc5, - 0x24, 0xa3, 0x0f, 0x48, 0x4d, 0x2d, 0x44, 0xbe, 0x86, 0xeb, 0x01, 0xad, 0xaa, 0xc5, 0x74, 0x6d, - 0xfb, 0x7f, 0x20, 0x6d, 0x79, 0x65, 0x45, 0x9a, 0xaf, 0x33, 0x2b, 0xd2, 0xb0, 0x80, 0x07, 0x57, - 0xf2, 0xda, 0x1f, 0x85, 0x25, 0x20, 0x97, 0x61, 0xe2, 0x8f, 0xc1, 0xb2, 0xff, 0x96, 0x74, 0xde, - 0xe5, 0x2a, 0x13, 0x59, 0x98, 0x4a, 0x53, 0x84, 0x91, 0x04, 0xa3, 0x32, 0xe3, 0x0f, 0x95, 0x33, - 0x43, 0x3f, 0x21, 0xf5, 0xcc, 0x88, 0x85, 0x4a, 0xa4, 0x3f, 0x57, 0xcb, 0xcc, 0x58, 0x25, 0x92, - 0x7e, 0x46, 0x1a, 0xf2, 0xca, 0xea, 0x50, 0xe4, 0x85, 0x45, 0xaf, 0x1a, 0x3c, 0x40, 0x60, 0x5a, - 0xd8, 0xfe, 0x80, 0x10, 0x95, 0x5d, 0x48, 0xad, 0xac, 0x58, 0xc4, 0xbf, 0x62, 0x89, 0x73, 0x1d, - 0x2e, 0x74, 0xae, 0xbf, 0x20, 0xcd, 0x68, 0xa9, 0xf3, 0x75, 0x21, 0x74, 0x9e, 0x5b, 0x88, 0x5f, - 0x64, 0x75, 0xe2, 0xc3, 0x8a, 0x6b, 0x8c, 0x69, 0x68, 0x2f, 0xbc, 0x15, 0xb8, 0xee, 0xef, 0x90, - 0xfa, 0x3a, 0x53, 0x57, 0xc2, 0xac, 0xe8, 0x7d, 0x52, 0x55, 0x59, 0x1e, 0x4b, 0x7c, 0xa5, 0xcd, - 0x9d, 0xd0, 0xff, 0x57, 0x9b, 0x34, 0x30, 0xa6, 0x79, 0x61, 0x0d, 0xed, 0x93, 0xb6, 0x4a, 0xc3, - 0xa5, 0x34, 0x22, 0x56, 0x5a, 0x2c, 0x62, 0xd4, 0xad, 0xf2, 0xa6, 0x03, 0x0f, 0x95, 0x1e, 0xc7, - 0x1b, 0x9a, 0xca, 0x1f, 0x68, 0x7a, 0x4a, 0xda, 0x89, 0x0c, 0x2f, 0xa5, 0xd0, 0xeb, 0x2c, 0x53, - 0xd9, 0x12, 0x9d, 0x0d, 0x78, 0x0b, 0x41, 0xee, 0x30, 0xfa, 0x84, 0x34, 0x21, 0xfa, 0xde, 0x1a, - 0x24, 0x35, 0xe0, 0x10, 0xa0, 0xb3, 0x4c, 0x5d, 0xcd, 0x57, 0xf4, 0x2b, 0x72, 0xcf, 0x46, 0x85, - 0x90, 0xc6, 0x86, 0xe7, 0x89, 0x32, 0x17, 0x32, 0x66, 0x55, 0xd4, 0xe9, 0xd8, 0xa8, 0x18, 0x7d, - 0x40, 0x41, 0x51, 0x5e, 0x86, 0x46, 0x5d, 0x4a, 0x11, 0xcb, 0x4b, 0x15, 0x49, 0xc3, 0x6a, 0x4e, - 0xd1, 0xc3, 0x87, 0x0e, 0x85, 0xf8, 0x9b, 0x0b, 0x99, 0x24, 0xe2, 0x5d, 0x7e, 0xce, 0xea, 0xa8, - 0x12, 0x20, 0xf0, 0xd7, 0xfc, 0x9c, 0x3e, 0x26, 0x04, 0x28, 0x13, 0x49, 0x1e, 0xad, 0x0c, 0x0b, - 0x9c, 0x35, 0x80, 0x1c, 0x01, 0x40, 0x9f, 0x90, 0x46, 0x92, 0x2f, 0x45, 0x22, 0x2f, 0x65, 0xc2, - 0x1a, 0xe0, 0xea, 0xf7, 0xa5, 0x21, 0x0f, 0x92, 0x7c, 0x79, 0x04, 0x10, 0x7d, 0x44, 0x60, 0xed, - 0x58, 0x27, 0x2e, 0xb5, 0x93, 0x7c, 0x89, 0xb4, 0x7f, 0x49, 0xca, 0x85, 0x61, 0xcd, 0x5e, 0x69, - 0xaf, 0x39, 0x7c, 0x38, 0xf8, 0xd5, 0xc2, 0xe0, 0xe5, 0xc2, 0xd0, 0x5d, 0xd2, 0xc9, 0x72, 0xab, - 0x16, 0xd7, 0xc2, 0x44, 0x5a, 0x15, 0xd6, 0xb0, 0x16, 0x5a, 0xd1, 0x76, 0xe8, 0xdc, 0x81, 0xc0, - 0x2a, 0x30, 0xce, 0xda, 0x8e, 0x69, 0x64, 0xff, 0x31, 0x21, 0x45, 0xa8, 0x65, 0x66, 0x85, 0x4a, - 0x97, 0xac, 0x83, 0x3b, 0x0d, 0x87, 0x4c, 0xd2, 0x25, 0x38, 0x6e, 0x75, 0x18, 0xad, 0x44, 0x2a, - 0x53, 0x76, 0xcf, 0x39, 0x8e, 0xc0, 0xb1, 0x4c, 0xe1, 0x6c, 0xb8, 0xb6, 0xb9, 0x88, 0x65, 0xbc, - 0x2e, 0x58, 0xd7, 0x39, 0x0e, 0xc8, 0x21, 0x00, 0x40, 0xd3, 0x4f, 0xb9, 0x5e, 0x6d, 0xf8, 0xdf, - 0x46, 0x96, 0x1b, 0x00, 0x39, 0xf6, 0x1f, 0x13, 0x92, 0xa8, 0x6c, 0x25, 0xb4, 0x4c, 0xc3, 0x82, - 0x51, 0x77, 0x1c, 0x10, 0x0e, 0x00, 0xdd, 0x25, 0x55, 0x28, 0x4e, 0xc3, 0x3e, 0xee, 0x6d, 0xed, - 0x35, 0x87, 0xf7, 0x06, 0x77, 0xeb, 0x95, 0xbb, 0x5d, 0xfa, 0x94, 0xd4, 0xa3, 0x62, 0x2d, 0xa2, - 0xb0, 0x60, 0xf7, 0x7b, 0xa5, 0xbd, 0xf6, 0xf7, 0xe4, 0xf9, 0xf0, 0xd5, 0xf3, 0x57, 0xdf, 0xbd, - 0x1c, 0xbe, 0x7a, 0xc1, 0x6b, 0x51, 0xb1, 0x3e, 0x08, 0x0b, 0xba, 0x43, 0x9a, 0x8b, 0x5c, 0x47, - 0x52, 0x28, 0x0d, 0x6f, 0x3d, 0xc0, 0xb7, 0x08, 0x42, 0x13, 0x40, 0x80, 0x04, 0x79, 0x25, 0x23, - 0x11, 0xa5, 0x31, 0x7b, 0xd8, 0xdb, 0x02, 0x12, 0x40, 0x3e, 0x48, 0x21, 0x49, 0xea, 0x58, 0xeb, - 0x99, 0x65, 0x9f, 0xa0, 0x25, 0x9d, 0xc1, 0x9d, 0xda, 0xe7, 0x35, 0x79, 0x65, 0x8f, 0x33, 0x0b, - 0x2c, 0xa4, 0x61, 0x06, 0xfc, 0xb8, 0xf2, 0x32, 0x8c, 0x39, 0x16, 0x1c, 0x7a, 0xe0, 0x40, 0xba, - 0x4b, 0xea, 0xd1, 0x12, 0x4b, 0x8f, 0x3d, 0xc2, 0xfb, 0x5a, 0x83, 0x5b, 0xe5, 0xc8, 0x6b, 0xd1, - 0x92, 0x03, 0x31, 0x3b, 0xa4, 0xa9, 0x8d, 0x15, 0x46, 0x9d, 0x27, 0x50, 0x07, 0x9f, 0x3a, 0x93, - 0xb5, 0xb1, 0x73, 0x87, 0xd0, 0xfd, 0xdb, 0x65, 0xcf, 0x3e, 0xc3, 0xab, 0x9a, 0x83, 0x0f, 0x10, - 0x6f, 0xf8, 0xf5, 0x38, 0xa6, 0x3d, 0xd2, 0x42, 0xa6, 0x36, 0x8e, 0xfc, 0xc6, 0xdd, 0x06, 0xd8, - 0xc8, 0x19, 0xbf, 0xe3, 0x6a, 0xca, 0x5c, 0x84, 0x1a, 0x9e, 0x7b, 0xec, 0x14, 0xe4, 0x95, 0x9d, - 0x3b, 0x64, 0xa3, 0x90, 0x86, 0xc6, 0x4a, 0x6d, 0xd8, 0x93, 0x1b, 0x85, 0x63, 0x87, 0x40, 0x08, - 0xcd, 0x4a, 0x15, 0x78, 0xff, 0x8e, 0x0b, 0x21, 0xc8, 0x70, 0x39, 0xb4, 0xaf, 0x2c, 0x3c, 0x4f, - 0xa4, 0x58, 0x18, 0xd6, 0xc3, 0xbd, 0xc0, 0x01, 0x63, 0x43, 0xf7, 0x48, 0xd3, 0x57, 0xb2, 0x50, - 0x59, 0xce, 0x3e, 0x47, 0x47, 0x82, 0x81, 0xc7, 0x78, 0x63, 0x8d, 0x45, 0x3d, 0xc9, 0x72, 0xfa, - 0x67, 0xf2, 0xf1, 0xdd, 0x00, 0x8b, 0x14, 0x9a, 0x50, 0xbf, 0x57, 0xda, 0xeb, 0x0c, 0xdb, 0x2e, - 0x3f, 0xa2, 0x25, 0x82, 0x7c, 0xfb, 0x4e, 0xd0, 0x8f, 0xf3, 0x58, 0xc2, 0x43, 0xcb, 0x8b, 0xdc, - 0x58, 0x91, 0xa8, 0x54, 0x59, 0xf6, 0x14, 0xb3, 0xa5, 0xfe, 0xed, 0x37, 0xcf, 0xff, 0xf8, 0xe2, - 0xe5, 0x77, 0x9c, 0xe0, 0xde, 0x11, 0x6c, 0xd1, 0x3d, 0xd2, 0xc5, 0x44, 0x11, 0x26, 0x0a, 0x33, - 0x01, 0xdd, 0xcf, 0xb0, 0x2f, 0xd0, 0xec, 0x0e, 0xe2, 0xf3, 0x28, 0xcc, 0x66, 0x80, 0xd2, 0x4f, - 0x21, 0x6f, 0xac, 0xd4, 0x59, 0x98, 0xb0, 0x5d, 0xef, 0x98, 0x97, 0x31, 0xa7, 0xd2, 0xc2, 0x5e, - 0x8b, 0xcc, 0xb0, 0x2f, 0xe1, 0x31, 0x5e, 0x47, 0xf9, 0x04, 0x7c, 0xae, 0xbb, 0x51, 0x60, 0xd8, - 0x57, 0x3e, 0xbb, 0xef, 0x8e, 0x06, 0x5e, 0x03, 0xf9, 0xc4, 0xd0, 0xcf, 0x49, 0xcb, 0x67, 0x47, - 0xa1, 0xf3, 0xc2, 0xb0, 0xdf, 0x62, 0x85, 0xfa, 0x06, 0x3e, 0x03, 0x88, 0xee, 0x93, 0xed, 0xdb, - 0x2a, 0xae, 0x93, 0xec, 0xa3, 0xde, 0xbd, 0x5b, 0x7a, 0xd8, 0x51, 0x9e, 0x93, 0x87, 0x5e, 0x37, - 0x5e, 0xa7, 0x85, 0x88, 0xf2, 0xcc, 0xea, 0x3c, 0x49, 0xa4, 0x66, 0x5f, 0xa3, 0xf5, 0xf7, 0xdd, - 0xee, 0xe1, 0x3a, 0x2d, 0x0e, 0x6e, 0xf6, 0xa0, 0x2b, 0x2f, 0xb4, 0x94, 0xef, 0x37, 0x81, 0x67, - 0xcf, 0xf0, 0xf6, 0x96, 0x03, 0x5d, 0x8c, 0x61, 0x42, 0x5b, 0x95, 0x4a, 0x98, 0x95, 0xbf, 0x73, - 0xde, 0x7a, 0x91, 0x7e, 0x4d, 0x28, 0xf4, 0x63, 0xcc, 0x0e, 0x95, 0x89, 0x45, 0xa2, 0x96, 0x17, - 0x96, 0x0d, 0x30, 0x83, 0xa0, 0x53, 0xcf, 0x57, 0xaa, 0x98, 0x64, 0x63, 0x84, 0xc1, 0xe1, 0x9f, - 0x64, 0xb8, 0x12, 0xe6, 0xda, 0x44, 0x36, 0x31, 0xec, 0xf7, 0xa8, 0xd6, 0x04, 0x6c, 0xee, 0x20, - 0x6c, 0x1c, 0xe1, 0xfb, 0x6b, 0xec, 0x85, 0x86, 0x7d, 0xe3, 0x1b, 0x47, 0xf8, 0xfe, 0x7a, 0x06, - 0x00, 0x36, 0x6b, 0x1b, 0xda, 0xb5, 0x81, 0xba, 0xf8, 0x16, 0xbb, 0x4e, 0xe0, 0x80, 0x71, 0x0c, - 0xc1, 0xca, 0x75, 0x71, 0x01, 0xb4, 0x5a, 0xe3, 0xb3, 0x99, 0x0d, 0x9d, 0x29, 0x6e, 0x63, 0x66, - 0x8d, 0x4b, 0xe9, 0xfe, 0x33, 0xff, 0x47, 0xc0, 0x50, 0x69, 0x69, 0x0a, 0xa0, 0x5b, 0x4b, 0x63, - 0x73, 0x2d, 0x63, 0x9c, 0x97, 0x01, 0xbf, 0x91, 0xfb, 0xbb, 0x64, 0x1b, 0xb5, 0x3d, 0xe0, 0x0e, - 0xf8, 0x09, 0xe7, 0x66, 0x1f, 0x2c, 0xfb, 0x2f, 0x49, 0x13, 0xd5, 0x5c, 0x6b, 0xa6, 0x0f, 0x49, - 0xcd, 0xf5, 0x6c, 0x3f, 0x7f, 0xbd, 0xf4, 0xcb, 0xd1, 0xd8, 0xff, 0xc1, 0xfd, 0x95, 0xc4, 0x42, - 0x86, 0x76, 0xad, 0x9d, 0x9f, 0xa9, 0x4c, 0x05, 0xb6, 0xe3, 0x8d, 0x35, 0xa9, 0x4c, 0x4f, 0x41, - 0xfe, 0x59, 0x8c, 0xca, 0x3f, 0x8b, 0x51, 0xff, 0x9f, 0x25, 0x12, 0x78, 0x6b, 0xff, 0x46, 0xfb, - 0xa4, 0x62, 0xaf, 0x0b, 0x37, 0xcd, 0x3b, 0xc3, 0xce, 0x60, 0xb3, 0x21, 0x00, 0xe5, 0xb8, 0x47, - 0x9f, 0x90, 0x0a, 0x8c, 0x75, 0xbc, 0xa9, 0x39, 0x24, 0x83, 0x9b, 0x41, 0xcf, 0x11, 0xbf, 0x3d, - 0x82, 0xd6, 0x51, 0x04, 0xdf, 0xb4, 0xad, 0x3b, 0x23, 0xc8, 0x81, 0x60, 0xf3, 0x4a, 0xca, 0x42, - 0xe4, 0x85, 0xcc, 0xfc, 0xe0, 0x0e, 0x00, 0x98, 0x16, 0x32, 0xa3, 0xfb, 0x24, 0xd8, 0x38, 0x87, - 0x03, 0xbb, 0xb9, 0xb1, 0x65, 0x83, 0xf2, 0x9b, 0xfd, 0xfe, 0xbf, 0xcb, 0xfe, 0xb3, 0x81, 0x61, - 0xfe, 0x7f, 0x3c, 0x60, 0xa4, 0xbe, 0x31, 0x0d, 0xbe, 0x35, 0x01, 0xdf, 0x88, 0xf4, 0x29, 0xa9, - 0x00, 0xc5, 0x68, 0xf1, 0xcd, 0xa0, 0xb9, 0x21, 0x9d, 0xe3, 0x26, 0x7d, 0x46, 0xea, 0x9e, 0x59, - 0xb4, 0xbb, 0x39, 0xa4, 0x83, 0x5f, 0xd0, 0xcd, 0x37, 0x2a, 0xf4, 0x0b, 0x52, 0x73, 0x8e, 0x7b, - 0x47, 0x5a, 0x83, 0x5b, 0xa4, 0x73, 0xbf, 0xe7, 0xe7, 0x7b, 0xed, 0x7f, 0xce, 0xf7, 0x47, 0x40, - 0x96, 0x90, 0x5a, 0x67, 0x39, 0xfe, 0x3e, 0xaa, 0xbc, 0x1e, 0xe9, 0x11, 0x88, 0x77, 0x62, 0x16, - 0xfc, 0xf7, 0x98, 0x41, 0xf0, 0xdd, 0x35, 0xa9, 0x59, 0xe2, 0x4f, 0xa4, 0xc1, 0x03, 0xbc, 0x27, - 0x35, 0x4b, 0x18, 0x73, 0x97, 0x52, 0x1b, 0x95, 0x67, 0xf8, 0x0b, 0x69, 0x6e, 0x1a, 0xaa, 0x07, - 0xf9, 0x66, 0xb7, 0xff, 0xf7, 0x12, 0x69, 0xdd, 0xde, 0x81, 0xdf, 0x60, 0x1a, 0xbe, 0xcb, 0xb5, - 0xcf, 0x72, 0x27, 0x20, 0xaa, 0xb2, 0x5c, 0xfb, 0x8f, 0xa7, 0x13, 0x00, 0x5d, 0x2a, 0xeb, 0xbf, - 0xe6, 0x0d, 0xee, 0x04, 0x28, 0x2b, 0xb3, 0x3e, 0x77, 0x3f, 0xa4, 0x8a, 0x2f, 0x58, 0x2f, 0xc3, - 0x09, 0xfc, 0xe9, 0x62, 0x20, 0xab, 0xdc, 0x09, 0xf0, 0x95, 0x81, 0x5e, 0x89, 0xb1, 0x6b, 0x70, - 0x5c, 0xef, 0x0b, 0x6f, 0x97, 0x1f, 0x01, 0x94, 0x90, 0xda, 0xe4, 0xcd, 0xc9, 0x94, 0x8f, 0xba, - 0x1f, 0xd1, 0x26, 0xa9, 0x1f, 0xbc, 0x11, 0x27, 0xd3, 0x93, 0x51, 0xb7, 0x44, 0x1b, 0xa4, 0x3a, - 0xe3, 0xd3, 0xd9, 0xbc, 0x5b, 0xa6, 0x01, 0xa9, 0xcc, 0xa7, 0xe3, 0xd3, 0xee, 0x16, 0xac, 0xc6, - 0x67, 0x47, 0x47, 0xdd, 0x0a, 0x9c, 0x9b, 0x9f, 0xf2, 0xc9, 0xc1, 0x69, 0xb7, 0x0a, 0xe7, 0x0e, - 0x47, 0xe3, 0xd7, 0x67, 0x47, 0xa7, 0xdd, 0xda, 0xfe, 0x3f, 0x4a, 0xbe, 0x04, 0x37, 0x99, 0x05, - 0x37, 0x8d, 0x8e, 0x67, 0xa7, 0x3f, 0x76, 0x3f, 0x82, 0xf3, 0x87, 0x67, 0xc7, 0xb3, 0x6e, 0x09, - 0xce, 0xf0, 0xd1, 0xfc, 0x14, 0x1e, 0x2e, 0x83, 0xc6, 0xc1, 0x5f, 0x46, 0x07, 0x3f, 0x74, 0xb7, - 0x68, 0x8b, 0x04, 0x33, 0x3e, 0x12, 0xa8, 0x55, 0xa1, 0xf7, 0x48, 0x73, 0xf6, 0xfa, 0xcd, 0x48, - 0xcc, 0x47, 0xfc, 0xed, 0x88, 0x77, 0xab, 0xf0, 0xec, 0xc9, 0xf4, 0x74, 0x32, 0xfe, 0xb1, 0x5b, - 0xa3, 0x5d, 0xd2, 0x3a, 0x98, 0x9d, 0x4d, 0x4e, 0xc6, 0x53, 0xa7, 0x5e, 0xa7, 0xdb, 0xa4, 0xbd, - 0x41, 0xdc, 0x7d, 0x01, 0x40, 0xe3, 0xd1, 0xeb, 0xd3, 0x33, 0x3e, 0xf2, 0x50, 0x03, 0x9e, 0x7e, - 0x3b, 0xe2, 0xf3, 0xc9, 0xf4, 0xa4, 0x4b, 0xfe, 0x13, 0x00, 0x00, 0xff, 0xff, 0x5f, 0x2a, 0xaf, - 0x49, 0x5b, 0x0d, 0x00, 0x00, + // 1835 bytes of a gzipped FileDescriptorProto + 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0x8c, 0x56, 0xeb, 0x72, 0x5b, 0xb7, + 0x11, 0x0e, 0x29, 0xf1, 0x06, 0x5e, 0x7c, 0x0c, 0x5f, 0x02, 0xc7, 0xb5, 0xad, 0xd0, 0x51, 0xa2, + 0x2a, 0x2e, 0x93, 0x30, 0x76, 0x5c, 0x67, 0xda, 0x1f, 0x1e, 0x8a, 0x74, 0xd8, 0x48, 0x22, 0x07, + 0xa4, 0xdc, 0xc9, 0x2f, 0xcc, 0xd1, 0x39, 0x20, 0x05, 0xf3, 0xdc, 0x0a, 0x80, 0x8a, 0xe4, 0x97, + 0xe8, 0xbf, 0x3e, 0x57, 0xde, 0xa4, 0xaf, 0xd0, 0xd9, 0x05, 0x28, 0x4b, 0x49, 0x66, 0xd2, 0x7f, + 0xd8, 0x0f, 0xbb, 0xc0, 0xde, 0x77, 0x49, 0x5b, 0x17, 0xd1, 0x57, 0xba, 0x88, 0x7a, 0x85, 0xce, + 0x6d, 0xde, 0x5d, 0x92, 0x7b, 0x91, 0x56, 0x6b, 0x51, 0x84, 0x4b, 0x29, 0x8c, 0xd4, 0xe7, 0x52, + 0x0b, 0x95, 0x2d, 0x72, 0xca, 0x48, 0x2d, 0x8c, 0x63, 0x2d, 0x8d, 0x61, 0xa5, 0x9d, 0xd2, 0x5e, + 0x83, 0x6f, 0x48, 0x4a, 0xc9, 0x76, 0x91, 0x6b, 0xcb, 0xca, 0x3b, 0xa5, 0xbd, 0x0a, 0xc7, 0x33, + 0x0d, 0xc8, 0x56, 0xa1, 0x62, 0xb6, 0x85, 0x10, 0x1c, 0x69, 0x87, 0x94, 0x17, 0x31, 0xdb, 0x46, + 0xa0, 0xbc, 0x88, 0xbb, 0x7f, 0x23, 0x1d, 0xfc, 0xe8, 0x5c, 0xda, 0x33, 0x51, 0x84, 0x4a, 0xd3, + 0x3b, 0xa4, 0xa2, 0x16, 0x42, 0x65, 0xac, 0xb4, 0x53, 0xde, 0x6b, 0xf0, 0x6d, 0xb5, 0x18, 0x67, + 0xf4, 0x1e, 0xa9, 0xaa, 0x85, 0xc8, 0xd7, 0xf0, 0x3c, 0xa0, 0x15, 0xb5, 0x98, 0xac, 0x6d, 0xf7, + 0x5b, 0xd2, 0x96, 0x17, 0x56, 0xa4, 0xf9, 0x3a, 0xb3, 0x22, 0x0d, 0x0b, 0xf8, 0x70, 0x25, 0x2f, + 0xbd, 0x28, 0x1c, 0x01, 0x39, 0x0f, 0x13, 0x2f, 0x06, 0xc7, 0xee, 0x5b, 0xd2, 0x79, 0x97, 0xab, + 0x4c, 0x64, 0x61, 0x2a, 0x4d, 0x11, 0x46, 0x12, 0x94, 0xca, 0x8c, 0x17, 0x2a, 0x67, 0x86, 0x7e, + 0x4c, 0x6a, 0x99, 0x11, 0x0b, 0x95, 0x48, 0x2f, 0x57, 0xcd, 0xcc, 0x48, 0x25, 0x92, 0x3e, 0x24, + 0x0d, 0x79, 0x61, 0x75, 0x28, 0xf2, 0xc2, 0xa2, 0x55, 0x0d, 0x5e, 0x47, 0x60, 0x52, 0xd8, 0x6e, + 0x8f, 0x10, 0x95, 0x9d, 0x49, 0xad, 0xac, 0x58, 0xc4, 0xbf, 0xa3, 0x89, 0x33, 0x1d, 0x1e, 0x74, + 0xa6, 0xbf, 0x20, 0xcd, 0x68, 0xa9, 0xf3, 0x75, 0x21, 0x74, 0x9e, 0x5b, 0xf0, 0x5f, 0x64, 0x75, + 0xe2, 0xdd, 0x8a, 0x67, 0xf4, 0x69, 0x68, 0xcf, 0xbc, 0x16, 0x78, 0xee, 0x3e, 0x21, 0xb5, 0x75, + 0xa6, 0x2e, 0x84, 0x59, 0xd1, 0xbb, 0xa4, 0xa2, 0xb2, 0x3c, 0x96, 0xf8, 0x4b, 0x9b, 0x3b, 0xa2, + 0xfb, 0xdf, 0x36, 0x69, 0xa0, 0x4f, 0xf3, 0xc2, 0x1a, 0xda, 0x25, 0x6d, 0x95, 0x86, 0x4b, 0x69, + 0x44, 0xac, 0xb4, 0x58, 0xc4, 0xc8, 0x5b, 0xe1, 0x4d, 0x07, 0x1e, 0x28, 0x3d, 0x8a, 0x37, 0x61, + 0x2a, 0x7f, 0x08, 0xd3, 0x53, 0xd2, 0x4e, 0x64, 0x78, 0x2e, 0x85, 0x5e, 0x67, 0x99, 0xca, 0x96, + 0x68, 0x6c, 0x9d, 0xb7, 0x10, 0xe4, 0x0e, 0xa3, 0x8f, 0x49, 0x13, 0xbc, 0xef, 0xb5, 0xc1, 0xa0, + 0xd6, 0x39, 0x38, 0xe8, 0x24, 0x53, 0x17, 0xb3, 0x15, 0xfd, 0x82, 0xdc, 0xb2, 0x51, 0x21, 0xa4, + 0xb1, 0xe1, 0x69, 0xa2, 0xcc, 0x99, 0x8c, 0x59, 0x05, 0x79, 0x3a, 0x36, 0x2a, 0x86, 0x1f, 0x50, + 0x60, 0x94, 0xe7, 0xa1, 0x51, 0xe7, 0x52, 0xc4, 0xf2, 0x5c, 0x45, 0xd2, 0xb0, 0xaa, 0x63, 0xf4, + 0xf0, 0x81, 0x43, 0xc1, 0xff, 0xe6, 0x4c, 0x26, 0x89, 0x78, 0x97, 0x9f, 0xb2, 0x1a, 0xb2, 0xd4, + 0x11, 0xf8, 0x47, 0x7e, 0x4a, 0x1f, 0x11, 0x02, 0x21, 0x13, 0x49, 0x1e, 0xad, 0x0c, 0xab, 0x3b, + 0x6d, 0x00, 0x39, 0x04, 0x80, 0x3e, 0x26, 0x8d, 0x24, 0x5f, 0x8a, 0x44, 0x9e, 0xcb, 0x84, 0x35, + 0xc0, 0xd4, 0xef, 0x4b, 0x7d, 0x5e, 0x4f, 0xf2, 0xe5, 0x21, 0x40, 0xf4, 0x01, 0x81, 0xb3, 0x8b, + 0x3a, 0x71, 0xa9, 0x9d, 0xe4, 0x4b, 0x0c, 0xfb, 0xe7, 0xa4, 0x5c, 0x18, 0xd6, 0xdc, 0x29, 0xed, + 0x35, 0xfb, 0xf7, 0x7b, 0xbf, 0x5b, 0x18, 0xbc, 0x5c, 0x18, 0xba, 0x4b, 0x3a, 0x59, 0x6e, 0xd5, + 0xe2, 0x52, 0x98, 0x48, 0xab, 0xc2, 0x1a, 0xd6, 0x42, 0x2d, 0xda, 0x0e, 0x9d, 0x39, 0x10, 0xa2, + 0x0a, 0x11, 0x67, 0x6d, 0x17, 0x69, 0x8c, 0xfe, 0x23, 0x42, 0x8a, 0x50, 0xcb, 0xcc, 0x0a, 0x95, + 0x2e, 0x59, 0x07, 0x6f, 0x1a, 0x0e, 0x19, 0xa7, 0x4b, 0x30, 0xdc, 0xea, 0x30, 0x5a, 0x89, 0x54, + 0xa6, 0xec, 0x96, 0x33, 0x1c, 0x81, 0x23, 0x99, 0x82, 0x6c, 0xb8, 0xb6, 0xb9, 0x88, 0x65, 0xbc, + 0x2e, 0x58, 0xe0, 0x0c, 0x07, 0xe4, 0x00, 0x00, 0x08, 0xd3, 0xcf, 0xb9, 0x5e, 0x6d, 0xe2, 0x7f, + 0x1b, 0xa3, 0xdc, 0x00, 0xc8, 0x45, 0xff, 0x11, 0x21, 0x89, 0xca, 0x56, 0x42, 0xcb, 0x34, 0x2c, + 0x18, 0x75, 0xe2, 0x80, 0x70, 0x00, 0xe8, 0x2e, 0xa9, 0x40, 0x71, 0x1a, 0x76, 0x67, 0x67, 0x6b, + 0xaf, 0xd9, 0xbf, 0xd5, 0xbb, 0x59, 0xaf, 0xdc, 0xdd, 0xd2, 0xa7, 0xa4, 0x16, 0x15, 0x6b, 0x11, + 0x85, 0x05, 0xbb, 0xbb, 0x53, 0xda, 0x6b, 0x7f, 0x4f, 0x9e, 0xf7, 0x5f, 0x3d, 0x7f, 0xf5, 0xdd, + 0xcb, 0xfe, 0xab, 0x17, 0xbc, 0x1a, 0x15, 0xeb, 0x41, 0x58, 0xd0, 0x27, 0xa4, 0xb9, 0xc8, 0x75, + 0x24, 0x85, 0xd2, 0xf0, 0xd7, 0x3d, 0xfc, 0x8b, 0x20, 0x34, 0x06, 0x04, 0x82, 0x20, 0x2f, 0x64, + 0x24, 0xa2, 0x34, 0x66, 0xf7, 0x77, 0xb6, 0x20, 0x08, 0x40, 0x0f, 0x52, 0x48, 0x92, 0x1a, 0xd6, + 0x7a, 0x66, 0xd9, 0xc7, 0xa8, 0x49, 0xa7, 0x77, 0xa3, 0xf6, 0x79, 0x55, 0x5e, 0xd8, 0xa3, 0xcc, + 0x42, 0x14, 0xd2, 0x30, 0x83, 0xf8, 0xb8, 0xf2, 0x32, 0x8c, 0xb9, 0x28, 0x38, 0x74, 0xe0, 0x40, + 0xba, 0x4b, 0x6a, 0xd1, 0x12, 0x4b, 0x8f, 0x3d, 0xc0, 0xf7, 0x5a, 0xbd, 0x6b, 0xe5, 0xc8, 0xab, + 0xd1, 0x92, 0x43, 0x60, 0x9e, 0x90, 0xa6, 0x36, 0x56, 0x18, 0x75, 0x9a, 0x40, 0x1d, 0x7c, 0xe2, + 0x54, 0xd6, 0xc6, 0xce, 0x1c, 0x42, 0xf7, 0xaf, 0x97, 0x3d, 0x7b, 0x88, 0x4f, 0x35, 0x7b, 0x1f, + 0x20, 0xde, 0xf0, 0xe7, 0x51, 0x4c, 0x77, 0x48, 0x0b, 0x23, 0xb5, 0x31, 0xe4, 0x4f, 0xee, 0x35, + 0xc0, 0x86, 0x4e, 0xf9, 0x27, 0xae, 0xa6, 0xcc, 0x59, 0xa8, 0xe1, 0xbb, 0x47, 0x8e, 0x41, 0x5e, + 0xd8, 0x99, 0x43, 0x36, 0x0c, 0x69, 0x68, 0xac, 0xd4, 0x86, 0x3d, 0xbe, 0x62, 0x38, 0x72, 0x08, + 0xb8, 0xd0, 0xac, 0x54, 0x81, 0xef, 0x3f, 0x71, 0x2e, 0x04, 0x1a, 0x1e, 0x87, 0xf6, 0x95, 0x85, + 0xa7, 0x89, 0x14, 0x0b, 0xc3, 0x76, 0xf0, 0xae, 0xee, 0x80, 0x91, 0xa1, 0x7b, 0xa4, 0xe9, 0x2b, + 0x59, 0xa8, 0x2c, 0x67, 0x9f, 0xa2, 0x21, 0xf5, 0x9e, 0xc7, 0x78, 0x63, 0x8d, 0x45, 0x3d, 0xce, + 0x72, 0xfa, 0x77, 0x72, 0xe7, 0xa6, 0x83, 0x45, 0x0a, 0x4d, 0xa8, 0xbb, 0x53, 0xda, 0xeb, 0xf4, + 0xdb, 0x2e, 0x3f, 0xa2, 0x25, 0x82, 0xfc, 0xf6, 0x0d, 0xa7, 0x1f, 0xe5, 0xb1, 0x84, 0x8f, 0x96, + 0x67, 0xb9, 0xb1, 0x22, 0x51, 0xa9, 0xb2, 0xec, 0x29, 0x66, 0x4b, 0xed, 0x9b, 0xaf, 0x9f, 0xff, + 0xf5, 0xc5, 0xcb, 0xef, 0x38, 0xc1, 0xbb, 0x43, 0xb8, 0xa2, 0x7b, 0x24, 0xc0, 0x44, 0x11, 0x26, + 0x0a, 0x33, 0x01, 0xdd, 0xcf, 0xb0, 0xcf, 0x50, 0xed, 0x0e, 0xe2, 0xb3, 0x28, 0xcc, 0xa6, 0x80, + 0xd2, 0x4f, 0x20, 0x6f, 0xac, 0xd4, 0x59, 0x98, 0xb0, 0x5d, 0x6f, 0x98, 0xa7, 0x31, 0xa7, 0xd2, + 0xc2, 0x5e, 0x8a, 0xcc, 0xb0, 0xcf, 0xe1, 0x33, 0x5e, 0x43, 0xfa, 0x18, 0x6c, 0xae, 0xb9, 0x51, + 0x60, 0xd8, 0x17, 0x3e, 0xbb, 0x6f, 0x8e, 0x06, 0x5e, 0x05, 0xfa, 0xd8, 0xd0, 0x4f, 0x49, 0xcb, + 0x67, 0x47, 0xa1, 0xf3, 0xc2, 0xb0, 0x3f, 0x63, 0x85, 0xfa, 0x06, 0x3e, 0x05, 0x88, 0xee, 0x93, + 0xdb, 0xd7, 0x59, 0x5c, 0x27, 0xd9, 0x47, 0xbe, 0x5b, 0xd7, 0xf8, 0xb0, 0xa3, 0x3c, 0x27, 0xf7, + 0x3d, 0x6f, 0xbc, 0x4e, 0x0b, 0x11, 0xe5, 0x99, 0xd5, 0x79, 0x92, 0x48, 0xcd, 0xbe, 0x44, 0xed, + 0xef, 0xba, 0xdb, 0x83, 0x75, 0x5a, 0x0c, 0xae, 0xee, 0xa0, 0x2b, 0x2f, 0xb4, 0x94, 0xef, 0x37, + 0x8e, 0x67, 0xcf, 0xf0, 0xf5, 0x96, 0x03, 0x9d, 0x8f, 0x61, 0x42, 0x5b, 0x95, 0x4a, 0x98, 0x95, + 0x7f, 0x71, 0xd6, 0x7a, 0x92, 0x7e, 0x49, 0x28, 0xf4, 0x63, 0xcc, 0x0e, 0x95, 0x89, 0x45, 0xa2, + 0x96, 0x67, 0x96, 0xf5, 0x30, 0x83, 0xa0, 0x53, 0xcf, 0x56, 0xaa, 0x18, 0x67, 0x23, 0x84, 0xc1, + 0xe0, 0x9f, 0x65, 0xb8, 0x12, 0xe6, 0xd2, 0x44, 0x36, 0x31, 0xec, 0x2b, 0x64, 0x6b, 0x02, 0x36, + 0x73, 0x10, 0x36, 0x8e, 0xf0, 0xfd, 0x25, 0xf6, 0x42, 0xc3, 0xbe, 0xf6, 0x8d, 0x23, 0x7c, 0x7f, + 0x39, 0x05, 0x00, 0x9b, 0xb5, 0x0d, 0xed, 0xda, 0x40, 0x5d, 0x7c, 0x83, 0x5d, 0xa7, 0xee, 0x80, + 0x51, 0x0c, 0xce, 0xca, 0x75, 0x71, 0x06, 0x61, 0xb5, 0xc6, 0x67, 0x33, 0xeb, 0x3b, 0x55, 0xdc, + 0xc5, 0xd4, 0x1a, 0x97, 0xd2, 0x90, 0xf2, 0x51, 0x9e, 0x2d, 0x94, 0x6f, 0xce, 0xdf, 0xa2, 0xd1, + 0xc4, 0x41, 0xe0, 0xcd, 0xee, 0x33, 0xbf, 0x44, 0xa0, 0x2f, 0xb5, 0x34, 0x05, 0xe4, 0x83, 0x96, + 0xc6, 0xe6, 0x5a, 0xc6, 0x38, 0x50, 0xeb, 0xfc, 0x8a, 0xee, 0xee, 0x92, 0xdb, 0xc8, 0xed, 0x01, + 0x27, 0xe0, 0x47, 0xa0, 0x1b, 0x8e, 0x70, 0xec, 0xbe, 0x24, 0x4d, 0x64, 0x73, 0xbd, 0x9b, 0xde, + 0x27, 0x55, 0xd7, 0xd4, 0xfd, 0x80, 0xf6, 0xd4, 0x6f, 0x67, 0x67, 0xf7, 0x47, 0xd2, 0x46, 0xc1, + 0x85, 0x0c, 0xed, 0x5a, 0x3b, 0x47, 0xa4, 0x32, 0x15, 0xd8, 0xaf, 0x37, 0xda, 0xa4, 0x32, 0x9d, + 0x03, 0xfd, 0x2b, 0x27, 0x96, 0x7f, 0xe5, 0xc4, 0xee, 0x2f, 0x25, 0x52, 0xf7, 0xda, 0xfe, 0x8b, + 0x76, 0xc9, 0xb6, 0xbd, 0x2c, 0xdc, 0xb8, 0xef, 0xf4, 0x3b, 0xbd, 0xcd, 0x85, 0x00, 0x94, 0xe3, + 0x1d, 0x7d, 0x4c, 0xb6, 0x61, 0xee, 0xe3, 0x4b, 0xcd, 0x3e, 0xe9, 0x5d, 0x6d, 0x02, 0x1c, 0xf1, + 0xeb, 0x33, 0x6a, 0x1d, 0x45, 0xb0, 0xc7, 0x6d, 0xdd, 0x98, 0x51, 0x0e, 0x04, 0x9d, 0x57, 0x52, + 0x16, 0x22, 0x2f, 0x64, 0xe6, 0x27, 0x7b, 0x1d, 0x80, 0x49, 0x21, 0x33, 0xba, 0x4f, 0xea, 0x1b, + 0xe3, 0x70, 0xa2, 0x37, 0x37, 0xba, 0x6c, 0x50, 0x7e, 0x75, 0xbf, 0xf1, 0x4f, 0x15, 0x53, 0x11, + 0xfd, 0xf3, 0xef, 0x2d, 0xbf, 0x9f, 0xa0, 0xe3, 0xff, 0x1f, 0x9b, 0x18, 0xa9, 0x6d, 0x94, 0x85, + 0x4d, 0xa8, 0xce, 0x37, 0x24, 0x7d, 0x4a, 0xb6, 0x21, 0xe8, 0x68, 0xc3, 0xd5, 0x6c, 0xba, 0x4a, + 0x03, 0x8e, 0x97, 0xf4, 0x19, 0xa9, 0xf9, 0x58, 0xa3, 0x25, 0xcd, 0x3e, 0xed, 0xfd, 0x26, 0x01, + 0xf8, 0x86, 0x85, 0x7e, 0x46, 0xaa, 0xce, 0x15, 0xde, 0xb4, 0x56, 0xef, 0x5a, 0x1a, 0x70, 0x7f, + 0xe7, 0x57, 0x82, 0xea, 0x1f, 0xae, 0x04, 0x0f, 0x20, 0x7c, 0x42, 0x6a, 0x9d, 0xe5, 0xb8, 0xb0, + 0x54, 0x78, 0x2d, 0xd2, 0x43, 0x20, 0x6f, 0x78, 0xb1, 0xfe, 0x07, 0x5e, 0x7c, 0x08, 0x2e, 0x83, + 0x67, 0x52, 0xb3, 0xc4, 0xe5, 0xa5, 0xc1, 0xeb, 0xf8, 0x4e, 0x6a, 0x96, 0x30, 0x19, 0xcf, 0xa5, + 0x36, 0x2a, 0xcf, 0x70, 0x71, 0x69, 0x6e, 0x7a, 0xb0, 0x07, 0xf9, 0xe6, 0x16, 0x73, 0x18, 0x0b, + 0x10, 0x77, 0x99, 0x0a, 0xf7, 0x54, 0xf7, 0x3f, 0x25, 0xd2, 0xba, 0x2e, 0x01, 0x8b, 0x65, 0x1a, + 0xbe, 0xcb, 0xb5, 0xaf, 0x07, 0x47, 0x20, 0xaa, 0xb2, 0x5c, 0xfb, 0x1d, 0xd6, 0x11, 0x80, 0x2e, + 0x95, 0xf5, 0x5b, 0x7e, 0x83, 0x3b, 0x02, 0x0a, 0xd0, 0xac, 0x4f, 0xdd, 0xb2, 0xb5, 0xed, 0x6b, + 0xdf, 0xd3, 0x20, 0x81, 0x4b, 0x33, 0x3a, 0xb8, 0xc2, 0x1d, 0x01, 0x5b, 0x11, 0xb4, 0x5d, 0xf4, + 0x69, 0x83, 0xe3, 0x79, 0x5f, 0x78, 0xbd, 0xfc, 0x34, 0xa1, 0x84, 0x54, 0xc7, 0x6f, 0x8e, 0x27, + 0x7c, 0x18, 0x7c, 0x44, 0x9b, 0xa4, 0x36, 0x78, 0x23, 0x8e, 0x27, 0xc7, 0xc3, 0xa0, 0x44, 0x1b, + 0xa4, 0x32, 0xe5, 0x93, 0xe9, 0x2c, 0x28, 0xd3, 0x3a, 0xd9, 0x9e, 0x4d, 0x46, 0xf3, 0x60, 0x0b, + 0x4e, 0xa3, 0x93, 0xc3, 0xc3, 0x60, 0x1b, 0xe4, 0x66, 0x73, 0x3e, 0x1e, 0xcc, 0x83, 0x0a, 0xc8, + 0x1d, 0x0c, 0x47, 0xaf, 0x4f, 0x0e, 0xe7, 0x41, 0x75, 0xff, 0x97, 0x92, 0x2f, 0xd6, 0x4d, 0xc6, + 0xc1, 0x4b, 0xc3, 0xa3, 0xe9, 0xfc, 0xa7, 0xe0, 0x23, 0x90, 0x3f, 0x38, 0x39, 0x9a, 0x06, 0x25, + 0x90, 0xe1, 0xc3, 0xd9, 0x1c, 0x3e, 0x2e, 0x03, 0xc7, 0xe0, 0x87, 0xe1, 0xe0, 0xc7, 0x60, 0x8b, + 0xb6, 0x48, 0x7d, 0xca, 0x87, 0x02, 0xb9, 0xb6, 0xe9, 0x2d, 0xd2, 0x9c, 0xbe, 0x7e, 0x33, 0x14, + 0xb3, 0x21, 0x7f, 0x3b, 0xe4, 0x41, 0x05, 0xbe, 0x3d, 0x9e, 0xcc, 0xc7, 0xa3, 0x9f, 0x82, 0x2a, + 0x0d, 0x48, 0x6b, 0x30, 0x3d, 0x19, 0x1f, 0x8f, 0x26, 0x8e, 0xbd, 0x46, 0x6f, 0x93, 0xf6, 0x06, + 0x71, 0xef, 0xd5, 0x01, 0x1a, 0x0d, 0x5f, 0xcf, 0x4f, 0xf8, 0xd0, 0x43, 0x0d, 0xf8, 0xfa, 0xed, + 0x90, 0xcf, 0xc6, 0x93, 0xe3, 0x80, 0xc0, 0x7f, 0xff, 0x7c, 0x3d, 0x9e, 0x8b, 0xe9, 0xf8, 0x20, + 0x68, 0xd2, 0xbb, 0x24, 0xb8, 0xf6, 0x9f, 0x18, 0xfc, 0x70, 0x78, 0x10, 0xb4, 0xfe, 0x17, 0x00, + 0x00, 0xff, 0xff, 0xf8, 0x9f, 0x0e, 0x7d, 0xca, 0x0d, 0x00, 0x00, } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/apply_raw.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/apply_raw.go index 6d9123dc26..f672ba2737 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/apply_raw.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/apply_raw.go @@ -3,7 +3,6 @@ package fs import ( - "errors" "fmt" "io" "io/ioutil" @@ -14,6 +13,8 @@ import ( "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/configs" libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils" + "github.com/pkg/errors" + "golang.org/x/sys/unix" ) var ( @@ -35,7 +36,7 @@ var ( HugePageSizes, _ = cgroups.GetHugePageSize() ) -var errSubsystemDoesNotExist = errors.New("cgroup: subsystem does not exist") +var errSubsystemDoesNotExist = fmt.Errorf("cgroup: subsystem does not exist") type subsystemSet []subsystem @@ -62,9 +63,10 @@ type subsystem interface { } type Manager struct { - mu sync.Mutex - Cgroups *configs.Cgroup - Paths map[string]string + mu sync.Mutex + Cgroups *configs.Cgroup + Rootless bool // ignore permission-related errors + Paths map[string]string } // The absolute path to the root of the cgroup hierarchies. @@ -100,6 +102,33 @@ type cgroupData struct { pid int } +// isIgnorableError returns whether err is a permission error (in the loose +// sense of the word). This includes EROFS (which for an unprivileged user is +// basically a permission error) and EACCES (for similar reasons) as well as +// the normal EPERM. +func isIgnorableError(rootless bool, err error) bool { + // We do not ignore errors if we are root. + if !rootless { + return false + } + // Is it an ordinary EPERM? + if os.IsPermission(errors.Cause(err)) { + return true + } + + // Try to handle other errnos. + var errno error + switch err := errors.Cause(err).(type) { + case *os.PathError: + errno = err.Err + case *os.LinkError: + errno = err.Err + case *os.SyscallError: + errno = err.Err + } + return errno == unix.EROFS || errno == unix.EPERM || errno == unix.EACCES +} + func (m *Manager) Apply(pid int) (err error) { if m.Cgroups == nil { return nil @@ -145,11 +174,11 @@ func (m *Manager) Apply(pid int) (err error) { m.Paths[sys.Name()] = p if err := sys.Apply(d); err != nil { - if os.IsPermission(err) && m.Cgroups.Path == "" { - // If we didn't set a cgroup path, then let's defer the error here - // until we know whether we have set limits or not. - // If we hadn't set limits, then it's ok that we couldn't join this cgroup, because - // it will have the same limits as its parent. + // In the case of rootless (including euid=0 in userns), where an explicit cgroup path hasn't + // been set, we don't bail on error in case of permission problems. + // Cases where limits have been set (and we couldn't create our own + // cgroup) are handled by Set. + if isIgnorableError(m.Rootless, err) && m.Cgroups.Path == "" { delete(m.Paths, sys.Name()) continue } @@ -207,9 +236,16 @@ func (m *Manager) Set(container *configs.Config) error { for _, sys := range subsystems { path := paths[sys.Name()] if err := sys.Set(path, container.Cgroups); err != nil { + if m.Rootless && sys.Name() == "devices" { + continue + } + // When m.Rootless is true, errors from the device subsystem are ignored because it is really not expected to work. + // However, errors from other subsystems are not ignored. + // see @test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error" if path == "" { - // cgroup never applied - return fmt.Errorf("cannot set limits on the %s cgroup, as the container has not joined it", sys.Name()) + // We never created a path for this cgroup, so we cannot set + // limits for it (though we have already tried at this point). + return fmt.Errorf("cannot set %s limit: container could not join or create cgroup", sys.Name()) } return err } @@ -281,7 +317,7 @@ func getCgroupData(c *configs.Cgroup, pid int) (*cgroupData, error) { } func (raw *cgroupData) path(subsystem string) (string, error) { - mnt, err := cgroups.FindCgroupMountpoint(subsystem) + mnt, err := cgroups.FindCgroupMountpoint(raw.root, subsystem) // If we didn't mount the subsystem, there is no point we make the path. if err != nil { return "", err diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpu.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpu.go index b712bd0b1e..e240a8313a 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpu.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpu.go @@ -46,11 +46,7 @@ func (s *CpuGroup) ApplyDir(path string, cgroup *configs.Cgroup, pid int) error } // because we are not using d.join we need to place the pid into the procs file // unlike the other subsystems - if err := cgroups.WriteCgroupProc(path, pid); err != nil { - return err - } - - return nil + return cgroups.WriteCgroupProc(path, pid) } func (s *CpuGroup) SetRtSched(path string, cgroup *configs.Cgroup) error { @@ -83,11 +79,7 @@ func (s *CpuGroup) Set(path string, cgroup *configs.Cgroup) error { return err } } - if err := s.SetRtSched(path, cgroup); err != nil { - return err - } - - return nil + return s.SetRtSched(path, cgroup) } func (s *CpuGroup) Remove(d *cgroupData) error { diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuset.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuset.go index 20c9eafac2..5a1d152ea1 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuset.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuset.go @@ -77,18 +77,14 @@ func (s *CpusetGroup) ApplyDir(dir string, cgroup *configs.Cgroup, pid int) erro // The logic is, if user specified cpuset configs, use these // specified configs, otherwise, inherit from parent. This makes // cpuset configs work correctly with 'cpuset.cpu_exclusive', and - // keep backward compatbility. + // keep backward compatibility. if err := s.ensureCpusAndMems(dir, cgroup); err != nil { return err } // because we are not using d.join we need to place the pid into the procs file // unlike the other subsystems - if err := cgroups.WriteCgroupProc(dir, pid); err != nil { - return err - } - - return nil + return cgroups.WriteCgroupProc(dir, pid) } func (s *CpusetGroup) getSubsystemSettings(parent string) (cpus []byte, mems []byte, err error) { diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/kmem.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/kmem.go new file mode 100644 index 0000000000..69b5a1946c --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/kmem.go @@ -0,0 +1,62 @@ +// +build linux,!nokmem + +package fs + +import ( + "errors" + "fmt" + "io/ioutil" + "os" + "path/filepath" + "strconv" + "syscall" // for Errno type only + + "github.com/opencontainers/runc/libcontainer/cgroups" + "golang.org/x/sys/unix" +) + +const cgroupKernelMemoryLimit = "memory.kmem.limit_in_bytes" + +func EnableKernelMemoryAccounting(path string) error { + // Ensure that kernel memory is available in this kernel build. If it + // isn't, we just ignore it because EnableKernelMemoryAccounting is + // automatically called for all memory limits. + if !cgroups.PathExists(filepath.Join(path, cgroupKernelMemoryLimit)) { + return nil + } + // We have to limit the kernel memory here as it won't be accounted at all + // until a limit is set on the cgroup and limit cannot be set once the + // cgroup has children, or if there are already tasks in the cgroup. + for _, i := range []int64{1, -1} { + if err := setKernelMemory(path, i); err != nil { + return err + } + } + return nil +} + +func setKernelMemory(path string, kernelMemoryLimit int64) error { + if path == "" { + return fmt.Errorf("no such directory for %s", cgroupKernelMemoryLimit) + } + if !cgroups.PathExists(filepath.Join(path, cgroupKernelMemoryLimit)) { + // We have specifically been asked to set a kmem limit. If the kernel + // doesn't support it we *must* error out. + return errors.New("kernel memory accounting not supported by this kernel") + } + if err := ioutil.WriteFile(filepath.Join(path, cgroupKernelMemoryLimit), []byte(strconv.FormatInt(kernelMemoryLimit, 10)), 0700); err != nil { + // Check if the error number returned by the syscall is "EBUSY" + // The EBUSY signal is returned on attempts to write to the + // memory.kmem.limit_in_bytes file if the cgroup has children or + // once tasks have been attached to the cgroup + if pathErr, ok := err.(*os.PathError); ok { + if errNo, ok := pathErr.Err.(syscall.Errno); ok { + if errNo == unix.EBUSY { + return fmt.Errorf("failed to set %s, because either tasks have already joined this cgroup or it has children", cgroupKernelMemoryLimit) + } + } + } + return fmt.Errorf("failed to write %v to %v: %v", kernelMemoryLimit, cgroupKernelMemoryLimit, err) + } + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/kmem_disabled.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/kmem_disabled.go new file mode 100644 index 0000000000..ac290fd7a0 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/kmem_disabled.go @@ -0,0 +1,15 @@ +// +build linux,nokmem + +package fs + +import ( + "errors" +) + +func EnableKernelMemoryAccounting(path string) error { + return nil +} + +func setKernelMemory(path string, kernelMemoryLimit int64) error { + return errors.New("kernel memory accounting disabled in this runc build") +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go index ad395a5d62..d5310d569f 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go @@ -5,23 +5,18 @@ package fs import ( "bufio" "fmt" - "io/ioutil" "os" "path/filepath" "strconv" "strings" - "syscall" // only for Errno "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/configs" - - "golang.org/x/sys/unix" ) const ( - cgroupKernelMemoryLimit = "memory.kmem.limit_in_bytes" - cgroupMemorySwapLimit = "memory.memsw.limit_in_bytes" - cgroupMemoryLimit = "memory.limit_in_bytes" + cgroupMemorySwapLimit = "memory.memsw.limit_in_bytes" + cgroupMemoryLimit = "memory.limit_in_bytes" ) type MemoryGroup struct { @@ -67,44 +62,6 @@ func (s *MemoryGroup) Apply(d *cgroupData) (err error) { return nil } -func EnableKernelMemoryAccounting(path string) error { - // Check if kernel memory is enabled - // We have to limit the kernel memory here as it won't be accounted at all - // until a limit is set on the cgroup and limit cannot be set once the - // cgroup has children, or if there are already tasks in the cgroup. - for _, i := range []int64{1, -1} { - if err := setKernelMemory(path, i); err != nil { - return err - } - } - return nil -} - -func setKernelMemory(path string, kernelMemoryLimit int64) error { - if path == "" { - return fmt.Errorf("no such directory for %s", cgroupKernelMemoryLimit) - } - if !cgroups.PathExists(filepath.Join(path, cgroupKernelMemoryLimit)) { - // kernel memory is not enabled on the system so we should do nothing - return nil - } - if err := ioutil.WriteFile(filepath.Join(path, cgroupKernelMemoryLimit), []byte(strconv.FormatInt(kernelMemoryLimit, 10)), 0700); err != nil { - // Check if the error number returned by the syscall is "EBUSY" - // The EBUSY signal is returned on attempts to write to the - // memory.kmem.limit_in_bytes file if the cgroup has children or - // once tasks have been attached to the cgroup - if pathErr, ok := err.(*os.PathError); ok { - if errNo, ok := pathErr.Err.(syscall.Errno); ok { - if errNo == unix.EBUSY { - return fmt.Errorf("failed to set %s, because either tasks have already joined this cgroup or it has children", cgroupKernelMemoryLimit) - } - } - } - return fmt.Errorf("failed to write %v to %v: %v", kernelMemoryLimit, cgroupKernelMemoryLimit, err) - } - return nil -} - func setMemoryAndSwap(path string, cgroup *configs.Cgroup) error { // If the memory update is set to -1 we should also // set swap to -1, it means unlimited memory. diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_systemd.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_systemd.go index efadc2a3ca..a10e3f6a89 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_systemd.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_systemd.go @@ -5,6 +5,8 @@ package systemd import ( "errors" "fmt" + "io/ioutil" + "math" "os" "path/filepath" "strings" @@ -70,12 +72,11 @@ const ( ) var ( - connLock sync.Mutex - theConn *systemdDbus.Conn - hasStartTransientUnit bool - hasStartTransientSliceUnit bool - hasTransientDefaultDependencies bool - hasDelegate bool + connLock sync.Mutex + theConn *systemdDbus.Conn + hasStartTransientUnit bool + hasStartTransientSliceUnit bool + hasDelegateSlice bool ) func newProp(name string, units interface{}) systemdDbus.Property { @@ -113,53 +114,6 @@ func UseSystemd() bool { } } - // Ensure the scope name we use doesn't exist. Use the Pid to - // avoid collisions between multiple libcontainer users on a - // single host. - scope := fmt.Sprintf("libcontainer-%d-systemd-test-default-dependencies.scope", os.Getpid()) - testScopeExists := true - for i := 0; i <= testScopeWait; i++ { - if _, err := theConn.StopUnit(scope, "replace", nil); err != nil { - if dbusError, ok := err.(dbus.Error); ok { - if strings.Contains(dbusError.Name, "org.freedesktop.systemd1.NoSuchUnit") { - testScopeExists = false - break - } - } - } - time.Sleep(time.Millisecond) - } - - // Bail out if we can't kill this scope without testing for DefaultDependencies - if testScopeExists { - return hasStartTransientUnit - } - - // Assume StartTransientUnit on a scope allows DefaultDependencies - hasTransientDefaultDependencies = true - ddf := newProp("DefaultDependencies", false) - if _, err := theConn.StartTransientUnit(scope, "replace", []systemdDbus.Property{ddf}, nil); err != nil { - if dbusError, ok := err.(dbus.Error); ok { - if strings.Contains(dbusError.Name, "org.freedesktop.DBus.Error.PropertyReadOnly") { - hasTransientDefaultDependencies = false - } - } - } - - // Not critical because of the stop unit logic above. - theConn.StopUnit(scope, "replace", nil) - - // Assume StartTransientUnit on a scope allows Delegate - hasDelegate = true - dl := newProp("Delegate", true) - if _, err := theConn.StartTransientUnit(scope, "replace", []systemdDbus.Property{dl}, nil); err != nil { - if dbusError, ok := err.(dbus.Error); ok { - if strings.Contains(dbusError.Name, "org.freedesktop.DBus.Error.PropertyReadOnly") { - hasDelegate = false - } - } - } - // Assume we have the ability to start a transient unit as a slice // This was broken until systemd v229, but has been back-ported on RHEL environments >= 219 // For details, see: https://bugzilla.redhat.com/show_bug.cgi?id=1370299 @@ -188,7 +142,22 @@ func UseSystemd() bool { } // Not critical because of the stop unit logic above. - theConn.StopUnit(scope, "replace", nil) + theConn.StopUnit(slice, "replace", nil) + + // Assume StartTransientUnit on a slice allows Delegate + hasDelegateSlice = true + dlSlice := newProp("Delegate", true) + if _, err := theConn.StartTransientUnit(slice, "replace", []systemdDbus.Property{dlSlice}, nil); err != nil { + if dbusError, ok := err.(dbus.Error); ok { + // Starting with systemd v237, Delegate is not even a property of slices anymore, + // so the D-Bus call fails with "InvalidArgs" error. + if strings.Contains(dbusError.Name, "org.freedesktop.DBus.Error.PropertyReadOnly") || strings.Contains(dbusError.Name, "org.freedesktop.DBus.Error.InvalidArgs") { + hasDelegateSlice = false + } + } + } + + // Not critical because of the stop unit logic above. theConn.StopUnit(slice, "replace", nil) } return hasStartTransientUnit @@ -242,8 +211,14 @@ func (m *Manager) Apply(pid int) error { properties = append(properties, newProp("PIDs", []uint32{uint32(pid)})) } - if hasDelegate { - // This is only supported on systemd versions 218 and above. + // Check if we can delegate. This is only supported on systemd versions 218 and above. + if strings.HasSuffix(unitName, ".slice") { + if hasDelegateSlice { + // systemd 237 and above no longer allows delegation on a slice + properties = append(properties, newProp("Delegate", true)) + } + } else { + // Assume scopes always support delegation. properties = append(properties, newProp("Delegate", true)) } @@ -254,10 +229,9 @@ func (m *Manager) Apply(pid int) error { newProp("CPUAccounting", true), newProp("BlockIOAccounting", true)) - if hasTransientDefaultDependencies { - properties = append(properties, - newProp("DefaultDependencies", false)) - } + // Assume DefaultDependencies= will always work (the check for it was previously broken.) + properties = append(properties, + newProp("DefaultDependencies", false)) if c.Resources.Memory != 0 { properties = append(properties, @@ -271,13 +245,19 @@ func (m *Manager) Apply(pid int) error { // cpu.cfs_quota_us and cpu.cfs_period_us are controlled by systemd. if c.Resources.CpuQuota != 0 && c.Resources.CpuPeriod != 0 { - cpuQuotaPerSecUSec := uint64(c.Resources.CpuQuota*1000000) / c.Resources.CpuPeriod - // systemd converts CPUQuotaPerSecUSec (microseconds per CPU second) to CPUQuota - // (integer percentage of CPU) internally. This means that if a fractional percent of - // CPU is indicated by Resources.CpuQuota, we need to round up to the nearest - // 10ms (1% of a second) such that child cgroups can set the cpu.cfs_quota_us they expect. - if cpuQuotaPerSecUSec%10000 != 0 { - cpuQuotaPerSecUSec = ((cpuQuotaPerSecUSec / 10000) + 1) * 10000 + // corresponds to USEC_INFINITY in systemd + // if USEC_INFINITY is provided, CPUQuota is left unbound by systemd + // always setting a property value ensures we can apply a quota and remove it later + cpuQuotaPerSecUSec := uint64(math.MaxUint64) + if c.Resources.CpuQuota > 0 { + // systemd converts CPUQuotaPerSecUSec (microseconds per CPU second) to CPUQuota + // (integer percentage of CPU) internally. This means that if a fractional percent of + // CPU is indicated by Resources.CpuQuota, we need to round up to the nearest + // 10ms (1% of a second) such that child cgroups can set the cpu.cfs_quota_us they expect. + cpuQuotaPerSecUSec = uint64(c.Resources.CpuQuota*1000000) / c.Resources.CpuPeriod + if cpuQuotaPerSecUSec%10000 != 0 { + cpuQuotaPerSecUSec = ((cpuQuotaPerSecUSec / 10000) + 1) * 10000 + } } properties = append(properties, newProp("CPUQuotaPerSecUSec", cpuQuotaPerSecUSec)) @@ -288,6 +268,12 @@ func (m *Manager) Apply(pid int) error { newProp("BlockIOWeight", uint64(c.Resources.BlkioWeight))) } + if c.Resources.PidsLimit > 0 { + properties = append(properties, + newProp("TasksAccounting", true), + newProp("TasksMax", uint64(c.Resources.PidsLimit))) + } + // We have to set kernel memory here, as we can't change it once // processes have been attached to the cgroup. if c.Resources.KernelMemory != 0 { @@ -296,17 +282,17 @@ func (m *Manager) Apply(pid int) error { } } - statusChan := make(chan string) - if _, err := theConn.StartTransientUnit(unitName, "replace", properties, statusChan); err != nil && !isUnitExists(err) { + statusChan := make(chan string, 1) + if _, err := theConn.StartTransientUnit(unitName, "replace", properties, statusChan); err == nil { + select { + case <-statusChan: + case <-time.After(time.Second): + logrus.Warnf("Timed out while waiting for StartTransientUnit(%s) completion signal from dbus. Continuing...", unitName) + } + } else if !isUnitExists(err) { return err } - select { - case <-statusChan: - case <-time.After(time.Second): - logrus.Warnf("Timed out while waiting for StartTransientUnit completion signal from dbus. Continuing...") - } - if err := joinCgroups(c, pid); err != nil { return err } @@ -433,7 +419,7 @@ func ExpandSlice(slice string) (string, error) { } func getSubsystemPath(c *configs.Cgroup, subsystem string) (string, error) { - mountpoint, err := cgroups.FindCgroupMountpoint(subsystem) + mountpoint, err := cgroups.FindCgroupMountpoint(c.Path, subsystem) if err != nil { return "", err } @@ -553,6 +539,15 @@ func setKernelMemory(c *configs.Cgroup) error { if err := os.MkdirAll(path, 0755); err != nil { return err } + // do not try to enable the kernel memory if we already have + // tasks in the cgroup. + content, err := ioutil.ReadFile(filepath.Join(path, "tasks")) + if err != nil { + return err + } + if len(content) > 0 { + return nil + } return fs.EnableKernelMemoryAccounting(path) } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go index 7c995efee5..9717acc729 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go @@ -13,40 +13,51 @@ import ( "strings" "time" - "github.com/docker/go-units" + units "github.com/docker/go-units" + "golang.org/x/sys/unix" ) const ( - cgroupNamePrefix = "name=" + CgroupNamePrefix = "name=" CgroupProcesses = "cgroup.procs" ) // https://www.kernel.org/doc/Documentation/cgroup-v1/cgroups.txt -func FindCgroupMountpoint(subsystem string) (string, error) { - mnt, _, err := FindCgroupMountpointAndRoot(subsystem) +func FindCgroupMountpoint(cgroupPath, subsystem string) (string, error) { + mnt, _, err := FindCgroupMountpointAndRoot(cgroupPath, subsystem) return mnt, err } -func FindCgroupMountpointAndRoot(subsystem string) (string, string, error) { +func FindCgroupMountpointAndRoot(cgroupPath, subsystem string) (string, string, error) { // We are not using mount.GetMounts() because it's super-inefficient, // parsing it directly sped up x10 times because of not using Sscanf. // It was one of two major performance drawbacks in container start. if !isSubsystemAvailable(subsystem) { return "", "", NewNotFoundError(subsystem) } + f, err := os.Open("/proc/self/mountinfo") if err != nil { return "", "", err } defer f.Close() - scanner := bufio.NewScanner(f) + return findCgroupMountpointAndRootFromReader(f, cgroupPath, subsystem) +} + +func findCgroupMountpointAndRootFromReader(reader io.Reader, cgroupPath, subsystem string) (string, string, error) { + scanner := bufio.NewScanner(reader) for scanner.Scan() { txt := scanner.Text() - fields := strings.Split(txt, " ") - for _, opt := range strings.Split(fields[len(fields)-1], ",") { - if opt == subsystem { - return fields[4], fields[3], nil + fields := strings.Fields(txt) + if len(fields) < 5 { + continue + } + if strings.HasPrefix(fields[4], cgroupPath) { + for _, opt := range strings.Split(fields[len(fields)-1], ",") { + if opt == subsystem { + return fields[4], fields[3], nil + } } } } @@ -103,7 +114,7 @@ func FindCgroupMountpointDir() (string, error) { } if postSeparatorFields[0] == "cgroup" { - // Check that the mount is properly formated. + // Check that the mount is properly formatted. if numPostFields < 3 { return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text) } @@ -151,19 +162,20 @@ func getCgroupMountsHelper(ss map[string]bool, mi io.Reader, all bool) ([]Mount, Root: fields[3], } for _, opt := range strings.Split(fields[len(fields)-1], ",") { - if !ss[opt] { + seen, known := ss[opt] + if !known || (!all && seen) { continue } - if strings.HasPrefix(opt, cgroupNamePrefix) { - m.Subsystems = append(m.Subsystems, opt[len(cgroupNamePrefix):]) - } else { - m.Subsystems = append(m.Subsystems, opt) - } - if !all { - numFound++ + ss[opt] = true + if strings.HasPrefix(opt, CgroupNamePrefix) { + opt = opt[len(CgroupNamePrefix):] } + m.Subsystems = append(m.Subsystems, opt) + numFound++ + } + if len(m.Subsystems) > 0 || all { + res = append(res, m) } - res = append(res, m) } if err := scanner.Err(); err != nil { return nil, err @@ -187,7 +199,7 @@ func GetCgroupMounts(all bool) ([]Mount, error) { allMap := make(map[string]bool) for s := range allSubsystems { - allMap[s] = true + allMap[s] = false } return getCgroupMountsHelper(allMap, f, all) } @@ -256,13 +268,13 @@ func GetInitCgroupPath(subsystem string) (string, error) { } func getCgroupPathHelper(subsystem, cgroup string) (string, error) { - mnt, root, err := FindCgroupMountpointAndRoot(subsystem) + mnt, root, err := FindCgroupMountpointAndRoot("", subsystem) if err != nil { return "", err } // This is needed for nested containers, because in /proc/self/cgroup we - // see pathes from host, which don't exist in container. + // see paths from host, which don't exist in container. relCgroup, err := filepath.Rel(root, cgroup) if err != nil { return "", err @@ -342,7 +354,7 @@ func getControllerPath(subsystem string, cgroups map[string]string) (string, err return p, nil } - if p, ok := cgroups[cgroupNamePrefix+subsystem]; ok { + if p, ok := cgroups[CgroupNamePrefix+subsystem]; ok { return p, nil } @@ -453,10 +465,39 @@ func WriteCgroupProc(dir string, pid int) error { } // Dont attach any pid to the cgroup if -1 is specified as a pid - if pid != -1 { - if err := ioutil.WriteFile(filepath.Join(dir, CgroupProcesses), []byte(strconv.Itoa(pid)), 0700); err != nil { - return fmt.Errorf("failed to write %v to %v: %v", pid, CgroupProcesses, err) + if pid == -1 { + return nil + } + + cgroupProcessesFile, err := os.OpenFile(filepath.Join(dir, CgroupProcesses), os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0700) + if err != nil { + return fmt.Errorf("failed to write %v to %v: %v", pid, CgroupProcesses, err) + } + defer cgroupProcessesFile.Close() + + for i := 0; i < 5; i++ { + _, err = cgroupProcessesFile.WriteString(strconv.Itoa(pid)) + if err == nil { + return nil + } + + // EINVAL might mean that the task being added to cgroup.procs is in state + // TASK_NEW. We should attempt to do so again. + if isEINVAL(err) { + time.Sleep(30 * time.Millisecond) + continue } + + return fmt.Errorf("failed to write %v to %v: %v", pid, CgroupProcesses, err) + } + return err +} + +func isEINVAL(err error) bool { + switch err := err.(type) { + case *os.PathError: + return err.Err == unix.EINVAL + default: + return false } - return nil } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go index 3cae4fd8d9..7728522fef 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go @@ -141,9 +141,10 @@ type Config struct { // OomScoreAdj specifies the adjustment to be made by the kernel when calculating oom scores // for a process. Valid values are between the range [-1000, '1000'], where processes with - // higher scores are preferred for being killed. + // higher scores are preferred for being killed. If it is unset then we don't touch the current + // value. // More information about kernel oom score calculation here: https://lwn.net/Articles/317814/ - OomScoreAdj int `json:"oom_score_adj"` + OomScoreAdj *int `json:"oom_score_adj,omitempty"` // UidMappings is an array of User ID mappings for User Namespaces UidMappings []IDMap `json:"uid_mappings"` @@ -185,12 +186,19 @@ type Config struct { // callers keyring in this case. NoNewKeyring bool `json:"no_new_keyring"` - // Rootless specifies whether the container is a rootless container. - Rootless bool `json:"rootless"` - - // IntelRdt specifies settings for Intel RDT/CAT group that the container is placed into - // to limit the resources (e.g., L3 cache) the container has available + // IntelRdt specifies settings for Intel RDT group that the container is placed into + // to limit the resources (e.g., L3 cache, memory bandwidth) the container has available IntelRdt *IntelRdt `json:"intel_rdt,omitempty"` + + // RootlessEUID is set when the runc was launched with non-zero EUID. + // Note that RootlessEUID is set to false when launched with EUID=0 in userns. + // When RootlessEUID is set, runc creates a new userns for the container. + // (config.json needs to contain userns settings) + RootlessEUID bool `json:"rootless_euid,omitempty"` + + // RootlessCgroups is set when unlikely to have the full access to cgroups. + // When RootlessCgroups is set, cgroups errors are ignored. + RootlessCgroups bool `json:"rootless_cgroups,omitempty"` } type Hooks struct { @@ -264,26 +272,23 @@ func (hooks Hooks) MarshalJSON() ([]byte, error) { }) } -// HookState is the payload provided to a hook on execution. -type HookState specs.State - type Hook interface { // Run executes the hook with the provided state. - Run(HookState) error + Run(*specs.State) error } // NewFunctionHook will call the provided function when the hook is run. -func NewFunctionHook(f func(HookState) error) FuncHook { +func NewFunctionHook(f func(*specs.State) error) FuncHook { return FuncHook{ run: f, } } type FuncHook struct { - run func(HookState) error + run func(*specs.State) error } -func (f FuncHook) Run(s HookState) error { +func (f FuncHook) Run(s *specs.State) error { return f.run(s) } @@ -306,7 +311,7 @@ type CommandHook struct { Command } -func (c Command) Run(s HookState) error { +func (c Command) Run(s *specs.State) error { b, err := json.Marshal(s) if err != nil { return err diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/intelrdt.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/intelrdt.go index 36bd5f96a1..57e9f037d9 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/intelrdt.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/intelrdt.go @@ -4,4 +4,10 @@ type IntelRdt struct { // The schema for L3 cache id and capacity bitmask (CBM) // Format: "L3:=;=;..." L3CacheSchema string `json:"l3_cache_schema,omitempty"` + + // The schema of memory bandwidth per L3 cache id + // Format: "MB:=bandwidth0;=bandwidth1;..." + // The unit of memory bandwidth is specified in "percentages" by + // default, and in "MBps" if MBA Software Controller is enabled. + MemBwSchema string `json:"memBwSchema,omitempty"` } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_linux.go index 5fc171a57b..1bbaef9bd9 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_linux.go @@ -7,12 +7,13 @@ import ( ) const ( - NEWNET NamespaceType = "NEWNET" - NEWPID NamespaceType = "NEWPID" - NEWNS NamespaceType = "NEWNS" - NEWUTS NamespaceType = "NEWUTS" - NEWIPC NamespaceType = "NEWIPC" - NEWUSER NamespaceType = "NEWUSER" + NEWNET NamespaceType = "NEWNET" + NEWPID NamespaceType = "NEWPID" + NEWNS NamespaceType = "NEWNS" + NEWUTS NamespaceType = "NEWUTS" + NEWIPC NamespaceType = "NEWIPC" + NEWUSER NamespaceType = "NEWUSER" + NEWCGROUP NamespaceType = "NEWCGROUP" ) var ( @@ -35,6 +36,8 @@ func NsName(ns NamespaceType) string { return "user" case NEWUTS: return "uts" + case NEWCGROUP: + return "cgroup" } return "" } @@ -68,6 +71,7 @@ func NamespaceTypes() []NamespaceType { NEWNET, NEWPID, NEWNS, + NEWCGROUP, } } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall.go index 4ce6813d23..2dc7adfc96 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall.go @@ -9,12 +9,13 @@ func (n *Namespace) Syscall() int { } var namespaceInfo = map[NamespaceType]int{ - NEWNET: unix.CLONE_NEWNET, - NEWNS: unix.CLONE_NEWNS, - NEWUSER: unix.CLONE_NEWUSER, - NEWIPC: unix.CLONE_NEWIPC, - NEWUTS: unix.CLONE_NEWUTS, - NEWPID: unix.CLONE_NEWPID, + NEWNET: unix.CLONE_NEWNET, + NEWNS: unix.CLONE_NEWNS, + NEWUSER: unix.CLONE_NEWUSER, + NEWIPC: unix.CLONE_NEWIPC, + NEWUTS: unix.CLONE_NEWUTS, + NEWPID: unix.CLONE_NEWPID, + NEWCGROUP: unix.CLONE_NEWCGROUP, } // CloneFlags parses the container's Namespaces options to set the correct diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/rootless.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/rootless.go index e532ac8fe2..393d9e81ee 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/rootless.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/rootless.go @@ -2,23 +2,18 @@ package validate import ( "fmt" - "os" - "reflect" "strings" "github.com/opencontainers/runc/libcontainer/configs" ) -var ( - geteuid = os.Geteuid - getegid = os.Getegid -) - -func (v *ConfigValidator) rootless(config *configs.Config) error { - if err := rootlessMappings(config); err != nil { +// rootlessEUID makes sure that the config can be applied when runc +// is being executed as a non-root user (euid != 0) in the current user namespace. +func (v *ConfigValidator) rootlessEUID(config *configs.Config) error { + if err := rootlessEUIDMappings(config); err != nil { return err } - if err := rootlessMount(config); err != nil { + if err := rootlessEUIDMount(config); err != nil { return err } @@ -38,11 +33,9 @@ func hasIDMapping(id int, mappings []configs.IDMap) bool { return false } -func rootlessMappings(config *configs.Config) error { - if euid := geteuid(); euid != 0 { - if !config.Namespaces.Contains(configs.NEWUSER) { - return fmt.Errorf("rootless containers require user namespaces") - } +func rootlessEUIDMappings(config *configs.Config) error { + if !config.Namespaces.Contains(configs.NEWUSER) { + return fmt.Errorf("rootless container requires user namespaces") } if len(config.UidMappings) == 0 { @@ -51,34 +44,13 @@ func rootlessMappings(config *configs.Config) error { if len(config.GidMappings) == 0 { return fmt.Errorf("rootless containers requires at least one GID mapping") } - - return nil -} - -// cgroup verifies that the user isn't trying to set any cgroup limits or paths. -func rootlessCgroup(config *configs.Config) error { - // Nothing set at all. - if config.Cgroups == nil || config.Cgroups.Resources == nil { - return nil - } - - // Used for comparing to the zero value. - left := reflect.ValueOf(*config.Cgroups.Resources) - right := reflect.Zero(left.Type()) - - // This is all we need to do, since specconv won't add cgroup options in - // rootless mode. - if !reflect.DeepEqual(left.Interface(), right.Interface()) { - return fmt.Errorf("cannot specify resource limits in rootless container") - } - return nil } // mount verifies that the user isn't trying to set up any mounts they don't have // the rights to do. In addition, it makes sure that no mount has a `uid=` or // `gid=` option that doesn't resolve to root. -func rootlessMount(config *configs.Config) error { +func rootlessEUIDMount(config *configs.Config) error { // XXX: We could whitelist allowed devices at this point, but I'm not // convinced that's a good idea. The kernel is the best arbiter of // access control. diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/validator.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/validator.go index cbbba9a03a..3b42f30107 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/validator.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/validator.go @@ -38,14 +38,17 @@ func (v *ConfigValidator) Validate(config *configs.Config) error { if err := v.usernamespace(config); err != nil { return err } + if err := v.cgroupnamespace(config); err != nil { + return err + } if err := v.sysctl(config); err != nil { return err } if err := v.intelrdt(config); err != nil { return err } - if config.Rootless { - if err := v.rootless(config); err != nil { + if config.RootlessEUID { + if err := v.rootlessEUID(config); err != nil { return err } } @@ -116,6 +119,15 @@ func (v *ConfigValidator) usernamespace(config *configs.Config) error { return nil } +func (v *ConfigValidator) cgroupnamespace(config *configs.Config) error { + if config.Namespaces.Contains(configs.NEWCGROUP) { + if _, err := os.Stat("/proc/self/ns/cgroup"); os.IsNotExist(err) { + return fmt.Errorf("cgroup namespaces aren't enabled in the kernel") + } + } + return nil +} + // sysctl validates that the specified sysctl keys are valid or not. // /proc/sys isn't completely namespaced and depending on which namespaces // are specified, a subset of sysctls are permitted. @@ -151,6 +163,16 @@ func (v *ConfigValidator) sysctl(config *configs.Config) error { return fmt.Errorf("sysctl %q is not allowed in the hosts network namespace", s) } } + if config.Namespaces.Contains(configs.NEWUTS) { + switch s { + case "kernel.domainname": + // This is namespaced and there's no explicit OCI field for it. + continue + case "kernel.hostname": + // This is namespaced but there's a conflicting (dedicated) OCI field for it. + return fmt.Errorf("sysctl %q is not allowed as it conflicts with the OCI %q field", s, "hostname") + } + } return fmt.Errorf("sysctl %q is not in a separate kernel namespace", s) } @@ -159,11 +181,22 @@ func (v *ConfigValidator) sysctl(config *configs.Config) error { func (v *ConfigValidator) intelrdt(config *configs.Config) error { if config.IntelRdt != nil { - if !intelrdt.IsEnabled() { - return fmt.Errorf("intelRdt is specified in config, but Intel RDT feature is not supported or enabled") + if !intelrdt.IsCatEnabled() && !intelrdt.IsMbaEnabled() { + return fmt.Errorf("intelRdt is specified in config, but Intel RDT is not supported or enabled") + } + + if !intelrdt.IsCatEnabled() && config.IntelRdt.L3CacheSchema != "" { + return fmt.Errorf("intelRdt.l3CacheSchema is specified in config, but Intel RDT/CAT is not enabled") + } + if !intelrdt.IsMbaEnabled() && config.IntelRdt.MemBwSchema != "" { + return fmt.Errorf("intelRdt.memBwSchema is specified in config, but Intel RDT/MBA is not enabled") + } + + if intelrdt.IsCatEnabled() && config.IntelRdt.L3CacheSchema == "" { + return fmt.Errorf("Intel RDT/CAT is enabled and intelRdt is specified in config, but intelRdt.l3CacheSchema is empty") } - if config.IntelRdt.L3CacheSchema == "" { - return fmt.Errorf("intelRdt is specified in config, but intelRdt.l3CacheSchema is empty") + if intelrdt.IsMbaEnabled() && config.IntelRdt.MemBwSchema == "" { + return fmt.Errorf("Intel RDT/MBA is enabled and intelRdt is specified in config, but intelRdt.memBwSchema is empty") } } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/container.go b/vendor/github.com/opencontainers/runc/libcontainer/container.go index 2e31b4d4fc..ba7541c5fd 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/container.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/container.go @@ -9,6 +9,7 @@ import ( "time" "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runtime-spec/specs-go" ) // Status is the status of a container. @@ -85,6 +86,12 @@ type BaseContainer interface { // SystemError - System error. State() (*State, error) + // OCIState returns the current container's state information. + // + // errors: + // SystemError - System error. + OCIState() (*specs.State, error) + // Returns the current config of the container. Config() configs.Config diff --git a/vendor/github.com/opencontainers/runc/libcontainer/container_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/container_linux.go index b3e157bdf8..7e58e5e008 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/container_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/container_linux.go @@ -19,16 +19,17 @@ import ( "syscall" // only for SysProcAttr and Signal "time" + "github.com/cyphar/filepath-securejoin" "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/configs" - "github.com/opencontainers/runc/libcontainer/criurpc" "github.com/opencontainers/runc/libcontainer/intelrdt" "github.com/opencontainers/runc/libcontainer/system" "github.com/opencontainers/runc/libcontainer/utils" + "github.com/opencontainers/runtime-spec/specs-go" + criurpc "github.com/checkpoint-restore/go-criu/rpc" "github.com/golang/protobuf/proto" "github.com/sirupsen/logrus" - "github.com/syndtr/gocapability/capability" "github.com/vishvananda/netlink/nl" "golang.org/x/sys/unix" ) @@ -60,7 +61,8 @@ type State struct { // Platform specific fields below here - // Specifies if the container was started under the rootless mode. + // Specified if the container was started under the rootless mode. + // Set to true if BaseState.Config.RootlessEUID && BaseState.Config.RootlessCgroups Rootless bool `json:"rootless"` // Path to all the cgroups setup for a container. Key is cgroup subsystem name @@ -156,6 +158,12 @@ func (c *linuxContainer) State() (*State, error) { return c.currentState() } +func (c *linuxContainer) OCIState() (*specs.State, error) { + c.m.Lock() + defer c.m.Unlock() + return c.currentOCIState() +} + func (c *linuxContainer) Processes() ([]int, error) { pids, err := c.cgroupManager.GetAllPids() if err != nil { @@ -225,17 +233,13 @@ func (c *linuxContainer) Set(config configs.Config) error { func (c *linuxContainer) Start(process *Process) error { c.m.Lock() defer c.m.Unlock() - status, err := c.currentStatus() - if err != nil { - return err - } - if status == Stopped { + if process.Init { if err := c.createExecFifo(); err != nil { return err } } - if err := c.start(process, status == Stopped); err != nil { - if status == Stopped { + if err := c.start(process); err != nil { + if process.Init { c.deleteExecFifo() } return err @@ -244,17 +248,10 @@ func (c *linuxContainer) Start(process *Process) error { } func (c *linuxContainer) Run(process *Process) error { - c.m.Lock() - status, err := c.currentStatus() - if err != nil { - c.m.Unlock() - return err - } - c.m.Unlock() if err := c.Start(process); err != nil { return err } - if status == Stopped { + if process.Init { return c.exec() } return nil @@ -335,8 +332,8 @@ type openResult struct { err error } -func (c *linuxContainer) start(process *Process, isInit bool) error { - parent, err := c.newParentProcess(process, isInit) +func (c *linuxContainer) start(process *Process) error { + parent, err := c.newParentProcess(process) if err != nil { return newSystemErrorWithCause(err, "creating new parent process") } @@ -349,7 +346,7 @@ func (c *linuxContainer) start(process *Process, isInit bool) error { } // generate a timestamp indicating when the container was started c.created = time.Now().UTC() - if isInit { + if process.Init { c.state = &createdState{ c: c, } @@ -360,13 +357,9 @@ func (c *linuxContainer) start(process *Process, isInit bool) error { c.initProcessStartTime = state.InitProcessStartTime if c.config.Hooks != nil { - bundle, annotations := utils.Annotations(c.config.Labels) - s := configs.HookState{ - Version: c.config.Version, - ID: c.id, - Pid: parent.pid(), - Bundle: bundle, - Annotations: annotations, + s, err := c.currentOCIState() + if err != nil { + return err } for i, hook := range c.config.Hooks.Poststart { if err := hook.Run(s); err != nil { @@ -385,10 +378,18 @@ func (c *linuxContainer) Signal(s os.Signal, all bool) error { if all { return signalAllProcesses(c.cgroupManager, s) } - if err := c.initProcess.signal(s); err != nil { - return newSystemErrorWithCause(err, "signaling init process") + status, err := c.currentStatus() + if err != nil { + return err } - return nil + // to avoid a PID reuse attack + if status == Running || status == Created || status == Paused { + if err := c.initProcess.signal(s); err != nil { + return newSystemErrorWithCause(err, "signaling init process") + } + return nil + } + return newGenericError(fmt.Errorf("container not running"), ContainerNotRunning) } func (c *linuxContainer) createExecFifo() error { @@ -411,10 +412,7 @@ func (c *linuxContainer) createExecFifo() error { return err } unix.Umask(oldMask) - if err := os.Chown(fifoName, rootuid, rootgid); err != nil { - return err - } - return nil + return os.Chown(fifoName, rootuid, rootgid) } func (c *linuxContainer) deleteExecFifo() { @@ -439,7 +437,7 @@ func (c *linuxContainer) includeExecFifo(cmd *exec.Cmd) error { return nil } -func (c *linuxContainer) newParentProcess(p *Process, doInit bool) (parentProcess, error) { +func (c *linuxContainer) newParentProcess(p *Process) (parentProcess, error) { parentPipe, childPipe, err := utils.NewSockPair("init") if err != nil { return nil, newSystemErrorWithCause(err, "creating new init pipe") @@ -448,7 +446,7 @@ func (c *linuxContainer) newParentProcess(p *Process, doInit bool) (parentProces if err != nil { return nil, newSystemErrorWithCause(err, "creating new command template") } - if !doInit { + if !p.Init { return c.newSetnsProcess(p, cmd, parentPipe, childPipe) } @@ -473,6 +471,7 @@ func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec. if cmd.SysProcAttr == nil { cmd.SysProcAttr = &syscall.SysProcAttr{} } + cmd.Env = append(cmd.Env, fmt.Sprintf("GOMAXPROCS=%s", os.Getenv("GOMAXPROCS"))) cmd.ExtraFiles = append(cmd.ExtraFiles, p.ExtraFiles...) if p.ConsoleSocket != nil { cmd.ExtraFiles = append(cmd.ExtraFiles, p.ConsoleSocket) @@ -483,6 +482,7 @@ func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec. cmd.ExtraFiles = append(cmd.ExtraFiles, childPipe) cmd.Env = append(cmd.Env, fmt.Sprintf("_LIBCONTAINER_INITPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-1), + fmt.Sprintf("_LIBCONTAINER_STATEDIR=%s", c.root), ) // NOTE: when running a container with no PID namespace and the parent process spawning the container is // PID1 the pdeathsig is being delivered to the container's init process by the kernel for some reason @@ -506,7 +506,7 @@ func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, c if err != nil { return nil, err } - return &initProcess{ + init := &initProcess{ cmd: cmd, childPipe: childPipe, parentPipe: parentPipe, @@ -517,7 +517,9 @@ func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, c process: p, bootstrapData: data, sharePidns: sharePidns, - }, nil + } + c.initProcess = init + return init, nil } func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*setnsProcess, error) { @@ -533,14 +535,15 @@ func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, return nil, err } return &setnsProcess{ - cmd: cmd, - cgroupPaths: c.cgroupManager.GetPaths(), - intelRdtPath: state.IntelRdtPath, - childPipe: childPipe, - parentPipe: parentPipe, - config: c.newInitConfig(p), - process: p, - bootstrapData: data, + cmd: cmd, + cgroupPaths: c.cgroupManager.GetPaths(), + rootlessCgroups: c.config.RootlessCgroups, + intelRdtPath: state.IntelRdtPath, + childPipe: childPipe, + parentPipe: parentPipe, + config: c.newInitConfig(p), + process: p, + bootstrapData: data, }, nil } @@ -556,7 +559,8 @@ func (c *linuxContainer) newInitConfig(process *Process) *initConfig { PassedFilesCount: len(process.ExtraFiles), ContainerId: c.ID(), NoNewPrivileges: c.config.NoNewPrivileges, - Rootless: c.config.Rootless, + RootlessEUID: c.config.RootlessEUID, + RootlessCgroups: c.config.RootlessCgroups, AppArmorProfile: c.config.AppArmorProfile, ProcessLabel: c.config.ProcessLabel, Rlimits: c.config.Rlimits, @@ -624,16 +628,16 @@ func (c *linuxContainer) Resume() error { func (c *linuxContainer) NotifyOOM() (<-chan struct{}, error) { // XXX(cyphar): This requires cgroups. - if c.config.Rootless { - return nil, fmt.Errorf("cannot get OOM notifications from rootless container") + if c.config.RootlessCgroups { + logrus.Warn("getting OOM notifications may fail if you don't have the full access to cgroups") } return notifyOnOOM(c.cgroupManager.GetPaths()) } func (c *linuxContainer) NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error) { // XXX(cyphar): This requires cgroups. - if c.config.Rootless { - return nil, fmt.Errorf("cannot get memory pressure notifications from rootless container") + if c.config.RootlessCgroups { + logrus.Warn("getting memory pressure notifications may fail if you don't have the full access to cgroups") } return notifyMemoryPressure(c.cgroupManager.GetPaths(), level) } @@ -668,7 +672,7 @@ func (c *linuxContainer) checkCriuFeatures(criuOpts *CriuOpts, rpcOpts *criurpc. Features: criuFeat, } - err := c.criuSwrk(nil, req, criuOpts, false) + err := c.criuSwrk(nil, req, criuOpts, false, nil) if err != nil { logrus.Debugf("%s", err) return fmt.Errorf("CRIU feature check failed") @@ -781,7 +785,7 @@ func (c *linuxContainer) checkCriuVersion(minVersion int) error { Type: &t, } - err := c.criuSwrk(nil, req, nil, false) + err := c.criuSwrk(nil, req, nil, false, nil) if err != nil { return fmt.Errorf("CRIU version check failed: %s", err) } @@ -873,16 +877,41 @@ func waitForCriuLazyServer(r *os.File, status string) error { return nil } +func (c *linuxContainer) handleCriuConfigurationFile(rpcOpts *criurpc.CriuOpts) { + // CRIU will evaluate a configuration starting with release 3.11. + // Settings in the configuration file will overwrite RPC settings. + // Look for annotations. The annotation 'org.criu.config' + // specifies if CRIU should use a different, container specific + // configuration file. + _, annotations := utils.Annotations(c.config.Labels) + configFile, exists := annotations["org.criu.config"] + if exists { + // If the annotation 'org.criu.config' exists and is set + // to a non-empty string, tell CRIU to use that as a + // configuration file. If the file does not exist, CRIU + // will just ignore it. + if configFile != "" { + rpcOpts.ConfigFile = proto.String(configFile) + } + // If 'org.criu.config' exists and is set to an empty + // string, a runc specific CRIU configuration file will + // be not set at all. + } else { + // If the mentioned annotation has not been found, specify + // a default CRIU configuration file. + rpcOpts.ConfigFile = proto.String("/etc/criu/runc.conf") + } +} + func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error { c.m.Lock() defer c.m.Unlock() + // Checkpoint is unlikely to work if os.Geteuid() != 0 || system.RunningInUserNS(). + // (CLI prints a warning) // TODO(avagin): Figure out how to make this work nicely. CRIU 2.0 has // support for doing unprivileged dumps, but the setup of // rootless containers might make this complicated. - if c.config.Rootless { - return fmt.Errorf("cannot checkpoint a rootless container") - } // criu 1.5.2 => 10502 if err := c.checkCriuVersion(10502); err != nil { @@ -939,6 +968,35 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error { LazyPages: proto.Bool(criuOpts.LazyPages), } + c.handleCriuConfigurationFile(&rpcOpts) + + // If the container is running in a network namespace and has + // a path to the network namespace configured, we will dump + // that network namespace as an external namespace and we + // will expect that the namespace exists during restore. + // This basically means that CRIU will ignore the namespace + // and expect to be setup correctly. + nsPath := c.config.Namespaces.PathOf(configs.NEWNET) + if nsPath != "" { + // For this to work we need at least criu 3.11.0 => 31100. + // As there was already a successful version check we will + // not error out if it fails. runc will just behave as it used + // to do and ignore external network namespaces. + err := c.checkCriuVersion(31100) + if err == nil { + // CRIU expects the information about an external namespace + // like this: --external net[]: + // This is always 'extRootNetNS'. + var netns syscall.Stat_t + err = syscall.Stat(nsPath, &netns) + if err != nil { + return err + } + criuExternal := fmt.Sprintf("net[%d]:extRootNetNS", netns.Ino) + rpcOpts.External = append(rpcOpts.External, criuExternal) + } + } + fcg := c.cgroupManager.GetPaths()["freezer"] if fcg != "" { rpcOpts.FreezeCgroup = proto.String(fcg) @@ -1043,7 +1101,7 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error { } } - err = c.criuSwrk(nil, req, criuOpts, false) + err = c.criuSwrk(nil, req, criuOpts, false, nil) if err != nil { return err } @@ -1083,15 +1141,85 @@ func (c *linuxContainer) restoreNetwork(req *criurpc.CriuReq, criuOpts *CriuOpts } } +// makeCriuRestoreMountpoints makes the actual mountpoints for the +// restore using CRIU. This function is inspired from the code in +// rootfs_linux.go +func (c *linuxContainer) makeCriuRestoreMountpoints(m *configs.Mount) error { + switch m.Device { + case "cgroup": + // Do nothing for cgroup, CRIU should handle it + case "bind": + // The prepareBindMount() function checks if source + // exists. So it cannot be used for other filesystem types. + if err := prepareBindMount(m, c.config.Rootfs); err != nil { + return err + } + default: + // for all other file-systems just create the mountpoints + dest, err := securejoin.SecureJoin(c.config.Rootfs, m.Destination) + if err != nil { + return err + } + if err := checkMountDestination(c.config.Rootfs, dest); err != nil { + return err + } + m.Destination = dest + if err := os.MkdirAll(dest, 0755); err != nil { + return err + } + } + return nil +} + +// isPathInPrefixList is a small function for CRIU restore to make sure +// mountpoints, which are on a tmpfs, are not created in the roofs +func isPathInPrefixList(path string, prefix []string) bool { + for _, p := range prefix { + if strings.HasPrefix(path, p+"/") { + return false + } + } + return true +} + +// prepareCriuRestoreMounts tries to set up the rootfs of the +// container to be restored in the same way runc does it for +// initial container creation. Even for a read-only rootfs container +// runc modifies the rootfs to add mountpoints which do not exist. +// This function also creates missing mountpoints as long as they +// are not on top of a tmpfs, as CRIU will restore tmpfs content anyway. +func (c *linuxContainer) prepareCriuRestoreMounts(mounts []*configs.Mount) error { + // First get a list of a all tmpfs mounts + tmpfs := []string{} + for _, m := range mounts { + switch m.Device { + case "tmpfs": + tmpfs = append(tmpfs, m.Destination) + } + } + // Now go through all mounts and create the mountpoints + // if the mountpoints are not on a tmpfs, as CRIU will + // restore the complete tmpfs content from its checkpoint. + for _, m := range mounts { + if isPathInPrefixList(m.Destination, tmpfs) { + if err := c.makeCriuRestoreMountpoints(m); err != nil { + return err + } + } + } + return nil +} + func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error { c.m.Lock() defer c.m.Unlock() + var extraFiles []*os.File + + // Restore is unlikely to work if os.Geteuid() != 0 || system.RunningInUserNS(). + // (CLI prints a warning) // TODO(avagin): Figure out how to make this work nicely. CRIU doesn't have // support for unprivileged restore at the moment. - if c.config.Rootless { - return fmt.Errorf("cannot restore a rootless container") - } // criu 1.5.2 => 10502 if err := c.checkCriuVersion(10502); err != nil { @@ -1161,6 +1289,46 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error { }, } + c.handleCriuConfigurationFile(req.Opts) + + // Same as during checkpointing. If the container has a specific network namespace + // assigned to it, this now expects that the checkpoint will be restored in a + // already created network namespace. + nsPath := c.config.Namespaces.PathOf(configs.NEWNET) + if nsPath != "" { + // For this to work we need at least criu 3.11.0 => 31100. + // As there was already a successful version check we will + // not error out if it fails. runc will just behave as it used + // to do and ignore external network namespaces. + err := c.checkCriuVersion(31100) + if err == nil { + // CRIU wants the information about an existing network namespace + // like this: --inherit-fd fd[]: + // The needs to be the same as during checkpointing. + // We are always using 'extRootNetNS' as the key in this. + netns, err := os.Open(nsPath) + defer netns.Close() + if err != nil { + logrus.Errorf("If a specific network namespace is defined it must exist: %s", err) + return fmt.Errorf("Requested network namespace %v does not exist", nsPath) + } + inheritFd := new(criurpc.InheritFd) + inheritFd.Key = proto.String("extRootNetNS") + // The offset of four is necessary because 0, 1, 2 and 3 is already + // used by stdin, stdout, stderr, 'criu swrk' socket. + inheritFd.Fd = proto.Int32(int32(4 + len(extraFiles))) + req.Opts.InheritFd = append(req.Opts.InheritFd, inheritFd) + // All open FDs need to be transferred to CRIU via extraFiles + extraFiles = append(extraFiles, netns) + } + } + + // This will modify the rootfs of the container in the same way runc + // modifies the container during initial creation. + if err := c.prepareCriuRestoreMounts(c.config.Mounts); err != nil { + return err + } + for _, m := range c.config.Mounts { switch m.Device { case "bind": @@ -1219,7 +1387,7 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error { req.Opts.InheritFd = append(req.Opts.InheritFd, inheritFd) } } - return c.criuSwrk(process, req, criuOpts, true) + return c.criuSwrk(process, req, criuOpts, true, extraFiles) } func (c *linuxContainer) criuApplyCgroups(pid int, req *criurpc.CriuReq) error { @@ -1249,7 +1417,7 @@ func (c *linuxContainer) criuApplyCgroups(pid int, req *criurpc.CriuReq) error { return nil } -func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts *CriuOpts, applyCgroups bool) error { +func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts *CriuOpts, applyCgroups bool, extraFiles []*os.File) error { fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_SEQPACKET|unix.SOCK_CLOEXEC, 0) if err != nil { return err @@ -1290,6 +1458,9 @@ func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts * cmd.Stderr = process.Stderr } cmd.ExtraFiles = append(cmd.ExtraFiles, criuServer) + if extraFiles != nil { + cmd.ExtraFiles = append(cmd.ExtraFiles, extraFiles...) + } if err := cmd.Start(); err != nil { return err @@ -1486,14 +1657,11 @@ func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Proc } case notify.GetScript() == "setup-namespaces": if c.config.Hooks != nil { - bundle, annotations := utils.Annotations(c.config.Labels) - s := configs.HookState{ - Version: c.config.Version, - ID: c.id, - Pid: int(notify.GetPid()), - Bundle: bundle, - Annotations: annotations, + s, err := c.currentOCIState() + if err != nil { + return nil } + s.Pid = int(notify.GetPid()) for i, hook := range c.config.Hooks.Prestart { if err := hook.Run(s); err != nil { return newSystemErrorWithCausef(err, "running prestart hook %d", i) @@ -1664,7 +1832,7 @@ func (c *linuxContainer) currentState() (*State, error) { InitProcessStartTime: startTime, Created: c.created, }, - Rootless: c.config.Rootless, + Rootless: c.config.RootlessEUID && c.config.RootlessCgroups, CgroupPaths: c.cgroupManager.GetPaths(), IntelRdtPath: intelRdtPath, NamespacePaths: make(map[configs.NamespaceType]string), @@ -1687,11 +1855,31 @@ func (c *linuxContainer) currentState() (*State, error) { return state, nil } +func (c *linuxContainer) currentOCIState() (*specs.State, error) { + bundle, annotations := utils.Annotations(c.config.Labels) + state := &specs.State{ + Version: specs.Version, + ID: c.ID(), + Bundle: bundle, + Annotations: annotations, + } + status, err := c.currentStatus() + if err != nil { + return nil, err + } + state.Status = status.String() + if status != Stopped { + if c.initProcess != nil { + state.Pid = c.initProcess.pid() + } + } + return state, nil +} + // orderNamespacePaths sorts namespace paths into a list of paths that we // can setns in order. func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceType]string) ([]string, error) { paths := []string{} - for _, ns := range configs.NamespaceTypes() { // Remove namespaces that we don't need to join. @@ -1765,7 +1953,7 @@ func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Na if !joinExistingUser { // write uid mappings if len(c.config.UidMappings) > 0 { - if c.config.Rootless && c.newuidmapPath != "" { + if c.config.RootlessEUID && c.newuidmapPath != "" { r.AddData(&Bytemsg{ Type: UidmapPathAttr, Value: []byte(c.newuidmapPath), @@ -1791,39 +1979,33 @@ func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Na Type: GidmapAttr, Value: b, }) - if c.config.Rootless && c.newgidmapPath != "" { + if c.config.RootlessEUID && c.newgidmapPath != "" { r.AddData(&Bytemsg{ Type: GidmapPathAttr, Value: []byte(c.newgidmapPath), }) } - // The following only applies if we are root. - if !c.config.Rootless { - // check if we have CAP_SETGID to setgroup properly - pid, err := capability.NewPid(0) - if err != nil { - return nil, err - } - if !pid.Get(capability.EFFECTIVE, capability.CAP_SETGID) { - r.AddData(&Boolmsg{ - Type: SetgroupAttr, - Value: true, - }) - } + if requiresRootOrMappingTool(c.config) { + r.AddData(&Boolmsg{ + Type: SetgroupAttr, + Value: true, + }) } } } - // write oom_score_adj - r.AddData(&Bytemsg{ - Type: OomScoreAdjAttr, - Value: []byte(fmt.Sprintf("%d", c.config.OomScoreAdj)), - }) + if c.config.OomScoreAdj != nil { + // write oom_score_adj + r.AddData(&Bytemsg{ + Type: OomScoreAdjAttr, + Value: []byte(fmt.Sprintf("%d", *c.config.OomScoreAdj)), + }) + } // write rootless r.AddData(&Boolmsg{ - Type: RootlessAttr, - Value: c.config.Rootless, + Type: RootlessEUIDAttr, + Value: c.config.RootlessEUID, }) return bytes.NewReader(r.Serialize()), nil @@ -1843,3 +2025,10 @@ func ignoreTerminateErrors(err error) error { } return err } + +func requiresRootOrMappingTool(c *configs.Config) bool { + gidMap := []configs.IDMap{ + {ContainerID: 0, HostID: os.Getegid(), Size: 1}, + } + return !reflect.DeepEqual(c.GidMappings, gidMap) +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/factory_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/factory_linux.go index 7d53d5e04d..e35957c314 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/factory_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/factory_linux.go @@ -11,6 +11,7 @@ import ( "runtime/debug" "strconv" + "github.com/cyphar/filepath-securejoin" "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups/fs" "github.com/opencontainers/runc/libcontainer/cgroups/systemd" @@ -59,9 +60,9 @@ func SystemdCgroups(l *LinuxFactory) error { return nil } -// Cgroupfs is an options func to configure a LinuxFactory to return -// containers that use the native cgroups filesystem implementation to -// create and manage cgroups. +// Cgroupfs is an options func to configure a LinuxFactory to return containers +// that use the native cgroups filesystem implementation to create and manage +// cgroups. func Cgroupfs(l *LinuxFactory) error { l.NewCgroupsManager = func(config *configs.Cgroup, paths map[string]string) cgroups.Manager { return &fs.Manager{ @@ -72,9 +73,26 @@ func Cgroupfs(l *LinuxFactory) error { return nil } +// RootlessCgroupfs is an options func to configure a LinuxFactory to return +// containers that use the native cgroups filesystem implementation to create +// and manage cgroups. The difference between RootlessCgroupfs and Cgroupfs is +// that RootlessCgroupfs can transparently handle permission errors that occur +// during rootless container (including euid=0 in userns) setup (while still allowing cgroup usage if +// they've been set up properly). +func RootlessCgroupfs(l *LinuxFactory) error { + l.NewCgroupsManager = func(config *configs.Cgroup, paths map[string]string) cgroups.Manager { + return &fs.Manager{ + Cgroups: config, + Rootless: true, + Paths: paths, + } + } + return nil +} + // IntelRdtfs is an options func to configure a LinuxFactory to return // containers that use the Intel RDT "resource control" filesystem to -// create and manage Intel Xeon platform shared resources (e.g., L3 cache). +// create and manage Intel RDT resources (e.g., L3 cache, memory bandwidth). func IntelRdtFs(l *LinuxFactory) error { l.NewIntelRdtManager = func(config *configs.Config, id string, path string) intelrdt.Manager { return &intelrdt.IntelRdtManager{ @@ -178,7 +196,10 @@ func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, err if err := l.Validator.Validate(config); err != nil { return nil, newGenericError(err, ConfigInvalid) } - containerRoot := filepath.Join(l.Root, id) + containerRoot, err := securejoin.SecureJoin(l.Root, id) + if err != nil { + return nil, err + } if _, err := os.Stat(containerRoot); err == nil { return nil, newGenericError(fmt.Errorf("container with id exists: %v", id), IdInUse) } else if !os.IsNotExist(err) { @@ -201,7 +222,7 @@ func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, err newgidmapPath: l.NewgidmapPath, cgroupManager: l.NewCgroupsManager(config.Cgroups, nil), } - if intelrdt.IsEnabled() { + if intelrdt.IsCatEnabled() || intelrdt.IsMbaEnabled() { c.intelRdtManager = l.NewIntelRdtManager(config, id, "") } c.state = &stoppedState{c: c} @@ -212,7 +233,14 @@ func (l *LinuxFactory) Load(id string) (Container, error) { if l.Root == "" { return nil, newGenericError(fmt.Errorf("invalid root"), ConfigInvalid) } - containerRoot := filepath.Join(l.Root, id) + //when load, we need to check id is valid or not. + if err := l.validateID(id); err != nil { + return nil, err + } + containerRoot, err := securejoin.SecureJoin(l.Root, id) + if err != nil { + return nil, err + } state, err := l.loadState(containerRoot, id) if err != nil { return nil, err @@ -240,7 +268,7 @@ func (l *LinuxFactory) Load(id string) (Container, error) { if err := c.refreshState(); err != nil { return nil, err } - if intelrdt.IsEnabled() { + if intelrdt.IsCatEnabled() || intelrdt.IsMbaEnabled() { c.intelRdtManager = l.NewIntelRdtManager(&state.Config, id, state.IntelRdtPath) } return c, nil @@ -322,7 +350,11 @@ func (l *LinuxFactory) StartInitialization() (err error) { } func (l *LinuxFactory) loadState(root, id string) (*State, error) { - f, err := os.Open(filepath.Join(root, stateFilename)) + stateFilePath, err := securejoin.SecureJoin(root, stateFilename) + if err != nil { + return nil, err + } + f, err := os.Open(stateFilePath) if err != nil { if os.IsNotExist(err) { return nil, newGenericError(fmt.Errorf("container %q does not exist", id), ContainerNotExists) @@ -338,7 +370,7 @@ func (l *LinuxFactory) loadState(root, id string) (*State, error) { } func (l *LinuxFactory) validateID(id string) error { - if !idRegex.MatchString(id) { + if !idRegex.MatchString(id) || string(os.PathSeparator)+id != utils.CleanPath(string(os.PathSeparator)+id) { return newGenericError(fmt.Errorf("invalid id format: %v", id), InvalidIdFormat) } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/init_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/init_linux.go index 2770be3071..cd7ff67a70 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/init_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/init_linux.go @@ -6,6 +6,7 @@ import ( "encoding/json" "fmt" "io" + "io/ioutil" "net" "os" "strings" @@ -20,6 +21,7 @@ import ( "github.com/opencontainers/runc/libcontainer/system" "github.com/opencontainers/runc/libcontainer/user" "github.com/opencontainers/runc/libcontainer/utils" + "github.com/pkg/errors" "github.com/sirupsen/logrus" "github.com/vishvananda/netlink" ) @@ -64,7 +66,8 @@ type initConfig struct { CreateConsole bool `json:"create_console"` ConsoleWidth uint16 `json:"console_width"` ConsoleHeight uint16 `json:"console_height"` - Rootless bool `json:"rootless"` + RootlessEUID bool `json:"rootless_euid,omitempty"` + RootlessCgroups bool `json:"rootless_cgroups,omitempty"` } type initer interface { @@ -121,7 +124,7 @@ func finalizeNamespace(config *initConfig) error { // inherited are marked close-on-exec so they stay out of the // container if err := utils.CloseExecFrom(config.PassedFilesCount + 3); err != nil { - return err + return errors.Wrap(err, "close exec fds") } capabilities := &configs.Capabilities{} @@ -136,20 +139,20 @@ func finalizeNamespace(config *initConfig) error { } // drop capabilities in bounding set before changing user if err := w.ApplyBoundingSet(); err != nil { - return err + return errors.Wrap(err, "apply bounding set") } // preserve existing capabilities while we change users if err := system.SetKeepCaps(); err != nil { - return err + return errors.Wrap(err, "set keep caps") } if err := setupUser(config); err != nil { - return err + return errors.Wrap(err, "setup user") } if err := system.ClearKeepCaps(); err != nil { - return err + return errors.Wrap(err, "clear keep caps") } if err := w.ApplyCaps(); err != nil { - return err + return errors.Wrap(err, "apply caps") } if config.Cwd != "" { if err := unix.Chdir(config.Cwd); err != nil { @@ -217,11 +220,7 @@ func syncParentReady(pipe io.ReadWriter) error { } // Wait for parent to give the all-clear. - if err := readSync(pipe, procRun); err != nil { - return err - } - - return nil + return readSync(pipe, procRun) } // syncParentHooks sends to the given pipe a JSON payload which indicates that @@ -234,11 +233,7 @@ func syncParentHooks(pipe io.ReadWriter) error { } // Wait for parent to give the all-clear. - if err := readSync(pipe, procResume); err != nil { - return err - } - - return nil + return readSync(pipe, procResume) } // setupUser changes the groups, gid, and uid for the user inside the container @@ -282,7 +277,7 @@ func setupUser(config *initConfig) error { return fmt.Errorf("cannot set gid to unmapped user in user namespace") } - if config.Rootless { + if config.RootlessEUID { // We cannot set any additional groups in a rootless container and thus // we bail if the user asked us to do so. TODO: We currently can't do // this check earlier, but if libcontainer.Process.User was typesafe @@ -298,11 +293,18 @@ func setupUser(config *initConfig) error { return err } + setgroups, err := ioutil.ReadFile("/proc/self/setgroups") + if err != nil && !os.IsNotExist(err) { + return err + } + // This isn't allowed in an unprivileged user namespace since Linux 3.19. // There's nothing we can do about /etc/group entries, so we silently // ignore setting groups here (since the user didn't explicitly ask us to // set the group). - if !config.Rootless { + allowSupGroups := !config.RootlessEUID && strings.TrimSpace(string(setgroups)) != "deny" + + if allowSupGroups { suppGroups := append(execUser.Sgids, addGroups...) if err := unix.Setgroups(suppGroups); err != nil { return err diff --git a/vendor/github.com/opencontainers/runc/libcontainer/intelrdt/intelrdt.go b/vendor/github.com/opencontainers/runc/libcontainer/intelrdt/intelrdt.go index 487c630af6..0071ce7557 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/intelrdt/intelrdt.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/intelrdt/intelrdt.go @@ -16,20 +16,26 @@ import ( ) /* - * About Intel RDT/CAT feature: + * About Intel RDT features: * Intel platforms with new Xeon CPU support Resource Director Technology (RDT). - * Intel Cache Allocation Technology (CAT) is a sub-feature of RDT. Currently L3 - * Cache is the only resource that is supported in RDT. + * Cache Allocation Technology (CAT) and Memory Bandwidth Allocation (MBA) are + * two sub-features of RDT. * - * This feature provides a way for the software to restrict cache allocation to a - * defined 'subset' of L3 cache which may be overlapping with other 'subsets'. - * The different subsets are identified by class of service (CLOS) and each CLOS - * has a capacity bitmask (CBM). + * Cache Allocation Technology (CAT) provides a way for the software to restrict + * cache allocation to a defined 'subset' of L3 cache which may be overlapping + * with other 'subsets'. The different subsets are identified by class of + * service (CLOS) and each CLOS has a capacity bitmask (CBM). * - * For more information about Intel RDT/CAT can be found in the section 17.17 - * of Intel Software Developer Manual. + * Memory Bandwidth Allocation (MBA) provides indirect and approximate throttle + * over memory bandwidth for the software. A user controls the resource by + * indicating the percentage of maximum memory bandwidth or memory bandwidth + * limit in MBps unit if MBA Software Controller is enabled. * - * About Intel RDT/CAT kernel interface: + * More details about Intel RDT CAT and MBA can be found in the section 17.18 + * of Intel Software Developer Manual: + * https://software.intel.com/en-us/articles/intel-sdm + * + * About Intel RDT kernel interface: * In Linux 4.10 kernel or newer, the interface is defined and exposed via * "resource control" filesystem, which is a "cgroup-like" interface. * @@ -37,59 +43,98 @@ import ( * interfaces in a container. But unlike cgroups' hierarchy, it has single level * filesystem layout. * + * CAT and MBA features are introduced in Linux 4.10 and 4.12 kernel via + * "resource control" filesystem. + * * Intel RDT "resource control" filesystem hierarchy: * mount -t resctrl resctrl /sys/fs/resctrl * tree /sys/fs/resctrl * /sys/fs/resctrl/ * |-- info * | |-- L3 - * | |-- cbm_mask - * | |-- min_cbm_bits + * | | |-- cbm_mask + * | | |-- min_cbm_bits + * | | |-- num_closids + * | |-- MB + * | |-- bandwidth_gran + * | |-- delay_linear + * | |-- min_bandwidth * | |-- num_closids - * |-- cpus + * |-- ... * |-- schemata * |-- tasks * |-- - * |-- cpus + * |-- ... * |-- schemata * |-- tasks * - * For runc, we can make use of `tasks` and `schemata` configuration for L3 cache - * resource constraints. + * For runc, we can make use of `tasks` and `schemata` configuration for L3 + * cache and memory bandwidth resources constraints. * - * The file `tasks` has a list of tasks that belongs to this group (e.g., + * The file `tasks` has a list of tasks that belongs to this group (e.g., * " group). Tasks can be added to a group by writing the task ID - * to the "tasks" file (which will automatically remove them from the previous + * to the "tasks" file (which will automatically remove them from the previous * group to which they belonged). New tasks created by fork(2) and clone(2) are - * added to the same group as their parent. If a pid is not in any sub group, it is - * in root group. + * added to the same group as their parent. * - * The file `schemata` has allocation bitmasks/values for L3 cache on each socket, - * which contains L3 cache id and capacity bitmask (CBM). + * The file `schemata` has a list of all the resources available to this group. + * Each resource (L3 cache, memory bandwidth) has its own line and format. + * + * L3 cache schema: + * It has allocation bitmasks/values for L3 cache on each socket, which + * contains L3 cache id and capacity bitmask (CBM). * Format: "L3:=;=;..." - * For example, on a two-socket machine, L3's schema line could be `L3:0=ff;1=c0` + * For example, on a two-socket machine, the schema line could be "L3:0=ff;1=c0" * which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM is 0xc0. * * The valid L3 cache CBM is a *contiguous bits set* and number of bits that can * be set is less than the max bit. The max bits in the CBM is varied among - * supported Intel Xeon platforms. In Intel RDT "resource control" filesystem - * layout, the CBM in a group should be a subset of the CBM in root. Kernel will - * check if it is valid when writing. e.g., 0xfffff in root indicates the max bits - * of CBM is 20 bits, which mapping to entire L3 cache capacity. Some valid CBM - * values to set in a group: 0xf, 0xf0, 0x3ff, 0x1f00 and etc. + * supported Intel CPU models. Kernel will check if it is valid when writing. + * e.g., default value 0xfffff in root indicates the max bits of CBM is 20 + * bits, which mapping to entire L3 cache capacity. Some valid CBM values to + * set in a group: 0xf, 0xf0, 0x3ff, 0x1f00 and etc. + * + * Memory bandwidth schema: + * It has allocation values for memory bandwidth on each socket, which contains + * L3 cache id and memory bandwidth. + * Format: "MB:=bandwidth0;=bandwidth1;..." + * For example, on a two-socket machine, the schema line could be "MB:0=20;1=70" + * + * The minimum bandwidth percentage value for each CPU model is predefined and + * can be looked up through "info/MB/min_bandwidth". The bandwidth granularity + * that is allocated is also dependent on the CPU model and can be looked up at + * "info/MB/bandwidth_gran". The available bandwidth control steps are: + * min_bw + N * bw_gran. Intermediate values are rounded to the next control + * step available on the hardware. + * + * If MBA Software Controller is enabled through mount option "-o mba_MBps": + * mount -t resctrl resctrl -o mba_MBps /sys/fs/resctrl + * We could specify memory bandwidth in "MBps" (Mega Bytes per second) unit + * instead of "percentages". The kernel underneath would use a software feedback + * mechanism or a "Software Controller" which reads the actual bandwidth using + * MBM counters and adjust the memory bandwidth percentages to ensure: + * "actual memory bandwidth < user specified memory bandwidth". + * + * For example, on a two-socket machine, the schema line could be + * "MB:0=5000;1=7000" which means 5000 MBps memory bandwidth limit on socket 0 + * and 7000 MBps memory bandwidth limit on socket 1. * - * For more information about Intel RDT/CAT kernel interface: + * For more information about Intel RDT kernel interface: * https://www.kernel.org/doc/Documentation/x86/intel_rdt_ui.txt * * An example for runc: * Consider a two-socket machine with two L3 caches where the default CBM is - * 0xfffff and the max CBM length is 20 bits. With this configuration, tasks - * inside the container only have access to the "upper" 80% of L3 cache id 0 and - * the "lower" 50% L3 cache id 1: + * 0x7ff and the max CBM length is 11 bits, and minimum memory bandwidth of 10% + * with a memory bandwidth granularity of 10%. + * + * Tasks inside the container only have access to the "upper" 7/11 of L3 cache + * on socket 0 and the "lower" 5/11 L3 cache on socket 1, and may use a + * maximum memory bandwidth of 20% on socket 0 and 70% on socket 1. * * "linux": { - * "intelRdt": { - * "l3CacheSchema": "L3:0=ffff0;1=3ff" + * "intelRdt": { + * "l3CacheSchema": "L3:0=7f0;1=1f", + * "memBwSchema": "MB:0=20;1=70" * } * } */ @@ -129,8 +174,12 @@ var ( intelRdtRoot string intelRdtRootLock sync.Mutex - // The flag to indicate if Intel RDT is supported - isEnabled bool + // The flag to indicate if Intel RDT/CAT is enabled + isCatEnabled bool + // The flag to indicate if Intel RDT/MBA is enabled + isMbaEnabled bool + // The flag to indicate if Intel RDT/MBA Software Controller is enabled + isMbaScEnabled bool ) type intelRdtData struct { @@ -139,19 +188,40 @@ type intelRdtData struct { pid int } -// Check if Intel RDT is enabled in init() +// Check if Intel RDT sub-features are enabled in init() func init() { - // 1. Check if hardware and kernel support Intel RDT/CAT feature - // "cat_l3" flag is set if supported - isFlagSet, err := parseCpuInfoFile("/proc/cpuinfo") - if !isFlagSet || err != nil { - isEnabled = false + // 1. Check if hardware and kernel support Intel RDT sub-features + // "cat_l3" flag for CAT and "mba" flag for MBA + isCatFlagSet, isMbaFlagSet, err := parseCpuInfoFile("/proc/cpuinfo") + if err != nil { return } // 2. Check if Intel RDT "resource control" filesystem is mounted // The user guarantees to mount the filesystem - isEnabled = isIntelRdtMounted() + if !isIntelRdtMounted() { + return + } + + // 3. Double check if Intel RDT sub-features are available in + // "resource control" filesystem. Intel RDT sub-features can be + // selectively disabled or enabled by kernel command line + // (e.g., rdt=!l3cat,mba) in 4.14 and newer kernel + if isCatFlagSet { + if _, err := os.Stat(filepath.Join(intelRdtRoot, "info", "L3")); err == nil { + isCatEnabled = true + } + } + if isMbaScEnabled { + // We confirm MBA Software Controller is enabled in step 2, + // MBA should be enabled because MBA Software Controller + // depends on MBA + isMbaEnabled = true + } else if isMbaFlagSet { + if _, err := os.Stat(filepath.Join(intelRdtRoot, "info", "MB")); err == nil { + isMbaEnabled = true + } + } } // Return the mount point path of Intel RDT "resource control" filesysem @@ -177,11 +247,16 @@ func findIntelRdtMountpointDir() (string, error) { } if postSeparatorFields[0] == "resctrl" { - // Check that the mount is properly formated. + // Check that the mount is properly formatted. if numPostFields < 3 { return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text) } + // Check if MBA Software Controller is enabled through mount option "-o mba_MBps" + if strings.Contains(postSeparatorFields[2], "mba_MBps") { + isMbaScEnabled = true + } + return fields[4], nil } } @@ -223,30 +298,40 @@ func isIntelRdtMounted() bool { return true } -func parseCpuInfoFile(path string) (bool, error) { +func parseCpuInfoFile(path string) (bool, bool, error) { + isCatFlagSet := false + isMbaFlagSet := false + f, err := os.Open(path) if err != nil { - return false, err + return false, false, err } defer f.Close() s := bufio.NewScanner(f) for s.Scan() { if err := s.Err(); err != nil { - return false, err + return false, false, err } - text := s.Text() - flags := strings.Split(text, " ") - - // "cat_l3" flag is set if Intel RDT/CAT is supported - for _, flag := range flags { - if flag == "cat_l3" { - return true, nil + line := s.Text() + + // Search "cat_l3" and "mba" flags in first "flags" line + if strings.Contains(line, "flags") { + flags := strings.Split(line, " ") + // "cat_l3" flag for CAT and "mba" flag for MBA + for _, flag := range flags { + switch flag { + case "cat_l3": + isCatFlagSet = true + case "mba": + isMbaFlagSet = true + } } + return isCatFlagSet, isMbaFlagSet, nil } } - return false, nil + return isCatFlagSet, isMbaFlagSet, nil } func parseUint(s string, base, bitSize int) (uint64, error) { @@ -292,30 +377,6 @@ func getIntelRdtParamString(path, file string) (string, error) { return strings.TrimSpace(string(contents)), nil } -func readTasksFile(dir string) ([]int, error) { - f, err := os.Open(filepath.Join(dir, IntelRdtTasks)) - if err != nil { - return nil, err - } - defer f.Close() - - var ( - s = bufio.NewScanner(f) - out = []int{} - ) - - for s.Scan() { - if t := s.Text(); t != "" { - pid, err := strconv.Atoi(t) - if err != nil { - return nil, err - } - out = append(out, pid) - } - } - return out, nil -} - func writeFile(dir, file, data string) error { if dir == "" { return fmt.Errorf("no such directory for %s", file) @@ -368,13 +429,64 @@ func getL3CacheInfo() (*L3CacheInfo, error) { return l3CacheInfo, nil } +// Get the read-only memory bandwidth information +func getMemBwInfo() (*MemBwInfo, error) { + memBwInfo := &MemBwInfo{} + + rootPath, err := getIntelRdtRoot() + if err != nil { + return memBwInfo, err + } + + path := filepath.Join(rootPath, "info", "MB") + bandwidthGran, err := getIntelRdtParamUint(path, "bandwidth_gran") + if err != nil { + return memBwInfo, err + } + delayLinear, err := getIntelRdtParamUint(path, "delay_linear") + if err != nil { + return memBwInfo, err + } + minBandwidth, err := getIntelRdtParamUint(path, "min_bandwidth") + if err != nil { + return memBwInfo, err + } + numClosids, err := getIntelRdtParamUint(path, "num_closids") + if err != nil { + return memBwInfo, err + } + + memBwInfo.BandwidthGran = bandwidthGran + memBwInfo.DelayLinear = delayLinear + memBwInfo.MinBandwidth = minBandwidth + memBwInfo.NumClosids = numClosids + + return memBwInfo, nil +} + +// Get diagnostics for last filesystem operation error from file info/last_cmd_status +func getLastCmdStatus() (string, error) { + rootPath, err := getIntelRdtRoot() + if err != nil { + return "", err + } + + path := filepath.Join(rootPath, "info") + lastCmdStatus, err := getIntelRdtParamString(path, "last_cmd_status") + if err != nil { + return "", err + } + + return lastCmdStatus, nil +} + // WriteIntelRdtTasks writes the specified pid into the "tasks" file func WriteIntelRdtTasks(dir string, pid int) error { if dir == "" { return fmt.Errorf("no such directory for %s", IntelRdtTasks) } - // Dont attach any pid if -1 is specified as a pid + // Don't attach any pid if -1 is specified as a pid if pid != -1 { if err := ioutil.WriteFile(filepath.Join(dir, IntelRdtTasks), []byte(strconv.Itoa(pid)), 0700); err != nil { return fmt.Errorf("failed to write %v to %v: %v", pid, IntelRdtTasks, err) @@ -383,9 +495,19 @@ func WriteIntelRdtTasks(dir string, pid int) error { return nil } -// Check if Intel RDT is enabled -func IsEnabled() bool { - return isEnabled +// Check if Intel RDT/CAT is enabled +func IsCatEnabled() bool { + return isCatEnabled +} + +// Check if Intel RDT/MBA is enabled +func IsMbaEnabled() bool { + return isMbaEnabled +} + +// Check if Intel RDT/MBA Software Controller is enabled +func IsMbaScEnabled() bool { + return isMbaScEnabled } // Get the 'container_id' path in Intel RDT "resource control" filesystem @@ -425,7 +547,7 @@ func (m *IntelRdtManager) Apply(pid int) (err error) { func (m *IntelRdtManager) Destroy() error { m.mu.Lock() defer m.mu.Unlock() - if err := os.RemoveAll(m.Path); err != nil { + if err := os.RemoveAll(m.GetPath()); err != nil { return err } m.Path = "" @@ -452,65 +574,143 @@ func (m *IntelRdtManager) GetStats() (*Stats, error) { defer m.mu.Unlock() stats := NewStats() - // The read-only L3 cache information - l3CacheInfo, err := getL3CacheInfo() - if err != nil { - return nil, err - } - stats.L3CacheInfo = l3CacheInfo - - // The read-only L3 cache schema in root rootPath, err := getIntelRdtRoot() if err != nil { return nil, err } + // The read-only L3 cache and memory bandwidth schemata in root tmpRootStrings, err := getIntelRdtParamString(rootPath, "schemata") if err != nil { return nil, err } - // L3 cache schema is in the first line schemaRootStrings := strings.Split(tmpRootStrings, "\n") - stats.L3CacheSchemaRoot = schemaRootStrings[0] - // The L3 cache schema in 'container_id' group + // The L3 cache and memory bandwidth schemata in 'container_id' group tmpStrings, err := getIntelRdtParamString(m.GetPath(), "schemata") if err != nil { return nil, err } - // L3 cache schema is in the first line schemaStrings := strings.Split(tmpStrings, "\n") - stats.L3CacheSchema = schemaStrings[0] + + if IsCatEnabled() { + // The read-only L3 cache information + l3CacheInfo, err := getL3CacheInfo() + if err != nil { + return nil, err + } + stats.L3CacheInfo = l3CacheInfo + + // The read-only L3 cache schema in root + for _, schemaRoot := range schemaRootStrings { + if strings.Contains(schemaRoot, "L3") { + stats.L3CacheSchemaRoot = strings.TrimSpace(schemaRoot) + } + } + + // The L3 cache schema in 'container_id' group + for _, schema := range schemaStrings { + if strings.Contains(schema, "L3") { + stats.L3CacheSchema = strings.TrimSpace(schema) + } + } + } + + if IsMbaEnabled() { + // The read-only memory bandwidth information + memBwInfo, err := getMemBwInfo() + if err != nil { + return nil, err + } + stats.MemBwInfo = memBwInfo + + // The read-only memory bandwidth information + for _, schemaRoot := range schemaRootStrings { + if strings.Contains(schemaRoot, "MB") { + stats.MemBwSchemaRoot = strings.TrimSpace(schemaRoot) + } + } + + // The memory bandwidth schema in 'container_id' group + for _, schema := range schemaStrings { + if strings.Contains(schema, "MB") { + stats.MemBwSchema = strings.TrimSpace(schema) + } + } + } return stats, nil } // Set Intel RDT "resource control" filesystem as configured. func (m *IntelRdtManager) Set(container *configs.Config) error { - path := m.GetPath() - - // About L3 cache schema file: - // The schema has allocation masks/values for L3 cache on each socket, + // About L3 cache schema: + // It has allocation bitmasks/values for L3 cache on each socket, // which contains L3 cache id and capacity bitmask (CBM). - // Format: "L3:=;=;..." - // For example, on a two-socket machine, L3's schema line could be: - // L3:0=ff;1=c0 - // Which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM is 0xc0. + // Format: "L3:=;=;..." + // For example, on a two-socket machine, the schema line could be: + // L3:0=ff;1=c0 + // which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM + // is 0xc0. // - // About L3 cache CBM validity: // The valid L3 cache CBM is a *contiguous bits set* and number of // bits that can be set is less than the max bit. The max bits in the - // CBM is varied among supported Intel Xeon platforms. In Intel RDT - // "resource control" filesystem layout, the CBM in a group should - // be a subset of the CBM in root. Kernel will check if it is valid - // when writing. - // e.g., 0xfffff in root indicates the max bits of CBM is 20 bits, - // which mapping to entire L3 cache capacity. Some valid CBM values - // to set in a group: 0xf, 0xf0, 0x3ff, 0x1f00 and etc. + // CBM is varied among supported Intel CPU models. Kernel will check + // if it is valid when writing. e.g., default value 0xfffff in root + // indicates the max bits of CBM is 20 bits, which mapping to entire + // L3 cache capacity. Some valid CBM values to set in a group: + // 0xf, 0xf0, 0x3ff, 0x1f00 and etc. + // + // + // About memory bandwidth schema: + // It has allocation values for memory bandwidth on each socket, which + // contains L3 cache id and memory bandwidth. + // Format: "MB:=bandwidth0;=bandwidth1;..." + // For example, on a two-socket machine, the schema line could be: + // "MB:0=20;1=70" + // + // The minimum bandwidth percentage value for each CPU model is + // predefined and can be looked up through "info/MB/min_bandwidth". + // The bandwidth granularity that is allocated is also dependent on + // the CPU model and can be looked up at "info/MB/bandwidth_gran". + // The available bandwidth control steps are: min_bw + N * bw_gran. + // Intermediate values are rounded to the next control step available + // on the hardware. + // + // If MBA Software Controller is enabled through mount option + // "-o mba_MBps": mount -t resctrl resctrl -o mba_MBps /sys/fs/resctrl + // We could specify memory bandwidth in "MBps" (Mega Bytes per second) + // unit instead of "percentages". The kernel underneath would use a + // software feedback mechanism or a "Software Controller" which reads + // the actual bandwidth using MBM counters and adjust the memory + // bandwidth percentages to ensure: + // "actual memory bandwidth < user specified memory bandwidth". + // + // For example, on a two-socket machine, the schema line could be + // "MB:0=5000;1=7000" which means 5000 MBps memory bandwidth limit on + // socket 0 and 7000 MBps memory bandwidth limit on socket 1. if container.IntelRdt != nil { + path := m.GetPath() l3CacheSchema := container.IntelRdt.L3CacheSchema - if l3CacheSchema != "" { + memBwSchema := container.IntelRdt.MemBwSchema + + // Write a single joint schema string to schemata file + if l3CacheSchema != "" && memBwSchema != "" { + if err := writeFile(path, "schemata", l3CacheSchema+"\n"+memBwSchema); err != nil { + return NewLastCmdError(err) + } + } + + // Write only L3 cache schema string to schemata file + if l3CacheSchema != "" && memBwSchema == "" { if err := writeFile(path, "schemata", l3CacheSchema); err != nil { - return err + return NewLastCmdError(err) + } + } + + // Write only memory bandwidth schema string to schemata file + if l3CacheSchema == "" && memBwSchema != "" { + if err := writeFile(path, "schemata", memBwSchema); err != nil { + return NewLastCmdError(err) } } } @@ -521,11 +721,11 @@ func (m *IntelRdtManager) Set(container *configs.Config) error { func (raw *intelRdtData) join(id string) (string, error) { path := filepath.Join(raw.root, id) if err := os.MkdirAll(path, 0755); err != nil { - return "", err + return "", NewLastCmdError(err) } if err := WriteIntelRdtTasks(path, raw.pid); err != nil { - return "", err + return "", NewLastCmdError(err) } return path, nil } @@ -551,3 +751,23 @@ func IsNotFound(err error) bool { _, ok := err.(*NotFoundError) return ok } + +type LastCmdError struct { + LastCmdStatus string + Err error +} + +func (e *LastCmdError) Error() string { + return fmt.Sprintf(e.Err.Error() + ", last_cmd_status: " + e.LastCmdStatus) +} + +func NewLastCmdError(err error) error { + lastCmdStatus, err1 := getLastCmdStatus() + if err1 == nil { + return &LastCmdError{ + LastCmdStatus: lastCmdStatus, + Err: err, + } + } + return err +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/intelrdt/stats.go b/vendor/github.com/opencontainers/runc/libcontainer/intelrdt/stats.go index 095c0a380c..df5686f3b8 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/intelrdt/stats.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/intelrdt/stats.go @@ -8,6 +8,13 @@ type L3CacheInfo struct { NumClosids uint64 `json:"num_closids,omitempty"` } +type MemBwInfo struct { + BandwidthGran uint64 `json:"bandwidth_gran,omitempty"` + DelayLinear uint64 `json:"delay_linear,omitempty"` + MinBandwidth uint64 `json:"min_bandwidth,omitempty"` + NumClosids uint64 `json:"num_closids,omitempty"` +} + type Stats struct { // The read-only L3 cache information L3CacheInfo *L3CacheInfo `json:"l3_cache_info,omitempty"` @@ -17,6 +24,15 @@ type Stats struct { // The L3 cache schema in 'container_id' group L3CacheSchema string `json:"l3_cache_schema,omitempty"` + + // The read-only memory bandwidth information + MemBwInfo *MemBwInfo `json:"mem_bw_info,omitempty"` + + // The read-only memory bandwidth schema in root + MemBwSchemaRoot string `json:"mem_bw_schema_root,omitempty"` + + // The memory bandwidth schema in 'container_id' group + MemBwSchema string `json:"mem_bw_schema,omitempty"` } func NewStats() *Stats { diff --git a/vendor/github.com/opencontainers/runc/libcontainer/keys/keyctl.go b/vendor/github.com/opencontainers/runc/libcontainer/keys/keyctl.go index ce8b4e6b04..74dedd56ca 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/keys/keyctl.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/keys/keyctl.go @@ -7,6 +7,8 @@ import ( "strconv" "strings" + "github.com/pkg/errors" + "golang.org/x/sys/unix" ) @@ -15,7 +17,7 @@ type KeySerial uint32 func JoinSessionKeyring(name string) (KeySerial, error) { sessKeyId, err := unix.KeyctlJoinSessionKeyring(name) if err != nil { - return 0, fmt.Errorf("could not create session key: %v", err) + return 0, errors.Wrap(err, "create session key") } return KeySerial(sessKeyId), nil } @@ -42,9 +44,5 @@ func ModKeyringPerm(ringId KeySerial, mask, setbits uint32) error { perm := (uint32(perm64) & mask) | setbits - if err := unix.KeyctlSetperm(int(ringId), perm); err != nil { - return err - } - - return nil + return unix.KeyctlSetperm(int(ringId), perm) } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/message_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/message_linux.go index ab453cde91..1d4f5033aa 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/message_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/message_linux.go @@ -10,16 +10,16 @@ import ( // list of known message types we want to send to bootstrap program // The number is randomly chosen to not conflict with known netlink types const ( - InitMsg uint16 = 62000 - CloneFlagsAttr uint16 = 27281 - NsPathsAttr uint16 = 27282 - UidmapAttr uint16 = 27283 - GidmapAttr uint16 = 27284 - SetgroupAttr uint16 = 27285 - OomScoreAdjAttr uint16 = 27286 - RootlessAttr uint16 = 27287 - UidmapPathAttr uint16 = 27288 - GidmapPathAttr uint16 = 27289 + InitMsg uint16 = 62000 + CloneFlagsAttr uint16 = 27281 + NsPathsAttr uint16 = 27282 + UidmapAttr uint16 = 27283 + GidmapAttr uint16 = 27284 + SetgroupAttr uint16 = 27285 + OomScoreAdjAttr uint16 = 27286 + RootlessEUIDAttr uint16 = 27287 + UidmapPathAttr uint16 = 27288 + GidmapPathAttr uint16 = 27289 ) type Int32msg struct { @@ -77,13 +77,13 @@ func (msg *Boolmsg) Serialize() []byte { native.PutUint16(buf[0:2], uint16(msg.Len())) native.PutUint16(buf[2:4], msg.Type) if msg.Value { - buf[4] = 1 + native.PutUint32(buf[4:8], uint32(1)) } else { - buf[4] = 0 + native.PutUint32(buf[4:8], uint32(0)) } return buf } func (msg *Boolmsg) Len() int { - return unix.NLA_HDRLEN + 1 + return unix.NLA_HDRLEN + 4 // alignment } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/network_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/network_linux.go index 5075bee4db..569c53f6e8 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/network_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/network_linux.go @@ -5,18 +5,15 @@ package libcontainer import ( "fmt" "io/ioutil" - "net" "path/filepath" "strconv" "strings" "github.com/opencontainers/runc/libcontainer/configs" - "github.com/opencontainers/runc/libcontainer/utils" "github.com/vishvananda/netlink" ) var strategies = map[string]networkStrategy{ - "veth": &veth{}, "loopback": &loopback{}, } @@ -103,157 +100,3 @@ func (l *loopback) attach(n *configs.Network) (err error) { func (l *loopback) detach(n *configs.Network) (err error) { return nil } - -// veth is a network strategy that uses a bridge and creates -// a veth pair, one that is attached to the bridge on the host and the other -// is placed inside the container's namespace -type veth struct { -} - -func (v *veth) detach(n *configs.Network) (err error) { - return netlink.LinkSetMaster(&netlink.Device{LinkAttrs: netlink.LinkAttrs{Name: n.HostInterfaceName}}, nil) -} - -// attach a container network interface to an external network -func (v *veth) attach(n *configs.Network) (err error) { - brl, err := netlink.LinkByName(n.Bridge) - if err != nil { - return err - } - br, ok := brl.(*netlink.Bridge) - if !ok { - return fmt.Errorf("Wrong device type %T", brl) - } - host, err := netlink.LinkByName(n.HostInterfaceName) - if err != nil { - return err - } - - if err := netlink.LinkSetMaster(host, br); err != nil { - return err - } - if err := netlink.LinkSetMTU(host, n.Mtu); err != nil { - return err - } - if n.HairpinMode { - if err := netlink.LinkSetHairpin(host, true); err != nil { - return err - } - } - if err := netlink.LinkSetUp(host); err != nil { - return err - } - - return nil -} - -func (v *veth) create(n *network, nspid int) (err error) { - tmpName, err := v.generateTempPeerName() - if err != nil { - return err - } - n.TempVethPeerName = tmpName - if n.Bridge == "" { - return fmt.Errorf("bridge is not specified") - } - veth := &netlink.Veth{ - LinkAttrs: netlink.LinkAttrs{ - Name: n.HostInterfaceName, - TxQLen: n.TxQueueLen, - }, - PeerName: n.TempVethPeerName, - } - if err := netlink.LinkAdd(veth); err != nil { - return err - } - defer func() { - if err != nil { - netlink.LinkDel(veth) - } - }() - if err := v.attach(&n.Network); err != nil { - return err - } - child, err := netlink.LinkByName(n.TempVethPeerName) - if err != nil { - return err - } - return netlink.LinkSetNsPid(child, nspid) -} - -func (v *veth) generateTempPeerName() (string, error) { - return utils.GenerateRandomName("veth", 7) -} - -func (v *veth) initialize(config *network) error { - peer := config.TempVethPeerName - if peer == "" { - return fmt.Errorf("peer is not specified") - } - child, err := netlink.LinkByName(peer) - if err != nil { - return err - } - if err := netlink.LinkSetDown(child); err != nil { - return err - } - if err := netlink.LinkSetName(child, config.Name); err != nil { - return err - } - // get the interface again after we changed the name as the index also changes. - if child, err = netlink.LinkByName(config.Name); err != nil { - return err - } - if config.MacAddress != "" { - mac, err := net.ParseMAC(config.MacAddress) - if err != nil { - return err - } - if err := netlink.LinkSetHardwareAddr(child, mac); err != nil { - return err - } - } - ip, err := netlink.ParseAddr(config.Address) - if err != nil { - return err - } - if err := netlink.AddrAdd(child, ip); err != nil { - return err - } - if config.IPv6Address != "" { - ip6, err := netlink.ParseAddr(config.IPv6Address) - if err != nil { - return err - } - if err := netlink.AddrAdd(child, ip6); err != nil { - return err - } - } - if err := netlink.LinkSetMTU(child, config.Mtu); err != nil { - return err - } - if err := netlink.LinkSetUp(child); err != nil { - return err - } - if config.Gateway != "" { - gw := net.ParseIP(config.Gateway) - if err := netlink.RouteAdd(&netlink.Route{ - Scope: netlink.SCOPE_UNIVERSE, - LinkIndex: child.Attrs().Index, - Gw: gw, - }); err != nil { - return err - } - } - if config.IPv6Gateway != "" { - gw := net.ParseIP(config.IPv6Gateway) - if err := netlink.RouteAdd(&netlink.Route{ - Scope: netlink.SCOPE_UNIVERSE, - LinkIndex: child.Attrs().Index, - Gw: gw, - }); err != nil { - return err - } - } - return nil -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/cloned_binary.c b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/cloned_binary.c new file mode 100644 index 0000000000..ad10f14067 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/cloned_binary.c @@ -0,0 +1,516 @@ +/* + * Copyright (C) 2019 Aleksa Sarai + * Copyright (C) 2019 SUSE LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +/* Use our own wrapper for memfd_create. */ +#if !defined(SYS_memfd_create) && defined(__NR_memfd_create) +# define SYS_memfd_create __NR_memfd_create +#endif +/* memfd_create(2) flags -- copied from . */ +#ifndef MFD_CLOEXEC +# define MFD_CLOEXEC 0x0001U +# define MFD_ALLOW_SEALING 0x0002U +#endif +int memfd_create(const char *name, unsigned int flags) +{ +#ifdef SYS_memfd_create + return syscall(SYS_memfd_create, name, flags); +#else + errno = ENOSYS; + return -1; +#endif +} + + +/* This comes directly from . */ +#ifndef F_LINUX_SPECIFIC_BASE +# define F_LINUX_SPECIFIC_BASE 1024 +#endif +#ifndef F_ADD_SEALS +# define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9) +# define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10) +#endif +#ifndef F_SEAL_SEAL +# define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */ +# define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */ +# define F_SEAL_GROW 0x0004 /* prevent file from growing */ +# define F_SEAL_WRITE 0x0008 /* prevent writes */ +#endif + +#define CLONED_BINARY_ENV "_LIBCONTAINER_CLONED_BINARY" +#define RUNC_MEMFD_COMMENT "runc_cloned:/proc/self/exe" +#define RUNC_MEMFD_SEALS \ + (F_SEAL_SEAL | F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_WRITE) + +static void *must_realloc(void *ptr, size_t size) +{ + void *old = ptr; + do { + ptr = realloc(old, size); + } while(!ptr); + return ptr; +} + +/* + * Verify whether we are currently in a self-cloned program (namely, is + * /proc/self/exe a memfd). F_GET_SEALS will only succeed for memfds (or rather + * for shmem files), and we want to be sure it's actually sealed. + */ +static int is_self_cloned(void) +{ + int fd, ret, is_cloned = 0; + struct stat statbuf = {}; + struct statfs fsbuf = {}; + + fd = open("/proc/self/exe", O_RDONLY|O_CLOEXEC); + if (fd < 0) + return -ENOTRECOVERABLE; + + /* + * Is the binary a fully-sealed memfd? We don't need CLONED_BINARY_ENV for + * this, because you cannot write to a sealed memfd no matter what (so + * sharing it isn't a bad thing -- and an admin could bind-mount a sealed + * memfd to /usr/bin/runc to allow re-use). + */ + ret = fcntl(fd, F_GET_SEALS); + if (ret >= 0) { + is_cloned = (ret == RUNC_MEMFD_SEALS); + goto out; + } + + /* + * All other forms require CLONED_BINARY_ENV, since they are potentially + * writeable (or we can't tell if they're fully safe) and thus we must + * check the environment as an extra layer of defence. + */ + if (!getenv(CLONED_BINARY_ENV)) { + is_cloned = false; + goto out; + } + + /* + * Is the binary on a read-only filesystem? We can't detect bind-mounts in + * particular (in-kernel they are identical to regular mounts) but we can + * at least be sure that it's read-only. In addition, to make sure that + * it's *our* bind-mount we check CLONED_BINARY_ENV. + */ + if (fstatfs(fd, &fsbuf) >= 0) + is_cloned |= (fsbuf.f_flags & MS_RDONLY); + + /* + * Okay, we're a tmpfile -- or we're currently running on RHEL <=7.6 + * which appears to have a borked backport of F_GET_SEALS. Either way, + * having a file which has no hardlinks indicates that we aren't using + * a host-side "runc" binary and this is something that a container + * cannot fake (because unlinking requires being able to resolve the + * path that you want to unlink). + */ + if (fstat(fd, &statbuf) >= 0) + is_cloned |= (statbuf.st_nlink == 0); + +out: + close(fd); + return is_cloned; +} + +/* Read a given file into a new buffer, and providing the length. */ +static char *read_file(char *path, size_t *length) +{ + int fd; + char buf[4096], *copy = NULL; + + if (!length) + return NULL; + + fd = open(path, O_RDONLY | O_CLOEXEC); + if (fd < 0) + return NULL; + + *length = 0; + for (;;) { + ssize_t n; + + n = read(fd, buf, sizeof(buf)); + if (n < 0) + goto error; + if (!n) + break; + + copy = must_realloc(copy, (*length + n) * sizeof(*copy)); + memcpy(copy + *length, buf, n); + *length += n; + } + close(fd); + return copy; + +error: + close(fd); + free(copy); + return NULL; +} + +/* + * A poor-man's version of "xargs -0". Basically parses a given block of + * NUL-delimited data, within the given length and adds a pointer to each entry + * to the array of pointers. + */ +static int parse_xargs(char *data, int data_length, char ***output) +{ + int num = 0; + char *cur = data; + + if (!data || *output != NULL) + return -1; + + while (cur < data + data_length) { + num++; + *output = must_realloc(*output, (num + 1) * sizeof(**output)); + (*output)[num - 1] = cur; + cur += strlen(cur) + 1; + } + (*output)[num] = NULL; + return num; +} + +/* + * "Parse" out argv from /proc/self/cmdline. + * This is necessary because we are running in a context where we don't have a + * main() that we can just get the arguments from. + */ +static int fetchve(char ***argv) +{ + char *cmdline = NULL; + size_t cmdline_size; + + cmdline = read_file("/proc/self/cmdline", &cmdline_size); + if (!cmdline) + goto error; + + if (parse_xargs(cmdline, cmdline_size, argv) <= 0) + goto error; + + return 0; + +error: + free(cmdline); + return -EINVAL; +} + +enum { + EFD_NONE = 0, + EFD_MEMFD, + EFD_FILE, +}; + +/* + * This comes from . We can't hard-code __O_TMPFILE because it + * changes depending on the architecture. If we don't have O_TMPFILE we always + * have the mkostemp(3) fallback. + */ +#ifndef O_TMPFILE +# if defined(__O_TMPFILE) && defined(O_DIRECTORY) +# define O_TMPFILE (__O_TMPFILE | O_DIRECTORY) +# endif +#endif + +static int make_execfd(int *fdtype) +{ + int fd = -1; + char template[PATH_MAX] = {0}; + char *prefix = getenv("_LIBCONTAINER_STATEDIR"); + + if (!prefix || *prefix != '/') + prefix = "/tmp"; + if (snprintf(template, sizeof(template), "%s/runc.XXXXXX", prefix) < 0) + return -1; + + /* + * Now try memfd, it's much nicer than actually creating a file in STATEDIR + * since it's easily detected thanks to sealing and also doesn't require + * assumptions about STATEDIR. + */ + *fdtype = EFD_MEMFD; + fd = memfd_create(RUNC_MEMFD_COMMENT, MFD_CLOEXEC | MFD_ALLOW_SEALING); + if (fd >= 0) + return fd; + if (errno != ENOSYS && errno != EINVAL) + goto error; + +#ifdef O_TMPFILE + /* + * Try O_TMPFILE to avoid races where someone might snatch our file. Note + * that O_EXCL isn't actually a security measure here (since you can just + * fd re-open it and clear O_EXCL). + */ + *fdtype = EFD_FILE; + fd = open(prefix, O_TMPFILE | O_EXCL | O_RDWR | O_CLOEXEC, 0700); + if (fd >= 0) { + struct stat statbuf = {}; + bool working_otmpfile = false; + + /* + * open(2) ignores unknown O_* flags -- yeah, I was surprised when I + * found this out too. As a result we can't check for EINVAL. However, + * if we get nlink != 0 (or EISDIR) then we know that this kernel + * doesn't support O_TMPFILE. + */ + if (fstat(fd, &statbuf) >= 0) + working_otmpfile = (statbuf.st_nlink == 0); + + if (working_otmpfile) + return fd; + + /* Pretend that we got EISDIR since O_TMPFILE failed. */ + close(fd); + errno = EISDIR; + } + if (errno != EISDIR) + goto error; +#endif /* defined(O_TMPFILE) */ + + /* + * Our final option is to create a temporary file the old-school way, and + * then unlink it so that nothing else sees it by accident. + */ + *fdtype = EFD_FILE; + fd = mkostemp(template, O_CLOEXEC); + if (fd >= 0) { + if (unlink(template) >= 0) + return fd; + close(fd); + } + +error: + *fdtype = EFD_NONE; + return -1; +} + +static int seal_execfd(int *fd, int fdtype) +{ + switch (fdtype) { + case EFD_MEMFD: + return fcntl(*fd, F_ADD_SEALS, RUNC_MEMFD_SEALS); + case EFD_FILE: { + /* Need to re-open our pseudo-memfd as an O_PATH to avoid execve(2) giving -ETXTBSY. */ + int newfd; + char fdpath[PATH_MAX] = {0}; + + if (fchmod(*fd, 0100) < 0) + return -1; + + if (snprintf(fdpath, sizeof(fdpath), "/proc/self/fd/%d", *fd) < 0) + return -1; + + newfd = open(fdpath, O_PATH | O_CLOEXEC); + if (newfd < 0) + return -1; + + close(*fd); + *fd = newfd; + return 0; + } + default: + break; + } + return -1; +} + +static int try_bindfd(void) +{ + int fd, ret = -1; + char template[PATH_MAX] = {0}; + char *prefix = getenv("_LIBCONTAINER_STATEDIR"); + + if (!prefix || *prefix != '/') + prefix = "/tmp"; + if (snprintf(template, sizeof(template), "%s/runc.XXXXXX", prefix) < 0) + return ret; + + /* + * We need somewhere to mount it, mounting anything over /proc/self is a + * BAD idea on the host -- even if we do it temporarily. + */ + fd = mkstemp(template); + if (fd < 0) + return ret; + close(fd); + + /* + * For obvious reasons this won't work in rootless mode because we haven't + * created a userns+mntns -- but getting that to work will be a bit + * complicated and it's only worth doing if someone actually needs it. + */ + ret = -EPERM; + if (mount("/proc/self/exe", template, "", MS_BIND, "") < 0) + goto out; + if (mount("", template, "", MS_REMOUNT | MS_BIND | MS_RDONLY, "") < 0) + goto out_umount; + + + /* Get read-only handle that we're sure can't be made read-write. */ + ret = open(template, O_PATH | O_CLOEXEC); + +out_umount: + /* + * Make sure the MNT_DETACH works, otherwise we could get remounted + * read-write and that would be quite bad (the fd would be made read-write + * too, invalidating the protection). + */ + if (umount2(template, MNT_DETACH) < 0) { + if (ret >= 0) + close(ret); + ret = -ENOTRECOVERABLE; + } + +out: + /* + * We don't care about unlink errors, the worst that happens is that + * there's an empty file left around in STATEDIR. + */ + unlink(template); + return ret; +} + +static ssize_t fd_to_fd(int outfd, int infd) +{ + ssize_t total = 0; + char buffer[4096]; + + for (;;) { + ssize_t nread, nwritten = 0; + + nread = read(infd, buffer, sizeof(buffer)); + if (nread < 0) + return -1; + if (!nread) + break; + + do { + ssize_t n = write(outfd, buffer + nwritten, nread - nwritten); + if (n < 0) + return -1; + nwritten += n; + } while(nwritten < nread); + + total += nwritten; + } + + return total; +} + +static int clone_binary(void) +{ + int binfd, execfd; + struct stat statbuf = {}; + size_t sent = 0; + int fdtype = EFD_NONE; + + /* + * Before we resort to copying, let's try creating an ro-binfd in one shot + * by getting a handle for a read-only bind-mount of the execfd. + */ + execfd = try_bindfd(); + if (execfd >= 0) + return execfd; + + /* + * Dammit, that didn't work -- time to copy the binary to a safe place we + * can seal the contents. + */ + execfd = make_execfd(&fdtype); + if (execfd < 0 || fdtype == EFD_NONE) + return -ENOTRECOVERABLE; + + binfd = open("/proc/self/exe", O_RDONLY | O_CLOEXEC); + if (binfd < 0) + goto error; + + if (fstat(binfd, &statbuf) < 0) + goto error_binfd; + + while (sent < statbuf.st_size) { + int n = sendfile(execfd, binfd, NULL, statbuf.st_size - sent); + if (n < 0) { + /* sendfile can fail so we fallback to a dumb user-space copy. */ + n = fd_to_fd(execfd, binfd); + if (n < 0) + goto error_binfd; + } + sent += n; + } + close(binfd); + if (sent != statbuf.st_size) + goto error; + + if (seal_execfd(&execfd, fdtype) < 0) + goto error; + + return execfd; + +error_binfd: + close(binfd); +error: + close(execfd); + return -EIO; +} + +/* Get cheap access to the environment. */ +extern char **environ; + +int ensure_cloned_binary(void) +{ + int execfd; + char **argv = NULL; + + /* Check that we're not self-cloned, and if we are then bail. */ + int cloned = is_self_cloned(); + if (cloned > 0 || cloned == -ENOTRECOVERABLE) + return cloned; + + if (fetchve(&argv) < 0) + return -EINVAL; + + execfd = clone_binary(); + if (execfd < 0) + return -EIO; + + if (putenv(CLONED_BINARY_ENV "=1")) + goto error; + + fexecve(execfd, argv, environ); +error: + close(execfd); + return -ENOEXEC; +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c index 2c69cee5d6..7750af35ea 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c +++ b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c @@ -42,6 +42,12 @@ enum sync_t { SYNC_ERR = 0xFF, /* Fatal error, no turning back. The error code follows. */ }; +/* + * Synchronisation value for cgroup namespace setup. + * The same constant is defined in process_linux.go as "createCgroupns". + */ +#define CREATECGROUPNS 0x80 + /* longjmp() arguments. */ #define JUMP_PARENT 0x00 #define JUMP_CHILD 0xA0 @@ -82,7 +88,7 @@ struct nlconfig_t { uint8_t is_setgroup; /* Rootless container settings. */ - uint8_t is_rootless; + uint8_t is_rootless_euid; /* boolean */ char *uidmappath; size_t uidmappath_len; char *gidmappath; @@ -100,7 +106,7 @@ struct nlconfig_t { #define GIDMAP_ATTR 27284 #define SETGROUP_ATTR 27285 #define OOM_SCORE_ADJ_ATTR 27286 -#define ROOTLESS_ATTR 27287 +#define ROOTLESS_EUID_ATTR 27287 #define UIDMAPPATH_ATTR 27288 #define GIDMAPPATH_ATTR 27289 @@ -211,7 +217,7 @@ static int try_mapping_tool(const char *app, int pid, char *map, size_t map_len) /* * If @app is NULL, execve will segfault. Just check it here and bail (if - * we're in this path, the caller is already getting desparate and there + * we're in this path, the caller is already getting desperate and there * isn't a backup to this failing). This usually would be a configuration * or programming issue. */ @@ -419,8 +425,8 @@ static void nl_parse(int fd, struct nlconfig_t *config) case CLONE_FLAGS_ATTR: config->cloneflags = readint32(current); break; - case ROOTLESS_ATTR: - config->is_rootless = readint8(current); + case ROOTLESS_EUID_ATTR: + config->is_rootless_euid = readint8(current); /* boolean */ break; case OOM_SCORE_ADJ_ATTR: config->oom_score_adj = current; @@ -505,7 +511,8 @@ void join_namespaces(char *nslist) ns->fd = fd; ns->ns = nsflag(namespace); - strncpy(ns->path, path, PATH_MAX); + strncpy(ns->path, path, PATH_MAX - 1); + ns->path[PATH_MAX - 1] = '\0'; } while ((namespace = strtok_r(NULL, ",", &saveptr)) != NULL); /* @@ -527,6 +534,9 @@ void join_namespaces(char *nslist) free(namespaces); } +/* Defined in cloned_binary.c. */ +extern int ensure_cloned_binary(void); + void nsexec(void) { int pipenum; @@ -542,6 +552,14 @@ void nsexec(void) if (pipenum == -1) return; + /* + * We need to re-exec if we are not in a cloned binary. This is necessary + * to ensure that containers won't be able to access the host binary + * through /proc/self/exe. See CVE-2019-5736. + */ + if (ensure_cloned_binary() < 0) + bail("could not ensure we are a cloned binary"); + /* Parse all of the netlink configuration. */ nl_parse(pipenum, &config); @@ -639,7 +657,6 @@ void nsexec(void) case JUMP_PARENT:{ int len; pid_t child, first_child = -1; - char buf[JSON_MAX]; bool ready = false; /* For debugging. */ @@ -678,17 +695,15 @@ void nsexec(void) /* * Enable setgroups(2) if we've been asked to. But we also * have to explicitly disable setgroups(2) if we're - * creating a rootless container (this is required since - * Linux 3.19). + * creating a rootless container for single-entry mapping. + * i.e. config.is_setgroup == false. + * (this is required since Linux 3.19). + * + * For rootless multi-entry mapping, config.is_setgroup shall be true and + * newuidmap/newgidmap shall be used. */ - if (config.is_rootless && config.is_setgroup) { - kill(child, SIGKILL); - bail("cannot allow setgroup in an unprivileged user namespace setup"); - } - if (config.is_setgroup) - update_setgroups(child, SETGROUPS_ALLOW); - if (config.is_rootless) + if (config.is_rootless_euid && !config.is_setgroup) update_setgroups(child, SETGROUPS_DENY); /* Set up mappings. */ @@ -717,6 +732,18 @@ void nsexec(void) kill(child, SIGKILL); bail("failed to sync with child: write(SYNC_RECVPID_ACK)"); } + + /* Send the init_func pid back to our parent. + * + * Send the init_func pid and the pid of the first child back to our parent. + * We need to send both back because we can't reap the first child we created (CLONE_PARENT). + * It becomes the responsibility of our parent to reap the first child. + */ + len = dprintf(pipenum, "{\"pid\": %d, \"pid_first\": %d}\n", child, first_child); + if (len < 0) { + kill(child, SIGKILL); + bail("unable to generate JSON for child pid"); + } } break; case SYNC_CHILD_READY: @@ -760,23 +787,6 @@ void nsexec(void) bail("unexpected sync value: %u", s); } } - - /* - * Send the init_func pid and the pid of the first child back to our parent. - * - * We need to send both back because we can't reap the first child we created (CLONE_PARENT). - * It becomes the responsibility of our parent to reap the first child. - */ - len = snprintf(buf, JSON_MAX, "{\"pid\": %d, \"pid_first\": %d}\n", child, first_child); - if (len < 0) { - kill(child, SIGKILL); - bail("unable to generate JSON for child pid"); - } - if (write(pipenum, buf, len) != len) { - kill(child, SIGKILL); - bail("unable to send child pid to bootstrapper"); - } - exit(0); } @@ -809,25 +819,30 @@ void nsexec(void) if (config.namespaces) join_namespaces(config.namespaces); - /* - * Unshare all of the namespaces. Now, it should be noted that this - * ordering might break in the future (especially with rootless - * containers). But for now, it's not possible to split this into - * CLONE_NEWUSER + [the rest] because of some RHEL SELinux issues. - * - * Note that we don't merge this with clone() because there were - * some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID) - * was broken, so we'll just do it the long way anyway. - */ - if (unshare(config.cloneflags) < 0) - bail("failed to unshare namespaces"); - /* * Deal with user namespaces first. They are quite special, as they * affect our ability to unshare other namespaces and are used as * context for privilege checks. + * + * We don't unshare all namespaces in one go. The reason for this + * is that, while the kernel documentation may claim otherwise, + * there are certain cases where unsharing all namespaces at once + * will result in namespace objects being owned incorrectly. + * Ideally we should just fix these kernel bugs, but it's better to + * be safe than sorry, and fix them separately. + * + * A specific case of this is that the SELinux label of the + * internal kern-mount that mqueue uses will be incorrect if the + * UTS namespace is cloned before the USER namespace is mapped. + * I've also heard of similar problems with the network namespace + * in some scenarios. This also mirrors how LXC deals with this + * problem. */ if (config.cloneflags & CLONE_NEWUSER) { + if (unshare(CLONE_NEWUSER) < 0) + bail("failed to unshare user namespace"); + config.cloneflags &= ~CLONE_NEWUSER; + /* * We don't have the privileges to do any mapping here (see the * clone_parent rant). So signal our parent to hook us up. @@ -853,7 +868,23 @@ void nsexec(void) if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0) bail("failed to set process as dumpable"); } + + /* Become root in the namespace proper. */ + if (setresuid(0, 0, 0) < 0) + bail("failed to become root in user namespace"); } + /* + * Unshare all of the namespaces. Now, it should be noted that this + * ordering might break in the future (especially with rootless + * containers). But for now, it's not possible to split this into + * CLONE_NEWUSER + [the rest] because of some RHEL SELinux issues. + * + * Note that we don't merge this with clone() because there were + * some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID) + * was broken, so we'll just do it the long way anyway. + */ + if (unshare(config.cloneflags & ~CLONE_NEWCGROUP) < 0) + bail("failed to unshare namespaces"); /* * TODO: What about non-namespace clone flags that we're dropping here? @@ -936,11 +967,23 @@ void nsexec(void) if (setgid(0) < 0) bail("setgid failed"); - if (!config.is_rootless && config.is_setgroup) { + if (!config.is_rootless_euid && config.is_setgroup) { if (setgroups(0, NULL) < 0) bail("setgroups failed"); } + /* ... wait until our topmost parent has finished cgroup setup in p.manager.Apply() ... */ + if (config.cloneflags & CLONE_NEWCGROUP) { + uint8_t value; + if (read(pipenum, &value, sizeof(value)) != sizeof(value)) + bail("read synchronisation value failed"); + if (value == CREATECGROUPNS) { + if (unshare(CLONE_NEWCGROUP) < 0) + bail("failed to unshare cgroup namespace"); + } else + bail("received unknown synchronisation value"); + } + s = SYNC_CHILD_READY; if (write(syncfd, &s, sizeof(s)) != sizeof(s)) bail("failed to sync with patent: write(SYNC_CHILD_READY)"); diff --git a/vendor/github.com/opencontainers/runc/libcontainer/process.go b/vendor/github.com/opencontainers/runc/libcontainer/process.go index 86bf7387f8..9a7c601412 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/process.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/process.go @@ -72,6 +72,9 @@ type Process struct { // ConsoleSocket provides the masterfd console. ConsoleSocket *os.File + // Init specifies whether the process is the first process in the container. + Init bool + ops processOperations } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/process_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/process_linux.go index 58980b0594..e8ffac9fa5 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/process_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/process_linux.go @@ -22,6 +22,10 @@ import ( "golang.org/x/sys/unix" ) +// Synchronisation value for cgroup namespace setup. +// The same constant is defined in nsexec.c as "CREATECGROUPNS". +const createCgroupns = 0x80 + type parentProcess interface { // pid returns the pid for the running process. pid() int @@ -46,15 +50,16 @@ type parentProcess interface { } type setnsProcess struct { - cmd *exec.Cmd - parentPipe *os.File - childPipe *os.File - cgroupPaths map[string]string - intelRdtPath string - config *initConfig - fds []string - process *Process - bootstrapData io.Reader + cmd *exec.Cmd + parentPipe *os.File + childPipe *os.File + cgroupPaths map[string]string + rootlessCgroups bool + intelRdtPath string + config *initConfig + fds []string + process *Process + bootstrapData io.Reader } func (p *setnsProcess) startTime() (uint64, error) { @@ -86,7 +91,7 @@ func (p *setnsProcess) start() (err error) { return newSystemErrorWithCause(err, "executing setns process") } if len(p.cgroupPaths) > 0 { - if err := cgroups.EnterPid(p.cgroupPaths, p.pid()); err != nil { + if err := cgroups.EnterPid(p.cgroupPaths, p.pid()); err != nil && !p.rootlessCgroups { return newSystemErrorWithCausef(err, "adding pid %d to cgroups", p.pid()) } } @@ -224,12 +229,17 @@ func (p *initProcess) externalDescriptors() []string { return p.fds } -// execSetns runs the process that executes C code to perform the setns calls -// because setns support requires the C process to fork off a child and perform the setns -// before the go runtime boots, we wait on the process to die and receive the child's pid -// over the provided pipe. -// This is called by initProcess.start function -func (p *initProcess) execSetns() error { +// getChildPid receives the final child's pid over the provided pipe. +func (p *initProcess) getChildPid() (int, error) { + var pid pid + if err := json.NewDecoder(p.parentPipe).Decode(&pid); err != nil { + p.cmd.Wait() + return -1, err + } + return pid.Pid, nil +} + +func (p *initProcess) waitForChildExit(childPid int) error { status, err := p.cmd.Process.Wait() if err != nil { p.cmd.Wait() @@ -239,22 +249,8 @@ func (p *initProcess) execSetns() error { p.cmd.Wait() return &exec.ExitError{ProcessState: status} } - var pid *pid - if err := json.NewDecoder(p.parentPipe).Decode(&pid); err != nil { - p.cmd.Wait() - return err - } - // Clean up the zombie parent process - firstChildProcess, err := os.FindProcess(pid.PidFirstChild) - if err != nil { - return err - } - - // Ignore the error in case the child has already been reaped for any reason - _, _ = firstChildProcess.Wait() - - process, err := os.FindProcess(pid.Pid) + process, err := os.FindProcess(childPid) if err != nil { return err } @@ -296,19 +292,47 @@ func (p *initProcess) start() error { if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil { return newSystemErrorWithCause(err, "copying bootstrap data to pipe") } - - if err := p.execSetns(); err != nil { - return newSystemErrorWithCause(err, "running exec setns process for init") + childPid, err := p.getChildPid() + if err != nil { + return newSystemErrorWithCause(err, "getting the final child's pid from pipe") } // Save the standard descriptor names before the container process // can potentially move them (e.g., via dup2()). If we don't do this now, // we won't know at checkpoint time which file descriptor to look up. - fds, err := getPipeFds(p.pid()) + fds, err := getPipeFds(childPid) if err != nil { - return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", p.pid()) + return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", childPid) } p.setExternalDescriptors(fds) + // Do this before syncing with child so that no children + // can escape the cgroup + if err := p.manager.Apply(childPid); err != nil { + return newSystemErrorWithCause(err, "applying cgroup configuration for process") + } + if p.intelRdtManager != nil { + if err := p.intelRdtManager.Apply(childPid); err != nil { + return newSystemErrorWithCause(err, "applying Intel RDT configuration for process") + } + } + // Now it's time to setup cgroup namesapce + if p.config.Config.Namespaces.Contains(configs.NEWCGROUP) && p.config.Config.Namespaces.PathOf(configs.NEWCGROUP) == "" { + if _, err := p.parentPipe.Write([]byte{createCgroupns}); err != nil { + return newSystemErrorWithCause(err, "sending synchronization value to init process") + } + } + + // Wait for our first child to exit + if err := p.waitForChildExit(childPid); err != nil { + return newSystemErrorWithCause(err, "waiting for our first child to exit") + } + + defer func() { + if err != nil { + // TODO: should not be the responsibility to call here + p.manager.Destroy() + } + }() if err := p.createNetworkInterfaces(); err != nil { return newSystemErrorWithCause(err, "creating network interfaces") } @@ -341,14 +365,13 @@ func (p *initProcess) start() error { } if p.config.Config.Hooks != nil { - bundle, annotations := utils.Annotations(p.container.config.Labels) - s := configs.HookState{ - Version: p.container.config.Version, - ID: p.container.id, - Pid: p.pid(), - Bundle: bundle, - Annotations: annotations, + s, err := p.container.currentOCIState() + if err != nil { + return err } + // initProcessStartTime hasn't been set yet. + s.Pid = p.cmd.Process.Pid + s.Status = "creating" for i, hook := range p.config.Config.Hooks.Prestart { if err := hook.Run(s); err != nil { return newSystemErrorWithCausef(err, "running prestart hook %d", i) @@ -372,14 +395,13 @@ func (p *initProcess) start() error { } } if p.config.Config.Hooks != nil { - bundle, annotations := utils.Annotations(p.container.config.Labels) - s := configs.HookState{ - Version: p.container.config.Version, - ID: p.container.id, - Pid: p.pid(), - Bundle: bundle, - Annotations: annotations, + s, err := p.container.currentOCIState() + if err != nil { + return err } + // initProcessStartTime hasn't been set yet. + s.Pid = p.cmd.Process.Pid + s.Status = "creating" for i, hook := range p.config.Config.Hooks.Prestart { if err := hook.Run(s); err != nil { return newSystemErrorWithCausef(err, "running prestart hook %d", i) @@ -537,7 +559,7 @@ func (p *Process) InitializeIO(rootuid, rootgid int) (i *IO, err error) { } fds = append(fds, r.Fd(), w.Fd()) p.Stderr, i.Stderr = w, r - // change ownership of the pipes incase we are in a user namespace + // change ownership of the pipes in case we are in a user namespace for _, fd := range fds { if err := unix.Fchown(int(fd), rootuid, rootgid); err != nil { return nil, err diff --git a/vendor/github.com/opencontainers/runc/libcontainer/rootfs_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/rootfs_linux.go index cf715d6649..f13b226e44 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/rootfs_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/rootfs_linux.go @@ -46,6 +46,7 @@ func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig) (err error) { return newSystemErrorWithCause(err, "preparing rootfs") } + hasCgroupns := config.Namespaces.Contains(configs.NEWCGROUP) setupDev := needsSetupDev(config) for _, m := range config.Mounts { for _, precmd := range m.PremountCmds { @@ -53,8 +54,7 @@ func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig) (err error) { return newSystemErrorWithCause(err, "running premount command") } } - - if err := mountToRootfs(m, config.Rootfs, config.MountLabel); err != nil { + if err := mountToRootfs(m, config.Rootfs, config.MountLabel, hasCgroupns); err != nil { return newSystemErrorWithCausef(err, "mounting %q to rootfs %q at %q", m.Source, config.Rootfs, m.Destination) } @@ -152,6 +152,26 @@ func finalizeRootfs(config *configs.Config) (err error) { return nil } +// /tmp has to be mounted as private to allow MS_MOVE to work in all situations +func prepareTmp(topTmpDir string) (string, error) { + tmpdir, err := ioutil.TempDir(topTmpDir, "runctop") + if err != nil { + return "", err + } + if err := unix.Mount(tmpdir, tmpdir, "bind", unix.MS_BIND, ""); err != nil { + return "", err + } + if err := unix.Mount("", tmpdir, "", uintptr(unix.MS_PRIVATE), ""); err != nil { + return "", err + } + return tmpdir, nil +} + +func cleanupTmp(tmpdir string) error { + unix.Unmount(tmpdir, 0) + return os.RemoveAll(tmpdir) +} + func mountCmd(cmd configs.Command) error { command := exec.Command(cmd.Path, cmd.Args[:]...) command.Env = cmd.Env @@ -162,7 +182,34 @@ func mountCmd(cmd configs.Command) error { return nil } -func mountToRootfs(m *configs.Mount, rootfs, mountLabel string) error { +func prepareBindMount(m *configs.Mount, rootfs string) error { + stat, err := os.Stat(m.Source) + if err != nil { + // error out if the source of a bind mount does not exist as we will be + // unable to bind anything to it. + return err + } + // ensure that the destination of the bind mount is resolved of symlinks at mount time because + // any previous mounts can invalidate the next mount's destination. + // this can happen when a user specifies mounts within other mounts to cause breakouts or other + // evil stuff to try to escape the container's rootfs. + var dest string + if dest, err = securejoin.SecureJoin(rootfs, m.Destination); err != nil { + return err + } + if err := checkMountDestination(rootfs, dest); err != nil { + return err + } + // update the mount with the correct dest after symlinks are resolved. + m.Destination = dest + if err := createIfNotExists(dest, stat.IsDir()); err != nil { + return err + } + + return nil +} + +func mountToRootfs(m *configs.Mount, rootfs, mountLabel string, enableCgroupns bool) error { var ( dest = m.Destination ) @@ -199,7 +246,12 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string) error { } } if copyUp { - tmpDir, err = ioutil.TempDir("/tmp", "runctmpdir") + tmpdir, err := prepareTmp("/tmp") + if err != nil { + return newSystemErrorWithCause(err, "tmpcopyup: failed to setup tmpdir") + } + defer cleanupTmp(tmpdir) + tmpDir, err = ioutil.TempDir(tmpdir, "runctmpdir") if err != nil { return newSystemErrorWithCause(err, "tmpcopyup: failed to create tmpdir") } @@ -232,25 +284,7 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string) error { } return nil case "bind": - stat, err := os.Stat(m.Source) - if err != nil { - // error out if the source of a bind mount does not exist as we will be - // unable to bind anything to it. - return err - } - // ensure that the destination of the bind mount is resolved of symlinks at mount time because - // any previous mounts can invalidate the next mount's destination. - // this can happen when a user specifies mounts within other mounts to cause breakouts or other - // evil stuff to try to escape the container's rootfs. - if dest, err = securejoin.SecureJoin(rootfs, m.Destination); err != nil { - return err - } - if err := checkMountDestination(rootfs, dest); err != nil { - return err - } - // update the mount with the correct dest after symlinks are resolved. - m.Destination = dest - if err := createIfNotExists(dest, stat.IsDir()); err != nil { + if err := prepareBindMount(m, rootfs); err != nil { return err } if err := mountPropagate(m, rootfs, mountLabel); err != nil { @@ -294,12 +328,33 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string) error { Data: "mode=755", PropagationFlags: m.PropagationFlags, } - if err := mountToRootfs(tmpfs, rootfs, mountLabel); err != nil { + if err := mountToRootfs(tmpfs, rootfs, mountLabel, enableCgroupns); err != nil { return err } for _, b := range binds { - if err := mountToRootfs(b, rootfs, mountLabel); err != nil { - return err + if enableCgroupns { + subsystemPath := filepath.Join(rootfs, b.Destination) + if err := os.MkdirAll(subsystemPath, 0755); err != nil { + return err + } + flags := defaultMountFlags + if m.Flags&unix.MS_RDONLY != 0 { + flags = flags | unix.MS_RDONLY + } + cgroupmount := &configs.Mount{ + Source: "cgroup", + Device: "cgroup", + Destination: subsystemPath, + Flags: flags, + Data: filepath.Base(subsystemPath), + } + if err := mountNewCgroup(cgroupmount); err != nil { + return err + } + } else { + if err := mountToRootfs(b, rootfs, mountLabel, enableCgroupns); err != nil { + return err + } } } for _, mc := range merged { @@ -396,6 +451,7 @@ func checkMountDestination(rootfs, dest string) error { "/proc/stat", "/proc/swaps", "/proc/uptime", + "/proc/loadavg", "/proc/net/dev", } for _, valid := range validDestinations { @@ -412,7 +468,7 @@ func checkMountDestination(rootfs, dest string) error { if err != nil { return err } - if path == "." || !strings.HasPrefix(path, "..") { + if path != "." && !strings.HasPrefix(path, "..") { return fmt.Errorf("%q cannot be mounted because it is located inside %q", dest, invalid) } } @@ -701,6 +757,41 @@ func pivotRoot(rootfs string) error { } func msMoveRoot(rootfs string) error { + mountinfos, err := mount.GetMounts() + if err != nil { + return err + } + + absRootfs, err := filepath.Abs(rootfs) + if err != nil { + return err + } + + for _, info := range mountinfos { + p, err := filepath.Abs(info.Mountpoint) + if err != nil { + return err + } + // Umount every syfs and proc file systems, except those under the container rootfs + if (info.Fstype != "proc" && info.Fstype != "sysfs") || filepath.HasPrefix(p, absRootfs) { + continue + } + // Be sure umount events are not propagated to the host. + if err := unix.Mount("", p, "", unix.MS_SLAVE|unix.MS_REC, ""); err != nil { + return err + } + if err := unix.Unmount(p, unix.MNT_DETACH); err != nil { + if err != unix.EINVAL && err != unix.EPERM { + return err + } else { + // If we have not privileges for umounting (e.g. rootless), then + // cover the path. + if err := unix.Mount("tmpfs", p, "tmpfs", 0, ""); err != nil { + return err + } + } + } + } if err := unix.Mount(rootfs, "/", "", unix.MS_MOVE, ""); err != nil { return err } @@ -802,10 +893,7 @@ func remount(m *configs.Mount, rootfs string) error { if !strings.HasPrefix(dest, rootfs) { dest = filepath.Join(rootfs, dest) } - if err := unix.Mount(m.Source, dest, m.Device, uintptr(m.Flags|unix.MS_REMOUNT), ""); err != nil { - return err - } - return nil + return unix.Mount(m.Source, dest, m.Device, uintptr(m.Flags|unix.MS_REMOUNT), "") } // Do the mount operation followed by additional mounts required to take care @@ -836,3 +924,18 @@ func mountPropagate(m *configs.Mount, rootfs string, mountLabel string) error { } return nil } + +func mountNewCgroup(m *configs.Mount) error { + var ( + data = m.Data + source = m.Source + ) + if data == "systemd" { + data = cgroups.CgroupNamePrefix + data + source = "systemd" + } + if err := unix.Mount(source, m.Destination, m.Device, uintptr(m.Flags), data); err != nil { + return err + } + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/setns_init_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/setns_init_linux.go index 096c601e76..6613bb65cb 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/setns_init_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/setns_init_linux.go @@ -5,12 +5,14 @@ package libcontainer import ( "fmt" "os" + "runtime" "github.com/opencontainers/runc/libcontainer/apparmor" "github.com/opencontainers/runc/libcontainer/keys" "github.com/opencontainers/runc/libcontainer/seccomp" "github.com/opencontainers/runc/libcontainer/system" "github.com/opencontainers/selinux/go-selinux/label" + "github.com/pkg/errors" "golang.org/x/sys/unix" ) @@ -28,10 +30,19 @@ func (l *linuxSetnsInit) getSessionRingName() string { } func (l *linuxSetnsInit) Init() error { + runtime.LockOSThread() + defer runtime.UnlockOSThread() + if !l.config.Config.NoNewKeyring { - // do not inherit the parent's session keyring + // Do not inherit the parent's session keyring. if _, err := keys.JoinSessionKeyring(l.getSessionRingName()); err != nil { - return err + // Same justification as in standart_init_linux.go as to why we + // don't bail on ENOSYS. + // + // TODO(cyphar): And we should have logging here too. + if errors.Cause(err) != unix.ENOSYS { + return errors.Wrap(err, "join session keyring") + } } } if l.config.CreateConsole { @@ -47,6 +58,10 @@ func (l *linuxSetnsInit) Init() error { return err } } + if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil { + return err + } + defer label.SetProcessLabel("") // Without NoNewPrivileges seccomp is a privileged operation, so we need to // do this before dropping capabilities; otherwise do it as late as possible // just before execve so as few syscalls take place after it as possible. @@ -61,9 +76,6 @@ func (l *linuxSetnsInit) Init() error { if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil { return err } - if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil { - return err - } // Set seccomp as close to execve as possible, so as few syscalls take // place afterward (reducing the amount of syscalls that users need to // enable in their seccomp profiles). diff --git a/vendor/github.com/opencontainers/runc/libcontainer/specconv/example.go b/vendor/github.com/opencontainers/runc/libcontainer/specconv/example.go index c113b337f3..827ca9e781 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/specconv/example.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/specconv/example.go @@ -156,7 +156,7 @@ func Example() *specs.Spec { } // ToRootless converts the given spec file into one that should work with -// rootless containers, by removing incompatible options and adding others that +// rootless containers (euid != 0), by removing incompatible options and adding others that // are needed. func ToRootless(spec *specs.Spec) { var namespaces []specs.LinuxNamespace diff --git a/vendor/github.com/opencontainers/runc/libcontainer/specconv/spec_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/specconv/spec_linux.go index 98fd2e6321..f68cac011a 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/specconv/spec_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/specconv/spec_linux.go @@ -28,6 +28,7 @@ var namespaceMapping = map[specs.LinuxNamespaceType]configs.NamespaceType{ specs.UserNamespace: configs.NEWUSER, specs.IPCNamespace: configs.NEWIPC, specs.UTSNamespace: configs.NEWUTS, + specs.CgroupNamespace: configs.NEWCGROUP, } var mountPropagationMapping = map[string]int{ @@ -148,7 +149,8 @@ type CreateOpts struct { NoPivotRoot bool NoNewKeyring bool Spec *specs.Spec - Rootless bool + RootlessEUID bool + RootlessCgroups bool } // CreateLibcontainerConfig creates a new libcontainer configuration from a @@ -176,13 +178,14 @@ func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) { labels = append(labels, fmt.Sprintf("%s=%s", k, v)) } config := &configs.Config{ - Rootfs: rootfsPath, - NoPivotRoot: opts.NoPivotRoot, - Readonlyfs: spec.Root.Readonly, - Hostname: spec.Hostname, - Labels: append(labels, fmt.Sprintf("bundle=%s", cwd)), - NoNewKeyring: opts.NoNewKeyring, - Rootless: opts.Rootless, + Rootfs: rootfsPath, + NoPivotRoot: opts.NoPivotRoot, + Readonlyfs: spec.Root.Readonly, + Hostname: spec.Hostname, + Labels: append(labels, fmt.Sprintf("bundle=%s", cwd)), + NoNewKeyring: opts.NoNewKeyring, + RootlessEUID: opts.RootlessEUID, + RootlessCgroups: opts.RootlessCgroups, } exists := false @@ -216,7 +219,7 @@ func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) { } config.Namespaces.Add(t, ns.Path) } - if config.Namespaces.Contains(configs.NEWNET) { + if config.Namespaces.Contains(configs.NEWNET) && config.Namespaces.PathOf(configs.NEWNET) == "" { config.Networks = []*configs.Network{ { Type: "loopback", @@ -233,49 +236,56 @@ func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) { config.MountLabel = spec.Linux.MountLabel config.Sysctl = spec.Linux.Sysctl if spec.Linux.Seccomp != nil { - seccomp, err := setupSeccomp(spec.Linux.Seccomp) + seccomp, err := SetupSeccomp(spec.Linux.Seccomp) if err != nil { return nil, err } config.Seccomp = seccomp } + if spec.Linux.IntelRdt != nil { + config.IntelRdt = &configs.IntelRdt{} + if spec.Linux.IntelRdt.L3CacheSchema != "" { + config.IntelRdt.L3CacheSchema = spec.Linux.IntelRdt.L3CacheSchema + } + if spec.Linux.IntelRdt.MemBwSchema != "" { + config.IntelRdt.MemBwSchema = spec.Linux.IntelRdt.MemBwSchema + } + } } - if spec.Process.SelinuxLabel != "" { - config.ProcessLabel = spec.Process.SelinuxLabel - } - if spec.Process != nil && spec.Process.OOMScoreAdj != nil { - config.OomScoreAdj = *spec.Process.OOMScoreAdj - } - if spec.Process.Capabilities != nil { - config.Capabilities = &configs.Capabilities{ - Bounding: spec.Process.Capabilities.Bounding, - Effective: spec.Process.Capabilities.Effective, - Permitted: spec.Process.Capabilities.Permitted, - Inheritable: spec.Process.Capabilities.Inheritable, - Ambient: spec.Process.Capabilities.Ambient, + if spec.Process != nil { + config.OomScoreAdj = spec.Process.OOMScoreAdj + if spec.Process.SelinuxLabel != "" { + config.ProcessLabel = spec.Process.SelinuxLabel + } + if spec.Process.Capabilities != nil { + config.Capabilities = &configs.Capabilities{ + Bounding: spec.Process.Capabilities.Bounding, + Effective: spec.Process.Capabilities.Effective, + Permitted: spec.Process.Capabilities.Permitted, + Inheritable: spec.Process.Capabilities.Inheritable, + Ambient: spec.Process.Capabilities.Ambient, + } } } createHooks(spec, config) config.Version = specs.Version - if spec.Linux.IntelRdt != nil { - config.IntelRdt = &configs.IntelRdt{} - if spec.Linux.IntelRdt.L3CacheSchema != "" { - config.IntelRdt.L3CacheSchema = spec.Linux.IntelRdt.L3CacheSchema - } - } return config, nil } func createLibcontainerMount(cwd string, m specs.Mount) *configs.Mount { flags, pgflags, data, ext := parseMountOptions(m.Options) source := m.Source - if m.Type == "bind" { + device := m.Type + if flags&unix.MS_BIND != 0 { + if device == "" { + device = "bind" + } if !filepath.IsAbs(source) { source = filepath.Join(cwd, m.Source) } } return &configs.Mount{ - Device: m.Type, + Device: device, Source: source, Destination: m.Destination, Data: data, @@ -315,7 +325,7 @@ func createCgroupConfig(opts *CreateOpts) (*configs.Cgroup, error) { // for e.g. "system.slice:docker:1234" parts := strings.Split(myCgroupPath, ":") if len(parts) != 3 { - return nil, fmt.Errorf("expected cgroupsPath to be of format \"slice:prefix:name\" for systemd cgroups") + return nil, fmt.Errorf("expected cgroupsPath to be of format \"slice:prefix:name\" for systemd cgroups, got %q instead", myCgroupPath) } c.Parent = parts[0] c.ScopePrefix = parts[1] @@ -328,12 +338,9 @@ func createCgroupConfig(opts *CreateOpts) (*configs.Cgroup, error) { c.Path = myCgroupPath } - // In rootless containers, any attempt to make cgroup changes will fail. - // libcontainer will validate this and we shouldn't add any cgroup options - // the user didn't specify. - if !opts.Rootless { - c.Resources.AllowedDevices = allowedDevices - } + // In rootless containers, any attempt to make cgroup changes is likely to fail. + // libcontainer will validate this but ignores the error. + c.Resources.AllowedDevices = allowedDevices if spec.Linux != nil { r := spec.Linux.Resources if r == nil { @@ -486,10 +493,8 @@ func createCgroupConfig(opts *CreateOpts) (*configs.Cgroup, error) { } } } - if !opts.Rootless { - // append the default allowed devices to the end of the list - c.Resources.Devices = append(c.Resources.Devices, allowedDevices...) - } + // append the default allowed devices to the end of the list + c.Resources.Devices = append(c.Resources.Devices, allowedDevices...) return c, nil } @@ -732,7 +737,7 @@ func parseMountOptions(options []string) (int, []int, string, int) { return flag, pgflag, strings.Join(data, ","), extFlags } -func setupSeccomp(config *specs.LinuxSeccomp) (*configs.Seccomp, error) { +func SetupSeccomp(config *specs.LinuxSeccomp) (*configs.Seccomp, error) { if config == nil { return nil, nil } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/standard_init_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/standard_init_linux.go index e74d800256..ad7ee8d8c8 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/standard_init_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/standard_init_linux.go @@ -6,6 +6,7 @@ import ( "fmt" "os" "os/exec" + "runtime" "syscall" //only for Exec "github.com/opencontainers/runc/libcontainer/apparmor" @@ -14,6 +15,7 @@ import ( "github.com/opencontainers/runc/libcontainer/seccomp" "github.com/opencontainers/runc/libcontainer/system" "github.com/opencontainers/selinux/go-selinux/label" + "github.com/pkg/errors" "golang.org/x/sys/unix" ) @@ -43,17 +45,31 @@ func (l *linuxStandardInit) getSessionRingParams() (string, uint32, uint32) { } func (l *linuxStandardInit) Init() error { + runtime.LockOSThread() + defer runtime.UnlockOSThread() if !l.config.Config.NoNewKeyring { ringname, keepperms, newperms := l.getSessionRingParams() // Do not inherit the parent's session keyring. - sessKeyId, err := keys.JoinSessionKeyring(ringname) - if err != nil { - return err - } - // Make session keyring searcheable. - if err := keys.ModKeyringPerm(sessKeyId, keepperms, newperms); err != nil { - return err + if sessKeyId, err := keys.JoinSessionKeyring(ringname); err != nil { + // If keyrings aren't supported then it is likely we are on an + // older kernel (or inside an LXC container). While we could bail, + // the security feature we are using here is best-effort (it only + // really provides marginal protection since VFS credentials are + // the only significant protection of keyrings). + // + // TODO(cyphar): Log this so people know what's going on, once we + // have proper logging in 'runc init'. + if errors.Cause(err) != unix.ENOSYS { + return errors.Wrap(err, "join session keyring") + } + } else { + // Make session keyring searcheable. If we've gotten this far we + // bail on any error -- we don't want to have a keyring with bad + // permissions. + if err := keys.ModKeyringPerm(sessKeyId, keepperms, newperms); err != nil { + return errors.Wrap(err, "mod keyring permissions") + } } } @@ -76,7 +92,7 @@ func (l *linuxStandardInit) Init() error { return err } if err := system.Setctty(); err != nil { - return err + return errors.Wrap(err, "setctty") } } @@ -89,46 +105,47 @@ func (l *linuxStandardInit) Init() error { if hostname := l.config.Config.Hostname; hostname != "" { if err := unix.Sethostname([]byte(hostname)); err != nil { - return err + return errors.Wrap(err, "sethostname") } } if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil { - return err - } - if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil { - return err + return errors.Wrap(err, "apply apparmor profile") } for key, value := range l.config.Config.Sysctl { if err := writeSystemProperty(key, value); err != nil { - return err + return errors.Wrapf(err, "write sysctl key %s", key) } } for _, path := range l.config.Config.ReadonlyPaths { if err := readonlyPath(path); err != nil { - return err + return errors.Wrapf(err, "readonly path %s", path) } } for _, path := range l.config.Config.MaskPaths { if err := maskPath(path, l.config.Config.MountLabel); err != nil { - return err + return errors.Wrapf(err, "mask path %s", path) } } pdeath, err := system.GetParentDeathSignal() if err != nil { - return err + return errors.Wrap(err, "get pdeath signal") } if l.config.NoNewPrivileges { if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil { - return err + return errors.Wrap(err, "set nonewprivileges") } } // Tell our parent that we're ready to Execv. This must be done before the // Seccomp rules have been applied, because we need to be able to read and // write to a socket. if err := syncParentReady(l.pipe); err != nil { - return err + return errors.Wrap(err, "sync ready") + } + if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil { + return errors.Wrap(err, "set process label") } + defer label.SetProcessLabel("") // Without NoNewPrivileges seccomp is a privileged operation, so we need to // do this before dropping capabilities; otherwise do it as late as possible // just before execve so as few syscalls take place after it as possible. @@ -143,7 +160,7 @@ func (l *linuxStandardInit) Init() error { // finalizeNamespace can change user/group which clears the parent death // signal, so we restore it here. if err := pdeath.Restore(); err != nil { - return err + return errors.Wrap(err, "restore pdeath signal") } // Compare the parent from the initial start of the init process and make // sure that it did not change. if the parent changes that means it died diff --git a/vendor/github.com/opencontainers/runc/libcontainer/state_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/state_linux.go index b45ce23e4a..5c16a423f7 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/state_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/state_linux.go @@ -8,7 +8,6 @@ import ( "path/filepath" "github.com/opencontainers/runc/libcontainer/configs" - "github.com/opencontainers/runc/libcontainer/utils" "github.com/sirupsen/logrus" "golang.org/x/sys/unix" @@ -63,12 +62,9 @@ func destroy(c *linuxContainer) error { func runPoststopHooks(c *linuxContainer) error { if c.config.Hooks != nil { - bundle, annotations := utils.Annotations(c.config.Labels) - s := configs.HookState{ - Version: c.config.Version, - ID: c.id, - Bundle: bundle, - Annotations: annotations, + s, err := c.currentOCIState() + if err != nil { + return err } for _, hook := range c.config.Hooks.Poststop { if err := hook.Run(s); err != nil { diff --git a/vendor/github.com/opencontainers/runc/libcontainer/sync.go b/vendor/github.com/opencontainers/runc/libcontainer/sync.go index cf7b45bc32..a8704a2679 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/sync.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/sync.go @@ -41,10 +41,7 @@ type syncT struct { // writeSync is used to write to a synchronisation pipe. An error is returned // if there was a problem writing the payload. func writeSync(pipe io.Writer, sync syncType) error { - if err := utils.WriteJSON(pipe, syncT{sync}); err != nil { - return err - } - return nil + return utils.WriteJSON(pipe, syncT{sync}) } // readSync is used to read from a synchronisation pipe. An error is returned diff --git a/vendor/github.com/opencontainers/runc/libcontainer/system/linux.go b/vendor/github.com/opencontainers/runc/libcontainer/system/linux.go index 5f124cd8bb..a4ae8901ac 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/system/linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/system/linux.go @@ -3,13 +3,12 @@ package system import ( - "bufio" - "fmt" "os" "os/exec" "syscall" // only for exec "unsafe" + "github.com/opencontainers/runc/libcontainer/user" "golang.org/x/sys/unix" ) @@ -102,34 +101,43 @@ func Setctty() error { } // RunningInUserNS detects whether we are currently running in a user namespace. -// Copied from github.com/lxc/lxd/shared/util.go +// Originally copied from github.com/lxc/lxd/shared/util.go func RunningInUserNS() bool { - file, err := os.Open("/proc/self/uid_map") + uidmap, err := user.CurrentProcessUIDMap() if err != nil { // This kernel-provided file only exists if user namespaces are supported return false } - defer file.Close() - - buf := bufio.NewReader(file) - l, _, err := buf.ReadLine() - if err != nil { - return false - } + return UIDMapInUserNS(uidmap) +} - line := string(l) - var a, b, c int64 - fmt.Sscanf(line, "%d %d %d", &a, &b, &c) +func UIDMapInUserNS(uidmap []user.IDMap) bool { /* * We assume we are in the initial user namespace if we have a full * range - 4294967295 uids starting at uid 0. */ - if a == 0 && b == 0 && c == 4294967295 { + if len(uidmap) == 1 && uidmap[0].ID == 0 && uidmap[0].ParentID == 0 && uidmap[0].Count == 4294967295 { return false } return true } +// GetParentNSeuid returns the euid within the parent user namespace +func GetParentNSeuid() int64 { + euid := int64(os.Geteuid()) + uidmap, err := user.CurrentProcessUIDMap() + if err != nil { + // This kernel-provided file only exists if user namespaces are supported + return euid + } + for _, um := range uidmap { + if um.ID <= euid && euid <= um.ID+um.Count-1 { + return um.ParentID + euid - um.ID + } + } + return euid +} + // SetSubreaper sets the value i as the subreaper setting for the calling process func SetSubreaper(i int) error { return unix.Prctl(PR_SET_CHILD_SUBREAPER, uintptr(i), 0, 0, 0) diff --git a/vendor/github.com/opencontainers/runc/libcontainer/system/unsupported.go b/vendor/github.com/opencontainers/runc/libcontainer/system/unsupported.go index e7cfd62b29..b94be74a66 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/system/unsupported.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/system/unsupported.go @@ -2,8 +2,26 @@ package system +import ( + "os" + + "github.com/opencontainers/runc/libcontainer/user" +) + // RunningInUserNS is a stub for non-Linux systems // Always returns false func RunningInUserNS() bool { return false } + +// UIDMapInUserNS is a stub for non-Linux systems +// Always returns false +func UIDMapInUserNS(uidmap []user.IDMap) bool { + return false +} + +// GetParentNSeuid returns the euid within the parent user namespace +// Always returns os.Geteuid on non-linux +func GetParentNSeuid() int { + return os.Geteuid() +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/user/lookup_unix.go b/vendor/github.com/opencontainers/runc/libcontainer/user/lookup_unix.go index c45e300411..92b5ae8de0 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/user/lookup_unix.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/user/lookup_unix.go @@ -5,6 +5,7 @@ package user import ( "io" "os" + "strconv" "golang.org/x/sys/unix" ) @@ -114,3 +115,30 @@ func CurrentUser() (User, error) { func CurrentGroup() (Group, error) { return LookupGid(unix.Getgid()) } + +func currentUserSubIDs(fileName string) ([]SubID, error) { + u, err := CurrentUser() + if err != nil { + return nil, err + } + filter := func(entry SubID) bool { + return entry.Name == u.Name || entry.Name == strconv.Itoa(u.Uid) + } + return ParseSubIDFileFilter(fileName, filter) +} + +func CurrentUserSubUIDs() ([]SubID, error) { + return currentUserSubIDs("/etc/subuid") +} + +func CurrentUserSubGIDs() ([]SubID, error) { + return currentUserSubIDs("/etc/subgid") +} + +func CurrentProcessUIDMap() ([]IDMap, error) { + return ParseIDMapFile("/proc/self/uid_map") +} + +func CurrentProcessGIDMap() ([]IDMap, error) { + return ParseIDMapFile("/proc/self/gid_map") +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/user/user.go b/vendor/github.com/opencontainers/runc/libcontainer/user/user.go index 93414516ca..7b912bbf8b 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/user/user.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/user/user.go @@ -75,12 +75,29 @@ func groupFromOS(g *user.Group) (Group, error) { return newGroup, nil } +// SubID represents an entry in /etc/sub{u,g}id +type SubID struct { + Name string + SubID int64 + Count int64 +} + +// IDMap represents an entry in /proc/PID/{u,g}id_map +type IDMap struct { + ID int64 + ParentID int64 + Count int64 +} + func parseLine(line string, v ...interface{}) { - if line == "" { + parseParts(strings.Split(line, ":"), v...) +} + +func parseParts(parts []string, v ...interface{}) { + if len(parts) == 0 { return } - parts := strings.Split(line, ":") for i, p := range parts { // Ignore cases where we don't have enough fields to populate the arguments. // Some configuration files like to misbehave. @@ -96,6 +113,8 @@ func parseLine(line string, v ...interface{}) { case *int: // "numbers", with conversion errors ignored because of some misbehaving configuration files. *e, _ = strconv.Atoi(p) + case *int64: + *e, _ = strconv.ParseInt(p, 10, 64) case *[]string: // Comma-separated lists. if p != "" { @@ -105,7 +124,7 @@ func parseLine(line string, v ...interface{}) { } default: // Someone goof'd when writing code using this function. Scream so they can hear us. - panic(fmt.Sprintf("parseLine only accepts {*string, *int, *[]string} as arguments! %#v is not a pointer!", e)) + panic(fmt.Sprintf("parseLine only accepts {*string, *int, *int64, *[]string} as arguments! %#v is not a pointer!", e)) } } } @@ -479,3 +498,111 @@ func GetAdditionalGroupsPath(additionalGroups []string, groupPath string) ([]int } return GetAdditionalGroups(additionalGroups, group) } + +func ParseSubIDFile(path string) ([]SubID, error) { + subid, err := os.Open(path) + if err != nil { + return nil, err + } + defer subid.Close() + return ParseSubID(subid) +} + +func ParseSubID(subid io.Reader) ([]SubID, error) { + return ParseSubIDFilter(subid, nil) +} + +func ParseSubIDFileFilter(path string, filter func(SubID) bool) ([]SubID, error) { + subid, err := os.Open(path) + if err != nil { + return nil, err + } + defer subid.Close() + return ParseSubIDFilter(subid, filter) +} + +func ParseSubIDFilter(r io.Reader, filter func(SubID) bool) ([]SubID, error) { + if r == nil { + return nil, fmt.Errorf("nil source for subid-formatted data") + } + + var ( + s = bufio.NewScanner(r) + out = []SubID{} + ) + + for s.Scan() { + if err := s.Err(); err != nil { + return nil, err + } + + line := strings.TrimSpace(s.Text()) + if line == "" { + continue + } + + // see: man 5 subuid + p := SubID{} + parseLine(line, &p.Name, &p.SubID, &p.Count) + + if filter == nil || filter(p) { + out = append(out, p) + } + } + + return out, nil +} + +func ParseIDMapFile(path string) ([]IDMap, error) { + r, err := os.Open(path) + if err != nil { + return nil, err + } + defer r.Close() + return ParseIDMap(r) +} + +func ParseIDMap(r io.Reader) ([]IDMap, error) { + return ParseIDMapFilter(r, nil) +} + +func ParseIDMapFileFilter(path string, filter func(IDMap) bool) ([]IDMap, error) { + r, err := os.Open(path) + if err != nil { + return nil, err + } + defer r.Close() + return ParseIDMapFilter(r, filter) +} + +func ParseIDMapFilter(r io.Reader, filter func(IDMap) bool) ([]IDMap, error) { + if r == nil { + return nil, fmt.Errorf("nil source for idmap-formatted data") + } + + var ( + s = bufio.NewScanner(r) + out = []IDMap{} + ) + + for s.Scan() { + if err := s.Err(); err != nil { + return nil, err + } + + line := strings.TrimSpace(s.Text()) + if line == "" { + continue + } + + // see: man 7 user_namespaces + p := IDMap{} + parseParts(strings.Fields(line), &p.ID, &p.ParentID, &p.Count) + + if filter == nil || filter(p) { + out = append(out, p) + } + } + + return out, nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go index baa54c9ba2..40ccfaa1a0 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go @@ -1,8 +1,6 @@ package utils import ( - "crypto/rand" - "encoding/hex" "encoding/json" "io" "os" @@ -17,19 +15,6 @@ const ( exitSignalOffset = 128 ) -// GenerateRandomName returns a new name joined with a prefix. This size -// specified is used to truncate the randomly generated value -func GenerateRandomName(prefix string, size int) (string, error) { - id := make([]byte, 32) - if _, err := io.ReadFull(rand.Reader, id); err != nil { - return "", err - } - if size > 64 { - size = 64 - } - return prefix + hex.EncodeToString(id)[:size], nil -} - // ResolveRootfs ensures that the current working directory is // not a symlink and returns the absolute path to the rootfs func ResolveRootfs(uncleanRootfs string) (string, error) {