// Config defines configuration options for executing a process inside a contained environment. type Config struct { ... // Namespaces specifies the container's namespaces that it should setup when cloning the init process // If a namespace is not provided that namespace is shared from the container's parent process Namespaces Namespaces `json:"namespaces"` // UidMappings is an array of User ID mappings for User Namespaces UidMappings []IDMap `json:"uid_mappings"` // GidMappings is an array of Group ID mappings for User Namespaces GidMappings []IDMap `json:"gid_mappings"` ... }
runC中namespace的源码主要在: runc/libcontainer/configs/namespaces_unix.go runC支持的namespace type包括($nsName) "net"、"mnt"、"pid"、"ipc"、"user"、"uts":
// Namespace type identifiers. Each value is the string form of the
// identifier itself; namespaceInfo maps these identifiers to the
// corresponding CLONE_NEW* clone flags.
const (
	NEWNET  NamespaceType = "NEWNET"
	NEWPID  NamespaceType = "NEWPID"
	NEWNS   NamespaceType = "NEWNS"
	NEWUTS  NamespaceType = "NEWUTS"
	NEWIPC  NamespaceType = "NEWIPC"
	NEWUSER NamespaceType = "NEWUSER"
)
除了验证 Namespace Type 是否在以上常量中,还要验证 /proc/self/ns/$nsName 是否存在并且可读;两项都通过时,才认为该 Namespace 在当前系统中是被支持的。
root@cloud:~/iso# ls /proc/self/ns/ -al total 0 dr-x--x--x 2 root root 0 Dec 4 14:51 . dr-xr-xr-x 9 root root 0 Dec 4 14:51 .. lrwxrwxrwx 1 root root 0 Dec 4 14:51 cgroup -> 'cgroup:[4026531835]' lrwxrwxrwx 1 root root 0 Dec 4 14:51 ipc -> 'ipc:[4026531839]' lrwxrwxrwx 1 root root 0 Dec 4 14:51 mnt -> 'mnt:[4026531840]' lrwxrwxrwx 1 root root 0 Dec 4 14:51 net -> 'net:[4026531896]' lrwxrwxrwx 1 root root 0 Dec 4 14:51 pid -> 'pid:[4026531836]' lrwxrwxrwx 1 root root 0 Dec 4 14:51 pid_for_children -> 'pid:[4026531836]' lrwxrwxrwx 1 root root 0 Dec 4 14:51 user -> 'user:[4026531837]' lrwxrwxrwx 1 root root 0 Dec 4 14:51 uts -> 'uts:[4026531838]' root@cloud:~/iso#
root@cloud:~/iso# unshare -m -u --propagation unchanged /bin/bash root@cloud:~/iso# ls /proc/self/ns/ -al total 0 dr-x--x--x 2 root root 0 Dec 4 14:52 . dr-xr-xr-x 9 root root 0 Dec 4 14:52 .. lrwxrwxrwx 1 root root 0 Dec 4 14:52 cgroup -> 'cgroup:[4026531835]' lrwxrwxrwx 1 root root 0 Dec 4 14:52 ipc -> 'ipc:[4026531839]' lrwxrwxrwx 1 root root 0 Dec 4 14:52 mnt -> 'mnt:[4026533784]' lrwxrwxrwx 1 root root 0 Dec 4 14:52 net -> 'net:[4026531896]' lrwxrwxrwx 1 root root 0 Dec 4 14:52 pid -> 'pid:[4026531836]' lrwxrwxrwx 1 root root 0 Dec 4 14:52 pid_for_children -> 'pid:[4026531836]' lrwxrwxrwx 1 root root 0 Dec 4 14:52 user -> 'user:[4026531837]' lrwxrwxrwx 1 root root 0 Dec 4 14:52 uts -> 'uts:[4026533786]' root@cloud:~/iso#
如下是Namespace的完整定义,很简单,只包括NamespaceType 和对应的Path。
// Namespace defines configuration for each namespace. It specifies an
// alternate path that is able to be joined via setns.
type Namespace struct {
	// Type identifies which namespace this entry configures.
	Type NamespaceType `json:"type"`
	// Path, when non-empty, is returned by GetPath in place of the
	// /proc/<pid>/ns/<name> path.
	Path string `json:"path"`
}
从Namespace的GetPath方法中可见,一个pid对应的namespace path为 /proc/$pid/ns/$nsName。
func (n *Namespace) GetPath(pid int) string { if n.Path != "" { return n.Path } return fmt.Sprintf("/proc/%d/ns/%s", pid, NsName(n.Type)) }
root@cloud:~/iso# ls /proc/$$/ns/ -al total 0 dr-x--x--x 2 root root 0 Dec 4 14:55 . dr-xr-xr-x 9 root root 0 Dec 4 14:55 .. lrwxrwxrwx 1 root root 0 Dec 4 14:55 cgroup -> 'cgroup:[4026531835]' lrwxrwxrwx 1 root root 0 Dec 4 14:55 ipc -> 'ipc:[4026531839]' lrwxrwxrwx 1 root root 0 Dec 4 14:55 mnt -> 'mnt:[4026533784]' lrwxrwxrwx 1 root root 0 Dec 4 14:55 net -> 'net:[4026531896]' lrwxrwxrwx 1 root root 0 Dec 4 14:55 pid -> 'pid:[4026531836]' lrwxrwxrwx 1 root root 0 Dec 4 14:55 pid_for_children -> 'pid:[4026531836]' lrwxrwxrwx 1 root root 0 Dec 4 14:55 user -> 'user:[4026531837]' lrwxrwxrwx 1 root root 0 Dec 4 14:55 uts -> 'uts:[4026533786]' root@cloud:~/iso#
func (c *linuxContainer) start(process *Process) error { //if false == cPathExists("/run/sockets/qemu_pipe") { // return newSystemErrorWithCausef(nil, "mount bind /run/sockets failed %s , /run/sockets/qemu_pipe not exist", c.config.Rootfs) //} //input_dir := filepath.Join(c.config.Rootfs, "/vmi/sockets") //if err := os.MkdirAll(input_dir, 0777); err != nil { // return newSystemErrorWithCause(err, "mkdir rootfs/sockets/") //} //if err := unix.Mount("/run/sockets/qemu_pipe", input_dir, "", unix.MS_REC|unix.MS_BIND, ""); err != nil { // return newSystemErrorWithCausef(err, "mount bind /run/sockets failed %s", c.config.Rootfs) //} parent, err := c.newParentProcess(process) if err != nil { return newSystemErrorWithCause(err, "creating new parent process") } parent.forwardChildLogs() if err := parent.start(); err != nil { // terminate the process to ensure that it properly is reaped. if err := ignoreTerminateErrors(parent.terminate()); err != nil { logrus.Warn(err) } return newSystemErrorWithCause(err, "starting container process") } // generate a timestamp indicating when the container was started c.created = time.Now().UTC() if process.Init { c.state = &createdState{ c: c, } state, err := c.updateState(parent) if err != nil { return err } c.initProcessStartTime = state.InitProcessStartTime if c.config.Hooks != nil { s, err := c.currentOCIState() if err != nil { return err } for i, hook := range c.config.Hooks.Poststart { if err := hook.Run(s); err != nil { if err := ignoreTerminateErrors(parent.terminate()); err != nil { logrus.Warn(err) } return newSystemErrorWithCausef(err, "running poststart hook %d", i) } } } } return nil }
func (c *linuxContainer) updateState(process parentProcess) (*State, error) { if process != nil { c.initProcess = process } state, err := c.currentState() if err != nil { return nil, err } err = c.saveState(state) if err != nil { return nil, err } return state, nil }
func (c *linuxContainer) currentState() (*State, error) { var ( startTime uint64 externalDescriptors []string pid = -1 ) if c.initProcess != nil { pid = c.initProcess.pid() startTime, _ = c.initProcess.startTime() externalDescriptors = c.initProcess.externalDescriptors() } intelRdtPath, err := intelrdt.GetIntelRdtPath(c.ID()) if err != nil { intelRdtPath = "" } state := &State{ BaseState: BaseState{ ID: c.ID(), Config: *c.config, InitProcessPid: pid, InitProcessStartTime: startTime, Created: c.created, }, Rootless: c.config.RootlessEUID && c.config.RootlessCgroups, CgroupPaths: c.cgroupManager.GetPaths(), IntelRdtPath: intelRdtPath, NamespacePaths: make(map[configs.NamespaceType]string), ExternalDescriptors: externalDescriptors, } if pid > 0 { for _, ns := range c.config.Namespaces { state.NamespacePaths[ns.Type] = ns.GetPath(pid) } for _, nsType := range configs.NamespaceTypes() { if !configs.IsNamespaceSupported(nsType) { continue } if _, ok := state.NamespacePaths[nsType]; !ok { ns := configs.Namespace{Type: nsType} state.NamespacePaths[ns.Type] = ns.GetPath(pid) } } } return state, nil }
除此之外,还定义了以下常用方法:
func (n *Namespaces) Remove(t NamespaceType) bool {...} func (n *Namespaces) Add(t NamespaceType, path string) {...} func (n *Namespaces) index(t NamespaceType) int {...} func (n *Namespaces) Contains(t NamespaceType) bool {...} func (n *Namespaces) PathOf(t NamespaceType) string {...}
在runc/libcontainer/configs/namespaces_syscall.go中,定义了linux clone时这些namespace对应的clone flags。
// namespaceInfo maps each namespace type to the clone flag that creates it.
var namespaceInfo = map[NamespaceType]int{
	NEWNET:  syscall.CLONE_NEWNET,
	NEWNS:   syscall.CLONE_NEWNS,
	NEWUSER: syscall.CLONE_NEWUSER,
	NEWIPC:  syscall.CLONE_NEWIPC,
	NEWUTS:  syscall.CLONE_NEWUTS,
	NEWPID:  syscall.CLONE_NEWPID,
}

// CloneFlags parses the container's Namespaces options to set the correct
// flags on clone, unshare. Only namespaces without a Path contribute a
// flag: an entry with a Path is skipped, so the result covers new
// namespaces only.
func (n *Namespaces) CloneFlags() uintptr {
	var flags int
	for _, ns := range *n {
		if ns.Path == "" {
			flags |= namespaceInfo[ns.Type]
		}
	}
	return uintptr(flags)
}
上面的CloneFlags()方法是用来解析linuxContainer的config中的namespace相关的参数,生成clone flags,提供给linuxContainer.bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string) 来封装。
// bootstrapData encodes the data needed to bootstrap the container's init
// process in netlink binary format and returns it as an io.Reader. A
// consumer can write it to a bootstrap program (such as one that uses the
// nsenter package) so that init starts with the correct namespaces and
// uid/gid mappings.
//
// The message always carries the clone flags. It carries the ordered,
// comma-joined namespace paths when nsMaps is non-empty. Uid/gid mappings
// are added only when nsMaps has no entry for the user namespace.
func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string) (io.Reader, error) {
	// Start the netlink message and record the clone flags.
	req := nl.NewNetlinkRequest(int(InitMsg), 0)
	req.AddData(&Int32msg{
		Type:  CloneFlagsAttr,
		Value: uint32(cloneFlags),
	})

	// Custom namespace paths to join, if any.
	if len(nsMaps) > 0 {
		ordered, err := c.orderNamespacePaths(nsMaps)
		if err != nil {
			return nil, err
		}
		req.AddData(&Bytemsg{
			Type:  NsPathsAttr,
			Value: []byte(strings.Join(ordered, ",")),
		})
	}

	// ID mappings are written only when we are not joining an existing
	// user namespace.
	if _, joinExistingUser := nsMaps[configs.NEWUSER]; joinExistingUser {
		return bytes.NewReader(req.Serialize()), nil
	}

	// uid mappings
	if len(c.config.UidMappings) > 0 {
		encoded, err := encodeIDMapping(c.config.UidMappings)
		if err != nil {
			return nil, err
		}
		req.AddData(&Bytemsg{
			Type:  UidmapAttr,
			Value: encoded,
		})
	}

	// gid mappings
	if len(c.config.GidMappings) > 0 {
		encoded, err := encodeIDMapping(c.config.GidMappings)
		if err != nil {
			return nil, err
		}
		req.AddData(&Bytemsg{
			Type:  GidmapAttr,
			Value: encoded,
		})

		// Add SetgroupAttr when this process lacks an effective CAP_SETGID.
		caps, err := capability.NewPid(os.Getpid())
		if err != nil {
			return nil, err
		}
		if !caps.Get(capability.EFFECTIVE, capability.CAP_SETGID) {
			req.AddData(&Boolmsg{
				Type:  SetgroupAttr,
				Value: true,
			})
		}
	}

	return bytes.NewReader(req.Serialize()), nil
}
linuxContainer.newInitProcess(...)最终会使用linuxContainer.bootstrapData封装的clone flags数据,完成initProcess的构建。
func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe, rootDir *os.File) (*initProcess, error) { cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard)) nsMaps := make(map[configs.NamespaceType]string) for _, ns := range c.config.Namespaces { if ns.Path != "" { nsMaps[ns.Type] = ns.Path } } _, sharePidns := nsMaps[configs.NEWPID] data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps) if err != nil { return nil, err } p.consoleChan = make(chan *os.File, 1) return &initProcess{ cmd: cmd, childPipe: childPipe, parentPipe: parentPipe, manager: c.cgroupManager, config: c.newInitConfig(p), container: c, process: p, bootstrapData: data, sharePidns: sharePidns, rootDir: rootDir, }, nil }
func (p *initProcess) start() error { defer p.messageSockPair.parent.Close() err := p.cmd.Start() p.process.ops = p // close the write-side of the pipes (controlled by child) p.messageSockPair.child.Close() p.logFilePair.child.Close() if err != nil { p.process.ops = nil return newSystemErrorWithCause(err, "starting init process command") } // Do this before syncing with child so that no children can escape the // cgroup. We don't need to worry about not doing this and not being root // because we'd be using the rootless cgroup manager in that case. if err := p.manager.Apply(p.pid()); err != nil { return newSystemErrorWithCause(err, "applying cgroup configuration for process") } if p.intelRdtManager != nil { if err := p.intelRdtManager.Apply(p.pid()); err != nil { return newSystemErrorWithCause(err, "applying Intel RDT configuration for process") } } defer func() { if err != nil { // TODO: should not be the responsibility to call here p.manager.Destroy() if p.intelRdtManager != nil { p.intelRdtManager.Destroy() } } }() if _, err := io.Copy(p.messageSockPair.parent, p.bootstrapData); err != nil { return newSystemErrorWithCause(err, "copying bootstrap data to pipe") } childPid, err := p.getChildPid() if err != nil { return newSystemErrorWithCause(err, "getting the final child's pid from pipe") }
io.Copy
将 p.bootstrapData 中的数据通过 p.messageSockPair.parent 发送给子进程
newInitProcess(...)在整个container create的流程中的位置,请参考:runC源码分析之Create/Run Container —— 王涛 如此,namespace在整个container create/run中的源码分析就完整了。
补充:runC中container的Spec是从bundle/config.json中解析得到的,见runC的create.go中的setupSpec(context)的调用。
Action: func(context *cli.Context) error {
if context.NArg() != 1 {
fmt.Printf("Incorrect Usage.\n\n")
cli.ShowCommandHelp(context, "create")
return fmt.Errorf("runc: \"create\" requires exactly one argument")
}
if err := revisePidFile(context); err != nil {
return err
}
spec, err := setupSpec(context)
if err != nil {
return err
}
status, err := startContainer(context, spec, true)
if err != nil {
return err
}
setupSpec(context)会去loadSpec("config.json"):
// setupSpec performs initial setup based on the cli.Context for the
// container. It changes the working directory to the "bundle" option when
// that is non-empty, loads the spec via loadSpec(specConfig), calls
// setupSdNotify when NOTIFY_SOCKET is set in the environment, and returns
// an error unless the effective user ID is 0.
func setupSpec(context *cli.Context) (*specs.Spec, error) {
	if dir := context.String("bundle"); dir != "" {
		if err := os.Chdir(dir); err != nil {
			return nil, err
		}
	}

	loaded, err := loadSpec(specConfig)
	if err != nil {
		return nil, err
	}

	if sock := os.Getenv("NOTIFY_SOCKET"); sock != "" {
		setupSdNotify(loaded, sock)
	}

	// The root check runs last, after the spec has been loaded.
	if os.Geteuid() != 0 {
		return nil, fmt.Errorf("runc should be run as root")
	}
	return loaded, nil
}
config.json样例如下,namespace部分见 “.linux.namespaces”。
{
"ociVersion": "0.4.0",
"platform": {
"os": "linux",
"arch": "amd64"
},
"process": {
"terminal": true,
"user": {},
"args": [
"redis-server",
"--bind",
"0.0.0.0"
],
"env": [
"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
"TERM=xterm"
],
"cwd": "/",
"capabilities": [
"CAP_AUDIT_WRITE",
"CAP_KILL",
"CAP_NET_BIND_SERVICE"
],
"rlimits": [
{
"type": "RLIMIT_NOFILE",
"hard": 1024,
"soft": 1024
}
],
"noNewPrivileges": true
},
"root": {
"path": "rootfs",
"readonly": true
},
"hostname": "runc",
"mounts": [
{
"destination": "/proc",
"type": "proc",
"source": "proc"
},
{
"destination": "/dev",
"type": "tmpfs",
"source": "tmpfs",
"options": [
"nosuid",
"strictatime",
"mode=755",
"size=65536k"
]
},
{
"destination": "/dev/pts",
"type": "devpts",
"source": "devpts",
"options": [
"nosuid",
"noexec",
"newinstance",
"ptmxmode=0666",
"mode=0620",
"gid=5"
]
},
{
"destination": "/dev/shm",
"type": "tmpfs",
"source": "shm",
"options": [
"nosuid",
"noexec",
"nodev",
"mode=1777",
"size=65536k"
]
},
{
"destination": "/dev/mqueue",
"type": "mqueue",
"source": "mqueue",
"options": [
"nosuid",
"noexec",
"nodev"
]
},
{
"destination": "/sys",
"type": "sysfs",
"source": "sysfs",
"options": [
"nosuid",
"noexec",