in tensorpipe/channel/cma/context_impl.cc [131:302]
std::shared_ptr<ContextImpl> ContextImpl::create() {
int rv;
std::ostringstream oss;
oss << kDomainDescriptorPrefix;
// This transport only works across processes on the same machine, and we
// detect that by computing the boot ID.
optional<std::string> bootID = getBootID();
TP_THROW_ASSERT_IF(!bootID.has_value()) << "Unable to read boot_id";
oss << bootID.value();
// An endpoint can see the other through its PID if the latter is in a child
// PID namespace of the former. Since the channel is bidirectional this must
// be symmetric and thus the PID namespaces must be the same.
optional<std::string> pidNsID = getLinuxNamespaceId(LinuxNamespace::kPid);
if (!pidNsID.has_value()) {
TP_VLOG(5) << "Unable to read pid namespace ID";
return nullptr;
}
oss << '_' << pidNsID.value();
// The ability to call process_vm_readv on a target is controlled by the
// PTRACE_MODE_ATTACH_REALCREDS check (see process_vm_readv(2)). We'll go
// through its checklist, step by step (which is found in ptrace(2)). We will
// ignore the CAP_SYS_PTRACE conditions (i.e., we'll assume we don't have that
// capability) because they are hard to check, and typically not needed.
// We'll skip the check on whether the endpoints are two threads of the same
// process (in which case ptrace is always allowed) because it's hard to fit
// it in the descriptor and because we have some other more specialized
// channels for that case.
// The next step involves comparing user and group IDs. If the processes are
// in user namespaces the kernel first maps these IDs back to the top-level
// ("initial") ones and compares those. We can't do such mapping, thus we
// compare the IDs as integers as we see them and thus for this to work
// properly we require that the two endpoints are in the same user namespace.
// This does not in fact constitute an extra restriction since the later
// commoncap/capability LSM check will need to enforce this too.
optional<std::string> userNsID = getLinuxNamespaceId(LinuxNamespace::kUser);
if (!userNsID.has_value()) {
TP_VLOG(5) << "Unable to read user namespace ID";
return nullptr;
}
oss << '_' << userNsID.value();
// It is required that our *real* user ID matches the real, effective and
// saved-set user IDs of the target. And the same must hold for group IDs.
// As the channel is bidirectional, the reverse must also hold, which means
// our real, effective and saved-set IDs must all be equal and must match the
// other endpoint's ones.
uid_t realUserId, effectiveUserId, savedSetUserId;
gid_t realGroupId, effectiveGroupId, savedSetGroupId;
rv = ::getresuid(&realUserId, &effectiveUserId, &savedSetUserId);
TP_THROW_SYSTEM_IF(rv < 0, errno);
rv = ::getresgid(&realGroupId, &effectiveGroupId, &savedSetGroupId);
TP_THROW_SYSTEM_IF(rv < 0, errno);
if (realUserId != effectiveUserId || realUserId != savedSetUserId ||
realGroupId != effectiveGroupId || realGroupId != savedSetGroupId) {
TP_VLOG(5) << "User IDs or group IDs aren't all equal. User IDs are "
<< realUserId << " (real), " << effectiveUserId
<< " (effective) and " << savedSetUserId
<< " (saved-set). Group IDs are " << realGroupId << " (real), "
<< effectiveGroupId << " (effective) and " << savedSetGroupId
<< " (saved-set).";
return nullptr;
}
oss << '_' << realUserId << '_' << realGroupId;
// The target must be dumpable. Which, due to symmetry, means we must be
// dumpable too.
rv = ::prctl(PR_GET_DUMPABLE, 0, 0, 0, 0);
TP_THROW_SYSTEM_IF(rv < 0, errno);
// SUID_DUMP_USER has a value of 1.
if (rv != 1) {
TP_VLOG(5) << "Process isn't dumpable";
return nullptr;
}
// Next the Linux Security Modules (LSMs) kick in. Since users could register
// third-party LSMs we'll need to draw a line in what we support. We have two
// options with unsupported LSMs: play it safe and assume the LSM will reject
// the check, or "trust" the user and make them responsible to deal with the
// LSMs they added. We're leaning for the latter, as often some LSMs like
// AppArmor or SELinux are enabled without actually restricting anything. For
// now we'll support the LSMs that are found by default on common distros,
// but we can include support for more of them if that becomes necessary.
optional<std::vector<std::string>> lsms = getLinuxSecurityModules();
bool yamaOptional = false;
if (!lsms.has_value()) {
// This could happen if /sys/kernel/security/lsm cannot be opened. Although
// that file looks like it resides on sysfs, it's actually on the securityfs
// VFS, which is sometimes not bind-mounted inside containers. In such cases
// rather than failing hard we'll check a couple of reasonable LSMs.
TP_VLOG(5) << "Couldn't detect the active Linux Security Modules";
lsms.emplace();
*lsms = {"capability", "yama"};
// We don't know whether YAMA is really there, hence we'll remember to
// tolerate any failures later on.
yamaOptional = true;
} else {
TP_VLOG(5) << "Detected these Linux Security Modules: " << joinStrs(*lsms);
}
// FIXME Can we assume that the two endpoints will see the same list of LSMs,
// or should we incorporate that into the domain descriptor?
for (const std::string& lsm : lsms.value()) {
if (lsm == "capability") {
// We already checked that the endpoints are in the same user namespace.
// We must check they have the same permitted capabilities in it.
optional<std::string> caps = getPermittedCapabilitiesID();
TP_THROW_ASSERT_IF(!caps.has_value())
<< "Unable to obtain permitted capabilities";
oss << '_' << caps.value();
} else if (lsm == "yama") {
optional<YamaPtraceScope> yamaScope = getYamaPtraceScope();
if (!yamaScope.has_value()) {
TP_THROW_ASSERT_IF(!yamaOptional)
<< "Unable to retrieve YAMA ptrace scope";
continue;
}
switch (yamaScope.value()) {
case YamaPtraceScope::kClassicPtracePermissions:
TP_VLOG(5) << "YAMA ptrace scope set to classic ptrace permissions";
break;
case YamaPtraceScope::kRestrictedPtrace:
TP_VLOG(5) << "YAMA ptrace scope set to restricted ptrace";
// FIXME It's not really great to change a global property of the
// process, especially a security-related one. An "excuse" for doing
// so is that UCT does the same:
// https://github.com/openucx/ucx/blob/4d9976b6b8f8faae609c078c72aad8e5b842c43f/src/uct/sm/scopy/cma/cma_md.c#L61
#ifndef PR_SET_PTRACER
// https://github.com/torvalds/linux/blob/master/include/uapi/linux/prctl.h
#define PR_SET_PTRACER 0x59616d61
#endif
#ifndef PR_SET_PTRACER_ANY
// https://github.com/torvalds/linux/blob/master/include/uapi/linux/prctl.h
#define PR_SET_PTRACER_ANY ((unsigned long)-1)
#endif
rv = ::prctl(PR_SET_PTRACER, PR_SET_PTRACER_ANY, 0, 0, 0);
TP_THROW_SYSTEM_IF(rv < 0, errno);
break;
case YamaPtraceScope::kAdminOnlyAttach:
TP_VLOG(5) << "YAMA ptrace scope set to admin-only attach";
return nullptr;
case YamaPtraceScope::kNoAttach:
TP_VLOG(5) << "YAMA ptrace scope set to no attach";
return nullptr;
default:
TP_THROW_ASSERT() << "Unknown YAMA ptrace scope";
}
}
}
// In addition to the ptrace check, in some cases (I'm looking at you Docker)
// the process_vm_readv syscall is outright blocked by seccomp-bpf. Or just
// unsupported by the kernel.
Error error = attemptProcessVmReadvSyscallOnSelf();
if (error) {
TP_VLOG(5)
<< "The process_vm_readv syscall appears to be unavailable or blocked: "
<< error.what();
return nullptr;
}
std::string domainDescriptor = oss.str();
TP_VLOG(5) << "The domain descriptor for CMA is " << domainDescriptor;
std::unordered_map<Device, std::string> deviceDescriptors = {
{Device{kCpuDeviceType, 0}, std::move(domainDescriptor)}};
return std::make_shared<ContextImpl>(std::move(deviceDescriptors));
}