std::shared_ptr<ContextImpl> ContextImpl::create()

in tensorpipe/channel/cma/context_impl.cc [131:302]
104 lines of code
18 McCabe index (conditional complexity)

std::shared_ptr<ContextImpl> ContextImpl::create() {
  int rv;
  std::ostringstream oss;
  oss << kDomainDescriptorPrefix;

  // This transport only works across processes on the same machine, and we
  // detect that by computing the boot ID.
  optional<std::string> bootID = getBootID();
  TP_THROW_ASSERT_IF(!bootID.has_value()) << "Unable to read boot_id";
  oss << bootID.value();

  // An endpoint can see the other through its PID if the latter is in a child
  // PID namespace of the former. Since the channel is bidirectional this must
  // be symmetric and thus the PID namespaces must be the same.
  optional<std::string> pidNsID = getLinuxNamespaceId(LinuxNamespace::kPid);
  if (!pidNsID.has_value()) {
    TP_VLOG(5) << "Unable to read pid namespace ID";
    return nullptr;
  }
  oss << '_' << pidNsID.value();

  // The ability to call process_vm_readv on a target is controlled by the
  // PTRACE_MODE_ATTACH_REALCREDS check (see process_vm_readv(2)). We'll go
  // through its checklist, step by step (which is found in ptrace(2)). We will
  // ignore the CAP_SYS_PTRACE conditions (i.e., we'll assume we don't have that
  // capability) because they are hard to check, and typically not needed.

  // We'll skip the check on whether the endpoints are two threads of the same
  // process (in which case ptrace is always allowed) because it's hard to fit
  // it in the descriptor and because we have some other more specialized
  // channels for that case.

  // The next step involves comparing user and group IDs. If the processes are
  // in user namespaces the kernel first maps these IDs back to the top-level
  // ("initial") ones and compares those. We can't do such mapping, thus we
  // compare the IDs as integers as we see them and thus for this to work
  // properly we require that the two endpoints are in the same user namespace.
  // This does not in fact constitute an extra restriction since the later
  // commoncap/capability LSM check will need to enforce this too.
  optional<std::string> userNsID = getLinuxNamespaceId(LinuxNamespace::kUser);
  if (!userNsID.has_value()) {
    TP_VLOG(5) << "Unable to read user namespace ID";
    return nullptr;
  }
  oss << '_' << userNsID.value();

  // It is required that our *real* user ID matches the real, effective and
  // saved-set user IDs of the target. And the same must hold for group IDs.
  // As the channel is bidirectional, the reverse must also hold, which means
  // our real, effective and saved-set IDs must all be equal and must match the
  // other endpoint's ones.
  uid_t realUserId, effectiveUserId, savedSetUserId;
  gid_t realGroupId, effectiveGroupId, savedSetGroupId;
  rv = ::getresuid(&realUserId, &effectiveUserId, &savedSetUserId);
  TP_THROW_SYSTEM_IF(rv < 0, errno);
  rv = ::getresgid(&realGroupId, &effectiveGroupId, &savedSetGroupId);
  TP_THROW_SYSTEM_IF(rv < 0, errno);
  if (realUserId != effectiveUserId || realUserId != savedSetUserId ||
      realGroupId != effectiveGroupId || realGroupId != savedSetGroupId) {
    TP_VLOG(5) << "User IDs or group IDs aren't all equal. User IDs are "
               << realUserId << " (real), " << effectiveUserId
               << " (effective) and " << savedSetUserId
               << " (saved-set). Group IDs are " << realGroupId << " (real), "
               << effectiveGroupId << " (effective) and " << savedSetGroupId
               << " (saved-set).";
    return nullptr;
  }
  oss << '_' << realUserId << '_' << realGroupId;

  // The target must be dumpable. Which, due to symmetry, means we must be
  // dumpable too.
  rv = ::prctl(PR_GET_DUMPABLE, 0, 0, 0, 0);
  TP_THROW_SYSTEM_IF(rv < 0, errno);
  // SUID_DUMP_USER has a value of 1.
  if (rv != 1) {
    TP_VLOG(5) << "Process isn't dumpable";
    return nullptr;
  }

  // Next the Linux Security Modules (LSMs) kick in. Since users could register
  // third-party LSMs we'll need to draw a line in what we support. We have two
  // options with unsupported LSMs: play it safe and assume the LSM will reject
  // the check, or "trust" the user and make them responsible to deal with the
  // LSMs they added. We're leaning for the latter, as often some LSMs like
  // AppArmor or SELinux are enabled without actually restricting anything. For
  // now we'll support the LSMs that are found by default on common distros,
  // but we can include support for more of them if that becomes necessary.
  optional<std::vector<std::string>> lsms = getLinuxSecurityModules();
  bool yamaOptional = false;
  if (!lsms.has_value()) {
    // This could happen if /sys/kernel/security/lsm cannot be opened. Although
    // that file looks like it resides on sysfs, it's actually on the securityfs
    // VFS, which is sometimes not bind-mounted inside containers. In such cases
    // rather than failing hard we'll check a couple of reasonable LSMs.
    TP_VLOG(5) << "Couldn't detect the active Linux Security Modules";
    lsms.emplace();
    *lsms = {"capability", "yama"};
    // We don't know whether YAMA is really there, hence we'll remember to
    // tolerate any failures later on.
    yamaOptional = true;
  } else {
    TP_VLOG(5) << "Detected these Linux Security Modules: " << joinStrs(*lsms);
  }
  // FIXME Can we assume that the two endpoints will see the same list of LSMs,
  // or should we incorporate that into the domain descriptor?
  for (const std::string& lsm : lsms.value()) {
    if (lsm == "capability") {
      // We already checked that the endpoints are in the same user namespace.
      // We must check they have the same permitted capabilities in it.
      optional<std::string> caps = getPermittedCapabilitiesID();
      TP_THROW_ASSERT_IF(!caps.has_value())
          << "Unable to obtain permitted capabilities";
      oss << '_' << caps.value();
    } else if (lsm == "yama") {
      optional<YamaPtraceScope> yamaScope = getYamaPtraceScope();
      if (!yamaScope.has_value()) {
        TP_THROW_ASSERT_IF(!yamaOptional)
            << "Unable to retrieve YAMA ptrace scope";
        continue;
      }
      switch (yamaScope.value()) {
        case YamaPtraceScope::kClassicPtracePermissions:
          TP_VLOG(5) << "YAMA ptrace scope set to classic ptrace permissions";
          break;
        case YamaPtraceScope::kRestrictedPtrace:
          TP_VLOG(5) << "YAMA ptrace scope set to restricted ptrace";
          // FIXME It's not really great to change a global property of the
          // process, especially a security-related one. An "excuse" for doing
          // so is that UCT does the same:
          // https://github.com/openucx/ucx/blob/4d9976b6b8f8faae609c078c72aad8e5b842c43f/src/uct/sm/scopy/cma/cma_md.c#L61
#ifndef PR_SET_PTRACER
// https://github.com/torvalds/linux/blob/master/include/uapi/linux/prctl.h
#define PR_SET_PTRACER 0x59616d61
#endif
#ifndef PR_SET_PTRACER_ANY
// https://github.com/torvalds/linux/blob/master/include/uapi/linux/prctl.h
#define PR_SET_PTRACER_ANY ((unsigned long)-1)
#endif
          rv = ::prctl(PR_SET_PTRACER, PR_SET_PTRACER_ANY, 0, 0, 0);
          TP_THROW_SYSTEM_IF(rv < 0, errno);
          break;
        case YamaPtraceScope::kAdminOnlyAttach:
          TP_VLOG(5) << "YAMA ptrace scope set to admin-only attach";
          return nullptr;
        case YamaPtraceScope::kNoAttach:
          TP_VLOG(5) << "YAMA ptrace scope set to no attach";
          return nullptr;
        default:
          TP_THROW_ASSERT() << "Unknown YAMA ptrace scope";
      }
    }
  }

  // In addition to the ptrace check, in some cases (I'm looking at you Docker)
  // the process_vm_readv syscall is outright blocked by seccomp-bpf. Or just
  // unsupported by the kernel.
  Error error = attemptProcessVmReadvSyscallOnSelf();
  if (error) {
    TP_VLOG(5)
        << "The process_vm_readv syscall appears to be unavailable or blocked: "
        << error.what();
    return nullptr;
  }

  std::string domainDescriptor = oss.str();
  TP_VLOG(5) << "The domain descriptor for CMA is " << domainDescriptor;

  std::unordered_map<Device, std::string> deviceDescriptors = {
      {Device{kCpuDeviceType, 0}, std::move(domainDescriptor)}};

  return std::make_shared<ContextImpl>(std::move(deviceDescriptors));
}