Bug 1401062 - Create Linux child processes with clone() for namespace/chroot sandboxing. r=gcp draft
author Jed Davis <jld@mozilla.com>
Fri, 06 Oct 2017 17:16:41 -0600
changeset 721337 06108bcdc900367848361694050635fc881da2d8
parent 721336 e0f34fcf3d14feeb2454077dc85e825117f87883
child 721338 f63271a8a9c52701bfe938be6ecdd4d5a9d0d0c3
push id 95806
push user bmo:jld@mozilla.com
push date Wed, 17 Jan 2018 04:30:48 +0000
reviewers gcp
bugs 1401062
milestone 59.0a1
Bug 1401062 - Create Linux child processes with clone() for namespace/chroot sandboxing. r=gcp Namespace isolation is now handled by using clone() at process creation time, rather than calling unshare. pthread_atfork will no longer apply to sandboxed child processes. The two significant uses of it in Firefox currently are to (1) make malloc work post-fork, which we already avoid depending on in IPC and sandboxing, and (2) block SIGPROF while forking, which is taken care of; see SandboxFork::Fork for details. Note that if we need pthread_atfork in the future it could be emulated by symbol interposition. clone() is called via glibc's wrapper, for increased compatibility vs. invoking the syscall directly, using longjmp to recover the syscall's fork-like semantics the same way Chromium does; see comments for details. The chroot helper is reimplemented; the general approach is similar, but instead of a thread it's a process cloned with CLONE_FS (so the filesystem root is shared) from the child process before it calls exec, so that it still holds CAP_SYS_CHROOT in the newly created user namespace. This does mean that it will retain a CoW copy of the parent's address space until the child starts sandboxing, but that is a relatively short period of time, so the memory overhead should be small and short-lived. The chrooting now happens *after* the seccomp-bpf policy is applied; previously this wasn't possible because the chroot thread would have become seccomp-restricted and unable to chroot. This fixes a potential race condition where a thread could try to access the filesystem after chrooting but before having its syscalls intercepted for brokering, causing spurious failure. (This failure mode hasn't been observed in practice, but we may not be looking for it.) This adds a hidden bool pref, security.sandbox.content.force-namespace, which unshares the user namespace (if possible) even if no sandboxing requires it. 
It defaults to true on Nightly and false otherwise, to get test coverage; the default will change to false once we're using namespaces by default with content. MozReview-Commit-ID: JhCXF9EgOt6
ipc/chromium/src/base/process_util.h
ipc/chromium/src/base/process_util_linux.cc
security/sandbox/linux/LinuxCapabilities.cpp
security/sandbox/linux/LinuxCapabilities.h
security/sandbox/linux/Sandbox.cpp
security/sandbox/linux/SandboxChrootProto.h
security/sandbox/linux/launch/LinuxCapabilities.cpp
security/sandbox/linux/launch/LinuxCapabilities.h
security/sandbox/linux/launch/SandboxLaunch.cpp
security/sandbox/linux/launch/moz.build
security/sandbox/linux/moz.build
--- a/ipc/chromium/src/base/process_util.h
+++ b/ipc/chromium/src/base/process_util.h
@@ -115,16 +115,26 @@ struct LaunchOptions {
 #endif
 #if defined(OS_POSIX)
   environment_map env_map;
 
   // A mapping of (src fd -> dest fd) to propagate into the child
   // process.  All other fds will be closed, except std{in,out,err}.
   file_handle_mapping_vector fds_to_remap;
 #endif
+
+#if defined(OS_LINUX)
+  struct ForkDelegate {
+    virtual ~ForkDelegate() { }
+    virtual pid_t Fork() = 0;
+  };
+
+  // If non-null, the fork delegate will be called instead of fork().
+  mozilla::UniquePtr<ForkDelegate> fork_delegate = nullptr;
+#endif
 };
 
 #if defined(OS_WIN)
 // Runs the given application name with the given command line. Normally, the
 // first command line argument should be the path to the process, and don't
 // forget to quote it.
 //
 // Example (including literal quotes)
--- a/ipc/chromium/src/base/process_util_linux.cc
+++ b/ipc/chromium/src/base/process_util_linux.cc
@@ -31,17 +31,17 @@ bool LaunchApp(const std::vector<std::st
   mozilla::UniquePtr<char*[]> argv_cstr(new char*[argv.size() + 1]);
   // Illegal to allocate memory after fork and before execvp
   InjectiveMultimap fd_shuffle1, fd_shuffle2;
   fd_shuffle1.reserve(options.fds_to_remap.size());
   fd_shuffle2.reserve(options.fds_to_remap.size());
 
   EnvironmentArray envp = BuildEnvironmentArray(options.env_map);
 
-  pid_t pid = fork();
+  pid_t pid = options.fork_delegate ? options.fork_delegate->Fork() : fork();
   if (pid < 0)
     return false;
 
   if (pid == 0) {
     // In the child:
     for (const auto& fd_map : options.fds_to_remap) {
       fd_shuffle1.push_back(InjectionArc(fd_map.first, fd_map.second, false));
       fd_shuffle2.push_back(InjectionArc(fd_map.first, fd_map.second, false));
--- a/security/sandbox/linux/Sandbox.cpp
+++ b/security/sandbox/linux/Sandbox.cpp
@@ -3,16 +3,17 @@
 /* This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this file,
  * You can obtain one at http://mozilla.org/MPL/2.0/. */
 
 #include "Sandbox.h"
 
 #include "LinuxSched.h"
 #include "SandboxBrokerClient.h"
+#include "SandboxChrootProto.h"
 #include "SandboxFilter.h"
 #include "SandboxInternal.h"
 #include "SandboxLogging.h"
 #ifdef MOZ_GMP_SANDBOX
 #include "SandboxOpenedFiles.h"
 #endif
 #include "SandboxReporterClient.h"
 
@@ -37,16 +38,17 @@
 #include "mozilla/Array.h"
 #include "mozilla/Atomics.h"
 #include "mozilla/Range.h"
 #include "mozilla/SandboxInfo.h"
 #include "mozilla/Span.h"
 #include "mozilla/UniquePtr.h"
 #include "mozilla/Unused.h"
 #include "prenv.h"
+#include "base/posix/eintr_wrapper.h"
 #include "sandbox/linux/bpf_dsl/codegen.h"
 #include "sandbox/linux/bpf_dsl/dump_bpf.h"
 #include "sandbox/linux/bpf_dsl/policy.h"
 #include "sandbox/linux/bpf_dsl/policy_compiler.h"
 #include "sandbox/linux/bpf_dsl/seccomp_macros.h"
 #include "sandbox/linux/seccomp-bpf/trap.h"
 #include "sandbox/linux/system_headers/linux_filter.h"
 #include "sandbox/linux/system_headers/linux_seccomp.h"
@@ -295,16 +297,26 @@ SetThreadSandboxHandler(int signum)
   // explanation.
   syscall(__NR_futex, reinterpret_cast<int*>(&gSetSandboxDone),
           FUTEX_WAKE, 1);
 }
 
 static void
 EnterChroot()
 {
+  if (!PR_GetEnv(kSandboxChrootEnvFlag)) {
+    return;
+  }
+  char msg = kSandboxChrootRequest;
+  ssize_t msg_len = HANDLE_EINTR(write(kSandboxChrootClientFd, &msg, 1));
+  MOZ_RELEASE_ASSERT(msg_len == 1);
+  msg_len = HANDLE_EINTR(read(kSandboxChrootClientFd, &msg, 1));
+  MOZ_RELEASE_ASSERT(msg_len == 1);
+  MOZ_RELEASE_ASSERT(msg == kSandboxChrootResponse);
+  close(kSandboxChrootClientFd);
 }
 
 static void
 BroadcastSetThreadSandbox(const sock_fprog* aFilter)
 {
   pid_t pid, tid, myTid;
   DIR *taskdp;
   struct dirent *de;
@@ -318,18 +330,16 @@ BroadcastSetThreadSandbox(const sock_fpr
   pid = getpid();
   myTid = syscall(__NR_gettid);
   taskdp = opendir("/proc/self/task");
   if (taskdp == nullptr) {
     SANDBOX_LOG_ERROR("opendir /proc/self/task: %s\n", strerror(errno));
     MOZ_CRASH();
   }
 
-  EnterChroot();
-
   // In case this races with a not-yet-deprivileged thread cloning
   // itself, repeat iterating over all threads until we find none
   // that are still privileged.
   bool sandboxProgress;
   const int tsyncSignum = gSeccompTsyncBroadcastSignum;
   do {
     sandboxProgress = false;
     // For each thread...
@@ -434,21 +444,20 @@ BroadcastSetThreadSandbox(const sock_fpr
   // And now, deprivilege the main thread:
   SetThreadSandbox();
   gSetSandboxFilter = nullptr;
 }
 
 static void
 ApplySandboxWithTSync(sock_fprog* aFilter)
 {
-  EnterChroot();
-  // At this point we're committed to using tsync, because the signal
-  // broadcast workaround needs to access procfs.  (Unless chroot
-  // isn't used... but this failure shouldn't happen in the first
-  // place, so let's not make extra special cases for it.)
+  // At this point we're committed to using tsync, because we'd have
+  // needed to allocate a signal and prevent it from being blocked on
+  // other threads (see SandboxHooks.cpp), so there's no attempt to
+  // fall back to the non-tsync path.
   if (!InstallSyscallFilter(aFilter, true)) {
     MOZ_CRASH();
   }
 }
 
 #ifdef NIGHTLY_BUILD
 static bool
 IsLibPresent(const char* aName)
@@ -516,16 +525,22 @@ SandboxLateInit() {
 // Common code for sandbox startup.
 static void
 SetCurrentProcessSandbox(UniquePtr<sandbox::bpf_dsl::Policy> aPolicy)
 {
   MOZ_ASSERT(gSandboxCrashFunc);
   MOZ_RELEASE_ASSERT(gSandboxReporterClient != nullptr);
   SandboxLateInit();
 
+  // Auto-collect child processes -- mainly the chroot helper if
+  // present, but also anything setns()ed into the pid namespace (not
+  // yet implemented).  This process won't be able to waitpid them
+  // after the seccomp-bpf policy is applied.
+  signal(SIGCHLD, SIG_IGN);
+
   // Note: PolicyCompiler borrows the policy and registry for its
   // lifetime, but does not take ownership of them.
   sandbox::bpf_dsl::PolicyCompiler compiler(aPolicy.get(),
                                             sandbox::Trap::Registry());
   sandbox::CodeGen::Program program = compiler.Compile();
   if (SandboxInfo::Get().Test(SandboxInfo::kVerbose)) {
     sandbox::bpf_dsl::DumpBPF::PrintProgram(program);
   }
@@ -559,16 +574,20 @@ SetCurrentProcessSandbox(UniquePtr<sandb
     }
     ApplySandboxWithTSync(&fprog);
   } else {
     if (info.Test(SandboxInfo::kVerbose)) {
       SANDBOX_LOG_ERROR("no tsync support; using signal broadcast");
     }
     BroadcastSetThreadSandbox(&fprog);
   }
+
+  // Now that all threads' filesystem accesses are being intercepted
+  // (if a broker is used) it's safe to chroot the process:
+  EnterChroot();
 }
 
 #ifdef MOZ_CONTENT_SANDBOX
 /**
  * Starts the seccomp sandbox for a content process.  Should be called
  * only once, and before any potentially harmful content is loaded.
  *
  * Will normally make the process exit on failure.
new file mode 100644
--- /dev/null
+++ b/security/sandbox/linux/SandboxChrootProto.h
@@ -0,0 +1,21 @@
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim: set ts=8 sts=2 et sw=2 tw=80: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this file,
+ * You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef mozilla_SandboxChrootProto_h
+#define mozilla_SandboxChrootProto_h
+
+#include "mozilla/Types.h"
+
+namespace mozilla {
+
+static const int kSandboxChrootClientFd = 6;
+static const char kSandboxChrootRequest = 'C';
+static const char kSandboxChrootResponse = 'O';
+static const char kSandboxChrootEnvFlag[] = "MOZ_SANDBOX_USE_CHROOT";
+
+} // namespace mozilla
+
+#endif // mozilla_SandboxChrootProto_h
rename from security/sandbox/linux/LinuxCapabilities.cpp
rename to security/sandbox/linux/launch/LinuxCapabilities.cpp
rename from security/sandbox/linux/LinuxCapabilities.h
rename to security/sandbox/linux/launch/LinuxCapabilities.h
--- a/security/sandbox/linux/launch/SandboxLaunch.cpp
+++ b/security/sandbox/linux/launch/SandboxLaunch.cpp
@@ -1,20 +1,43 @@
 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
 /* This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this file,
  * You can obtain one at http://mozilla.org/MPL/2.0/. */
 
 #include "SandboxLaunch.h"
 
+#include <fcntl.h>
+#include <sched.h>
+#include <setjmp.h>
+#include <signal.h>
+#include <sys/socket.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+#include "LinuxCapabilities.h"
+#include "LinuxSched.h"
+#include "SandboxChrootProto.h"
+#include "SandboxInfo.h"
+#include "SandboxLogging.h"
+#include "base/eintr_wrapper.h"
+#include "base/strings/safe_sprintf.h"
+#include "mozilla/ArrayUtils.h"
 #include "mozilla/Assertions.h"
+#include "mozilla/Attributes.h"
+#include "mozilla/Move.h"
+#include "mozilla/Preferences.h"
 #include "mozilla/SandboxReporter.h"
+#include "mozilla/SandboxSettings.h"
+#include "mozilla/Unused.h"
 #include "nsString.h"
+#include "nsThreadUtils.h"
 #include "prenv.h"
+#include "sandbox/linux/system_headers/linux_syscalls.h"
 
 namespace mozilla {
 
 static void
 PreloadSandboxLib(base::environment_map* aEnv)
 {
   // Preload libmozsandbox.so so that sandbox-related interpositions
   // can be defined there instead of in the executable.
@@ -37,21 +60,423 @@ static void
 AttachSandboxReporter(base::file_handle_mapping_vector* aFdMap)
 {
   int srcFd, dstFd;
   SandboxReporter::Singleton()
     ->GetClientFileDescriptorMapping(&srcFd, &dstFd);
   aFdMap->push_back({srcFd, dstFd});
 }
 
+class SandboxFork : public base::LaunchOptions::ForkDelegate {
+public:
+  explicit SandboxFork(int aFlags, bool aChroot);
+  virtual ~SandboxFork();
+
+  void PrepareMapping(base::file_handle_mapping_vector* aMap);
+  pid_t Fork() override;
+
+private:
+  int mFlags;
+  int mChrootServer;
+  int mChrootClient;
+  // For CloseSuperfluousFds in the chroot helper process:
+  base::InjectiveMultimap mChrootMap;
+
+  void StartChrootServer();
+  SandboxFork(const SandboxFork&) = delete;
+  SandboxFork& operator=(const SandboxFork&) = delete;
+};
+
+static int
+GetEffectiveSandboxLevel(GeckoProcessType aType)
+{
+  auto info = SandboxInfo::Get();
+  switch (aType) {
+#ifdef MOZ_GMP_SANDBOX
+  case GeckoProcessType_GMPlugin:
+    if (info.Test(SandboxInfo::kEnabledForMedia)) {
+      return 1;
+    }
+    return 0;
+#endif
+#ifdef MOZ_CONTENT_SANDBOX
+  case GeckoProcessType_Content:
+    // GetEffectiveContentSandboxLevel is main-thread-only due to prefs.
+    MOZ_ASSERT(NS_IsMainThread());
+    if (info.Test(SandboxInfo::kEnabledForContent)) {
+      return GetEffectiveContentSandboxLevel();
+    }
+    return 0;
+#endif
+  default:
+    return 0;
+  }
+}
+
 void
 SandboxLaunchPrepare(GeckoProcessType aType,
 		     base::LaunchOptions* aOptions)
 {
   PreloadSandboxLib(&aOptions->env_map);
   AttachSandboxReporter(&aOptions->fds_to_remap);
 
-  // aType will be used in bug 1401062 to take over the functionality
-  // of SandboxEarlyInit
+  auto info = SandboxInfo::Get();
+
+  // We won't try any kind of sandboxing without seccomp-bpf.
+  if (!info.Test(SandboxInfo::kHasSeccompBPF)) {
+    return;
+  }
+
+  // Check prefs (and env vars) controlling sandbox use.
+  int level = GetEffectiveSandboxLevel(aType);
+  if (level == 0) {
+    return;
+  }
+
+  // Anything below this requires unprivileged user namespaces.
+  if (!info.Test(SandboxInfo::kHasUserNamespaces)) {
+    return;
+  }
+
+  bool canChroot = false;
+  int flags = 0;
+
+  switch (aType) {
+#ifdef MOZ_GMP_SANDBOX
+  case GeckoProcessType_GMPlugin:
+    if (level >= 1) {
+      canChroot = true;
+      flags |= CLONE_NEWNET | CLONE_NEWIPC;
+    }
+    break;
+#endif
+#ifdef MOZ_CONTENT_SANDBOX
+  case GeckoProcessType_Content:
+    // TODO: CLONE_NEWIPC (bug 1376910) if not fglrx and level >= 1,
+    // once the XShm detection shim is fixed.
+
+    // Hidden pref to allow testing user namespaces separately, even
+    // if there's nothing that would require them.
+    if (Preferences::GetBool("security.sandbox.content.force-namespace",
+#ifdef NIGHTLY_BUILD
+                             true
+#else
+                             false
+#endif
+          )) {
+      flags |= CLONE_NEWUSER;
+    }
+    break;
+#endif
+  default:
+    // Nothing yet.
+    break;
+  }
+
+  if (canChroot || flags != 0) {
+    auto forker = MakeUnique<SandboxFork>(flags | CLONE_NEWUSER, canChroot);
+    forker->PrepareMapping(&aOptions->fds_to_remap);
+    aOptions->fork_delegate = Move(forker);
+    if (canChroot) {
+      aOptions->env_map[kSandboxChrootEnvFlag] = "1";
+    }
+  }
+}
+
+SandboxFork::SandboxFork(int aFlags, bool aChroot)
+: mFlags(aFlags)
+, mChrootServer(-1)
+, mChrootClient(-1)
+{
+  if (aChroot) {
+    int fds[2];
+    int rv = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, fds);
+    if (rv != 0) {
+      SANDBOX_LOG_ERROR("socketpair: %s", strerror(errno));
+      MOZ_CRASH("socketpair failed");
+    }
+    mChrootClient = fds[0];
+    mChrootServer = fds[1];
+    // Do this here because the child process won't be able to malloc.
+    mChrootMap.push_back(base::InjectionArc(mChrootServer,
+                                            mChrootServer,
+                                            false));
+  }
+}
+
+void
+SandboxFork::PrepareMapping(base::file_handle_mapping_vector* aMap)
+{
+  if (mChrootClient >= 0) {
+    aMap->push_back({ mChrootClient, kSandboxChrootClientFd });
+  }
+}
+
+SandboxFork::~SandboxFork()
+{
+  if (mChrootClient >= 0) {
+    close(mChrootClient);
+  }
+  if (mChrootServer >= 0) {
+    close(mChrootServer);
+  }
+}
+
+static void
+BlockAllSignals(sigset_t* aOldSigs)
+{
+  sigset_t allSigs;
+  int rv = sigfillset(&allSigs);
+  MOZ_RELEASE_ASSERT(rv == 0);
+  rv = pthread_sigmask(SIG_BLOCK, &allSigs, aOldSigs);
+  if (rv != 0) {
+    SANDBOX_LOG_ERROR("pthread_sigmask (block all): %s", strerror(rv));
+    MOZ_CRASH("pthread_sigmask");
+  }
+}
+
+static void
+RestoreSignals(const sigset_t* aOldSigs)
+{
+  // Restores the mask saved by BlockAllSignals.  Assumes pthread_sigmask
+  // is a thin layer over rt_sigprocmask that doesn't touch TLS (which may
+  // be in an "interesting" state now); it returns a positive errno value.
+  int rv = pthread_sigmask(SIG_SETMASK, aOldSigs, nullptr);
+  if (rv != 0) {
+    SANDBOX_LOG_ERROR("pthread_sigmask (restore): %s", strerror(rv));
+    MOZ_CRASH("pthread_sigmask");
+  }
+}
+
+static void
+ResetSignalHandlers()
+{
+  for (int signum = 1; signum <= SIGRTMAX; ++signum) {
+    if (signal(signum, SIG_DFL) == SIG_ERR) {
+      MOZ_DIAGNOSTIC_ASSERT(errno == EINVAL);
+    }
+  }
+}
+
+namespace {
+
+// The libc clone() routine insists on calling a provided function on
+// a new stack, even if the address space isn't shared and it would be
+// safe to expose the underlying system call's fork()-like behavior.
+// So, we work around this by longjmp()ing back onto the original stack;
+// this technique is also used by Chromium.
+//
+// In theory, the clone syscall could be used directly if we ensure
+// that functions like raise() are never used in the child, including
+// by inherited signal handlers, but the longjmp approach isn't much
+// extra code and avoids a class of potential bugs.
+static int
+CloneCallee(void* aPtr)
+{
+  auto ctxPtr = reinterpret_cast<jmp_buf*>(aPtr);
+  longjmp(*ctxPtr, 1);
+  MOZ_CRASH("unreachable");
+  return 1;
 }
 
+// According to the Chromium developers, builds with FORTIFY_SOURCE
+// require that longjmp move the stack pointer towards the root
+// function of the call stack.  Therefore, we must ensure that the
+// clone callee stack is leafward of the stack pointer captured in
+// setjmp() below by using this no-inline helper function.
+//
+// ASan apparently also causes problems, by the combination of
+// allocating the large stack-allocated buffer outside of the actual
+// stack and then assuming that longjmp is used only to unwind a
+// stack, not switch stacks.
+//
+// Valgrind would disapprove of using clone() without CLONE_VM;
+// Chromium uses the raw syscall as a workaround in that case, but
+// we don't currently support sandboxing under valgrind.
+MOZ_NEVER_INLINE MOZ_ASAN_BLACKLIST
+static pid_t
+DoClone(int aFlags, jmp_buf* aCtx)
+{
+  uint8_t miniStack[PTHREAD_STACK_MIN];
+#ifdef __hppa__
+  void* stackPtr = miniStack;
+#else
+  void* stackPtr = ArrayEnd(miniStack);
+#endif
+  return clone(CloneCallee, stackPtr, aFlags, aCtx);
+}
+
+} // namespace
+
+// Similar to fork(), but allows passing flags to clone() and does not
+// run pthread_atfork hooks.
+static pid_t
+ForkWithFlags(int aFlags)
+{
+  // Don't allow flags that would share the address space, or
+  // require clone() arguments we're not passing:
+  static const int kBadFlags = CLONE_VM | CLONE_VFORK | CLONE_SETTLS
+    | CLONE_PARENT_SETTID | CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID;
+  MOZ_RELEASE_ASSERT((aFlags & kBadFlags) == 0);
+
+  jmp_buf ctx;
+  if (setjmp(ctx) == 0) {
+    // In the parent and just called setjmp:
+    return DoClone(aFlags, &ctx);
+  }
+  // In the child and have longjmp'ed:
+  return 0;
+}
+
+static bool
+WriteStringToFile(const char* aPath, const char* aStr, const size_t aLen)
+{
+  int fd = open(aPath, O_WRONLY);
+  if (fd < 0) {
+    return false;
+  }
+  ssize_t written = write(fd, aStr, aLen);
+  if (close(fd) != 0 || written != ssize_t(aLen)) {
+    return false;
+  }
+  return true;
+}
+
+// This function sets up uid/gid mappings that preserve the
+// process's previous ids.  Mapping the uid/gid to something is
+// necessary in order to nest user namespaces (not currently being
+// used, but could be useful), and leaving the ids unchanged is
+// likely to minimize unexpected side-effects.
+static void
+ConfigureUserNamespace(uid_t uid, gid_t gid)
+{
+  using base::strings::SafeSPrintf;
+  char buf[sizeof("18446744073709551615 18446744073709551615 1")];
+  size_t len;
+
+  len = static_cast<size_t>(SafeSPrintf(buf, "%d %d 1", uid, uid));
+  MOZ_RELEASE_ASSERT(len < sizeof(buf));
+  if (!WriteStringToFile("/proc/self/uid_map", buf, len)) {
+    MOZ_CRASH("Failed to write /proc/self/uid_map");
+  }
+
+  // In recent kernels (3.19, 3.18.2, 3.17.8), for security reasons,
+  // establishing gid mappings will fail unless the process first
+  // revokes its ability to call setgroups() by using a /proc node
+  // added in the same set of patches.
+  Unused << WriteStringToFile("/proc/self/setgroups", "deny", 4);
+
+  len = static_cast<size_t>(SafeSPrintf(buf, "%d %d 1", gid, gid));
+  MOZ_RELEASE_ASSERT(len < sizeof(buf));
+  if (!WriteStringToFile("/proc/self/gid_map", buf, len)) {
+    MOZ_CRASH("Failed to write /proc/self/gid_map");
+  }
+}
+
+static void
+DropAllCaps()
+{
+  if (!LinuxCapabilities().SetCurrent()) {
+    SANDBOX_LOG_ERROR("capset (drop all): %s", strerror(errno));
+  }
+}
+
+pid_t
+SandboxFork::Fork() {
+  if (mFlags == 0) {
+    MOZ_ASSERT(mChrootServer < 0);
+    return fork();
+  }
+
+  uid_t uid = getuid();
+  gid_t gid = getgid();
+
+  // Block signals so that the handlers can be safely reset in the
+  // child process without races, and so that repeated SIGPROF from
+  // the profiler won't prevent clone() from making progress.  (The
+  // profiler uses pthread_atfork to do that, but ForkWithFlags
+  // can't run atfork hooks.)
+  sigset_t oldSigs;
+  BlockAllSignals(&oldSigs);
+  pid_t pid = ForkWithFlags(mFlags);
+  if (pid != 0) {
+    RestoreSignals(&oldSigs);
+    return pid;
+  }
+
+  // WARNING: all code from this point on (and in StartChrootServer)
+  // must be async signal safe.  In particular, it cannot do anything
+  // that could allocate heap memory or use mutexes.
+
+  // Clear signal handlers in the child, under the assumption that any
+  // actions they would take (running the crash reporter, manipulating
+  // the Gecko profile, etc.) wouldn't work correctly in the child.
+  ResetSignalHandlers();
+  RestoreSignals(&oldSigs);
+  ConfigureUserNamespace(uid, gid);
+
+  if (mChrootServer >= 0) {
+    StartChrootServer();
+  }
+
+  // execve() will drop capabilities, but it seems best to also drop
+  // them here in case they'd do something unexpected in the generic
+  // post-fork code.
+  DropAllCaps();
+  return 0;
+}
+
+void
+SandboxFork::StartChrootServer()
+{
+  // Run the rest of this function in a separate process that can
+  // chroot() on behalf of this process after it's sandboxed.
+  pid_t pid = ForkWithFlags(CLONE_FS);
+  if (pid < 0) {
+    MOZ_CRASH("failed to clone chroot helper process");
+  }
+  if (pid > 0) {
+    return;
+  }
+
+  LinuxCapabilities caps;
+  caps.Effective(CAP_SYS_CHROOT) = true;
+  if (!caps.SetCurrent()) {
+    SANDBOX_LOG_ERROR("capset (chroot helper): %s", strerror(errno));
+    MOZ_DIAGNOSTIC_ASSERT(false);
+  }
+
+  CloseSuperfluousFds(mChrootMap);
+
+  char msg;
+  ssize_t msgLen = HANDLE_EINTR(read(mChrootServer, &msg, 1));
+  if (msgLen == 0) {
+    // Process exited before chrooting (or chose not to chroot?).
+    _exit(0);
+  }
+  MOZ_RELEASE_ASSERT(msgLen == 1);
+  MOZ_RELEASE_ASSERT(msg == kSandboxChrootRequest);
+
+  // This chroots both processes to this process's procfs fdinfo
+  // directory, which becomes empty and unlinked when this process
+  // exits at the end of this function, and which is always
+  // unwriteable.
+  int rv = chroot("/proc/self/fdinfo");
+  MOZ_RELEASE_ASSERT(rv == 0);
+
+  // Drop CAP_SYS_CHROOT ASAP.  This must happen before responding;
+  // the main child won't be able to waitpid(), so it could start
+  // handling hostile content before this process finishes exiting.
+  DropAllCaps();
+
+  // The working directory still grants access to the real filesystem;
+  // remove that.  (Note: if the process can obtain directory fds, for
+  // example via SandboxBroker, it must be blocked from using fchdir.)
+  rv = chdir("/");
+  MOZ_RELEASE_ASSERT(rv == 0);
+
+  msg = kSandboxChrootResponse;
+  msgLen = HANDLE_EINTR(write(mChrootServer, &msg, 1));
+  MOZ_RELEASE_ASSERT(msgLen == 1);
+  _exit(0);
+}
 
 } // namespace mozilla
--- a/security/sandbox/linux/launch/moz.build
+++ b/security/sandbox/linux/launch/moz.build
@@ -4,16 +4,17 @@
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 EXPORTS.mozilla += [
     'SandboxLaunch.h',
 ]
 
 SOURCES += [
+    'LinuxCapabilities.cpp',
     'SandboxLaunch.cpp',
 ]
 
 include('/ipc/chromium/chromium-config.mozbuild')
 
 LOCAL_INCLUDES += [
     # Need this for safe_sprintf.h used by SandboxLogging.h,
     # but it has to be after ipc/chromium/src.
--- a/security/sandbox/linux/moz.build
+++ b/security/sandbox/linux/moz.build
@@ -60,17 +60,16 @@ SOURCES += [
     '../chromium/sandbox/linux/bpf_dsl/policy.cc',
     '../chromium/sandbox/linux/bpf_dsl/policy_compiler.cc',
     '../chromium/sandbox/linux/bpf_dsl/syscall_set.cc',
     '../chromium/sandbox/linux/seccomp-bpf/die.cc',
     '../chromium/sandbox/linux/seccomp-bpf/syscall.cc',
     '../chromium/sandbox/linux/seccomp-bpf/trap.cc',
     '../chromium/sandbox/linux/services/syscall_wrappers.cc',
     'broker/SandboxBrokerCommon.cpp',
-    'LinuxCapabilities.cpp',
     'Sandbox.cpp',
     'SandboxBrokerClient.cpp',
     'SandboxFilter.cpp',
     'SandboxFilterUtil.cpp',
     'SandboxHooks.cpp',
     'SandboxInfo.cpp',
     'SandboxLogging.cpp',
     'SandboxReporterClient.cpp',