/* This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ #include #include #include #include #include #include #include #include "base/eintr_wrapper.h" #include "base/logging.h" #include "base/message_loop.h" #include "base/process_util.h" #include "mozilla/DataMutex.h" #include "mozilla/StaticPtr.h" #include "mozilla/ipc/IOThread.h" #include "nsITimer.h" #include "nsTArray.h" #include "nsThreadUtils.h" #include "nsXULAppAPI.h" #include "prenv.h" #include "chrome/common/process_watcher.h" #ifdef MOZ_ENABLE_FORKSERVER # include "mozilla/ipc/ForkServiceChild.h" #endif #if defined(XP_LINUX) && !defined(ANDROID) # include "mozilla/AvailableMemoryWatcher.h" # include "mozilla/glean/XpcomMetrics.h" # include "nsPrintfCString.h" #endif // Just to make sure the moz.build is doing the right things with // TARGET_OS and/or OS_TARGET: #if defined(MOZ_WIDGET_ANDROID) || defined(MOZ_WIDGET_UIKIT) # error Unsupported OS #endif #if !defined(XP_DARWIN) // Linux, {Free,Net,Open}BSD, and Solaris; but not macOS, yet. # define HAVE_PIPE2 1 #endif // The basic idea here is a minimal SIGCHLD handler which writes to a // pipe and a libevent callback on the I/O thread which fires when the // other end becomes readable. When we start waiting for process // termination we check if it had already terminated, and otherwise // register it to be checked later when SIGCHLD fires. // // Making this more complicated is that we usually want to kill the // process after a timeout, in case it hangs trying to exit, but not // if it's already exited by that point (see `DelayedKill`). // But we also support waiting indefinitely, for debug/CI use cases // like refcount logging / leak detection / code coverage, and in that // case we block parent process shutdown until all children exit // (which is done by blocking the I/O thread late in shutdown, which // isn't ideal, but the Windows implementation has the same issue). // Maximum amount of time (in milliseconds) to wait for the process to exit. // XXX/cjones: fairly arbitrary, chosen to match process_watcher_win.cc static constexpr int kMaxWaitMs = 2000; // This is also somewhat arbitrary, but loosely based on Try results. // See also toolkit.asyncshutdown.crash_timeout (currently 60s) after // which the parent process will be killed. #ifdef MOZ_CODE_COVERAGE // Code coverage instrumentation can be slow (especially when writing // out data, which has to take a lock on the data files). static constexpr int kShutdownWaitMs = 80000; #elif defined(MOZ_ASAN) || defined(MOZ_TSAN) // Sanitizers slow things down in some cases; see bug 1806224. static constexpr int kShutdownWaitMs = 40000; #else static constexpr int kShutdownWaitMs = 8000; #endif namespace { using base::BlockingWait; // Represents a child process being awaited (which is expected to exit // soon, or already has). // // If `mForce` is null then we will wait indefinitely (and block // parent shutdown; see above); otherwise it will be killed after a // timeout (or during parent shutdown, if that happens first). struct PendingChild { pid_t mPid; nsCOMPtr mForce; }; // `EnsureProcessTerminated` is called when a process is expected to // be shutting down, so there should be relatively few `PendingChild` // instances at any given time, meaning that using an array and doing // O(n) operations should be fine. static mozilla::StaticDataMutex>> gPendingChildren("ProcessWatcher::gPendingChildren"); static int gSignalPipe[2] = {-1, -1}; static mozilla::Atomic gProcessWatcherShutdown; #if defined(XP_LINUX) && !defined(ANDROID) // Record Glean event when a content process is killed by OOM killer static void RecordContentProcessOOMKilled() { // Get PSI data mozilla::PSIInfo psi; nsresult rv = mozilla::GetLastPSISnapshot(psi); if (NS_SUCCEEDED(rv)) { // Record Glean event with PSI metrics mozilla::glean::memory_watcher::ProcessOomKilledExtra extra; extra.psiSomeAvg10 = mozilla::Some(nsPrintfCString("%lu", psi.some_avg10)); extra.psiSomeAvg60 = mozilla::Some(nsPrintfCString("%lu", psi.some_avg60)); extra.psiFullAvg10 = mozilla::Some(nsPrintfCString("%lu", psi.full_avg10)); extra.psiFullAvg60 = mozilla::Some(nsPrintfCString("%lu", psi.full_avg60)); extra.psiAvailable = mozilla::Some(psi.psi_available); mozilla::glean::memory_watcher::process_oom_killed.Record( mozilla::Some(extra)); mozilla::StartNonOOMPSISampling(); } } #endif // A wrapper around WaitForProcess to simplify the result (true if the // process exited and the pid is now freed for reuse, false if it's // still running), and handle the case where "blocking" mode doesn't // block (so this function will always return true if `aBlock` is // `YES`), and log a warning message if the process didn't exit // successfully (as in `exit(0)`). static bool IsProcessDead(pid_t pid, BlockingWait aBlock) { int info = 0; auto status = WaitForProcess(pid, aBlock, &info); while (aBlock == BlockingWait::Yes && status == base::ProcessStatus::Running) { // It doesn't matter if this is interrupted; we just need to // wait for some amount of time while the other process status // event is (hopefully) handled. This is used only during an // error case at shutdown, so a 1s wait won't be too noticeable. sleep(1); status = WaitForProcess(pid, aBlock, &info); } switch (status) { case base::ProcessStatus::Running: return false; case base::ProcessStatus::Exited: if (info != 0) { CHROMIUM_LOG(WARNING) << "process " << pid << " exited with status " << info; } return true; case base::ProcessStatus::Killed: CHROMIUM_LOG(WARNING) << "process " << pid << " exited on signal " << info; #if defined(XP_LINUX) && !defined(ANDROID) // Record telemetry for OOM kills if (info == SIGKILL) { NS_DispatchToMainThread( NS_NewRunnableFunction("ContentProcessOOMTelemetry", []() { RecordContentProcessOOMKilled(); })); } #endif return true; case base::ProcessStatus::Error: CHROMIUM_LOG(ERROR) << "waiting for process " << pid << " failed with error " << info; // Don't keep trying. return true; default: DCHECK(false) << "can't happen"; return true; } } // Creates a timer to kill the process after a delay, for the // `force=true` case. The timer is bound to the I/O thread, which // means it needs to be cancelled there (and thus that child exit // notifications need to be handled on the I/O thread). already_AddRefed DelayedKill(pid_t aPid) { nsCOMPtr timer; nsresult rv = NS_NewTimerWithCallback( getter_AddRefs(timer), [aPid](nsITimer*) { // If the process already exited, normally it would remain as // a zombie and the `SIGKILL` would be ignored. But if the // fork server crashed, then the child would be reparented to // pid 1 and cleaned up immediately, so in that case we should // not try to signal. if (IsProcessDead(aPid, BlockingWait::No)) { return; } // In theory it's possible for the fork server to crash and // the child process to exit and have its pid reused by a new // process all between these two statements, but that is // *extremely* unlikely. if (kill(aPid, SIGKILL) != 0) { const int err = errno; // Bug 1944669: suppress logging if it's a forkserver child // process that already exited. (Before bug 1658072 we // would kill(pid, 0) first, but that doesn't change // anything.) This can be removed with bug 1752638. #ifdef MOZ_ENABLE_FORKSERVER const bool forkServed = mozilla::ipc::ForkServiceChild::WasUsed(); #else constexpr bool forkServed = false; #endif if (err != ESRCH || !forkServed) { CHROMIUM_LOG(ERROR) << "failed to send SIGKILL to process " << aPid << strerror(err); } } // If the process was still running, it will exit and the // SIGCHLD handler will waitpid it. }, kMaxWaitMs, nsITimer::TYPE_ONE_SHOT, "ProcessWatcher::DelayedKill"_ns, XRE_GetAsyncIOEventTarget()); // This should happen only during shutdown, in which case we're // about to kill the process anyway during I/O thread destruction. if (NS_FAILED(rv)) { CHROMIUM_LOG(WARNING) << "failed to start kill timer for process " << aPid << "; killing immediately"; kill(aPid, SIGKILL); return nullptr; } return timer.forget(); } bool CrashProcessIfHanging(pid_t aPid) { if (IsProcessDead(aPid, BlockingWait::No)) { return false; } // If child processes seems to be hanging on shutdown, wait for a // reasonable time. The wait is global instead of per-process // because the child processes should be shutting down in // parallel, and also we're potentially racing global timeouts // like nsTerminator. (The counter doesn't need to be atomic; // this is always called on the I/O thread.) static int sWaitMs = kShutdownWaitMs; if (sWaitMs > 0) { CHROMIUM_LOG(WARNING) << "Process " << aPid << " may be hanging at shutdown; will wait for up to " << sWaitMs << "ms"; } // There isn't a way to do a time-limited wait that's both // portable and doesn't require messing with signals. Instead, we // sleep in short increments and poll the process status. while (sWaitMs > 0) { static constexpr int kWaitTickMs = 200; struct timespec ts = {kWaitTickMs / 1000, (kWaitTickMs % 1000) * 1000000}; HANDLE_EINTR(nanosleep(&ts, &ts)); sWaitMs -= kWaitTickMs; if (IsProcessDead(aPid, BlockingWait::No)) { return false; } } // We want TreeHerder to flag this log line as an error, so that // this is more obviously a deliberate crash; "fatal error" is one // of the strings it looks for. CHROMIUM_LOG(ERROR) << "Process " << aPid << " hanging at shutdown; attempting crash report (fatal error)."; kill(aPid, SIGABRT); return true; } // Most of the logic is here. Reponds to SIGCHLD via the self-pipe, // and handles shutdown behavior in `WillDestroyCurrentMessageLoop`. // There is one instance of this class; it's created the first time // it's used and destroys itself during IPC shutdown. class ProcessCleaner final : public MessageLoopForIO::Watcher, public MessageLoop::DestructionObserver { public: // Safety: this must be called on the I/O thread. void Register() { MessageLoopForIO* loop = MessageLoopForIO::current(); loop->AddDestructionObserver(this); loop->WatchFileDescriptor(gSignalPipe[0], /* persistent= */ true, MessageLoopForIO::WATCH_READ, &mWatcher, this); } void OnFileCanReadWithoutBlocking(int fd) override { DCHECK(fd == gSignalPipe[0]); ssize_t rv; // Drain the pipe and prune dead processes. do { char msg[32]; rv = HANDLE_EINTR(read(gSignalPipe[0], msg, sizeof msg)); CHECK(rv != 0); if (rv < 0) { DCHECK(errno == EAGAIN || errno == EWOULDBLOCK); } else { #ifdef DEBUG for (size_t i = 0; i < (size_t)rv; ++i) { DCHECK(msg[i] == 0); } #endif } } while (rv > 0); PruneDeadProcesses(); } void OnFileCanWriteWithoutBlocking(int fd) override { CHROMIUM_LOG(FATAL) << "unreachable"; } void WillDestroyCurrentMessageLoop() override { gProcessWatcherShutdown = true; mWatcher.StopWatchingFileDescriptor(); auto lock = gPendingChildren.Lock(); auto& children = lock.ref(); if (children) { for (const auto& child : *children) { // If the child still has force-termination pending, do that now. if (child.mForce) { // This is too late for timers to run, so no need to Cancel(). // // FIXME (bug 1724337, approximately): This code isn't run at // all in practice, because the parent process will already have // exited (unless the fastShutdownStage pref is changed). if (kill(child.mPid, SIGKILL) != 0) { CHROMIUM_LOG(ERROR) << "failed to send SIGKILL to process " << child.mPid; continue; } } else { // Exception for the fake hang tests in ipc/glue/test/browser // (See also the comment in `~ProcessChild()`.) if (!PR_GetEnv("MOZ_TEST_CHILD_EXIT_HANG") && !CrashProcessIfHanging(child.mPid)) { continue; } } // If the process was just killed, it should exit immediately; // otherwise, block until it exits on its own. IsProcessDead(child.mPid, BlockingWait::Yes); } children = nullptr; } #ifdef MOZ_ENABLE_FORKSERVER mozilla::ipc::ForkServiceChild::StopForkServer(); #endif delete this; } private: MessageLoopForIO::FileDescriptorWatcher mWatcher; static void PruneDeadProcesses() { auto lock = gPendingChildren.Lock(); auto& children = lock.ref(); if (!children || children->IsEmpty()) { return; } nsTArray live; for (const auto& child : *children) { if (IsProcessDead(child.mPid, BlockingWait::No)) { if (child.mForce) { child.mForce->Cancel(); } } else { live.AppendElement(child); } } *children = std::move(live); } }; static void HandleSigChld(int signum) { DCHECK(signum == SIGCHLD); char msg = 0; HANDLE_EINTR(write(gSignalPipe[1], &msg, 1)); // Can't log here if this fails (at least not normally; SafeSPrintf // from security/sandbox/chromium could be used). // // (Note that this could fail with EAGAIN if the pipe buffer becomes // full; this is extremely unlikely, and it doesn't matter because // the reader will be woken up regardless and doesn't care about the // number of signals delivered.) } static void ProcessWatcherInit() { int rv; #ifdef HAVE_PIPE2 rv = pipe2(gSignalPipe, O_NONBLOCK | O_CLOEXEC); CHECK(rv == 0) << "pipe2() failed"; #else rv = pipe(gSignalPipe); CHECK(rv == 0) << "pipe() failed"; for (int fd : gSignalPipe) { rv = fcntl(fd, F_SETFL, O_NONBLOCK); CHECK(rv == 0) << "O_NONBLOCK failed"; rv = fcntl(fd, F_SETFD, FD_CLOEXEC); CHECK(rv == 0) << "FD_CLOEXEC failed"; } #endif // HAVE_PIPE2 // Currently there are no other SIGCHLD handlers; this is debug // asserted. If the situation changes, it should be relatively // simple to delegate; note that this ProcessWatcher doesn't // interfere with child processes it hasn't been asked to handle. auto oldHandler = signal(SIGCHLD, HandleSigChld); CHECK(oldHandler != SIG_ERR); DCHECK(oldHandler == SIG_DFL); // Start the ProcessCleaner; registering it with the I/O thread must // happen on the I/O thread itself. It's okay for that to happen // asynchronously: the callback is level-triggered, so if the signal // handler already wrote to the pipe at that point then it will be // detected, and the signal itself is async so additional delay // doesn't change the semantics. XRE_GetAsyncIOEventTarget()->Dispatch( NS_NewRunnableFunction("ProcessCleaner::Register", [] { ProcessCleaner* pc = new ProcessCleaner(); pc->Register(); })); } static void EnsureProcessWatcher() { static std::once_flag sInited; std::call_once(sInited, ProcessWatcherInit); } } // namespace mozilla::UniqueFileHandle ProcessWatcher::GetSignalPipe() { EnsureProcessWatcher(); int fd = gSignalPipe[1]; MOZ_ASSERT(fd >= 0); auto rv = mozilla::DuplicateFileHandle(fd); MOZ_ASSERT(rv); return rv; } /** * Do everything possible to ensure that |process| has been reaped * before this process exits. * * |force| decides how strict to be with the child's shutdown. * * | child exit timeout | upon parent shutdown: * +--------------------+---------------------------------- * force=true | 2 seconds | kill(child, SIGKILL) * force=false | infinite | waitpid(child) * * If a child process doesn't shut down properly, and |force=false| * used, then the parent will wait on the child forever. So, * |force=false| is expected to be used when an external entity can be * responsible for terminating hung processes, e.g. automated test * harnesses. */ void ProcessWatcher::EnsureProcessTerminated(base::ProcessHandle process, bool force) { DCHECK(process != base::GetCurrentProcId()); DCHECK(process > 0); if (gProcessWatcherShutdown) { // This late in shutdown, should only come from the I/O thread; // see further comments below. mozilla::ipc::AssertIOThread(); // This should always be true given that gProcessWatcherShutdown // is set, but just in case something changes with MessageLoop // shutdown: DCHECK(!MessageLoop::current()->IsAcceptingTasks()); // This is for the fork server itself, being torn down late // in shutdown. Generally won't be reached with force=true, // because build types that default to it will QuickExit first. // It's not strictly necessary to wait for child processes when // the parent process is about to exit (pid 1 should clean them // up). // // However, if called in "wait forever" mode, let's wait for it // and log the exit status if it was abnormal: if (!force) { (void)IsProcessDead(process, BlockingWait::Yes); } return; } EnsureProcessWatcher(); auto lock = gPendingChildren.Lock(); auto& children = lock.ref(); // Check if the process already exited. This needs to happen under // the `gPendingChildren` lock to prevent this sequence: // // A1. this non-blocking wait fails // B1. the process exits // B2. SIGCHLD is handled // B3. the ProcessCleaner wakes up and drains the signal pipe // A2. the process is added to `gPendingChildren` // // Holding the lock prevents B3 from occurring between A1 and A2. if (IsProcessDead(process, BlockingWait::No)) { return; } if (!children) { children = new nsTArray(); } // Check for duplicate pids. This is safe even in corner cases with // pid reuse: the pid can't be reused by the OS until the zombie // process has been waited, and both the `waitpid` and the following // removal of the `PendingChild` object occur while continually // holding the lock, which is also held here. for (const auto& child : *children) { if (child.mPid == process) { #ifdef MOZ_ENABLE_FORKSERVER if (mozilla::ipc::ForkServiceChild::WasUsed()) { // Ideally, this would never be reached. But, in theory it's // possible if the fork server crashes and is restarted: the // process will be reparented to pid 1 which will clean it up // immediately, at which point the pid could be reused (but // it's very unlikely for that to happen so soon). So, if // this is reached without any mistakes by the calling code, // in that case the old process has already terminated and // ProcessWatcher has no more responsibility for it. CHROMIUM_LOG(WARNING) << "EnsureProcessTerminated: duplicate process" " ID " << process; // So, we want to end up with a PendingChild for the new // process; we can just use the old one. Ideally we'd fix the // `mForce` value, but that would involve needing to cancel a // timer when we aren't necessarily on the right thread, and // in practice the `force` parameter depends only on the build // type. return; } #endif MOZ_ASSERT(false, "EnsureProcessTerminated must be called at most once for a " "given process"); return; } } PendingChild child{}; child.mPid = process; if (force) { child.mForce = DelayedKill(process); } children->AppendElement(std::move(child)); }