Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 0ee759cc authored by Nikita Ioffe's avatar Nikita Ioffe Committed by android-build-merger
Browse files

Merge "userspace reboot: stop post-data services and wait for them to be killed"

am: 45922373

Change-Id: I0f9e4a03f883c8cf301f3f918c921e91c9612f03
parents 66568bf4 45922373
Loading
Loading
Loading
Loading
+116 −54
Original line number Diff line number Diff line
@@ -22,6 +22,7 @@
#include <linux/loop.h>
#include <mntent.h>
#include <semaphore.h>
#include <stdlib.h>
#include <sys/cdefs.h>
#include <sys/ioctl.h>
#include <sys/mount.h>
@@ -31,6 +32,7 @@
#include <sys/types.h>
#include <sys/wait.h>

#include <chrono>
#include <memory>
#include <set>
#include <thread>
@@ -41,6 +43,7 @@
#include <android-base/logging.h>
#include <android-base/macros.h>
#include <android-base/properties.h>
#include <android-base/scopeguard.h>
#include <android-base/strings.h>
#include <android-base/unique_fd.h>
#include <bootloader_message/bootloader_message.h>
@@ -59,6 +62,7 @@
#include "service.h"
#include "service_list.h"
#include "sigchld_handler.h"
#include "util.h"

#define PROC_SYSRQ "/proc/sysrq-trigger"

@@ -75,6 +79,19 @@ namespace init {

static bool shutting_down = false;

static const std::set<std::string> kDebuggingServices{"tombstoned", "logd", "adbd", "console"};

static std::vector<Service*> GetDebuggingServices(bool only_post_data) {
    std::vector<Service*> ret;
    ret.reserve(kDebuggingServices.size());
    for (const auto& s : ServiceList::GetInstance()) {
        if (kDebuggingServices.count(s->name()) && (!only_post_data || s->is_post_data())) {
            ret.push_back(s.get());
        }
    }
    return ret;
}

// represents umount status during reboot / shutdown.
enum UmountStat {
    /* umount succeeded. */
@@ -446,6 +463,49 @@ static void KillZramBackingDevice() {
    LOG(INFO) << "zram_backing_dev: `" << backing_dev << "` is cleared successfully.";
}

// Stops given services, waits for them to be stopped for |timeout| ms.
// If terminate is true, then SIGTERM is sent to services, otherwise SIGKILL is sent.
static void StopServices(const std::vector<Service*>& services, std::chrono::milliseconds timeout,
                         bool terminate) {
    LOG(INFO) << "Stopping " << services.size() << " services by sending "
              << (terminate ? "SIGTERM" : "SIGKILL");
    std::vector<pid_t> pids;
    pids.reserve(services.size());
    for (const auto& s : services) {
        if (s->pid() > 0) {
            pids.push_back(s->pid());
        }
        if (terminate) {
            s->Terminate();
        } else {
            s->Stop();
        }
    }
    if (timeout > 0ms) {
        WaitToBeReaped(pids, timeout);
    } else {
        // Even if we don't to wait for services to stop, we still optimistically reap zombies.
        ReapAnyOutstandingChildren();
    }
}

// Like StopServices, but also logs all the services that failed to stop after the provided timeout.
// Returns number of violators.
static int StopServicesAndLogViolations(const std::vector<Service*>& services,
                                        std::chrono::milliseconds timeout, bool terminate) {
    StopServices(services, timeout, terminate);
    int still_running = 0;
    for (const auto& s : services) {
        if (s->IsRunning()) {
            LOG(ERROR) << "[service-misbehaving] : service '" << s->name() << "' is still running "
                       << timeout.count() << "ms after receiving "
                       << (terminate ? "SIGTERM" : "SIGKILL");
            still_running++;
        }
    }
    return still_running;
}

//* Reboot / shutdown the system.
// cmd ANDROID_RB_* as defined in android_reboot.h
// reason Reason string like "reboot", "shutdown,userrequested"
@@ -510,12 +570,13 @@ static void DoReboot(unsigned int cmd, const std::string& reason, const std::str
    // Start reboot monitor thread
    sem_post(&reboot_semaphore);

    // keep debugging tools until non critical ones are all gone.
    const std::set<std::string> kill_after_apps{"tombstoned", "logd", "adbd"};
    // watchdogd is a vendor specific component but should be alive to complete shutdown safely.
    const std::set<std::string> to_starts{"watchdogd"};
    std::vector<Service*> stop_first;
    stop_first.reserve(ServiceList::GetInstance().services().size());
    for (const auto& s : ServiceList::GetInstance()) {
        if (kill_after_apps.count(s->name())) {
        if (kDebuggingServices.count(s->name())) {
            // keep debugging tools until non critical ones are all gone.
            s->SetShutdownCritical();
        } else if (to_starts.count(s->name())) {
            if (auto result = s->Start(); !result) {
@@ -529,6 +590,8 @@ static void DoReboot(unsigned int cmd, const std::string& reason, const std::str
                LOG(ERROR) << "Could not start shutdown critical service '" << s->name()
                           << "': " << result.error();
            }
        } else {
            stop_first.push_back(s.get());
        }
    }

@@ -571,49 +634,12 @@ static void DoReboot(unsigned int cmd, const std::string& reason, const std::str
    // optional shutdown step
    // 1. terminate all services except shutdown critical ones. wait for delay to finish
    if (shutdown_timeout > 0ms) {
        LOG(INFO) << "terminating init services";

        // Ask all services to terminate except shutdown critical ones.
        for (const auto& s : ServiceList::GetInstance().services_in_shutdown_order()) {
            if (!s->IsShutdownCritical()) s->Terminate();
        }

        int service_count = 0;
        // Only wait up to half of timeout here
        auto termination_wait_timeout = shutdown_timeout / 2;
        while (t.duration() < termination_wait_timeout) {
            ReapAnyOutstandingChildren();

            service_count = 0;
            for (const auto& s : ServiceList::GetInstance()) {
                // Count the number of services running except shutdown critical.
                // Exclude the console as it will ignore the SIGTERM signal
                // and not exit.
                // Note: SVC_CONSOLE actually means "requires console" but
                // it is only used by the shell.
                if (!s->IsShutdownCritical() && s->pid() != 0 && (s->flags() & SVC_CONSOLE) == 0) {
                    service_count++;
                }
            }

            if (service_count == 0) {
                // All terminable services terminated. We can exit early.
                break;
            }

            // Wait a bit before recounting the number or running services.
            std::this_thread::sleep_for(50ms);
        }
        LOG(INFO) << "Terminating running services took " << t
                  << " with remaining services:" << service_count;
    }

    // minimum safety steps before restarting
    // 2. kill all services except ones that are necessary for the shutdown sequence.
    for (const auto& s : ServiceList::GetInstance().services_in_shutdown_order()) {
        if (!s->IsShutdownCritical()) s->Stop();
        StopServicesAndLogViolations(stop_first, shutdown_timeout / 2, true /* SIGTERM */);
    }
    // Send SIGKILL to ones that didn't terminate cleanly.
    StopServicesAndLogViolations(stop_first, 0ms, false /* SIGKILL */);
    SubcontextTerminate();
    // Reap subcontext pids.
    ReapAnyOutstandingChildren();

    // 3. send volume shutdown to vold
@@ -625,9 +651,7 @@ static void DoReboot(unsigned int cmd, const std::string& reason, const std::str
        LOG(INFO) << "vold not running, skipping vold shutdown";
    }
    // logcat stopped here
    for (const auto& s : ServiceList::GetInstance().services_in_shutdown_order()) {
        if (kill_after_apps.count(s->name())) s->Stop();
    }
    StopServices(GetDebuggingServices(false /* only_post_data */), 0ms, false /* SIGKILL */);
    // 4. sync, try umount, and optionally run fsck for user shutdown
    {
        Timer sync_timer;
@@ -660,6 +684,7 @@ static void DoReboot(unsigned int cmd, const std::string& reason, const std::str
}

static void EnterShutdown() {
    LOG(INFO) << "Entering shutdown mode";
    shutting_down = true;
    // Skip wait for prop if it is in progress
    ResetWaitForProp();
@@ -675,21 +700,61 @@ static void EnterShutdown() {
}

static void LeaveShutdown() {
    LOG(INFO) << "Leaving shutdown mode";
    shutting_down = false;
    SendStartSendingMessagesMessage();
}

static void DoUserspaceReboot() {
static Result<void> DoUserspaceReboot() {
    LOG(INFO) << "Userspace reboot initiated";
    auto guard = android::base::make_scope_guard([] {
        // Leave shutdown so that we can handle a full reboot.
        LeaveShutdown();
        property_set("sys.powerctl", "reboot,abort-userspace-reboot");
    });
    // Triggering userspace-reboot-requested will result in a bunch of set_prop
    // actions. We should make sure, that all of them are propagated before
    // proceeding with userspace reboot.
    // TODO(b/135984674): implement proper synchronization logic.
    std::this_thread::sleep_for(500ms);
    EnterShutdown();
    // TODO(b/135984674): tear down post-data services
    LeaveShutdown();
    std::vector<Service*> stop_first;
    // Remember the services that were enabled. We will need to manually enable them again otherwise
    // triggers like class_start won't restart them.
    std::vector<Service*> were_enabled;
    stop_first.reserve(ServiceList::GetInstance().services().size());
    for (const auto& s : ServiceList::GetInstance().services_in_shutdown_order()) {
        if (s->is_post_data() && !kDebuggingServices.count(s->name())) {
            stop_first.push_back(s);
        }
        if (s->is_post_data() && s->IsEnabled()) {
            were_enabled.push_back(s);
        }
    }
    // TODO(b/135984674): do we need shutdown animation for userspace reboot?
    // TODO(b/135984674): control userspace timeout via read-only property?
    StopServicesAndLogViolations(stop_first, 10s, true /* SIGTERM */);
    if (int r = StopServicesAndLogViolations(stop_first, 20s, false /* SIGKILL */); r > 0) {
        // TODO(b/135984674): store information about offending services for debugging.
        return Error() << r << " post-data services are still running";
    }
    // TODO(b/135984674): remount userdata
    if (int r = StopServicesAndLogViolations(GetDebuggingServices(true /* only_post_data */), 5s,
                                             false /* SIGKILL */);
        r > 0) {
        // TODO(b/135984674): store information about offending services for debugging.
        return Error() << r << " debugging services are still running";
    }
    // TODO(b/135984674): deactivate APEX modules and switch back to bootstrap namespace.
    // Re-enable services
    for (const auto& s : were_enabled) {
        LOG(INFO) << "Re-enabling service '" << s->name() << "'";
        s->Enable();
    }
    LeaveShutdown();
    ActionManager::GetInstance().QueueEventTrigger("userspace-reboot-resume");
    guard.Disable();  // Go on with userspace reboot.
    return {};
}

static void HandleUserspaceReboot() {
@@ -697,10 +762,7 @@ static void HandleUserspaceReboot() {
    auto& am = ActionManager::GetInstance();
    am.ClearQueue();
    am.QueueEventTrigger("userspace-reboot-requested");
    auto handler = [](const BuiltinArguments&) {
        DoUserspaceReboot();
        return Result<void>{};
    };
    auto handler = [](const BuiltinArguments&) { return DoUserspaceReboot(); };
    am.QueueBuiltinAction(handler, "userspace-reboot");
}

+1 −0
Original line number Diff line number Diff line
@@ -75,6 +75,7 @@ class Service {
            const std::vector<std::string>& args);

    bool IsRunning() { return (flags_ & SVC_RUNNING) != 0; }
    bool IsEnabled() { return (flags_ & SVC_DISABLED) == 0; }
    Result<void> ExecStart();
    Result<void> Start();
    Result<void> StartIfNotDisabled();
+30 −7
Original line number Diff line number Diff line
@@ -28,28 +28,31 @@
#include <android-base/scopeguard.h>
#include <android-base/stringprintf.h>

#include <thread>

#include "init.h"
#include "service.h"
#include "service_list.h"

using android::base::StringPrintf;
using android::base::boot_clock;
using android::base::make_scope_guard;
using android::base::StringPrintf;
using android::base::Timer;

namespace android {
namespace init {

static bool ReapOneProcess() {
static pid_t ReapOneProcess() {
    siginfo_t siginfo = {};
    // This returns a zombie pid or informs us that there are no zombies left to be reaped.
    // It does NOT reap the pid; that is done below.
    if (TEMP_FAILURE_RETRY(waitid(P_ALL, 0, &siginfo, WEXITED | WNOHANG | WNOWAIT)) != 0) {
        PLOG(ERROR) << "waitid failed";
        return false;
        return 0;
    }

    auto pid = siginfo.si_pid;
    if (pid == 0) return false;
    if (pid == 0) return 0;

    // At this point we know we have a zombie pid, so we use this scopeguard to reap the pid
    // whenever the function returns from this point forward.
@@ -92,7 +95,7 @@ static bool ReapOneProcess() {
        LOG(INFO) << name << " received signal " << siginfo.si_status << wait_string;
    }

    if (!service) return true;
    if (!service) return pid;

    service->Reap(siginfo);

@@ -100,12 +103,32 @@ static bool ReapOneProcess() {
        ServiceList::GetInstance().RemoveService(*service);
    }

    return true;
    return pid;
}

void ReapAnyOutstandingChildren() {
    while (ReapOneProcess()) {
    while (ReapOneProcess() != 0) {
    }
}

void WaitToBeReaped(const std::vector<pid_t>& pids, std::chrono::milliseconds timeout) {
    Timer t;
    std::vector<pid_t> alive_pids(pids.begin(), pids.end());
    while (!alive_pids.empty() && t.duration() < timeout) {
        pid_t pid;
        while ((pid = ReapOneProcess()) != 0) {
            auto it = std::find(alive_pids.begin(), alive_pids.end(), pid);
            if (it != alive_pids.end()) {
                alive_pids.erase(it);
            }
        }
        if (alive_pids.empty()) {
            break;
        }
        std::this_thread::sleep_for(50ms);
    }
    LOG(INFO) << "Waiting for " << pids.size() << " pids to be reaped took " << t << " with "
              << alive_pids.size() << " of them still running";
}

}  // namespace init
+5 −0
Original line number Diff line number Diff line
@@ -17,11 +17,16 @@
#ifndef _INIT_SIGCHLD_HANDLER_H_
#define _INIT_SIGCHLD_HANDLER_H_

#include <chrono>
#include <vector>

namespace android {
namespace init {

void ReapAnyOutstandingChildren();

void WaitToBeReaped(const std::vector<pid_t>& pids, std::chrono::milliseconds timeout);

}  // namespace init
}  // namespace android

+5 −2
Original line number Diff line number Diff line
@@ -919,11 +919,14 @@ on property:ro.debuggable=1
on init && property:ro.debuggable=1
    start console

on userspace-reboot:
on userspace-reboot
  # TODO(b/135984674): reset all necessary properties here.
  setprop sys.init.userspace_reboot_in_progress 1
  setprop sys.boot_completed 0
  setprop sys.init.updatable_crashing 0
  setprop apexd.status 0

on userspace-reboot-resume:
on userspace-reboot-resume
  # TODO(b/135984674): remount userdata and reset checkpointing
  trigger nonencrypted
  trigger post-fs-data