Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 14f9c15e authored by David Anderson's avatar David Anderson
Browse files

init: Add more diagnostics for signalfd hangs.

This adds two new diagnostics. First, signalfd reads are now non-blocking. If the read takes more than 10 seconds, we log an error.

Second, init now wakes up from epoll() every 10 seconds. If it waits on an "exec" command for more than 10 seconds, it logs an error.

This change will be reverted as soon as we get feedback.

Bug: 223076262
Test: device boots
Change-Id: I7ee98d159599217a641b3de2564a92c2435f57ef
parent a323e023
Loading
Loading
Loading
Loading
+30 −6
Original line number Diff line number Diff line
@@ -580,13 +580,30 @@ static void HandleSigtermSignal(const signalfd_siginfo& siginfo) {
    HandlePowerctlMessage("shutdown,container");
}

static constexpr std::chrono::milliseconds kDiagnosticTimeout = 10s;

static void HandleSignalFd() {
    signalfd_siginfo siginfo;
    auto started = std::chrono::steady_clock::now();
    for (;;) {
        ssize_t bytes_read = TEMP_FAILURE_RETRY(read(signal_fd, &siginfo, sizeof(siginfo)));
        if (bytes_read < 0 && errno == EAGAIN) {
            auto now = std::chrono::steady_clock::now();
            std::chrono::duration<double> waited = now - started;
            if (waited >= kDiagnosticTimeout) {
                LOG(ERROR) << "epoll() woke us up, but we waited with no SIGCHLD!";
                started = now;
            }

            std::this_thread::sleep_for(100ms);
            continue;
        }
        if (bytes_read != sizeof(siginfo)) {
            PLOG(ERROR) << "Failed to read siginfo from signal_fd";
            return;
        }
        break;
    }

    switch (siginfo.ssi_signo) {
        case SIGCHLD:
@@ -641,7 +658,7 @@ static void InstallSignalFdHandler(Epoll* epoll) {
        LOG(FATAL) << "Failed to register a fork handler: " << strerror(result);
    }

    signal_fd = signalfd(-1, &mask, SFD_CLOEXEC);
    signal_fd = signalfd(-1, &mask, SFD_CLOEXEC | SFD_NONBLOCK);
    if (signal_fd == -1) {
        PLOG(FATAL) << "failed to create signalfd";
    }
@@ -1020,7 +1037,7 @@ int SecondStageMain(int argc, char** argv) {
    setpriority(PRIO_PROCESS, 0, 0);
    while (true) {
        // By default, sleep until something happens.
        auto epoll_timeout = std::optional<std::chrono::milliseconds>{};
        auto epoll_timeout = std::optional<std::chrono::milliseconds>{kDiagnosticTimeout};

        auto shutdown_command = shutdown_state.CheckShutdown();
        if (shutdown_command) {
@@ -1060,6 +1077,13 @@ int SecondStageMain(int argc, char** argv) {
            for (const auto& function : *pending_functions) {
                (*function)();
            }
        } else if (Service::is_exec_service_running()) {
            std::chrono::duration<double> waited =
                    std::chrono::steady_clock::now() - Service::exec_service_started();
            if (waited >= kDiagnosticTimeout) {
                LOG(ERROR) << "Exec service is hung? Waited " << waited.count()
                           << " without SIGCHLD";
            }
        }
        if (!IsShuttingDown()) {
            HandleControlMessages();
+2 −0
Original line number Diff line number Diff line
@@ -127,6 +127,7 @@ static bool ExpandArgsAndExecv(const std::vector<std::string>& args, bool sigsto

unsigned long Service::next_start_order_ = 1;
bool Service::is_exec_service_running_ = false;
std::chrono::time_point<std::chrono::steady_clock> Service::exec_service_started_;

Service::Service(const std::string& name, Subcontext* subcontext_for_restart_commands,
                 const std::vector<std::string>& args, bool from_apex)
@@ -388,6 +389,7 @@ Result<void> Service::ExecStart() {

    flags_ |= SVC_EXEC;
    is_exec_service_running_ = true;
    exec_service_started_ = std::chrono::steady_clock::now();

    LOG(INFO) << "SVC_EXEC service '" << name_ << "' pid " << pid_ << " (uid " << proc_attr_.uid
              << " gid " << proc_attr_.gid << "+" << proc_attr_.supp_gids.size() << " context "
+5 −0
Original line number Diff line number Diff line
@@ -102,6 +102,9 @@ class Service {
    size_t CheckAllCommands() const { return onrestart_.CheckAllCommands(); }

    static bool is_exec_service_running() { return is_exec_service_running_; }
    static std::chrono::time_point<std::chrono::steady_clock> exec_service_started() {
        return exec_service_started_;
    }

    const std::string& name() const { return name_; }
    const std::set<std::string>& classnames() const { return classnames_; }
@@ -148,6 +151,8 @@ class Service {

    static unsigned long next_start_order_;
    static bool is_exec_service_running_;
    static std::chrono::time_point<std::chrono::steady_clock> exec_service_started_;
    static pid_t exec_service_pid_;

    std::string name_;
    std::set<std::string> classnames_;