Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 6e2763fb authored by Atneya Nair's avatar Atneya Nair Committed by Android (Google) Code Review
Browse files

Merge "Abort correct thread on TimeCheck timeout"

parents 0e7b2f1e cce14202
Loading
Loading
Loading
Loading
+26 −9
Original line number Diff line number Diff line
@@ -14,11 +14,14 @@
 * limitations under the License.
 */

#include <csignal>
#include "mediautils/TimerThread.h"
#define LOG_TAG "TimeCheck"

#include <optional>

#include <android-base/logging.h>
#include <android-base/strings.h>
#include <audio_utils/clock.h>
#include <mediautils/EventLog.h>
#include <mediautils/FixedString.h>
@@ -27,20 +30,21 @@
#include <mediautils/TidWrapper.h>
#include <utils/Log.h>

#if defined(__BIONIC__)
#if defined(__ANDROID__)
#include "debuggerd/handler.h"
#endif


namespace android::mediautils {
// This function appropriately signals a pid to dump a backtrace if we are
// running on device.
// running on device (and the HAL exists). If we are not running on an Android
// device, there is no HAL to signal (so we do nothing).
static inline void signalAudioHAL([[maybe_unused]] pid_t pid) {
#if defined(__BIONIC__)
#if defined(__ANDROID__)
    sigqueue(pid, DEBUGGER_SIGNAL, {.sival_int = 0});
#endif
}

namespace android::mediautils {

/**
 * Returns the std::string "HH:MM:SS.MSc" from a system_clock time_point.
 */
@@ -148,7 +152,7 @@ TimerThread& TimeCheck::getTimeCheckThread() {
std::string TimeCheck::toString() {
    // note pending and retired are individually locked for maximum concurrency,
    // snapshot is not instantaneous at a single time.
    return getTimeCheckThread().toString();
    return getTimeCheckThread().getSnapshotAnalysis().toString();
}

TimeCheck::TimeCheck(std::string_view tag, OnTimerFunc&& onTimer, Duration requestedTimeoutDuration,
@@ -253,7 +257,7 @@ void TimeCheck::TimeCheckHandler::onTimeout(TimerThread::Handle timerHandle) con

    // Generate the TimerThread summary string early before sending signals to the
    // HAL processes which can affect thread behavior.
    const std::string summary = getTimeCheckThread().toString(4 /* retiredCount */);
    const auto snapshotAnalysis = getTimeCheckThread().getSnapshotAnalysis(4 /* retiredCount */);

    // Generate audio HAL processes tombstones and allow time to complete
    // before forcing restart
@@ -281,7 +285,7 @@ void TimeCheck::TimeCheckHandler::onTimeout(TimerThread::Handle timerHandle) con
            .append(analyzeTimeouts(requestedTimeoutMs + secondChanceMs,
                    elapsedSteadyMs, elapsedSystemMs)).append("\n")
            .append(halPids).append("\n")
            .append(summary);
            .append(snapshotAnalysis.toString());

    // Note: LOG_ALWAYS_FATAL limits the size of the string - per log/log.h:
    // Log message text may be truncated to less than an
@@ -291,7 +295,20 @@ void TimeCheck::TimeCheckHandler::onTimeout(TimerThread::Handle timerHandle) con
    // to avoid the size limitation. LOG(FATAL) does an abort whereas
    // LOG(FATAL_WITHOUT_ABORT) does not abort.

    LOG(FATAL) << abortMessage;
    static constexpr pid_t invalidPid = TimerThread::SnapshotAnalysis::INVALID_PID;
    pid_t tidToAbort = invalidPid;
    if (snapshotAnalysis.suspectTid != invalidPid) {
        tidToAbort = snapshotAnalysis.suspectTid;
    } else if (snapshotAnalysis.timeoutTid != invalidPid) {
        tidToAbort = snapshotAnalysis.timeoutTid;
    }

    LOG(FATAL_WITHOUT_ABORT) << abortMessage;
    const auto ret = abortTid(tidToAbort);
    if (ret < 0) {
        LOG(FATAL) << "TimeCheck thread signal failed, aborting process. "
                       "errno: " << errno << base::ErrnoNumberAsString(errno);
    }
}

// Automatically create a TimeCheck class for a class and method.
+36 −41
Original line number Diff line number Diff line
@@ -59,39 +59,29 @@ bool TimerThread::cancelTask(Handle handle) {
    return true;
}

std::string TimerThread::toString(size_t retiredCount) const {

std::string TimerThread::SnapshotAnalysis::toString() const {
    // Note: These request queues are snapshot very close together but
    // not at "identical" times as we don't use a class-wide lock.

    std::vector<std::shared_ptr<const Request>> timeoutRequests;
    std::vector<std::shared_ptr<const Request>> retiredRequests;
    mTimeoutQueue.copyRequests(timeoutRequests);
    mRetiredQueue.copyRequests(retiredRequests, retiredCount);
    std::vector<std::shared_ptr<const Request>> pendingRequests =
        getPendingRequests();

    struct Analysis analysis = analyzeTimeout(timeoutRequests, pendingRequests);
    std::string analysisSummary;
    if (!analysis.summary.empty()) {
        analysisSummary = std::string("\nanalysis [ ").append(analysis.summary).append(" ]");
    }
    std::string analysisSummary = std::string("\nanalysis [ ").append(description).append(" ]");
    std::string timeoutStack;
    if (analysis.timeoutTid != -1) {
        timeoutStack = std::string("\ntimeout(")
                .append(std::to_string(analysis.timeoutTid)).append(") callstack [\n")
                .append(getCallStackStringForTid(analysis.timeoutTid)).append("]");
    }
    std::string blockedStack;
    if (analysis.HALBlockedTid != -1) {
    if (timeoutTid != -1) {
        timeoutStack = std::string(suspectTid == timeoutTid ? "\ntimeout/blocked(" : "\ntimeout(")
                .append(std::to_string(timeoutTid)).append(") callstack [\n")
                .append(getCallStackStringForTid(timeoutTid)).append("]");
    }

    if (suspectTid != -1 && suspectTid != timeoutTid) {
        blockedStack = std::string("\nblocked(")
                .append(std::to_string(analysis.HALBlockedTid)).append(")  callstack [\n")
                .append(getCallStackStringForTid(analysis.HALBlockedTid)).append("]");
                .append(std::to_string(suspectTid)).append(")  callstack [\n")
                .append(getCallStackStringForTid(suspectTid)).append("]");
    }

    return std::string("now ")
            .append(formatTime(std::chrono::system_clock::now()))
            .append("\nsecondChanceCount ")
            .append(std::to_string(mMonitorThread.getSecondChanceCount()))
            .append(std::to_string(secondChanceCount))
            .append(analysisSummary)
            .append("\ntimeout [ ")
            .append(requestsToString(timeoutRequests))
@@ -121,16 +111,23 @@ bool TimerThread::isRequestFromHal(const std::shared_ptr<const Request>& request
    return separatorPos != std::string::npos;
}

/* static */
struct TimerThread::Analysis TimerThread::analyzeTimeout(
    const std::vector<std::shared_ptr<const Request>>& timeoutRequests,
    const std::vector<std::shared_ptr<const Request>>& pendingRequests) {

    if (timeoutRequests.empty() || pendingRequests.empty()) return {}; // nothing to say.

struct TimerThread::SnapshotAnalysis TimerThread::getSnapshotAnalysis(size_t retiredCount) const {
    struct SnapshotAnalysis analysis{};
    // The following snapshot of the TimerThread state will be utilized for
    // analysis. Note, there is no lock around these calls, so there could be
    // a state update between them.
    mTimeoutQueue.copyRequests(analysis.timeoutRequests);
    mRetiredQueue.copyRequests(analysis.retiredRequests, retiredCount);
    analysis.pendingRequests = getPendingRequests();
    analysis.secondChanceCount = mMonitorThread.getSecondChanceCount();
    // No call has timed out, so there is no analysis to be done.
    if (analysis.timeoutRequests.empty())
        return analysis;
    // for now look at last timeout (in our case, the only timeout)
    const std::shared_ptr<const Request> timeout = timeoutRequests.back();

    const std::shared_ptr<const Request> timeout = analysis.timeoutRequests.back();
    analysis.timeoutTid = timeout->tid;
    if (analysis.pendingRequests.empty())
      return analysis;
    // pending Requests that are problematic.
    std::vector<std::shared_ptr<const Request>> pendingExact;
    std::vector<std::shared_ptr<const Request>> pendingPossible;
@@ -141,7 +138,7 @@ struct TimerThread::Analysis TimerThread::analyzeTimeout(
    // such as HAL write() and read().
    //
    constexpr Duration kPendingDuration = 1000ms;
    for (const auto& pending : pendingRequests) {
    for (const auto& pending : analysis.pendingRequests) {
        // If the pending tid is the same as timeout tid, problem identified.
        if (pending->tid == timeout->tid) {
            pendingExact.emplace_back(pending);
@@ -154,29 +151,27 @@ struct TimerThread::Analysis TimerThread::analyzeTimeout(
        }
    }

    struct Analysis analysis{};

    analysis.timeoutTid = timeout->tid;
    std::string& summary = analysis.summary;
    std::string& description = analysis.description;
    if (!pendingExact.empty()) {
        const auto& request = pendingExact.front();
        const bool hal = isRequestFromHal(request);

        if (hal) {
            summary = std::string("Blocked directly due to HAL call: ")
            description = std::string("Blocked directly due to HAL call: ")
                .append(request->toString());
            analysis.suspectTid= request->tid;
        }
    }
    if (summary.empty() && !pendingPossible.empty()) {
    if (description.empty() && !pendingPossible.empty()) {
        for (const auto& request : pendingPossible) {
            const bool hal = isRequestFromHal(request);
            if (hal) {
                // The first blocked call is the most likely one.
                // Recent calls might be temporarily blocked
                // calls such as write() or read() depending on kDuration.
                summary = std::string("Blocked possibly due to HAL call: ")
                description = std::string("Blocked possibly due to HAL call: ")
                    .append(request->toString());
                analysis.HALBlockedTid = request->tid;
                analysis.suspectTid= request->tid;
            }
       }
    }
+19 −0
Original line number Diff line number Diff line
@@ -16,8 +16,11 @@

#pragma once

#if defined(__linux__)
#include <signal.h>
#include <sys/syscall.h>
#include <unistd.h>
#endif

namespace android::mediautils {

@@ -31,4 +34,20 @@ inline pid_t getThreadIdWrapper() {
#endif
}

// Send an abort signal to a (linux) thread id.
inline int abortTid(int tid) {
#if defined(__linux__)
    const pid_t pid = getpid();
    siginfo_t siginfo = {
        .si_code = SI_QUEUE,
        .si_pid = pid,
        .si_uid = getuid(),
    };
    return syscall(SYS_rt_tgsigqueueinfo, pid, tid, SIGABRT, &siginfo);
#else
  errno = ENODEV;
  return -1;
#endif
}

}
+0 −1
Original line number Diff line number Diff line
@@ -123,7 +123,6 @@ class TimeCheck {
        const Duration secondChanceDuration;
        const std::chrono::system_clock::time_point startSystemTime;
        const pid_t tid;

        void onCancel(TimerThread::Handle handle) const;
        void onTimeout(TimerThread::Handle handle) const;
    };
+45 −27
Original line number Diff line number Diff line
@@ -21,9 +21,11 @@
#include <deque>
#include <functional>
#include <map>
#include <memory>
#include <mutex>
#include <string>
#include <thread>
#include <vector>

#include <android-base/thread_annotations.h>

@@ -151,7 +153,15 @@ class TimerThread {
     */
    bool cancelTask(Handle handle);

    std::string toString(size_t retiredCount = SIZE_MAX) const;
    struct SnapshotAnalysis;
    /**
     * Take a snapshot of the current state of the TimerThread and determine the
     * potential cause of a deadlock.
     * \param retiredCount The number of successfully retired calls to capture
     *                      (may be many).
     * \return See below for a description of a SnapShotAnalysis object
     */
    SnapshotAnalysis getSnapshotAnalysis(size_t retiredCount = SIZE_MAX) const;

    /**
     * Returns a string representation of the TimerThread queue.
@@ -202,7 +212,6 @@ class TimerThread {
        return s;
    }

  private:
    // To minimize movement of data, we pass around shared_ptrs to Requests.
    // These are allocated and deallocated outside of the lock.
    // TODO(b/243839867) consider options to merge Request with the
@@ -232,6 +241,40 @@ class TimerThread {
        std::string toString() const;
    };


    // SnapshotAnalysis contains info deduced by analysisTimeout().

    struct SnapshotAnalysis {
        // If we were unable to determine any applicable thread ids,
        // we leave their value as INVALID_PID.
        // Note, we use the linux thread id (not pthread), so its type is pid_t.
        static constexpr pid_t INVALID_PID = -1;
        // Description of likely issue and/or blocked method.
        // Empty if no actionable info.
        std::string description;
        // Tid of the (latest) monitored thread which has timed out.
        // This is the thread which the suspect is deduced with respect to.
        // Most often, this is the thread which an abort is being triggered
        // from.
        pid_t timeoutTid = INVALID_PID;
        // Tid of the (HAL) thread which has likely halted progress, selected
        // from pendingRequests. May be the same as timeoutTid, if the timed-out
        // thread directly called into the HAL.
        pid_t suspectTid = INVALID_PID;
        // Number of second chances given by the timer thread
        size_t secondChanceCount;
        // List of pending requests
        std::vector<std::shared_ptr<const Request>> pendingRequests;
        // List of timed-out requests
        std::vector<std::shared_ptr<const Request>> timeoutRequests;
        // List of retired requests
        std::vector<std::shared_ptr<const Request>> retiredRequests;
        // Dumps the information contained above as well as additional call
        // stacks where applicable.
        std::string toString() const;
    };

  private:
    // Deque of requests, in order of add().
    // This class is thread-safe.
    class RequestQueue {
@@ -326,36 +369,11 @@ class TimerThread {
        }
    };

    // Analysis contains info deduced by analysisTimeout().
    //
    // Summary is the result string from checking timeoutRequests to see if
    // any might be caused by blocked calls in pendingRequests.
    //
    // Summary string is empty if there is no automatic actionable info.
    //
    // timeoutTid is the tid selected from timeoutRequests (if any).
    //
    // HALBlockedTid is the tid that is blocked from pendingRequests believed
    // to cause the timeout.
    // HALBlockedTid may be INVALID_PID if no suspected tid is found,
    // and if HALBlockedTid is valid, it will not be the same as timeoutTid.
    //
    static constexpr pid_t INVALID_PID = -1;
    struct Analysis {
        std::string summary;
        pid_t timeoutTid = INVALID_PID;
        pid_t HALBlockedTid = INVALID_PID;
    };

    // A HAL method is where the substring "Hidl" is in the class name.
    // The tag should look like: ... Hidl ... :: ...
    static bool isRequestFromHal(const std::shared_ptr<const Request>& request);

    // Returns analysis from the requests.
    static Analysis analyzeTimeout(
        const std::vector<std::shared_ptr<const Request>>& timeoutRequests,
        const std::vector<std::shared_ptr<const Request>>& pendingRequests);

    std::vector<std::shared_ptr<const Request>> getPendingRequests() const;

    static constexpr size_t kRetiredQueueMax = 16;
Loading