Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit cce14202 authored by Atneya Nair's avatar Atneya Nair
Browse files

Abort correct thread on TimeCheck timeout

The current TimeCheck implementation leads to incorrect
clustering/duping based on the abort stack trace being generated from
TimeCheck (running in the watchdog thread).

Appropriately signal the timed-out thread, so we receive more accurate
stack traces.

Test: atest timecheck_tests, manual verification of correct thread stack
trace.
Bug: 239252933
Bug: 227594853

Change-Id: I5d62581682ae85af269e7e10125343f096e26303
parent 1a5246e5
Loading
Loading
Loading
Loading
+26 −9
Original line number Diff line number Diff line
@@ -14,11 +14,14 @@
 * limitations under the License.
 */

#include <csignal>
#include "mediautils/TimerThread.h"
#define LOG_TAG "TimeCheck"

#include <optional>

#include <android-base/logging.h>
#include <android-base/strings.h>
#include <audio_utils/clock.h>
#include <mediautils/EventLog.h>
#include <mediautils/FixedString.h>
@@ -27,20 +30,21 @@
#include <mediautils/TidWrapper.h>
#include <utils/Log.h>

#if defined(__BIONIC__)
#if defined(__ANDROID__)
#include "debuggerd/handler.h"
#endif


namespace android::mediautils {
// This function appropriately signals a pid to dump a backtrace if we are
// running on device.
// running on device (and the HAL exists). If we are not running on an Android
// device, there is no HAL to signal (so we do nothing).
static inline void signalAudioHAL([[maybe_unused]] pid_t pid) {
#if defined(__BIONIC__)
#if defined(__ANDROID__)
    sigqueue(pid, DEBUGGER_SIGNAL, {.sival_int = 0});
#endif
}

namespace android::mediautils {

/**
 * Returns the std::string "HH:MM:SS.MSc" from a system_clock time_point.
 */
@@ -148,7 +152,7 @@ TimerThread& TimeCheck::getTimeCheckThread() {
std::string TimeCheck::toString() {
    // note pending and retired are individually locked for maximum concurrency,
    // snapshot is not instantaneous at a single time.
    return getTimeCheckThread().toString();
    return getTimeCheckThread().getSnapshotAnalysis().toString();
}

TimeCheck::TimeCheck(std::string_view tag, OnTimerFunc&& onTimer, Duration requestedTimeoutDuration,
@@ -253,7 +257,7 @@ void TimeCheck::TimeCheckHandler::onTimeout(TimerThread::Handle timerHandle) con

    // Generate the TimerThread summary string early before sending signals to the
    // HAL processes which can affect thread behavior.
    const std::string summary = getTimeCheckThread().toString(4 /* retiredCount */);
    const auto snapshotAnalysis = getTimeCheckThread().getSnapshotAnalysis(4 /* retiredCount */);

    // Generate audio HAL processes tombstones and allow time to complete
    // before forcing restart
@@ -281,7 +285,7 @@ void TimeCheck::TimeCheckHandler::onTimeout(TimerThread::Handle timerHandle) con
            .append(analyzeTimeouts(requestedTimeoutMs + secondChanceMs,
                    elapsedSteadyMs, elapsedSystemMs)).append("\n")
            .append(halPids).append("\n")
            .append(summary);
            .append(snapshotAnalysis.toString());

    // Note: LOG_ALWAYS_FATAL limits the size of the string - per log/log.h:
    // Log message text may be truncated to less than an
@@ -291,7 +295,20 @@ void TimeCheck::TimeCheckHandler::onTimeout(TimerThread::Handle timerHandle) con
    // to avoid the size limitation. LOG(FATAL) does an abort whereas
    // LOG(FATAL_WITHOUT_ABORT) does not abort.

    LOG(FATAL) << abortMessage;
    static constexpr pid_t invalidPid = TimerThread::SnapshotAnalysis::INVALID_PID;
    pid_t tidToAbort = invalidPid;
    if (snapshotAnalysis.suspectTid != invalidPid) {
        tidToAbort = snapshotAnalysis.suspectTid;
    } else if (snapshotAnalysis.timeoutTid != invalidPid) {
        tidToAbort = snapshotAnalysis.timeoutTid;
    }

    LOG(FATAL_WITHOUT_ABORT) << abortMessage;
    const auto ret = abortTid(tidToAbort);
    if (ret < 0) {
        LOG(FATAL) << "TimeCheck thread signal failed, aborting process. "
                       "errno: " << errno << base::ErrnoNumberAsString(errno);
    }
}

// Automatically create a TimeCheck class for a class and method.
+36 −41
Original line number Diff line number Diff line
@@ -59,39 +59,29 @@ bool TimerThread::cancelTask(Handle handle) {
    return true;
}

std::string TimerThread::toString(size_t retiredCount) const {

std::string TimerThread::SnapshotAnalysis::toString() const {
    // Note: These request queues are snapshot very close together but
    // not at "identical" times as we don't use a class-wide lock.

    std::vector<std::shared_ptr<const Request>> timeoutRequests;
    std::vector<std::shared_ptr<const Request>> retiredRequests;
    mTimeoutQueue.copyRequests(timeoutRequests);
    mRetiredQueue.copyRequests(retiredRequests, retiredCount);
    std::vector<std::shared_ptr<const Request>> pendingRequests =
        getPendingRequests();

    struct Analysis analysis = analyzeTimeout(timeoutRequests, pendingRequests);
    std::string analysisSummary;
    if (!analysis.summary.empty()) {
        analysisSummary = std::string("\nanalysis [ ").append(analysis.summary).append(" ]");
    }
    std::string analysisSummary = std::string("\nanalysis [ ").append(description).append(" ]");
    std::string timeoutStack;
    if (analysis.timeoutTid != -1) {
        timeoutStack = std::string("\ntimeout(")
                .append(std::to_string(analysis.timeoutTid)).append(") callstack [\n")
                .append(getCallStackStringForTid(analysis.timeoutTid)).append("]");
    }
    std::string blockedStack;
    if (analysis.HALBlockedTid != -1) {
    if (timeoutTid != -1) {
        timeoutStack = std::string(suspectTid == timeoutTid ? "\ntimeout/blocked(" : "\ntimeout(")
                .append(std::to_string(timeoutTid)).append(") callstack [\n")
                .append(getCallStackStringForTid(timeoutTid)).append("]");
    }

    if (suspectTid != -1 && suspectTid != timeoutTid) {
        blockedStack = std::string("\nblocked(")
                .append(std::to_string(analysis.HALBlockedTid)).append(")  callstack [\n")
                .append(getCallStackStringForTid(analysis.HALBlockedTid)).append("]");
                .append(std::to_string(suspectTid)).append(")  callstack [\n")
                .append(getCallStackStringForTid(suspectTid)).append("]");
    }

    return std::string("now ")
            .append(formatTime(std::chrono::system_clock::now()))
            .append("\nsecondChanceCount ")
            .append(std::to_string(mMonitorThread.getSecondChanceCount()))
            .append(std::to_string(secondChanceCount))
            .append(analysisSummary)
            .append("\ntimeout [ ")
            .append(requestsToString(timeoutRequests))
@@ -121,16 +111,23 @@ bool TimerThread::isRequestFromHal(const std::shared_ptr<const Request>& request
    return separatorPos != std::string::npos;
}

/* static */
struct TimerThread::Analysis TimerThread::analyzeTimeout(
    const std::vector<std::shared_ptr<const Request>>& timeoutRequests,
    const std::vector<std::shared_ptr<const Request>>& pendingRequests) {

    if (timeoutRequests.empty() || pendingRequests.empty()) return {}; // nothing to say.

struct TimerThread::SnapshotAnalysis TimerThread::getSnapshotAnalysis(size_t retiredCount) const {
    struct SnapshotAnalysis analysis{};
    // The following snapshot of the TimerThread state will be utilized for
    // analysis. Note, there is no lock around these calls, so there could be
    // a state update between them.
    mTimeoutQueue.copyRequests(analysis.timeoutRequests);
    mRetiredQueue.copyRequests(analysis.retiredRequests, retiredCount);
    analysis.pendingRequests = getPendingRequests();
    analysis.secondChanceCount = mMonitorThread.getSecondChanceCount();
    // No call has timed out, so there is no analysis to be done.
    if (analysis.timeoutRequests.empty())
        return analysis;
    // for now look at last timeout (in our case, the only timeout)
    const std::shared_ptr<const Request> timeout = timeoutRequests.back();

    const std::shared_ptr<const Request> timeout = analysis.timeoutRequests.back();
    analysis.timeoutTid = timeout->tid;
    if (analysis.pendingRequests.empty())
      return analysis;
    // pending Requests that are problematic.
    std::vector<std::shared_ptr<const Request>> pendingExact;
    std::vector<std::shared_ptr<const Request>> pendingPossible;
@@ -141,7 +138,7 @@ struct TimerThread::Analysis TimerThread::analyzeTimeout(
    // such as HAL write() and read().
    //
    constexpr Duration kPendingDuration = 1000ms;
    for (const auto& pending : pendingRequests) {
    for (const auto& pending : analysis.pendingRequests) {
        // If the pending tid is the same as timeout tid, problem identified.
        if (pending->tid == timeout->tid) {
            pendingExact.emplace_back(pending);
@@ -154,29 +151,27 @@ struct TimerThread::Analysis TimerThread::analyzeTimeout(
        }
    }

    struct Analysis analysis{};

    analysis.timeoutTid = timeout->tid;
    std::string& summary = analysis.summary;
    std::string& description = analysis.description;
    if (!pendingExact.empty()) {
        const auto& request = pendingExact.front();
        const bool hal = isRequestFromHal(request);

        if (hal) {
            summary = std::string("Blocked directly due to HAL call: ")
            description = std::string("Blocked directly due to HAL call: ")
                .append(request->toString());
            analysis.suspectTid= request->tid;
        }
    }
    if (summary.empty() && !pendingPossible.empty()) {
    if (description.empty() && !pendingPossible.empty()) {
        for (const auto& request : pendingPossible) {
            const bool hal = isRequestFromHal(request);
            if (hal) {
                // The first blocked call is the most likely one.
                // Recent calls might be temporarily blocked
                // calls such as write() or read() depending on kDuration.
                summary = std::string("Blocked possibly due to HAL call: ")
                description = std::string("Blocked possibly due to HAL call: ")
                    .append(request->toString());
                analysis.HALBlockedTid = request->tid;
                analysis.suspectTid= request->tid;
            }
       }
    }
+19 −0
Original line number Diff line number Diff line
@@ -16,8 +16,11 @@

#pragma once

#if defined(__linux__)
#include <signal.h>
#include <sys/syscall.h>
#include <unistd.h>
#endif

namespace android::mediautils {

@@ -31,4 +34,20 @@ inline pid_t getThreadIdWrapper() {
#endif
}

// Send an abort signal to a (linux) thread id.
inline int abortTid(int tid) {
#if defined(__linux__)
    const pid_t pid = getpid();
    siginfo_t siginfo = {
        .si_code = SI_QUEUE,
        .si_pid = pid,
        .si_uid = getuid(),
    };
    return syscall(SYS_rt_tgsigqueueinfo, pid, tid, SIGABRT, &siginfo);
#else
  errno = ENODEV;
  return -1;
#endif
}

}
+0 −1
Original line number Diff line number Diff line
@@ -123,7 +123,6 @@ class TimeCheck {
        const Duration secondChanceDuration;
        const std::chrono::system_clock::time_point startSystemTime;
        const pid_t tid;

        void onCancel(TimerThread::Handle handle) const;
        void onTimeout(TimerThread::Handle handle) const;
    };
+45 −27
Original line number Diff line number Diff line
@@ -21,9 +21,11 @@
#include <deque>
#include <functional>
#include <map>
#include <memory>
#include <mutex>
#include <string>
#include <thread>
#include <vector>

#include <android-base/thread_annotations.h>

@@ -151,7 +153,15 @@ class TimerThread {
     */
    bool cancelTask(Handle handle);

    std::string toString(size_t retiredCount = SIZE_MAX) const;
    struct SnapshotAnalysis;
    /**
     * Take a snapshot of the current state of the TimerThread and determine the
     * potential cause of a deadlock.
     * \param retiredCount The number of successfully retired calls to capture
     *                      (may be many).
     * \return See below for a description of a SnapShotAnalysis object
     */
    SnapshotAnalysis getSnapshotAnalysis(size_t retiredCount = SIZE_MAX) const;

    /**
     * Returns a string representation of the TimerThread queue.
@@ -202,7 +212,6 @@ class TimerThread {
        return s;
    }

  private:
    // To minimize movement of data, we pass around shared_ptrs to Requests.
    // These are allocated and deallocated outside of the lock.
    // TODO(b/243839867) consider options to merge Request with the
@@ -232,6 +241,40 @@ class TimerThread {
        std::string toString() const;
    };


    // SnapshotAnalysis contains info deduced by analysisTimeout().

    struct SnapshotAnalysis {
        // If we were unable to determine any applicable thread ids,
        // we leave their value as INVALID_PID.
        // Note, we use the linux thread id (not pthread), so its type is pid_t.
        static constexpr pid_t INVALID_PID = -1;
        // Description of likely issue and/or blocked method.
        // Empty if no actionable info.
        std::string description;
        // Tid of the (latest) monitored thread which has timed out.
        // This is the thread which the suspect is deduced with respect to.
        // Most often, this is the thread which an abort is being triggered
        // from.
        pid_t timeoutTid = INVALID_PID;
        // Tid of the (HAL) thread which has likely halted progress, selected
        // from pendingRequests. May be the same as timeoutTid, if the timed-out
        // thread directly called into the HAL.
        pid_t suspectTid = INVALID_PID;
        // Number of second chances given by the timer thread
        size_t secondChanceCount;
        // List of pending requests
        std::vector<std::shared_ptr<const Request>> pendingRequests;
        // List of timed-out requests
        std::vector<std::shared_ptr<const Request>> timeoutRequests;
        // List of retired requests
        std::vector<std::shared_ptr<const Request>> retiredRequests;
        // Dumps the information contained above as well as additional call
        // stacks where applicable.
        std::string toString() const;
    };

  private:
    // Deque of requests, in order of add().
    // This class is thread-safe.
    class RequestQueue {
@@ -326,36 +369,11 @@ class TimerThread {
        }
    };

    // Analysis contains info deduced by analysisTimeout().
    //
    // Summary is the result string from checking timeoutRequests to see if
    // any might be caused by blocked calls in pendingRequests.
    //
    // Summary string is empty if there is no automatic actionable info.
    //
    // timeoutTid is the tid selected from timeoutRequests (if any).
    //
    // HALBlockedTid is the tid that is blocked from pendingRequests believed
    // to cause the timeout.
    // HALBlockedTid may be INVALID_PID if no suspected tid is found,
    // and if HALBlockedTid is valid, it will not be the same as timeoutTid.
    //
    static constexpr pid_t INVALID_PID = -1;
    struct Analysis {
        std::string summary;
        pid_t timeoutTid = INVALID_PID;
        pid_t HALBlockedTid = INVALID_PID;
    };

    // A HAL method is where the substring "Hidl" is in the class name.
    // The tag should look like: ... Hidl ... :: ...
    static bool isRequestFromHal(const std::shared_ptr<const Request>& request);

    // Returns analysis from the requests.
    static Analysis analyzeTimeout(
        const std::vector<std::shared_ptr<const Request>>& timeoutRequests,
        const std::vector<std::shared_ptr<const Request>>& pendingRequests);

    std::vector<std::shared_ptr<const Request>> getPendingRequests() const;

    static constexpr size_t kRetiredQueueMax = 16;
Loading