Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit f45f34cd authored by Andy Hung's avatar Andy Hung
Browse files

TimerThread: Add timeout analysis logic

Automatically dumps TimeCheck analysis if
there is an actionable blocked binder call.

Test: dedicated "crasher" test, check logcat
Test: adb shell dumpsys media.audio_flinger
Bug: 219958414
Change-Id: Ie21ef8940404fa7174938277bbf9c7aab2817ef7
parent 18b6ea1a
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -197,6 +197,7 @@ void TimeCheck::TimeCheckHandler::onTimeout() const
    } else {
        ALOGI("No HAL process pid available, skipping tombstones");
    }

    LOG_EVENT_STRING(LOGTAG_AUDIO_BINDER_TIMEOUT, tag.c_str());
    LOG_ALWAYS_FATAL("TimeCheck timeout for %s scheduled %s on thread %d\n%s",
            tag.c_str(), formatTime(startTime).c_str(), tid, summary.c_str());
+100 −5
Original line number Diff line number Diff line
@@ -53,17 +53,112 @@ bool TimerThread::cancelTask(Handle handle) {
}

std::string TimerThread::toString(size_t retiredCount) const {
    // Note: These request queues are snapshot very close together but
    // not at "identical" times as we don't use a class-wide lock.

    std::vector<std::shared_ptr<const Request>> timeoutRequests;
    std::vector<std::shared_ptr<const Request>> retiredRequests;
    mTimeoutQueue.copyRequests(timeoutRequests);
    mRetiredQueue.copyRequests(retiredRequests, retiredCount);
    std::vector<std::shared_ptr<const Request>> pendingRequests =
        getPendingRequests();

    struct Analysis analysis = analyzeTimeout(timeoutRequests, pendingRequests);
    std::string analysisSummary;
    if (!analysis.summary.empty()) {
        analysisSummary = std::string("\nanalysis [ ").append(analysis.summary).append(" ]");
    }
    return std::string("now ")
            .append(formatTime(std::chrono::system_clock::now()))
            .append("\npending [ ")
            .append(pendingToString())
            .append(" ]\ntimeout [ ")
            .append(timeoutToString())
            .append(analysisSummary)
            .append("\ntimeout [ ")
            .append(requestsToString(timeoutRequests))
            .append(" ]\npending [ ")
            .append(requestsToString(pendingRequests))
            .append(" ]\nretired [ ")
            .append(retiredToString(retiredCount))
            .append(requestsToString(retiredRequests))
            .append(" ]");
}

// A HAL method is where the substring "Hidl" is in the class name.
// The tag should look like: ... Hidl ... :: ...
// When the audio HAL is updated to AIDL perhaps we will use instead
// a global directory of HAL classes.
//
// See MethodStatistics.cpp:
// mediautils::getStatisticsClassesForModule(METHOD_STATISTICS_MODULE_NAME_AUDIO_HIDL)
//
/* static */
bool TimerThread::isRequestFromHal(const std::shared_ptr<const Request>& request) {
    const size_t hidlPos = request->tag.find("Hidl");
    if (hidlPos == std::string::npos) return false;
    // should be a separator afterwards Hidl which indicates the string was in the class.
    const size_t separatorPos = request->tag.find("::", hidlPos);
    return separatorPos != std::string::npos;
}

/* static */
struct TimerThread::Analysis TimerThread::analyzeTimeout(
    const std::vector<std::shared_ptr<const Request>>& timeoutRequests,
    const std::vector<std::shared_ptr<const Request>>& pendingRequests) {

    if (timeoutRequests.empty() || pendingRequests.empty()) return {}; // nothing to say.

    // for now look at last timeout (in our case, the only timeout)
    const std::shared_ptr<const Request> timeout = timeoutRequests.back();

    // pending Requests that are problematic.
    std::vector<std::shared_ptr<const Request>> pendingExact;
    std::vector<std::shared_ptr<const Request>> pendingPossible;

    // We look at pending requests that were scheduled no later than kDuration
    // after the timeout request. This prevents false matches with calls
    // that naturally block for a short period of time
    // such as HAL write() and read().
    //
    auto constexpr kDuration = std::chrono::milliseconds(1000);
    for (const auto& pending : pendingRequests) {
        // If the pending tid is the same as timeout tid, problem identified.
        if (pending->tid == timeout->tid) {
            pendingExact.emplace_back(pending);
            continue;
        }

        // if the pending tid is scheduled within time limit
        if (pending->scheduled - timeout->scheduled < kDuration) {
            pendingPossible.emplace_back(pending);
        }
    }

    struct Analysis analysis{};

    analysis.timeoutTid = timeout->tid;
    std::string& summary = analysis.summary;
    if (!pendingExact.empty()) {
        const auto& request = pendingExact.front();
        const bool hal = isRequestFromHal(request);

        if (hal) {
            summary = std::string("Blocked directly due to HAL call: ")
                .append(request->toString());
        }
    }
    if (summary.empty() && !pendingPossible.empty()) {
        for (const auto& request : pendingPossible) {
            const bool hal = isRequestFromHal(request);
            if (hal) {
                // The first blocked call is the most likely one.
                // Recent calls might be temporarily blocked
                // calls such as write() or read() depending on kDuration.
                summary = std::string("Blocked possibly due to HAL call: ")
                    .append(request->toString());
                analysis.HALBlockedTid = request->tid;
            }
       }
    }
    return analysis;
}

std::vector<std::shared_ptr<const TimerThread::Request>> TimerThread::getPendingRequests() const {
    constexpr size_t kEstimatedPendingRequests = 8;  // approx 128 byte alloc.
    std::vector<std::shared_ptr<const Request>> pendingRequests;
+30 −0
Original line number Diff line number Diff line
@@ -212,6 +212,36 @@ class TimerThread {
        void copyRequests(std::vector<std::shared_ptr<const Request>>& requests) const;
    };

    // Analysis contains info deduced by analysisTimeout().
    //
    // Summary is the result string from checking timeoutRequests to see if
    // any might be caused by blocked calls in pendingRequests.
    //
    // Summary string is empty if there is no automatic actionable info.
    //
    // timeoutTid is the tid selected from timeoutRequests (if any).
    //
    // HALBlockedTid is the tid that is blocked from pendingRequests believed
    // to cause the timeout.
    // HALBlockedTid may be INVALID_PID if no suspected tid is found,
    // and if HALBlockedTid is valid, it will not be the same as timeoutTid.
    //
    static constexpr pid_t INVALID_PID = -1;
    struct Analysis {
        std::string summary;
        pid_t timeoutTid = INVALID_PID;
        pid_t HALBlockedTid = INVALID_PID;
    };

    // A HAL method is where the substring "Hidl" is in the class name.
    // The tag should look like: ... Hidl ... :: ...
    static bool isRequestFromHal(const std::shared_ptr<const Request>& request);

    // Returns analysis from the requests.
    static Analysis analyzeTimeout(
        const std::vector<std::shared_ptr<const Request>>& timeoutRequests,
        const std::vector<std::shared_ptr<const Request>>& pendingRequests);

    std::vector<std::shared_ptr<const Request>> getPendingRequests() const;

    // A no-timeout request is represented by a handles at the end of steady_clock time,