Merge "Abort correct thread on TimeCheck timeout" (6e2763fb) · Commits · e / os / android_frameworks_av

media/utils/TimeCheck.cpp

+26 −9

Original line number	Diff line number	Diff line
		@@ -14,11 +14,14 @@
		* limitations under the License.
		*/

		#include <csignal>
		#include "mediautils/TimerThread.h"
		#define LOG_TAG "TimeCheck"

		#include <optional>

		#include <android-base/logging.h>
		#include <android-base/strings.h>
		#include <audio_utils/clock.h>
		#include <mediautils/EventLog.h>
		#include <mediautils/FixedString.h>
		@@ -27,20 +30,21 @@
		#include <mediautils/TidWrapper.h>
		#include <utils/Log.h>

		#if defined(__BIONIC__)
		#if defined(__ANDROID__)
		#include "debuggerd/handler.h"
		#endif


		namespace android::mediautils {
		// This function appropriately signals a pid to dump a backtrace if we are
		// running on device.
		// running on device (and the HAL exists). If we are not running on an Android
		// device, there is no HAL to signal (so we do nothing).
		static inline void signalAudioHAL([[maybe_unused]] pid_t pid) {
		#if defined(__BIONIC__)
		#if defined(__ANDROID__)
		sigqueue(pid, DEBUGGER_SIGNAL, {.sival_int = 0});
		#endif
		}

		namespace android::mediautils {

		/**
		* Returns the std::string "HH:MM:SS.MSc" from a system_clock time_point.
		*/
		@@ -148,7 +152,7 @@ TimerThread& TimeCheck::getTimeCheckThread() {
		std::string TimeCheck::toString() {
		// note pending and retired are individually locked for maximum concurrency,
		// snapshot is not instantaneous at a single time.
		return getTimeCheckThread().toString();
		return getTimeCheckThread().getSnapshotAnalysis().toString();
		}

		TimeCheck::TimeCheck(std::string_view tag, OnTimerFunc&& onTimer, Duration requestedTimeoutDuration,
		@@ -253,7 +257,7 @@ void TimeCheck::TimeCheckHandler::onTimeout(TimerThread::Handle timerHandle) con

		// Generate the TimerThread summary string early before sending signals to the
		// HAL processes which can affect thread behavior.
		const std::string summary = getTimeCheckThread().toString(4 /* retiredCount */);
		const auto snapshotAnalysis = getTimeCheckThread().getSnapshotAnalysis(4 /* retiredCount */);

		// Generate audio HAL processes tombstones and allow time to complete
		// before forcing restart
		@@ -281,7 +285,7 @@ void TimeCheck::TimeCheckHandler::onTimeout(TimerThread::Handle timerHandle) con
		.append(analyzeTimeouts(requestedTimeoutMs + secondChanceMs,
		elapsedSteadyMs, elapsedSystemMs)).append("\n")
		.append(halPids).append("\n")
		.append(summary);
		.append(snapshotAnalysis.toString());

		// Note: LOG_ALWAYS_FATAL limits the size of the string - per log/log.h:
		// Log message text may be truncated to less than an
		@@ -291,7 +295,20 @@ void TimeCheck::TimeCheckHandler::onTimeout(TimerThread::Handle timerHandle) con
		// to avoid the size limitation. LOG(FATAL) does an abort whereas
		// LOG(FATAL_WITHOUT_ABORT) does not abort.

		LOG(FATAL) << abortMessage;
		static constexpr pid_t invalidPid = TimerThread::SnapshotAnalysis::INVALID_PID;
		pid_t tidToAbort = invalidPid;
		if (snapshotAnalysis.suspectTid != invalidPid) {
		tidToAbort = snapshotAnalysis.suspectTid;
		} else if (snapshotAnalysis.timeoutTid != invalidPid) {
		tidToAbort = snapshotAnalysis.timeoutTid;
		}

		LOG(FATAL_WITHOUT_ABORT) << abortMessage;
		const auto ret = abortTid(tidToAbort);
		if (ret < 0) {
		LOG(FATAL) << "TimeCheck thread signal failed, aborting process. "
		"errno: " << errno << base::ErrnoNumberAsString(errno);
		}
		}

		// Automatically create a TimeCheck class for a class and method.

media/utils/TimerThread.cpp

+36 −41

Original line number	Diff line number	Diff line
		@@ -59,39 +59,29 @@ bool TimerThread::cancelTask(Handle handle) {
		return true;
		}

		std::string TimerThread::toString(size_t retiredCount) const {

		std::string TimerThread::SnapshotAnalysis::toString() const {
		// Note: These request queues are snapshot very close together but
		// not at "identical" times as we don't use a class-wide lock.

		std::vector<std::shared_ptr<const Request>> timeoutRequests;
		std::vector<std::shared_ptr<const Request>> retiredRequests;
		mTimeoutQueue.copyRequests(timeoutRequests);
		mRetiredQueue.copyRequests(retiredRequests, retiredCount);
		std::vector<std::shared_ptr<const Request>> pendingRequests =
		getPendingRequests();

		struct Analysis analysis = analyzeTimeout(timeoutRequests, pendingRequests);
		std::string analysisSummary;
		if (!analysis.summary.empty()) {
		analysisSummary = std::string("\nanalysis [ ").append(analysis.summary).append(" ]");
		}
		std::string analysisSummary = std::string("\nanalysis [ ").append(description).append(" ]");
		std::string timeoutStack;
		if (analysis.timeoutTid != -1) {
		timeoutStack = std::string("\ntimeout(")
		.append(std::to_string(analysis.timeoutTid)).append(") callstack [\n")
		.append(getCallStackStringForTid(analysis.timeoutTid)).append("]");
		}
		std::string blockedStack;
		if (analysis.HALBlockedTid != -1) {
		if (timeoutTid != -1) {
		timeoutStack = std::string(suspectTid == timeoutTid ? "\ntimeout/blocked(" : "\ntimeout(")
		.append(std::to_string(timeoutTid)).append(") callstack [\n")
		.append(getCallStackStringForTid(timeoutTid)).append("]");
		}

		if (suspectTid != -1 && suspectTid != timeoutTid) {
		blockedStack = std::string("\nblocked(")
		.append(std::to_string(analysis.HALBlockedTid)).append(") callstack [\n")
		.append(getCallStackStringForTid(analysis.HALBlockedTid)).append("]");
		.append(std::to_string(suspectTid)).append(") callstack [\n")
		.append(getCallStackStringForTid(suspectTid)).append("]");
		}

		return std::string("now ")
		.append(formatTime(std::chrono::system_clock::now()))
		.append("\nsecondChanceCount ")
		.append(std::to_string(mMonitorThread.getSecondChanceCount()))
		.append(std::to_string(secondChanceCount))
		.append(analysisSummary)
		.append("\ntimeout [ ")
		.append(requestsToString(timeoutRequests))
		@@ -121,16 +111,23 @@ bool TimerThread::isRequestFromHal(const std::shared_ptr<const Request>& request
		return separatorPos != std::string::npos;
		}

		/* static */
		struct TimerThread::Analysis TimerThread::analyzeTimeout(
		const std::vector<std::shared_ptr<const Request>>& timeoutRequests,
		const std::vector<std::shared_ptr<const Request>>& pendingRequests) {

		if (timeoutRequests.empty() \|\| pendingRequests.empty()) return {}; // nothing to say.

		struct TimerThread::SnapshotAnalysis TimerThread::getSnapshotAnalysis(size_t retiredCount) const {
		struct SnapshotAnalysis analysis{};
		// The following snapshot of the TimerThread state will be utilized for
		// analysis. Note, there is no lock around these calls, so there could be
		// a state update between them.
		mTimeoutQueue.copyRequests(analysis.timeoutRequests);
		mRetiredQueue.copyRequests(analysis.retiredRequests, retiredCount);
		analysis.pendingRequests = getPendingRequests();
		analysis.secondChanceCount = mMonitorThread.getSecondChanceCount();
		// No call has timed out, so there is no analysis to be done.
		if (analysis.timeoutRequests.empty())
		return analysis;
		// for now look at last timeout (in our case, the only timeout)
		const std::shared_ptr<const Request> timeout = timeoutRequests.back();

		const std::shared_ptr<const Request> timeout = analysis.timeoutRequests.back();
		analysis.timeoutTid = timeout->tid;
		if (analysis.pendingRequests.empty())
		return analysis;
		// pending Requests that are problematic.
		std::vector<std::shared_ptr<const Request>> pendingExact;
		std::vector<std::shared_ptr<const Request>> pendingPossible;
		@@ -141,7 +138,7 @@ struct TimerThread::Analysis TimerThread::analyzeTimeout(
		// such as HAL write() and read().
		//
		constexpr Duration kPendingDuration = 1000ms;
		for (const auto& pending : pendingRequests) {
		for (const auto& pending : analysis.pendingRequests) {
		// If the pending tid is the same as timeout tid, problem identified.
		if (pending->tid == timeout->tid) {
		pendingExact.emplace_back(pending);
		@@ -154,29 +151,27 @@ struct TimerThread::Analysis TimerThread::analyzeTimeout(
		}
		}

		struct Analysis analysis{};

		analysis.timeoutTid = timeout->tid;
		std::string& summary = analysis.summary;
		std::string& description = analysis.description;
		if (!pendingExact.empty()) {
		const auto& request = pendingExact.front();
		const bool hal = isRequestFromHal(request);

		if (hal) {
		summary = std::string("Blocked directly due to HAL call: ")
		description = std::string("Blocked directly due to HAL call: ")
		.append(request->toString());
		analysis.suspectTid= request->tid;
		}
		}
		if (summary.empty() && !pendingPossible.empty()) {
		if (description.empty() && !pendingPossible.empty()) {
		for (const auto& request : pendingPossible) {
		const bool hal = isRequestFromHal(request);
		if (hal) {
		// The first blocked call is the most likely one.
		// Recent calls might be temporarily blocked
		// calls such as write() or read() depending on kDuration.
		summary = std::string("Blocked possibly due to HAL call: ")
		description = std::string("Blocked possibly due to HAL call: ")
		.append(request->toString());
		analysis.HALBlockedTid = request->tid;
		analysis.suspectTid= request->tid;
		}
		}
		}

media/utils/include/mediautils/TidWrapper.h

+19 −0

Original line number	Diff line number	Diff line
		@@ -16,8 +16,11 @@

		#pragma once

		#if defined(__linux__)
		#include <signal.h>
		#include <sys/syscall.h>
		#include <unistd.h>
		#endif

		namespace android::mediautils {

		@@ -31,4 +34,20 @@ inline pid_t getThreadIdWrapper() {
		#endif
		}

		// Send an abort signal to a (linux) thread id.
		inline int abortTid(int tid) {
		#if defined(__linux__)
		const pid_t pid = getpid();
		siginfo_t siginfo = {
		.si_code = SI_QUEUE,
		.si_pid = pid,
		.si_uid = getuid(),
		};
		return syscall(SYS_rt_tgsigqueueinfo, pid, tid, SIGABRT, &siginfo);
		#else
		errno = ENODEV;
		return -1;
		#endif
		}

		}

media/utils/include/mediautils/TimeCheck.h

+0 −1

Original line number	Diff line number	Diff line
		@@ -123,7 +123,6 @@ class TimeCheck {
		const Duration secondChanceDuration;
		const std::chrono::system_clock::time_point startSystemTime;
		const pid_t tid;

		void onCancel(TimerThread::Handle handle) const;
		void onTimeout(TimerThread::Handle handle) const;
		};

media/utils/include/mediautils/TimerThread.h

+45 −27

Original line number	Diff line number	Diff line
		@@ -21,9 +21,11 @@
		#include <deque>
		#include <functional>
		#include <map>
		#include <memory>
		#include <mutex>
		#include <string>
		#include <thread>
		#include <vector>

		#include <android-base/thread_annotations.h>

		@@ -151,7 +153,15 @@ class TimerThread {
		*/
		bool cancelTask(Handle handle);

		std::string toString(size_t retiredCount = SIZE_MAX) const;
		struct SnapshotAnalysis;
		/**
		* Take a snapshot of the current state of the TimerThread and determine the
		* potential cause of a deadlock.
		* \param retiredCount The number of successfully retired calls to capture
		* (may be many).
		* \return See below for a description of a SnapShotAnalysis object
		*/
		SnapshotAnalysis getSnapshotAnalysis(size_t retiredCount = SIZE_MAX) const;

		/**
		* Returns a string representation of the TimerThread queue.
		@@ -202,7 +212,6 @@ class TimerThread {
		return s;
		}

		private:
		// To minimize movement of data, we pass around shared_ptrs to Requests.
		// These are allocated and deallocated outside of the lock.
		// TODO(b/243839867) consider options to merge Request with the
		@@ -232,6 +241,40 @@ class TimerThread {
		std::string toString() const;
		};


		// SnapshotAnalysis contains info deduced by analysisTimeout().

		struct SnapshotAnalysis {
		// If we were unable to determine any applicable thread ids,
		// we leave their value as INVALID_PID.
		// Note, we use the linux thread id (not pthread), so its type is pid_t.
		static constexpr pid_t INVALID_PID = -1;
		// Description of likely issue and/or blocked method.
		// Empty if no actionable info.
		std::string description;
		// Tid of the (latest) monitored thread which has timed out.
		// This is the thread which the suspect is deduced with respect to.
		// Most often, this is the thread which an abort is being triggered
		// from.
		pid_t timeoutTid = INVALID_PID;
		// Tid of the (HAL) thread which has likely halted progress, selected
		// from pendingRequests. May be the same as timeoutTid, if the timed-out
		// thread directly called into the HAL.
		pid_t suspectTid = INVALID_PID;
		// Number of second chances given by the timer thread
		size_t secondChanceCount;
		// List of pending requests
		std::vector<std::shared_ptr<const Request>> pendingRequests;
		// List of timed-out requests
		std::vector<std::shared_ptr<const Request>> timeoutRequests;
		// List of retired requests
		std::vector<std::shared_ptr<const Request>> retiredRequests;
		// Dumps the information contained above as well as additional call
		// stacks where applicable.
		std::string toString() const;
		};

		private:
		// Deque of requests, in order of add().
		// This class is thread-safe.
		class RequestQueue {
		@@ -326,36 +369,11 @@ class TimerThread {
		}
		};

		// Analysis contains info deduced by analysisTimeout().
		//
		// Summary is the result string from checking timeoutRequests to see if
		// any might be caused by blocked calls in pendingRequests.
		//
		// Summary string is empty if there is no automatic actionable info.
		//
		// timeoutTid is the tid selected from timeoutRequests (if any).
		//
		// HALBlockedTid is the tid that is blocked from pendingRequests believed
		// to cause the timeout.
		// HALBlockedTid may be INVALID_PID if no suspected tid is found,
		// and if HALBlockedTid is valid, it will not be the same as timeoutTid.
		//
		static constexpr pid_t INVALID_PID = -1;
		struct Analysis {
		std::string summary;
		pid_t timeoutTid = INVALID_PID;
		pid_t HALBlockedTid = INVALID_PID;
		};

		// A HAL method is where the substring "Hidl" is in the class name.
		// The tag should look like: ... Hidl ... :: ...
		static bool isRequestFromHal(const std::shared_ptr<const Request>& request);

		// Returns analysis from the requests.
		static Analysis analyzeTimeout(
		const std::vector<std::shared_ptr<const Request>>& timeoutRequests,
		const std::vector<std::shared_ptr<const Request>>& pendingRequests);

		std::vector<std::shared_ptr<const Request>> getPendingRequests() const;

		static constexpr size_t kRetiredQueueMax = 16;