Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit a2c7a637 authored by Paul Thomson's avatar Paul Thomson Committed by Android (Google) Code Review
Browse files

Merge changes from topic "2022-02-24-gpu-work" into tm-dev

* changes:
  Update GpuWorkTracepointTest
  GpuWork: fix gpu_work_period tracepoint
parents bf116aef 037257fc
Loading
Loading
Loading
Loading
+173 −135
Original line number Diff line number Diff line
@@ -39,17 +39,30 @@
#include <map>
#include <mutex>
#include <unordered_map>
#include <unordered_set>
#include <vector>

#include "gpuwork/gpu_work.h"

// Nanoseconds per millisecond, used when converting ns durations to ms.
#define MS_IN_NS (1000000)
// One millisecond expressed in nanoseconds.
// Fix: this was (10000000), which is 10 ms, so every ns-to-ms conversion that
// divided by this constant (e.g. total_active_duration_ms) was 10x too small.
#define ONE_MS_IN_NS (1000000)

namespace android {
namespace gpuwork {

namespace {

// Strict weak ordering for |GpuIdUid| keys: orders by GPU ID first, breaking
// ties with the UID. Used to keep dump output sorted.
bool lessThanGpuIdUid(const android::gpuwork::GpuIdUid& l, const android::gpuwork::GpuIdUid& r) {
    if (l.gpu_id != r.gpu_id) {
        return l.gpu_id < r.gpu_id;
    }
    return l.uid < r.uid;
}

// Hash for |GpuIdUid| keys: packs (gpu_id, uid) into a single 64-bit value and
// applies a splitmix64-style finalizer so both fields influence every bit of
// the result. The previous hash, (gpu_id << 5) + uid, placed gpu_id's
// contribution in only a few low bits, so keys such as (g, u) and
// (g - 1, u + 32) collided systematically.
size_t hashGpuIdUid(const android::gpuwork::GpuIdUid& gpuIdUid) {
    uint64_t h = (static_cast<uint64_t>(gpuIdUid.gpu_id) << 32U) | gpuIdUid.uid;
    h ^= h >> 33U;
    h *= 0xff51afd7ed558ccdULL;
    h ^= h >> 33U;
    h *= 0xc4ceb9fe1a85ec53ULL;
    h ^= h >> 33U;
    return static_cast<size_t>(h);
}

// Equality predicate for |GpuIdUid| keys: two keys are equal iff both the GPU
// ID and the UID match.
bool equalGpuIdUid(const android::gpuwork::GpuIdUid& l, const android::gpuwork::GpuIdUid& r) {
    return l.gpu_id == r.gpu_id && l.uid == r.uid;
}

// Gets a BPF map from |mapPath|.
template <class Key, class Value>
bool getBpfMap(const char* mapPath, bpf::BpfMap<Key, Value>* out) {
@@ -76,24 +89,6 @@ inline int32_t bitcast_int32<uint32_t>(uint32_t source) {
    return result;
}

// Converts |source| to int32_t, saturating at INT32_MAX. No lower-bound clamp
// is needed because the source type is unsigned.
template <>
inline int32_t cast_int32<uint64_t>(uint64_t source) {
    constexpr uint64_t kMax = std::numeric_limits<int32_t>::max();
    return (source > kMax) ? std::numeric_limits<int32_t>::max()
                           : static_cast<int32_t>(source);
}

// Converts |source| to int32_t, clamping the value to the inclusive range
// [INT32_MIN, INT32_MAX].
template <>
inline int32_t cast_int32<long long>(long long source) {
    constexpr long long kMin = std::numeric_limits<int32_t>::min();
    constexpr long long kMax = std::numeric_limits<int32_t>::max();
    if (source < kMin) {
        return std::numeric_limits<int32_t>::min();
    }
    if (source > kMax) {
        return std::numeric_limits<int32_t>::max();
    }
    return static_cast<int32_t>(source);
}

} // namespace

using base::StringAppendF;
@@ -115,7 +110,7 @@ GpuWork::~GpuWork() {
    {
        std::scoped_lock<std::mutex> lock(mMutex);
        if (mStatsdRegistered) {
            AStatsManager_clearPullAtomCallback(android::util::GPU_FREQ_TIME_IN_STATE_PER_UID);
            AStatsManager_clearPullAtomCallback(android::util::GPU_WORK_PER_UID);
        }
    }

@@ -144,7 +139,7 @@ void GpuWork::initialize() {
        mPreviousMapClearTimePoint = std::chrono::steady_clock::now();
    }

    // Attach the tracepoint ONLY if we got the map above.
    // Attach the tracepoint.
    if (!attachTracepoint("/sys/fs/bpf/prog_gpu_work_tracepoint_power_gpu_work_period", "power",
                          "gpu_work_period")) {
        return;
@@ -157,8 +152,8 @@ void GpuWork::initialize() {

    {
        std::lock_guard<std::mutex> lock(mMutex);
        AStatsManager_setPullAtomCallback(int32_t{android::util::GPU_FREQ_TIME_IN_STATE_PER_UID},
                                          nullptr, GpuWork::pullAtomCallback, this);
        AStatsManager_setPullAtomCallback(int32_t{android::util::GPU_WORK_PER_UID}, nullptr,
                                          GpuWork::pullAtomCallback, this);
        mStatsdRegistered = true;
    }

@@ -169,18 +164,18 @@ void GpuWork::initialize() {

void GpuWork::dump(const Vector<String16>& /* args */, std::string* result) {
    if (!mInitialized.load()) {
        result->append("GPU time in state information is not available.\n");
        result->append("GPU work information is not available.\n");
        return;
    }

    // Ordered map ensures output data is sorted by UID.
    std::map<Uid, UidTrackingInfo> dumpMap;
    // Ordered map ensures output data is sorted.
    std::map<GpuIdUid, UidTrackingInfo, decltype(lessThanGpuIdUid)*> dumpMap(&lessThanGpuIdUid);

    {
        std::lock_guard<std::mutex> lock(mMutex);

        if (!mGpuWorkMap.isValid()) {
            result->append("GPU time in state map is not available.\n");
            result->append("GPU work map is not available.\n");
            return;
        }

@@ -189,56 +184,42 @@ void GpuWork::dump(const Vector<String16>& /* args */, std::string* result) {
        // threads. The buckets are all preallocated. Our eBPF program only updates
        // entries (in-place) or adds entries. |GpuWork| only iterates or clears the
        // map while holding |mMutex|. Given this, we should be able to iterate over
        // all elements reliably. In the worst case, we might see elements more than
        // once.
        // all elements reliably. Nevertheless, we copy into a map to avoid
        // duplicates.

        // Note that userspace reads of BPF maps make a copy of the value, and
        // thus the returned value is not being concurrently accessed by the BPF
        // program (no atomic reads needed below).

        mGpuWorkMap.iterateWithValue([&dumpMap](const Uid& key, const UidTrackingInfo& value,
                                                const android::bpf::BpfMap<Uid, UidTrackingInfo>&)
        mGpuWorkMap.iterateWithValue(
                [&dumpMap](const GpuIdUid& key, const UidTrackingInfo& value,
                           const android::bpf::BpfMap<GpuIdUid, UidTrackingInfo>&)
                        -> base::Result<void> {
                    dumpMap[key] = value;
                    return {};
                });
    }

    // Find the largest frequency where some UID has spent time in that frequency.
    size_t largestFrequencyWithTime = 0;
    for (const auto& uidToUidInfo : dumpMap) {
        for (size_t i = largestFrequencyWithTime + 1; i < kNumTrackedFrequencies; ++i) {
            if (uidToUidInfo.second.frequency_times_ns[i] > 0) {
                largestFrequencyWithTime = i;
            }
        }
    }

    // Dump time in state information.
    // Dump work information.
    // E.g.
    // uid/freq: 0MHz 50MHz 100MHz ...
    // 1000: 0 0 0 0 ...
    // 1003: 0 0 3456 0 ...
    // [errors:3]1006: 0 0 3456 0 ...
    // GPU work information.
    // gpu_id uid total_active_duration_ns total_inactive_duration_ns
    // 0 1000 0 0
    // 0 1003 1234 123
    // [errors:3]0 1006 4567 456

    // Header.
    result->append("GPU time in frequency state in ms.\n");
    result->append("uid/freq: 0MHz");
    for (size_t i = 1; i <= largestFrequencyWithTime; ++i) {
        StringAppendF(result, " %zuMHz", i * 50);
    }
    result->append("\n");
    result->append("GPU work information.\ngpu_id uid total_active_duration_ns "
                   "total_inactive_duration_ns\n");

    for (const auto& uidToUidInfo : dumpMap) {
        if (uidToUidInfo.second.error_count) {
            StringAppendF(result, "[errors:%" PRIu32 "]", uidToUidInfo.second.error_count);
        }
        StringAppendF(result, "%" PRIu32 ":", uidToUidInfo.first);
        for (size_t i = 0; i <= largestFrequencyWithTime; ++i) {
            StringAppendF(result, " %" PRIu64,
                          uidToUidInfo.second.frequency_times_ns[i] / MS_IN_NS);
    for (const auto& idToUidInfo : dumpMap) {
        if (idToUidInfo.second.error_count) {
            StringAppendF(result, "[errors:%" PRIu32 "]", idToUidInfo.second.error_count);
        }
        result->append("\n");
        StringAppendF(result, "%" PRIu32 " %" PRIu32 " %" PRIu64 " %" PRIu64 "\n",
                      idToUidInfo.first.gpu_id, idToUidInfo.first.uid,
                      idToUidInfo.second.total_active_duration_ns,
                      idToUidInfo.second.total_inactive_duration_ns);
    }
}

@@ -275,14 +256,14 @@ AStatsManager_PullAtomCallbackReturn GpuWork::pullAtomCallback(int32_t atomTag,
    ATRACE_CALL();

    GpuWork* gpuWork = reinterpret_cast<GpuWork*>(cookie);
    if (atomTag == android::util::GPU_FREQ_TIME_IN_STATE_PER_UID) {
        return gpuWork->pullFrequencyAtoms(data);
    if (atomTag == android::util::GPU_WORK_PER_UID) {
        return gpuWork->pullWorkAtoms(data);
    }

    return AStatsManager_PULL_SKIP;
}

AStatsManager_PullAtomCallbackReturn GpuWork::pullFrequencyAtoms(AStatsEventList* data) {
AStatsManager_PullAtomCallbackReturn GpuWork::pullWorkAtoms(AStatsEventList* data) {
    ATRACE_CALL();

    if (!data || !mInitialized.load()) {
@@ -295,96 +276,153 @@ AStatsManager_PullAtomCallbackReturn GpuWork::pullFrequencyAtoms(AStatsEventList
        return AStatsManager_PULL_SKIP;
    }

    std::unordered_map<Uid, UidTrackingInfo> uidInfos;
    std::unordered_map<GpuIdUid, UidTrackingInfo, decltype(hashGpuIdUid)*, decltype(equalGpuIdUid)*>
            workMap(32, &hashGpuIdUid, &equalGpuIdUid);

    // Iteration of BPF hash maps can be unreliable (no data races, but elements
    // may be repeated), as the map is typically being modified by other
    // threads. The buckets are all preallocated. Our eBPF program only updates
    // entries (in-place) or adds entries. |GpuWork| only iterates or clears the
    // map while holding |mMutex|. Given this, we should be able to iterate over
    // all elements reliably. In the worst case, we might see elements more than
    // once.
    // all elements reliably. Nevertheless, we copy into a map to avoid
    // duplicates.

    // Note that userspace reads of BPF maps make a copy of the value, and thus
    // the returned value is not being concurrently accessed by the BPF program
    // (no atomic reads needed below).

    mGpuWorkMap.iterateWithValue(
            [&uidInfos](const Uid& key, const UidTrackingInfo& value,
                        const android::bpf::BpfMap<Uid, UidTrackingInfo>&) -> base::Result<void> {
                uidInfos[key] = value;
    mGpuWorkMap.iterateWithValue([&workMap](const GpuIdUid& key, const UidTrackingInfo& value,
                                            const android::bpf::BpfMap<GpuIdUid, UidTrackingInfo>&)
                                         -> base::Result<void> {
        workMap[key] = value;
        return {};
    });

    ALOGI("pullFrequencyAtoms: uidInfos.size() == %zu", uidInfos.size());

    // Get a list of just the UIDs; the order does not matter.
    std::vector<Uid> uids;
    for (const auto& pair : uidInfos) {
        uids.push_back(pair.first);
    // Get a list of the GPU IDs, in order.
    std::set<uint32_t> gpuIds;
    {
        // To avoid adding duplicate UIDs.
        std::unordered_set<Uid> addedUids;

        for (const auto& workInfo : workMap) {
            if (addedUids.insert(workInfo.first.uid).second) {
                // Insertion was successful.
                uids.push_back(workInfo.first.uid);
            }
            gpuIds.insert(workInfo.first.gpu_id);
        }
    }

    ALOGI("pullWorkAtoms: uids.size() == %zu", uids.size());
    ALOGI("pullWorkAtoms: gpuIds.size() == %zu", gpuIds.size());

    if (gpuIds.size() > kNumGpusHardLimit) {
        // If we observe a very high number of GPUs then something has probably
        // gone wrong, so don't log any atoms.
        return AStatsManager_PULL_SKIP;
    }

    size_t numSampledUids = kNumSampledUids;

    if (gpuIds.size() > kNumGpusSoftLimit) {
        // If we observe a high number of GPUs then we just sample 1 UID.
        numSampledUids = 1;
    }

    // Remove all UIDs that do not have at least |kMinGpuTimeNanoseconds| on at
    // least one GPU.
    {
        auto uidIt = uids.begin();
        while (uidIt != uids.end()) {
            bool hasEnoughGpuTime = false;
            for (uint32_t gpuId : gpuIds) {
                auto infoIt = workMap.find(GpuIdUid{gpuId, *uidIt});
                if (infoIt == workMap.end()) {
                    continue;
                }
                if (infoIt->second.total_active_duration_ns +
                            infoIt->second.total_inactive_duration_ns >=
                    kMinGpuTimeNanoseconds) {
                    hasEnoughGpuTime = true;
                    break;
                }
            }
            if (hasEnoughGpuTime) {
                ++uidIt;
            } else {
                uidIt = uids.erase(uidIt);
            }
        }
    }

    ALOGI("pullWorkAtoms: after removing uids with very low GPU time: uids.size() == %zu",
          uids.size());

    std::random_device device;
    std::default_random_engine random_engine(device());

    // If we have more than |kNumSampledUids| UIDs, choose |kNumSampledUids|
    // If we have more than |numSampledUids| UIDs, choose |numSampledUids|
    // random UIDs. We swap them to the front of the list. Given the list
    // indices 0..i..n-1, we have the following inclusive-inclusive ranges:
    // - [0, i-1] == the randomly chosen elements.
    // - [i, n-1] == the remaining unchosen elements.
    if (uids.size() > kNumSampledUids) {
        for (size_t i = 0; i < kNumSampledUids; ++i) {
    if (uids.size() > numSampledUids) {
        for (size_t i = 0; i < numSampledUids; ++i) {
            std::uniform_int_distribution<size_t> uniform_dist(i, uids.size() - 1);
            size_t random_index = uniform_dist(random_engine);
            std::swap(uids[i], uids[random_index]);
        }
        // Only keep the front |kNumSampledUids| elements.
        uids.resize(kNumSampledUids);
        // Only keep the front |numSampledUids| elements.
        uids.resize(numSampledUids);
    }

    ALOGI("pullFrequencyAtoms: uids.size() == %zu", uids.size());
    ALOGI("pullWorkAtoms: after random selection: uids.size() == %zu", uids.size());

    auto now = std::chrono::steady_clock::now();

    int32_t duration = cast_int32(
    long long duration =
            std::chrono::duration_cast<std::chrono::seconds>(now - mPreviousMapClearTimePoint)
                    .count());
                    .count();
    if (duration > std::numeric_limits<int32_t>::max() || duration < 0) {
        // This is essentially impossible. If it does somehow happen, give up,
        // but still clear the map.
        clearMap();
        return AStatsManager_PULL_SKIP;
    }

    for (const Uid uid : uids) {
        const UidTrackingInfo& info = uidInfos[uid];
        ALOGI("pullFrequencyAtoms: adding stats for UID %" PRIu32, uid);
        android::util::addAStatsEvent(data, int32_t{android::util::GPU_FREQ_TIME_IN_STATE_PER_UID},
    // Log an atom for each (gpu id, uid) pair for which we have data.
    for (uint32_t gpuId : gpuIds) {
        for (Uid uid : uids) {
            auto it = workMap.find(GpuIdUid{gpuId, uid});
            if (it == workMap.end()) {
                continue;
            }
            const UidTrackingInfo& info = it->second;

            uint64_t total_active_duration_ms = info.total_active_duration_ns / ONE_MS_IN_NS;
            uint64_t total_inactive_duration_ms = info.total_inactive_duration_ns / ONE_MS_IN_NS;

            // Skip this atom if any numbers are out of range. |duration| is
            // already checked above.
            if (total_active_duration_ms > std::numeric_limits<int32_t>::max() ||
                total_inactive_duration_ms > std::numeric_limits<int32_t>::max()) {
                continue;
            }

            ALOGI("pullWorkAtoms: adding stats for GPU ID %" PRIu32 "; UID %" PRIu32, gpuId, uid);
            android::util::addAStatsEvent(data, int32_t{android::util::GPU_WORK_PER_UID},
                                          // uid
                                          bitcast_int32(uid),
                                          // gpu_id
                                          bitcast_int32(gpuId),
                                          // time_duration_seconds
                                      int32_t{duration},
                                      // max_freq_mhz
                                      int32_t{1000},
                                      // freq_0_mhz_time_millis
                                      cast_int32(info.frequency_times_ns[0] / 1000000),
                                      // freq_50_mhz_time_millis
                                      cast_int32(info.frequency_times_ns[1] / 1000000),
                                      // ... etc. ...
                                      cast_int32(info.frequency_times_ns[2] / 1000000),
                                      cast_int32(info.frequency_times_ns[3] / 1000000),
                                      cast_int32(info.frequency_times_ns[4] / 1000000),
                                      cast_int32(info.frequency_times_ns[5] / 1000000),
                                      cast_int32(info.frequency_times_ns[6] / 1000000),
                                      cast_int32(info.frequency_times_ns[7] / 1000000),
                                      cast_int32(info.frequency_times_ns[8] / 1000000),
                                      cast_int32(info.frequency_times_ns[9] / 1000000),
                                      cast_int32(info.frequency_times_ns[10] / 1000000),
                                      cast_int32(info.frequency_times_ns[11] / 1000000),
                                      cast_int32(info.frequency_times_ns[12] / 1000000),
                                      cast_int32(info.frequency_times_ns[13] / 1000000),
                                      cast_int32(info.frequency_times_ns[14] / 1000000),
                                      cast_int32(info.frequency_times_ns[15] / 1000000),
                                      cast_int32(info.frequency_times_ns[16] / 1000000),
                                      cast_int32(info.frequency_times_ns[17] / 1000000),
                                      cast_int32(info.frequency_times_ns[18] / 1000000),
                                      cast_int32(info.frequency_times_ns[19] / 1000000),
                                      // freq_1000_mhz_time_millis
                                      cast_int32(info.frequency_times_ns[20] / 1000000));
                                          static_cast<int32_t>(duration),
                                          // total_active_duration_millis
                                          static_cast<int32_t>(total_active_duration_ms),
                                          // total_inactive_duration_millis
                                          static_cast<int32_t>(total_inactive_duration_ms));
        }
    }
    clearMap();
    return AStatsManager_PULL_SUCCESS;
@@ -435,7 +473,7 @@ void GpuWork::clearMapIfNeeded() {
    uint64_t numEntries = globalData.value().num_map_entries;

    // If the map is <=75% full, we do nothing.
    if (numEntries <= (kMaxTrackedUids / 4) * 3) {
    if (numEntries <= (kMaxTrackedGpuIdUids / 4) * 3) {
        return;
    }

@@ -456,22 +494,22 @@ void GpuWork::clearMap() {

    // Iterating BPF maps to delete keys is tricky. If we just repeatedly call
    // |getFirstKey()| and delete that, we may loop forever (or for a long time)
    // because our BPF program might be repeatedly re-adding UID keys. Also,
    // even if we limit the number of elements we try to delete, we might only
    // delete new entries, leaving old entries in the map. If we delete a key A
    // and then call |getNextKey(A)|, the first key in the map is returned, so
    // we have the same issue.
    // because our BPF program might be repeatedly re-adding keys. Also, even if
    // we limit the number of elements we try to delete, we might only delete
    // new entries, leaving old entries in the map. If we delete a key A and
    // then call |getNextKey(A)|, the first key in the map is returned, so we
    // have the same issue.
    //
    // Thus, we instead get the next key and then delete the previous key. We
    // also limit the number of deletions we try, just in case.

    base::Result<Uid> key = mGpuWorkMap.getFirstKey();
    base::Result<GpuIdUid> key = mGpuWorkMap.getFirstKey();

    for (size_t i = 0; i < kMaxTrackedUids; ++i) {
    for (size_t i = 0; i < kMaxTrackedGpuIdUids; ++i) {
        if (!key.ok()) {
            break;
        }
        base::Result<Uid> previousKey = key;
        base::Result<GpuIdUid> previousKey = key;
        key = mGpuWorkMap.getNextKey(previousKey.value());
        mGpuWorkMap.deleteValue(previousKey.value());
    }
+186 −124

File changed.

Preview size limit exceeded, changes collapsed.

+21 −15
Original line number Diff line number Diff line
@@ -26,17 +26,27 @@ namespace gpuwork {
#endif

// Key type for the per-(GPU, UID) BPF map: identifies GPU work attributed to a
// given UID on a given GPU. Compared/hashed via lessThanGpuIdUid,
// hashGpuIdUid and equalGpuIdUid, which only use |gpu_id| and |uid|.
typedef struct  {
    // The end time of the previous period from this UID in nanoseconds.
    // NOTE(review): an end-time field is tracking state, not identity; it is
    // unused by the key comparators above and may be stale residue or belong
    // in |UidTrackingInfo| — confirm before relying on it.
    uint64_t previous_end_time_ns;
    // The ID of the GPU the work ran on.
    uint32_t gpu_id;
    // The UID the work is attributed to.
    uint32_t uid;
} GpuIdUid;

    // The time spent at each GPU frequency while running GPU work from the UID,
    // in nanoseconds. Array index i stores the time for frequency i*50 MHz. So
    // index 0 is 0Mhz, index 1 is 50MHz, index 2 is 100MHz, etc., up to index
    // |kNumTrackedFrequencies|.
    uint64_t frequency_times_ns[21];
// Per-(GPU, UID) accounting data; stored as the value type in the BPF map and
// aggregated into GPU_WORK_PER_UID atoms.
typedef struct {
    // The end time of the previous period where the GPU was active for the UID,
    // in nanoseconds.
    uint64_t previous_active_end_time_ns;

    // The total amount of time the GPU has spent running work for the UID, in
    // nanoseconds.
    uint64_t total_active_duration_ns;

    // The total amount of time of the "gaps" between "continuous" GPU work for
    // the UID, in nanoseconds. This is estimated by ignoring large gaps between
    // GPU work for this UID.
    uint64_t total_inactive_duration_ns;

    // The number of errors detected due to |GpuWorkPeriodEvent| events for the
    // UID violating the specification in some way. E.g. periods with a zero or
    // negative duration.
    uint32_t error_count;

} UidTrackingInfo;
@@ -48,14 +58,10 @@ typedef struct {
    uint64_t num_map_entries;
} GlobalData;

static const uint32_t kMaxTrackedUids = 512;
static const uint32_t kFrequencyGranularityMhz = 50;
static const uint32_t kNumTrackedFrequencies = 21;
// The maximum number of tracked GPU ID and UID pairs (|GpuIdUid|).
static const uint32_t kMaxTrackedGpuIdUids = 512;
// NOTE(review): |kMaxTrackedUids|, |kFrequencyGranularityMhz| and
// |kNumTrackedFrequencies| describe the old frequency-time-in-state tracking
// and appear superseded by the per-(GPU, UID) scheme (|kMaxTrackedGpuIdUids|);
// confirm they are still referenced before keeping them.

#ifdef __cplusplus
static_assert(kNumTrackedFrequencies ==
              std::extent<decltype(UidTrackingInfo::frequency_times_ns)>::value);

} // namespace gpuwork
} // namespace android
#endif
+15 −4
Original line number Diff line number Diff line
@@ -41,7 +41,7 @@ public:

    void initialize();

    // Dumps the GPU time in frequency state information.
    // Dumps the GPU work information.
    void dump(const Vector<String16>& args, std::string* result);

private:
@@ -55,7 +55,7 @@ private:
                                                                 AStatsEventList* data,
                                                                 void* cookie);

    AStatsManager_PullAtomCallbackReturn pullFrequencyAtoms(AStatsEventList* data);
    AStatsManager_PullAtomCallbackReturn pullWorkAtoms(AStatsEventList* data);

    // Periodically calls |clearMapIfNeeded| to clear the |mGpuWorkMap| map, if
    // needed.
@@ -88,7 +88,7 @@ private:
    std::mutex mMutex;

    // BPF map for per-UID GPU work.
    bpf::BpfMap<Uid, UidTrackingInfo> mGpuWorkMap GUARDED_BY(mMutex);
    bpf::BpfMap<GpuIdUid, UidTrackingInfo> mGpuWorkMap GUARDED_BY(mMutex);

    // BPF map containing a single element for global data.
    bpf::BpfMap<uint32_t, GlobalData> mGpuWorkGlobalDataMap GUARDED_BY(mMutex);
@@ -110,7 +110,18 @@ private:
    bool mStatsdRegistered GUARDED_BY(mMutex) = false;

    // The number of randomly chosen (i.e. sampled) UIDs to log stats for.
    static constexpr int kNumSampledUids = 10;
    static constexpr size_t kNumSampledUids = 10;

    // A "large" number of GPUs. If we observe more GPUs than this limit then
    // we reduce the amount of stats we log.
    static constexpr size_t kNumGpusSoftLimit = 4;

    // A "very large" number of GPUs. If we observe more GPUs than this limit
    // then we don't log any stats.
    static constexpr size_t kNumGpusHardLimit = 32;

    // The minimum GPU time needed to actually log stats for a UID.
    static constexpr uint64_t kMinGpuTimeNanoseconds = 30U * 1000000000U; // 30 seconds.

    // The previous time point at which |mGpuWorkMap| was cleared.
    std::chrono::steady_clock::time_point mPreviousMapClearTimePoint GUARDED_BY(mMutex);
+3 −0
Original line number Diff line number Diff line
@@ -14,6 +14,9 @@
     limitations under the License.
-->
<configuration description="Runs GpuServiceVendorTests">
    <target_preparer class="com.android.tradefed.targetprep.RootTargetPreparer">
        <option name="force-root" value="false" />
    </target_preparer>
    <test class="com.android.tradefed.testtype.HostTest" >
        <option name="jar" value="GpuServiceVendorTests.jar" />
    </test>
Loading