Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit fdbfa3b5 authored by Ben Miles's avatar Ben Miles
Browse files

Add critical event log

The critical event log is a small (20 items) log of recent "critical
events" such as watchdogs and half-watchdogs.

Events from the log that occurred in the last 5 minutes are dumped to
ANR files and ANR dropbox entries in textproto format.

This change adds the log itself & support for logging watchdogs and
half-watchdogs. Other event types such as system server crashes will be
added in follow-up CLs.

Bug: 200263868
Test: atest CriticalEventLogTest
Change-Id: I431e15c5d5c16a358cd0eea30682078d04257499
parent 508008be
Loading
Loading
Loading
Loading
+78 −0
Original line number Diff line number Diff line
/*
 * Copyright (C) 2021 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

syntax = "proto2";
package com.android.server.am;

option java_multiple_files = true;

// Output proto containing recent critical events for inclusion in logs such as ANR files.
// Do not change the field names since this data is dumped to ANR files in textproto format.
message CriticalEventLogProto {
  // Timestamp when the log snapshot was generated.
  // Required.
  optional int64 timestamp_ms = 1;

  // Max age of events that are included in this proto.
  // Required.
  optional int32 window_ms = 2;

  // Max number of events in the log.
  // Note: if the number of events is equal to the capacity then it is likely the actual time range
  // covered by the log is shorter than window_ms.
  // Required.
  optional int32 capacity = 3;

  // Recent critical events.
  repeated CriticalEventProto events = 4;
}

// On-disk storage of events.
message CriticalEventLogStorageProto {
  repeated CriticalEventProto events = 1;
}

// A "critical" event such as an ANR or watchdog.
// Do not change the field names since this data is dumped to ANR files in textproto format.
message CriticalEventProto {
  // Timestamp of the event.
  // Required.
  optional int64 timestamp_ms = 1;

  // Required.
  oneof event {
    Watchdog watchdog = 2;
    HalfWatchdog half_watchdog = 3;
  }

  message Watchdog {
    // The watchdog subject.
    // Required.
    optional string subject = 1;

    // Unique identifier of the watchdog.
    // Can be used to join with other data for this watchdog such as stack dumps & perfetto traces.
    // Generated in {@link com.android.server.Watchdog#run}.
    // Required.
    optional string uuid = 2;
  }

  message HalfWatchdog {
    // The half-watchdog subject.
    // Required.
    optional string subject = 1;
  }
}
 No newline at end of file
+1 −1
Original line number Diff line number Diff line
@@ -192,7 +192,7 @@ public final class SystemServerInitThreadPool implements Dumpable {
        final ArrayList<Integer> pids = new ArrayList<>();
        pids.add(Process.myPid());
        ActivityManagerService.dumpStackTraces(pids, null, null,
                Watchdog.getInterestingNativePids(), null, null);
                Watchdog.getInterestingNativePids(), null, null, null);
    }

    @Override
+14 −6
Original line number Diff line number Diff line
@@ -45,6 +45,7 @@ import com.android.internal.os.ProcessCpuTracker;
import com.android.internal.os.ZygoteConnectionConstants;
import com.android.internal.util.FrameworkStatsLog;
import com.android.server.am.ActivityManagerService;
import com.android.server.am.CriticalEventLog;
import com.android.server.am.TraceErrorLogger;
import com.android.server.wm.SurfaceAnimationThread;

@@ -661,10 +662,15 @@ public class Watchdog {
            } // END synchronized (mLock)

            if (doWaitedHalfDump) {
                // Get critical event log before logging the half watchdog so that it doesn't
                // occur in the log.
                String criticalEvents = CriticalEventLog.getInstance().logLinesForAnrFile();
                CriticalEventLog.getInstance().logHalfWatchdog(subject);

                // We've waited half the deadlock-detection interval.  Pull a stack
                // trace and wait another half.
                ActivityManagerService.dumpStackTraces(pids, null, null,
                        getInterestingNativePids(), null, subject);
                        getInterestingNativePids(), null, subject, criticalEvents);
                continue;
            }

@@ -673,12 +679,9 @@ public class Watchdog {
            // Then kill this process so that the system will restart.
            EventLog.writeEvent(EventLogTags.WATCHDOG, subject);

            final UUID errorId;
            final UUID errorId = mTraceErrorLogger.generateErrorId();
            if (mTraceErrorLogger.isAddErrorIdEnabled()) {
                errorId = mTraceErrorLogger.generateErrorId();
                mTraceErrorLogger.addErrorIdToTrace("system_server", errorId);
            } else {
                errorId = null;
            }

            // Log the atom as early as possible since it is used as a mechanism to trigger
@@ -686,6 +689,11 @@ public class Watchdog {
            // point in time when the Watchdog happens as possible.
            FrameworkStatsLog.write(FrameworkStatsLog.SYSTEM_SERVER_WATCHDOG_OCCURRED, subject);

            // Get critical event log before logging the watchdog so that it doesn't occur in the
            // log.
            String criticalEvents = CriticalEventLog.getInstance().logLinesForAnrFile();
            CriticalEventLog.getInstance().logWatchdog(subject, errorId);

            long anrTime = SystemClock.uptimeMillis();
            StringBuilder report = new StringBuilder();
            report.append(MemoryPressureUtil.currentPsiState());
@@ -693,7 +701,7 @@ public class Watchdog {
            StringWriter tracesFileException = new StringWriter();
            final File stack = ActivityManagerService.dumpStackTraces(
                    pids, processCpuTracker, new SparseArray<>(), getInterestingNativePids(),
                    tracesFileException, subject);
                    tracesFileException, subject, criticalEvents);

            // Give some extra time to make sure the stack traces get written.
            // The system's been hanging for a minute, another second or two won't hurt much.
+15 −8
Original line number Diff line number Diff line
@@ -2394,6 +2394,7 @@ public class ActivityManagerService extends IActivityManager.Stub
    private void start() {
        removeAllProcessGroups();
        CriticalEventLog.init();
        mBatteryStatsService.publish();
        mAppOpsService.publish();
        Slog.d("AppOps", "AppOpsService published");
@@ -3199,7 +3200,7 @@ public class ActivityManagerService extends IActivityManager.Stub
            ProcessCpuTracker processCpuTracker, SparseArray<Boolean> lastPids,
            ArrayList<Integer> nativePids, StringWriter logExceptionCreatingFile) {
        return dumpStackTraces(firstPids, processCpuTracker, lastPids, nativePids,
                logExceptionCreatingFile, null, null);
                logExceptionCreatingFile, null, null, null);
    }
    /**
@@ -3209,13 +3210,14 @@ public class ActivityManagerService extends IActivityManager.Stub
     * @param nativePids optional list of native pids to dump stack crawls
     * @param logExceptionCreatingFile optional writer to which we log errors creating the file
     * @param subject optional line related to the error
     * @param criticalEventSection optional lines containing recent critical events.
     */
    public static File dumpStackTraces(ArrayList<Integer> firstPids,
            ProcessCpuTracker processCpuTracker, SparseArray<Boolean> lastPids,
            ArrayList<Integer> nativePids, StringWriter logExceptionCreatingFile,
            String subject) {
            String subject, String criticalEventSection) {
        return dumpStackTraces(firstPids, processCpuTracker, lastPids, nativePids,
                logExceptionCreatingFile, null, subject);
                logExceptionCreatingFile, null, subject, criticalEventSection);
    }
    /**
@@ -3225,7 +3227,7 @@ public class ActivityManagerService extends IActivityManager.Stub
    /* package */ static File dumpStackTraces(ArrayList<Integer> firstPids,
            ProcessCpuTracker processCpuTracker, SparseArray<Boolean> lastPids,
            ArrayList<Integer> nativePids, StringWriter logExceptionCreatingFile,
            long[] firstPidOffsets, String subject) {
            long[] firstPidOffsets, String subject, String criticalEventSection) {
        ArrayList<Integer> extraPids = null;
        Slog.i(TAG, "dumpStackTraces pids=" + lastPids + " nativepids=" + nativePids);
@@ -3277,12 +3279,17 @@ public class ActivityManagerService extends IActivityManager.Stub
            return null;
        }
        if (subject != null) {
        if (subject != null || criticalEventSection != null) {
            try (FileOutputStream fos = new FileOutputStream(tracesFile, true)) {
                String header = "Subject: " + subject + "\n";
                if (subject != null) {
                    String header = "Subject: " + subject + "\n\n";
                    fos.write(header.getBytes(StandardCharsets.UTF_8));
                }
                if (criticalEventSection != null) {
                    fos.write(criticalEventSection.getBytes(StandardCharsets.UTF_8));
                }
            } catch (IOException e) {
                Slog.w(TAG, "Exception writing subject to ANR dump file:", e);
                Slog.w(TAG, "Exception writing to ANR dump file:", e);
            }
        }
+328 −0
Original line number Diff line number Diff line
/*
 * Copyright (C) 2021 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.android.server.am;

import android.os.Handler;
import android.os.HandlerThread;
import android.util.Slog;

import com.android.framework.protobuf.nano.MessageNanoPrinter;
import com.android.internal.annotations.VisibleForTesting;
import com.android.internal.util.RingBuffer;
import com.android.server.am.nano.CriticalEventLogProto;
import com.android.server.am.nano.CriticalEventLogStorageProto;
import com.android.server.am.nano.CriticalEventProto;
import com.android.server.am.nano.CriticalEventProto.HalfWatchdog;
import com.android.server.am.nano.CriticalEventProto.Watchdog;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.time.Duration;
import java.util.Arrays;
import java.util.UUID;

/**
 * Log of recent critical events such as Watchdogs.
 *
 * For use in ANR reports to show recent events that may help to debug the ANR. In particular,
 * the presence of recent critical events signal that the device was already in a had state.
 *
 * This class needs to be thread safe since it's used as a singleton.
 */
public class CriticalEventLog {
    private static final String TAG = CriticalEventLog.class.getSimpleName();

    private static CriticalEventLog sInstance;

    /** Name of the file the log is saved to. */
    @VisibleForTesting
    static final String FILENAME = "critical_event_log.pb";

    /** Timestamp when the log was last saved (or attempted to be saved) to disk. */
    private long mLastSaveAttemptMs = 0;

    /** File the log is saved to. */
    private final File mLogFile;

    /** Ring buffer containing the log events. */
    private final ThreadSafeRingBuffer<CriticalEventProto> mEvents;

    /** Max age of events to include in the output log proto. */
    private final int mWindowMs;

    /** Minimum time between consecutive saves of the log to disk. */
    private final long mMinTimeBetweenSavesMs;

    /** Whether to load and save the log synchronously with no delay. Only set to true in tests. */
    private final boolean mLoadAndSaveImmediately;

    private final Handler mHandler;

    private final Runnable mSaveRunnable = this::saveLogToFileNow;

    @VisibleForTesting
    CriticalEventLog(String logDir, int capacity, int windowMs, long minTimeBetweenSavesMs,
            boolean loadAndSaveImmediately, ILogLoader logLoader) {
        mLogFile = Paths.get(logDir, FILENAME).toFile();
        mWindowMs = windowMs;
        mMinTimeBetweenSavesMs = minTimeBetweenSavesMs;
        mLoadAndSaveImmediately = loadAndSaveImmediately;

        mEvents = new ThreadSafeRingBuffer<>(CriticalEventProto.class, capacity);

        HandlerThread thread = new HandlerThread("CriticalEventLogIO");
        thread.start();
        mHandler = new Handler(thread.getLooper());

        final Runnable loadEvents = () -> logLoader.load(mLogFile, mEvents);
        if (!mLoadAndSaveImmediately) {
            mHandler.post(loadEvents);
        } else {
            loadEvents.run();
        }
    }

    /** Returns a new instance with production settings. */
    private CriticalEventLog() {
        this(
                /* logDir= */"/data/misc/critical-events",
                /* capacity= */ 20,
                /* windowMs= */ (int) Duration.ofMinutes(5).toMillis(),
                /* minTimeBetweenSavesMs= */ Duration.ofSeconds(2).toMillis(),
                /* loadAndSaveImmediately= */ false,
                new LogLoader());
    }

    /** Returns the singleton instance. */
    public static CriticalEventLog getInstance() {
        if (sInstance == null) {
            sInstance = new CriticalEventLog();
        }
        return sInstance;
    }

    /**
     * Ensures the singleton instance has been instantiated.
     *
     * Use this to eagerly instantiate the log (which loads the previous events from disk).
     * Otherwise this will occur lazily when the first event is logged, by which time the device may
     * be under load.
     */
    public static void init() {
        getInstance();
    }

    @VisibleForTesting
    protected long getWallTimeMillis() {
        return System.currentTimeMillis();
    }

    /** Logs a watchdog. */
    public void logWatchdog(String subject, UUID uuid) {
        Watchdog watchdog = new Watchdog();
        watchdog.subject = subject;
        watchdog.uuid = uuid.toString();
        CriticalEventProto event = new CriticalEventProto();
        event.setWatchdog(watchdog);
        log(event);
    }

    /** Logs a half-watchdog. */
    public void logHalfWatchdog(String subject) {
        HalfWatchdog halfWatchdog = new HalfWatchdog();
        halfWatchdog.subject = subject;
        CriticalEventProto event = new CriticalEventProto();
        event.setHalfWatchdog(halfWatchdog);
        log(event);
    }

    private void log(CriticalEventProto event) {
        event.timestampMs = getWallTimeMillis();
        mEvents.append(event);
        saveLogToFile();
    }

    /**
     * Returns recent critical events in text format to include in logs such as ANR files.
     *
     * Includes all events in the ring buffer with age less than or equal to {@code mWindowMs}.
     */
    public String logLinesForAnrFile() {
        return new StringBuilder()
                .append("--- CriticalEventLog ---\n")
                .append(MessageNanoPrinter.print(getRecentEvents()))
                .append('\n').toString();
    }

    /**
     * Returns a proto containing recent critical events.
     *
     * Includes all events in the ring buffer with age less than or equal to {@code mWindowMs}.
     */
    @VisibleForTesting
    protected CriticalEventLogProto getRecentEvents() {
        CriticalEventLogProto log = new CriticalEventLogProto();
        log.timestampMs = getWallTimeMillis();
        log.windowMs = mWindowMs;
        log.capacity = mEvents.capacity();
        log.events = recentEventsWithMinTimestamp(log.timestampMs - mWindowMs);

        return log;
    }

    /**
     * Returns the most recent logged events, starting with the first event that has a timestamp
     * greater than or equal to {@code minTimestampMs}.
     *
     * If no events have a timestamp greater than or equal to {@code minTimestampMs}, returns an
     * empty array.
     */
    private CriticalEventProto[] recentEventsWithMinTimestamp(long minTimestampMs) {
        // allEvents are in insertion order, i.e. in order of when the relevant log___() function
        // was called.
        // This means that if the system clock changed (e.g. a NITZ update) allEvents may not be
        // strictly ordered by timestamp. In this case we are permissive and start with the
        // first event that has a timestamp in the desired range.
        CriticalEventProto[] allEvents = mEvents.toArray();
        for (int i = 0; i < allEvents.length; i++) {
            if (allEvents[i].timestampMs >= minTimestampMs) {
                return Arrays.copyOfRange(allEvents, i, allEvents.length);
            }
        }
        return new CriticalEventProto[]{};
    }

    private void saveLogToFile() {
        if (mLoadAndSaveImmediately) {
            saveLogToFileNow();
            return;
        }
        if (mHandler.hasCallbacks(mSaveRunnable)) {
            // An earlier save is already scheduled so don't need to schedule an additional one.
            return;
        }

        if (!mHandler.postDelayed(mSaveRunnable, saveDelayMs())) {
            Slog.w(TAG, "Error scheduling save");
        }
    }

    /**
     * Returns the delay in milliseconds when scheduling a save on the handler thread.
     *
     * Returns a value in the range [0, {@code minTimeBetweenSavesMs}] such that the time between
     * consecutive saves does not exceed {@code minTimeBetweenSavesMs}.
     *
     * This means that if the last save occurred a long time ago, or if no previous saves
     * have occurred then the returned delay will be zero.
     */
    @VisibleForTesting
    protected long saveDelayMs() {
        final long nowMs = getWallTimeMillis();
        return Math.max(0,
                mLastSaveAttemptMs + mMinTimeBetweenSavesMs - nowMs);
    }

    @VisibleForTesting
    protected void saveLogToFileNow() {
        mLastSaveAttemptMs = getWallTimeMillis();

        File logDir = mLogFile.getParentFile();
        if (!logDir.exists()) {
            if (!logDir.mkdir()) {
                Slog.e(TAG, "Error creating log directory: " + logDir.getPath());
                return;
            }
        }

        if (!mLogFile.exists()) {
            try {
                mLogFile.createNewFile();
            } catch (IOException e) {
                Slog.e(TAG, "Error creating log file", e);
                return;
            }
        }

        CriticalEventLogStorageProto logProto = new CriticalEventLogStorageProto();
        logProto.events = mEvents.toArray();

        final byte[] bytes = CriticalEventLogStorageProto.toByteArray(logProto);
        try (FileOutputStream stream = new FileOutputStream(mLogFile, false)) {
            stream.write(bytes);
        } catch (IOException e) {
            Slog.e(TAG, "Error saving log to disk.", e);
        }
    }

    @VisibleForTesting
    protected static class ThreadSafeRingBuffer<T> {
        private final int mCapacity;
        private final RingBuffer<T> mBuffer;

        ThreadSafeRingBuffer(Class<T> clazz, int capacity) {
            this.mCapacity = capacity;
            this.mBuffer = new RingBuffer<>(clazz, capacity);
        }

        synchronized void append(T t) {
            mBuffer.append(t);
        }

        synchronized T[] toArray() {
            return mBuffer.toArray();
        }

        int capacity() {
            return mCapacity;
        }
    }

    /** Loads log events from disk into a ring buffer. */
    protected interface ILogLoader {
        void load(File logFile, ThreadSafeRingBuffer<CriticalEventProto> buffer);
    }

    /** Loads log events from disk into a ring buffer. */
    static class LogLoader implements ILogLoader {
        @Override
        public void load(File logFile,
                ThreadSafeRingBuffer<CriticalEventProto> buffer) {
            for (CriticalEventProto event : loadLogFromFile(logFile).events) {
                buffer.append(event);
            }
        }

        private static CriticalEventLogStorageProto loadLogFromFile(File logFile) {
            if (!logFile.exists()) {
                Slog.i(TAG, "No log found, returning empty log proto.");
                return new CriticalEventLogStorageProto();
            }

            try {
                return CriticalEventLogStorageProto.parseFrom(
                        Files.readAllBytes(logFile.toPath()));
            } catch (IOException e) {
                Slog.e(TAG, "Error reading log from disk.", e);
                return new CriticalEventLogStorageProto();
            }
        }
    }
}
Loading