Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 52f8cca0 authored by Ben Miles's avatar Ben Miles Committed by Android (Google) Code Review
Browse files

Merge "Add critical event log"

parents 4f05eafd fdbfa3b5
Loading
Loading
Loading
Loading
+78 −0
Original line number Diff line number Diff line
/*
 * Copyright (C) 2021 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

syntax = "proto2";
package com.android.server.am;

option java_multiple_files = true;

// Output proto containing recent critical events for inclusion in logs such as ANR files.
// Do not change the field names since this data is dumped to ANR files in textproto format.
message CriticalEventLogProto {
  // Timestamp when the log snapshot was generated.
  // Required.
  optional int64 timestamp_ms = 1;

  // Max age of events that are included in this proto.
  // Required.
  optional int32 window_ms = 2;

  // Max number of events in the log.
  // Note: if the number of events is equal to the capacity then it is likely the actual time range
  // covered by the log is shorter than window_ms.
  // Required.
  optional int32 capacity = 3;

  // Recent critical events.
  repeated CriticalEventProto events = 4;
}

// On-disk storage of events.
message CriticalEventLogStorageProto {
  repeated CriticalEventProto events = 1;
}

// A "critical" event such as an ANR or watchdog.
// Do not change the field names since this data is dumped to ANR files in textproto format.
message CriticalEventProto {
  // Timestamp of the event.
  // Required.
  optional int64 timestamp_ms = 1;

  // Required.
  oneof event {
    Watchdog watchdog = 2;
    HalfWatchdog half_watchdog = 3;
  }

  message Watchdog {
    // The watchdog subject.
    // Required.
    optional string subject = 1;

    // Unique identifier of the watchdog.
    // Can be used to join with other data for this watchdog such as stack dumps & perfetto traces.
    // Generated in {@link com.android.server.Watchdog#run}.
    // Required.
    optional string uuid = 2;
  }

  message HalfWatchdog {
    // The half-watchdog subject.
    // Required.
    optional string subject = 1;
  }
}
 No newline at end of file
+1 −1
Original line number Diff line number Diff line
@@ -192,7 +192,7 @@ public final class SystemServerInitThreadPool implements Dumpable {
        final ArrayList<Integer> pids = new ArrayList<>();
        pids.add(Process.myPid());
        ActivityManagerService.dumpStackTraces(pids, null, null,
                Watchdog.getInterestingNativePids(), null, null);
                Watchdog.getInterestingNativePids(), null, null, null);
    }

    @Override
+14 −6
Original line number Diff line number Diff line
@@ -45,6 +45,7 @@ import com.android.internal.os.ProcessCpuTracker;
import com.android.internal.os.ZygoteConnectionConstants;
import com.android.internal.util.FrameworkStatsLog;
import com.android.server.am.ActivityManagerService;
import com.android.server.am.CriticalEventLog;
import com.android.server.am.TraceErrorLogger;
import com.android.server.wm.SurfaceAnimationThread;

@@ -661,10 +662,15 @@ public class Watchdog {
            } // END synchronized (mLock)

            if (doWaitedHalfDump) {
                // Get critical event log before logging the half watchdog so that it doesn't
                // occur in the log.
                String criticalEvents = CriticalEventLog.getInstance().logLinesForAnrFile();
                CriticalEventLog.getInstance().logHalfWatchdog(subject);

                // We've waited half the deadlock-detection interval.  Pull a stack
                // trace and wait another half.
                ActivityManagerService.dumpStackTraces(pids, null, null,
                        getInterestingNativePids(), null, subject);
                        getInterestingNativePids(), null, subject, criticalEvents);
                continue;
            }

@@ -673,12 +679,9 @@ public class Watchdog {
            // Then kill this process so that the system will restart.
            EventLog.writeEvent(EventLogTags.WATCHDOG, subject);

            final UUID errorId;
            final UUID errorId = mTraceErrorLogger.generateErrorId();
            if (mTraceErrorLogger.isAddErrorIdEnabled()) {
                errorId = mTraceErrorLogger.generateErrorId();
                mTraceErrorLogger.addErrorIdToTrace("system_server", errorId);
            } else {
                errorId = null;
            }

            // Log the atom as early as possible since it is used as a mechanism to trigger
@@ -686,6 +689,11 @@ public class Watchdog {
            // point in time when the Watchdog happens as possible.
            FrameworkStatsLog.write(FrameworkStatsLog.SYSTEM_SERVER_WATCHDOG_OCCURRED, subject);

            // Get critical event log before logging the watchdog so that it doesn't occur in the
            // log.
            String criticalEvents = CriticalEventLog.getInstance().logLinesForAnrFile();
            CriticalEventLog.getInstance().logWatchdog(subject, errorId);

            long anrTime = SystemClock.uptimeMillis();
            StringBuilder report = new StringBuilder();
            report.append(MemoryPressureUtil.currentPsiState());
@@ -693,7 +701,7 @@ public class Watchdog {
            StringWriter tracesFileException = new StringWriter();
            final File stack = ActivityManagerService.dumpStackTraces(
                    pids, processCpuTracker, new SparseArray<>(), getInterestingNativePids(),
                    tracesFileException, subject);
                    tracesFileException, subject, criticalEvents);

            // Give some extra time to make sure the stack traces get written.
            // The system's been hanging for a minute, another second or two won't hurt much.
+15 −8
Original line number Diff line number Diff line
@@ -2394,6 +2394,7 @@ public class ActivityManagerService extends IActivityManager.Stub
    private void start() {
        removeAllProcessGroups();
        CriticalEventLog.init();
        mBatteryStatsService.publish();
        mAppOpsService.publish();
        Slog.d("AppOps", "AppOpsService published");
@@ -3199,7 +3200,7 @@ public class ActivityManagerService extends IActivityManager.Stub
            ProcessCpuTracker processCpuTracker, SparseArray<Boolean> lastPids,
            ArrayList<Integer> nativePids, StringWriter logExceptionCreatingFile) {
        return dumpStackTraces(firstPids, processCpuTracker, lastPids, nativePids,
                logExceptionCreatingFile, null, null);
                logExceptionCreatingFile, null, null, null);
    }
    /**
@@ -3209,13 +3210,14 @@ public class ActivityManagerService extends IActivityManager.Stub
     * @param nativePids optional list of native pids to dump stack crawls
     * @param logExceptionCreatingFile optional writer to which we log errors creating the file
     * @param subject optional line related to the error
     * @param criticalEventSection optional lines containing recent critical events.
     */
    public static File dumpStackTraces(ArrayList<Integer> firstPids,
            ProcessCpuTracker processCpuTracker, SparseArray<Boolean> lastPids,
            ArrayList<Integer> nativePids, StringWriter logExceptionCreatingFile,
            String subject) {
            String subject, String criticalEventSection) {
        return dumpStackTraces(firstPids, processCpuTracker, lastPids, nativePids,
                logExceptionCreatingFile, null, subject);
                logExceptionCreatingFile, null, subject, criticalEventSection);
    }
    /**
@@ -3225,7 +3227,7 @@ public class ActivityManagerService extends IActivityManager.Stub
    /* package */ static File dumpStackTraces(ArrayList<Integer> firstPids,
            ProcessCpuTracker processCpuTracker, SparseArray<Boolean> lastPids,
            ArrayList<Integer> nativePids, StringWriter logExceptionCreatingFile,
            long[] firstPidOffsets, String subject) {
            long[] firstPidOffsets, String subject, String criticalEventSection) {
        ArrayList<Integer> extraPids = null;
        Slog.i(TAG, "dumpStackTraces pids=" + lastPids + " nativepids=" + nativePids);
@@ -3277,12 +3279,17 @@ public class ActivityManagerService extends IActivityManager.Stub
            return null;
        }
        if (subject != null) {
        if (subject != null || criticalEventSection != null) {
            try (FileOutputStream fos = new FileOutputStream(tracesFile, true)) {
                String header = "Subject: " + subject + "\n";
                if (subject != null) {
                    String header = "Subject: " + subject + "\n\n";
                    fos.write(header.getBytes(StandardCharsets.UTF_8));
                }
                if (criticalEventSection != null) {
                    fos.write(criticalEventSection.getBytes(StandardCharsets.UTF_8));
                }
            } catch (IOException e) {
                Slog.w(TAG, "Exception writing subject to ANR dump file:", e);
                Slog.w(TAG, "Exception writing to ANR dump file:", e);
            }
        }
+328 −0
Original line number Diff line number Diff line
/*
 * Copyright (C) 2021 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.android.server.am;

import android.os.Handler;
import android.os.HandlerThread;
import android.util.Slog;

import com.android.framework.protobuf.nano.MessageNanoPrinter;
import com.android.internal.annotations.VisibleForTesting;
import com.android.internal.util.RingBuffer;
import com.android.server.am.nano.CriticalEventLogProto;
import com.android.server.am.nano.CriticalEventLogStorageProto;
import com.android.server.am.nano.CriticalEventProto;
import com.android.server.am.nano.CriticalEventProto.HalfWatchdog;
import com.android.server.am.nano.CriticalEventProto.Watchdog;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.time.Duration;
import java.util.Arrays;
import java.util.UUID;

/**
 * Log of recent critical events such as Watchdogs.
 *
 * For use in ANR reports to show recent events that may help to debug the ANR. In particular,
 * the presence of recent critical events signal that the device was already in a had state.
 *
 * This class needs to be thread safe since it's used as a singleton.
 */
public class CriticalEventLog {
    private static final String TAG = CriticalEventLog.class.getSimpleName();

    private static CriticalEventLog sInstance;

    /** Name of the file the log is saved to. */
    @VisibleForTesting
    static final String FILENAME = "critical_event_log.pb";

    /** Timestamp when the log was last saved (or attempted to be saved) to disk. */
    private long mLastSaveAttemptMs = 0;

    /** File the log is saved to. */
    private final File mLogFile;

    /** Ring buffer containing the log events. */
    private final ThreadSafeRingBuffer<CriticalEventProto> mEvents;

    /** Max age of events to include in the output log proto. */
    private final int mWindowMs;

    /** Minimum time between consecutive saves of the log to disk. */
    private final long mMinTimeBetweenSavesMs;

    /** Whether to load and save the log synchronously with no delay. Only set to true in tests. */
    private final boolean mLoadAndSaveImmediately;

    private final Handler mHandler;

    private final Runnable mSaveRunnable = this::saveLogToFileNow;

    @VisibleForTesting
    CriticalEventLog(String logDir, int capacity, int windowMs, long minTimeBetweenSavesMs,
            boolean loadAndSaveImmediately, ILogLoader logLoader) {
        mLogFile = Paths.get(logDir, FILENAME).toFile();
        mWindowMs = windowMs;
        mMinTimeBetweenSavesMs = minTimeBetweenSavesMs;
        mLoadAndSaveImmediately = loadAndSaveImmediately;

        mEvents = new ThreadSafeRingBuffer<>(CriticalEventProto.class, capacity);

        HandlerThread thread = new HandlerThread("CriticalEventLogIO");
        thread.start();
        mHandler = new Handler(thread.getLooper());

        final Runnable loadEvents = () -> logLoader.load(mLogFile, mEvents);
        if (!mLoadAndSaveImmediately) {
            mHandler.post(loadEvents);
        } else {
            loadEvents.run();
        }
    }

    /** Returns a new instance with production settings. */
    private CriticalEventLog() {
        this(
                /* logDir= */"/data/misc/critical-events",
                /* capacity= */ 20,
                /* windowMs= */ (int) Duration.ofMinutes(5).toMillis(),
                /* minTimeBetweenSavesMs= */ Duration.ofSeconds(2).toMillis(),
                /* loadAndSaveImmediately= */ false,
                new LogLoader());
    }

    /** Returns the singleton instance. */
    public static CriticalEventLog getInstance() {
        if (sInstance == null) {
            sInstance = new CriticalEventLog();
        }
        return sInstance;
    }

    /**
     * Ensures the singleton instance has been instantiated.
     *
     * Use this to eagerly instantiate the log (which loads the previous events from disk).
     * Otherwise this will occur lazily when the first event is logged, by which time the device may
     * be under load.
     */
    public static void init() {
        getInstance();
    }

    @VisibleForTesting
    protected long getWallTimeMillis() {
        return System.currentTimeMillis();
    }

    /** Logs a watchdog. */
    public void logWatchdog(String subject, UUID uuid) {
        Watchdog watchdog = new Watchdog();
        watchdog.subject = subject;
        watchdog.uuid = uuid.toString();
        CriticalEventProto event = new CriticalEventProto();
        event.setWatchdog(watchdog);
        log(event);
    }

    /** Logs a half-watchdog. */
    public void logHalfWatchdog(String subject) {
        HalfWatchdog halfWatchdog = new HalfWatchdog();
        halfWatchdog.subject = subject;
        CriticalEventProto event = new CriticalEventProto();
        event.setHalfWatchdog(halfWatchdog);
        log(event);
    }

    private void log(CriticalEventProto event) {
        event.timestampMs = getWallTimeMillis();
        mEvents.append(event);
        saveLogToFile();
    }

    /**
     * Returns recent critical events in text format to include in logs such as ANR files.
     *
     * Includes all events in the ring buffer with age less than or equal to {@code mWindowMs}.
     */
    public String logLinesForAnrFile() {
        return new StringBuilder()
                .append("--- CriticalEventLog ---\n")
                .append(MessageNanoPrinter.print(getRecentEvents()))
                .append('\n').toString();
    }

    /**
     * Returns a proto containing recent critical events.
     *
     * Includes all events in the ring buffer with age less than or equal to {@code mWindowMs}.
     */
    @VisibleForTesting
    protected CriticalEventLogProto getRecentEvents() {
        CriticalEventLogProto log = new CriticalEventLogProto();
        log.timestampMs = getWallTimeMillis();
        log.windowMs = mWindowMs;
        log.capacity = mEvents.capacity();
        log.events = recentEventsWithMinTimestamp(log.timestampMs - mWindowMs);

        return log;
    }

    /**
     * Returns the most recent logged events, starting with the first event that has a timestamp
     * greater than or equal to {@code minTimestampMs}.
     *
     * If no events have a timestamp greater than or equal to {@code minTimestampMs}, returns an
     * empty array.
     */
    private CriticalEventProto[] recentEventsWithMinTimestamp(long minTimestampMs) {
        // allEvents are in insertion order, i.e. in order of when the relevant log___() function
        // was called.
        // This means that if the system clock changed (e.g. a NITZ update) allEvents may not be
        // strictly ordered by timestamp. In this case we are permissive and start with the
        // first event that has a timestamp in the desired range.
        CriticalEventProto[] allEvents = mEvents.toArray();
        for (int i = 0; i < allEvents.length; i++) {
            if (allEvents[i].timestampMs >= minTimestampMs) {
                return Arrays.copyOfRange(allEvents, i, allEvents.length);
            }
        }
        return new CriticalEventProto[]{};
    }

    private void saveLogToFile() {
        if (mLoadAndSaveImmediately) {
            saveLogToFileNow();
            return;
        }
        if (mHandler.hasCallbacks(mSaveRunnable)) {
            // An earlier save is already scheduled so don't need to schedule an additional one.
            return;
        }

        if (!mHandler.postDelayed(mSaveRunnable, saveDelayMs())) {
            Slog.w(TAG, "Error scheduling save");
        }
    }

    /**
     * Returns the delay in milliseconds when scheduling a save on the handler thread.
     *
     * Returns a value in the range [0, {@code minTimeBetweenSavesMs}] such that the time between
     * consecutive saves does not exceed {@code minTimeBetweenSavesMs}.
     *
     * This means that if the last save occurred a long time ago, or if no previous saves
     * have occurred then the returned delay will be zero.
     */
    @VisibleForTesting
    protected long saveDelayMs() {
        final long nowMs = getWallTimeMillis();
        return Math.max(0,
                mLastSaveAttemptMs + mMinTimeBetweenSavesMs - nowMs);
    }

    @VisibleForTesting
    protected void saveLogToFileNow() {
        mLastSaveAttemptMs = getWallTimeMillis();

        File logDir = mLogFile.getParentFile();
        if (!logDir.exists()) {
            if (!logDir.mkdir()) {
                Slog.e(TAG, "Error creating log directory: " + logDir.getPath());
                return;
            }
        }

        if (!mLogFile.exists()) {
            try {
                mLogFile.createNewFile();
            } catch (IOException e) {
                Slog.e(TAG, "Error creating log file", e);
                return;
            }
        }

        CriticalEventLogStorageProto logProto = new CriticalEventLogStorageProto();
        logProto.events = mEvents.toArray();

        final byte[] bytes = CriticalEventLogStorageProto.toByteArray(logProto);
        try (FileOutputStream stream = new FileOutputStream(mLogFile, false)) {
            stream.write(bytes);
        } catch (IOException e) {
            Slog.e(TAG, "Error saving log to disk.", e);
        }
    }

    @VisibleForTesting
    protected static class ThreadSafeRingBuffer<T> {
        private final int mCapacity;
        private final RingBuffer<T> mBuffer;

        ThreadSafeRingBuffer(Class<T> clazz, int capacity) {
            this.mCapacity = capacity;
            this.mBuffer = new RingBuffer<>(clazz, capacity);
        }

        synchronized void append(T t) {
            mBuffer.append(t);
        }

        synchronized T[] toArray() {
            return mBuffer.toArray();
        }

        int capacity() {
            return mCapacity;
        }
    }

    /** Loads log events from disk into a ring buffer. */
    protected interface ILogLoader {
        void load(File logFile, ThreadSafeRingBuffer<CriticalEventProto> buffer);
    }

    /** Loads log events from disk into a ring buffer. */
    static class LogLoader implements ILogLoader {
        @Override
        public void load(File logFile,
                ThreadSafeRingBuffer<CriticalEventProto> buffer) {
            for (CriticalEventProto event : loadLogFromFile(logFile).events) {
                buffer.append(event);
            }
        }

        private static CriticalEventLogStorageProto loadLogFromFile(File logFile) {
            if (!logFile.exists()) {
                Slog.i(TAG, "No log found, returning empty log proto.");
                return new CriticalEventLogStorageProto();
            }

            try {
                return CriticalEventLogStorageProto.parseFrom(
                        Files.readAllBytes(logFile.toPath()));
            } catch (IOException e) {
                Slog.e(TAG, "Error reading log from disk.", e);
                return new CriticalEventLogStorageProto();
            }
        }
    }
}
Loading