Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 0f5d6618 authored by Yao Chen's avatar Yao Chen
Browse files

Add more accurate logging for stats log loss.

+ Only record eventual failure after the retries to get a accurate log loss count.
+ Record all types of failures which lead to log loss
+ Change the timestamp from elapsedRealtime to wallclock time for easier debugging server side
+ Also log the count too.

Bug: 80538532
Test: manually tested

Change-Id: I4fcccae3fa39c9e280a842e27c6432bb0a090b85
parent 40613508
Loading
Loading
Loading
Loading
+12 −32
Original line number Diff line number Diff line
@@ -50,7 +50,7 @@ const int FIELD_ID_ANOMALY_ALARM_STATS = 9;
// const int FIELD_ID_PULLED_ATOM_STATS = 10; // The proto is written in stats_log_util.cpp
const int FIELD_ID_LOGGER_ERROR_STATS = 11;
const int FIELD_ID_PERIODIC_ALARM_STATS = 12;
const int FIELD_ID_LOG_LOSS_STATS = 14;
// const int FIELD_ID_LOG_LOSS_STATS = 14;
const int FIELD_ID_SYSTEM_SERVER_RESTART = 15;

const int FIELD_ID_ATOM_STATS_TAG = 1;
@@ -180,12 +180,12 @@ void StatsdStats::noteConfigReset(const ConfigKey& key) {
    noteConfigResetInternalLocked(key);
}

void StatsdStats::noteLogLost(int64_t timestampNs, int32_t count) {
void StatsdStats::noteLogLost(int32_t wallClockTimeSec, int32_t count) {
    lock_guard<std::mutex> lock(mLock);
    if (mLogLossTimestampNs.size() == kMaxLoggerErrors) {
        mLogLossTimestampNs.pop_front();
    if (mLogLossStats.size() == kMaxLoggerErrors) {
        mLogLossStats.pop_front();
    }
    mLogLossTimestampNs.push_back(std::make_pair(timestampNs, count));
    mLogLossStats.push_back(std::make_pair(wallClockTimeSec, count));
}

void StatsdStats::noteBroadcastSent(const ConfigKey& key) {
@@ -365,15 +365,6 @@ void StatsdStats::noteSystemServerRestart(int32_t timeSec) {
    mSystemServerRestartSec.push_back(timeSec);
}

void StatsdStats::noteLoggerError(int error) {
    lock_guard<std::mutex> lock(mLock);
    // grows strictly one at a time. so it won't > kMaxLoggerErrors
    if (mLoggerErrors.size() == kMaxLoggerErrors) {
        mLoggerErrors.pop_front();
    }
    mLoggerErrors.push_back(std::make_pair(getWallClockSec(), error));
}

void StatsdStats::reset() {
    lock_guard<std::mutex> lock(mLock);
    resetInternalLocked();
@@ -386,9 +377,8 @@ void StatsdStats::resetInternalLocked() {
    std::fill(mPushedAtomStats.begin(), mPushedAtomStats.end(), 0);
    mAnomalyAlarmRegisteredStats = 0;
    mPeriodicAlarmRegisteredStats = 0;
    mLoggerErrors.clear();
    mSystemServerRestartSec.clear();
    mLogLossTimestampNs.clear();
    mLogLossStats.clear();
    for (auto& config : mConfigStats) {
        config.second->broadcast_sent_time_sec.clear();
        config.second->data_drop_time_sec.clear();
@@ -515,21 +505,13 @@ void StatsdStats::dumpStats(FILE* out) const {
            mUidMapStats.bytes_used, mUidMapStats.changes, mUidMapStats.deleted_apps,
            mUidMapStats.dropped_changes);

    for (const auto& error : mLoggerErrors) {
        time_t error_time = error.first;
        struct tm* error_tm = localtime(&error_time);
        char buffer[80];
        strftime(buffer, sizeof(buffer), "%Y-%m-%d %I:%M%p\n", error_tm);
        fprintf(out, "Logger error %d at %s\n", error.second, buffer);
    }

    for (const auto& restart : mSystemServerRestartSec) {
        fprintf(out, "System server restarts at %s(%lld)\n",
            buildTimeString(restart).c_str(), (long long)restart);
    }

    for (const auto& loss : mLogLossTimestampNs) {
        fprintf(out, "Log loss: %lld (elapsedRealtimeNs) - %d (count)\n", (long long)loss.first,
    for (const auto& loss : mLogLossStats) {
        fprintf(out, "Log loss: %lld (wall clock sec) - %d (count)\n", (long long)loss.first,
                loss.second);
    }
}
@@ -678,7 +660,10 @@ void StatsdStats::dumpStats(std::vector<uint8_t>* output, bool reset) {
    proto.write(FIELD_TYPE_INT32 | FIELD_ID_UID_MAP_DELETED_APPS, mUidMapStats.deleted_apps);
    proto.end(uidMapToken);

    for (const auto& error : mLoggerErrors) {
    for (const auto& error : mLogLossStats) {
        // The logger error stats are not used anymore since we move away from logd.
        // Temporarily use this field to log the log loss timestamp and count
        // TODO(b/80538532) Add a dedicated field in stats_log for this.
        uint64_t token = proto.start(FIELD_TYPE_MESSAGE | FIELD_ID_LOGGER_ERROR_STATS |
                                      FIELD_COUNT_REPEATED);
        proto.write(FIELD_TYPE_INT32 | FIELD_ID_LOGGER_STATS_TIME, error.first);
@@ -686,11 +671,6 @@ void StatsdStats::dumpStats(std::vector<uint8_t>* output, bool reset) {
        proto.end(token);
    }

    for (const auto& loss : mLogLossTimestampNs) {
        proto.write(FIELD_TYPE_INT64 | FIELD_ID_LOG_LOSS_STATS | FIELD_COUNT_REPEATED,
                    (long long)loss.first);
    }

    for (const auto& restart : mSystemServerRestartSec) {
        proto.write(FIELD_TYPE_INT32 | FIELD_ID_SYSTEM_SERVER_RESTART | FIELD_COUNT_REPEATED,
                    restart);
+2 −10
Original line number Diff line number Diff line
@@ -271,11 +271,6 @@ public:
    // Notify pull request for an atom served from cached data
    void notePullFromCache(int pullAtomId);

    /**
     * Records statsd met an error while reading from logd.
     */
    void noteLoggerError(int error);

    /*
    * Records when system server restarts.
    */
@@ -284,7 +279,7 @@ public:
    /**
     * Records statsd skipped an event.
     */
    void noteLogLost(int64_t timestamp, int32_t count);
    void noteLogLost(int32_t wallClockTimeSec, int32_t count);

    /**
     * Reset the historical stats. Including all stats in icebox, and the tracked stats about
@@ -338,11 +333,8 @@ private:
    // Maps PullAtomId to its stats. The size is capped by the puller atom counts.
    std::map<int, PulledAtomStats> mPulledAtomStats;

    // Logd errors. Size capped by kMaxLoggerErrors.
    std::list<const std::pair<int, int>> mLoggerErrors;

    // Timestamps when we detect log loss, and the number of logs lost.
    std::list<std::pair<int64_t, int32_t>> mLogLossTimestampNs;
    std::list<std::pair<int32_t, int32_t>> mLogLossStats;

    std::list<int32_t> mSystemServerRestartSec;

+2 −1
Original line number Diff line number Diff line
@@ -111,7 +111,8 @@ bool StatsSocketListener::onDataAvailable(SocketClient* cli) {
        android_log_event_int_t* int_event = reinterpret_cast<android_log_event_int_t*>(ptr);
        if (int_event->header.tag == kLibLogTag && int_event->payload.type == EVENT_TYPE_INT) {
            ALOGE("Found dropped events: %d", int_event->payload.data);
            StatsdStats::getInstance().noteLogLost(getElapsedRealtimeNs(), int_event->payload.data);
            StatsdStats::getInstance().noteLogLost((int32_t)getWallClockSec(),
                                                   int_event->payload.data);
            return true;
        }
    }
+9 −4
Original line number Diff line number Diff line
@@ -349,8 +349,7 @@ static int write_stats_log_cpp(FILE *out, const Atoms &atoms,
           argIndex++;
       }
       fprintf(out, ");\n");
       fprintf(out, "      if (ret >= 0) { return retry; }\n");

       fprintf(out, "      if (ret >= 0) { break; }\n");

       fprintf(out, "      {\n");
       fprintf(out, "          std::lock_guard<std::mutex> lock(mLogdRetryMutex);\n");
@@ -360,6 +359,9 @@ static int write_stats_log_cpp(FILE *out, const Atoms &atoms,
       fprintf(out, "      }\n");
       fprintf(out, "      std::this_thread::sleep_for(std::chrono::milliseconds(10));\n");
       fprintf(out, "  }\n");
       fprintf(out, "  if (ret < 0) {\n");
       fprintf(out, "      note_log_drop();\n");
       fprintf(out, "  }\n");
       fprintf(out, "  return ret;\n");
       fprintf(out, "}\n");
       fprintf(out, "\n");
@@ -439,7 +441,7 @@ static int write_stats_log_cpp(FILE *out, const Atoms &atoms,
           argIndex++;
       }
       fprintf(out, ");\n");
       fprintf(out, "      if (ret >= 0) { return retry; }\n");
       fprintf(out, "      if (ret >= 0) { break; }\n");

       fprintf(out, "      {\n");
       fprintf(out, "          std::lock_guard<std::mutex> lock(mLogdRetryMutex);\n");
@@ -450,7 +452,10 @@ static int write_stats_log_cpp(FILE *out, const Atoms &atoms,

       fprintf(out, "      std::this_thread::sleep_for(std::chrono::milliseconds(10));\n");
       fprintf(out, "  }\n");
       fprintf(out, "  return ret;\n");
       fprintf(out, "  if (ret < 0) {\n");
       fprintf(out, "      note_log_drop();\n");
       fprintf(out, "  }\n");
       fprintf(out, "  return ret;\n\n");
       fprintf(out, "}\n");

       fprintf(out, "\n");