Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 21a7679c authored by Luke Huang's avatar Luke Huang
Browse files

Subsampling DNS events to reduce logs

Currently, each device generates about 4000 queries per day.
Reduce the number of events by subsampling events based on
how interesting they are:
- if return_code == 0 -> log 1 in 100 events (random sampling)
- if return_code == EAI_NODATA -> log 1 in 10 events (random sampling)
- else -> log 100%
Also allow an experiment flag to override the subsampling denominators.

Example for dumpsys dnsresolver:
  NetId: 100
    DnsEvent subsampling map: default:1 0:100 7:10

Bug: 119862317
Test: manual test using setprop to change the subsampling map
      atest

Merged-In: Ibbe3c653bdf45fc03b56a61e3401fad11c05a010
(cherry picked from commit 79e5185780a94db0b2356edf987239da459433f4)

Change-Id: I170dda9247e94a17a7cac9e7e414d592bb4dccbf
parent 5d4bfe50
Loading
Loading
Loading
Loading
+3 −0
Original line number Diff line number Diff line
@@ -205,14 +205,17 @@ cc_test {
    srcs: [
        "dns_tls_test.cpp",
        "libnetd_resolv_test.cpp",
        "res_cache_test.cpp",
    ],
    shared_libs: [
        "libbase",
        "libcrypto",
        "libcutils",
        "liblog",
        "libssl",
    ],
    static_libs: [
        "libgmock",
        "libnetd_resolv",
        "libnetd_test_dnsresponder",
        "libnetdutils",
+20 −9
Original line number Diff line number Diff line
@@ -32,7 +32,6 @@
#define LOG_TAG "DnsProxyListener"

#include <algorithm>
#include <list>
#include <vector>

#include <android-base/stringprintf.h>
@@ -57,6 +56,7 @@
#include "gethnamaddr.h"
#include "netd_resolv/stats.h"  // RCODE_TIMEOUT
#include "res_send.h"
#include "resolv_cache.h"
#include "resolv_private.h"
#include "stats.pb.h"

@@ -305,17 +305,28 @@ void initDnsEvent(NetworkDnsEventReported* event) {
    event->set_res_nsend_flags(-1);
}

// Applies random subsampling to a DNS event on |netid| that ended with |returnCode|.
//
// The denominator N is looked up via resolv_cache_get_subsampling_denom(); the event is
// then kept with a chance of 1 / N.
//
// Returns N if the event should be logged, or 0 if it should be dropped (which is always
// the case when the looked-up denominator is itself 0).
uint32_t getDnsEventSubsamplingRate(int netid, int returnCode) {
    const uint32_t denom = resolv_cache_get_subsampling_denom(netid, returnCode);
    if (denom == 0) {
        // Nothing to sample: skip the random draw entirely.
        return 0;
    }
    // arc4random_uniform(denom) is uniform over [0, denom), so 0 comes up once in denom draws.
    const bool keepEvent = arc4random_uniform(denom) == 0;
    return keepEvent ? denom : 0;
}

void reportDnsEvent(int eventType, const android_net_context& netContext, int latencyUs,
                    int returnCode, NetworkDnsEventReported& event, const std::string& query_name,
                    const std::vector<std::string>& ip_addrs = {}, int total_ip_addr_count = 0) {
    if (uint32_t rate = getDnsEventSubsamplingRate(netContext.dns_netid, returnCode)) {
        const std::string& dnsQueryStats = event.dns_query_events().SerializeAsString();
        stats::BytesField dnsQueryBytesField{dnsQueryStats.c_str(), dnsQueryStats.size()};
        event.set_return_code(static_cast<ReturnCode>(returnCode));
    android::net::stats::stats_write(
            android::net::stats::NETWORK_DNS_EVENT_REPORTED, event.event_type(),
            event.return_code(), event.latency_micros(), event.hints_ai_flags(),
            event.res_nsend_flags(), event.network_type(), event.private_dns_modes(),
            dnsQueryBytesField, event.sampling_rate_denom());
        android::net::stats::stats_write(android::net::stats::NETWORK_DNS_EVENT_REPORTED,
                                         event.event_type(), event.return_code(),
                                         event.latency_micros(), event.hints_ai_flags(),
                                         event.res_nsend_flags(), event.network_type(),
                                         event.private_dns_modes(), dnsQueryBytesField, rate);
    }

    const auto& listeners = ResolverEventReporter::getInstance().getListeners();
    if (listeners.size() == 0) {
+2 −0
Original line number Diff line number Diff line
@@ -319,6 +319,8 @@ void ResolverController::dump(DumpWriter& dw, unsigned netId) {
        if (servers.empty()) {
            dw.println("No DNS servers defined");
        } else {
            dw.println("DnsEvent subsampling map: " +
                       android::base::Join(resolv_cache_dump_subsampling_map(netId), ' '));
            dw.println(
                    "DNS servers: # IP (total, successes, errors, timeouts, internal errors, "
                    "RTT avg, last sample)");
+88 −0
Original line number Diff line number Diff line
@@ -36,7 +36,11 @@
#include <stdlib.h>
#include <string.h>
#include <time.h>

#include <mutex>
#include <string>
#include <unordered_map>
#include <vector>

#include <arpa/inet.h>
#include <arpa/nameser.h>
@@ -47,6 +51,8 @@

#include <android-base/logging.h>
#include <android-base/parseint.h>
#include <android-base/stringprintf.h>
#include <android-base/strings.h>
#include <android-base/thread_annotations.h>
#include <android/multinetwork.h>  // ResNsendFlags

@@ -1146,6 +1152,8 @@ struct resolv_cache_info {
    char defdname[MAXDNSRCHPATH];
    int dnsrch_offset[MAXDNSRCH + 1];  // offsets into defdname
    int wait_for_pending_req_timeout_count;
    // Map format: ReturnCode:rate_denom
    std::unordered_map<int, uint32_t> dns_event_subsampling_map;
};

// A helper class for the Clang Thread Safety Analysis to deal with
@@ -1604,6 +1612,49 @@ bool resolv_has_nameservers(unsigned netid) {
    return (info != nullptr) && (info->nscount > 0);
}

namespace {

// Map format: ReturnCode:rate_denom
// if the ReturnCode is not associated with any rate_denom, use default
// Sampling rate varies by return code; events to log are chosen randomly, with a
// probability proportional to the sampling rate.
constexpr const char DEFAULT_SUBSAMPLING_MAP[] = "default:1 0:100 7:10";

std::unordered_map<int, uint32_t> resolv_get_dns_event_subsampling_map() {
    using android::base::ParseInt;
    using android::base::ParseUint;
    using android::base::Split;
    using server_configurable_flags::GetServerConfigurableFlag;
    std::unordered_map<int, uint32_t> sampling_rate_map{};
    std::vector<std::string> subsampling_vector =
            Split(GetServerConfigurableFlag("netd_native", "dns_event_subsample_map",
                                            DEFAULT_SUBSAMPLING_MAP),
                  " ");
    for (const auto& pair : subsampling_vector) {
        std::vector<std::string> rate_denom = Split(pair, ":");
        int return_code;
        uint32_t denom;
        if (rate_denom.size() != 2) {
            LOG(ERROR) << __func__ << ": invalid subsampling_pair = " << pair;
            continue;
        }
        if (rate_denom[0] == "default") {
            return_code = DNSEVENT_SUBSAMPLING_MAP_DEFAULT_KEY;
        } else if (!ParseInt(rate_denom[0], &return_code)) {
            LOG(ERROR) << __func__ << ": parse subsampling_pair failed = " << pair;
            continue;
        }
        if (!ParseUint(rate_denom[1], &denom)) {
            LOG(ERROR) << __func__ << ": parse subsampling_pair failed = " << pair;
            continue;
        }
        sampling_rate_map[return_code] = denom;
    }
    return sampling_rate_map;
}

}  // namespace

static int resolv_create_cache_for_net_locked(unsigned netid) {
    resolv_cache* cache = find_named_cache_locked(netid);
    // Should not happen
@@ -1621,6 +1672,7 @@ static int resolv_create_cache_for_net_locked(unsigned netid) {
    }
    cache_info->cache = cache;
    cache_info->netid = netid;
    cache_info->dns_event_subsampling_map = resolv_get_dns_event_subsampling_map();
    insert_cache_info_locked(cache_info);

    return 0;
@@ -2000,6 +2052,42 @@ int android_net_res_stats_get_info_for_net(unsigned netid, int* nscount,
    return revision_id;
}

std::vector<std::string> resolv_cache_dump_subsampling_map(unsigned netid) {
    using android::base::StringPrintf;
    std::lock_guard guard(cache_mutex);
    resolv_cache_info* cache_info = find_cache_info_locked(netid);
    if (cache_info == nullptr) return {};
    std::vector<std::string> result;
    for (const auto& pair : cache_info->dns_event_subsampling_map) {
        result.push_back(StringPrintf("%s:%d",
                                      (pair.first == DNSEVENT_SUBSAMPLING_MAP_DEFAULT_KEY)
                                              ? "default"
                                              : std::to_string(pair.first).c_str(),
                                      pair.second));
    }
    return result;
}

// Decides whether an event should be sampled using a random number generator and
// a sampling factor derived from the netid and the return code.
//
// Returns the subsampling rate if the event should be sampled, or 0 if it should be discarded.
uint32_t resolv_cache_get_subsampling_denom(unsigned netid, int return_code) {
    std::lock_guard guard(cache_mutex);
    resolv_cache_info* cache_info = find_cache_info_locked(netid);
    if (cache_info == nullptr) return 0;  // Don't log anything at all.
    const auto& subsampling_map = cache_info->dns_event_subsampling_map;
    auto search_returnCode = subsampling_map.find(return_code);
    uint32_t denom;
    if (search_returnCode != subsampling_map.end()) {
        denom = search_returnCode->second;
    } else {
        auto search_default = subsampling_map.find(DNSEVENT_SUBSAMPLING_MAP_DEFAULT_KEY);
        denom = (search_default == subsampling_map.end()) ? 0 : search_default->second;
    }
    return denom;
}

int resolv_cache_get_resolver_stats(unsigned netid, res_params* params, res_stats stats[MAXNS]) {
    std::lock_guard guard(cache_mutex);
    resolv_cache_info* info = find_cache_info_locked(netid);

res_cache_test.cpp

0 → 100644
+95 −0
Original line number Diff line number Diff line
/*
 * Copyright (C) 2019 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <netdb.h>

#include <cutils/properties.h>
#include <gmock/gmock-matchers.h>
#include <gtest/gtest.h>

#include "netd_resolv/stats.h"
#include "resolv_cache.h"

constexpr int TEST_NETID = 30;

namespace {

// Return code 0; <netdb.h> only names the non-zero EAI_* codes.
constexpr int EAI_OK = 0;
// System property that the tests set to override the DnsEvent subsampling map.
constexpr char DNS_EVENT_SUBSAMPLING_MAP_FLAG[] =
        "persist.device_config.netd_native.dns_event_subsample_map";

// RAII helper for tests: overrides a system property and creates the resolver cache for a
// network on construction; deletes the cache and restores the previous property value on
// destruction.
class ScopedCacheCreate {
  public:
    // Saves the current value of |property|, sets it to |subsampling_map|, then creates the
    // cache for |netid| (expecting success).
    // |property| is stored as a raw pointer, so it must outlive this object.
    explicit ScopedCacheCreate(unsigned netid, const char* subsampling_map,
                               const char* property = DNS_EVENT_SUBSAMPLING_MAP_FLAG)
        : mStoredNetId(netid), mStoredProperty(property) {
        property_get(property, mStoredMap, "");
        property_set(property, subsampling_map);
        EXPECT_EQ(0, resolv_create_cache_for_net(netid));
    }
    ~ScopedCacheCreate() {
        resolv_delete_cache_for_net(mStoredNetId);
        property_set(mStoredProperty, mStoredMap);
    }

    // Not copyable: a copy's destructor would delete the cache and restore the property a
    // second time.
    ScopedCacheCreate(const ScopedCacheCreate&) = delete;
    ScopedCacheCreate& operator=(const ScopedCacheCreate&) = delete;

  private:
    unsigned mStoredNetId;
    const char* mStoredProperty;
    // Property value captured before the override; written back by the destructor.
    char mStoredMap[PROPERTY_VALUE_MAX]{};
};

}  // namespace

// Exercises parsing of the subsampling-map flag and the per-return-code lookup, for the
// built-in default, a custom map, an unparsable map, and a map with negative/zero denoms.
TEST(ResolvCacheTest, DnsEventSubsampling) {
    // Shorthands bound to TEST_NETID.
    const auto denomFor = [](int returnCode) {
        return resolv_cache_get_subsampling_denom(TEST_NETID, returnCode);
    };
    const auto dumpMap = [] { return resolv_cache_dump_subsampling_map(TEST_NETID); };

    // With no experiment flag set, the built-in map "default:1 0:100 7:10" applies.
    {
        ScopedCacheCreate cacheGuard(TEST_NETID, "");
        EXPECT_EQ(denomFor(EAI_NODATA), 10U);
        EXPECT_EQ(denomFor(EAI_OK), 100U);
        EXPECT_EQ(denomFor(EAI_BADFLAGS), 1U);  // falls back to "default"
        EXPECT_THAT(dumpMap(),
                    testing::UnorderedElementsAreArray({"default:1", "0:100", "7:10"}));
    }
    // Experiment flag set to "0:42 default:666".
    {
        ScopedCacheCreate cacheGuard(TEST_NETID, "0:42 default:666");
        EXPECT_EQ(denomFor(EAI_OK), 42U);
        EXPECT_EQ(denomFor(EAI_NODATA), 666U);  // falls back to "default"
        EXPECT_THAT(dumpMap(), testing::UnorderedElementsAreArray({"default:666", "0:42"}));
    }
    // Experiment flag set to something that cannot be parsed at all.
    {
        ScopedCacheCreate cacheGuard(TEST_NETID, "asvaxx");
        // With no valid pair the map stays empty, so every lookup yields 0 (logging disabled).
        EXPECT_EQ(denomFor(EAI_OK), 0U);
        EXPECT_EQ(denomFor(EAI_NODATA), 0U);
        EXPECT_TRUE(dumpMap().empty());
    }
    // Negative denoms are rejected; a zero denom is accepted and stored.
    {
        ScopedCacheCreate cacheGuard(TEST_NETID, "0:-42 default:-666 7:10 10:0");
        // Codes whose pair was rejected, with no valid "default" either, resolve to 0.
        EXPECT_EQ(denomFor(EAI_OK), 0U);
        EXPECT_EQ(denomFor(EAI_BADFLAGS), 0U);
        EXPECT_EQ(denomFor(EAI_NODATA), 10U);
        EXPECT_EQ(denomFor(EAI_SOCKTYPE), 0U);
        EXPECT_THAT(dumpMap(), testing::UnorderedElementsAreArray({"7:10", "10:0"}));
    }
}
Loading