Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit e126fbe7 authored by Luke Huang's avatar Luke Huang
Browse files

Subsampling DNS events to reduce logs

Currently, each device generates about 4000 queries per day.
Reduce the number of events by subsampling events based on
how interesting they are:
- if return_code == 0 -> log 1 in 100 events (random sampling)
- if return_code == EAI_NODATA -> log 1 in 10 events (random sampling)
- else -> log 100%
Also allow to use experiment flag to update sub-sampling denom.

Example for dumpsys dnsresolver:
  NetId: 100
    DnsEvent subsampling map: default:1 0:100 7:10

Bug: 119862317
Test: manual test with using setprop to change subsampling map
      atest
Change-Id: Ie0a433a91f629dd32b1d0787a5b45926415faab0
parent 66ec4ce2
Loading
Loading
Loading
Loading
+2 −0
Original line number Diff line number Diff line
@@ -203,9 +203,11 @@ cc_test {
    shared_libs: [
        "libbase",
        "libcrypto",
        "libcutils",
        "libssl",
    ],
    static_libs: [
        "libgmock",
        "libnetd_resolv",
        "libnetd_test_dnsresponder",
        "libnetdutils",
+20 −9
Original line number Diff line number Diff line
@@ -32,7 +32,6 @@
#define LOG_TAG "resolv"

#include <algorithm>
#include <list>
#include <vector>

#include <NetdClient.h>  // NETID_USE_LOCAL_NAMESERVERS
@@ -57,6 +56,7 @@
#include "gethnamaddr.h"
#include "netd_resolv/stats.h"  // RCODE_TIMEOUT
#include "res_send.h"
#include "resolv_cache.h"
#include "resolv_private.h"
#include "stats.pb.h"

@@ -305,17 +305,28 @@ void initDnsEvent(NetworkDnsEventReported* event) {
    event->set_res_nsend_flags(-1);
}

// Return 0 if the event should not be logged.
// Otherwise, return subsampling_denom
uint32_t getDnsEventSubsamplingRate(int netid, int returnCode) {
    uint32_t subsampling_denom = resolv_cache_get_subsampling_denom(netid, returnCode);
    if (subsampling_denom == 0) return 0;
    // Sample the event with a chance of 1 / denom.
    return (arc4random_uniform(subsampling_denom) == 0) ? subsampling_denom : 0;
}

void reportDnsEvent(int eventType, const android_net_context& netContext, int latencyUs,
                    int returnCode, NetworkDnsEventReported& event, const std::string& query_name,
                    const std::vector<std::string>& ip_addrs = {}, int total_ip_addr_count = 0) {
    if (uint32_t rate = getDnsEventSubsamplingRate(netContext.dns_netid, returnCode)) {
        const std::string& dnsQueryStats = event.dns_query_events().SerializeAsString();
        stats::BytesField dnsQueryBytesField{dnsQueryStats.c_str(), dnsQueryStats.size()};
        event.set_return_code(static_cast<ReturnCode>(returnCode));
    android::net::stats::stats_write(
            android::net::stats::NETWORK_DNS_EVENT_REPORTED, event.event_type(),
            event.return_code(), event.latency_micros(), event.hints_ai_flags(),
            event.res_nsend_flags(), event.network_type(), event.private_dns_modes(),
            dnsQueryBytesField, event.sampling_rate_denom());
        android::net::stats::stats_write(android::net::stats::NETWORK_DNS_EVENT_REPORTED,
                                         event.event_type(), event.return_code(),
                                         event.latency_micros(), event.hints_ai_flags(),
                                         event.res_nsend_flags(), event.network_type(),
                                         event.private_dns_modes(), dnsQueryBytesField, rate);
    }

    const auto& listeners = ResolverEventReporter::getInstance().getListeners();
    if (listeners.size() == 0) {
+2 −0
Original line number Diff line number Diff line
@@ -306,6 +306,8 @@ void ResolverController::dump(DumpWriter& dw, unsigned netId) {
        if (servers.empty()) {
            dw.println("No DNS servers defined");
        } else {
            dw.println("DnsEvent subsampling map: " +
                       android::base::Join(resolv_cache_dump_subsampling_map(netId), ' '));
            dw.println(
                    "DNS servers: # IP (total, successes, errors, timeouts, internal errors, "
                    "RTT avg, last sample)");
+84 −0
Original line number Diff line number Diff line
@@ -39,6 +39,7 @@
#include <mutex>
#include <set>
#include <string>
#include <unordered_map>
#include <vector>

#include <arpa/inet.h>
@@ -50,6 +51,7 @@

#include <android-base/logging.h>
#include <android-base/parseint.h>
#include <android-base/stringprintf.h>
#include <android-base/strings.h>
#include <android-base/thread_annotations.h>
#include <android/multinetwork.h>  // ResNsendFlags
@@ -1148,6 +1150,8 @@ struct resolv_cache_info {
    struct res_stats nsstats[MAXNS];
    std::vector<std::string> search_domains;
    int wait_for_pending_req_timeout_count;
    // Map format: ReturnCode:rate_denom
    std::unordered_map<int, uint32_t> dns_event_subsampling_map;
};

// A helper class for the Clang Thread Safety Analysis to deal with
@@ -1606,6 +1610,49 @@ bool resolv_has_nameservers(unsigned netid) {
    return (info != nullptr) && (info->nscount > 0);
}

namespace {

// Map format: ReturnCode:rate_denom
// if the ReturnCode is not associated with any rate_denom, use default
// Sampling rate varies by return code; events to log are chosen randomly, with a
// probability proportional to the sampling rate.
constexpr const char DEFAULT_SUBSAMPLING_MAP[] = "default:1 0:100 7:10";

std::unordered_map<int, uint32_t> resolv_get_dns_event_subsampling_map() {
    using android::base::ParseInt;
    using android::base::ParseUint;
    using android::base::Split;
    using server_configurable_flags::GetServerConfigurableFlag;
    std::unordered_map<int, uint32_t> sampling_rate_map{};
    std::vector<std::string> subsampling_vector =
            Split(GetServerConfigurableFlag("netd_native", "dns_event_subsample_map",
                                            DEFAULT_SUBSAMPLING_MAP),
                  " ");
    for (const auto& pair : subsampling_vector) {
        std::vector<std::string> rate_denom = Split(pair, ":");
        int return_code;
        uint32_t denom;
        if (rate_denom.size() != 2) {
            LOG(ERROR) << __func__ << ": invalid subsampling_pair = " << pair;
            continue;
        }
        if (rate_denom[0] == "default") {
            return_code = DNSEVENT_SUBSAMPLING_MAP_DEFAULT_KEY;
        } else if (!ParseInt(rate_denom[0], &return_code)) {
            LOG(ERROR) << __func__ << ": parse subsampling_pair failed = " << pair;
            continue;
        }
        if (!ParseUint(rate_denom[1], &denom)) {
            LOG(ERROR) << __func__ << ": parse subsampling_pair failed = " << pair;
            continue;
        }
        sampling_rate_map[return_code] = denom;
    }
    return sampling_rate_map;
}

}  // namespace

static int resolv_create_cache_for_net_locked(unsigned netid) {
    resolv_cache* cache = find_named_cache_locked(netid);
    // Should not happen
@@ -1623,6 +1670,7 @@ static int resolv_create_cache_for_net_locked(unsigned netid) {
    }
    cache_info->cache = cache;
    cache_info->netid = netid;
    cache_info->dns_event_subsampling_map = resolv_get_dns_event_subsampling_map();
    insert_cache_info_locked(cache_info);

    return 0;
@@ -1963,6 +2011,42 @@ int android_net_res_stats_get_info_for_net(unsigned netid, int* nscount,
    return revision_id;
}

std::vector<std::string> resolv_cache_dump_subsampling_map(unsigned netid) {
    using android::base::StringPrintf;
    std::lock_guard guard(cache_mutex);
    resolv_cache_info* cache_info = find_cache_info_locked(netid);
    if (cache_info == nullptr) return {};
    std::vector<std::string> result;
    for (const auto& pair : cache_info->dns_event_subsampling_map) {
        result.push_back(StringPrintf("%s:%d",
                                      (pair.first == DNSEVENT_SUBSAMPLING_MAP_DEFAULT_KEY)
                                              ? "default"
                                              : std::to_string(pair.first).c_str(),
                                      pair.second));
    }
    return result;
}

// Decides whether an event should be sampled using a random number generator and
// a sampling factor derived from the netid and the return code.
//
// Returns the subsampling rate if the event should be sampled, or 0 if it should be discarded.
uint32_t resolv_cache_get_subsampling_denom(unsigned netid, int return_code) {
    std::lock_guard guard(cache_mutex);
    resolv_cache_info* cache_info = find_cache_info_locked(netid);
    if (cache_info == nullptr) return 0;  // Don't log anything at all.
    const auto& subsampling_map = cache_info->dns_event_subsampling_map;
    auto search_returnCode = subsampling_map.find(return_code);
    uint32_t denom;
    if (search_returnCode != subsampling_map.end()) {
        denom = search_returnCode->second;
    } else {
        auto search_default = subsampling_map.find(DNSEVENT_SUBSAMPLING_MAP_DEFAULT_KEY);
        denom = (search_default == subsampling_map.end()) ? 0 : search_default->second;
    }
    return denom;
}

int resolv_cache_get_resolver_stats(unsigned netid, res_params* params, res_stats stats[MAXNS]) {
    std::lock_guard guard(cache_mutex);
    resolv_cache_info* info = find_cache_info_locked(netid);
+73 −1
Original line number Diff line number Diff line
@@ -14,7 +14,7 @@
 * limitations under the License.
 */

#include <gtest/gtest.h>
#include <netdb.h>

#include <array>
#include <atomic>
@@ -25,6 +25,9 @@
#include <android-base/logging.h>
#include <android-base/stringprintf.h>
#include <android/multinetwork.h>
#include <cutils/properties.h>
#include <gmock/gmock-matchers.h>
#include <gtest/gtest.h>

#include "dns_responder/dns_responder.h"
#include "netd_resolv/stats.h"
@@ -711,6 +714,75 @@ TEST_F(ResolvCacheTest, GetStats) {
    expectCacheStats("GetStats", TEST_NETID, cacheStats);
}

namespace {

constexpr int EAI_OK = 0;
constexpr char DNS_EVENT_SUBSAMPLING_MAP_FLAG[] =
        "persist.device_config.netd_native.dns_event_subsample_map";

class ScopedCacheCreate {
  public:
    explicit ScopedCacheCreate(unsigned netid, const char* subsampling_map,
                               const char* property = DNS_EVENT_SUBSAMPLING_MAP_FLAG)
        : mStoredNetId(netid), mStoredProperty(property) {
        property_get(property, mStoredMap, "");
        property_set(property, subsampling_map);
        EXPECT_EQ(0, resolv_create_cache_for_net(netid));
    }
    ~ScopedCacheCreate() {
        resolv_delete_cache_for_net(mStoredNetId);
        property_set(mStoredProperty, mStoredMap);
    }

  private:
    unsigned mStoredNetId;
    const char* mStoredProperty;
    char mStoredMap[PROPERTY_VALUE_MAX]{};
};

}  // namespace

TEST_F(ResolvCacheTest, DnsEventSubsampling) {
    // Test defaults, default flag is "default:1 0:100 7:10" if no experiment flag is set
    {
        ScopedCacheCreate scopedCacheCreate(TEST_NETID, "");
        EXPECT_EQ(resolv_cache_get_subsampling_denom(TEST_NETID, EAI_NODATA), 10U);
        EXPECT_EQ(resolv_cache_get_subsampling_denom(TEST_NETID, EAI_OK), 100U);
        EXPECT_EQ(resolv_cache_get_subsampling_denom(TEST_NETID, EAI_BADFLAGS),
                  1U);  // default
        EXPECT_THAT(resolv_cache_dump_subsampling_map(TEST_NETID),
                    testing::UnorderedElementsAreArray({"default:1", "0:100", "7:10"}));
    }
    // Now change the experiment flag to "0:42 default:666"
    {
        ScopedCacheCreate scopedCacheCreate(TEST_NETID, "0:42 default:666");
        EXPECT_EQ(resolv_cache_get_subsampling_denom(TEST_NETID, EAI_OK), 42U);
        EXPECT_EQ(resolv_cache_get_subsampling_denom(TEST_NETID, EAI_NODATA),
                  666U);  // default
        EXPECT_THAT(resolv_cache_dump_subsampling_map(TEST_NETID),
                    testing::UnorderedElementsAreArray({"default:666", "0:42"}));
    }
    // Now change the experiment flag to something illegal
    {
        ScopedCacheCreate scopedCacheCreate(TEST_NETID, "asvaxx");
        // 0(disable log) is the default value if experiment flag is invalid.
        EXPECT_EQ(resolv_cache_get_subsampling_denom(TEST_NETID, EAI_OK), 0U);
        EXPECT_EQ(resolv_cache_get_subsampling_denom(TEST_NETID, EAI_NODATA), 0U);
        EXPECT_TRUE(resolv_cache_dump_subsampling_map(TEST_NETID).empty());
    }
    // Test negative and zero denom
    {
        ScopedCacheCreate scopedCacheCreate(TEST_NETID, "0:-42 default:-666 7:10 10:0");
        // 0(disable log) is the default value if no valid denom is set
        EXPECT_EQ(resolv_cache_get_subsampling_denom(TEST_NETID, EAI_OK), 0U);
        EXPECT_EQ(resolv_cache_get_subsampling_denom(TEST_NETID, EAI_BADFLAGS), 0U);
        EXPECT_EQ(resolv_cache_get_subsampling_denom(TEST_NETID, EAI_NODATA), 10U);
        EXPECT_EQ(resolv_cache_get_subsampling_denom(TEST_NETID, EAI_SOCKTYPE), 0U);
        EXPECT_THAT(resolv_cache_dump_subsampling_map(TEST_NETID),
                    testing::UnorderedElementsAreArray({"7:10", "10:0"}));
    }
}

// TODO: Tests for struct resolv_cache_info, including:
//     - res_params
//         -- resolv_cache_get_resolver_stats()
Loading