Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 7a70a7e0 authored by Chiachang Wang's avatar Chiachang Wang
Browse files

Data stall detection using DNS event

If the DNS resolver on a network times out consecutively, it is a
strong signal that the network is no longer usable.
Reevaluate the network once a data stall is suspected.

Test: 1. runtest frameworks-net
      2. SettingsBackupTest passes
      3. Run on wifi w/o internet capability
Bug: 112653893, 113916551

Change-Id: I74287b174d933f97a91fa1529b1809856ac3b38d
parent a97bdb95
Loading
Loading
Loading
Loading
+3 −0
Original line number Original line Diff line number Diff line
@@ -44,6 +44,8 @@ public final class NetworkEvent implements Parcelable {
    public static final int NETWORK_FIRST_VALIDATION_PORTAL_FOUND = 10;
    public static final int NETWORK_FIRST_VALIDATION_PORTAL_FOUND = 10;
    public static final int NETWORK_REVALIDATION_PORTAL_FOUND     = 11;
    public static final int NETWORK_REVALIDATION_PORTAL_FOUND     = 11;


    public static final int NETWORK_CONSECUTIVE_DNS_TIMEOUT_FOUND = 12;

    @IntDef(value = {
    @IntDef(value = {
            NETWORK_CONNECTED,
            NETWORK_CONNECTED,
            NETWORK_VALIDATED,
            NETWORK_VALIDATED,
@@ -56,6 +58,7 @@ public final class NetworkEvent implements Parcelable {
            NETWORK_REVALIDATION_SUCCESS,
            NETWORK_REVALIDATION_SUCCESS,
            NETWORK_FIRST_VALIDATION_PORTAL_FOUND,
            NETWORK_FIRST_VALIDATION_PORTAL_FOUND,
            NETWORK_REVALIDATION_PORTAL_FOUND,
            NETWORK_REVALIDATION_PORTAL_FOUND,
            NETWORK_CONSECUTIVE_DNS_TIMEOUT_FOUND,
    })
    })
    @Retention(RetentionPolicy.SOURCE)
    @Retention(RetentionPolicy.SOURCE)
    public @interface EventType {}
    public @interface EventType {}
+35 −0
Original line number Original line Diff line number Diff line
@@ -10435,6 +10435,41 @@ public final class Settings {
         */
         */
        public static final String CAPTIVE_PORTAL_USER_AGENT = "captive_portal_user_agent";
        public static final String CAPTIVE_PORTAL_USER_AGENT = "captive_portal_user_agent";
        /**
         * The threshold value for the number of consecutive dns timeout events received to be a
         * signal of data stall. Set the value to 0 or less than 0 to disable. Note that the value
         * should be larger than 0 if the DNS data stall detection is enabled.
         *
         * @hide
         */
        public static final String DATA_STALL_CONSECUTIVE_DNS_TIMEOUT_THRESHOLD =
                "data_stall_consecutive_dns_timeout_threshold";
        /**
         * The minimal time interval in milliseconds for data stall reevaluation.
         *
         * @hide
         */
        public static final String DATA_STALL_MIN_EVALUATE_INTERVAL =
                "data_stall_min_evaluate_interval";
        /**
         * DNS timeouts older than this timeout (in milliseconds) are not considered for detecting
         * a data stall.
         *
         * @hide
         */
        public static final String DATA_STALL_VALID_DNS_TIME_THRESHOLD =
                "data_stall_valid_dns_time_threshold";
        /**
         * Which data stall detection signal to use. Possible values are a union of the powers of 2
         * of DATA_STALL_EVALUATION_TYPE_*.
         *
         * @hide
         */
        public static final String DATA_STALL_EVALUATION_TYPE = "data_stall_evaluation_type";
        /**
        /**
         * Whether network service discovery is enabled.
         * Whether network service discovery is enabled.
         *
         *
+4 −0
Original line number Original line Diff line number Diff line
@@ -185,6 +185,10 @@ public class SettingsBackupTest {
                    Settings.Global.DATA_ROAMING,
                    Settings.Global.DATA_ROAMING,
                    Settings.Global.DATA_STALL_ALARM_AGGRESSIVE_DELAY_IN_MS,
                    Settings.Global.DATA_STALL_ALARM_AGGRESSIVE_DELAY_IN_MS,
                    Settings.Global.DATA_STALL_ALARM_NON_AGGRESSIVE_DELAY_IN_MS,
                    Settings.Global.DATA_STALL_ALARM_NON_AGGRESSIVE_DELAY_IN_MS,
                    Settings.Global.DATA_STALL_CONSECUTIVE_DNS_TIMEOUT_THRESHOLD,
                    Settings.Global.DATA_STALL_EVALUATION_TYPE,
                    Settings.Global.DATA_STALL_MIN_EVALUATE_INTERVAL,
                    Settings.Global.DATA_STALL_VALID_DNS_TIME_THRESHOLD,
                    Settings.Global.DEBUG_APP,
                    Settings.Global.DEBUG_APP,
                    Settings.Global.DEBUG_VIEW_ATTRIBUTES,
                    Settings.Global.DEBUG_VIEW_ATTRIBUTES,
                    Settings.Global.DEFAULT_DNS_SERVER,
                    Settings.Global.DEFAULT_DNS_SERVER,
+18 −0
Original line number Original line Diff line number Diff line
@@ -1659,6 +1659,24 @@ public class ConnectivityService extends IConnectivityManager.Stub
                loge("Error parsing ip address in validation event");
                loge("Error parsing ip address in validation event");
            }
            }
        }
        }

        /**
         * Forwards the return code of a DNS event to the NetworkMonitor of the network it
         * happened on, provided that network satisfies the default request.
         *
         * Only {@code netId} and {@code returnCode} are used; the remaining arguments are ignored.
         */
        @Override
        public void onDnsEvent(int netId, int eventType, int returnCode, String hostname,
                String[] ipAddresses, int ipAddressesCount, long timestamp, int uid) {
            final NetworkAgentInfo agentInfo = getNetworkAgentInfoForNetId(netId);
            // Nothing to report if the netId is unknown or the network does not satisfy the
            // default request.
            if (agentInfo == null || !agentInfo.satisfies(mDefaultRequest)) {
                return;
            }

            // Why this lives here rather than in NetworkMonitor: netd events only allow
            // registrants from the system, and each NetworkMonitor thread runs under the caller
            // thread of registerNetworkAgent. A NetworkMonitor for certain networks (e.g.
            // cellular) is therefore not allowed to register a netd event callback itself, so the
            // event is received here and passed on.
            // TODO: Move the Dns Event to NetworkMonitor. Use Binder.clearCallingIdentity() in
            // registerNetworkAgent to have NetworkMonitor created with system process as design
            // expectation. Also, NetdEventListenerService only allow one callback from each
            // caller type. Need to re-factor NetdEventListenerService to allow multiple
            // NetworkMonitor registrants.
            agentInfo.networkMonitor.sendMessage(
                    NetworkMonitor.EVENT_DNS_NOTIFICATION, returnCode);
        }
    };
    };


    @VisibleForTesting
    @VisibleForTesting
+187 −1
Original line number Original line Diff line number Diff line
@@ -72,6 +72,7 @@ import android.util.Log;
import com.android.internal.annotations.VisibleForTesting;
import com.android.internal.annotations.VisibleForTesting;
import com.android.internal.util.ArrayUtils;
import com.android.internal.util.ArrayUtils;
import com.android.internal.util.Protocol;
import com.android.internal.util.Protocol;
import com.android.internal.util.RingBufferIndices;
import com.android.internal.util.State;
import com.android.internal.util.State;
import com.android.internal.util.StateMachine;
import com.android.internal.util.StateMachine;
import com.android.server.connectivity.DnsManager.PrivateDnsConfig;
import com.android.server.connectivity.DnsManager.PrivateDnsConfig;
@@ -99,7 +100,7 @@ public class NetworkMonitor extends StateMachine {
    private static final String TAG = NetworkMonitor.class.getSimpleName();
    private static final String TAG = NetworkMonitor.class.getSimpleName();
    private static final boolean DBG  = true;
    private static final boolean DBG  = true;
    private static final boolean VDBG = false;
    private static final boolean VDBG = false;

    private static final boolean VDBG_STALL = Log.isLoggable(TAG, Log.DEBUG);
    // Default configuration values for captive portal detection probes.
    // Default configuration values for captive portal detection probes.
    // TODO: append a random length parameter to the default HTTPS url.
    // TODO: append a random length parameter to the default HTTPS url.
    // TODO: randomize browser version ids in the default User-Agent String.
    // TODO: randomize browser version ids in the default User-Agent String.
@@ -116,6 +117,15 @@ public class NetworkMonitor extends StateMachine {
    private static final int SOCKET_TIMEOUT_MS = 10000;
    private static final int SOCKET_TIMEOUT_MS = 10000;
    private static final int PROBE_TIMEOUT_MS  = 3000;
    private static final int PROBE_TIMEOUT_MS  = 3000;


    // Default configuration values for data stall detection.
    private static final int DEFAULT_CONSECUTIVE_DNS_TIMEOUT_THRESHOLD = 5;
    private static final int DEFAULT_DATA_STALL_MIN_EVALUATE_TIME_MS = 60 * 1000;
    private static final int DEFAULT_DATA_STALL_VALID_DNS_TIME_THRESHOLD_MS = 30 * 60 * 1000;

    private static final int DATA_STALL_EVALUATION_TYPE_DNS = 1;
    private static final int DEFAULT_DATA_STALL_EVALUATION_TYPES =
            (1 << DATA_STALL_EVALUATION_TYPE_DNS);

    static enum EvaluationResult {
    static enum EvaluationResult {
        VALIDATED(true),
        VALIDATED(true),
        CAPTIVE_PORTAL(false);
        CAPTIVE_PORTAL(false);
@@ -233,6 +243,12 @@ public class NetworkMonitor extends StateMachine {
     */
     */
    public static final int CMD_PROBE_COMPLETE = BASE + 16;
    public static final int CMD_PROBE_COMPLETE = BASE + 16;


    /**
     * Sent by ConnectivityService to notify NetworkMonitor of a DNS query response event.
     * arg1 = returnCode from onDnsEvent, i.e. the response code of the DNS query.
     */
    public static final int EVENT_DNS_NOTIFICATION = BASE + 17;

    // Start mReevaluateDelayMs at this value and double.
    // Start mReevaluateDelayMs at this value and double.
    private static final int INITIAL_REEVALUATE_DELAY_MS = 1000;
    private static final int INITIAL_REEVALUATE_DELAY_MS = 1000;
    private static final int MAX_REEVALUATE_DELAY_MS = 10*60*1000;
    private static final int MAX_REEVALUATE_DELAY_MS = 10*60*1000;
@@ -314,6 +330,12 @@ public class NetworkMonitor extends StateMachine {
    private int mReevaluateDelayMs = INITIAL_REEVALUATE_DELAY_MS;
    private int mReevaluateDelayMs = INITIAL_REEVALUATE_DELAY_MS;
    private int mEvaluateAttempts = 0;
    private int mEvaluateAttempts = 0;
    private volatile int mProbeToken = 0;
    private volatile int mProbeToken = 0;
    private final int mConsecutiveDnsTimeoutThreshold;
    private final int mDataStallMinEvaluateTime;
    private final int mDataStallValidDnsTimeThreshold;
    private final int mDataStallEvaluationType;
    private final DnsStallDetector mDnsStallDetector;
    private long mLastProbeTime;


    public NetworkMonitor(Context context, Handler handler, NetworkAgentInfo networkAgentInfo,
    public NetworkMonitor(Context context, Handler handler, NetworkAgentInfo networkAgentInfo,
            NetworkRequest defaultRequest) {
            NetworkRequest defaultRequest) {
@@ -359,6 +381,12 @@ public class NetworkMonitor extends StateMachine {
        mCaptivePortalFallbackUrls = makeCaptivePortalFallbackUrls();
        mCaptivePortalFallbackUrls = makeCaptivePortalFallbackUrls();
        mCaptivePortalFallbackSpecs = makeCaptivePortalFallbackProbeSpecs();
        mCaptivePortalFallbackSpecs = makeCaptivePortalFallbackProbeSpecs();
        mRandom = deps.getRandom();
        mRandom = deps.getRandom();
        // TODO: Evaluate to move data stall configuration to a specific class.
        mConsecutiveDnsTimeoutThreshold = getConsecutiveDnsTimeoutThreshold();
        mDnsStallDetector = new DnsStallDetector(mConsecutiveDnsTimeoutThreshold);
        mDataStallMinEvaluateTime = getDataStallMinEvaluateTime();
        mDataStallValidDnsTimeThreshold = getDataStallValidDnsTimeThreshold();
        mDataStallEvaluationType = getDataStallEvalutionType();


        start();
        start();
    }
    }
@@ -507,6 +535,9 @@ public class NetworkMonitor extends StateMachine {
                    sendMessage(CMD_EVALUATE_PRIVATE_DNS);
                    sendMessage(CMD_EVALUATE_PRIVATE_DNS);
                    break;
                    break;
                }
                }
                case EVENT_DNS_NOTIFICATION:
                    mDnsStallDetector.accumulateConsecutiveDnsTimeoutCount(message.arg1);
                    break;
                default:
                default:
                    break;
                    break;
            }
            }
@@ -537,6 +568,13 @@ public class NetworkMonitor extends StateMachine {
                case CMD_EVALUATE_PRIVATE_DNS:
                case CMD_EVALUATE_PRIVATE_DNS:
                    transitionTo(mEvaluatingPrivateDnsState);
                    transitionTo(mEvaluatingPrivateDnsState);
                    break;
                    break;
                case EVENT_DNS_NOTIFICATION:
                    mDnsStallDetector.accumulateConsecutiveDnsTimeoutCount(message.arg1);
                    if (isDataStall()) {
                        validationLog("Suspecting data stall, reevaluate");
                        transitionTo(mEvaluatingState);
                    }
                    break;
                default:
                default:
                    return NOT_HANDLED;
                    return NOT_HANDLED;
            }
            }
@@ -856,6 +894,7 @@ public class NetworkMonitor extends StateMachine {


                    final CaptivePortalProbeResult probeResult =
                    final CaptivePortalProbeResult probeResult =
                            (CaptivePortalProbeResult) message.obj;
                            (CaptivePortalProbeResult) message.obj;
                    mLastProbeTime = SystemClock.elapsedRealtime();
                    if (probeResult.isSuccessful()) {
                    if (probeResult.isSuccessful()) {
                        // Transit EvaluatingPrivateDnsState to get to Validated
                        // Transit EvaluatingPrivateDnsState to get to Validated
                        // state (even if no Private DNS validation required).
                        // state (even if no Private DNS validation required).
@@ -883,6 +922,7 @@ public class NetworkMonitor extends StateMachine {
                    // Leave the event to EvaluatingState. Defer this message will result in reset
                    // Leave the event to EvaluatingState. Defer this message will result in reset
                    // of mReevaluateDelayMs and mEvaluateAttempts.
                    // of mReevaluateDelayMs and mEvaluateAttempts.
                case CMD_NETWORK_DISCONNECTED:
                case CMD_NETWORK_DISCONNECTED:
                case EVENT_DNS_NOTIFICATION:
                    return NOT_HANDLED;
                    return NOT_HANDLED;
                default:
                default:
                    // TODO: Some events may able to handle in this state, instead of deferring to
                    // TODO: Some events may able to handle in this state, instead of deferring to
@@ -947,6 +987,29 @@ public class NetworkMonitor extends StateMachine {
                Settings.Global.CAPTIVE_PORTAL_HTTPS_URL, DEFAULT_HTTPS_URL);
                Settings.Global.CAPTIVE_PORTAL_HTTPS_URL, DEFAULT_HTTPS_URL);
    }
    }


    /**
     * Reads the consecutive-DNS-timeout threshold for data stall detection from
     * {@link Settings.Global#DATA_STALL_CONSECUTIVE_DNS_TIMEOUT_THRESHOLD}, supplying
     * {@link #DEFAULT_CONSECUTIVE_DNS_TIMEOUT_THRESHOLD} as the default value.
     */
    private int getConsecutiveDnsTimeoutThreshold() {
        return mDependencies.getSetting(mContext,
                Settings.Global.DATA_STALL_CONSECUTIVE_DNS_TIMEOUT_THRESHOLD,
                DEFAULT_CONSECUTIVE_DNS_TIMEOUT_THRESHOLD);
    }

    /**
     * Reads the minimal interval (in milliseconds) between data stall reevaluations from
     * {@link Settings.Global#DATA_STALL_MIN_EVALUATE_INTERVAL}, supplying
     * {@link #DEFAULT_DATA_STALL_MIN_EVALUATE_TIME_MS} as the default value.
     */
    private int getDataStallMinEvaluateTime() {
        return mDependencies.getSetting(mContext,
                Settings.Global.DATA_STALL_MIN_EVALUATE_INTERVAL,
                DEFAULT_DATA_STALL_MIN_EVALUATE_TIME_MS);
    }

    /**
     * Reads the time window (in milliseconds) within which DNS timeouts still count towards
     * data stall detection from {@link Settings.Global#DATA_STALL_VALID_DNS_TIME_THRESHOLD},
     * supplying {@link #DEFAULT_DATA_STALL_VALID_DNS_TIME_THRESHOLD_MS} as the default value.
     */
    private int getDataStallValidDnsTimeThreshold() {
        return mDependencies.getSetting(mContext,
                Settings.Global.DATA_STALL_VALID_DNS_TIME_THRESHOLD,
                DEFAULT_DATA_STALL_VALID_DNS_TIME_THRESHOLD_MS);
    }

    /**
     * Reads the set of enabled data stall detection signals from
     * {@link Settings.Global#DATA_STALL_EVALUATION_TYPE}, supplying
     * {@link #DEFAULT_DATA_STALL_EVALUATION_TYPES} as the default value.
     *
     * NOTE(review): the method name is misspelled ("Evalution"); renaming it requires updating
     * the call in the constructor as well.
     */
    private int getDataStallEvalutionType() {
        return mDependencies.getSetting(mContext, Settings.Global.DATA_STALL_EVALUATION_TYPE,
                DEFAULT_DATA_STALL_EVALUATION_TYPES);
    }

    // Static for direct access by ConnectivityService
    // Static for direct access by ConnectivityService
    public static String getCaptivePortalServerHttpUrl(Context context) {
    public static String getCaptivePortalServerHttpUrl(Context context) {
        return getCaptivePortalServerHttpUrl(Dependencies.DEFAULT, context);
        return getCaptivePortalServerHttpUrl(Dependencies.DEFAULT, context);
@@ -1462,4 +1525,127 @@ public class NetworkMonitor extends StateMachine {


        public static final Dependencies DEFAULT = new Dependencies();
        public static final Dependencies DEFAULT = new Dependencies();
    }
    }

    /**
     * Records recent DNS results and counts how many of the most recent ones in a row were
     * timeouts, so that a run of timeouts can be treated as a data stall signal.
     *
     * No locking is performed: every access is expected to happen on the state machine's thread.
     * Thread safety has to be revisited if this is ever accessed from anywhere else.
     */
    @VisibleForTesting
    protected class DnsStallDetector {
        private static final int DEFAULT_DNS_LOG_SIZE = 50;
        private int mConsecutiveTimeoutCount = 0;
        private int mSize;
        final DnsResult[] mDnsEvents;
        final RingBufferIndices mResultIndices;

        /**
         * @param size requested number of DNS results to retain; at least
         *             {@link #DEFAULT_DNS_LOG_SIZE} entries are always kept.
         */
        DnsStallDetector(int size) {
            mSize = Math.max(size, DEFAULT_DNS_LOG_SIZE);
            mResultIndices = new RingBufferIndices(mSize);
            mDnsEvents = new DnsResult[mSize];
        }

        /**
         * Records one DNS result and updates the consecutive timeout counter.
         *
         * @param code the DNS return code of the event.
         */
        @VisibleForTesting
        protected void accumulateConsecutiveDnsTimeoutCount(int code) {
            final DnsResult event = new DnsResult(code);
            mDnsEvents[mResultIndices.add()] = event;
            // A non-timeout result only resets the counter. The event itself stays in
            // mDnsEvents so that there are logs available for simulation and analysis.
            mConsecutiveTimeoutCount = event.isTimeout() ? mConsecutiveTimeoutCount + 1 : 0;
        }

        /**
         * Decides whether the recorded DNS results suggest a data stall.
         *
         * @param timeoutCountThreshold number of consecutive timeouts needed; must be > 0.
         * @param validTime maximum age in milliseconds of the oldest timeout that is counted.
         * @return true if at least {@code timeoutCountThreshold} consecutive timeouts were seen
         *         and the oldest of the counted ones is less than {@code validTime} ms old.
         */
        private boolean isDataStallSuspected(int timeoutCountThreshold, int validTime) {
            if (timeoutCountThreshold <= 0) {
                Log.wtf(TAG, "Timeout count threshold should be larger than 0.");
                return false;
            }

            // Not enough consecutive timeouts yet.
            if (mConsecutiveTimeoutCount < timeoutCountThreshold) {
                return false;
            }

            // Locate the earliest of the last timeoutCountThreshold recorded events.
            final int oldestCountedPosition = mResultIndices.size() - timeoutCountThreshold;
            final DnsResult oldestCounted =
                    mDnsEvents[mResultIndices.indexOf(oldestCountedPosition)];

            // Timeouts that happened a long time ago are meaningless for data stall
            // evaluation, so the earliest counted timeout must be recent enough.
            final long ageMs = SystemClock.elapsedRealtime() - oldestCounted.mTimeStamp;
            return ageMs < validTime;
        }

        /** Returns the current number of consecutive DNS timeouts. */
        int getConsecutiveTimeoutCount() {
            return mConsecutiveTimeoutCount;
        }
    }

    /**
     * A single DNS result: its return code together with the
     * {@link SystemClock#elapsedRealtime()} value captured when the object was created.
     */
    private static class DnsResult {
        // TODO: Need to move the DNS return code definition to a specific class once unify DNS
        // response code is done.
        private static final int RETURN_CODE_DNS_TIMEOUT = 255;

        private final int mReturnCode;
        private final long mTimeStamp;

        /**
         * @param returnCode the DNS return code to record.
         */
        DnsResult(int returnCode) {
            mReturnCode = returnCode;
            mTimeStamp = SystemClock.elapsedRealtime();
        }

        /** Returns true if the recorded return code is {@link #RETURN_CODE_DNS_TIMEOUT}. */
        private boolean isTimeout() {
            final boolean timedOut = (mReturnCode == RETURN_CODE_DNS_TIMEOUT);
            return timedOut;
        }
    }


    /** Returns the {@link DnsStallDetector} owned by this NetworkMonitor; exposed for tests. */
    @VisibleForTesting
    protected DnsStallDetector getDnsStallDetector() {
        return mDnsStallDetector;
    }

    /**
     * Tells whether the given data stall evaluation type is switched on.
     *
     * @param type one of the DATA_STALL_EVALUATION_TYPE_* values; it is used as a bit position
     *             within {@code mDataStallEvaluationType}.
     * @return true if the bit for {@code type} is set in {@code mDataStallEvaluationType}.
     */
    private boolean dataStallEvaluateTypeEnabled(int type) {
        final int typeMask = 1 << type;
        return (mDataStallEvaluationType & typeMask) != 0;
    }

    /**
     * Returns {@code mLastProbeTime}; exposed for tests.
     * NOTE(review): the value appears to be a SystemClock.elapsedRealtime() timestamp recorded
     * when a probe result is handled — confirm against the probing state's message handler.
     */
    @VisibleForTesting
    protected long getLastProbeTime() {
        return mLastProbeTime;
    }

    /**
     * Evaluates the enabled data stall signals and reports whether a data stall is suspected.
     *
     * On a metered network the check is skipped (returns false) while less than
     * {@code mDataStallMinEvaluateTime} ms have passed since the last probe. When the DNS signal
     * fires, a {@link NetworkEvent#NETWORK_CONSECUTIVE_DNS_TIMEOUT_FOUND} event is logged.
     *
     * @return true if a data stall is suspected.
     */
    @VisibleForTesting
    protected boolean isDataStall() {
        // Reevaluation generates traffic, so on metered networks enforce a minimal interval
        // between reevaluations to limit the possible traffic cost.
        if (mNetworkAgentInfo.networkCapabilities.isMetered()) {
            final long sinceLastProbeMs = SystemClock.elapsedRealtime() - getLastProbeTime();
            if (sinceLastProbeMs < mDataStallMinEvaluateTime) {
                return false;
            }
        }

        // DNS signal. A data stall is suspected if both:
        // 1. The number of consecutive DNS query timeouts >= mConsecutiveDnsTimeoutThreshold.
        // 2. Those consecutive DNS queries happened within the last
        //    mDataStallValidDnsTimeThreshold ms.
        final boolean stallSuspected =
                dataStallEvaluateTypeEnabled(DATA_STALL_EVALUATION_TYPE_DNS)
                && mDnsStallDetector.isDataStallSuspected(mConsecutiveDnsTimeoutThreshold,
                        mDataStallValidDnsTimeThreshold);
        if (stallSuspected) {
            logNetworkEvent(NetworkEvent.NETWORK_CONSECUTIVE_DNS_TIMEOUT_FOUND);
        }

        if (VDBG_STALL) {
            log("isDataStall: result=" + stallSuspected + ", consecutive dns timeout count="
                    + mDnsStallDetector.getConsecutiveTimeoutCount());
        }

        return stallSuspected;
    }
}
}
Loading