Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 550247e3 authored by Steven Moreland's avatar Steven Moreland
Browse files

RPC Binder: socket ENOMEM handling

(urgent to merge)

RPC Binder uses non-blocking sockets. This is so that when a
client/server pair are both writing to the same socket and the socket
would block, they can back off to polling, allowing them to read
commands from the other side and recover.

When writing to one of these sockets, the vhost-vsock implementation
we are using returns ENOMEM in some cases.

In order to resolve this, we add an exponential packet size backoff
and time backoff.

There is the suggestion to use blocking sockets instead, since
it would engage direct reclaim that could potentially block. However,
I've not gone with this approach because:
- all RPC binder sockets are marked non-blocking globally. Setting
  the per-syscall flags on each operation all throughout the binder
  code base is a more invasive option. Specifically, it is hard
  to guarantee that we never hang indefinitely.
- the timeout needs to be configured with the global SO_SNDTIMEO
  socket option and can't be specified on sendmsg/recvmsg
  directly. This means retry would need to be added on every
  operation (if they now block, such as accept), or we would need
  2 syscalls for each of the operations here.

RPC_FLAKE_PRONE mode now simulates ENOMEM errors as well.

Future considerations:
- separate build of binderRpcTest with flake mode

Bug: 422574189
Test: binderRpcTest passes with FLAKE_MODE
Flag: EXEMPT bug fix

Change-Id: I5f2c99d0cb6760cca53ac4839d47daa9304a955a
parent 9ff64f6a
Loading
Loading
Loading
Loading
+9 −1
Original line number Diff line number Diff line
@@ -45,7 +45,7 @@ using android::binder::borrowed_fd;
using android::binder::unique_fd;

#if RPC_FLAKE_PRONE
void rpcMaybeWaitToFlake() {
static unsigned rpcFlakeUnsigned() {
    [[clang::no_destroy]] static std::random_device r;
    [[clang::no_destroy]] static RpcMutex m;
    unsigned num;
@@ -53,6 +53,14 @@ void rpcMaybeWaitToFlake() {
        RpcMutexLockGuard lock(m);
        num = r();
    }
    return num;
}
// Returns true on a random subset of calls so tests can exercise
// error-injection paths. Only compiled when RPC_FLAKE_PRONE is set.
bool rpcMaybeFlake() {
    // Flake on roughly 10% of calls.
    // More aggressive alternative (~75%): rpcFlakeUnsigned() % 4 != 0
    const unsigned roll = rpcFlakeUnsigned();
    return 0 == roll % 10;
}
// Occasionally sleeps for a short random duration (under 1ms) to shake
// out timing-dependent bugs. Only compiled when RPC_FLAKE_PRONE is set.
void rpcMaybeWaitToFlake() {
    const unsigned roll = rpcFlakeUnsigned();
    if (roll % 10 != 0) return;
    usleep(roll % 1000);
}
#endif
+8 −1
Original line number Diff line number Diff line
@@ -38,7 +38,9 @@ struct RpcWireHeader;
 * a specific subset of logs to debug, this could be broken up like
 * IPCThreadState's.
 */
// DO NOT ENABLE IN PRODUCTION
#define SHOULD_LOG_RPC_DETAIL false
// DO NOT ENABLE IN PRODUCTION

#if SHOULD_LOG_RPC_DETAIL
#define LOG_RPC_DETAIL(...) ALOGI(__VA_ARGS__)
@@ -46,12 +48,17 @@ struct RpcWireHeader;
#define LOG_RPC_DETAIL(...) ALOGV(__VA_ARGS__) // for type checking
#endif

// DO NOT ENABLE IN PRODUCTION
#define RPC_FLAKE_PRONE false
// DO NOT ENABLE IN PRODUCTION

#if RPC_FLAKE_PRONE
void rpcMaybeWaitToFlake();
LIBBINDER_INTERNAL_EXPORTED bool rpcMaybeFlake();
LIBBINDER_INTERNAL_EXPORTED void rpcMaybeWaitToFlake();
#define MAYBE_TRUE_IN_FLAKE_MODE rpcMaybeFlake()
#define MAYBE_WAIT_IN_FLAKE_MODE rpcMaybeWaitToFlake()
#else
#define MAYBE_TRUE_IN_FLAKE_MODE false
#define MAYBE_WAIT_IN_FLAKE_MODE do {} while (false)
#endif

+92 −24
Original line number Diff line number Diff line
@@ -21,6 +21,10 @@
#include "FdTrigger.h"
#include "RpcState.h"

// For just this file:
// #undef LOG_RPC_DETAIL
// #define LOG_RPC_DETAIL ALOGE

namespace android {

template <typename SendOrReceive>
@@ -56,10 +60,24 @@ status_t interruptableReadOrWrite(
        return OK;
    }

    // size to break up message
    constexpr size_t kChunkMax = 65536;
    const size_t kChunkMin = getpagesize(); // typical allocated granularity for sockets
    size_t chunkSize = kChunkMax;

    // b/419364025 - vhost-vsock, and perhaps other socket implementations, may return
    // ENOMEM from sockets, whether blocking or non-blocking
    // how long we are waiting on repeated enomems for memory to be available
    constexpr size_t kEnomemWaitStartUs = 10'000;
    constexpr size_t kEnomemWaitMaxUs = 1'000'000;       // don't risk ANR
    constexpr size_t kEnomemWaitTotalMaxUs = 30'000'000; // ANR at 30s anyway, so avoid hang
    size_t enomemWaitUs = 0;
    size_t enomemTotalUs = 0;

    bool havePolled = false;
    while (true) {
        ssize_t processSize = -1;
        bool canSendFullTransaction = false;
        bool skipPollingAndContinue = false; // set when we should retry immediately

        // This block dynamically adjusts packet sizes down to work around a
        // limitation in the vsock driver where large packets are sometimes
@@ -67,15 +85,15 @@ status_t interruptableReadOrWrite(
        // TODO: only apply this workaround on vsock ???
        // TODO: fix vsock
        {
            size_t limit = 65536;
            size_t chunkRemaining = chunkSize;
            int i = 0;
            for (i = 0; i < niovs; i++) {
                if (iovs[i].iov_len >= limit) {
                if (iovs[i].iov_len >= chunkRemaining) {
                    break;
                }
                limit -= iovs[i].iov_len;
                chunkRemaining -= iovs[i].iov_len;
            }
            canSendFullTransaction = i == niovs;
            bool canSendFullTransaction = i == niovs;

            int old_niovs = niovs;
            size_t old_len = 0xDEADBEEF;
@@ -84,14 +102,20 @@ status_t interruptableReadOrWrite(
                // pretend like we have fewer iovecs
                niovs = i + 1; // to restore (A)
                old_len = iovs[i].iov_len;
                // only send up to remaining limit from this iovec
                iovs[i].iov_len = limit; // to restore (B)
                LOG_ALWAYS_FATAL_IF(limit == 0,
                                    "limit should not be zero - see EMPTY IOVEC ISSUE above");
                // only send up to remaining chunkRemaining from this iovec
                iovs[i].iov_len = chunkRemaining; // to restore (B)
                LOG_ALWAYS_FATAL_IF(chunkRemaining == 0,
                                    "chunkRemaining never zero - see EMPTY IOVEC ISSUE above");
            }

            // MAIN ACTION
            if (MAYBE_TRUE_IN_FLAKE_MODE) {
                LOG_RPC_DETAIL("Injecting ENOMEM.");
                processSize = -1;
                errno = ENOMEM;
            } else {
                processSize = sendOrReceiveFun(iovs, niovs);
            }
            // MAIN ACTION

            if (!canSendFullTransaction) {
@@ -101,20 +125,67 @@ status_t interruptableReadOrWrite(
                niovs = old_niovs;         // (A) - restored
                iovs[i].iov_len = old_len; // (B) - restored
            }

            // altPoll may introduce long waiting since it assumes if it cannot write
            // data, that it needs to wait to send more to give time for the producer
            // consumer problem to be solved - otherwise it will busy loop. However,
            // for this workaround, we are breaking up the transaction intentionally,
            // not because the transaction won't fit, but to avoid a bug in the kernel
            // for how it combines messages. So, when we artificially simulate a
            // limited send, don't poll and just keep on sending data.
            skipPollingAndContinue = !canSendFullTransaction;
        }

        // HANDLE RESULT OF SEND OR RECEIVE
        if (processSize < 0) {
            int savedErrno = errno;

            if (savedErrno == ENOMEM) {
                LOG_RPC_DETAIL("RpcTransport %s(): %s", funName, strerror(savedErrno));

                // Since this is the limit only for this call to send this packet
                // we don't ever restore this. Assume it will be hard to get more
                // memory if we're already having difficulty sending this out.
                chunkSize = std::max(chunkSize / 2, kChunkMin);
                LOG_RPC_DETAIL("Chunk size is now %zu due to ENOMEM.", chunkSize);

                // When we've gotten down to the minimum send size, add a timer
                // to give time for more memory to be freed up. This means even
                // a single page is not available, so we have to wait.
                if (chunkSize <= kChunkMin) {
                    if (enomemWaitUs == 0) enomemWaitUs = kEnomemWaitStartUs;
                    enomemWaitUs = std::min(enomemWaitUs * 2, kEnomemWaitMaxUs);
                    enomemTotalUs += enomemWaitUs;

                    if (enomemTotalUs > kEnomemWaitTotalMaxUs) {
                        // by this time WatchDog should be kicking in
                        return -ENOMEM;
                    }

                    LOG_RPC_DETAIL("Sleeping %zuus due to ENOMEM.", enomemWaitUs);
                    usleep(enomemWaitUs);
                }

                // Need to survey socket code to see if polling in this situation is
                // guaranteed to be non-blocking.
                // NOTE: if the other side needs to deallocate memory, and that is the
                // only deallocatable memory in the entire system, but we need altPoll
                // to drain commands to unstick it so it can do that, then this could
                // cause a deadlock, but this is not realistic on Android.
                skipPollingAndContinue = true;
            } else if (havePolled || (savedErrno != EAGAIN && savedErrno != EWOULDBLOCK)) {
                // Still return the error on later passes, since it would expose
                // a problem with polling
            if (havePolled || (savedErrno != EAGAIN && savedErrno != EWOULDBLOCK)) {
                LOG_RPC_DETAIL("RpcTransport %s(): %s", funName, strerror(savedErrno));
                return -savedErrno;
            }
        } else if (processSize == 0) {
            return DEAD_OBJECT;
        } else {
            // success - reset error exponential backoffs
            enomemWaitUs = 0;
            enomemTotalUs = 0;

            while (processSize > 0 && niovs > 0) {
                auto& iov = iovs[0];
                if (static_cast<size_t>(processSize) < iov.iov_len) {
@@ -138,20 +209,17 @@ status_t interruptableReadOrWrite(
            }
        }

        // altPoll may introduce long waiting since it assumes if it cannot write
        // data, that it needs to wait to send more to give time for the producer
        // consumer problem to be solved - otherwise it will busy loop. However,
        // for this workaround, we are breaking up the transaction intentionally,
        // not because the transaction won't fit, but to avoid a bug in the kernel
        // for how it combines messages. So, when we artificially simulate a
        // limited send, don't poll and just keep on sending data.
        if (!canSendFullTransaction) continue;

        if (altPoll) {
            if (status_t status = (*altPoll)(); status != OK) return status;
        // METHOD OF POLLING
        if (skipPollingAndContinue) {
            if (fdTrigger->isTriggered()) {
                return DEAD_OBJECT;
            }
            // continue;
        } else if (altPoll) {
            if (status_t status = (*altPoll)(); status != OK) return status;
            if (fdTrigger->isTriggered()) { // altPoll may not check this
                return DEAD_OBJECT;
            }
        } else {
            if (status_t status = fdTrigger->triggerablePoll(socket, event); status != OK)
                return status;