Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 580312bc authored by Akilesh Kailash's avatar Akilesh Kailash
Browse files

libsnapshot:snapuserd: read-ahead COW copy ops



Introduce read-ahead mechanism for COW copy ops.

1: Read-ahead thread will read from base device
   and store the data in scratch space along with the metadata.
2: Worker threads during merge will retrieve the data
   from read-ahead cache
3: Fixed set of blocks are read during each cycle by the read-ahead
   thread.
4: When the last block in the region is merged, read-ahead thread
   makes forward progress.

Scratch space is set to 2MB and is only used for COW copy operations.
We can extend this to Replace Ops based on performance evaluation.

Performance:

As mentioned in bug 181883791, Incremental OTA of size 55M with
235K copy operations where every block is moved by 4k:

Without read-ahead: 40 Minutes for merge completion
With read-ahead:  21 Minutes for merge completion

Bug: 183863613

Test: 1: Full OTA - no regression observed.
2: Incremental OTA - with older COW format. Daemon will just skip
   the read-ahead feature for older COW format.
3: Incremental OTA - with new COW format.
4: Reboot and crash kernel multiple times while incremental OTA is in-flight.
   Verify post reboot, read-ahead thread re-constructs the data from scratch
   space.
5: No regression observed in RSS-Anon memory usage when merge in-flight.

Signed-off-by: default avatarAkilesh Kailash <akailash@google.com>
Change-Id: Ic565bfbee3e9fcfc94af694596dbf44c0877639f
parent d967d01f
Loading
Loading
Loading
Loading
+2 −1
Original line number Diff line number Diff line
@@ -421,6 +421,7 @@ cc_defaults {
        "snapuserd.cpp",
        "snapuserd_daemon.cpp",
	"snapuserd_worker.cpp",
	"snapuserd_readahead.cpp",
    ],

    cflags: [
+18 −7
Original line number Diff line number Diff line
@@ -369,13 +369,7 @@ void CowReader::InitializeMerge() {
    //                        Replace-op-4, Zero-op-9, Replace-op-5 }
    //==============================================================

    for (uint64_t i = 0; i < ops_->size(); i++) {
        auto& current_op = ops_->data()[i];
        if (current_op.type != kCowCopyOp) {
            break;
        }
        num_copy_ops += 1;
    }
    num_copy_ops = FindNumCopyops();

    std::sort(ops_.get()->begin() + num_copy_ops, ops_.get()->end(),
              [](CowOperation& op1, CowOperation& op2) -> bool {
@@ -386,6 +380,23 @@ void CowReader::InitializeMerge() {
        CHECK(ops_->size() >= header_.num_merge_ops);
        ops_->erase(ops_.get()->begin(), ops_.get()->begin() + header_.num_merge_ops);
    }

    num_copy_ops = FindNumCopyops();
    set_copy_ops(num_copy_ops);
}

uint64_t CowReader::FindNumCopyops() {
    uint64_t num_copy_ops = 0;

    for (uint64_t i = 0; i < ops_->size(); i++) {
        auto& current_op = ops_->data()[i];
        if (current_op.type != kCowCopyOp) {
            break;
        }
        num_copy_ops += 1;
    }

    return num_copy_ops;
}

bool CowReader::GetHeader(CowHeader* header) {
+19 −0
Original line number Diff line number Diff line
@@ -35,6 +35,8 @@ static constexpr uint32_t BLOCK_SHIFT = (__builtin_ffs(BLOCK_SZ) - 1);
//      +-----------------------+
//      |     Header (fixed)    |
//      +-----------------------+
//      |     Scratch space     |
//      +-----------------------+
//      | Operation  (variable) |
//      | Data       (variable) |
//      +-----------------------+
@@ -152,11 +154,28 @@ static constexpr uint8_t kCowCompressNone = 0;
static constexpr uint8_t kCowCompressGz = 1;
static constexpr uint8_t kCowCompressBrotli = 2;

// State machine for the read-ahead scratch space, persisted in BufferState
// so it survives a reboot/crash:
//   NotStarted -> InProgress (read-ahead IO in flight; scratch contents
//                             unknown after a crash)
//              -> Done       (scratch data fully flushed; safe checkpoint
//                             from which the data can be re-constructed)
static constexpr uint8_t kCowReadAheadNotStarted = 0;
static constexpr uint8_t kCowReadAheadInProgress = 1;
static constexpr uint8_t kCowReadAheadDone = 2;

// Trailer of the COW image; packed so the on-disk layout is exact.
struct CowFooter {
    CowFooterOperation op;
    CowFooterData data;
} __attribute__((packed));

// One metadata entry per cached block in the scratch space (16 bytes).
struct ScratchMetadata {
    // Block of data in the image that operation modifies
    // and read-ahead thread stores the modified data
    // in the scratch space
    uint64_t new_block;
    // Offset within the file to read the data
    uint64_t file_offset;
} __attribute__((packed));

// Persistent read-ahead state (one of the kCowReadAhead* values above),
// stored at the start of the scratch space.
struct BufferState {
    uint8_t read_ahead_state;
} __attribute__((packed));

// 2MB Scratch space used for read-ahead
static constexpr uint64_t BUFFER_REGION_DEFAULT_SIZE = (1ULL << 21);

+6 −2
Original line number Diff line number Diff line
@@ -141,18 +141,21 @@ class CowReader : public ICowReader {

    // Reads raw bytes from the COW image at |offset|; |read| receives the
    // number of bytes actually read.
    bool GetRawBytes(uint64_t offset, void* buffer, size_t len, size_t* read);

    // Advances the persisted merge progress by |merge_ops| completed ops.
    void UpdateMergeProgress(uint64_t merge_ops) { header_.num_merge_ops += merge_ops; }

    // Sorts/prunes the op vector to prepare it for merging.
    void InitializeMerge();

    // Accessors for the total number of data ops in the COW image.
    void set_total_data_ops(uint64_t size) { total_data_ops_ = size; }

    uint64_t total_data_ops() { return total_data_ops_; }

    // Accessors for the number of copy ops (used to size read-ahead work).
    void set_copy_ops(uint64_t size) { copy_ops_ = size; }

    uint64_t total_copy_ops() { return copy_ops_; }

    // Drops the owned fd; the reader can no longer read from the COW file.
    void CloseCowFd() { owned_fd_ = {}; }

  private:
    bool ParseOps(std::optional<uint64_t> label);
    // Counts the copy ops at the front of ops_ (copy ops are laid out first).
    uint64_t FindNumCopyops();

    android::base::unique_fd owned_fd_;
    android::base::borrowed_fd fd_;
@@ -162,6 +165,7 @@ class CowReader : public ICowReader {
    std::optional<uint64_t> last_label_;
    std::shared_ptr<std::vector<CowOperation>> ops_;
    uint64_t total_data_ops_;
    uint64_t copy_ops_;
};

}  // namespace snapshot
+283 −7
Original line number Diff line number Diff line
@@ -47,6 +47,9 @@ bool Snapuserd::InitializeWorkers() {

        worker_threads_.push_back(std::move(wt));
    }

    read_ahead_thread_ = std::make_unique<ReadAheadThread>(cow_device_, backing_store_device_,
                                                           misc_name_, GetSharedPtr());
    return true;
}

@@ -54,7 +57,11 @@ bool Snapuserd::CommitMerge(int num_merge_ops) {
    struct CowHeader* ch = reinterpret_cast<struct CowHeader*>(mapped_addr_);
    ch->num_merge_ops += num_merge_ops;

    // Sync the first 4k block
    if (read_ahead_feature_ && read_ahead_ops_.size() > 0) {
        struct BufferState* ra_state = GetBufferState();
        ra_state->read_ahead_state = kCowReadAheadInProgress;
    }

    int ret = msync(mapped_addr_, BLOCK_SZ, MS_SYNC);
    if (ret < 0) {
        PLOG(ERROR) << "msync header failed: " << ret;
@@ -66,6 +73,174 @@ bool Snapuserd::CommitMerge(int num_merge_ops) {
    return true;
}

// Decides whether the read-ahead data must be re-constructed from the
// COW device, then kicks off the first read-ahead cycle.
void Snapuserd::PrepareReadAhead() {
    if (!read_ahead_feature_) {
        return;
    }

    // kCowReadAheadDone is the persisted checkpoint: if the previous boot
    // flushed a complete batch into the scratch space, that data has to be
    // re-populated from the COW device on this boot.
    struct BufferState* ra_state = GetBufferState();
    populate_data_from_cow_ = (ra_state->read_ahead_state == kCowReadAheadDone);

    StartReadAhead();
}

// Looks up |block| in the read-ahead cache and, on a hit, copies one
// BLOCK_SZ worth of data into |buffer|. Caller must hold lock_.
bool Snapuserd::GetRABuffer(std::unique_lock<std::mutex>* lock, uint64_t block, void* buffer) {
    CHECK(lock->owns_lock());

    auto entry = read_ahead_buffer_map_.find(block);
    if (entry == read_ahead_buffer_map_.end()) {
        // Only IOs generated while reading a root filesystem can miss the
        // cache; merge-related IOs are always present in the read-ahead cache.
        return false;
    }

    // A zero-copy path (handing the cached page straight back to the kernel)
    // is possible in theory, but un-aligned IO would then have to reach into
    // these buffers and the transitions become a bit more complicated - so
    // copy the block out instead.
    memcpy(buffer, entry->second, BLOCK_SZ);
    return true;
}

// ========== State transition functions for read-ahead operations ===========

// Fetches |block| from the read-ahead cache into |buffer|, waiting for the
// read-ahead thread if its IO is still in flight. Returns false if the
// feature is off, the read-ahead thread failed, or the block is not cached.
bool Snapuserd::GetReadAheadPopulatedBuffer(uint64_t block, void* buffer) {
    if (!read_ahead_feature_) {
        return false;
    }

    // Hold the lock across the whole check-wait-fetch sequence. The previous
    // code released and re-acquired the mutex between the fast-path check and
    // the wait, and - inconsistently - serviced a READ_AHEAD_FAILURE observed
    // after the wait by probing the cache instead of failing the lookup the
    // way the fast path does.
    std::unique_lock<std::mutex> lock(lock_);
    while (!(io_state_ == READ_AHEAD_IO_TRANSITION::IO_IN_PROGRESS ||
             io_state_ == READ_AHEAD_IO_TRANSITION::READ_AHEAD_FAILURE)) {
        // Read-ahead thread IO is in-progress. Wait for it to complete.
        cv.wait(lock);
    }

    if (io_state_ == READ_AHEAD_IO_TRANSITION::READ_AHEAD_FAILURE) {
        return false;
    }

    return GetRABuffer(&lock, block, buffer);
}

// This is invoked by read-ahead thread waiting for merge IO's
// to complete
bool Snapuserd::WaitForMergeToComplete() {
    {
        std::unique_lock<std::mutex> lock(lock_);
        while (!(io_state_ == READ_AHEAD_IO_TRANSITION::READ_AHEAD_BEGIN ||
                 io_state_ == READ_AHEAD_IO_TRANSITION::IO_TERMINATED)) {
            cv.wait(lock);
        }

        if (io_state_ == READ_AHEAD_IO_TRANSITION::IO_TERMINATED) {
            return false;
        }

        io_state_ = READ_AHEAD_IO_TRANSITION::READ_AHEAD_IN_PROGRESS;
        return true;
    }
}

// This is invoked during the launch of worker threads. We wait
// for read-ahead thread to by fully up before worker threads
// are launched; else we will have a race between worker threads
// and read-ahead thread specifically during re-construction.
bool Snapuserd::WaitForReadAheadToStart() {
    {
        std::unique_lock<std::mutex> lock(lock_);
        while (!(io_state_ == READ_AHEAD_IO_TRANSITION::IO_IN_PROGRESS ||
                 io_state_ == READ_AHEAD_IO_TRANSITION::READ_AHEAD_FAILURE)) {
            cv.wait(lock);
        }

        if (io_state_ == READ_AHEAD_IO_TRANSITION::READ_AHEAD_FAILURE) {
            return false;
        }

        return true;
    }
}

// Invoked by worker threads when a sequence of merge operations is
// complete, notifying the read-ahead thread to make forward progress.
// Also invoked once from PrepareReadAhead() to start the very first
// read-ahead cycle.
void Snapuserd::StartReadAhead() {
    {
        std::lock_guard<std::mutex> lock(lock_);
        io_state_ = READ_AHEAD_IO_TRANSITION::READ_AHEAD_BEGIN;
    }

    // Wake the read-ahead thread blocked in WaitForMergeToComplete().
    cv.notify_one();
}

// Invoked after all worker threads have finished (success or failure).
// Moves the state machine to IO_TERMINATED so a read-ahead thread blocked
// in WaitForMergeToComplete() wakes up and exits.
void Snapuserd::MergeCompleted() {
    {
        std::lock_guard<std::mutex> lock(lock_);
        io_state_ = READ_AHEAD_IO_TRANSITION::IO_TERMINATED;
    }

    cv.notify_one();
}

// Invoked by the read-ahead thread once a batch of blocks has been copied
// into the scratch space. Persists data first, then the checkpoint state,
// then publishes the cache to the worker threads. The msync ordering below
// is deliberate and must not be reordered (crash consistency).
bool Snapuserd::ReadAheadIOCompleted() {
    // Flush the entire buffer region
    int ret = msync(mapped_addr_, total_mapped_addr_length_, MS_SYNC);
    if (ret < 0) {
        PLOG(ERROR) << "msync failed after ReadAheadIOCompleted: " << ret;
        return false;
    }

    // Metadata and data are synced. Now, update the state.
    // We need to update the state after flushing data; if there is a crash
    // when read-ahead IO is in progress, the state of data in the COW file
    // is unknown. kCowReadAheadDone acts as a checkpoint wherein the data
    // in the scratch space is good and during next reboot, read-ahead thread
    // can safely re-construct the data.
    struct BufferState* ra_state = GetBufferState();
    ra_state->read_ahead_state = kCowReadAheadDone;

    // Flush only the first 4k block holding the state checkpoint.
    ret = msync(mapped_addr_, BLOCK_SZ, MS_SYNC);
    if (ret < 0) {
        PLOG(ERROR) << "msync failed to flush Readahead completion state...";
        return false;
    }

    // Notify the worker threads
    {
        std::lock_guard<std::mutex> lock(lock_);
        io_state_ = READ_AHEAD_IO_TRANSITION::IO_IN_PROGRESS;
    }

    cv.notify_all();
    return true;
}

// Invoked by the read-ahead thread on an IO error. Wakes every waiter
// (worker threads blocked in GetReadAheadPopulatedBuffer() or
// WaitForReadAheadToStart()) so they can observe the failure and bail out.
void Snapuserd::ReadAheadIOFailed() {
    {
        std::lock_guard<std::mutex> lock(lock_);
        io_state_ = READ_AHEAD_IO_TRANSITION::READ_AHEAD_FAILURE;
    }

    cv.notify_all();
}

//========== End of state transition functions ====================

bool Snapuserd::IsChunkIdMetadata(chunk_t chunk) {
    uint32_t stride = exceptions_per_area_ + 1;
    lldiv_t divresult = lldiv(chunk, stride);
@@ -257,11 +432,14 @@ bool Snapuserd::ReadMetadata() {
        data_chunk_id = GetNextAllocatableChunkId(data_chunk_id);
    }

    int num_ra_ops_per_iter = ((GetBufferDataSize()) / BLOCK_SZ);
    std::optional<chunk_t> prev_id = {};
    std::map<uint64_t, const CowOperation*> map;
    std::set<uint64_t> dest_blocks;
    size_t pending_copy_ops = exceptions_per_area_ - num_ops;
    SNAP_LOG(INFO) << " Processing copy-ops at Area: " << vec_.size()
    uint64_t total_copy_ops = reader_->total_copy_ops();

    SNAP_LOG(DEBUG) << " Processing copy-ops at Area: " << vec_.size()
                    << " Number of replace/zero ops completed in this area: " << num_ops
                    << " Pending copy ops for this area: " << pending_copy_ops;
    while (!cowop_riter_->Done()) {
@@ -425,6 +603,9 @@ bool Snapuserd::ReadMetadata() {
            offset += sizeof(struct disk_exception);
            num_ops += 1;
            copy_ops++;
            if (read_ahead_feature_) {
                read_ahead_ops_.push_back(it->second);
            }

            SNAP_LOG(DEBUG) << num_ops << ":"
                            << " Copy-op: "
@@ -452,6 +633,15 @@ bool Snapuserd::ReadMetadata() {
            }

            data_chunk_id = GetNextAllocatableChunkId(data_chunk_id);
            total_copy_ops -= 1;
            /*
             * Split the number of ops based on the size of read-ahead buffer
             * region. We need to ensure that kernel doesn't issue IO on blocks
             * which are not read by the read-ahead thread.
             */
            if (read_ahead_feature_ && (total_copy_ops % num_ra_ops_per_iter == 0)) {
                data_chunk_id = GetNextAllocatableChunkId(data_chunk_id);
            }
        }
        map.clear();
        dest_blocks.clear();
@@ -469,6 +659,7 @@ bool Snapuserd::ReadMetadata() {

    chunk_vec_.shrink_to_fit();
    vec_.shrink_to_fit();
    read_ahead_ops_.shrink_to_fit();

    // Sort the vector based on sectors as we need this during un-aligned access
    std::sort(chunk_vec_.begin(), chunk_vec_.end(), compare);
@@ -483,6 +674,8 @@ bool Snapuserd::ReadMetadata() {
    // Total number of sectors required for creating dm-user device
    num_sectors_ = ChunkToSector(data_chunk_id);
    merge_initiated_ = false;
    PrepareReadAhead();

    return true;
}

@@ -490,8 +683,15 @@ bool Snapuserd::MmapMetadata() {
    CowHeader header;
    reader_->GetHeader(&header);

    // mmap the first 4k page
    if (header.major_version >= 2 && header.buffer_size > 0) {
        total_mapped_addr_length_ = header.header_size + BUFFER_REGION_DEFAULT_SIZE;
        read_ahead_feature_ = true;
    } else {
        // mmap the first 4k page - older COW format
        total_mapped_addr_length_ = BLOCK_SZ;
        read_ahead_feature_ = false;
    }

    mapped_addr_ = mmap(NULL, total_mapped_addr_length_, PROT_READ | PROT_WRITE, MAP_SHARED,
                        cow_fd_.get(), 0);
    if (mapped_addr_ == MAP_FAILED) {
@@ -529,11 +729,26 @@ bool Snapuserd::InitCowDevice() {
}

/*
 * Entry point to launch worker threads
 * Entry point to launch threads
 */
bool Snapuserd::Start() {
    std::vector<std::future<bool>> threads;
    std::future<bool> ra_thread;
    bool rathread = (read_ahead_feature_ && (read_ahead_ops_.size() > 0));

    // Start the read-ahead thread and wait
    // for it as the data has to be re-constructed
    // from COW device.
    if (rathread) {
        ra_thread = std::async(std::launch::async, &ReadAheadThread::RunThread,
                               read_ahead_thread_.get());
        if (!WaitForReadAheadToStart()) {
            SNAP_LOG(ERROR) << "Failed to start Read-ahead thread...";
            return false;
        }
    }

    // Launch worker threads
    for (int i = 0; i < worker_threads_.size(); i++) {
        threads.emplace_back(
                std::async(std::launch::async, &WorkerThread::RunThread, worker_threads_[i].get()));
@@ -544,8 +759,69 @@ bool Snapuserd::Start() {
        ret = t.get() && ret;
    }

    if (rathread) {
        // Notify the read-ahead thread that all worker threads
        // are done. We need this explicit notification when
        // there is an IO failure or there was a switch
        // of dm-user table; thus, forcing the read-ahead
        // thread to wake up.
        MergeCompleted();
        ret = ret && ra_thread.get();
    }

    return ret;
}

uint64_t Snapuserd::GetBufferMetadataOffset() {
    CowHeader header;
    reader_->GetHeader(&header);

    size_t size = header.header_size + sizeof(BufferState);
    return size;
}

/*
 * Size of the metadata portion of the scratch space.
 *
 * Each ScratchMetadata entry is 16 bytes and describes one BLOCK_SZ
 * block of cached data; for the default 2MB buffer this works out to
 * 8k (2 pages) of metadata.
 */
size_t Snapuserd::GetBufferMetadataSize() {
    CowHeader header;
    reader_->GetHeader(&header);

    return (header.buffer_size * sizeof(struct ScratchMetadata)) / BLOCK_SZ;
}

// Offset of the data portion of the scratch space: the cached block data
// follows the metadata region, which itself follows the COW header.
size_t Snapuserd::GetBufferDataOffset() {
    CowHeader header;
    reader_->GetHeader(&header);

    size_t data_offset = header.header_size + GetBufferMetadataSize();
    return data_offset;
}

/*
 * Bytes available in the scratch space for cached block data
 * (2MB - 8K = 2088960 bytes with the default buffer size).
 */
size_t Snapuserd::GetBufferDataSize() {
    CowHeader header;
    reader_->GetHeader(&header);

    return header.buffer_size - GetBufferMetadataSize();
}

// Returns a pointer to the BufferState persisted at the very start of the
// scratch space (immediately after the COW header) inside the mmap'ed
// region of the COW device.
struct BufferState* Snapuserd::GetBufferState() {
    CowHeader header;
    reader_->GetHeader(&header);

    // Use named casts instead of the previous C-style cast: static_cast for
    // the void* -> char* step (to do byte arithmetic), reinterpret_cast for
    // re-typing the raw bytes as BufferState.
    struct BufferState* ra_state = reinterpret_cast<struct BufferState*>(
            static_cast<char*>(mapped_addr_) + header.header_size);
    return ra_state;
}

}  // namespace snapshot
}  // namespace android
Loading