Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 91b351ea authored by David Anderson's avatar David Anderson
Browse files

Perform a consistency check before deleting snapshots.

If for some reason the COW state is not fully synced to disk, but
dm-snapshot has flushed its pending merges, we do not want to delete
snapshots. Doing so could potentially leave blocks unmerged.

This situation is quite unexpected so we label it as a merge failure.
The device can recover by completely syncing the COW state, and then
rebooting, which will attempt to make forward progress on the merge.

Bug: 190582627
Test: vts_libsnapshot_test
      full OTA on bramble
      incremental OTA on bramble
Change-Id: Ib887f1d9e4397a712ed2f800cc1222cf9305a039
Merged-In: Ib887f1d9e4397a712ed2f800cc1222cf9305a039
parent 028303d5
Loading
Loading
Loading
Loading
+7 −0
Original line number Diff line number Diff line
@@ -158,6 +158,13 @@ enum MergeFailureCode {
    ExpectedMergeTarget = 11;
    UnmergedSectorsAfterCompletion = 12;
    UnexpectedMergeState = 13;
    GetCowPathConsistencyCheck = 14;
    OpenCowConsistencyCheck = 15;
    ParseCowConsistencyCheck = 16;
    OpenCowDirectConsistencyCheck = 17;
    MemAlignConsistencyCheck = 18;
    DirectReadConsistencyCheck = 19;
    WrongMergeCountConsistencyCheck = 20;
};

// Next: 8
+2 −3
Original line number Diff line number Diff line
@@ -143,12 +143,11 @@ class CowReader : public ICowReader {

    void InitializeMerge();

    // Stores |size| as the number of copy, replace, and zero ops. Set if
    // InitializeMerge is called.
    void set_total_data_ops(uint64_t size) { total_data_ops_ = size; }

    // Returns total_data_ops_, the count written by set_total_data_ops().
    uint64_t total_data_ops() { return total_data_ops_; }

    // Stores |size| as the number of copy ops. Set if InitializeMerge is called.
    void set_copy_ops(uint64_t size) { copy_ops_ = size; }

    // Returns copy_ops_, the count written by set_copy_ops().
    uint64_t total_copy_ops() { return copy_ops_; }

    // Resets owned_fd_ to a default-constructed value.
    // NOTE(review): presumably this closes the descriptor owned_fd_ was holding;
    // confirm against the declared type of owned_fd_.
    void CloseCowFd() { owned_fd_ = {}; }
+2 −0
Original line number Diff line number Diff line
@@ -603,6 +603,8 @@ class SnapshotManager final : public ISnapshotManager {
    MergeResult CheckMergeState(LockedFile* lock, const std::function<bool()>& before_cancel);
    MergeResult CheckTargetMergeState(LockedFile* lock, const std::string& name,
                                      const SnapshotUpdateStatus& update_status);
    // Checks that the merge progress recorded in the COW of snapshot |name|
    // agrees with the number of operations the COW contains. Returns
    // MergeFailureCode::Ok if they agree (or if the snapshot is not verified),
    // otherwise a *ConsistencyCheck failure code. |lock| must be non-null.
    MergeFailureCode CheckMergeConsistency(LockedFile* lock, const std::string& name,
                                           const SnapshotStatus& status);

    // Interact with status files under /metadata/ota/snapshots.
    bool WriteSnapshotStatus(LockedFile* lock, const SnapshotStatus& status);
+91 −8
Original line number Diff line number Diff line
@@ -1126,6 +1126,11 @@ auto SnapshotManager::CheckTargetMergeState(LockedFile* lock, const std::string&
        return MergeResult(UpdateState::Merging);
    }

    auto code = CheckMergeConsistency(lock, name, snapshot_status);
    if (code != MergeFailureCode::Ok) {
        return MergeResult(UpdateState::MergeFailed, code);
    }

    // Merging is done. First, update the status file to indicate the merge
    // is complete. We do this before calling OnSnapshotMergeComplete, even
    // though this means the write is potentially wasted work (since in the
@@ -1144,6 +1149,91 @@ auto SnapshotManager::CheckTargetMergeState(LockedFile* lock, const std::string&
    return MergeResult(UpdateState::MergeCompleted, MergeFailureCode::Ok);
}

// Picks the device-mapper name of the COW backing device for |snapshot|. This
// is the backing device, not the dm-user layer.
static std::string GetMappedCowDeviceName(const std::string& snapshot,
                                          const SnapshotStatus& status) {
    // With a COW partition present, the partition-backed COW name is used.
    if (status.cow_partition_size() != 0) {
        return GetCowName(snapshot);
    }
    // Otherwise no partition was created (the COW lives entirely on /data) and
    // the device-mapper layering differs, so use the COW image device name.
    return GetCowImageDeviceName(snapshot);
}

// Verifies that the merge count recorded in the COW header of snapshot |name|
// matches the number of non-metadata operations in the COW.
//
// The COW device is read twice: once through CowReader to count the
// operations, and once with O_DIRECT | O_SYNC to read the header's
// num_merge_ops field.
//
// Returns MergeFailureCode::Ok when the counts match, or when compression is
// not enabled for the snapshot (old-style COWs are not verified). Otherwise
// returns the *ConsistencyCheck code for the step that failed. |lock| must be
// non-null.
MergeFailureCode SnapshotManager::CheckMergeConsistency(LockedFile* lock, const std::string& name,
                                                        const SnapshotStatus& status) {
    CHECK(lock);

    if (!status.compression_enabled()) {
        // Do not try to verify old-style COWs yet.
        return MergeFailureCode::Ok;
    }

    auto& dm = DeviceMapper::Instance();

    std::string cow_image_name = GetMappedCowDeviceName(name, status);
    std::string cow_image_path;
    if (!dm.GetDmDevicePathByName(cow_image_name, &cow_image_path)) {
        LOG(ERROR) << "Failed to get path for cow device: " << cow_image_name;
        return MergeFailureCode::GetCowPathConsistencyCheck;
    }

    // First pass, count # of ops.
    size_t num_ops = 0;
    {
        unique_fd fd(open(cow_image_path.c_str(), O_RDONLY | O_CLOEXEC));
        if (fd < 0) {
            PLOG(ERROR) << "Failed to open " << cow_image_name;
            return MergeFailureCode::OpenCowConsistencyCheck;
        }

        CowReader reader;
        if (!reader.Parse(std::move(fd))) {
            LOG(ERROR) << "Failed to parse cow " << cow_image_path;
            return MergeFailureCode::ParseCowConsistencyCheck;
        }

        // Metadata ops are excluded from the count.
        for (auto iter = reader.GetOpIter(); !iter->Done(); iter->Next()) {
            if (!IsMetadataOp(iter->Get())) {
                num_ops++;
            }
        }
    }

    // Second pass, try as hard as we can to get the actual number of blocks
    // the system thinks is merged.
    unique_fd fd(open(cow_image_path.c_str(), O_RDONLY | O_DIRECT | O_SYNC | O_CLOEXEC));
    if (fd < 0) {
        PLOG(ERROR) << "Failed to open direct " << cow_image_name;
        return MergeFailureCode::OpenCowDirectConsistencyCheck;
    }

    // The read buffer is page-aligned and one page long.
    void* addr = nullptr;
    size_t page_size = getpagesize();
    // posix_memalign reports failure through its return value (an errno-style
    // code, never negative) and does not set errno, so test for non-zero and
    // hand the code to PLOG via errno.
    int memalign_rc = posix_memalign(&addr, page_size, page_size);
    if (memalign_rc != 0) {
        errno = memalign_rc;
        PLOG(ERROR) << "posix_memalign with page size " << page_size;
        return MergeFailureCode::MemAlignConsistencyCheck;
    }
    std::unique_ptr<void, decltype(&::free)> buffer(addr, ::free);

    // COWs are always at least 2MB, this is guaranteed in snapshot creation.
    if (!android::base::ReadFully(fd, buffer.get(), page_size)) {
        PLOG(ERROR) << "Direct read failed " << cow_image_name;
        return MergeFailureCode::DirectReadConsistencyCheck;
    }

    // The start of the page that was just read is interpreted as the COW header.
    auto header = reinterpret_cast<CowHeader*>(buffer.get());
    if (header->num_merge_ops != num_ops) {
        LOG(ERROR) << "COW consistency check failed, expected " << num_ops << " to be merged, "
                   << "but " << header->num_merge_ops << " were actually recorded.";
        LOG(ERROR) << "Aborting merge progress for snapshot " << name
                   << ", will try again next boot";
        return MergeFailureCode::WrongMergeCountConsistencyCheck;
    }

    return MergeFailureCode::Ok;
}

MergeFailureCode SnapshotManager::MergeSecondPhaseSnapshots(LockedFile* lock) {
    std::vector<std::string> snapshots;
    if (!ListSnapshots(lock, &snapshots)) {
@@ -1429,14 +1519,7 @@ bool SnapshotManager::PerformInitTransition(InitTransition transition,
            continue;
        }

        // If no partition was created (the COW exists entirely on /data), the
        // device-mapper layering is different than if we had a partition.
        std::string cow_image_name;
        if (snapshot_status.cow_partition_size() == 0) {
            cow_image_name = GetCowImageDeviceName(snapshot);
        } else {
            cow_image_name = GetCowName(snapshot);
        }
        std::string cow_image_name = GetMappedCowDeviceName(snapshot, snapshot_status);

        std::string cow_image_device;
        if (!dm.GetDmDevicePathByName(cow_image_name, &cow_image_device)) {