fs/xfs/xfs_aops.c  (+267 −466)

@@ -40,6 +40,17 @@
 #define XFS_DIO_FLAG_UNWRITTEN	(1 << 0)
 #define XFS_DIO_FLAG_APPEND	(1 << 1)
 
+/*
+ * structure owned by writepages passed to individual writepage calls
+ */
+struct xfs_writepage_ctx {
+	struct xfs_bmbt_irec	imap;
+	bool			imap_valid;
+	unsigned int		io_type;
+	struct xfs_ioend	*ioend;
+	sector_t		last_block;
+};
+
 void
 xfs_count_page_state(
 	struct page		*page,

@@ -271,7 +282,7 @@ xfs_alloc_ioend(
 	 */
 	atomic_set(&ioend->io_remaining, 1);
 	ioend->io_error = 0;
-	ioend->io_list = NULL;
+	INIT_LIST_HEAD(&ioend->io_list);
 	ioend->io_type = type;
 	ioend->io_inode = inode;
 	ioend->io_buffer_head = NULL;

@@ -289,8 +300,7 @@ xfs_map_blocks(
 	struct inode		*inode,
 	loff_t			offset,
 	struct xfs_bmbt_irec	*imap,
-	int			type,
-	int			nonblocking)
+	int			type)
 {
 	struct xfs_inode	*ip = XFS_I(inode);
 	struct xfs_mount	*mp = ip->i_mount;

@@ -306,12 +316,7 @@ xfs_map_blocks(
 	if (type == XFS_IO_UNWRITTEN)
 		bmapi_flags |= XFS_BMAPI_IGSTATE;
 
-	if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
-		if (nonblocking)
-			return -EAGAIN;
-		xfs_ilock(ip, XFS_ILOCK_SHARED);
-	}
-
+	xfs_ilock(ip, XFS_ILOCK_SHARED);
 	ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
 	       (ip->i_df.if_flags & XFS_IFEXTENTS));
 	ASSERT(offset <= mp->m_super->s_maxbytes);

@@ -347,7 +352,7 @@ xfs_map_blocks(
 	return 0;
 }
 
-STATIC int
+STATIC bool
 xfs_imap_valid(
 	struct inode		*inode,
 	struct xfs_bmbt_irec	*imap,

@@ -420,8 +425,7 @@ xfs_start_buffer_writeback(
 STATIC void
 xfs_start_page_writeback(
 	struct page		*page,
-	int			clear_dirty,
-	int			buffers)
+	int			clear_dirty)
 {
 	ASSERT(PageLocked(page));
 	ASSERT(!PageWriteback(page));

@@ -440,10 +444,6 @@ xfs_start_page_writeback(
 		set_page_writeback_keepwrite(page);
 
 	unlock_page(page);
-
-	/* If no buffers on the page are to be written, finish it here */
-	if (!buffers)
-		end_page_writeback(page);
 }
 
 static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh)
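The new struct xfs_writepage_ctx at the top of this diff is the heart of the change: state that previously lived in xfs_vm_writepage() locals, and so died with each page, now survives across every page of a writeback pass. Below is a minimal, hypothetical sketch of the pattern (the demo_* names are illustrative, not part of this patch) showing how a stack-allocated context threads through write_cache_pages() via its void *data argument; the real hookup appears near the end of this diff in xfs_vm_writepages().

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>

/* illustrative analogue of struct xfs_writepage_ctx */
struct demo_writeback_ctx {
	unsigned int	io_type;	/* type of I/O currently being built */
	sector_t	last_block;	/* last block queued, for contiguity */
};

/* matches writepage_t, so it can be passed to write_cache_pages() */
static int
demo_writepage(struct page *page, struct writeback_control *wbc, void *data)
{
	struct demo_writeback_ctx *ctx = data;

	/*
	 * Per-page work reads and updates the shared context, so a
	 * mapping found for one page can be reused by the pages that
	 * follow it in the same pass.
	 */
	ctx->last_block = (sector_t)page->index;

	/* this demo writes nothing; redirty and unlock is the safe exit */
	redirty_page_for_writepage(wbc, page);
	unlock_page(page);
	return 0;
}

static int
demo_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
	struct demo_writeback_ctx ctx = { .io_type = 0 };

	/* one context instance is shared by every page in this pass */
	return write_cache_pages(mapping, wbc, demo_writepage, &ctx);
}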
@@ -452,64 +452,41 @@ static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh)
 }
 
 /*
- * Submit all of the bios for all of the ioends we have saved up, covering the
- * initial writepage page and also any probed pages.
- *
- * Because we may have multiple ioends spanning a page, we need to start
- * writeback on all the buffers before we submit them for I/O. If we mark the
- * buffers as we got, then we can end up with a page that only has buffers
- * marked async write and I/O complete on can occur before we mark the other
- * buffers async write.
- *
- * The end result of this is that we trip a bug in end_page_writeback() because
- * we call it twice for the one page as the code in end_buffer_async_write()
- * assumes that all buffers on the page are started at the same time.
- *
- * The fix is two passes across the ioend list - one to start writeback on the
- * buffer_heads, and then submit them for I/O on the second pass.
+ * Submit all of the bios for an ioend. We are only passed a single ioend at a
+ * time; the caller is responsible for chaining prior to submission.
+ *
+ * If @status is non-zero, it means that we have a situation where some part of
+ * the submission process has failed after we have marked pages for writeback
+ * and unlocked them. In this situation, we need to fail the ioend chain rather
+ * than submit it to IO. This typically only happens on a filesystem shutdown.
  */
-STATIC void
+STATIC int
 xfs_submit_ioend(
 	struct writeback_control *wbc,
 	xfs_ioend_t		*ioend,
-	int			fail)
+	int			status)
 {
-	xfs_ioend_t		*head = ioend;
-	xfs_ioend_t		*next;
 	struct buffer_head	*bh;
 	struct bio		*bio;
 	sector_t		lastblock = 0;
 
-	/* Pass 1 - start writeback */
-	do {
-		next = ioend->io_list;
-		for (bh = ioend->io_buffer_head; bh; bh = bh->b_private)
-			xfs_start_buffer_writeback(bh);
-	} while ((ioend = next) != NULL);
-
-	/* Pass 2 - submit I/O */
-	ioend = head;
-	do {
-		next = ioend->io_list;
-		bio = NULL;
+	/* Reserve log space if we might write beyond the on-disk inode size. */
+	if (!status &&
+	    ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend))
+		status = xfs_setfilesize_trans_alloc(ioend);
 
 	/*
 	 * If we are failing the IO now, just mark the ioend with an
 	 * error and finish it. This will run IO completion immediately
 	 * as there is only one reference to the ioend at this point in
 	 * time.
 	 */
-	if (fail) {
-		ioend->io_error = fail;
+	if (status) {
+		ioend->io_error = status;
 		xfs_finish_ioend(ioend);
-		continue;
+		return status;
 	}
 
+	bio = NULL;
 	for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {
 		if (!bio) {

@@ -530,75 +507,46 @@ xfs_submit_ioend(
 
 	if (bio)
 		xfs_submit_ioend_bio(wbc, ioend, bio);
-	xfs_finish_ioend(ioend);
-	} while ((ioend = next) != NULL);
-}
-
-/*
- * Cancel submission of all buffer_heads so far in this endio.
- * Toss the endio too. Only ever called for the initial page
- * in a writepage request, so only ever one page.
- */
-STATIC void
-xfs_cancel_ioend(
-	xfs_ioend_t		*ioend)
-{
-	xfs_ioend_t		*next;
-	struct buffer_head	*bh, *next_bh;
-
-	do {
-		next = ioend->io_list;
-		bh = ioend->io_buffer_head;
-		do {
-			next_bh = bh->b_private;
-			clear_buffer_async_write(bh);
-			/*
-			 * The unwritten flag is cleared when added to the
-			 * ioend. We're not submitting for I/O so mark the
-			 * buffer unwritten again for next time around.
-			 */
-			if (ioend->io_type == XFS_IO_UNWRITTEN)
-				set_buffer_unwritten(bh);
-			unlock_buffer(bh);
-		} while ((bh = next_bh) != NULL);
-
-		mempool_free(ioend, xfs_ioend_pool);
-	} while ((ioend = next) != NULL);
+	return 0;
 }
 
 /*
  * Test to see if we've been building up a completion structure for
  * earlier buffers -- if so, we try to append to this ioend if we
  * can, otherwise we finish off any current ioend and start another.
- * Return true if we've finished the given ioend.
+ *
+ * Return the ioend we finished off so that the caller can submit it
+ * once it has finished processing the dirty page.
  */
 STATIC void
 xfs_add_to_ioend(
 	struct inode		*inode,
 	struct buffer_head	*bh,
 	xfs_off_t		offset,
-	unsigned int		type,
-	xfs_ioend_t		**result,
-	int			need_ioend)
+	struct xfs_writepage_ctx *wpc,
+	struct list_head	*iolist)
 {
-	xfs_ioend_t		*ioend = *result;
-
-	if (!ioend || need_ioend || type != ioend->io_type) {
-		xfs_ioend_t	*previous = *result;
-
-		ioend = xfs_alloc_ioend(inode, type);
-		ioend->io_offset = offset;
-		ioend->io_buffer_head = bh;
-		ioend->io_buffer_tail = bh;
-		if (previous)
-			previous->io_list = ioend;
-		*result = ioend;
+	if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type ||
+	    bh->b_blocknr != wpc->last_block + 1 ||
+	    offset != wpc->ioend->io_offset + wpc->ioend->io_size) {
+		struct xfs_ioend	*new;
+
+		if (wpc->ioend)
+			list_add(&wpc->ioend->io_list, iolist);
+
+		new = xfs_alloc_ioend(inode, wpc->io_type);
+		new->io_offset = offset;
+		new->io_buffer_head = bh;
+		new->io_buffer_tail = bh;
+		wpc->ioend = new;
 	} else {
-		ioend->io_buffer_tail->b_private = bh;
-		ioend->io_buffer_tail = bh;
+		wpc->ioend->io_buffer_tail->b_private = bh;
+		wpc->ioend->io_buffer_tail = bh;
 	}
 
 	bh->b_private = NULL;
-	ioend->io_size += bh->b_size;
+	wpc->ioend->io_size += bh->b_size;
+	wpc->last_block = bh->b_blocknr;
+	xfs_start_buffer_writeback(bh);
 }
 
 STATIC void
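The three-part test at the top of the new xfs_add_to_ioend() is what makes the cached ioend grow: a buffer may only join it if nothing forces a new one. Restated as a hypothetical standalone predicate (demo_can_append is not in the patch; the types are the ones this diff defines):

/*
 * A buffer can be appended to the cached ioend only if the I/O type
 * matches and it is physically and logically contiguous: the next
 * disk block after the last one added, at the file offset where the
 * ioend currently ends.  Any mismatch starts a new ioend.
 */
static bool
demo_can_append(
	struct xfs_writepage_ctx	*wpc,
	struct buffer_head		*bh,
	xfs_off_t			offset)
{
	return wpc->ioend &&
	       wpc->io_type == wpc->ioend->io_type &&
	       bh->b_blocknr == wpc->last_block + 1 &&
	       offset == wpc->ioend->io_offset + wpc->ioend->io_size;
}

Note that the block and offset contiguity checks are new here; the old code only compared I/O types and relied on the caller's need_ioend flag to break discontiguous extents apart.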
@@ -684,183 +632,6 @@ xfs_check_page_type(
 	return false;
 }
 
-/*
- * Allocate & map buffers for page given the extent map. Write it out.
- * except for the original page of a writepage, this is called on
- * delalloc/unwritten pages only, for the original page it is possible
- * that the page has no mapping at all.
- */
-STATIC int
-xfs_convert_page(
-	struct inode		*inode,
-	struct page		*page,
-	loff_t			tindex,
-	struct xfs_bmbt_irec	*imap,
-	xfs_ioend_t		**ioendp,
-	struct writeback_control *wbc)
-{
-	struct buffer_head	*bh, *head;
-	xfs_off_t		end_offset;
-	unsigned long		p_offset;
-	unsigned int		type;
-	int			len, page_dirty;
-	int			count = 0, done = 0, uptodate = 1;
-	xfs_off_t		offset = page_offset(page);
-
-	if (page->index != tindex)
-		goto fail;
-	if (!trylock_page(page))
-		goto fail;
-	if (PageWriteback(page))
-		goto fail_unlock_page;
-	if (page->mapping != inode->i_mapping)
-		goto fail_unlock_page;
-	if (!xfs_check_page_type(page, (*ioendp)->io_type, false))
-		goto fail_unlock_page;
-
-	/*
-	 * page_dirty is initially a count of buffers on the page before
-	 * EOF and is decremented as we move each into a cleanable state.
-	 *
-	 * Derivation:
-	 *
-	 * End offset is the highest offset that this page should represent.
-	 * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1))
-	 * will evaluate non-zero and be less than PAGE_CACHE_SIZE and
-	 * hence give us the correct page_dirty count. On any other page,
-	 * it will be zero and in that case we need page_dirty to be the
-	 * count of buffers on the page.
-	 */
-	end_offset = min_t(unsigned long long,
-			(xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,
-			i_size_read(inode));
-
-	/*
-	 * If the current map does not span the entire page we are about to try
-	 * to write, then give up. The only way we can write a page that spans
-	 * multiple mappings in a single writeback iteration is via the
-	 * xfs_vm_writepage() function. Data integrity writeback requires the
-	 * entire page to be written in a single attempt, otherwise the part of
-	 * the page we don't write here doesn't get written as part of the data
-	 * integrity sync.
-	 *
-	 * For normal writeback, we also don't attempt to write partial pages
-	 * here as it simply means that write_cache_pages() will see it under
-	 * writeback and ignore the page until some point in the future, at
-	 * which time this will be the only page in the file that needs
-	 * writeback.  Hence for more optimal IO patterns, we should always
-	 * avoid partial page writeback due to multiple mappings on a page here.
-	 */
-	if (!xfs_imap_valid(inode, imap, end_offset))
-		goto fail_unlock_page;
-
-	len = 1 << inode->i_blkbits;
-	p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1),
-					PAGE_CACHE_SIZE);
-	p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE;
-	page_dirty = p_offset / len;
-
-	/*
-	 * The moment we find a buffer that doesn't match our current type
-	 * specification or can't be written, abort the loop and start
-	 * writeback. As per the above xfs_imap_valid() check, only
-	 * xfs_vm_writepage() can handle partial page writeback fully - we are
-	 * limited here to the buffers that are contiguous with the current
-	 * ioend, and hence a buffer we can't write breaks that contiguity and
-	 * we have to defer the rest of the IO to xfs_vm_writepage().
-	 */
-	bh = head = page_buffers(page);
-	do {
-		if (offset >= end_offset)
-			break;
-		if (!buffer_uptodate(bh))
-			uptodate = 0;
-		if (!(PageUptodate(page) || buffer_uptodate(bh))) {
-			done = 1;
-			break;
-		}
-
-		if (buffer_unwritten(bh) || buffer_delay(bh) ||
-		    buffer_mapped(bh)) {
-			if (buffer_unwritten(bh))
-				type = XFS_IO_UNWRITTEN;
-			else if (buffer_delay(bh))
-				type = XFS_IO_DELALLOC;
-			else
-				type = XFS_IO_OVERWRITE;
-
-			/*
-			 * imap should always be valid because of the above
-			 * partial page end_offset check on the imap.
-			 */
-			ASSERT(xfs_imap_valid(inode, imap, offset));
-
-			lock_buffer(bh);
-			if (type != XFS_IO_OVERWRITE)
-				xfs_map_at_offset(inode, bh, imap, offset);
-			xfs_add_to_ioend(inode, bh, offset, type,
-					 ioendp, done);
-
-			page_dirty--;
-			count++;
-		} else {
-			done = 1;
-			break;
-		}
-	} while (offset += len, (bh = bh->b_this_page) != head);
-
-	if (uptodate && bh == head)
-		SetPageUptodate(page);
-
-	if (count) {
-		if (--wbc->nr_to_write <= 0 &&
-		    wbc->sync_mode == WB_SYNC_NONE)
-			done = 1;
-	}
-	xfs_start_page_writeback(page, !page_dirty, count);
-
-	return done;
- fail_unlock_page:
-	unlock_page(page);
- fail:
-	return 1;
-}
-
-/*
- * Convert & write out a cluster of pages in the same extent as defined
- * by mp and following the start page.
- */
-STATIC void
-xfs_cluster_write(
-	struct inode		*inode,
-	pgoff_t			tindex,
-	struct xfs_bmbt_irec	*imap,
-	xfs_ioend_t		**ioendp,
-	struct writeback_control *wbc,
-	pgoff_t			tlast)
-{
-	struct pagevec		pvec;
-	int			done = 0, i;
-
-	pagevec_init(&pvec, 0);
-	while (!done && tindex <= tlast) {
-		unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1);
-
-		if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))
-			break;
-
-		for (i = 0; i < pagevec_count(&pvec); i++) {
-			done = xfs_convert_page(inode, pvec.pages[i], tindex++,
-					imap, ioendp, wbc);
-			if (done)
-				break;
-		}
-
-		pagevec_release(&pvec);
-		cond_resched();
-	}
-}
-
 STATIC void
 xfs_vm_invalidatepage(
 	struct page		*page,
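xfs_convert_page() and xfs_cluster_write() existed to push the pages following the current one into the same extent's I/O. With a writepage context that persists across ->writepage calls, the generic walk already provides that: it visits dirty pages in ascending index order, and the cached mapping simply carries over. Here is a heavily simplified, hypothetical skeleton of that generic loop; the real write_cache_pages() also handles cyclic ranges, nr_to_write accounting, and page-state re-checks under the page lock:

#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/sched.h>
#include <linux/writeback.h>

static int
demo_walk_dirty_pages(
	struct address_space		*mapping,
	struct writeback_control	*wbc,
	writepage_t			writepage,
	void				*data)
{
	struct pagevec		pvec;
	pgoff_t			index = 0;
	int			i;
	int			ret = 0;

	pagevec_init(&pvec, 0);
	while (!ret && pagevec_lookup_tag(&pvec, mapping, &index,
					  PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE)) {
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];

			lock_page(page);
			if (!clear_page_dirty_for_io(page)) {
				unlock_page(page);
				continue;
			}
			/* pages arrive in ascending index order, so a
			 * context threaded through @data sees them in
			 * exactly the order the clustering code did */
			ret = writepage(page, wbc, data);
			if (ret)
				break;
		}
		pagevec_release(&pvec);
		cond_resched();
	}
	return ret;
}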
@@ -937,6 +708,164 @@ xfs_aops_discard_page(
 	return;
 }
 
+/*
+ * We implement an immediate ioend submission policy here to avoid needing to
+ * chain multiple ioends and hence nest mempool allocations which can violate
+ * forward progress guarantees we need to provide. The current ioend we are
+ * adding buffers to is cached on the writepage context, and if the new buffer
+ * does not append to the cached ioend it will create a new ioend and cache
+ * that instead.
+ *
+ * If a new ioend is created and cached, the old ioend is returned and queued
+ * locally for submission once the entire page is processed or an error has
+ * been detected. While ioends are submitted immediately after they are
+ * completed, batching optimisations are provided by higher level block
+ * plugging.
+ *
+ * At the end of a writeback pass, there will be a cached ioend remaining on
+ * the writepage context that the caller will need to submit.
+ */
+static int
+xfs_writepage_map(
+	struct xfs_writepage_ctx *wpc,
+	struct writeback_control *wbc,
+	struct inode		*inode,
+	struct page		*page,
+	loff_t			offset,
+	__uint64_t              end_offset)
+{
+	LIST_HEAD(submit_list);
+	struct xfs_ioend	*ioend, *next;
+	struct buffer_head	*bh, *head;
+	ssize_t			len = 1 << inode->i_blkbits;
+	int			error = 0;
+	int			count = 0;
+	int			uptodate = 1;
+
+	bh = head = page_buffers(page);
+	offset = page_offset(page);
+
+	do {
+		if (offset >= end_offset)
+			break;
+		if (!buffer_uptodate(bh))
+			uptodate = 0;
+
+		/*
+		 * set_page_dirty dirties all buffers in a page, independent
+		 * of their state.  The dirty state however is entirely
+		 * meaningless for holes (!mapped && uptodate), so skip
+		 * buffers covering holes here.
+		 */
+		if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
+			wpc->imap_valid = false;
+			continue;
+		}
+
+		if (buffer_unwritten(bh)) {
+			if (wpc->io_type != XFS_IO_UNWRITTEN) {
+				wpc->io_type = XFS_IO_UNWRITTEN;
+				wpc->imap_valid = false;
+			}
+		} else if (buffer_delay(bh)) {
+			if (wpc->io_type != XFS_IO_DELALLOC) {
+				wpc->io_type = XFS_IO_DELALLOC;
+				wpc->imap_valid = false;
+			}
+		} else if (buffer_uptodate(bh)) {
+			if (wpc->io_type != XFS_IO_OVERWRITE) {
+				wpc->io_type = XFS_IO_OVERWRITE;
+				wpc->imap_valid = false;
+			}
+		} else {
+			if (PageUptodate(page))
+				ASSERT(buffer_mapped(bh));
+			/*
+			 * This buffer is not uptodate and will not be
+			 * written to disk.  Ensure that we will put any
+			 * subsequent writeable buffers into a new
+			 * ioend.
+			 */
+			wpc->imap_valid = false;
+			continue;
+		}
+
+		if (wpc->imap_valid)
+			wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap,
+							 offset);
+		if (!wpc->imap_valid) {
+			error = xfs_map_blocks(inode, offset, &wpc->imap,
+					       wpc->io_type);
+			if (error)
+				goto out;
+			wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap,
+							 offset);
+		}
+		if (wpc->imap_valid) {
+			lock_buffer(bh);
+			if (wpc->io_type != XFS_IO_OVERWRITE)
+				xfs_map_at_offset(inode, bh, &wpc->imap, offset);
+			xfs_add_to_ioend(inode, bh, offset, wpc, &submit_list);
+			count++;
+		}
+
+	} while (offset += len, ((bh = bh->b_this_page) != head));
+
+	if (uptodate && bh == head)
+		SetPageUptodate(page);
+
+	ASSERT(wpc->ioend || list_empty(&submit_list));
+
+out:
+	/*
+	 * On error, we have to fail the ioend here because we have locked
+	 * buffers in the ioend. If we don't do this, we'll deadlock
+	 * invalidating the page as that tries to lock the buffers on the page.
+	 * Also, because we may have set pages under writeback, we have to make
+	 * sure we run IO completion to mark the error state of the IO
+	 * appropriately, so we can't cancel the ioend directly here. That means
+	 * we have to mark this page as under writeback if we included any
+	 * buffers from it in the ioend chain so that completion treats it
+	 * correctly.
+	 *
+	 * If we didn't include the page in the ioend, then on error we can
+	 * simply discard and unlock it as there are no other users of the page
+	 * or its buffers right now. The caller will still need to trigger
+	 * submission of outstanding ioends on the writepage context so they
+	 * are treated correctly on error.
+	 */
+	if (count) {
+		xfs_start_page_writeback(page, !error);
+
+		/*
+		 * Preserve the original error if there was one, otherwise
+		 * catch submission errors here and propagate into subsequent
+		 * ioend submissions.
+		 */
+		list_for_each_entry_safe(ioend, next, &submit_list, io_list) {
+			int error2;
+
+			list_del_init(&ioend->io_list);
+			error2 = xfs_submit_ioend(wbc, ioend, error);
+			if (error2 && !error)
+				error = error2;
+		}
+	} else if (error) {
+		xfs_aops_discard_page(page);
+		ClearPageUptodate(page);
+		unlock_page(page);
+	} else {
+		/*
+		 * We can end up here with no error and nothing to write if we
+		 * race with a partial page truncate on a sub-page block sized
+		 * filesystem. In that case we need to mark the page clean.
+		 */
+		xfs_start_page_writeback(page, 1);
+		end_page_writeback(page);
+	}
+
+	mapping_set_error(page->mapping, error);
+	return error;
+}
+
 /*
  * Write out a dirty page.
  *
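"Batching optimisations are provided by higher level block plugging" above refers to the plug the writeback path holds around the whole page walk: ioends submitted one at a time still coalesce on the plug before hitting the request queue. A minimal, illustrative sketch of the idiom (the plug in the real path is set up by the generic writeback code, not by XFS itself):

#include <linux/blkdev.h>

static void
demo_plugged_submission(struct address_space *mapping,
			struct writeback_control *wbc)
{
	struct blk_plug		plug;

	blk_start_plug(&plug);
	/*
	 * Every bio submitted in here, for example by per-page
	 * ->writepage calls issuing one ioend at a time, is queued on
	 * the plug and can be merged with its neighbours...
	 */
	blk_finish_plug(&plug);	/* ...and is issued in a batch here. */
}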
@@ -946,22 +875,16 @@ xfs_aops_discard_page(
  * For any other dirty buffer heads on the page we should flush them.
  */
 STATIC int
-xfs_vm_writepage(
+xfs_do_writepage(
 	struct page		*page,
-	struct writeback_control *wbc)
+	struct writeback_control *wbc,
+	void			*data)
 {
+	struct xfs_writepage_ctx *wpc = data;
 	struct inode		*inode = page->mapping->host;
-	struct buffer_head	*bh, *head;
-	struct xfs_bmbt_irec	imap;
-	xfs_ioend_t		*ioend = NULL, *iohead = NULL;
 	loff_t			offset;
-	unsigned int		type;
 	__uint64_t              end_offset;
-	pgoff_t                 end_index, last_index;
-	ssize_t			len;
-	int			err, imap_valid = 0, uptodate = 1;
-	int			count = 0;
-	int			nonblocking = 0;
+	pgoff_t                 end_index;
 
 	trace_xfs_writepage(inode, page, 0, 0);

@@ -988,12 +911,9 @@ xfs_vm_writepage(
 	if (WARN_ON_ONCE(current->flags & PF_FSTRANS))
 		goto redirty;
 
-	/* Is this page beyond the end of the file? */
-	offset = i_size_read(inode);
-	end_index = offset >> PAGE_CACHE_SHIFT;
-	last_index = (offset - 1) >> PAGE_CACHE_SHIFT;
-
 	/*
+	 * Is this page beyond the end of the file?
+	 *
 	 * The page index is less than the end_index, adjust the end_offset
 	 * to the highest offset that this page should represent.
 	 * -----------------------------------------------------

@@ -1004,6 +924,8 @@ xfs_vm_writepage(
 	 * |     desired writeback range    |      see else    |
 	 * ---------------------------------^------------------|
 	 */
+	offset = i_size_read(inode);
+	end_index = offset >> PAGE_CACHE_SHIFT;
 	if (page->index < end_index)
 		end_offset = (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT;
 	else {
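The i_size_read() sampling moves below the diagram so the comment and the code it describes sit together. The mapping from page index to end_offset reduces to the following hypothetical helper (the full function additionally deals with pages entirely beyond EOF and with racing truncates):

static __uint64_t
demo_page_end_offset(struct inode *inode, struct page *page)
{
	loff_t		isize = i_size_read(inode);
	pgoff_t		end_index = isize >> PAGE_CACHE_SHIFT;

	/* pages wholly below EOF cover their full page-cache range */
	if (page->index < end_index)
		return (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT;

	/* the page straddling EOF is trimmed to the file size */
	return isize;
}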
@@ -1055,152 +977,7 @@ xfs_vm_writepage(
 		end_offset = offset;
 	}
 
-	len = 1 << inode->i_blkbits;
-
-	bh = head = page_buffers(page);
-	offset = page_offset(page);
-	type = XFS_IO_OVERWRITE;
-
-	if (wbc->sync_mode == WB_SYNC_NONE)
-		nonblocking = 1;
-
-	do {
-		int new_ioend = 0;
-
-		if (offset >= end_offset)
-			break;
-		if (!buffer_uptodate(bh))
-			uptodate = 0;
-
-		/*
-		 * set_page_dirty dirties all buffers in a page, independent
-		 * of their state.  The dirty state however is entirely
-		 * meaningless for holes (!mapped && uptodate), so skip
-		 * buffers covering holes here.
-		 */
-		if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
-			imap_valid = 0;
-			continue;
-		}
-
-		if (buffer_unwritten(bh)) {
-			if (type != XFS_IO_UNWRITTEN) {
-				type = XFS_IO_UNWRITTEN;
-				imap_valid = 0;
-			}
-		} else if (buffer_delay(bh)) {
-			if (type != XFS_IO_DELALLOC) {
-				type = XFS_IO_DELALLOC;
-				imap_valid = 0;
-			}
-		} else if (buffer_uptodate(bh)) {
-			if (type != XFS_IO_OVERWRITE) {
-				type = XFS_IO_OVERWRITE;
-				imap_valid = 0;
-			}
-		} else {
-			if (PageUptodate(page))
-				ASSERT(buffer_mapped(bh));
-			/*
-			 * This buffer is not uptodate and will not be
-			 * written to disk.  Ensure that we will put any
-			 * subsequent writeable buffers into a new
-			 * ioend.
-			 */
-			imap_valid = 0;
-			continue;
-		}
-
-		if (imap_valid)
-			imap_valid = xfs_imap_valid(inode, &imap, offset);
-		if (!imap_valid) {
-			/*
-			 * If we didn't have a valid mapping then we need to
-			 * put the new mapping into a separate ioend structure.
-			 * This ensures non-contiguous extents always have
-			 * separate ioends, which is particularly important
-			 * for unwritten extent conversion at I/O completion
-			 * time.
-			 */
-			new_ioend = 1;
-			err = xfs_map_blocks(inode, offset, &imap, type,
-					     nonblocking);
-			if (err)
-				goto error;
-			imap_valid = xfs_imap_valid(inode, &imap, offset);
-		}
-		if (imap_valid) {
-			lock_buffer(bh);
-			if (type != XFS_IO_OVERWRITE)
-				xfs_map_at_offset(inode, bh, &imap, offset);
-			xfs_add_to_ioend(inode, bh, offset, type, &ioend,
-					 new_ioend);
-			count++;
-		}
-
-		if (!iohead)
-			iohead = ioend;
-
-	} while (offset += len, ((bh = bh->b_this_page) != head));
-
-	if (uptodate && bh == head)
-		SetPageUptodate(page);
-
-	xfs_start_page_writeback(page, 1, count);
-
-	/* if there is no IO to be submitted for this page, we are done */
-	if (!ioend)
-		return 0;
-
-	ASSERT(iohead);
-
-	/*
-	 * Any errors from this point onwards need tobe reported through the IO
-	 * completion path as we have marked the initial page as under writeback
-	 * and unlocked it.
-	 */
-	if (imap_valid) {
-		xfs_off_t		end_index;
-
-		end_index = imap.br_startoff + imap.br_blockcount;
-
-		/* to bytes */
-		end_index <<= inode->i_blkbits;
-
-		/* to pages */
-		end_index = (end_index - 1) >> PAGE_CACHE_SHIFT;
-
-		/* check against file size */
-		if (end_index > last_index)
-			end_index = last_index;
-
-		xfs_cluster_write(inode, page->index + 1, &imap, &ioend,
-				  wbc, end_index);
-	}
-
-	/*
-	 * Reserve log space if we might write beyond the on-disk inode size.
-	 */
-	err = 0;
-	if (ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend))
-		err = xfs_setfilesize_trans_alloc(ioend);
-
-	xfs_submit_ioend(wbc, iohead, err);
-
-	return 0;
-
-error:
-	if (iohead)
-		xfs_cancel_ioend(iohead);
-
-	if (err == -EAGAIN)
-		goto redirty;
-
-	xfs_aops_discard_page(page);
-	ClearPageUptodate(page);
-	unlock_page(page);
-	return err;
+	return xfs_writepage_map(wpc, wbc, inode, page, offset, end_offset);
 
 redirty:
 	redirty_page_for_writepage(wbc, page);

@@ -1208,13 +985,37 @@ xfs_vm_writepage(
 	return 0;
 }
 
+STATIC int
+xfs_vm_writepage(
+	struct page		*page,
+	struct writeback_control *wbc)
+{
+	struct xfs_writepage_ctx wpc = {
+		.io_type = XFS_IO_INVALID,
+	};
+	int			ret;
+
+	ret = xfs_do_writepage(page, wbc, &wpc);
+	if (wpc.ioend)
+		ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
+	return ret;
+}
+
 STATIC int
 xfs_vm_writepages(
 	struct address_space	*mapping,
 	struct writeback_control *wbc)
 {
+	struct xfs_writepage_ctx wpc = {
+		.io_type = XFS_IO_INVALID,
+	};
+	int			ret;
+
 	xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
-	return generic_writepages(mapping, wbc);
+	ret = write_cache_pages(mapping, wbc, xfs_do_writepage, &wpc);
+	if (wpc.ioend)
+		ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
+	return ret;
 }

fs/xfs/xfs_aops.h  (+3 −1)

@@ -24,12 +24,14 @@ extern mempool_t *xfs_ioend_pool;
  * Types of I/O for bmap clustering and I/O completion tracking.
  */
 enum {
+	XFS_IO_INVALID,		/* initial state */
 	XFS_IO_DELALLOC,	/* covers delalloc region */
 	XFS_IO_UNWRITTEN,	/* covers allocated but uninitialized data */
 	XFS_IO_OVERWRITE,	/* covers already allocated extent */
 };
 
 #define XFS_IO_TYPES \
+	{ XFS_IO_INVALID,	"invalid" }, \
 	{ XFS_IO_DELALLOC,	"delalloc" }, \
 	{ XFS_IO_UNWRITTEN,	"unwritten" }, \
 	{ XFS_IO_OVERWRITE,	"overwrite" }

@@ -39,7 +41,7 @@ enum {
  * It can manage several multi-page bio's at once.
  */
 typedef struct xfs_ioend {
-	struct xfs_ioend	*io_list;	/* next ioend in chain */
+	struct list_head	io_list;	/* next ioend in chain */
 	unsigned int		io_type;	/* delalloc / unwritten */
 	int			io_error;	/* I/O error code */
 	atomic_t		io_remaining;	/* hold count */
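The io_list conversion from a hand-rolled singly linked pointer to a struct list_head is what lets xfs_writepage_map() queue completed ioends with the stock <linux/list.h> primitives, and it is why xfs_alloc_ioend() now calls INIT_LIST_HEAD() instead of NULLing a pointer. A short, illustrative sketch of the resulting idiom (demo_queue_and_drain is hypothetical):

#include <linux/list.h>

static void
demo_queue_and_drain(struct xfs_ioend *ioend)
{
	LIST_HEAD(submit_list);		/* empty, self-linked list head */
	struct xfs_ioend	*pos;
	struct xfs_ioend	*next;

	/* queueing is O(1): no tail pointer, no NULL-termination checks */
	list_add(&ioend->io_list, &submit_list);

	/* the _safe variant allows unlinking each entry mid-iteration */
	list_for_each_entry_safe(pos, next, &submit_list, io_list) {
		list_del_init(&pos->io_list);
		/* ... hand pos off for submission here ... */
	}
}

This also means an ioend is always on exactly one well-formed list (possibly just its own self-linked head), so list_del_init() is unconditionally safe where the old code had to track chain heads and tails by hand.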