Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 8ab22b9a authored by Hisashi Hifumi's avatar Hisashi Hifumi Committed by Linus Torvalds
Browse files

vfs: pagecache usage optimization for pagesize!=blocksize



When we read some part of a file through pagecache, if there is a
pagecache of corresponding index but this page is not uptodate, read IO
is issued and this page will be uptodate.

I think this is good for pagesize == blocksize environment but there is
room for improvement on pagesize != blocksize environment.  Because in
this case a page can have multiple buffers and even if a page is not
uptodate, some buffers can be uptodate.

So I suggest that when all buffers which correspond to a part of a file
that we want to read are uptodate, use this pagecache and copy data from
this pagecache to user buffer even if a page is not uptodate.  This can
reduce read IO and improve system throughput.

I wrote a benchmark program and got result number with this program.

This benchmark do:

  1: mount and open a test file.

  2: create a 512MB file.

  3: close a file and umount.

  4: mount and again open a test file.

  5: pwrite randomly 300000 times on a test file.  offset is aligned
     by IO size(1024bytes).

  6: measure time of preading randomly 100000 times on a test file.

The result was:
	2.6.26
        330 sec

	2.6.26-patched
        226 sec

Arch:i386
Filesystem:ext3
Blocksize:1024 bytes
Memory: 1GB

On ext3/4, a file is written through buffer/block.  So random read/write
mixed workloads or random read after random write workloads are optimized
with this patch under pagesize != blocksize environment.  This test result
showed this.

The benchmark program is as follows:

#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <time.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mount.h>

#define LEN 1024
#define LOOP 1024*512 /* 512MB */

main(void)
{
	unsigned long i, offset, filesize;
	int fd;
	char buf[LEN];
	time_t t1, t2;

	if (mount("/dev/sda1", "/root/test1/", "ext3", 0, 0) < 0) {
		perror("cannot mount\n");
		exit(1);
	}
	memset(buf, 0, LEN);
	fd = open("/root/test1/testfile", O_CREAT|O_RDWR|O_TRUNC);
	if (fd < 0) {
		perror("cannot open file\n");
		exit(1);
	}
	for (i = 0; i < LOOP; i++)
		write(fd, buf, LEN);
	close(fd);
	if (umount("/root/test1/") < 0) {
		perror("cannot umount\n");
		exit(1);
	}
	if (mount("/dev/sda1", "/root/test1/", "ext3", 0, 0) < 0) {
		perror("cannot mount\n");
		exit(1);
	}
	fd = open("/root/test1/testfile", O_RDWR);
	if (fd < 0) {
		perror("cannot open file\n");
		exit(1);
	}

	filesize = LEN * LOOP;
	for (i = 0; i < 300000; i++){
		offset = (random() % filesize) & (~(LEN - 1));
		pwrite(fd, buf, LEN, offset);
	}
	printf("start test\n");
	time(&t1);
	for (i = 0; i < 100000; i++){
		offset = (random() % filesize) & (~(LEN - 1));
		pread(fd, buf, LEN, offset);
	}
	time(&t2);
	printf("%ld sec\n", t2-t1);
	close(fd);
	if (umount("/root/test1/") < 0) {
		perror("cannot umount\n");
		exit(1);
	}
}

Signed-off-by: default avatarHisashi Hifumi <hifumi.hisashi@oss.ntt.co.jp>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Jan Kara <jack@ucw.cz>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
parent d84a52f6
Loading
Loading
Loading
Loading
+46 −0
Original line number Diff line number Diff line
@@ -2095,6 +2095,52 @@ int generic_write_end(struct file *file, struct address_space *mapping,
}
EXPORT_SYMBOL(generic_write_end);

/*
 * block_is_partially_uptodate checks whether buffers within a page are
 * uptodate or not.
 *
 * Returns true if all buffers which correspond to a file portion
 * we want to read are uptodate.
 */
int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
					unsigned long from)
{
	struct inode *inode = page->mapping->host;
	unsigned block_start, block_end, blocksize;
	unsigned to;
	struct buffer_head *bh, *head;
	int ret = 1;

	if (!page_has_buffers(page))
		return 0;

	blocksize = 1 << inode->i_blkbits;
	to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count);
	to = from + to;
	if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize)
		return 0;

	head = page_buffers(page);
	bh = head;
	block_start = 0;
	do {
		block_end = block_start + blocksize;
		if (block_end > from && block_start < to) {
			if (!buffer_uptodate(bh)) {
				ret = 0;
				break;
			}
			if (block_end >= to)
				break;
		}
		block_start = block_end;
		bh = bh->b_this_page;
	} while (bh != head);

	return ret;
}
EXPORT_SYMBOL(block_is_partially_uptodate);

/*
 * Generic "read page" function for block devices that have the normal
 * get_block functionality. This is most of the block device filesystems.
+1 −0
Original line number Diff line number Diff line
@@ -791,6 +791,7 @@ const struct address_space_operations ext2_aops = {
	.direct_IO		= ext2_direct_IO,
	.writepages		= ext2_writepages,
	.migratepage		= buffer_migrate_page,
	.is_partially_uptodate	= block_is_partially_uptodate,
};

const struct address_space_operations ext2_aops_xip = {
+35 −32
Original line number Diff line number Diff line
@@ -1778,6 +1778,7 @@ static const struct address_space_operations ext3_ordered_aops = {
	.releasepage		= ext3_releasepage,
	.direct_IO		= ext3_direct_IO,
	.migratepage		= buffer_migrate_page,
	.is_partially_uptodate  = block_is_partially_uptodate,
};

static const struct address_space_operations ext3_writeback_aops = {
@@ -1792,6 +1793,7 @@ static const struct address_space_operations ext3_writeback_aops = {
	.releasepage		= ext3_releasepage,
	.direct_IO		= ext3_direct_IO,
	.migratepage		= buffer_migrate_page,
	.is_partially_uptodate  = block_is_partially_uptodate,
};

static const struct address_space_operations ext3_journalled_aops = {
@@ -1805,6 +1807,7 @@ static const struct address_space_operations ext3_journalled_aops = {
	.bmap			= ext3_bmap,
	.invalidatepage		= ext3_invalidatepage,
	.releasepage		= ext3_releasepage,
	.is_partially_uptodate  = block_is_partially_uptodate,
};

void ext3_set_aops(struct inode *inode)
+48 −44
Original line number Diff line number Diff line
@@ -2817,6 +2817,7 @@ static const struct address_space_operations ext4_ordered_aops = {
	.releasepage		= ext4_releasepage,
	.direct_IO		= ext4_direct_IO,
	.migratepage		= buffer_migrate_page,
	.is_partially_uptodate  = block_is_partially_uptodate,
};

static const struct address_space_operations ext4_writeback_aops = {
@@ -2831,6 +2832,7 @@ static const struct address_space_operations ext4_writeback_aops = {
	.releasepage		= ext4_releasepage,
	.direct_IO		= ext4_direct_IO,
	.migratepage		= buffer_migrate_page,
	.is_partially_uptodate  = block_is_partially_uptodate,
};

static const struct address_space_operations ext4_journalled_aops = {
@@ -2844,6 +2846,7 @@ static const struct address_space_operations ext4_journalled_aops = {
	.bmap			= ext4_bmap,
	.invalidatepage		= ext4_invalidatepage,
	.releasepage		= ext4_releasepage,
	.is_partially_uptodate  = block_is_partially_uptodate,
};

static const struct address_space_operations ext4_da_aops = {
@@ -2859,6 +2862,7 @@ static const struct address_space_operations ext4_da_aops = {
	.releasepage		= ext4_releasepage,
	.direct_IO		= ext4_direct_IO,
	.migratepage		= buffer_migrate_page,
	.is_partially_uptodate  = block_is_partially_uptodate,
};

void ext4_set_aops(struct inode *inode)
+2 −0
Original line number Diff line number Diff line
@@ -205,6 +205,8 @@ void block_invalidatepage(struct page *page, unsigned long offset);
int block_write_full_page(struct page *page, get_block_t *get_block,
				struct writeback_control *wbc);
int block_read_full_page(struct page*, get_block_t*);
int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
				unsigned long from);
int block_write_begin(struct file *, struct address_space *,
				loff_t, unsigned, unsigned,
				struct page **, void **, get_block_t*);
Loading