Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit be6297e9 authored by Linus Torvalds
Browse files
Pull ext4 updates from Ted Ts'o:
 "Scalability improvements when allocating inodes, and some
  miscellaneous bug fixes and cleanups"

* tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4:
  ext4: avoid Y2038 overflow in recently_deleted()
  ext4: fix fault handling when mounted with -o dax,ro
  ext4: fix quota inconsistency during orphan cleanup for read-only mounts
  ext4: fix incorrect quotaoff if the quota feature is enabled
  ext4: remove useless test and assignment in strtohash functions
  ext4: backward compatibility support for Lustre ea_inode implementation
  ext4: remove timebomb in ext4_decode_extra_time()
  ext4: use sizeof(*ptr)
  ext4: in ext4_seek_{hole,data}, return -ENXIO for negative offsets
  ext4: reduce lock contention in __ext4_new_inode
  ext4: cleanup goto next group
  ext4: do not unnecessarily allocate buffer in recently_deleted()
parents 57915779 b5f51573
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -411,7 +411,7 @@ static struct dir_private_info *ext4_htree_create_dir_info(struct file *filp,
{
	struct dir_private_info *p;

	p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL);
	p = kzalloc(sizeof(*p), GFP_KERNEL);
	if (!p)
		return NULL;
	p->curr_hash = pos2maj_hash(filp, pos);
+5 −6
Original line number Diff line number Diff line
@@ -838,13 +838,11 @@ static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra)
{
	if (unlikely(sizeof(time->tv_sec) > 4 &&
			(extra & cpu_to_le32(EXT4_EPOCH_MASK)))) {
#if LINUX_VERSION_CODE < KERNEL_VERSION(4,20,0)

#if 1
		/* Handle legacy encoding of pre-1970 dates with epoch
		 * bits 1,1.  We assume that by kernel version 4.20,
		 * everyone will have run fsck over the affected
		 * filesystems to correct the problem.  (This
		 * backwards compatibility may be removed before this
		 * time, at the discretion of the ext4 developers.)
		 * bits 1,1. (This backwards compatibility may be removed
		 * at the discretion of the ext4 developers.)
		 */
		u64 extra_bits = le32_to_cpu(extra) & EXT4_EPOCH_MASK;
		if (extra_bits == 3 && ((time->tv_sec) & 0x80000000) != 0)
@@ -1567,6 +1565,7 @@ enum {
					   nolocking */
	EXT4_STATE_MAY_INLINE_DATA,	/* may have in-inode data */
	EXT4_STATE_EXT_PRECACHED,	/* extents have been precached */
	EXT4_STATE_LUSTRE_EA_INODE,	/* Lustre-style ea_inode */
};

#define EXT4_INODE_BIT_FNS(name, field, offset)				\
+16 −3
Original line number Diff line number Diff line
@@ -279,7 +279,20 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf,
	handle_t *handle = NULL;
	struct inode *inode = file_inode(vmf->vma->vm_file);
	struct super_block *sb = inode->i_sb;
	bool write = vmf->flags & FAULT_FLAG_WRITE;

	/*
	 * We have to distinguish real writes from writes which will result in a
	 * COW page; COW writes should *not* poke the journal (the file will not
	 * be changed). Doing so would cause unintended failures when mounted
	 * read-only.
	 *
	 * We check for VM_SHARED rather than vmf->cow_page since the latter is
	 * unset for pe_size != PE_SIZE_PTE (i.e. only in do_cow_fault); for
	 * other sizes, dax_iomap_fault will handle splitting / fallback so that
	 * we eventually come back with a COW page.
	 */
	bool write = (vmf->flags & FAULT_FLAG_WRITE) &&
		(vmf->vma->vm_flags & VM_SHARED);

	if (write) {
		sb_start_pagefault(sb);
@@ -595,7 +608,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
	inode_lock(inode);

	isize = i_size_read(inode);
	if (offset >= isize) {
	if (offset < 0 || offset >= isize) {
		inode_unlock(inode);
		return -ENXIO;
	}
@@ -658,7 +671,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
	inode_lock(inode);

	isize = i_size_read(inode);
	if (offset >= isize) {
	if (offset < 0 || offset >= isize) {
		inode_unlock(inode);
		return -ENXIO;
	}
+0 −4
Original line number Diff line number Diff line
@@ -148,8 +148,6 @@ static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num)
	if (len > num*4)
		len = num * 4;
	for (i = 0; i < len; i++) {
		if ((i % 4) == 0)
			val = pad;
		val = ((int) scp[i]) + (val << 8);
		if ((i % 4) == 3) {
			*buf++ = val;
@@ -176,8 +174,6 @@ static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num)
	if (len > num*4)
		len = num * 4;
	for (i = 0; i < len; i++) {
		if ((i % 4) == 0)
			val = pad;
		val = ((int) ucp[i]) + (val << 8);
		if ((i % 4) == 3) {
			*buf++ = val;
+60 −33
Original line number Diff line number Diff line
@@ -692,24 +692,25 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
 * somewhat arbitrary...)
 */
#define RECENTCY_MIN	5
#define RECENTCY_DIRTY	30
#define RECENTCY_DIRTY	300

static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino)
{
	struct ext4_group_desc	*gdp;
	struct ext4_inode	*raw_inode;
	struct buffer_head	*bh;
	unsigned long		dtime, now;
	int inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
	int	offset, ret = 0, recentcy = RECENTCY_MIN;
	int offset, ret = 0;
	int recentcy = RECENTCY_MIN;
	u32 dtime, now;

	gdp = ext4_get_group_desc(sb, group, NULL);
	if (unlikely(!gdp))
		return 0;

	bh = sb_getblk(sb, ext4_inode_table(sb, gdp) +
	bh = sb_find_get_block(sb, ext4_inode_table(sb, gdp) +
		       (ino / inodes_per_block));
	if (unlikely(!bh) || !buffer_uptodate(bh))
	if (!bh || !buffer_uptodate(bh))
		/*
		 * If the block is not in the buffer cache, then it
		 * must have been written out.
@@ -718,18 +719,45 @@ static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino)

	offset = (ino % inodes_per_block) * EXT4_INODE_SIZE(sb);
	raw_inode = (struct ext4_inode *) (bh->b_data + offset);

	/* i_dtime is only 32 bits on disk, but we only care about relative
	 * times in the range of a few minutes (i.e. long enough to sync a
	 * recently-deleted inode to disk), so using the low 32 bits of the
	 * clock (a 68 year range) is enough, see time_before32() */
	dtime = le32_to_cpu(raw_inode->i_dtime);
	now = get_seconds();
	now = ktime_get_real_seconds();
	if (buffer_dirty(bh))
		recentcy += RECENTCY_DIRTY;

	if (dtime && (dtime < now) && (now < dtime + recentcy))
	if (dtime && time_before32(dtime, now) &&
	    time_before32(now, dtime + recentcy))
		ret = 1;
out:
	brelse(bh);
	return ret;
}

/*
 * Search @bitmap for the next zero bit at or after *@ino and store the
 * position found back into *@ino.
 *
 * When EXT4_SB(sb)->s_journal is NULL, a candidate for which
 * recently_deleted() returns nonzero is skipped and the search resumes
 * at the bit that follows it.
 *
 * Returns 1 if *@ino holds an acceptable zero bit.  Returns 0 once the
 * search position reaches EXT4_INODES_PER_GROUP(sb); *@ino is then
 * greater than or equal to that limit.
 */
static int find_inode_bit(struct super_block *sb, ext4_group_t group,
			  struct buffer_head *bitmap, unsigned long *ino)
{
	for (;;) {
		*ino = ext4_find_next_zero_bit((unsigned long *)bitmap->b_data,
					       EXT4_INODES_PER_GROUP(sb),
					       *ino);
		if (*ino >= EXT4_INODES_PER_GROUP(sb))
			return 0;

		/*
		 * The candidate is accepted unless there is no journal
		 * and recently_deleted() flags it.
		 */
		if (EXT4_SB(sb)->s_journal != NULL ||
		    !recently_deleted(sb, group, *ino))
			return 1;

		/* Step past the rejected bit; stop at the end of the group. */
		*ino = *ino + 1;
		if (*ino >= EXT4_INODES_PER_GROUP(sb))
			return 0;
	}
}

/*
 * There are two policies for allocating an inode.  If the new inode is
 * a directory, then a forward search is made for a block group with both
@@ -892,19 +920,13 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
		/*
		 * Check free inodes count before loading bitmap.
		 */
		if (ext4_free_inodes_count(sb, gdp) == 0) {
			if (++group == ngroups)
				group = 0;
			continue;
		}
		if (ext4_free_inodes_count(sb, gdp) == 0)
			goto next_group;

		grp = ext4_get_group_info(sb, group);
		/* Skip groups with already-known suspicious inode tables */
		if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
			if (++group == ngroups)
				group = 0;
			continue;
		}
		if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp))
			goto next_group;

		brelse(inode_bitmap_bh);
		inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
@@ -912,27 +934,20 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
		if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp) ||
		    IS_ERR(inode_bitmap_bh)) {
			inode_bitmap_bh = NULL;
			if (++group == ngroups)
				group = 0;
			continue;
			goto next_group;
		}

repeat_in_this_group:
		ino = ext4_find_next_zero_bit((unsigned long *)
					      inode_bitmap_bh->b_data,
					      EXT4_INODES_PER_GROUP(sb), ino);
		if (ino >= EXT4_INODES_PER_GROUP(sb))
		ret2 = find_inode_bit(sb, group, inode_bitmap_bh, &ino);
		if (!ret2)
			goto next_group;

		if (group == 0 && (ino + 1) < EXT4_FIRST_INO(sb)) {
			ext4_error(sb, "reserved inode found cleared - "
				   "inode=%lu", ino + 1);
			continue;
		}
		if ((EXT4_SB(sb)->s_journal == NULL) &&
		    recently_deleted(sb, group, ino)) {
			ino++;
			goto next_inode;
			goto next_group;
		}

		if (!handle) {
			BUG_ON(nblocks <= 0);
			handle = __ext4_journal_start_sb(dir->i_sb, line_no,
@@ -952,11 +967,23 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
		}
		ext4_lock_group(sb, group);
		ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data);
		if (ret2) {
			/* Someone already took the bit. Repeat the search
			 * with lock held.
			 */
			ret2 = find_inode_bit(sb, group, inode_bitmap_bh, &ino);
			if (ret2) {
				ext4_set_bit(ino, inode_bitmap_bh->b_data);
				ret2 = 0;
			} else {
				ret2 = 1; /* we didn't grab the inode */
			}
		}
		ext4_unlock_group(sb, group);
		ino++;		/* the inode bitmap is zero-based */
		if (!ret2)
			goto got; /* we grabbed the inode! */
next_inode:

		if (ino < EXT4_INODES_PER_GROUP(sb))
			goto repeat_in_this_group;
next_group:
Loading