Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit e6f3ddbf authored by Jaegeuk Kim
Browse files

Merge remote-tracking branch 'origin/upstream-f2fs-stable-linux-4.19.y' into android-4.19



* origin/upstream-f2fs-stable-linux-4.19.y:
  f2fs: use EINVAL for superblock with invalid magic
  f2fs: fix to read source block before invalidating it
  f2fs: remove redundant check from f2fs_setflags_common()
  f2fs: use generic checking function for FS_IOC_FSSETXATTR
  f2fs: use generic checking and prep function for FS_IOC_SETFLAGS
  ubifs, fscrypt: cache decrypted symlink target in ->i_link
  vfs: use READ_ONCE() to access ->i_link
  fs, fscrypt: clear DCACHE_ENCRYPTED_NAME when unaliasing directory
  fscrypt: cache decrypted symlink target in ->i_link
  fscrypt: fix race where ->lookup() marks plaintext dentry as ciphertext
  fscrypt: only set dentry_operations on ciphertext dentries
  fscrypt: fix race allowing rename() and link() of ciphertext dentries
  fscrypt: clean up and improve dentry revalidation
  fscrypt: use READ_ONCE() to access ->i_crypt_info
  fscrypt: remove WARN_ON_ONCE() when decryption fails
  fscrypt: drop inode argument from fscrypt_get_ctx()
  f2fs: improve print log in f2fs_sanity_check_ckpt()
  f2fs: avoid out-of-range memory access
  f2fs: fix to avoid long latency during umount
  f2fs: allow all the users to pin a file
  f2fs: support swap file w/ DIO
  f2fs: allocate blocks for pinned file
  f2fs: fix is_idle() check for discard type
  f2fs: add a rw_sem to cover quota flag changes
  f2fs: set SBI_NEED_FSCK for xattr corruption case
  f2fs: use generic EFSBADCRC/EFSCORRUPTED
  f2fs: Use DIV_ROUND_UP() instead of open-coding
  f2fs: print kernel message if filesystem is inconsistent
  f2fs: introduce f2fs_<level> macros to wrap f2fs_printk()
  f2fs: avoid get_valid_blocks() for cleanup
  f2fs: ioctl for removing a range from F2FS
  f2fs: only set project inherit bit for directory
  f2fs: separate f2fs i_flags from fs_flags and ext4 i_flags
  f2fs: Add option to limit required GC for checkpoint=disable
  f2fs: Fix accounting for unusable blocks
  f2fs: Fix root reserved on remount
  f2fs: Lower threshold for disable_cp_again
  f2fs: fix sparse warning
  f2fs: fix f2fs_show_options to show nodiscard mount option
  f2fs: add error prints for debugging mount failure
  f2fs: fix to do sanity check on segment bitmap of LFS curseg
  f2fs: add missing sysfs entries in documentation
  f2fs: fix to avoid deadloop if data_flush is on
  f2fs: always assume that the device is idle under gc_urgent
  f2fs: add bio cache for IPU
  f2fs: allow ssr block allocation during checkpoint=disable period
  f2fs: fix to check layout on last valid checkpoint park

Change-Id: Ie910f127f574c2115e5b9a6725461ce002c267be
Signed-off-by: Jaegeuk Kim <jaegeuk@google.com>
parents 95444f4a 94472d52
Loading
Loading
Loading
Loading
+8 −0
Original line number Original line Diff line number Diff line
@@ -243,3 +243,11 @@ Description:
		 - Del: echo '[h/c]!extension' > /sys/fs/f2fs/<disk>/extension_list
		 - Del: echo '[h/c]!extension' > /sys/fs/f2fs/<disk>/extension_list
		 - [h] means add/del hot file extension
		 - [h] means add/del hot file extension
		 - [c] means add/del cold file extension
		 - [c] means add/del cold file extension

What:		/sys/fs/f2fs/<disk>/unusable
Date:		April 2019
Contact:	"Daniel Rosenberg" <drosen@google.com>
Description:
		If checkpoint=disable, it displays the number of blocks that are unusable.
                If checkpoint=enable, it displays the number of blocks that would be unusable
                if checkpoint=disable were to be set.
+123 −10
Original line number Original line Diff line number Diff line
@@ -214,11 +214,22 @@ fsync_mode=%s Control the policy of fsync. Currently supports "posix",
                       non-atomic files likewise "nobarrier" mount option.
                       non-atomic files likewise "nobarrier" mount option.
test_dummy_encryption  Enable dummy encryption, which provides a fake fscrypt
test_dummy_encryption  Enable dummy encryption, which provides a fake fscrypt
                       context. The fake fscrypt context is used by xfstests.
                       context. The fake fscrypt context is used by xfstests.
checkpoint=%s          Set to "disable" to turn off checkpointing. Set to "enable"
checkpoint=%s[:%u[%]]     Set to "disable" to turn off checkpointing. Set to "enable"
                       to reenable checkpointing. Is enabled by default. While
                       to reenable checkpointing. Is enabled by default. While
                       disabled, any unmounting or unexpected shutdowns will cause
                       disabled, any unmounting or unexpected shutdowns will cause
                       the filesystem contents to appear as they did when the
                       the filesystem contents to appear as they did when the
                       filesystem was mounted with that option.
                       filesystem was mounted with that option.
                       While mounting with checkpoint=disable, the filesystem must
                       run garbage collection to ensure that all available space can
                       be used. If this takes too much time, the mount may return
                       EAGAIN. You may optionally add a value to indicate how much
                       of the disk you would be willing to temporarily give up to
                       avoid additional garbage collection. This can be given as a
                       number of blocks, or as a percent. For instance, mounting
                       with checkpoint=disable:100% would always succeed, but it may
                       hide up to all remaining free space. The actual space that
                       would be unusable can be viewed at /sys/fs/f2fs/<disk>/unusable.
                       This space is reclaimed once checkpoint=enable.


================================================================================
================================================================================
DEBUGFS ENTRIES
DEBUGFS ENTRIES
@@ -246,11 +257,14 @@ Files in /sys/fs/f2fs/<devname>
..............................................................................
..............................................................................
 File                         Content
 File                         Content


 gc_max_sleep_time            This tuning parameter controls the maximum sleep
 gc_urgent_sleep_time         This parameter controls sleep time for gc_urgent.
                              500 ms is set by default. See above gc_urgent.

 gc_min_sleep_time            This tuning parameter controls the minimum sleep
                              time for the garbage collection thread. Time is
                              time for the garbage collection thread. Time is
                              in milliseconds.
                              in milliseconds.


 gc_min_sleep_time            This tuning parameter controls the minimum sleep
 gc_max_sleep_time            This tuning parameter controls the maximum sleep
                              time for the garbage collection thread. Time is
                              time for the garbage collection thread. Time is
                              in milliseconds.
                              in milliseconds.


@@ -270,9 +284,6 @@ Files in /sys/fs/f2fs/<devname>
                              to 1, background thread starts to do GC by given
                              to 1, background thread starts to do GC by given
                              gc_urgent_sleep_time interval.
                              gc_urgent_sleep_time interval.


 gc_urgent_sleep_time         This parameter controls sleep time for gc_urgent.
                              500 ms is set by default. See above gc_urgent.

 reclaim_segments             This parameter controls the number of prefree
 reclaim_segments             This parameter controls the number of prefree
                              segments to be reclaimed. If the number of prefree
                              segments to be reclaimed. If the number of prefree
			      segments is larger than the number of segments
			      segments is larger than the number of segments
@@ -287,7 +298,16 @@ Files in /sys/fs/f2fs/<devname>
			      checkpoint is triggered, and issued during the
			      checkpoint is triggered, and issued during the
			      checkpoint. By default, it is disabled with 0.
			      checkpoint. By default, it is disabled with 0.


 trim_sections                This parameter controls the number of sections
 discard_granularity	      This parameter controls the granularity of discard
			      command size. It will issue discard commands only if
			      the size is larger than given granularity. Its
			      unit size is 4KB, and 4 (=16KB) is set by default.
			      The maximum value is 128 (=512KB).

 reserved_blocks	      This parameter indicates the number of blocks that
			      f2fs reserves internally for root.

 batched_trim_sections	      This parameter controls the number of sections
                              to be trimmed out in batch mode when FITRIM
                              to be trimmed out in batch mode when FITRIM
                              conducts. 32 sections is set by default.
                              conducts. 32 sections is set by default.


@@ -309,11 +329,35 @@ Files in /sys/fs/f2fs/<devname>
			      the number is less than this value, it triggers
			      the number is less than this value, it triggers
			      in-place-updates.
			      in-place-updates.


 min_seq_blocks		      This parameter controls the threshold to serialize
			      write IOs issued by multiple threads in parallel.

 min_hot_blocks		      This parameter controls the threshold to allocate
			      a hot data log for pending data blocks to write.

 min_ssr_sections	      This parameter adds the threshold when deciding
			      SSR block allocation. If this is large, SSR mode
			      will be enabled early.

 ram_thresh                   This parameter controls the memory footprint used
			      by free nids and cached nat entries. By default,
			      10 is set, which indicates 10 MB / 1 GB RAM.

 ra_nid_pages		      When building free nids, F2FS reads NAT blocks
			      ahead for speed up. Default is 0.

 dirty_nats_ratio	      Given dirty ratio of cached nat entries, F2FS
			      determines flushing them in background.

 max_victim_search	      This parameter controls the number of trials to
 max_victim_search	      This parameter controls the number of trials to
			      find a victim segment when conducting SSR and
			      find a victim segment when conducting SSR and
			      cleaning operations. The default value is 4096
			      cleaning operations. The default value is 4096
			      which covers 8GB block address range.
			      which covers 8GB block address range.


 migration_granularity	      For large-sized sections, F2FS can stop GC given
			      this granularity instead of reclaiming entire
			      section.

 dir_level                    This parameter controls the directory level to
 dir_level                    This parameter controls the directory level to
			      support large directory. If a directory has a
			      support large directory. If a directory has a
			      number of files, it can reduce the file lookup
			      number of files, it can reduce the file lookup
@@ -321,9 +365,53 @@ Files in /sys/fs/f2fs/<devname>
			      Otherwise, it needs to decrease this value to
			      Otherwise, it needs to decrease this value to
			      reduce the space overhead. The default value is 0.
			      reduce the space overhead. The default value is 0.


 ram_thresh                   This parameter controls the memory footprint used
 cp_interval		      F2FS tries to do checkpoint periodically, 60 secs
			      by free nids and cached nat entries. By default,
			      by default.
			      10 is set, which indicates 10 MB / 1 GB RAM.

 idle_interval		      F2FS detects system is idle, if there's no F2FS
			      operations during given interval, 5 secs by
			      default.

 discard_idle_interval	      F2FS detects the discard thread is idle, given
			      time interval. Default is 5 secs.

 gc_idle_interval	      F2FS detects the GC thread is idle, given time
			      interval. Default is 5 secs.

 umount_discard_timeout       When unmounting the disk, F2FS waits for finishing
			      queued discard commands which can take huge time.
			      This gives time out for it, 5 secs by default.

 iostat_enable		      This controls to enable/disable iostat in F2FS.

 readdir_ra		      This enables/disables readahead of inode blocks
			      in readdir, and default is enabled.

 gc_pin_file_thresh	      This indicates how many GC can be failed for the
			      pinned file. If it exceeds this, F2FS doesn't
			      guarantee its pinning state. 2048 trials is set
			      by default.

 extension_list		      This enables to change extension_list for hot/cold
			      files in runtime.

 inject_rate		      This controls injection rate of arbitrary faults.

 inject_type		      This controls injection type of arbitrary faults.

 dirty_segments 	      This shows # of dirty segments.

 lifetime_write_kbytes	      This shows # of data written to the disk.

 features		      This shows current features enabled on F2FS.

 current_reserved_blocks      This shows # of blocks currently reserved.

 unusable                     If checkpoint=disable, this shows the number of
                              blocks that are unusable.
                              If checkpoint=enable it shows the number of blocks
                              that would be unusable if checkpoint=disable were
                              to be set.


================================================================================
================================================================================
USAGE
USAGE
@@ -716,3 +804,28 @@ WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET
WRITE_LIFE_NONE       "                        WRITE_LIFE_NONE
WRITE_LIFE_NONE       "                        WRITE_LIFE_NONE
WRITE_LIFE_MEDIUM     "                        WRITE_LIFE_MEDIUM
WRITE_LIFE_MEDIUM     "                        WRITE_LIFE_MEDIUM
WRITE_LIFE_LONG       "                        WRITE_LIFE_LONG
WRITE_LIFE_LONG       "                        WRITE_LIFE_LONG

Fallocate(2) Policy
-------------------

The default policy follows the below posix rule.

Allocating disk space
    The default operation (i.e., mode is zero) of fallocate() allocates
    the disk space within the range specified by offset and len.  The
    file size (as reported by stat(2)) will be changed if offset+len is
    greater than the file size.  Any subregion within the range specified
    by offset and len that did not contain data before the call will be
    initialized to zero.  This default behavior closely resembles the
    behavior of the posix_fallocate(3) library function, and is intended
    as a method of optimally implementing that function.

However, once F2FS receives ioctl(fd, F2FS_IOC_SET_PIN_FILE) prior to
fallocate(fd, DEFAULT_MODE), it allocates on-disk block addresses having
zero or random data, which is useful for the scenario below:
 1. create(fd)
 2. ioctl(fd, F2FS_IOC_SET_PIN_FILE)
 3. fallocate(fd, 0, 0, size)
 4. address = fibmap(fd, offset)
 5. open(blkdev)
 6. write(blkdev, address)
+3 −5
Original line number Original line Diff line number Diff line
@@ -36,12 +36,10 @@ static void __fscrypt_decrypt_bio(struct bio *bio, bool done)
		int ret = fscrypt_decrypt_page(page->mapping->host, page,
		int ret = fscrypt_decrypt_page(page->mapping->host, page,
				PAGE_SIZE, 0, page->index);
				PAGE_SIZE, 0, page->index);


		if (ret) {
		if (ret)
			WARN_ON_ONCE(1);
			SetPageError(page);
			SetPageError(page);
		} else if (done) {
		else if (done)
			SetPageUptodate(page);
			SetPageUptodate(page);
		}
		if (done)
		if (done)
			unlock_page(page);
			unlock_page(page);
	}
	}
@@ -103,7 +101,7 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,


	BUG_ON(inode->i_sb->s_blocksize != PAGE_SIZE);
	BUG_ON(inode->i_sb->s_blocksize != PAGE_SIZE);


	ctx = fscrypt_get_ctx(inode, GFP_NOFS);
	ctx = fscrypt_get_ctx(GFP_NOFS);
	if (IS_ERR(ctx))
	if (IS_ERR(ctx))
		return PTR_ERR(ctx);
		return PTR_ERR(ctx);


+35 −39
Original line number Original line Diff line number Diff line
@@ -87,23 +87,17 @@ EXPORT_SYMBOL(fscrypt_release_ctx);


/**
/**
 * fscrypt_get_ctx() - Gets an encryption context
 * fscrypt_get_ctx() - Gets an encryption context
 * @inode:       The inode for which we are doing the crypto
 * @gfp_flags:   The gfp flag for memory allocation
 * @gfp_flags:   The gfp flag for memory allocation
 *
 *
 * Allocates and initializes an encryption context.
 * Allocates and initializes an encryption context.
 *
 *
 * Return: An allocated and initialized encryption context on success; error
 * Return: A new encryption context on success; an ERR_PTR() otherwise.
 * value or NULL otherwise.
 */
 */
struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *inode, gfp_t gfp_flags)
struct fscrypt_ctx *fscrypt_get_ctx(gfp_t gfp_flags)
{
{
	struct fscrypt_ctx *ctx = NULL;
	struct fscrypt_ctx *ctx;
	struct fscrypt_info *ci = inode->i_crypt_info;
	unsigned long flags;
	unsigned long flags;


	if (ci == NULL)
		return ERR_PTR(-ENOKEY);

	/*
	/*
	 * We first try getting the ctx from a free list because in
	 * We first try getting the ctx from a free list because in
	 * the common case the ctx will have an allocated and
	 * the common case the ctx will have an allocated and
@@ -260,9 +254,9 @@ struct page *fscrypt_encrypt_page(const struct inode *inode,
	if (WARN_ON_ONCE(!PageLocked(page)))
	if (WARN_ON_ONCE(!PageLocked(page)))
		return ERR_PTR(-EINVAL);
		return ERR_PTR(-EINVAL);


	ctx = fscrypt_get_ctx(inode, gfp_flags);
	ctx = fscrypt_get_ctx(gfp_flags);
	if (IS_ERR(ctx))
	if (IS_ERR(ctx))
		return (struct page *)ctx;
		return ERR_CAST(ctx);


	/* The encryption operation will require a bounce page. */
	/* The encryption operation will require a bounce page. */
	ciphertext_page = fscrypt_alloc_bounce_page(ctx, gfp_flags);
	ciphertext_page = fscrypt_alloc_bounce_page(ctx, gfp_flags);
@@ -316,45 +310,47 @@ int fscrypt_decrypt_page(const struct inode *inode, struct page *page,
EXPORT_SYMBOL(fscrypt_decrypt_page);
EXPORT_SYMBOL(fscrypt_decrypt_page);


/*
/*
 * Validate dentries for encrypted directories to make sure we aren't
 * Validate dentries in encrypted directories to make sure we aren't potentially
 * potentially caching stale data after a key has been added or
 * caching stale dentries after a key has been added.
 * removed.
 */
 */
static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags)
static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags)
{
{
	struct dentry *dir;
	struct dentry *dir;
	int dir_has_key, cached_with_key;
	int err;
	int valid;

	/*
	 * Plaintext names are always valid, since fscrypt doesn't support
	 * reverting to ciphertext names without evicting the directory's inode
	 * -- which implies eviction of the dentries in the directory.
	 */
	if (!(dentry->d_flags & DCACHE_ENCRYPTED_NAME))
		return 1;

	/*
	 * Ciphertext name; valid if the directory's key is still unavailable.
	 *
	 * Although fscrypt forbids rename() on ciphertext names, we still must
	 * use dget_parent() here rather than use ->d_parent directly.  That's
	 * because a corrupted fs image may contain directory hard links, which
	 * the VFS handles by moving the directory's dentry tree in the dcache
	 * each time ->lookup() finds the directory and it already has a dentry
	 * elsewhere.  Thus ->d_parent can be changing, and we must safely grab
	 * a reference to some ->d_parent to prevent it from being freed.
	 */


	if (flags & LOOKUP_RCU)
	if (flags & LOOKUP_RCU)
		return -ECHILD;
		return -ECHILD;


	dir = dget_parent(dentry);
	dir = dget_parent(dentry);
	if (!IS_ENCRYPTED(d_inode(dir))) {
	err = fscrypt_get_encryption_info(d_inode(dir));
	valid = !fscrypt_has_encryption_key(d_inode(dir));
	dput(dir);
	dput(dir);
		return 0;
	}


	spin_lock(&dentry->d_lock);
	if (err < 0)
	cached_with_key = dentry->d_flags & DCACHE_ENCRYPTED_WITH_KEY;
		return err;
	spin_unlock(&dentry->d_lock);
	dir_has_key = (d_inode(dir)->i_crypt_info != NULL);
	dput(dir);


	/*
	return valid;
	 * If the dentry was cached without the key, and it is a
	 * negative dentry, it might be a valid name.  We can't check
	 * if the key has since been made available due to locking
	 * reasons, so we fail the validation so ext4_lookup() can do
	 * this check.
	 *
	 * We also fail the validation if the dentry was created with
	 * the key present, but we no longer have the key, or vice versa.
	 */
	if ((!cached_with_key && d_is_negative(dentry)) ||
			(!cached_with_key && dir_has_key) ||
			(cached_with_key && !dir_has_key))
		return 0;
	return 1;
}
}


const struct dentry_operations fscrypt_d_ops = {
const struct dentry_operations fscrypt_d_ops = {
+3 −2
Original line number Original line Diff line number Diff line
@@ -269,7 +269,7 @@ int fscrypt_fname_disk_to_usr(struct inode *inode,
	if (iname->len < FS_CRYPTO_BLOCK_SIZE)
	if (iname->len < FS_CRYPTO_BLOCK_SIZE)
		return -EUCLEAN;
		return -EUCLEAN;


	if (inode->i_crypt_info)
	if (fscrypt_has_encryption_key(inode))
		return fname_decrypt(inode, iname, oname);
		return fname_decrypt(inode, iname, oname);


	if (iname->len <= FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE) {
	if (iname->len <= FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE) {
@@ -336,7 +336,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname,
	if (ret)
	if (ret)
		return ret;
		return ret;


	if (dir->i_crypt_info) {
	if (fscrypt_has_encryption_key(dir)) {
		if (!fscrypt_fname_encrypted_size(dir, iname->len,
		if (!fscrypt_fname_encrypted_size(dir, iname->len,
						  dir->i_sb->s_cop->max_namelen,
						  dir->i_sb->s_cop->max_namelen,
						  &fname->crypto_buf.len))
						  &fname->crypto_buf.len))
@@ -356,6 +356,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname,
	}
	}
	if (!lookup)
	if (!lookup)
		return -ENOKEY;
		return -ENOKEY;
	fname->is_ciphertext_name = true;


	/*
	/*
	 * We don't have the key and we are doing a lookup; decode the
	 * We don't have the key and we are doing a lookup; decode the
Loading