
Commit 4f76f0ec authored by wang di, committed by Greg Kroah-Hartman

staging: lustre: llite: move dir cache to MDC layer



Move the directory entry cache from llite to the MDC layer, so that
the client-side directory stripe can use an independent hash function
(in LMV) which does not need to be tightly coupled with the backend
storage's dir-entry hash function. With a striped directory the hash
becomes two-tier: LMV calculates the hash value from the name and the
hash type in the layout, then each MDT stores the entries on disk
using its own hash.
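
(Illustration only, not part of the patch.) A minimal user-space sketch of the
two-tier scheme the message describes. The helper names are hypothetical:
lmv_name_hash() stands in for the LMV-side hash that picks a stripe/MDT from
the entry name and the hash type in the layout, and mdt_disk_hash() stands in
for whatever hash an individual MDT uses to place the entry on disk.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in for the LMV-side name hash (tier 1). */
static uint64_t lmv_name_hash(const char *name, unsigned int hash_type)
{
	uint64_t h = hash_type;	/* seed with the hash type from the layout */

	while (*name)
		h = h * 31 + (unsigned char)*name++;
	return h;
}

/* Hypothetical stand-in for an MDT's private on-disk hash (tier 2). */
static uint64_t mdt_disk_hash(const char *name)
{
	uint64_t h = 5381;

	while (*name)
		h = (h << 5) + h + (unsigned char)*name++;
	return h;
}

int main(void)
{
	const char *name = "file-0001";
	unsigned int stripe_count = 4;	/* number of MDTs backing the dir */

	/* Tier 1: LMV picks the stripe (MDT) from the name hash. */
	unsigned int stripe = lmv_name_hash(name, 1) % stripe_count;

	/* Tier 2: that MDT stores the entry under its own on-disk hash. */
	printf("entry %s -> stripe %u, on-disk hash %#llx\n",
	       name, stripe, (unsigned long long)mdt_disk_hash(name));
	return 0;
}

The only point of the sketch is that the two hashes are independent: how an
MDT indexes entries on disk no longer constrains how LMV routes a name to a
stripe.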

Signed-off-by: wang di <di.wang@intel.com>
Intel-bug-id: https://jira.hpdd.intel.com/browse/LU-3531
Reviewed-on: http://review.whamcloud.com/7043
Reviewed-by: John L. Hammond <john.hammond@intel.com>
Reviewed-by: Jinshan Xiong <jinshan.xiong@intel.com>
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
Signed-off-by: James Simmons <jsimmons@infradead.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
parent 7ccb7c8f
+0 −11
@@ -80,17 +80,6 @@ static inline void ll_dir_chain_fini(struct ll_dir_chain *chain)
 {
 }
 
-static inline unsigned long hash_x_index(__u64 hash, int hash64)
-{
-	if (BITS_PER_LONG == 32 && hash64)
-		hash >>= 32;
-	/* save hash 0 as index 0 because otherwise we'll save it at
-	 * page index end (~0UL) and it causes truncate_inode_pages_range()
-	 * to loop forever.
-	 */
-	return ~0UL - (hash + !hash);
-}
-
 /** @} lite */
 
 #endif
+19 −315
@@ -134,111 +134,35 @@
  * for this integrated page will be adjusted. See lmv_adjust_dirpages().
  *
  */
-
-/* returns the page unlocked, but with a reference */
-static int ll_dir_filler(void *_hash, struct page *page0)
+struct page *ll_get_dir_page(struct inode *dir, struct md_op_data *op_data,
+			     __u64 offset, struct ll_dir_chain *chain)
 {
-	struct inode *inode = page0->mapping->host;
-	int hash64 = ll_i2sbi(inode)->ll_flags & LL_SBI_64BIT_HASH;
-	struct obd_export *exp = ll_i2sbi(inode)->ll_md_exp;
-	struct ptlrpc_request *request;
-	struct mdt_body *body;
-	struct md_op_data *op_data;
-	__u64 hash = *((__u64 *)_hash);
-	struct page **page_pool;
+	struct md_callback cb_op;
 	struct page *page;
-	struct lu_dirpage *dp;
-	int max_pages = ll_i2sbi(inode)->ll_md_brw_pages;
-	int nrdpgs = 0; /* number of pages read actually */
-	int npages;
-	int i;
 	int rc;
 
-	CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p) hash %llu\n",
-	       PFID(ll_inode2fid(inode)), inode, hash);
-
-	LASSERT(max_pages > 0 && max_pages <= MD_MAX_BRW_PAGES);
-
-	op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
-				     LUSTRE_OPC_ANY, NULL);
-	if (IS_ERR(op_data))
-		return PTR_ERR(op_data);
-
-	page_pool = kcalloc(max_pages, sizeof(page), GFP_NOFS);
-	if (page_pool) {
-		page_pool[0] = page0;
-	} else {
-		page_pool = &page0;
-		max_pages = 1;
-	}
-	for (npages = 1; npages < max_pages; npages++) {
-		page = page_cache_alloc_cold(inode->i_mapping);
-		if (!page)
-			break;
-		page_pool[npages] = page;
-	}
-
-	op_data->op_npages = npages;
-	op_data->op_offset = hash;
-	rc = md_readpage(exp, op_data, page_pool, &request);
-	ll_finish_md_op_data(op_data);
-	if (rc < 0) {
-		/* page0 is special, which was added into page cache early */
-		delete_from_page_cache(page0);
-	} else if (rc == 0) {
-		body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
-		/* Checked by mdc_readpage() */
-		if (body->mbo_valid & OBD_MD_FLSIZE)
-			i_size_write(inode, body->mbo_size);
-
-		nrdpgs = (request->rq_bulk->bd_nob_transferred+PAGE_SIZE-1)
-			 >> PAGE_SHIFT;
-		SetPageUptodate(page0);
-	}
-	unlock_page(page0);
-	ptlrpc_req_finished(request);
-
-	CDEBUG(D_VFSTRACE, "read %d/%d pages\n", nrdpgs, npages);
-
-	for (i = 1; i < npages; i++) {
-		unsigned long offset;
-		int ret;
-
-		page = page_pool[i];
-
-		if (rc < 0 || i >= nrdpgs) {
-			put_page(page);
-			continue;
-		}
-
-		SetPageUptodate(page);
-
-		dp = kmap(page);
-		hash = le64_to_cpu(dp->ldp_hash_start);
-		kunmap(page);
-
-		offset = hash_x_index(hash, hash64);
-
-		prefetchw(&page->flags);
-		ret = add_to_page_cache_lru(page, inode->i_mapping, offset,
-					    GFP_NOFS);
-		if (ret == 0) {
-			unlock_page(page);
-		} else {
-			CDEBUG(D_VFSTRACE, "page %lu add to page cache failed: %d\n",
-			       offset, ret);
-		}
-		put_page(page);
-	}
+	cb_op.md_blocking_ast = ll_md_blocking_ast;
+	rc = md_read_page(ll_i2mdexp(dir), op_data, &cb_op, offset, &page);
+	if (rc)
+		return ERR_PTR(rc);
 
-	if (page_pool != &page0)
-		kfree(page_pool);
-	return rc;
+	return page;
 }
 
 void ll_release_page(struct inode *inode, struct page *page, bool remove)
 {
 	kunmap(page);
+
+	/*
+	 * Always remove the page for striped dir, because the page is
+	 * built from temporarily in LMV layer
+	 */
+	if (inode && S_ISDIR(inode->i_mode) &&
+	    ll_i2info(inode)->lli_lsm_md) {
+		__free_page(page);
+		return;
+	}
+
 	if (remove) {
 		lock_page(page);
 		if (likely(page->mapping))
@@ -248,226 +172,6 @@ void ll_release_page(struct inode *inode, struct page *page, bool remove)
 	put_page(page);
 }
 
-/*
- * Find, kmap and return page that contains given hash.
- */
-static struct page *ll_dir_page_locate(struct inode *dir, __u64 *hash,
-				       __u64 *start, __u64 *end)
-{
-	int hash64 = ll_i2sbi(dir)->ll_flags & LL_SBI_64BIT_HASH;
-	struct address_space *mapping = dir->i_mapping;
-	/*
-	 * Complement of hash is used as an index so that
-	 * radix_tree_gang_lookup() can be used to find a page with starting
-	 * hash _smaller_ than one we are looking for.
-	 */
-	unsigned long offset = hash_x_index(*hash, hash64);
-	struct page *page;
-	int found;
-
-	spin_lock_irq(&mapping->tree_lock);
-	found = radix_tree_gang_lookup(&mapping->page_tree,
-				       (void **)&page, offset, 1);
-	if (found > 0 && !radix_tree_exceptional_entry(page)) {
-		struct lu_dirpage *dp;
-
-		get_page(page);
-		spin_unlock_irq(&mapping->tree_lock);
-		/*
-		 * In contrast to find_lock_page() we are sure that directory
-		 * page cannot be truncated (while DLM lock is held) and,
-		 * hence, can avoid restart.
-		 *
-		 * In fact, page cannot be locked here at all, because
-		 * ll_dir_filler() does synchronous io.
-		 */
-		wait_on_page_locked(page);
-		if (PageUptodate(page)) {
-			dp = kmap(page);
-			if (BITS_PER_LONG == 32 && hash64) {
-				*start = le64_to_cpu(dp->ldp_hash_start) >> 32;
-				*end   = le64_to_cpu(dp->ldp_hash_end) >> 32;
-				*hash  = *hash >> 32;
-			} else {
-				*start = le64_to_cpu(dp->ldp_hash_start);
-				*end   = le64_to_cpu(dp->ldp_hash_end);
-			}
-			LASSERTF(*start <= *hash, "start = %#llx,end = %#llx,hash = %#llx\n",
-				 *start, *end, *hash);
-			CDEBUG(D_VFSTRACE, "page %lu [%llu %llu], hash %llu\n",
-			       offset, *start, *end, *hash);
-			if (*hash > *end) {
-				ll_release_page(dir, page, false);
-				page = NULL;
-			} else if (*end != *start && *hash == *end) {
-				/*
-				 * upon hash collision, remove this page,
-				 * otherwise put page reference, and
-				 * ll_get_dir_page() will issue RPC to fetch
-				 * the page we want.
-				 */
-				ll_release_page(dir, page,
-						le32_to_cpu(dp->ldp_flags) &
-						LDF_COLLIDE);
-				page = NULL;
-			}
-		} else {
-			put_page(page);
-			page = ERR_PTR(-EIO);
-		}
-
-	} else {
-		spin_unlock_irq(&mapping->tree_lock);
-		page = NULL;
-	}
-	return page;
-}
-
-struct page *ll_get_dir_page(struct inode *dir, struct md_op_data *op_data,
-			     __u64 hash, struct ll_dir_chain *chain)
-{
-	ldlm_policy_data_t policy = {.l_inodebits = {MDS_INODELOCK_UPDATE} };
-	struct address_space *mapping = dir->i_mapping;
-	struct lustre_handle lockh;
-	struct lu_dirpage *dp;
-	struct page *page;
-	enum ldlm_mode mode;
-	int rc;
-	__u64 start = 0;
-	__u64 end = 0;
-	__u64 lhash = hash;
-	struct ll_inode_info *lli = ll_i2info(dir);
-	int hash64 = ll_i2sbi(dir)->ll_flags & LL_SBI_64BIT_HASH;
-
-	mode = LCK_PR;
-	rc = md_lock_match(ll_i2sbi(dir)->ll_md_exp, LDLM_FL_BLOCK_GRANTED,
-			   ll_inode2fid(dir), LDLM_IBITS, &policy, mode, &lockh);
-	if (!rc) {
-		struct ldlm_enqueue_info einfo = {
-			.ei_type = LDLM_IBITS,
-			.ei_mode = mode,
-			.ei_cb_bl = ll_md_blocking_ast,
-			.ei_cb_cp = ldlm_completion_ast,
-		};
-		struct lookup_intent it = { .it_op = IT_READDIR };
-		struct ptlrpc_request *request;
-		struct md_op_data *op_data;
-
-		op_data = ll_prep_md_op_data(NULL, dir, dir, NULL, 0, 0,
-					     LUSTRE_OPC_ANY, NULL);
-		if (IS_ERR(op_data))
-			return (void *)op_data;
-
-		rc = md_enqueue(ll_i2sbi(dir)->ll_md_exp, &einfo, &it,
-				op_data, &lockh, NULL, 0, NULL, 0);
-
-		ll_finish_md_op_data(op_data);
-
-		request = (struct ptlrpc_request *)it.it_request;
-		if (request)
-			ptlrpc_req_finished(request);
-		if (rc < 0) {
-			CERROR("lock enqueue: " DFID " at %llu: rc %d\n",
-			       PFID(ll_inode2fid(dir)), hash, rc);
-			return ERR_PTR(rc);
-		}
-
-		CDEBUG(D_INODE, "setting lr_lvb_inode to inode "DFID"(%p)\n",
-		       PFID(ll_inode2fid(dir)), dir);
-		md_set_lock_data(ll_i2sbi(dir)->ll_md_exp,
-				 &it.it_lock_handle, dir, NULL);
-	} else {
-		/* for cross-ref object, l_ast_data of the lock may not be set,
-		 * we reset it here
-		 */
-		md_set_lock_data(ll_i2sbi(dir)->ll_md_exp, &lockh.cookie,
-				 dir, NULL);
-	}
-	ldlm_lock_dump_handle(D_OTHER, &lockh);
-
-	mutex_lock(&lli->lli_readdir_mutex);
-	page = ll_dir_page_locate(dir, &lhash, &start, &end);
-	if (IS_ERR(page)) {
-		CERROR("dir page locate: "DFID" at %llu: rc %ld\n",
-		       PFID(ll_inode2fid(dir)), lhash, PTR_ERR(page));
-		goto out_unlock;
-	} else if (page) {
-		/*
-		 * XXX nikita: not entirely correct handling of a corner case:
-		 * suppose hash chain of entries with hash value HASH crosses
-		 * border between pages P0 and P1. First both P0 and P1 are
-		 * cached, seekdir() is called for some entry from the P0 part
-		 * of the chain. Later P0 goes out of cache. telldir(HASH)
-		 * happens and finds P1, as it starts with matching hash
-		 * value. Remaining entries from P0 part of the chain are
-		 * skipped. (Is that really a bug?)
-		 *
-		 * Possible solutions: 0. don't cache P1 is such case, handle
-		 * it as an "overflow" page. 1. invalidate all pages at
-		 * once. 2. use HASH|1 as an index for P1.
-		 */
-		goto hash_collision;
-	}
-
-	page = read_cache_page(mapping, hash_x_index(hash, hash64),
-			       ll_dir_filler, &lhash);
-	if (IS_ERR(page)) {
-		CERROR("read cache page: "DFID" at %llu: rc %ld\n",
-		       PFID(ll_inode2fid(dir)), hash, PTR_ERR(page));
-		goto out_unlock;
-	}
-
-	wait_on_page_locked(page);
-	(void)kmap(page);
-	if (!PageUptodate(page)) {
-		CERROR("page not updated: "DFID" at %llu: rc %d\n",
-		       PFID(ll_inode2fid(dir)), hash, -5);
-		goto fail;
-	}
-	if (!PageChecked(page))
-		/* XXX: check page format later */
-		SetPageChecked(page);
-	if (PageError(page)) {
-		CERROR("page error: "DFID" at %llu: rc %d\n",
-		       PFID(ll_inode2fid(dir)), hash, -5);
-		goto fail;
-	}
-hash_collision:
-	dp = page_address(page);
-	if (BITS_PER_LONG == 32 && hash64) {
-		start = le64_to_cpu(dp->ldp_hash_start) >> 32;
-		end   = le64_to_cpu(dp->ldp_hash_end) >> 32;
-		lhash = hash >> 32;
-	} else {
-		start = le64_to_cpu(dp->ldp_hash_start);
-		end   = le64_to_cpu(dp->ldp_hash_end);
-		lhash = hash;
-	}
-	if (end == start) {
-		LASSERT(start == lhash);
-		CWARN("Page-wide hash collision: %llu\n", end);
-		if (BITS_PER_LONG == 32 && hash64)
-			CWARN("Real page-wide hash collision at [%llu %llu] with hash %llu\n",
-			      le64_to_cpu(dp->ldp_hash_start),
-			      le64_to_cpu(dp->ldp_hash_end), hash);
-		/*
-		 * Fetch whole overflow chain...
-		 *
-		 * XXX not yet.
-		 */
-		goto fail;
-	}
-out_unlock:
-	mutex_unlock(&lli->lli_readdir_mutex);
-	ldlm_lock_decref(&lockh, mode);
-	return page;
-
-fail:
-	ll_release_page(dir, page, true);
-	page = ERR_PTR(-EIO);
-	goto out_unlock;
-}
-
 /**
  * return IF_* type for given lu_dirent entry.
  * IF_* flag shld be converted to particular OS file type in
+1 −1
@@ -665,7 +665,7 @@ int ll_dir_read(struct inode *inode, __u64 *ppos, struct md_op_data *op_data,
 int ll_get_mdt_idx(struct inode *inode);
 int ll_get_mdt_idx_by_fid(struct ll_sb_info *sbi, const struct lu_fid *fid);
 struct page *ll_get_dir_page(struct inode *dir, struct md_op_data *op_data,
-			     __u64 hash, struct ll_dir_chain *chain);
+			     __u64 offset, struct ll_dir_chain *chain);
 void ll_release_page(struct inode *inode, struct page *page, bool remove);
 
 /* llite/namei.c */
+4 −8
@@ -1035,7 +1035,7 @@ static int ll_statahead_thread(void *arg)
 	struct ll_statahead_info *sai    = ll_sai_get(plli->lli_sai);
 	struct ptlrpc_thread     *thread = &sai->sai_thread;
 	struct ptlrpc_thread *agl_thread = &sai->sai_agl_thread;
-	struct page	      *page;
+	struct page	      *page = NULL;
 	__u64		     pos    = 0;
 	int		       first  = 0;
 	int		       rc     = 0;
@@ -1166,8 +1166,7 @@
 					if (!list_empty(&sai->sai_entries_received))
 						goto interpret_it;
 
-					if (unlikely(
-						!thread_is_running(thread))) {
+					if (unlikely(!thread_is_running(thread))) {
 						ll_release_page(dir, page, false);
 						rc = 0;
 						goto out;
@@ -1182,10 +1181,10 @@
 
 				goto keep_it;
 			}
-
 do_it:
 			ll_statahead_one(parent, name, namelen);
 		}
 
 		pos = le64_to_cpu(dp->ldp_hash_end);
 		if (pos == MDS_DIR_END_OFF) {
 			/*
@@ -1232,14 +1231,12 @@
 			 * Normal case: continue to the next page.
 			 */
 			ll_release_page(dir, page,
-					le32_to_cpu(dp->ldp_flags) &
-					LDF_COLLIDE);
+					le32_to_cpu(dp->ldp_flags) & LDF_COLLIDE);
 			sai->sai_in_readpage = 1;
 			page = ll_get_dir_page(dir, op_data, pos, &chain);
 			sai->sai_in_readpage = 0;
 		}
 	}
-
 out:
 	ll_finish_md_op_data(op_data);
 	if (sai->sai_agl_valid) {
@@ -1455,7 +1452,6 @@ static int is_first_dirent(struct inode *dir, struct dentry *dentry)
 			page = ll_get_dir_page(dir, op_data, pos, &chain);
 		}
 	}
-
 out:
 	ll_dir_chain_fini(&chain);
 	ll_finish_md_op_data(op_data);
+8 −0
@@ -135,4 +135,12 @@ static inline int mdc_prep_elc_req(struct obd_export *exp,
 				 count);
 }
 
+static inline unsigned long hash_x_index(__u64 hash, int hash64)
+{
+	if (BITS_PER_LONG == 32 && hash64)
+		hash >>= 32;
+	/* save hash 0 with hash 1 */
+	return ~0UL - (hash + !hash);
+}
+
 #endif
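
(Aside, not part of the patch.) The hash_x_index() helper added to the MDC
header above maps a directory hash onto a page-cache index by taking its
complement; per the comment in the removed ll_dir_page_locate(), the
complement lets a radix-tree gang lookup find the cached page whose starting
hash is smaller than the one being looked up. Hash 0 is folded onto the same
index as hash 1, so the result never lands on ~0UL, which the comment deleted
from the llite header notes would make truncate_inode_pages_range() loop
forever. A small user-space check of that mapping (64-bit case only):

#include <stdint.h>
#include <stdio.h>

/* User-space copy of hash_x_index() for illustration; this covers the
 * BITS_PER_LONG == 64 case, so the hash64 flag is ignored.
 */
static unsigned long hash_x_index(uint64_t hash, int hash64)
{
	(void)hash64;
	return ~0UL - (hash + !hash);
}

int main(void)
{
	/* hash 0 shares an index with hash 1, so ~0UL itself is never used */
	printf("hash 0 -> index %#lx\n", hash_x_index(0, 0));
	printf("hash 1 -> index %#lx\n", hash_x_index(1, 0));
	printf("hash 2 -> index %#lx\n", hash_x_index(2, 0));
	return 0;
}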