Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit c4fd308e authored by Linus Torvalds's avatar Linus Torvalds
Browse files
* 'x86-pat-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
  x86, pat: Update the page flags for memtype atomically instead of using memtype_lock
  x86, pat: In rbt_memtype_check_insert(), update new->type only if valid
  x86, pat: Migrate to rbtree only backend for pat memtype management
  x86, pat: Preparatory changes in pat.c for bigger rbtree change
  rbtree: Add support for augmented rbtrees
parents 96fbeb97 1f9cc3cb
Loading
Loading
Loading
Loading
+58 −0
Original line number Diff line number Diff line
@@ -190,3 +190,61 @@ Example:
  for (node = rb_first(&mytree); node; node = rb_next(node))
	printk("key=%s\n", rb_entry(node, struct mytype, node)->keystring);

Support for Augmented rbtrees
-----------------------------

An augmented rbtree is an rbtree with some additional data stored in each
node. This data can be used to add new functionality to the rbtree.
Augmented rbtree is an optional feature built on top of the basic rbtree
infrastructure. An rbtree user who wants this feature must initialize the
augment callback function in rb_root.

This callback function will be called from rbtree core routines whenever
a node has a change in one or both of its children. It is the responsibility
of the callback function to recalculate the additional data that is in the
rb node using new children information. Note that if this new additional
data affects the parent node's additional data, then callback function has
to handle it and do the recursive updates.


Interval tree is an example of augmented rb tree. Reference -
"Introduction to Algorithms" by Cormen, Leiserson, Rivest and Stein.
More details about interval trees:

Classical rbtree has a single key and it cannot be directly used to store
interval ranges like [lo:hi] and do a quick lookup for any overlap with a new
lo:hi or to find whether there is an exact match for a new lo:hi.

However, rbtree can be augmented to store such interval ranges in a structured
way making it possible to do efficient lookup and exact match.

This "extra information" stored in each node is the maximum hi
(max_hi) value among all the nodes that are its descendants. This
information can be maintained at each node just by looking at the node
and its immediate children. And this will be used in O(log n) lookup
for lowest match (lowest start address among all possible matches)
with something like:

find_lowest_match(lo, hi, node)
{
	lowest_match = NULL;
	while (node) {
		if (max_hi(node->left) > lo) {
			// Lowest overlap if any must be on left side
			node = node->left;
		} else if (overlap(lo, hi, node)) {
			lowest_match = node;
			break;
		} else if (lo > node->lo) {
			// Lowest overlap if any must be on right side
			node = node->right;
		} else {
			break;
		}
	}
	return lowest_match;
}

Finding an exact match consists of first finding the lowest match and then
following successor nodes, looking for an exact match, until the start of a
node is beyond the hi value we are looking for.
+25 −19
Original line number Diff line number Diff line
@@ -44,9 +44,6 @@ static inline void copy_from_user_page(struct vm_area_struct *vma,
	memcpy(dst, src, len);
}

#define PG_WC				PG_arch_1
PAGEFLAG(WC, WC)

#ifdef CONFIG_X86_PAT
/*
 * X86 PAT uses page flags WC and Uncached together to keep track of
@@ -55,16 +52,24 @@ PAGEFLAG(WC, WC)
 * _PAGE_CACHE_UC_MINUS and fourth state where page's memory type has not
 * been changed from its default (value of -1 used to denote this).
 * Note we do not support _PAGE_CACHE_UC here.
 *
 * Caller must hold memtype_lock for atomicity.
 */

/*
 * Encoding of a page's memory type in two bits of page->flags
 * (PG_uncached and PG_arch_1):
 *
 *   neither bit set       -> default, memtype not changed (reported as -1)
 *   PG_arch_1 only        -> _PAGE_CACHE_WC
 *   PG_uncached only      -> _PAGE_CACHE_UC_MINUS
 *   both bits set         -> _PAGE_CACHE_WB
 *
 * _PGMT_MASK selects both bits; _PGMT_CLEAR_MASK keeps every other bit of
 * page->flags, so a new memtype can be merged in without disturbing
 * unrelated page flags.
 */
#define _PGMT_DEFAULT		0
#define _PGMT_WC		(1UL << PG_arch_1)
#define _PGMT_UC_MINUS		(1UL << PG_uncached)
#define _PGMT_WB		(1UL << PG_uncached | 1UL << PG_arch_1)
#define _PGMT_MASK		(1UL << PG_uncached | 1UL << PG_arch_1)
#define _PGMT_CLEAR_MASK	(~_PGMT_MASK)

static inline unsigned long get_page_memtype(struct page *pg)
{
	if (!PageUncached(pg) && !PageWC(pg))
	unsigned long pg_flags = pg->flags & _PGMT_MASK;

	if (pg_flags == _PGMT_DEFAULT)
		return -1;
	else if (!PageUncached(pg) && PageWC(pg))
	else if (pg_flags == _PGMT_WC)
		return _PAGE_CACHE_WC;
	else if (PageUncached(pg) && !PageWC(pg))
	else if (pg_flags == _PGMT_UC_MINUS)
		return _PAGE_CACHE_UC_MINUS;
	else
		return _PAGE_CACHE_WB;
@@ -72,25 +77,26 @@ static inline unsigned long get_page_memtype(struct page *pg)

static inline void set_page_memtype(struct page *pg, unsigned long memtype)
{
	unsigned long memtype_flags = _PGMT_DEFAULT;
	unsigned long old_flags;
	unsigned long new_flags;

	switch (memtype) {
	case _PAGE_CACHE_WC:
		ClearPageUncached(pg);
		SetPageWC(pg);
		memtype_flags = _PGMT_WC;
		break;
	case _PAGE_CACHE_UC_MINUS:
		SetPageUncached(pg);
		ClearPageWC(pg);
		memtype_flags = _PGMT_UC_MINUS;
		break;
	case _PAGE_CACHE_WB:
		SetPageUncached(pg);
		SetPageWC(pg);
		break;
	default:
	case -1:
		ClearPageUncached(pg);
		ClearPageWC(pg);
		memtype_flags = _PGMT_WB;
		break;
	}

	do {
		old_flags = pg->flags;
		new_flags = (old_flags & _PGMT_CLEAR_MASK) | memtype_flags;
	} while (cmpxchg(&pg->flags, old_flags, new_flags) != old_flags);
}
#else
static inline unsigned long get_page_memtype(struct page *pg) { return -1; }
+1 −0
Original line number Diff line number Diff line
@@ -6,6 +6,7 @@ nostackp := $(call cc-option, -fno-stack-protector)
CFLAGS_physaddr.o		:= $(nostackp)
CFLAGS_setup_nx.o		:= $(nostackp)

obj-$(CONFIG_X86_PAT)		+= pat_rbtree.o
obj-$(CONFIG_SMP)		+= tlb.o

obj-$(CONFIG_X86_32)		+= pgtable_32.o iomap_32.o
+19 −220
Original line number Diff line number Diff line
@@ -30,6 +30,8 @@
#include <asm/pat.h>
#include <asm/io.h>

#include "pat_internal.h"

#ifdef CONFIG_X86_PAT
int __read_mostly pat_enabled = 1;

@@ -53,19 +55,15 @@ static inline void pat_disable(const char *reason)
#endif


static int debug_enable;
int pat_debug_enable;

static int __init pat_debug_setup(char *str)
{
	debug_enable = 1;
	pat_debug_enable = 1;
	return 0;
}
__setup("debugpat", pat_debug_setup);

#define dprintk(fmt, arg...) \
	do { if (debug_enable) printk(KERN_INFO fmt, ##arg); } while (0)


static u64 __read_mostly boot_pat_state;

enum {
@@ -132,84 +130,7 @@ void pat_init(void)

#undef PAT

/*
 * Map the cache-attribute bits of @flags to a human-readable name.
 *
 * Only the bits selected by _PAGE_CACHE_MASK are examined. The returned
 * pointer refers to a string literal and must not be modified or freed.
 * Any bit combination that is not one of the four known cache types
 * yields "broken".
 */
static char *cattr_name(unsigned long flags)
{
	const unsigned long cache_attr = flags & _PAGE_CACHE_MASK;

	if (cache_attr == _PAGE_CACHE_UC)
		return "uncached";
	if (cache_attr == _PAGE_CACHE_UC_MINUS)
		return "uncached-minus";
	if (cache_attr == _PAGE_CACHE_WB)
		return "write-back";
	if (cache_attr == _PAGE_CACHE_WC)
		return "write-combining";

	return "broken";
}

/*
 * The global memtype list keeps track of memory type for specific
 * physical memory areas. Conflicting memory types in different
 * mappings can cause CPU cache corruption. To avoid this we keep track.
 *
 * The list is sorted based on starting address and can contain multiple
 * entries for each address (this allows reference counting for overlapping
 * areas). All the aliases have the same cache attributes of course.
 * Zero attributes are represented as holes.
 *
 * The data structure is a list that is also organized as an rbtree
 * sorted on the start address of memtype range.
 *
 * memtype_lock protects both the linear list and rbtree.
 */

/* One tracked physical memory range together with its cache attribute. */
struct memtype {
	/* First address of the range; sort key of both list and rbtree. */
	u64			start;
	/* End of the range; NOTE(review): looks exclusive - confirm. */
	u64			end;
	/* Cache attribute (_PAGE_CACHE_*) tracked for this range. */
	unsigned long		type;
	/* Linkage in memtype_list. */
	struct list_head	nd;
	/* Linkage in the memtype rbtree (memtype_rbroot). */
	struct rb_node		rb;
};

static struct rb_root memtype_rbroot = RB_ROOT;
static LIST_HEAD(memtype_list);
static DEFINE_SPINLOCK(memtype_lock);	/* protects memtype list */

/*
 * Look up @start in the rbtree rooted at @root, which is keyed on
 * memtype->start.
 *
 * Returns the entry whose start equals @start if the descent reaches one.
 * Otherwise returns the most recently visited entry whose start is below
 * @start, or NULL when no visited entry has a start below @start.
 */
static struct memtype *memtype_rb_search(struct rb_root *root, u64 start)
{
	struct memtype *best_lower = NULL;
	struct rb_node *cur;

	for (cur = root->rb_node; cur; ) {
		struct memtype *mt = container_of(cur, struct memtype, rb);

		/* Exact hit on the start address: done. */
		if (mt->start == start)
			return mt;

		/* Entry starts above the target: keep looking to the left. */
		if (mt->start > start) {
			cur = cur->rb_left;
			continue;
		}

		/* Entry starts below the target: remember it, go right. */
		best_lower = mt;
		cur = cur->rb_right;
	}

	return best_lower;
}

/*
 * Insert @data into the rbtree rooted at @root, ordered by start address.
 *
 * An entry whose start is equal to an existing entry's start is placed in
 * that entry's left subtree, so duplicate start addresses are allowed.
 */
static void memtype_rb_insert(struct rb_root *root, struct memtype *data)
{
	struct rb_node **link = &root->rb_node;
	struct rb_node *parent = NULL;

	/* Walk down to the empty slot where the new node belongs. */
	while (*link) {
		struct memtype *cur = container_of(*link, struct memtype, rb);

		parent = *link;
		link = (data->start > cur->start) ? &parent->rb_right
						  : &parent->rb_left;
	}

	/* Hook the node in as a leaf, then let the rbtree core rebalance. */
	rb_link_node(&data->rb, parent, link);
	rb_insert_color(&data->rb, root);
}
static DEFINE_SPINLOCK(memtype_lock);	/* protects memtype accesses */

/*
 * Does intersection of PAT memory type and MTRR memory type and returns
@@ -237,33 +158,6 @@ static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type)
	return req_type;
}

/*
 * Check whether @new can coexist with @entry and with the entries that
 * follow @entry in memtype_list.
 *
 * @new:   range being reserved
 * @entry: first existing entry that overlaps @new
 * @type:  if non-NULL, a type mismatch with @entry is resolved by adopting
 *         @entry's type (written to both new->type and *type); if NULL,
 *         such a mismatch is a conflict
 *
 * Entries after @entry are scanned until one starts at or beyond new->end;
 * any of them with a type different from new->type is a conflict.
 *
 * Returns 0 when there is no conflict, -EBUSY (after logging) otherwise.
 */
static int
chk_conflict(struct memtype *new, struct memtype *entry, unsigned long *type)
{
	if (new->type != entry->type) {
		/* Caller cannot accept a different type: hard conflict. */
		if (!type)
			goto conflict;

		new->type = entry->type;
		*type = entry->type;
	}

	/* check overlaps with more than one entry in the list */
	list_for_each_entry_continue(entry, &memtype_list, nd) {
		if (new->end <= entry->start)
			break;
		if (new->type != entry->type)
			goto conflict;
	}

	return 0;

 conflict:
	printk(KERN_INFO "%s:%d conflicting memory types "
	       "%Lx-%Lx %s<->%s\n", current->comm, current->pid, new->start,
	       new->end, cattr_name(new->type), cattr_name(entry->type));
	return -EBUSY;
}

static int pat_pagerange_is_ram(unsigned long start, unsigned long end)
{
	int ram_page = 0, not_rampage = 0;
@@ -296,8 +190,6 @@ static int pat_pagerange_is_ram(unsigned long start, unsigned long end)
 * Here we do two pass:
 * - Find the memtype of all the pages in the range, look for any conflicts
 * - In case of no conflicts, set the new memtype for pages in the range
 *
 * Caller must hold memtype_lock for atomicity.
 */
static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type,
				  unsigned long *new_type)
@@ -364,9 +256,8 @@ static int free_ram_pages_type(u64 start, u64 end)
int reserve_memtype(u64 start, u64 end, unsigned long req_type,
		    unsigned long *new_type)
{
	struct memtype *new, *entry;
	struct memtype *new;
	unsigned long actual_type;
	struct list_head *where;
	int is_range_ram;
	int err = 0;

@@ -404,9 +295,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
	is_range_ram = pat_pagerange_is_ram(start, end);
	if (is_range_ram == 1) {

		spin_lock(&memtype_lock);
		err = reserve_ram_pages_type(start, end, req_type, new_type);
		spin_unlock(&memtype_lock);

		return err;
	} else if (is_range_ram < 0) {
@@ -423,42 +312,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,

	spin_lock(&memtype_lock);

	/* Search for existing mapping that overlaps the current range */
	where = NULL;
	list_for_each_entry(entry, &memtype_list, nd) {
		if (end <= entry->start) {
			where = entry->nd.prev;
			break;
		} else if (start <= entry->start) { /* end > entry->start */
			err = chk_conflict(new, entry, new_type);
			if (!err) {
				dprintk("Overlap at 0x%Lx-0x%Lx\n",
					entry->start, entry->end);
				where = entry->nd.prev;
			}
			break;
		} else if (start < entry->end) { /* start > entry->start */
			err = chk_conflict(new, entry, new_type);
			if (!err) {
				dprintk("Overlap at 0x%Lx-0x%Lx\n",
					entry->start, entry->end);

				/*
				 * Move to right position in the linked
				 * list to add this new entry
				 */
				list_for_each_entry_continue(entry,
							&memtype_list, nd) {
					if (start <= entry->start) {
						where = entry->nd.prev;
						break;
					}
				}
			}
			break;
		}
	}

	err = rbt_memtype_check_insert(new, new_type);
	if (err) {
		printk(KERN_INFO "reserve_memtype failed 0x%Lx-0x%Lx, "
		       "track %s, req %s\n",
@@ -469,13 +323,6 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
		return err;
	}

	if (where)
		list_add(&new->nd, where);
	else
		list_add_tail(&new->nd, &memtype_list);

	memtype_rb_insert(&memtype_rbroot, new);

	spin_unlock(&memtype_lock);

	dprintk("reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
@@ -487,7 +334,6 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,

int free_memtype(u64 start, u64 end)
{
	struct memtype *entry, *saved_entry;
	int err = -EINVAL;
	int is_range_ram;

@@ -501,9 +347,7 @@ int free_memtype(u64 start, u64 end)
	is_range_ram = pat_pagerange_is_ram(start, end);
	if (is_range_ram == 1) {

		spin_lock(&memtype_lock);
		err = free_ram_pages_type(start, end);
		spin_unlock(&memtype_lock);

		return err;
	} else if (is_range_ram < 0) {
@@ -511,46 +355,7 @@ int free_memtype(u64 start, u64 end)
	}

	spin_lock(&memtype_lock);

	entry = memtype_rb_search(&memtype_rbroot, start);
	if (unlikely(entry == NULL))
		goto unlock_ret;

	/*
	 * Saved entry points to an entry with start same or less than what
	 * we searched for. Now go through the list in both directions to look
	 * for the entry that matches with both start and end, with list stored
	 * in sorted start address
	 */
	saved_entry = entry;
	list_for_each_entry_from(entry, &memtype_list, nd) {
		if (entry->start == start && entry->end == end) {
			rb_erase(&entry->rb, &memtype_rbroot);
			list_del(&entry->nd);
			kfree(entry);
			err = 0;
			break;
		} else if (entry->start > start) {
			break;
		}
	}

	if (!err)
		goto unlock_ret;

	entry = saved_entry;
	list_for_each_entry_reverse(entry, &memtype_list, nd) {
		if (entry->start == start && entry->end == end) {
			rb_erase(&entry->rb, &memtype_rbroot);
			list_del(&entry->nd);
			kfree(entry);
			err = 0;
			break;
		} else if (entry->start < start) {
			break;
		}
	}
unlock_ret:
	err = rbt_memtype_erase(start, end);
	spin_unlock(&memtype_lock);

	if (err) {
@@ -583,10 +388,8 @@ static unsigned long lookup_memtype(u64 paddr)

	if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) {
		struct page *page;
		spin_lock(&memtype_lock);
		page = pfn_to_page(paddr >> PAGE_SHIFT);
		rettype = get_page_memtype(page);
		spin_unlock(&memtype_lock);
		/*
		 * -1 from get_page_memtype() implies RAM page is in its
		 * default state and not reserved, and hence of type WB
@@ -599,7 +402,7 @@ static unsigned long lookup_memtype(u64 paddr)

	spin_lock(&memtype_lock);

	entry = memtype_rb_search(&memtype_rbroot, paddr);
	entry = rbt_memtype_lookup(paddr);
	if (entry != NULL)
		rettype = entry->type;
	else
@@ -936,30 +739,26 @@ EXPORT_SYMBOL_GPL(pgprot_writecombine);

#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT)

/* get Nth element of the linked list */
static struct memtype *memtype_get_idx(loff_t pos)
{
	struct memtype *list_node, *print_entry;
	int i = 1;
	struct memtype *print_entry;
	int ret;

	print_entry  = kmalloc(sizeof(struct memtype), GFP_KERNEL);
	print_entry  = kzalloc(sizeof(struct memtype), GFP_KERNEL);
	if (!print_entry)
		return NULL;

	spin_lock(&memtype_lock);
	list_for_each_entry(list_node, &memtype_list, nd) {
		if (pos == i) {
			*print_entry = *list_node;
	ret = rbt_memtype_copy_nth_element(print_entry, pos);
	spin_unlock(&memtype_lock);

	if (!ret) {
		return print_entry;
		}
		++i;
	}
	spin_unlock(&memtype_lock);
	} else {
		kfree(print_entry);

		return NULL;
	}
}

static void *memtype_seq_start(struct seq_file *seq, loff_t *pos)
{
+46 −0
Original line number Diff line number Diff line
#ifndef __PAT_INTERNAL_H_
#define __PAT_INTERNAL_H_

/*
 * Debug switch for the PAT code. Defined in pat.c, where the "debugpat"
 * boot parameter handler sets it to 1.
 */
extern int pat_debug_enable;

/*
 * Debug print helper: forwards to printk() at KERN_INFO level, but only
 * when pat_debug_enable is non-zero. Wrapped in do { } while (0) so it
 * behaves as a single statement.
 */
#define dprintk(fmt, arg...) \
	do { if (pat_debug_enable) printk(KERN_INFO fmt, ##arg); } while (0)

/* One tracked physical memory range together with its cache attribute. */
struct memtype {
	/* First address of the range. */
	u64			start;
	/* End of the range; NOTE(review): looks exclusive - confirm. */
	u64			end;
	/*
	 * Augmented-rbtree data; presumably the largest 'end' found in the
	 * subtree rooted at this node - confirm against pat_rbtree.c.
	 */
	u64			subtree_max_end;
	/* Cache attribute (_PAGE_CACHE_*) tracked for this range. */
	unsigned long		type;
	/* Linkage in the memtype rbtree. */
	struct rb_node		rb;
};

/*
 * Map the cache-attribute bits of @flags to a human-readable name.
 *
 * Only the bits selected by _PAGE_CACHE_MASK are examined. The returned
 * pointer refers to a string literal and must not be modified or freed.
 * Any bit combination that is not one of the four known cache types
 * yields "broken".
 */
static inline char *cattr_name(unsigned long flags)
{
	const unsigned long cache_attr = flags & _PAGE_CACHE_MASK;

	if (cache_attr == _PAGE_CACHE_UC)
		return "uncached";
	if (cache_attr == _PAGE_CACHE_UC_MINUS)
		return "uncached-minus";
	if (cache_attr == _PAGE_CACHE_WB)
		return "write-back";
	if (cache_attr == _PAGE_CACHE_WC)
		return "write-combining";

	return "broken";
}

/*
 * Interface to the rbtree backend of memtype tracking. The Makefile builds
 * pat_rbtree.o only for CONFIG_X86_PAT; without it the inline stubs below
 * are used, which do nothing and report success / "not found".
 *
 * The callers in pat.c invoke all of these with memtype_lock held.
 *
 * rbt_memtype_check_insert(new, new_type)
 *	Returns 0 on success; reserve_memtype() treats any non-zero return
 *	as a failure and propagates it.
 *	NOTE(review): presumably checks @new for conflicts, may report an
 *	adjusted type through @new_type, and inserts @new - confirm in
 *	pat_rbtree.c.
 *
 * rbt_memtype_erase(start, end)
 *	Returns 0 on success; free_memtype() treats non-zero as an error.
 *	NOTE(review): presumably removes the entry matching exactly
 *	[start, end) - confirm in pat_rbtree.c.
 *
 * rbt_memtype_lookup(addr)
 *	Returns the entry used to determine the memtype of @addr, or NULL
 *	if there is none.
 *
 * rbt_memtype_copy_nth_element(out, pos)
 *	Returns 0 when @out has been filled in with the entry at position
 *	@pos; on a non-zero return memtype_get_idx() discards @out.
 */
#ifdef CONFIG_X86_PAT
extern int rbt_memtype_check_insert(struct memtype *new,
					unsigned long *new_type);
extern int rbt_memtype_erase(u64 start, u64 end);
extern struct memtype *rbt_memtype_lookup(u64 addr);
extern int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos);
#else
/* Stubs for !CONFIG_X86_PAT: nothing is tracked, every call "succeeds". */
static inline int rbt_memtype_check_insert(struct memtype *new,
					unsigned long *new_type)
{ return 0; }
static inline int rbt_memtype_erase(u64 start, u64 end)
{ return 0; }
static inline struct memtype *rbt_memtype_lookup(u64 addr)
{ return NULL; }
static inline int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos)
{ return 0; }
#endif

#endif /* __PAT_INTERNAL_H_ */
Loading