Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit e8e5e8a8, authored by Cliff Wickman, committed by Ingo Molnar
Browse files

x86, UV: BAU tunables into a debugfs file



Make the Broadcast Assist Unit driver's nine tuning values variable by
making them accessible through a read/write debugfs file.

The file will normally be mounted as
/sys/kernel/debug/sgi_uv/bau_tunables. The tunables are kept in each
cpu's per-cpu BAU structure.

The patch also does a little name improvement, and corrects the reset of
two destination timeout counters.

Signed-off-by: Cliff Wickman <cpw@sgi.com>
Cc: gregkh@suse.de
LKML-Reference: <E1OJvNx-0004Zx-Uo@eag09.americas.sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
parent 12a6611f
Loading
Loading
Loading
Loading
+37 −16
Original line number Diff line number Diff line
@@ -45,10 +45,14 @@
#define UV_DESC_BASE_PNODE_SHIFT	49
#define UV_PAYLOADQ_PNODE_SHIFT		49
#define UV_PTC_BASENAME			"sgi_uv/ptc_statistics"
#define UV_BAU_BASENAME			"sgi_uv/bau_tunables"
#define UV_BAU_TUNABLES_DIR		"sgi_uv"
#define UV_BAU_TUNABLES_FILE		"bau_tunables"
#define WHITESPACE			" \t\n"
#define uv_physnodeaddr(x)		((__pa((unsigned long)(x)) & uv_mmask))
#define UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT 15
#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT 16
#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x000000000bUL
#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x0000000009UL
/* [19:16] SOFT_ACK timeout period  19: 1 is urgency 7  17:16 1 is multiplier */
#define BAU_MISC_CONTROL_MULT_MASK 3

@@ -70,25 +74,23 @@
#define DESC_STATUS_DESTINATION_TIMEOUT	2
#define DESC_STATUS_SOURCE_TIMEOUT	3

#define TIMEOUT_DELAY			10
/*
 * source side thresholds at which message retries print a warning
 */
#define SOURCE_TIMEOUT_LIMIT		20
#define DESTINATION_TIMEOUT_LIMIT	20

/*
 * misc. delays, in microseconds
 * delay for 'plugged' timeout retries, in microseconds
 */
#define THROTTLE_DELAY			10
#define TIMEOUT_DELAY			10
#define BIOS_TO				1000
/* BIOS is assumed to set the destination timeout to 1003520 nanoseconds */
#define PLUGGED_DELAY			10

/*
 * thresholds at which to use IPI to free resources
 */
/* after this # consecutive 'plugged' timeouts, use IPI to release resources */
#define PLUGSB4RESET 100
#define TIMEOUTSB4RESET 100
/* after this many consecutive timeouts, use IPI to release resources */
#define TIMEOUTSB4RESET 1
/* after this many uses of IPI to release resources, give up the request */
#define IPI_RESET_LIMIT 1
/* after this # consecutive successes, bump up the throttle if it was lowered */
#define COMPLETE_THRESHOLD 5

/*
 * number of entries in the destination side payload queue
@@ -107,6 +109,13 @@
#define FLUSH_GIVEUP			3
#define FLUSH_COMPLETE			4

/*
 * tuning the action when the numalink network is extremely delayed
 */
#define CONGESTED_RESPONSE_US 1000 /* 'long' response time, in microseconds */
#define CONGESTED_REPS 10 /* long delays averaged over this many broadcasts */
#define CONGESTED_PERIOD 30 /* time for the bau to be disabled, in seconds */

/*
 * Distribution: 32 bytes (256 bits) (bytes 0-0x1f of descriptor)
 * If the 'multilevel' flag in the header portion of the descriptor
@@ -323,14 +332,13 @@ struct bau_control {
	struct bau_control *uvhub_master;
	struct bau_control *socket_master;
	unsigned long timeout_interval;
	unsigned long set_bau_on_time;
	atomic_t active_descriptor_count;
	int max_concurrent;
	int max_concurrent_constant;
	int retry_message_scans;
	int plugged_tries;
	int timeout_tries;
	int ipi_attempts;
	int conseccompletes;
	int set_bau_off;
	short cpu;
	short uvhub_cpu;
	short uvhub;
@@ -343,6 +351,19 @@ struct bau_control {
	spinlock_t masks_lock;
	spinlock_t uvhub_lock;
	spinlock_t queue_lock;
	/* tunables */
	int max_bau_concurrent;
	int max_bau_concurrent_constant;
	int plugged_delay;
	int plugsb4reset;
	int timeoutsb4reset;
	int ipi_reset_limit;
	int complete_threshold;
	int congested_response_us;
	int congested_reps;
	int congested_period;
	cycles_t period_time;
	long period_requests;
};

/*
+241 −40
Original line number Diff line number Diff line
@@ -8,6 +8,7 @@
 */
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/debugfs.h>
#include <linux/kernel.h>
#include <linux/slab.h>

@@ -42,12 +43,22 @@ static int timeout_base_ns[] = {
		167772160
};
static int timeout_us;
static int nobau;

#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD	0x000000000bUL

static int uv_bau_max_concurrent __read_mostly;
/* tunables: */
static int max_bau_concurrent = MAX_BAU_CONCURRENT;
static int max_bau_concurrent_constant = MAX_BAU_CONCURRENT;
static int plugged_delay = PLUGGED_DELAY;
static int plugsb4reset = PLUGSB4RESET;
static int timeoutsb4reset = TIMEOUTSB4RESET;
static int ipi_reset_limit = IPI_RESET_LIMIT;
static int complete_threshold = COMPLETE_THRESHOLD;
static int congested_response_us = CONGESTED_RESPONSE_US;
static int congested_reps = CONGESTED_REPS;
static int congested_period = CONGESTED_PERIOD;
static struct dentry *tunables_dir;
static struct dentry *tunables_file;

static int nobau;
static int __init setup_nobau(char *arg)
{
	nobau = 1;
@@ -539,23 +550,24 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc,
	unsigned long index;
	cycles_t time1;
	cycles_t time2;
	cycles_t elapsed;
	struct ptc_stats *stat = &per_cpu(ptcstats, bcp->cpu);
	struct bau_control *smaster = bcp->socket_master;
	struct bau_control *hmaster = bcp->uvhub_master;

	/*
	 * Spin here while there are hmaster->max_concurrent or more active
	 * Spin here while there are hmaster->max_bau_concurrent or more active
	 * descriptors. This is the per-uvhub 'throttle'.
	 */
	if (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
			&hmaster->active_descriptor_count,
			hmaster->max_concurrent)) {
			hmaster->max_bau_concurrent)) {
		stat->s_throttles++;
		do {
			cpu_relax();
		} while (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
			&hmaster->active_descriptor_count,
			hmaster->max_concurrent));
			hmaster->max_bau_concurrent));
	}

	while (hmaster->uvhub_quiesce)
@@ -609,9 +621,9 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc,
			 * that case hardware immediately returns the ERROR
			 * that looks like a destination timeout.
			 */
			udelay(TIMEOUT_DELAY);
			udelay(bcp->plugged_delay);
			bcp->plugged_tries++;
			if (bcp->plugged_tries >= PLUGSB4RESET) {
			if (bcp->plugged_tries >= bcp->plugsb4reset) {
				bcp->plugged_tries = 0;
				quiesce_local_uvhub(hmaster);
				spin_lock(&hmaster->queue_lock);
@@ -623,10 +635,10 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc,
				stat->s_resets_plug++;
			}
		} else if (completion_status == FLUSH_RETRY_TIMEOUT) {
			hmaster->max_concurrent = 1;
			hmaster->max_bau_concurrent = 1;
			bcp->timeout_tries++;
			udelay(TIMEOUT_DELAY);
			if (bcp->timeout_tries >= TIMEOUTSB4RESET) {
			if (bcp->timeout_tries >= bcp->timeoutsb4reset) {
				bcp->timeout_tries = 0;
				quiesce_local_uvhub(hmaster);
				spin_lock(&hmaster->queue_lock);
@@ -638,7 +650,7 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc,
				stat->s_resets_timeout++;
			}
		}
		if (bcp->ipi_attempts >= 3) {
		if (bcp->ipi_attempts >= bcp->ipi_reset_limit) {
			bcp->ipi_attempts = 0;
			completion_status = FLUSH_GIVEUP;
			break;
@@ -648,9 +660,14 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc,
		 (completion_status == FLUSH_RETRY_TIMEOUT));
	time2 = get_cycles();

	if ((completion_status == FLUSH_COMPLETE) && (bcp->conseccompletes > 5)
	    && (hmaster->max_concurrent < hmaster->max_concurrent_constant))
			hmaster->max_concurrent++;
	bcp->plugged_tries = 0;
	bcp->timeout_tries = 0;

	if ((completion_status == FLUSH_COMPLETE) &&
	    (bcp->conseccompletes > bcp->complete_threshold) &&
	    (hmaster->max_bau_concurrent <
					hmaster->max_bau_concurrent_constant))
			hmaster->max_bau_concurrent++;

	/*
	 * hold any cpu not timing out here; no other cpu currently held by
@@ -661,9 +678,10 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc,
	atomic_dec(&hmaster->active_descriptor_count);

	/* guard against cycles wrap */
	if (time2 > time1)
		stat->s_time += (time2 - time1);
	else
	if (time2 > time1) {
		elapsed = time2 - time1;
		stat->s_time += elapsed;
	} else
		stat->s_requestor--; /* don't count this one */
	if (completion_status == FLUSH_COMPLETE && try > 1)
		stat->s_retriesok++;
@@ -730,10 +748,12 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
	struct ptc_stats *stat;
	struct bau_control *bcp;

	/* kernel was booted 'nobau' */
	if (nobau)
		return cpumask;

	bcp = &per_cpu(bau_control, cpu);

	/*
	 * Each sending cpu has a per-cpu mask which it fills from the caller's
	 * cpu mask.  Only remote cpus are converted to uvhubs and copied.
@@ -970,6 +990,7 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data)
			   stat->s_resets_plug, stat->s_resets_timeout,
			   stat->s_giveup, stat->s_stimeout,
			   stat->s_busy, stat->s_throttles);

		/* destination side statistics */
		seq_printf(file,
			   "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld\n",
@@ -985,10 +1006,29 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data)
	return 0;
}

/*
 * Display the tunables thru debugfs
 *
 * Formats one line naming the nine tunables followed by one line holding
 * their current file-scope values, then lets simple_read_from_buffer()
 * copy out the portion selected by *ppos and count.
 *
 * Returns whatever simple_read_from_buffer() returns.
 */
static ssize_t tunables_read(struct file *file, char __user *userbuf,
						size_t count, loff_t *ppos)
{
	char buf[300];
	int ret;

	ret = snprintf(buf, sizeof(buf),
		"%s %s %s\n%d %d %d %d %d %d %d %d %d\n",
		"max_bau_concurrent plugged_delay plugsb4reset",
		"timeoutsb4reset ipi_reset_limit complete_threshold",
		"congested_response_us congested_reps congested_period",
		max_bau_concurrent, plugged_delay, plugsb4reset,
		timeoutsb4reset, ipi_reset_limit, complete_threshold,
		congested_response_us, congested_reps, congested_period);
	/*
	 * snprintf() reports the length the text would have had without
	 * truncation; never tell the copy-out helper that more bytes are
	 * available than buf actually holds.
	 */
	if (ret >= (int)sizeof(buf))
		ret = sizeof(buf) - 1;

	return simple_read_from_buffer(userbuf, count, ppos, buf, ret);
}

/*
 * -1: reset the statistics
 *  0: display meaning of the statistics
 * >0: maximum concurrent active descriptors per uvhub (throttle)
 */
static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user,
				 size_t count, loff_t *data)
@@ -997,7 +1037,6 @@ static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user,
	long input_arg;
	char optstr[64];
	struct ptc_stats *stat;
	struct bau_control *bcp;

	if (count == 0 || count > sizeof(optstr))
		return -EINVAL;
@@ -1078,24 +1117,149 @@ static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user,
			stat = &per_cpu(ptcstats, cpu);
			memset(stat, 0, sizeof(struct ptc_stats));
		}
	} else {
		uv_bau_max_concurrent = input_arg;
	}

	return count;
}

/*
 * Convert the leading run of decimal digits in 'name' to an int.
 *
 * Conversion stops at the first character that is not '0'..'9' (including
 * the terminating NUL), so an empty or non-numeric string yields 0.
 * No sign or leading whitespace is accepted.
 *
 * A value too large for an int saturates at INT_MAX instead of overflowing
 * (signed overflow is undefined behaviour).
 */
static int local_atoi(const char *name)
{
	int val = 0;

	for (; *name >= '0' && *name <= '9'; name++) {
		int digit = *name - '0';

		/* would 10*val + digit exceed INT_MAX? */
		if (val > (INT_MAX - digit) / 10)
			return INT_MAX;
		val = 10 * val + digit;
	}
	return val;
}

/*
 * set the tunables
 * 0 values reset them to defaults
 *
 * The written text must hold exactly nine fields separated by WHITESPACE
 * characters, in this order: max_bau_concurrent, plugged_delay,
 * plugsb4reset, timeoutsb4reset, ipi_reset_limit, complete_threshold,
 * congested_response_us, congested_reps, congested_period.
 * Each field is converted with local_atoi(); the results are stored in the
 * file-scope tunables and then copied into the bau_control structure of
 * every present cpu.
 *
 * Returns count on success, -EINVAL when the input is empty, does not fit
 * in the local buffer, does not hold nine fields, or asks for a
 * max_bau_concurrent outside 1..cpus_in_uvhub, and -EFAULT when the user
 * buffer cannot be copied.
 *
 * NOTE(review): this listing appears to interleave removed lines of the
 * old uv_ptc_proc_write() code (the ones referring to
 * uv_bau_max_concurrent and bcp->max_concurrent) with the new code --
 * confirm against the applied source.
 */
static ssize_t tunables_write(struct file *file, const char __user *user,
				 size_t count, loff_t *data)
{
	int cpu;
	int cnt = 0;
	int val;
	char *p;
	char *q;
	char instr[64];
	struct bau_control *bcp;

	/* leave room for the terminating NUL added below */
	if (count == 0 || count > sizeof(instr)-1)
		return -EINVAL;
	if (copy_from_user(instr, user, count))
		return -EFAULT;

	instr[count] = '\0';
	/* count the fields */
	p = instr + strspn(instr, WHITESPACE);
	q = p;
	for (; *p; p = q + strspn(q, WHITESPACE)) {
		q = p + strcspn(p, WHITESPACE);
		cnt++;
		if (q == p)
			break;
	}
	if (cnt != 9) {
		printk(KERN_INFO "bau tunable error: should be 9 numbers\n");
		return -EINVAL;
	}

	/* second pass: convert each field; cnt is the index of the tunable */
	p = instr + strspn(instr, WHITESPACE);
	q = p;
	for (cnt = 0; *p; p = q + strspn(q, WHITESPACE), cnt++) {
		q = p + strcspn(p, WHITESPACE);
		val = local_atoi(p);
		switch (cnt) {
		case 0:
			if (val == 0) {
				max_bau_concurrent = MAX_BAU_CONCURRENT;
				max_bau_concurrent_constant =
							MAX_BAU_CONCURRENT;
				continue;
			}
			/* range check against the uvhub of the writing cpu */
			bcp = &per_cpu(bau_control, smp_processor_id());
		if (uv_bau_max_concurrent < 1 ||
		    uv_bau_max_concurrent > bcp->cpus_in_uvhub) {
			if (val < 1 || val > bcp->cpus_in_uvhub) {
				printk(KERN_DEBUG
				"Error: BAU max concurrent %d; %d is invalid\n",
				bcp->max_concurrent, uv_bau_max_concurrent);
				"Error: BAU max concurrent %d is invalid\n",
				val);
				return -EINVAL;
			}
		printk(KERN_DEBUG "Set BAU max concurrent:%d\n",
		       uv_bau_max_concurrent);
			max_bau_concurrent = val;
			max_bau_concurrent_constant = val;
			continue;
		case 1:
			if (val == 0)
				plugged_delay = PLUGGED_DELAY;
			else
				plugged_delay = val;
			continue;
		case 2:
			if (val == 0)
				plugsb4reset = PLUGSB4RESET;
			else
				plugsb4reset = val;
			continue;
		case 3:
			if (val == 0)
				timeoutsb4reset = TIMEOUTSB4RESET;
			else
				timeoutsb4reset = val;
			continue;
		case 4:
			if (val == 0)
				ipi_reset_limit = IPI_RESET_LIMIT;
			else
				ipi_reset_limit = val;
			continue;
		case 5:
			if (val == 0)
				complete_threshold = COMPLETE_THRESHOLD;
			else
				complete_threshold = val;
			continue;
		case 6:
			if (val == 0)
				congested_response_us = CONGESTED_RESPONSE_US;
			else
				congested_response_us = val;
			continue;
		case 7:
			if (val == 0)
				congested_reps = CONGESTED_REPS;
			else
				congested_reps = val;
			continue;
		case 8:
			if (val == 0)
				congested_period = CONGESTED_PERIOD;
			else
				congested_period = val;
			continue;
		}
		if (q == p)
			break;
	}
	/* publish the new values to every present cpu's BAU control structure */
	for_each_present_cpu(cpu) {
		bcp = &per_cpu(bau_control, cpu);
			bcp->max_concurrent = uv_bau_max_concurrent;
		}
		bcp->max_bau_concurrent = max_bau_concurrent;
		bcp->max_bau_concurrent_constant = max_bau_concurrent;
		bcp->plugged_delay = plugged_delay;
		bcp->plugsb4reset = plugsb4reset;
		bcp->timeoutsb4reset = timeoutsb4reset;
		bcp->ipi_reset_limit = ipi_reset_limit;
		bcp->complete_threshold = complete_threshold;
		bcp->congested_response_us = congested_response_us;
		bcp->congested_reps = congested_reps;
		bcp->congested_period = congested_period;
	}

	return count;
}

@@ -1111,6 +1275,11 @@ static int uv_ptc_proc_open(struct inode *inode, struct file *file)
	return seq_open(file, &uv_ptc_seq_ops);
}

/*
 * open handler for the tunables file: there is no per-open state to set up,
 * so it does nothing and always returns 0.
 */
static int tunables_open(struct inode *inode, struct file *file)
{
	return 0;
}

static const struct file_operations proc_uv_ptc_operations = {
	.open		= uv_ptc_proc_open,
	.read		= seq_read,
@@ -1119,6 +1288,12 @@ static const struct file_operations proc_uv_ptc_operations = {
	.release	= seq_release,
};

/* file operations of the tunables file: open, read and write handlers */
static const struct file_operations tunables_fops = {
	.open		= tunables_open,
	.read		= tunables_read,
	.write		= tunables_write,
};

static int __init uv_ptc_init(void)
{
	struct proc_dir_entry *proc_uv_ptc;
@@ -1133,6 +1308,20 @@ static int __init uv_ptc_init(void)
		       UV_PTC_BASENAME);
		return -EINVAL;
	}

	tunables_dir = debugfs_create_dir(UV_BAU_TUNABLES_DIR, NULL);
	if (!tunables_dir) {
		printk(KERN_ERR "unable to create debugfs directory %s\n",
		       UV_BAU_TUNABLES_DIR);
		return -EINVAL;
	}
	tunables_file = debugfs_create_file(UV_BAU_TUNABLES_FILE, 0600,
			tunables_dir, NULL, &tunables_fops);
	if (!tunables_file) {
		printk(KERN_ERR "unable to create debugfs file %s\n",
		       UV_BAU_TUNABLES_FILE);
		return -EINVAL;
	}
	return 0;
}

@@ -1336,15 +1525,12 @@ static void uv_init_per_cpu(int nuvhubs)
		bcp = &per_cpu(bau_control, cpu);
		memset(bcp, 0, sizeof(struct bau_control));
		spin_lock_init(&bcp->masks_lock);
		bcp->max_concurrent = uv_bau_max_concurrent;
		pnode = uv_cpu_hub_info(cpu)->pnode;
		uvhub = uv_cpu_hub_info(cpu)->numa_blade_id;
		bdp = &uvhub_descs[uvhub];
		bdp->num_cpus++;
		bdp->uvhub = uvhub;
		bdp->pnode = pnode;
		/* time interval to catch a hardware stay-busy bug */
		bcp->timeout_interval = microsec_2_cycles(2*timeout_us);
		/* kludge: assume uv_hub.h is constant */
		socket = (cpu_physical_id(cpu)>>5)&1;
		if (socket >= bdp->num_sockets)
@@ -1380,6 +1566,21 @@ static void uv_init_per_cpu(int nuvhubs)
		}
	}
	kfree(uvhub_descs);
	for_each_present_cpu(cpu) {
		bcp = &per_cpu(bau_control, cpu);
		/* time interval to catch a hardware stay-busy bug */
		bcp->timeout_interval = microsec_2_cycles(2*timeout_us);
		bcp->max_bau_concurrent = max_bau_concurrent;
		bcp->max_bau_concurrent_constant = max_bau_concurrent;
		bcp->plugged_delay = plugged_delay;
		bcp->plugsb4reset = plugsb4reset;
		bcp->timeoutsb4reset = timeoutsb4reset;
		bcp->ipi_reset_limit = ipi_reset_limit;
		bcp->complete_threshold = complete_threshold;
		bcp->congested_response_us = congested_response_us;
		bcp->congested_reps = congested_reps;
		bcp->congested_period = congested_period;
	}
}

/*
@@ -1404,7 +1605,7 @@ static int __init uv_bau_init(void)
		zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu),
				       GFP_KERNEL, cpu_to_node(cur_cpu));

	uv_bau_max_concurrent = MAX_BAU_CONCURRENT;
	max_bau_concurrent = MAX_BAU_CONCURRENT;
	uv_nshift = uv_hub_info->m_val;
	uv_mmask = (1UL << uv_hub_info->m_val) - 1;
	nuvhubs = uv_num_possible_blades();
@@ -1437,4 +1638,4 @@ static int __init uv_bau_init(void)
	return 0;
}
core_initcall(uv_bau_init);
core_initcall(uv_ptc_init);
fs_initcall(uv_ptc_init);