Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit a521b048, authored by Stefan Haberland, committed by Martin Schwidefsky
Browse files

s390/dasd: channel path aware error recovery



With this feature, the DASD device driver more robustly handles DASDs
that are attached via multiple channel paths and are subject to
constant Interface-Control-Checks (IFCCs) and Channel-Control-Checks
(CCCs) or loss of High-Performance-FICON (HPF) functionality on one or
more of these paths.

If a channel path does not work correctly, it is removed from normal
operation as long as other channel paths are available. All extended
error recovery states can be queried and reset via user space
interfaces.

Signed-off-by: Stefan Haberland <sth@linux.vnet.ibm.com>
Reviewed-by: Sebastian Ott <sebott@linux.vnet.ibm.com>
Reviewed-by: Jan Hoeppner <hoeppner@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
parent c9346151
Loading
Loading
Loading
Loading
+5 −1
Original line number Diff line number Diff line
@@ -96,7 +96,8 @@ struct tm_scsw {
	u32 dstat:8;
	u32 cstat:8;
	u32 fcxs:8;
	u32 schxs:8;
	u32 ifob:1;
	u32 sesq:7;
} __attribute__ ((packed));

/**
@@ -177,6 +178,9 @@ union scsw {
#define SCHN_STAT_INTF_CTRL_CHK	 0x02
#define SCHN_STAT_CHAIN_CHECK	 0x01

#define SCSW_SESQ_DEV_NOFCX	 3
#define SCSW_SESQ_PATH_NOFCX	 4

/*
 * architectured values for first sense byte
 */
+111 −38
Original line number Diff line number Diff line
@@ -69,6 +69,7 @@ static void dasd_block_tasklet(struct dasd_block *);
static void do_kick_device(struct work_struct *);
static void do_restore_device(struct work_struct *);
static void do_reload_device(struct work_struct *);
static void do_requeue_requests(struct work_struct *);
static void dasd_return_cqr_cb(struct dasd_ccw_req *, void *);
static void dasd_device_timeout(unsigned long);
static void dasd_block_timeout(unsigned long);
@@ -125,6 +126,7 @@ struct dasd_device *dasd_alloc_device(void)
	INIT_WORK(&device->kick_work, do_kick_device);
	INIT_WORK(&device->restore_device, do_restore_device);
	INIT_WORK(&device->reload_device, do_reload_device);
	INIT_WORK(&device->requeue_requests, do_requeue_requests);
	device->state = DASD_STATE_NEW;
	device->target = DASD_STATE_NEW;
	mutex_init(&device->state_mutex);
@@ -1622,6 +1624,13 @@ void dasd_generic_handle_state_change(struct dasd_device *device)
}
EXPORT_SYMBOL_GPL(dasd_generic_handle_state_change);

/*
 * Check whether an IRB reports a High Performance FICON (HPF) error,
 * i.e. valid transport-mode sense data with a sequence code saying
 * that fcx is no longer available on the device or on a path.
 */
static int dasd_check_hpf_error(struct irb *irb)
{
	if (!scsw_tm_is_valid_schxs(&irb->scsw))
		return 0;

	return (irb->scsw.tm.sesq == SCSW_SESQ_DEV_NOFCX ||
		irb->scsw.tm.sesq == SCSW_SESQ_PATH_NOFCX);
}

/*
 * Interrupt handler for "normal" ssch-io based dasd devices.
 */
@@ -1748,6 +1757,13 @@ void dasd_int_handler(struct ccw_device *cdev, unsigned long intparm,
					  struct dasd_ccw_req, devlist);
		}
	} else {  /* error */
		/* check for HPF error
		 * call discipline function to requeue all requests
		 * and disable HPF accordingly
		 */
		if (cqr->cpmode && dasd_check_hpf_error(irb) &&
		    device->discipline->handle_hpf_error)
			device->discipline->handle_hpf_error(device, irb);
		/*
		 * If we don't want complex ERP for this request, then just
		 * reset this and retry it in the fastpath
@@ -2924,10 +2940,10 @@ static int _dasd_requeue_request(struct dasd_ccw_req *cqr)

	if (!block)
		return -EINVAL;
	spin_lock_irqsave(&block->queue_lock, flags);
	spin_lock_irqsave(&block->request_queue_lock, flags);
	req = (struct request *) cqr->callback_data;
	blk_requeue_request(block->request_queue, req);
	spin_unlock_irqrestore(&block->queue_lock, flags);
	spin_unlock_irqrestore(&block->request_queue_lock, flags);

	return 0;
}
@@ -3701,7 +3717,7 @@ EXPORT_SYMBOL_GPL(dasd_generic_notify);
void dasd_generic_path_event(struct ccw_device *cdev, int *path_event)
{
	struct dasd_device *device;
	int chp, oldopm;
	int chp, oldopm, hpfpm, ifccpm;

	device = dasd_device_from_cdev_locked(cdev);
	if (IS_ERR(device))
@@ -3733,7 +3749,30 @@ void dasd_generic_path_event(struct ccw_device *cdev, int *path_event)
				device->discipline->kick_validate(device);
		}
	}
	if (oldopm && !dasd_path_get_opm(device)) {
	hpfpm = dasd_path_get_hpfpm(device);
	ifccpm = dasd_path_get_ifccpm(device);
	if (!dasd_path_get_opm(device) && hpfpm) {
		/*
		 * device has no operational paths but at least one path is
		 * disabled due to HPF errors
		 * disable HPF at all and use the path(s) again
		 */
		if (device->discipline->disable_hpf)
			device->discipline->disable_hpf(device);
		dasd_device_set_stop_bits(device, DASD_STOPPED_NOT_ACC);
		dasd_path_set_tbvpm(device, hpfpm);
		dasd_schedule_device_bh(device);
		dasd_schedule_requeue(device);
	} else if (!dasd_path_get_opm(device) && ifccpm) {
		/*
		 * device has no operational paths but at least one path is
		 * disabled due to IFCC errors
		 * trigger path verification on paths with IFCC errors
		 */
		dasd_path_set_tbvpm(device, ifccpm);
		dasd_schedule_device_bh(device);
	}
	if (oldopm && !dasd_path_get_opm(device) && !hpfpm && !ifccpm) {
		dev_warn(&device->cdev->dev,
			 "No verified channel paths remain for the device\n");
		DBF_DEV_EVENT(DBF_WARNING, device,
@@ -3757,30 +3796,18 @@ int dasd_generic_verify_path(struct dasd_device *device, __u8 lpm)
}
EXPORT_SYMBOL_GPL(dasd_generic_verify_path);


int dasd_generic_pm_freeze(struct ccw_device *cdev)
/*
 * clear active requests and requeue them to block layer if possible
 */
static int dasd_generic_requeue_all_requests(struct dasd_device *device)
{
	struct dasd_device *device = dasd_device_from_cdev(cdev);
	struct list_head freeze_queue;
	struct list_head requeue_queue;
	struct dasd_ccw_req *cqr, *n;
	struct dasd_ccw_req *refers;
	int rc;

	if (IS_ERR(device))
		return PTR_ERR(device);

	/* mark device as suspended */
	set_bit(DASD_FLAG_SUSPENDED, &device->flags);

	if (device->discipline->freeze)
		rc = device->discipline->freeze(device);

	/* disallow new I/O  */
	dasd_device_set_stop_bits(device, DASD_STOPPED_PM);

	/* clear active requests and requeue them to block layer if possible */
	INIT_LIST_HEAD(&freeze_queue);
	spin_lock_irq(get_ccwdev_lock(cdev));
	INIT_LIST_HEAD(&requeue_queue);
	spin_lock_irq(get_ccwdev_lock(device->cdev));
	rc = 0;
	list_for_each_entry_safe(cqr, n, &device->ccw_queue, devlist) {
		/* Check status and move request to flush_queue */
@@ -3791,25 +3818,22 @@ int dasd_generic_pm_freeze(struct ccw_device *cdev)
				dev_err(&device->cdev->dev,
					"Unable to terminate request %p "
					"on suspend\n", cqr);
				spin_unlock_irq(get_ccwdev_lock(cdev));
				spin_unlock_irq(get_ccwdev_lock(device->cdev));
				dasd_put_device(device);
				return rc;
			}
		}
		list_move_tail(&cqr->devlist, &freeze_queue);
		list_move_tail(&cqr->devlist, &requeue_queue);
	}
	spin_unlock_irq(get_ccwdev_lock(cdev));
	spin_unlock_irq(get_ccwdev_lock(device->cdev));

	list_for_each_entry_safe(cqr, n, &freeze_queue, devlist) {
	list_for_each_entry_safe(cqr, n, &requeue_queue, devlist) {
		wait_event(dasd_flush_wq,
			   (cqr->status != DASD_CQR_CLEAR_PENDING));
		if (cqr->status == DASD_CQR_CLEARED)
			cqr->status = DASD_CQR_QUEUED;

		/* requeue requests to blocklayer will only work for
		   block device requests */
		if (_dasd_requeue_request(cqr))
			continue;
		/* mark sleepon requests as ended */
		if (cqr->callback_data == DASD_SLEEPON_START_TAG)
			cqr->callback_data = DASD_SLEEPON_END_TAG;

		/* remove requests from device and block queue */
		list_del_init(&cqr->devlist);
@@ -3821,6 +3845,14 @@ int dasd_generic_pm_freeze(struct ccw_device *cdev)
			dasd_free_erp_request(cqr, cqr->memdev);
			cqr = refers;
		}

		/*
		 * requeue requests to blocklayer will only work
		 * for block device requests
		 */
		if (_dasd_requeue_request(cqr))
			continue;

		if (cqr->block)
			list_del_init(&cqr->blocklist);
		cqr->block->base->discipline->free_cp(
@@ -3831,15 +3863,56 @@ int dasd_generic_pm_freeze(struct ccw_device *cdev)
	 * if requests remain then they are internal request
	 * and go back to the device queue
	 */
	if (!list_empty(&freeze_queue)) {
	if (!list_empty(&requeue_queue)) {
		/* move freeze_queue to start of the ccw_queue */
		spin_lock_irq(get_ccwdev_lock(cdev));
		list_splice_tail(&freeze_queue, &device->ccw_queue);
		spin_unlock_irq(get_ccwdev_lock(cdev));
		spin_lock_irq(get_ccwdev_lock(device->cdev));
		list_splice_tail(&requeue_queue, &device->ccw_queue);
		spin_unlock_irq(get_ccwdev_lock(device->cdev));
	}
	dasd_put_device(device);
	/* wake up generic waitqueue for eventually ended sleepon requests */
	wake_up(&generic_waitq);
	return rc;
}

/*
 * Workqueue callback: requeue all active requests of a device back to
 * the block layer and lift the "not accessible" stop so I/O can resume.
 */
static void do_requeue_requests(struct work_struct *work)
{
	struct dasd_device *device = container_of(work, struct dasd_device,
						  requeue_requests);
	dasd_generic_requeue_all_requests(device);
	/* allow new I/O again now that the requests were moved back */
	dasd_device_remove_stop_bits(device, DASD_STOPPED_NOT_ACC);
	if (device->block)
		dasd_schedule_block_bh(device->block);
	/* drop the reference taken when the work was scheduled */
	dasd_put_device(device);
}

/*
 * Schedule the requeue worker for a device.  Takes a device reference
 * that is released by do_requeue_requests(), or immediately if the
 * work was already pending.
 */
void dasd_schedule_requeue(struct dasd_device *device)
{
	dasd_get_device(device);
	/* queue call to do_requeue_requests to the kernel event daemon. */
	if (!schedule_work(&device->requeue_requests))
		dasd_put_device(device);
}
EXPORT_SYMBOL(dasd_schedule_requeue);

/*
 * PM callback: freeze a DASD device for suspend.
 *
 * Marks the device as suspended, runs the discipline's freeze hook (if
 * any), stops new I/O and finally requeues all active requests back to
 * the block layer via dasd_generic_requeue_all_requests().
 */
int dasd_generic_pm_freeze(struct ccw_device *cdev)
{
	struct dasd_device *device = dasd_device_from_cdev(cdev);

	if (IS_ERR(device))
		return PTR_ERR(device);

	/* mark device as suspended */
	set_bit(DASD_FLAG_SUSPENDED, &device->flags);

	/*
	 * The freeze hook's return value was stored in a local that was
	 * never read; keep ignoring it (previous behavior) but drop the
	 * set-but-unused variable.
	 */
	if (device->discipline->freeze)
		device->discipline->freeze(device);

	/* disallow new I/O  */
	dasd_device_set_stop_bits(device, DASD_STOPPED_PM);

	return dasd_generic_requeue_all_requests(device);
}
EXPORT_SYMBOL_GPL(dasd_generic_pm_freeze);

int dasd_generic_restore_device(struct ccw_device *cdev)
+46 −0
Original line number Diff line number Diff line
@@ -2208,6 +2208,51 @@ dasd_3990_erp_inspect_32(struct dasd_ccw_req * erp, char *sense)

}				/* end dasd_3990_erp_inspect_32 */

/*
 * Take the path given by the @lpum mask out of normal operation because
 * it exceeded the IFCC error threshold: remove it from the operational
 * path mask, record it in the IFCC path mask and reset its error state.
 * The last remaining operational path is never disabled.
 */
static void dasd_3990_erp_disable_path(struct dasd_device *device, __u8 lpum)
{
	int idx;

	/* refuse to disable the only operational path */
	if (!(dasd_path_get_opm(device) & ~lpum))
		return;

	idx = pathmask_to_pos(lpum);
	dev_err(&device->cdev->dev,
		"Path %x.%02x (pathmask %02x) is disabled - IFCC threshold exceeded\n",
		device->path[idx].cssid, device->path[idx].chpid, lpum);

	/* stop using the path and remember why it was removed */
	dasd_path_remove_opm(device, lpum);
	dasd_path_add_ifccpm(device, lpum);

	/* clear the per-path error state for a possible later re-enable */
	device->path[idx].errorclk = 0;
	atomic_set(&device->path[idx].error_count, 0);
}

/*
 * Account an IFCC/CCC error against the channel path that reported it.
 *
 * The failing path is taken from the last-path-used mask (lpum) of the
 * original request's IRB.  A per-path error counter is incremented; if
 * the previous error on this path is at least device->path_interval
 * seconds old, the counter is reset first.  Once the counter reaches
 * device->path_thrhld the path is disabled via
 * dasd_3990_erp_disable_path().  A threshold of 0 turns the accounting
 * off entirely.
 */
static void dasd_3990_erp_account_error(struct dasd_ccw_req *erp)
{
	struct dasd_device *device = erp->startdev;
	__u8 lpum = erp->refers->irb.esw.esw1.lpum;
	int pos = pathmask_to_pos(lpum);
	unsigned long long clk;

	/* path error accounting disabled */
	if (!device->path_thrhld)
		return;

	clk = get_tod_clock();
	/*
	 * check if the last error is longer ago than the timeout,
	 * if so reset error state
	 */
	if ((tod_to_ns(clk - device->path[pos].errorclk) / NSEC_PER_SEC)
	    >= device->path_interval) {
		atomic_set(&device->path[pos].error_count, 0);
		device->path[pos].errorclk = 0;
	}
	atomic_inc(&device->path[pos].error_count);
	device->path[pos].errorclk = clk;
	/* threshold exceeded disable path if possible */
	if (atomic_read(&device->path[pos].error_count) >=
	    device->path_thrhld)
		dasd_3990_erp_disable_path(device, lpum);
}

/*
 *****************************************************************************
 * main ERP control functions (24 and 32 byte sense)
@@ -2237,6 +2282,7 @@ dasd_3990_erp_control_check(struct dasd_ccw_req *erp)
					   | SCHN_STAT_CHN_CTRL_CHK)) {
		DBF_DEV_EVENT(DBF_WARNING, device, "%s",
			    "channel or interface control check");
		dasd_3990_erp_account_error(erp);
		erp = dasd_3990_erp_action_4(erp, NULL);
	}
	return erp;
+151 −6
Original line number Diff line number Diff line
@@ -977,10 +977,12 @@ dasd_access_show(struct device *dev, struct device_attribute *attr,
	if (IS_ERR(device))
		return PTR_ERR(device);

	if (device->discipline->host_access_count)
		count = device->discipline->host_access_count(device);
	else
	if (!device->discipline)
		count = -ENODEV;
	else if (!device->discipline->host_access_count)
		count = -EOPNOTSUPP;
	else
		count = device->discipline->host_access_count(device);

	dasd_put_device(device);
	if (count < 0)
@@ -1341,6 +1343,50 @@ dasd_timeout_store(struct device *dev, struct device_attribute *attr,
static DEVICE_ATTR(timeout, 0644,
		   dasd_timeout_show, dasd_timeout_store);


/*
 * sysfs store for "path_reset": reset the extended error state of the
 * path(s) selected by a hexadecimal path mask.  Input that fails to
 * parse as hex or exceeds 0xff is treated as a mask of 0.
 */
static ssize_t
dasd_path_reset_store(struct device *dev, struct device_attribute *attr,
		      const char *buf, size_t count)
{
	struct dasd_device *device;
	unsigned int pathmask = 0;
	unsigned int parsed;

	device = dasd_device_from_cdev(to_ccwdev(dev));
	if (IS_ERR(device))
		return -ENODEV;

	if (kstrtouint(buf, 16, &parsed) == 0 && parsed <= 0xff)
		pathmask = parsed;

	if (device->discipline && device->discipline->reset_path)
		device->discipline->reset_path(device, (__u8) pathmask);

	dasd_put_device(device);
	return count;
}

static DEVICE_ATTR(path_reset, 0200, NULL, dasd_path_reset_store);

/*
 * sysfs show for "hpf": report whether High Performance FICON is
 * enabled for the device.  Without a discipline or an hpf_enabled hook
 * the global dasd_nofcx setting is reported instead.
 */
static ssize_t dasd_hpf_show(struct device *dev, struct device_attribute *attr,
			     char *buf)
{
	struct dasd_device *device;
	int hpf = dasd_nofcx;

	device = dasd_device_from_cdev(to_ccwdev(dev));
	if (IS_ERR(device))
		return -ENODEV;
	if (device->discipline && device->discipline->hpf_enabled)
		hpf = device->discipline->hpf_enabled(device);
	dasd_put_device(device);
	return snprintf(buf, PAGE_SIZE, "%d\n", hpf);
}

static DEVICE_ATTR(hpf, 0444, dasd_hpf_show, NULL);

static ssize_t dasd_reservation_policy_show(struct device *dev,
					    struct device_attribute *attr,
					    char *buf)
@@ -1432,7 +1478,7 @@ static ssize_t dasd_pm_show(struct device *dev,
			      struct device_attribute *attr, char *buf)
{
	struct dasd_device *device;
	u8 opm, nppm, cablepm, cuirpm, hpfpm;
	u8 opm, nppm, cablepm, cuirpm, hpfpm, ifccpm;

	device = dasd_device_from_cdev(to_ccwdev(dev));
	if (IS_ERR(device))
@@ -1443,14 +1489,109 @@ static ssize_t dasd_pm_show(struct device *dev,
	cablepm = dasd_path_get_cablepm(device);
	cuirpm = dasd_path_get_cuirpm(device);
	hpfpm = dasd_path_get_hpfpm(device);
	ifccpm = dasd_path_get_ifccpm(device);
	dasd_put_device(device);

	return sprintf(buf, "%02x %02x %02x %02x %02x\n", opm, nppm,
		       cablepm, cuirpm, hpfpm);
	return sprintf(buf, "%02x %02x %02x %02x %02x %02x\n", opm, nppm,
		       cablepm, cuirpm, hpfpm, ifccpm);
}

static DEVICE_ATTR(path_masks, 0444, dasd_pm_show, NULL);

/*
 * threshold value for IFCC/CCC errors
 */
/*
 * sysfs show for "path_threshold": number of IFCC/CCC errors after
 * which a channel path is disabled.
 */
static ssize_t
dasd_path_threshold_show(struct device *dev,
			  struct device_attribute *attr, char *buf)
{
	struct dasd_device *device;
	ssize_t rc;

	device = dasd_device_from_cdev(to_ccwdev(dev));
	if (IS_ERR(device))
		return -ENODEV;
	/* read the value while still holding the device reference */
	rc = snprintf(buf, PAGE_SIZE, "%lu\n", device->path_thrhld);
	dasd_put_device(device);
	return rc;
}

/*
 * sysfs store for "path_threshold": set the IFCC/CCC error threshold.
 * Accepts a decimal value in [1, DASD_THRHLD_MAX]; anything else is
 * rejected with -EINVAL.
 */
static ssize_t
dasd_path_threshold_store(struct device *dev, struct device_attribute *attr,
			   const char *buf, size_t count)
{
	struct dasd_device *device;
	unsigned long flags;
	unsigned long val;

	device = dasd_device_from_cdev(to_ccwdev(dev));
	if (IS_ERR(device))
		return -ENODEV;

	if ((kstrtoul(buf, 10, &val) != 0) ||
	    (val > DASD_THRHLD_MAX) || val == 0) {
		dasd_put_device(device);
		return -EINVAL;
	}
	spin_lock_irqsave(get_ccwdev_lock(to_ccwdev(dev)), flags);
	/* val != 0 is guaranteed above, so no extra guard is needed here */
	device->path_thrhld = val;
	spin_unlock_irqrestore(get_ccwdev_lock(to_ccwdev(dev)), flags);
	dasd_put_device(device);
	return count;
}

static DEVICE_ATTR(path_threshold, 0644, dasd_path_threshold_show,
		   dasd_path_threshold_store);
/*
 * interval for IFCC/CCC checks
 * meaning time with no IFCC/CCC error before the error counter
 * gets reset
 */
/*
 * sysfs show for "path_interval": error-free time (in seconds) after
 * which the per-path IFCC/CCC error counter is reset.
 */
static ssize_t
dasd_path_interval_show(struct device *dev,
			struct device_attribute *attr, char *buf)
{
	struct dasd_device *device;
	ssize_t rc;

	device = dasd_device_from_cdev(to_ccwdev(dev));
	if (IS_ERR(device))
		return -ENODEV;
	/* read the value while still holding the device reference */
	rc = snprintf(buf, PAGE_SIZE, "%lu\n", device->path_interval);
	dasd_put_device(device);
	return rc;
}

/*
 * sysfs store for "path_interval": set the IFCC/CCC check interval.
 * Accepts a decimal value in [1, DASD_INTERVAL_MAX]; anything else is
 * rejected with -EINVAL.
 */
static ssize_t
dasd_path_interval_store(struct device *dev, struct device_attribute *attr,
	       const char *buf, size_t count)
{
	struct dasd_device *device;
	unsigned long flags;
	unsigned long val;

	device = dasd_device_from_cdev(to_ccwdev(dev));
	if (IS_ERR(device))
		return -ENODEV;

	if ((kstrtoul(buf, 10, &val) != 0) ||
	    (val > DASD_INTERVAL_MAX) || val == 0) {
		dasd_put_device(device);
		return -EINVAL;
	}
	spin_lock_irqsave(get_ccwdev_lock(to_ccwdev(dev)), flags);
	/* val != 0 is guaranteed above, so no extra guard is needed here */
	device->path_interval = val;
	spin_unlock_irqrestore(get_ccwdev_lock(to_ccwdev(dev)), flags);
	dasd_put_device(device);
	return count;
}

static DEVICE_ATTR(path_interval, 0644, dasd_path_interval_show,
		   dasd_path_interval_store);


static struct attribute * dasd_attrs[] = {
	&dev_attr_readonly.attr,
	&dev_attr_discipline.attr,
@@ -1471,6 +1612,10 @@ static struct attribute * dasd_attrs[] = {
	&dev_attr_safe_offline.attr,
	&dev_attr_host_access_count.attr,
	&dev_attr_path_masks.attr,
	&dev_attr_path_threshold.attr,
	&dev_attr_path_interval.attr,
	&dev_attr_path_reset.attr,
	&dev_attr_hpf.attr,
	NULL,
};

+146 −64

File changed.

Preview size limit exceeded, changes collapsed.

Loading