From 5c4fb76af31e9dabcd132a0e69ed3799df1304c3 Mon Sep 17 00:00:00 2001
From: Mark Salyzyn <mark_salyzyn@xyratex.com>
Date: Tue, 17 Jan 2012 12:56:45 -0500
Subject: [SCSI] pm8001: fix lockup on phy_control hard reset.

pm8001_phy_control PHY_FUNC_HARD_RESET locks up on second try via
smp_phy_control because response HW_EVENT_PHY_START_STATUS fails to complete
previous command. The PM8001F_RUN_TIME flag is not treated as a bit, but a
state in all readers, yet once we are operational or in the run time state,
the flags use a bit-set operation.

Signed-off-by: Mark Salyzyn <mark_salyzyn@xyratex.com>
Acked-by: Jack Wang <jack_wang@usish.com>
Signed-off-by: James Bottomley <JBottomley@Parallels.com>
---
 drivers/scsi/pm8001/pm8001_sas.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/scsi/pm8001/pm8001_sas.c')

diff --git a/drivers/scsi/pm8001/pm8001_sas.c b/drivers/scsi/pm8001/pm8001_sas.c
index fb3dc9978861..7ae22a67bd31 100644
--- a/drivers/scsi/pm8001/pm8001_sas.c
+++ b/drivers/scsi/pm8001/pm8001_sas.c
@@ -615,7 +615,7 @@ static int pm8001_dev_found_notify(struct domain_device *dev)
 	wait_for_completion(&completion);
 	if (dev->dev_type == SAS_END_DEV)
 		msleep(50);
-	pm8001_ha->flags |= PM8001F_RUN_TIME ;
+	pm8001_ha->flags = PM8001F_RUN_TIME;
 	return 0;
 found_out:
 	spin_unlock_irqrestore(&pm8001_ha->lock, flags);
-- 
cgit v1.2.3


From d95d00016f8f51dc502cadb263d861bd8c0212bb Mon Sep 17 00:00:00 2001
From: Mark Salyzyn <mark_salyzyn@xyratex.com>
Date: Tue, 17 Jan 2012 09:18:57 -0500
Subject: [SCSI] pm8001: Add FUNC_GET_EVENTS

Jack noticed I dropped a patch fragment associated with a flags automatic
variable in mpi_set_phys_g3_with_ssc (ooops) and that the pre-emptive locking
that piggy-backed this patch was not in-fact necessary because of underlying
atomic accesses to the hardware. Here is the updated patch fixing these two
issues.

The pm8001 driver is missing the FUNC_GET_EVENTS handler in the phy control
function. Since the pm8001_bar4_shift function was not designed to be called
at runtime, added locking surrounding the adjustment for all accesses.

Signed-off-by: Mark Salyzyn <mark_salyzyn@xyratex.com>
Acked-by: Jack Wang <jack_wang@usish.com>
Signed-off-by: James Bottomley <JBottomley@Parallels.com>
---
 drivers/scsi/pm8001/pm8001_sas.c | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

(limited to 'drivers/scsi/pm8001/pm8001_sas.c')

diff --git a/drivers/scsi/pm8001/pm8001_sas.c b/drivers/scsi/pm8001/pm8001_sas.c
index 7ae22a67bd31..ab0704e39040 100644
--- a/drivers/scsi/pm8001/pm8001_sas.c
+++ b/drivers/scsi/pm8001/pm8001_sas.c
@@ -166,6 +166,7 @@ int pm8001_phy_control(struct asd_sas_phy *sas_phy, enum phy_func func,
 	struct pm8001_hba_info *pm8001_ha = NULL;
 	struct sas_phy_linkrates *rates;
 	DECLARE_COMPLETION_ONSTACK(completion);
+	unsigned long flags;
 	pm8001_ha = sas_phy->ha->lldd_ha;
 	pm8001_ha->phy[phy_id].enable_completion = &completion;
 	switch (func) {
@@ -209,8 +210,29 @@ int pm8001_phy_control(struct asd_sas_phy *sas_phy, enum phy_func func,
 	case PHY_FUNC_DISABLE:
 		PM8001_CHIP_DISP->phy_stop_req(pm8001_ha, phy_id);
 		break;
+	case PHY_FUNC_GET_EVENTS:
+		spin_lock_irqsave(&pm8001_ha->lock, flags);
+		if (-1 == pm8001_bar4_shift(pm8001_ha,
+					(phy_id < 4) ? 0x30000 : 0x40000)) {
+			spin_unlock_irqrestore(&pm8001_ha->lock, flags);
+			return -EINVAL;
+		}
+		{
+			struct sas_phy *phy = sas_phy->phy;
+			uint32_t *qp = (uint32_t *)(((char *)
+				pm8001_ha->io_mem[2].memvirtaddr)
+				+ 0x1034 + (0x4000 * (phy_id & 3)));
+
+			phy->invalid_dword_count = qp[0];
+			phy->running_disparity_error_count = qp[1];
+			phy->loss_of_dword_sync_count = qp[3];
+			phy->phy_reset_problem_count = qp[4];
+		}
+		pm8001_bar4_shift(pm8001_ha, 0);
+		spin_unlock_irqrestore(&pm8001_ha->lock, flags);
+		return 0;
 	default:
-		rc = -ENOSYS;
+		rc = -EOPNOTSUPP;
 	}
 	msleep(300);
 	return rc;
-- 
cgit v1.2.3


From 5954d7380f627371c4d8d7c59c08f9596aa2c674 Mon Sep 17 00:00:00 2001
From: Mark Salyzyn <mark_salyzyn@xyratex.com>
Date: Tue, 17 Jan 2012 11:52:24 -0500
Subject: [SCSI] pm8001: deficient responses to IO_XFER_ERROR_BREAK and
 IO_XFER_OPEN_RETRY_TIMEOUT

IO_XFER_ERROR_BREAK and IO_XFER_OPEN_RETRY_TIMEOUT are deficient of the
required actions as outlined in the programming manual for the pm8001. Due to
the overlapping code requirements of these recovery responses, we found it
necessary to bundle them together into one patch.

When a break is received during the command phase (ssp_completion), this is a
result of a timeout or interruption on the bus. Logic suggests that we should
retry the command.

When a break is received during the data-phase (ssp_event), the task must be
aborted on the target or it will retain a data-phase lock turning the target
reticent to all future media commands yet will successfully respond to TUR,
INQUIRY and ABORT leading eventually to target failure through several
abort-cycle loops.

The open retry interval is exceedingly short resulting in occasional target
drop-off during expander resets or when targets push-back during bad-block
remapping. Increased effective timeout from 130ms to 1.5 seconds for each try
so as to trigger after the administrative inquiry/tur timeout in the scsi
subsystem to keep error-recovery harmonics to a minimum.

When an open retry timeout event is received, the action required by the
targets is to issue an abort for the outstanding command then logic suggests
we retry the command as this state is usually an indication of a credit block
or busy condition on the target.

We hijacked the pm8001_handle_event work queue handler so that it will handle
task as an argument instead of device for the workers in support of the
deferred handling outlined above.

Moderate to Heavy bad-path testing on a 2.6.32 vintage kernel, compile-testing
on scsi-misc-2.6 kernel ...

Signed-off-by: Mark Salyzyn <mark_salyzyn@xyratex.com>
Acked-by: Jack Wang <jack_wang@usish.com>
Signed-off-by: James Bottomley <JBottomley@Parallels.com>
---
 drivers/scsi/pm8001/pm8001_sas.c | 72 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 72 insertions(+)

(limited to 'drivers/scsi/pm8001/pm8001_sas.c')

diff --git a/drivers/scsi/pm8001/pm8001_sas.c b/drivers/scsi/pm8001/pm8001_sas.c
index ab0704e39040..9589fc941a8b 100644
--- a/drivers/scsi/pm8001/pm8001_sas.c
+++ b/drivers/scsi/pm8001/pm8001_sas.c
@@ -538,6 +538,7 @@ void pm8001_ccb_task_free(struct pm8001_hba_info *pm8001_ha,
 	task->lldd_task = NULL;
 	ccb->task = NULL;
 	ccb->ccb_tag = 0xFFFFFFFF;
+	ccb->open_retry = 0;
 	pm8001_ccb_free(pm8001_ha, ccb_idx);
 }
 
@@ -882,6 +883,77 @@ static int pm8001_issue_ssp_tmf(struct domain_device *dev,
 		tmf);
 }
 
+/* retry commands by ha, by task and/or by device */
+void pm8001_open_reject_retry(
+	struct pm8001_hba_info *pm8001_ha,
+	struct sas_task *task_to_close,
+	struct pm8001_device *device_to_close)
+{
+	int i;
+	unsigned long flags;
+
+	if (pm8001_ha == NULL)
+		return;
+
+	spin_lock_irqsave(&pm8001_ha->lock, flags);
+
+	for (i = 0; i < PM8001_MAX_CCB; i++) {
+		struct sas_task *task;
+		struct task_status_struct *ts;
+		struct pm8001_device *pm8001_dev;
+		unsigned long flags1;
+		u32 tag;
+		struct pm8001_ccb_info *ccb = &pm8001_ha->ccb_info[i];
+
+		pm8001_dev = ccb->device;
+		if (!pm8001_dev || (pm8001_dev->dev_type == NO_DEVICE))
+			continue;
+		if (!device_to_close) {
+			uintptr_t d = (uintptr_t)pm8001_dev
+					- (uintptr_t)&pm8001_ha->devices;
+			if (((d % sizeof(*pm8001_dev)) != 0)
+			 || ((d / sizeof(*pm8001_dev)) >= PM8001_MAX_DEVICES))
+				continue;
+		} else if (pm8001_dev != device_to_close)
+			continue;
+		tag = ccb->ccb_tag;
+		if (!tag || (tag == 0xFFFFFFFF))
+			continue;
+		task = ccb->task;
+		if (!task || !task->task_done)
+			continue;
+		if (task_to_close && (task != task_to_close))
+			continue;
+		ts = &task->task_status;
+		ts->resp = SAS_TASK_COMPLETE;
+		/* Force the midlayer to retry */
+		ts->stat = SAS_OPEN_REJECT;
+		ts->open_rej_reason = SAS_OREJ_RSVD_RETRY;
+		if (pm8001_dev)
+			pm8001_dev->running_req--;
+		spin_lock_irqsave(&task->task_state_lock, flags1);
+		task->task_state_flags &= ~SAS_TASK_STATE_PENDING;
+		task->task_state_flags &= ~SAS_TASK_AT_INITIATOR;
+		task->task_state_flags |= SAS_TASK_STATE_DONE;
+		if (unlikely((task->task_state_flags
+				& SAS_TASK_STATE_ABORTED))) {
+			spin_unlock_irqrestore(&task->task_state_lock,
+				flags1);
+			pm8001_ccb_task_free(pm8001_ha, task, ccb, tag);
+		} else {
+			spin_unlock_irqrestore(&task->task_state_lock,
+				flags1);
+			pm8001_ccb_task_free(pm8001_ha, task, ccb, tag);
+			mb();/* in order to force CPU ordering */
+			spin_unlock_irqrestore(&pm8001_ha->lock, flags);
+			task->task_done(task);
+			spin_lock_irqsave(&pm8001_ha->lock, flags);
+		}
+	}
+
+	spin_unlock_irqrestore(&pm8001_ha->lock, flags);
+}
+
 /**
   * Standard mandates link reset for ATA  (type 0) and hard reset for
   * SSP (type 1) , only for RECOVERY
-- 
cgit v1.2.3


From b1124cd3ec97406c767b90bf7e93ecd2d2915592 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Mon, 19 Dec 2011 16:42:34 -0800
Subject: [SCSI] libsas: introduce sas_drain_work()

When an lldd invokes ->notify_port_event() it can trigger a chain of libsas
events to:

  1/ form the port and find the direct attached device

  2/ if the attached device is an expander perform domain discovery

A call to flush_workqueue() will only flush the initial port formation work.
Currently libsas users need to call scsi_flush_work() up to the max depth of
chain (which will grow from 2 to 3 when ata discovery is moved to its own
discovery event).  Instead of open coding multiple calls switch to use
drain_workqueue() to flush sas work.

drain_workqueue() does not handle new work submitted during the drain so
libsas needs a bit of infrastructure to hold off unchained work submissions
while a drain is in flight.  A lldd ->notify() event is considered 'unchained'
while a sas_discover_event() is 'chained'.  As Tejun notes:

  "For now, I think it would be best to add private wrapper in libsas to
   support deferring unchained work items while draining."

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: James Bottomley <JBottomley@Parallels.com>
---
 drivers/scsi/pm8001/pm8001_sas.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'drivers/scsi/pm8001/pm8001_sas.c')

diff --git a/drivers/scsi/pm8001/pm8001_sas.c b/drivers/scsi/pm8001/pm8001_sas.c
index 9589fc941a8b..50837933a1e5 100644
--- a/drivers/scsi/pm8001/pm8001_sas.c
+++ b/drivers/scsi/pm8001/pm8001_sas.c
@@ -256,12 +256,14 @@ void pm8001_scan_start(struct Scsi_Host *shost)
 
 int pm8001_scan_finished(struct Scsi_Host *shost, unsigned long time)
 {
+	struct sas_ha_struct *ha = SHOST_TO_SAS_HA(shost);
+
 	/* give the phy enabling interrupt event time to come in (1s
 	* is empirically about all it takes) */
 	if (time < HZ)
 		return 0;
 	/* Wait for discovery to finish */
-	scsi_flush_work(shost);
+	sas_drain_work(ha);
 	return 1;
 }
 
-- 
cgit v1.2.3


From 312d3e56119a4bc5c36a96818f87f650c069ddc2 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 17 Nov 2011 17:59:50 -0800
Subject: [SCSI] libsas: remove ata_port.lock management duties from lldds

Each libsas driver (mvsas, pm8001, and isci) has invented a different
method for managing the ap->lock.  The lock is held by the ata
->queuecommand() path.  mvsas drops it prior to acquiring any internal
locks which allows it to hold its internal lock across calls to
task->task_done().  This capability is important as it is the only way
the driver can flush task->task_done() instances to guarantee that it no
longer has any in-flight references to a domain_device at
->lldd_dev_gone() time.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: James Bottomley <JBottomley@Parallels.com>
---
 drivers/scsi/pm8001/pm8001_sas.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'drivers/scsi/pm8001/pm8001_sas.c')

diff --git a/drivers/scsi/pm8001/pm8001_sas.c b/drivers/scsi/pm8001/pm8001_sas.c
index 50837933a1e5..310860e37d98 100644
--- a/drivers/scsi/pm8001/pm8001_sas.c
+++ b/drivers/scsi/pm8001/pm8001_sas.c
@@ -364,7 +364,7 @@ static int pm8001_task_exec(struct sas_task *task, const int num,
 	struct pm8001_ccb_info *ccb;
 	u32 tag = 0xdeadbeef, rc, n_elem = 0;
 	u32 n = num;
-	unsigned long flags = 0, flags_libsas = 0;
+	unsigned long flags = 0;
 
 	if (!dev->port) {
 		struct task_status_struct *tsm = &t->task_status;
@@ -388,11 +388,7 @@ static int pm8001_task_exec(struct sas_task *task, const int num,
 				ts->stat = SAS_PHY_DOWN;
 
 				spin_unlock_irqrestore(&pm8001_ha->lock, flags);
-				spin_unlock_irqrestore(dev->sata_dev.ap->lock,
-						flags_libsas);
 				t->task_done(t);
-				spin_lock_irqsave(dev->sata_dev.ap->lock,
-					flags_libsas);
 				spin_lock_irqsave(&pm8001_ha->lock, flags);
 				if (n > 1)
 					t = list_entry(t->list.next,
-- 
cgit v1.2.3


From f41a0c441c3fe43e79ebeb75584dbb5bfa83e5cd Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Wed, 21 Dec 2011 21:33:17 -0800
Subject: [SCSI] libsas: fix sas_find_local_phy(), take phy references

In the direct-attached case this routine returns the phy on which this
device was first discovered.  Which is broken if we want to support
wide-targets, as this phy reference can become stale even though the
port is still active.

In the expander-attached case this routine tries to lookup the phy by
scanning the attached sas addresses of the parent expander, and BUG_ONs
if it can't find it.  However since eh and the libsas workqueue run
independently we can still be attempting device recovery via eh after
libsas has recorded the device as detached.  This is even easier to hit
now that eh is blocked while device domain rediscovery takes place, and
that libata is fed more timed out commands increasing the chances that
it will try to recover the ata device.

Arrange for dev->phy to always point to a last known good phy, it may be
stale after the port is torn down, but it will catch up for wide port
reconfigurations, and never be NULL.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: James Bottomley <JBottomley@Parallels.com>
---
 drivers/scsi/pm8001/pm8001_sas.c | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

(limited to 'drivers/scsi/pm8001/pm8001_sas.c')

diff --git a/drivers/scsi/pm8001/pm8001_sas.c b/drivers/scsi/pm8001/pm8001_sas.c
index 310860e37d98..3b11edd4a50c 100644
--- a/drivers/scsi/pm8001/pm8001_sas.c
+++ b/drivers/scsi/pm8001/pm8001_sas.c
@@ -967,12 +967,14 @@ int pm8001_I_T_nexus_reset(struct domain_device *dev)
 
 	pm8001_dev = dev->lldd_dev;
 	pm8001_ha = pm8001_find_ha_by_dev(dev);
-	phy = sas_find_local_phy(dev);
+	phy = sas_get_local_phy(dev);
 
 	if (dev_is_sata(dev)) {
 		DECLARE_COMPLETION_ONSTACK(completion_setstate);
-		if (scsi_is_sas_phy_local(phy))
-			return 0;
+		if (scsi_is_sas_phy_local(phy)) {
+			rc = 0;
+			goto out;
+		}
 		rc = sas_phy_reset(phy, 1);
 		msleep(2000);
 		rc = pm8001_exec_internal_task_abort(pm8001_ha, pm8001_dev ,
@@ -981,12 +983,14 @@ int pm8001_I_T_nexus_reset(struct domain_device *dev)
 		rc = PM8001_CHIP_DISP->set_dev_state_req(pm8001_ha,
 			pm8001_dev, 0x01);
 		wait_for_completion(&completion_setstate);
-	} else{
-	rc = sas_phy_reset(phy, 1);
-	msleep(2000);
+	} else {
+		rc = sas_phy_reset(phy, 1);
+		msleep(2000);
 	}
 	PM8001_EH_DBG(pm8001_ha, pm8001_printk(" for device[%x]:rc=%d\n",
 		pm8001_dev->device_id, rc));
+ out:
+	sas_put_local_phy(phy);
 	return rc;
 }
 
@@ -998,10 +1002,11 @@ int pm8001_lu_reset(struct domain_device *dev, u8 *lun)
 	struct pm8001_device *pm8001_dev = dev->lldd_dev;
 	struct pm8001_hba_info *pm8001_ha = pm8001_find_ha_by_dev(dev);
 	if (dev_is_sata(dev)) {
-		struct sas_phy *phy = sas_find_local_phy(dev);
+		struct sas_phy *phy = sas_get_local_phy(dev);
 		rc = pm8001_exec_internal_task_abort(pm8001_ha, pm8001_dev ,
 			dev, 1, 0);
 		rc = sas_phy_reset(phy, 1);
+		sas_put_local_phy(phy);
 		rc = PM8001_CHIP_DISP->set_dev_state_req(pm8001_ha,
 			pm8001_dev, 0x01);
 		msleep(2000);
-- 
cgit v1.2.3