From 5c4fb76af31e9dabcd132a0e69ed3799df1304c3 Mon Sep 17 00:00:00 2001 From: Mark Salyzyn Date: Tue, 17 Jan 2012 12:56:45 -0500 Subject: [SCSI] pm8001: fix lockup on phy_control hard reset. pm8001_phy_control PHY_FUNC_HARD_RESET locks up on second try via smp_phy_control because response HW_EVENT_PHY_START_STATUS fails to complete previous command. The PM8001F_RUN_TIME flag is not treated as a bit, but a state in all readers, yet once we are operational or in the run time state, the flags use a bit-set operation. Signed-off-by: Mark Salyzyn Acked-by: Jack Wang Signed-off-by: James Bottomley --- drivers/scsi/pm8001/pm8001_sas.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/scsi/pm8001/pm8001_sas.c') diff --git a/drivers/scsi/pm8001/pm8001_sas.c b/drivers/scsi/pm8001/pm8001_sas.c index fb3dc9978861..7ae22a67bd31 100644 --- a/drivers/scsi/pm8001/pm8001_sas.c +++ b/drivers/scsi/pm8001/pm8001_sas.c @@ -615,7 +615,7 @@ static int pm8001_dev_found_notify(struct domain_device *dev) wait_for_completion(&completion); if (dev->dev_type == SAS_END_DEV) msleep(50); - pm8001_ha->flags |= PM8001F_RUN_TIME ; + pm8001_ha->flags = PM8001F_RUN_TIME; return 0; found_out: spin_unlock_irqrestore(&pm8001_ha->lock, flags); -- cgit v1.2.3 From d95d00016f8f51dc502cadb263d861bd8c0212bb Mon Sep 17 00:00:00 2001 From: Mark Salyzyn Date: Tue, 17 Jan 2012 09:18:57 -0500 Subject: [SCSI] pm8001: Add FUNC_GET_EVENTS Jack noticed I dropped a patch fragment associated with a flags automatic variable in mpi_set_phys_g3_with_ssc (ooops) and that the pre-emptive locking that piggy-backed this patch was not in-fact necessary because of underlying atomic accesses to the hardware. Here is the updated patch fixing these two issues. The pm8001 driver is missing the FUNC_GET_EVENTS handler in the phy control function. Since the pm8001_bar4_shift function was not designed to be called at runtime, added locking surrounding the adjustment for all accesses. Signed-off-by: Mark Salyzyn Acked-by: Jack Wang Signed-off-by: James Bottomley --- drivers/scsi/pm8001/pm8001_sas.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) (limited to 'drivers/scsi/pm8001/pm8001_sas.c') diff --git a/drivers/scsi/pm8001/pm8001_sas.c b/drivers/scsi/pm8001/pm8001_sas.c index 7ae22a67bd31..ab0704e39040 100644 --- a/drivers/scsi/pm8001/pm8001_sas.c +++ b/drivers/scsi/pm8001/pm8001_sas.c @@ -166,6 +166,7 @@ int pm8001_phy_control(struct asd_sas_phy *sas_phy, enum phy_func func, struct pm8001_hba_info *pm8001_ha = NULL; struct sas_phy_linkrates *rates; DECLARE_COMPLETION_ONSTACK(completion); + unsigned long flags; pm8001_ha = sas_phy->ha->lldd_ha; pm8001_ha->phy[phy_id].enable_completion = &completion; switch (func) { @@ -209,8 +210,29 @@ int pm8001_phy_control(struct asd_sas_phy *sas_phy, enum phy_func func, case PHY_FUNC_DISABLE: PM8001_CHIP_DISP->phy_stop_req(pm8001_ha, phy_id); break; + case PHY_FUNC_GET_EVENTS: + spin_lock_irqsave(&pm8001_ha->lock, flags); + if (-1 == pm8001_bar4_shift(pm8001_ha, + (phy_id < 4) ? 0x30000 : 0x40000)) { + spin_unlock_irqrestore(&pm8001_ha->lock, flags); + return -EINVAL; + } + { + struct sas_phy *phy = sas_phy->phy; + uint32_t *qp = (uint32_t *)(((char *) + pm8001_ha->io_mem[2].memvirtaddr) + + 0x1034 + (0x4000 * (phy_id & 3))); + + phy->invalid_dword_count = qp[0]; + phy->running_disparity_error_count = qp[1]; + phy->loss_of_dword_sync_count = qp[3]; + phy->phy_reset_problem_count = qp[4]; + } + pm8001_bar4_shift(pm8001_ha, 0); + spin_unlock_irqrestore(&pm8001_ha->lock, flags); + return 0; default: - rc = -ENOSYS; + rc = -EOPNOTSUPP; } msleep(300); return rc; -- cgit v1.2.3 From 5954d7380f627371c4d8d7c59c08f9596aa2c674 Mon Sep 17 00:00:00 2001 From: Mark Salyzyn Date: Tue, 17 Jan 2012 11:52:24 -0500 Subject: [SCSI] pm8001: deficient responses to IO_XFER_ERROR_BREAK and IO_XFER_OPEN_RETRY_TIMEOUT IO_XFER_ERROR_BREAK and IO_XFER_OPEN_RETRY_TIMEOUT are deficient of the required actions as outlined in the programming manual for the pm8001. Due to the overlapping code requirements of these recovery responses, we found it necessary to bundle them together into one patch. When a break is received during the command phase (ssp_completion), this is a result of a timeout or interruption on the bus. Logic suggests that we should retry the command. When a break is received during the data-phase (ssp_event), the task must be aborted on the target or it will retain a data-phase lock turning the target reticent to all future media commands yet will successfully respond to TUR, INQUIRY and ABORT leading eventually to target failure through several abort-cycle loops. The open retry interval is exceedingly short resulting in occasional target drop-off during expander resets or when targets push-back during bad-block remapping. Increased effective timeout from 130ms to 1.5 seconds for each try so as to trigger after the administrative inquiry/tur timeout in the scsi subsystem to keep error-recovery harmonics to a minimum. When an open retry timeout event is received, the action required by the targets is to issue an abort for the outstanding command then logic suggests we retry the command as this state is usually an indication of a credit block or busy condition on the target. We hijacked the pm8001_handle_event work queue handler so that it will handle task as an argument instead of device for the workers in support of the deferred handling outlined above. Moderate to Heavy bad-path testing on a 2.6.32 vintage kernel, compile-testing on scsi-misc-2.6 kernel ... Signed-off-by: Mark Salyzyn Acked-by: Jack Wang Signed-off-by: James Bottomley --- drivers/scsi/pm8001/pm8001_sas.c | 72 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) (limited to 'drivers/scsi/pm8001/pm8001_sas.c') diff --git a/drivers/scsi/pm8001/pm8001_sas.c b/drivers/scsi/pm8001/pm8001_sas.c index ab0704e39040..9589fc941a8b 100644 --- a/drivers/scsi/pm8001/pm8001_sas.c +++ b/drivers/scsi/pm8001/pm8001_sas.c @@ -538,6 +538,7 @@ void pm8001_ccb_task_free(struct pm8001_hba_info *pm8001_ha, task->lldd_task = NULL; ccb->task = NULL; ccb->ccb_tag = 0xFFFFFFFF; + ccb->open_retry = 0; pm8001_ccb_free(pm8001_ha, ccb_idx); } @@ -882,6 +883,77 @@ static int pm8001_issue_ssp_tmf(struct domain_device *dev, tmf); } +/* retry commands by ha, by task and/or by device */ +void pm8001_open_reject_retry( + struct pm8001_hba_info *pm8001_ha, + struct sas_task *task_to_close, + struct pm8001_device *device_to_close) +{ + int i; + unsigned long flags; + + if (pm8001_ha == NULL) + return; + + spin_lock_irqsave(&pm8001_ha->lock, flags); + + for (i = 0; i < PM8001_MAX_CCB; i++) { + struct sas_task *task; + struct task_status_struct *ts; + struct pm8001_device *pm8001_dev; + unsigned long flags1; + u32 tag; + struct pm8001_ccb_info *ccb = &pm8001_ha->ccb_info[i]; + + pm8001_dev = ccb->device; + if (!pm8001_dev || (pm8001_dev->dev_type == NO_DEVICE)) + continue; + if (!device_to_close) { + uintptr_t d = (uintptr_t)pm8001_dev + - (uintptr_t)&pm8001_ha->devices; + if (((d % sizeof(*pm8001_dev)) != 0) + || ((d / sizeof(*pm8001_dev)) >= PM8001_MAX_DEVICES)) + continue; + } else if (pm8001_dev != device_to_close) + continue; + tag = ccb->ccb_tag; + if (!tag || (tag == 0xFFFFFFFF)) + continue; + task = ccb->task; + if (!task || !task->task_done) + continue; + if (task_to_close && (task != task_to_close)) + continue; + ts = &task->task_status; + ts->resp = SAS_TASK_COMPLETE; + /* Force the midlayer to retry */ + ts->stat = SAS_OPEN_REJECT; + ts->open_rej_reason = SAS_OREJ_RSVD_RETRY; + if (pm8001_dev) + pm8001_dev->running_req--; + spin_lock_irqsave(&task->task_state_lock, flags1); + task->task_state_flags &= ~SAS_TASK_STATE_PENDING; + task->task_state_flags &= ~SAS_TASK_AT_INITIATOR; + task->task_state_flags |= SAS_TASK_STATE_DONE; + if (unlikely((task->task_state_flags + & SAS_TASK_STATE_ABORTED))) { + spin_unlock_irqrestore(&task->task_state_lock, + flags1); + pm8001_ccb_task_free(pm8001_ha, task, ccb, tag); + } else { + spin_unlock_irqrestore(&task->task_state_lock, + flags1); + pm8001_ccb_task_free(pm8001_ha, task, ccb, tag); + mb();/* in order to force CPU ordering */ + spin_unlock_irqrestore(&pm8001_ha->lock, flags); + task->task_done(task); + spin_lock_irqsave(&pm8001_ha->lock, flags); + } + } + + spin_unlock_irqrestore(&pm8001_ha->lock, flags); +} + /** * Standard mandates link reset for ATA (type 0) and hard reset for * SSP (type 1) , only for RECOVERY -- cgit v1.2.3 From b1124cd3ec97406c767b90bf7e93ecd2d2915592 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 19 Dec 2011 16:42:34 -0800 Subject: [SCSI] libsas: introduce sas_drain_work() When an lldd invokes ->notify_port_event() it can trigger a chain of libsas events to: 1/ form the port and find the direct attached device 2/ if the attached device is an expander perform domain discovery A call to flush_workqueue() will only flush the initial port formation work. Currently libsas users need to call scsi_flush_work() up to the max depth of chain (which will grow from 2 to 3 when ata discovery is moved to its own discovery event). Instead of open coding multiple calls switch to use drain_workqueue() to flush sas work. drain_workqueue() does not handle new work submitted during the drain so libsas needs a bit of infrastructure to hold off unchained work submissions while a drain is in flight. A lldd ->notify() event is considered 'unchained' while a sas_discover_event() is 'chained'. As Tejun notes: "For now, I think it would be best to add private wrapper in libsas to support deferring unchained work items while draining." Signed-off-by: Dan Williams Signed-off-by: James Bottomley --- drivers/scsi/pm8001/pm8001_sas.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'drivers/scsi/pm8001/pm8001_sas.c') diff --git a/drivers/scsi/pm8001/pm8001_sas.c b/drivers/scsi/pm8001/pm8001_sas.c index 9589fc941a8b..50837933a1e5 100644 --- a/drivers/scsi/pm8001/pm8001_sas.c +++ b/drivers/scsi/pm8001/pm8001_sas.c @@ -256,12 +256,14 @@ void pm8001_scan_start(struct Scsi_Host *shost) int pm8001_scan_finished(struct Scsi_Host *shost, unsigned long time) { + struct sas_ha_struct *ha = SHOST_TO_SAS_HA(shost); + /* give the phy enabling interrupt event time to come in (1s * is empirically about all it takes) */ if (time < HZ) return 0; /* Wait for discovery to finish */ - scsi_flush_work(shost); + sas_drain_work(ha); return 1; } -- cgit v1.2.3 From 312d3e56119a4bc5c36a96818f87f650c069ddc2 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 17 Nov 2011 17:59:50 -0800 Subject: [SCSI] libsas: remove ata_port.lock management duties from lldds Each libsas driver (mvsas, pm8001, and isci) has invented a different method for managing the ap->lock. The lock is held by the ata ->queuecommand() path. mvsas drops it prior to acquiring any internal locks which allows it to hold its internal lock across calls to task->task_done(). This capability is important as it is the only way the driver can flush task->task_done() instances to guarantee that it no longer has any in-flight references to a domain_device at ->lldd_dev_gone() time. Signed-off-by: Dan Williams Signed-off-by: James Bottomley --- drivers/scsi/pm8001/pm8001_sas.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'drivers/scsi/pm8001/pm8001_sas.c') diff --git a/drivers/scsi/pm8001/pm8001_sas.c b/drivers/scsi/pm8001/pm8001_sas.c index 50837933a1e5..310860e37d98 100644 --- a/drivers/scsi/pm8001/pm8001_sas.c +++ b/drivers/scsi/pm8001/pm8001_sas.c @@ -364,7 +364,7 @@ static int pm8001_task_exec(struct sas_task *task, const int num, struct pm8001_ccb_info *ccb; u32 tag = 0xdeadbeef, rc, n_elem = 0; u32 n = num; - unsigned long flags = 0, flags_libsas = 0; + unsigned long flags = 0; if (!dev->port) { struct task_status_struct *tsm = &t->task_status; @@ -388,11 +388,7 @@ static int pm8001_task_exec(struct sas_task *task, const int num, ts->stat = SAS_PHY_DOWN; spin_unlock_irqrestore(&pm8001_ha->lock, flags); - spin_unlock_irqrestore(dev->sata_dev.ap->lock, - flags_libsas); t->task_done(t); - spin_lock_irqsave(dev->sata_dev.ap->lock, - flags_libsas); spin_lock_irqsave(&pm8001_ha->lock, flags); if (n > 1) t = list_entry(t->list.next, -- cgit v1.2.3 From f41a0c441c3fe43e79ebeb75584dbb5bfa83e5cd Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 21 Dec 2011 21:33:17 -0800 Subject: [SCSI] libsas: fix sas_find_local_phy(), take phy references In the direct-attached case this routine returns the phy on which this device was first discovered. Which is broken if we want to support wide-targets, as this phy reference can become stale even though the port is still active. In the expander-attached case this routine tries to lookup the phy by scanning the attached sas addresses of the parent expander, and BUG_ONs if it can't find it. However since eh and the libsas workqueue run independently we can still be attempting device recovery via eh after libsas has recorded the device as detached. This is even easier to hit now that eh is blocked while device domain rediscovery takes place, and that libata is fed more timed out commands increasing the chances that it will try to recover the ata device. Arrange for dev->phy to always point to a last known good phy, it may be stale after the port is torn down, but it will catch up for wide port reconfigurations, and never be NULL. Signed-off-by: Dan Williams Signed-off-by: James Bottomley --- drivers/scsi/pm8001/pm8001_sas.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'drivers/scsi/pm8001/pm8001_sas.c') diff --git a/drivers/scsi/pm8001/pm8001_sas.c b/drivers/scsi/pm8001/pm8001_sas.c index 310860e37d98..3b11edd4a50c 100644 --- a/drivers/scsi/pm8001/pm8001_sas.c +++ b/drivers/scsi/pm8001/pm8001_sas.c @@ -967,12 +967,14 @@ int pm8001_I_T_nexus_reset(struct domain_device *dev) pm8001_dev = dev->lldd_dev; pm8001_ha = pm8001_find_ha_by_dev(dev); - phy = sas_find_local_phy(dev); + phy = sas_get_local_phy(dev); if (dev_is_sata(dev)) { DECLARE_COMPLETION_ONSTACK(completion_setstate); - if (scsi_is_sas_phy_local(phy)) - return 0; + if (scsi_is_sas_phy_local(phy)) { + rc = 0; + goto out; + } rc = sas_phy_reset(phy, 1); msleep(2000); rc = pm8001_exec_internal_task_abort(pm8001_ha, pm8001_dev , @@ -981,12 +983,14 @@ int pm8001_I_T_nexus_reset(struct domain_device *dev) rc = PM8001_CHIP_DISP->set_dev_state_req(pm8001_ha, pm8001_dev, 0x01); wait_for_completion(&completion_setstate); - } else{ - rc = sas_phy_reset(phy, 1); - msleep(2000); + } else { + rc = sas_phy_reset(phy, 1); + msleep(2000); } PM8001_EH_DBG(pm8001_ha, pm8001_printk(" for device[%x]:rc=%d\n", pm8001_dev->device_id, rc)); + out: + sas_put_local_phy(phy); return rc; } @@ -998,10 +1002,11 @@ int pm8001_lu_reset(struct domain_device *dev, u8 *lun) struct pm8001_device *pm8001_dev = dev->lldd_dev; struct pm8001_hba_info *pm8001_ha = pm8001_find_ha_by_dev(dev); if (dev_is_sata(dev)) { - struct sas_phy *phy = sas_find_local_phy(dev); + struct sas_phy *phy = sas_get_local_phy(dev); rc = pm8001_exec_internal_task_abort(pm8001_ha, pm8001_dev , dev, 1, 0); rc = sas_phy_reset(phy, 1); + sas_put_local_phy(phy); rc = PM8001_CHIP_DISP->set_dev_state_req(pm8001_ha, pm8001_dev, 0x01); msleep(2000); -- cgit v1.2.3