From fd76ee4da55abb21babfc69310d321b9cb9a32e0 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Wed, 14 Oct 2015 17:43:45 +0300 Subject: net/mlx5_core: Fix internal error detection conditions The detection of a fatal condition has been updated to take into account the state reported by the device or by detecting an all ones read of the firmware version which indicates that the device is not accessible. Signed-off-by: Eli Cohen Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- include/linux/mlx5/driver.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 41a32873f608..62b7d439813d 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -393,6 +393,7 @@ struct mlx5_core_health { struct timer_list timer; u32 prev; int miss_counter; + bool sick; struct workqueue_struct *wq; struct work_struct work; }; -- cgit v1.2.3 From 89d44f0a6c732db23b219be708e2fe1e03ee4842 Mon Sep 17 00:00:00 2001 From: Majd Dibbiny Date: Wed, 14 Oct 2015 17:43:46 +0300 Subject: net/mlx5_core: Add pci error handlers to mlx5_core driver This patch implement the pci_error_handlers for mlx5_core which allow the driver to recover from PCI error. Once an error is detected in the PCI, the mlx5_pci_err_detected is called and it: 1) Marks the device to be in 'Internal Error' state. 2) Dispatches an event to the mlx5_ib to flush all the outstanding cqes with error. 3) Returns all the on going commands with error. 4) Unloads the driver. Afterwards, the FW is reset and mlx5_pci_slot_reset is called and it enables the device and restore it's pci state. If the later succeeds, mlx5_pci_resume is called, and it loads the SW stack. Signed-off-by: Majd Dibbiny Signed-off-by: Eli Cohen Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- include/linux/mlx5/driver.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 62b7d439813d..9aba8d5139fa 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -487,8 +487,26 @@ struct mlx5_priv { spinlock_t ctx_lock; }; +enum mlx5_device_state { + MLX5_DEVICE_STATE_UP, + MLX5_DEVICE_STATE_INTERNAL_ERROR, +}; + +enum mlx5_interface_state { + MLX5_INTERFACE_STATE_DOWN, + MLX5_INTERFACE_STATE_UP, +}; + +enum mlx5_pci_status { + MLX5_PCI_STATUS_DISABLED, + MLX5_PCI_STATUS_ENABLED, +}; + struct mlx5_core_dev { struct pci_dev *pdev; + /* sync pci state */ + struct mutex pci_status_mutex; + enum mlx5_pci_status pci_status; u8 rev_id; char board_id[MLX5_BOARD_ID_LEN]; struct mlx5_cmd cmd; @@ -497,6 +515,10 @@ struct mlx5_core_dev { u32 hca_caps_max[MLX5_CAP_NUM][MLX5_UN_SZ_DW(hca_cap_union)]; phys_addr_t iseg_base; struct mlx5_init_seg __iomem *iseg; + enum mlx5_device_state state; + /* sync interface state */ + struct mutex intf_state_mutex; + enum mlx5_interface_state interface_state; void (*event) (struct mlx5_core_dev *dev, enum mlx5_dev_event event, unsigned long param); -- cgit v1.2.3 From e3297246c2c8cf8548ba722da3e3a8104cdcd035 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Wed, 14 Oct 2015 17:43:47 +0300 Subject: net/mlx5_core: Wait for FW readiness on startup On device initialization, wait till firmware indicates that that it is done with initialization before proceeding to initialize the device. Also update initialization segment layout to match driver/firmware interface definitions. Signed-off-by: Eli Cohen Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- include/linux/mlx5/device.h | 3 ++- include/linux/mlx5/driver.h | 5 +++++ 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 2a0b95662548..0b473cbfa7ef 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -439,7 +439,8 @@ struct mlx5_init_seg { __be32 cmdq_addr_h; __be32 cmdq_addr_l_sz; __be32 cmd_dbell; - __be32 rsvd1[121]; + __be32 rsvd1[120]; + __be32 initializing; struct health_buffer health; __be32 rsvd2[884]; __be32 health_counter; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 9aba8d5139fa..5c857f2a20d7 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -826,6 +826,11 @@ void mlx5_core_put_rsc(struct mlx5_core_rsc_common *common); int mlx5_query_odp_caps(struct mlx5_core_dev *dev, struct mlx5_odp_caps *odp_caps); +static inline int fw_initializing(struct mlx5_core_dev *dev) +{ + return ioread32be(&dev->iseg->initializing) >> 31; +} + static inline u32 mlx5_mkey_to_idx(u32 mkey) { return mkey >> 8; -- cgit v1.2.3 From 2b3ddf27f48c8061f0676c5a8796008099945280 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Wed, 14 Oct 2015 17:43:48 +0300 Subject: net/mlx4_core: Replace VF zero mac with random mac in mlx4_core By design, when no default MAC addresses are set in the Hypervisor for VFs, the VFs are passed zero-macs. When such a MAC is received by the VF, it generates a random MAC address and registers that MAC address with the Hypervisor. This random mac generation is currently done in the mlx4_en module. There is a problem, though, if the mlx4_ib module is loaded by a VF before the mlx4_en module. In this case, for RoCE, mlx4_ib will see the un-replaced zero-mac and register that zero-mac as part of QP1 initialization. Having a zero-mac in the port's MAC table creates problems for a Baseboard Management Console. The BMC occasionally sends packets with a zero-mac destination MAC. If there is a zero-mac present in the port's MAC table, the FW will send such BMC packets to the host driver rather than to the wire, and BMC will stop working. To address this problem, we move the replacement of zero-mac addresses with random-mac addresses to procedure mlx4_slave_cap(), which is part of the driver startup for VFs, and is before activation of mlx4_ib and mlx4_en. As a result, zero-mac addresses will never be registered in the port MAC table by the driver. In addition, when mlx4_en does initialize the net device, it needs to set the NET_ADDR_RANDOM flag in the netdev structure if the address was randomly generated. This is done so that udev on the VM does not create a new device name after each VF probe (VM boot and such). To accomplish this, we add a per-port flag in mlx4_dev which gets set whenever mlx4_core replaces a zero-mac with a randomly-generated mac. This flag is examined when mlx4_en initializes the net-device. Fix was suggested by Matan Barak Signed-off-by: Jack Morgenstein Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- include/linux/mlx4/device.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index baad4cb8e9b0..5a8677bafe04 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -833,6 +833,7 @@ struct mlx4_dev { struct mlx4_quotas quotas; struct radix_tree_root qp_table_tree; u8 rev_id; + u8 port_random_macs; char board_id[MLX4_BOARD_ID_LEN]; int numa_node; int oper_log_mgm_entry_size; -- cgit v1.2.3