net: hns3: fix to stop multiple HNS reset due to the AER changes
author Shiju Jose <shiju.jose@huawei.com>
Sun, 10 Mar 2019 06:47:51 +0000 (14:47 +0800)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Tue, 19 Mar 2019 12:10:55 +0000 (13:10 +0100)
[ Upstream commit 69b51bbb03f73e04c486f79d1556b2d9becf4dbc ]

The commit bfcb79fca19d
("PCI/ERR: Run error recovery callbacks for all affected devices")
affected the non-fatal error recovery logic for the HNS and RDMA devices.
This is because each HNS PF under the PCIe bus receives callbacks
from the AER driver when an error is reported for one of the PFs.
This causes unwanted PF resets because
the HNS decides which PF to reset based on the reset type set.
The HNS error handling code sets the reset type based on the hw error
type detected.

This patch provides a fix for the above issue for the recovery of
the hw errors in the HNS and RDMA devices.

This patch needs backporting to the kernel v5.0+

Fixes: 332fbf576579 ("net: hns3: add handling of hw ras errors using new set of commands")
Reported-by: Xiaofei Tan <tanxiaofei@huawei.com>
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
Signed-off-by: Huazhong Tan <tanhuazhong@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
drivers/net/ethernet/hisilicon/hns3/hnae3.h
drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c

index 36eab37d8a403c468a1b04ff27931891d243113e..09c774fe885370d6b77b9d98d5d304f89e9edf38 100644 (file)
@@ -192,6 +192,7 @@ struct hnae3_ae_dev {
        const struct hnae3_ae_ops *ops;
        struct list_head node;
        u32 flag;
+       u8 override_pci_need_reset; /* fix to stop multiple reset happening */
        enum hnae3_dev_type dev_type;
        enum hnae3_reset_type reset_type;
        void *priv;
index 882d2d2acdfa58ca106553560e2d2831a3929550..d84c50068f661056d2922cd605935435b0bc230b 100644 (file)
@@ -1852,7 +1852,9 @@ static pci_ers_result_t hns3_slot_reset(struct pci_dev *pdev)
 
        /* request the reset */
        if (ae_dev->ops->reset_event) {
-               ae_dev->ops->reset_event(pdev, NULL);
+               if (!ae_dev->override_pci_need_reset)
+                       ae_dev->ops->reset_event(pdev, NULL);
+
                return PCI_ERS_RESULT_RECOVERED;
        }
 
index d0f654123b9b549c0588b059f80722983a3ad01c..efb6c1a25171aff5dcc9c59558e6e1b7da569424 100644 (file)
@@ -1259,8 +1259,10 @@ pci_ers_result_t hclge_handle_hw_ras_error(struct hnae3_ae_dev *ae_dev)
                hclge_handle_all_ras_errors(hdev);
        } else {
                if (test_bit(HCLGE_STATE_RST_HANDLING, &hdev->state) ||
-                   hdev->pdev->revision < 0x21)
+                   hdev->pdev->revision < 0x21) {
+                       ae_dev->override_pci_need_reset = 1;
                        return PCI_ERS_RESULT_RECOVERED;
+               }
        }
 
        if (status & HCLGE_RAS_REG_ROCEE_ERR_MASK) {
@@ -1269,8 +1271,11 @@ pci_ers_result_t hclge_handle_hw_ras_error(struct hnae3_ae_dev *ae_dev)
        }
 
        if (status & HCLGE_RAS_REG_NFE_MASK ||
-           status & HCLGE_RAS_REG_ROCEE_ERR_MASK)
+           status & HCLGE_RAS_REG_ROCEE_ERR_MASK) {
+               ae_dev->override_pci_need_reset = 0;
                return PCI_ERS_RESULT_NEED_RESET;
+       }
+       ae_dev->override_pci_need_reset = 1;
 
        return PCI_ERS_RESULT_RECOVERED;
 }