Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 8b9d8baa, authored by Alex Vesker, committed by Saeed Mahameed
Browse files

net/mlx5: Add Crdump support



Crdump allows the driver to retrieve a dump of the FW PCI crspace.
This is useful in case of catastrophic issues which may require a FW
reset. The crspace dump can then be used for later debugging.

Signed-off-by: Alex Vesker <valex@mellanox.com>
Signed-off-by: Moshe Shemesh <moshe@mellanox.com>
Reviewed-by: Feras Daoud <ferasda@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
parent b25bbc2f
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -16,7 +16,7 @@ mlx5_core-y := main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \
		transobj.o vport.o sriov.o fs_cmd.o fs_core.o \
		fs_counters.o rl.o lag.o dev.o events.o wq.o lib/gid.o \
		lib/devcom.o lib/pci_vsc.o diag/fs_tracepoint.o \
		diag/fw_tracer.o devlink.o
		diag/fw_tracer.o diag/crdump.o devlink.o

#
# Netdev basic
+106 −0
Original line number Diff line number Diff line
// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
/* Copyright (c) 2019 Mellanox Technologies */

#include <linux/mlx5/driver.h>
#include "mlx5_core.h"
#include "lib/pci_vsc.h"
#include "lib/mlx5.h"

#define BAD_ACCESS			0xBADACCE5
#define MLX5_PROTECTED_CR_SCAN_CRSPACE	0x7

/* Crdump counts as enabled once a non-zero crspace size has been recorded. */
static bool mlx5_crdump_enabled(struct mlx5_core_dev *dev)
{
	return dev->priv.health.crdump_size != 0;
}

/*
 * Read crdump_size bytes through the VSC gateway into cr_data.
 *
 * Every 32-bit word of the buffer is first seeded with BAD_ACCESS, then the
 * gateway block read is issued over the same buffer.
 *
 * Returns 0 when exactly crdump_size bytes were read, -EIO when the read
 * reported zero bytes, -EINVAL on a short read, or the negative error
 * returned by the gateway read.
 */
static int mlx5_crdump_fill(struct mlx5_core_dev *dev, u32 *cr_data)
{
	u32 size = dev->priv.health.crdump_size;
	u32 nwords = size / 4;
	u32 idx;
	int nread;

	for (idx = 0; idx < nwords; idx++)
		cr_data[idx] = BAD_ACCESS;

	nread = mlx5_vsc_gw_read_block_fast(dev, cr_data, size);
	if (nread < 0)
		return nread;
	if (nread == 0)
		return -EIO;

	if (size != nread) {
		mlx5_core_warn(dev, "failed to read full dump, read %d out of %u\n",
			       nread, size);
		return -EINVAL;
	}

	return 0;
}

/*
 * Collect a crspace dump into cr_data.
 *
 * Takes the VSC gateway lock, selects the scan-crspace space and fills the
 * buffer; the gateway is unlocked again whether or not those steps succeed.
 *
 * Returns 0 on success, -ENODEV when crdump is not enabled on this device,
 * or the error from locking, space selection or the fill itself.
 */
int mlx5_crdump_collect(struct mlx5_core_dev *dev, u32 *cr_data)
{
	int err;

	if (!mlx5_crdump_enabled(dev))
		return -ENODEV;

	err = mlx5_vsc_gw_lock(dev);
	if (err) {
		mlx5_core_warn(dev, "crdump: failed to lock vsc gw err %d\n",
			       err);
		return err;
	}

	err = mlx5_vsc_gw_set_space(dev, MLX5_VSC_SPACE_SCAN_CRSPACE, NULL);
	if (!err)
		err = mlx5_crdump_fill(dev, cr_data);

	/* The unlock result is intentionally not propagated here */
	mlx5_vsc_gw_unlock(dev);
	return err;
}

/*
 * Probe for crdump support and record the crspace size.
 *
 * Nothing is done (and 0 is returned) when the device is not a PF, the VSC
 * is not accessible, or crdump is already enabled. Otherwise the scan-crspace
 * space is queried under the VSC gateway lock and, on success, its size is
 * stored in the health state.
 *
 * Returns 0 on success or when the space is not supported, -EINVAL when the
 * reported size is zero, or the error from locking/unlocking the gateway.
 */
int mlx5_crdump_enable(struct mlx5_core_dev *dev)
{
	u32 crspace_size;
	int err;

	if (!mlx5_core_is_pf(dev))
		return 0;
	if (!mlx5_vsc_accessible(dev))
		return 0;
	if (mlx5_crdump_enabled(dev))
		return 0;

	err = mlx5_vsc_gw_lock(dev);
	if (err)
		return err;

	/* Selecting the space both checks for support and yields its size */
	err = mlx5_vsc_gw_set_space(dev, MLX5_VSC_SPACE_SCAN_CRSPACE,
				    &crspace_size);
	if (err) {
		/* Space is not supported: release the gateway, mask the error */
		mlx5_vsc_gw_unlock(dev);
		return 0;
	}

	if (crspace_size) {
		/* Only publish the size once the gateway was released cleanly */
		err = mlx5_vsc_gw_unlock(dev);
		if (!err)
			dev->priv.health.crdump_size = crspace_size;
		return err;
	}

	mlx5_core_warn(dev, "Invalid Crspace size, zero\n");
	mlx5_vsc_gw_unlock(dev);
	return -EINVAL;
}

void mlx5_crdump_disable(struct mlx5_core_dev *dev)
{
	dev->priv.health.crdump_size = 0;
}
+3 −0
Original line number Diff line number Diff line
@@ -41,6 +41,9 @@ int mlx5_core_reserve_gids(struct mlx5_core_dev *dev, unsigned int count);
void mlx5_core_unreserve_gids(struct mlx5_core_dev *dev, unsigned int count);
int  mlx5_core_reserved_gid_alloc(struct mlx5_core_dev *dev, int *gid_index);
void mlx5_core_reserved_gid_free(struct mlx5_core_dev *dev, int gid_index);
/* Probe crspace dump support and record the dump size; returns 0 when the
 * device is not a PF, the VSC is inaccessible, crdump is already enabled or
 * the space is unsupported, and -EINVAL when the reported size is zero.
 */
int mlx5_crdump_enable(struct mlx5_core_dev *dev);
/* Clear the recorded dump size, after which collection returns -ENODEV. */
void mlx5_crdump_disable(struct mlx5_core_dev *dev);
/* Read the crspace dump into cr_data under the VSC gateway lock; cr_data is
 * written as 32-bit words covering the recorded dump size in bytes.
 */
int mlx5_crdump_collect(struct mlx5_core_dev *dev, u32 *cr_data);

/* TODO move to lib/events.h */

+5 −0
Original line number Diff line number Diff line
@@ -1313,6 +1313,10 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *id)
	if (err)
		goto clean_load;

	err = mlx5_crdump_enable(dev);
	if (err)
		dev_err(&pdev->dev, "mlx5_crdump_enable failed with error code %d\n", err);

	pci_save_state(pdev);
	return 0;

@@ -1334,6 +1338,7 @@ static void remove_one(struct pci_dev *pdev)
	struct mlx5_core_dev *dev  = pci_get_drvdata(pdev);
	struct devlink *devlink = priv_to_devlink(dev);

	mlx5_crdump_disable(dev);
	mlx5_devlink_unregister(devlink);
	mlx5_unregister_device(dev);

+1 −0
Original line number Diff line number Diff line
@@ -435,6 +435,7 @@ struct mlx5_core_health {
	u32				prev;
	int				miss_counter;
	bool				sick;
	u32				crdump_size;
	/* wq spinlock to synchronize draining */
	spinlock_t			wq_lock;
	struct workqueue_struct	       *wq;