Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 69248719 authored by David S. Miller
Browse files

Merge branch 'fib-notifier-event-replay'



Jiri Pirko says:

====================
ipv4: fib: Replay events when registering FIB notifier

Ido says:

In kernel 4.9 the switchdev-specific FIB offload mechanism was replaced
by a new FIB notification chain to which modules could register in order
to be notified about the addition and deletion of FIB entries. The
motivation for this change was that switchdev drivers need to be able to
reflect the entire FIB table and not only FIBs configured on top of the
port netdevs themselves. This is useful in case of in-band management.

The fundamental problem with this approach is that upon registration
listeners lose all the information previously sent in the chain and
thus have an incomplete view of the FIB tables, which can result in
packet loss. This patchset fixes that by dumping the FIB tables and
replaying notifications previously sent in the chain for the registered
notification block.

The entire dump process is done under RCU and thus the FIB notification
chain is converted to be atomic. The listeners are modified accordingly.
This is done in the first eight patches.

The ninth patch adds a change sequence counter to ensure the integrity
of the FIB dump. The last patch adds the dump itself to the FIB chain
registration function and modifies existing listeners to pass a callback
to be executed in case dump was inconsistent.

---
v3->v4:
- Register the notification block after the dump and protect it using
  the change sequence counter (Hannes Frederic Sowa).
- Since we now integrate the dump into the registration function, drop
  the sysctl to set maximum number of retries and instead set it to a
  fixed number. Let's see if it's really a problem before adding something
  we can never remove.
- For the same reason, dump FIB tables for all net namespaces.
- Add a comment regarding guarantees provided by mutex semantics.

v2->v3:
- Add sysctl to set the number of FIB dump retries (Hannes Frederic Sowa).
- Read the sequence counter under RTNL to ensure synchronization
  between the dump process and other processes changing the routing
  tables (Hannes Frederic Sowa).
- Pass a callback to the dump function to be executed prior to a retry.
- Limit the dump to a single net namespace.

v1->v2:
- Add a sequence counter to ensure the integrity of the FIB dump
  (David S. Miller, Hannes Frederic Sowa).
- Protect notifications from re-ordering in listeners by using an
  ordered workqueue (Hannes Frederic Sowa).
- Introduce fib_info_hold() (Jiri Pirko).
- Relieve rocker from the need to invoke the FIB dump by registering
  to the FIB notification chain prior to ports creation.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
parents 548ed722 c3852ef7
Loading
Loading
Loading
Loading
+22 −0
Original line number Diff line number Diff line
@@ -77,6 +77,7 @@ static const char mlxsw_core_driver_name[] = "mlxsw_core";
static struct dentry *mlxsw_core_dbg_root;

static struct workqueue_struct *mlxsw_wq;
static struct workqueue_struct *mlxsw_owq;

struct mlxsw_core_pcpu_stats {
	u64			trap_rx_packets[MLXSW_TRAP_ID_MAX];
@@ -1900,6 +1901,18 @@ int mlxsw_core_schedule_dw(struct delayed_work *dwork, unsigned long delay)
}
EXPORT_SYMBOL(mlxsw_core_schedule_dw);

/* Schedule delayed work on the driver-wide ordered workqueue (mlxsw_owq).
 * Because the queue is allocated with alloc_ordered_workqueue(), works
 * submitted through this helper execute one at a time, in submission
 * order — used to keep FIB event processing ordered.
 */
int mlxsw_core_schedule_odw(struct delayed_work *dwork, unsigned long delay)
{
	return queue_delayed_work(mlxsw_owq, dwork, delay);
}
EXPORT_SYMBOL(mlxsw_core_schedule_odw);

/* Wait for all work currently queued on the ordered workqueue to finish.
 * Callers use this to drain pending FIB event works before flushing
 * device routing tables (see mlxsw_sp_vrs_fini()).
 */
void mlxsw_core_flush_owq(void)
{
	flush_workqueue(mlxsw_owq);
}
EXPORT_SYMBOL(mlxsw_core_flush_owq);

static int __init mlxsw_core_module_init(void)
{
	int err;
@@ -1907,6 +1920,12 @@ static int __init mlxsw_core_module_init(void)
	mlxsw_wq = alloc_workqueue(mlxsw_core_driver_name, WQ_MEM_RECLAIM, 0);
	if (!mlxsw_wq)
		return -ENOMEM;
	mlxsw_owq = alloc_ordered_workqueue("%s_ordered", WQ_MEM_RECLAIM,
					    mlxsw_core_driver_name);
	if (!mlxsw_owq) {
		err = -ENOMEM;
		goto err_alloc_ordered_workqueue;
	}
	mlxsw_core_dbg_root = debugfs_create_dir(mlxsw_core_driver_name, NULL);
	if (!mlxsw_core_dbg_root) {
		err = -ENOMEM;
@@ -1915,6 +1934,8 @@ static int __init mlxsw_core_module_init(void)
	return 0;

err_debugfs_create_dir:
	destroy_workqueue(mlxsw_owq);
err_alloc_ordered_workqueue:
	destroy_workqueue(mlxsw_wq);
	return err;
}
@@ -1922,6 +1943,7 @@ static int __init mlxsw_core_module_init(void)
static void __exit mlxsw_core_module_exit(void)
{
	/* Tear down in reverse order of creation in module init:
	 * debugfs tree first, then the ordered and regular workqueues.
	 */
	debugfs_remove_recursive(mlxsw_core_dbg_root);
	destroy_workqueue(mlxsw_owq);
	destroy_workqueue(mlxsw_wq);
}

+2 −0
Original line number Diff line number Diff line
@@ -207,6 +207,8 @@ enum devlink_port_type mlxsw_core_port_type_get(struct mlxsw_core *mlxsw_core,
						u8 local_port);

int mlxsw_core_schedule_dw(struct delayed_work *dwork, unsigned long delay);
int mlxsw_core_schedule_odw(struct delayed_work *dwork, unsigned long delay);
void mlxsw_core_flush_owq(void);

#define MLXSW_CONFIG_PROFILE_SWID_COUNT 8

+81 −11
Original line number Diff line number Diff line
@@ -593,6 +593,14 @@ static void mlxsw_sp_router_fib_flush(struct mlxsw_sp *mlxsw_sp);

/* Tear down the virtual routers: drain any FIB works still queued on the
 * ordered workqueue, flush the device's routing tables, then free the
 * VR array. The order of the two flushes is deliberate — see below.
 */
static void mlxsw_sp_vrs_fini(struct mlxsw_sp *mlxsw_sp)
{
	/* At this stage we're guaranteed not to have new incoming
	 * FIB notifications and the work queue is free from FIBs
	 * sitting on top of mlxsw netdevs. However, we can still
	 * have other FIBs queued. Flush the queue before flushing
	 * the device's tables. No need for locks, as we're the only
	 * writer.
	 */
	mlxsw_core_flush_owq();
	mlxsw_sp_router_fib_flush(mlxsw_sp);
	kfree(mlxsw_sp->router.vrs);
}
@@ -1948,33 +1956,89 @@ static void __mlxsw_sp_router_fini(struct mlxsw_sp *mlxsw_sp)
	kfree(mlxsw_sp->rifs);
}

static int mlxsw_sp_router_fib_event(struct notifier_block *nb,
				     unsigned long event, void *ptr)
/* A deferred FIB notification. Allocated with GFP_ATOMIC in the (atomic)
 * notifier callback and processed later in process context by
 * mlxsw_sp_router_fib_event_work().
 */
struct mlxsw_sp_fib_event_work {
	struct delayed_work dw;			/* queued on the ordered workqueue */
	struct fib_entry_notifier_info fen_info; /* copy of the notifier payload */
	struct mlxsw_sp *mlxsw_sp;		/* owning device instance */
	unsigned long event;			/* FIB_EVENT_* being deferred */
};

static void mlxsw_sp_router_fib_event_work(struct work_struct *work)
{
	struct mlxsw_sp *mlxsw_sp = container_of(nb, struct mlxsw_sp, fib_nb);
	struct fib_entry_notifier_info *fen_info = ptr;
	struct mlxsw_sp_fib_event_work *fib_work =
		container_of(work, struct mlxsw_sp_fib_event_work, dw.work);
	struct mlxsw_sp *mlxsw_sp = fib_work->mlxsw_sp;
	int err;

	if (!net_eq(fen_info->info.net, &init_net))
		return NOTIFY_DONE;

	switch (event) {
	/* Protect internal structures from changes */
	rtnl_lock();
	switch (fib_work->event) {
	case FIB_EVENT_ENTRY_ADD:
		err = mlxsw_sp_router_fib4_add(mlxsw_sp, fen_info);
		err = mlxsw_sp_router_fib4_add(mlxsw_sp, &fib_work->fen_info);
		if (err)
			mlxsw_sp_router_fib4_abort(mlxsw_sp);
		fib_info_put(fib_work->fen_info.fi);
		break;
	case FIB_EVENT_ENTRY_DEL:
		mlxsw_sp_router_fib4_del(mlxsw_sp, fen_info);
		mlxsw_sp_router_fib4_del(mlxsw_sp, &fib_work->fen_info);
		fib_info_put(fib_work->fen_info.fi);
		break;
	case FIB_EVENT_RULE_ADD: /* fall through */
	case FIB_EVENT_RULE_DEL:
		mlxsw_sp_router_fib4_abort(mlxsw_sp);
		break;
	}
	rtnl_unlock();
	kfree(fib_work);
}

/* Called with rcu_read_lock() */
static int mlxsw_sp_router_fib_event(struct notifier_block *nb,
				     unsigned long event, void *ptr)
{
	struct mlxsw_sp *mlxsw_sp = container_of(nb, struct mlxsw_sp, fib_nb);
	struct mlxsw_sp_fib_event_work *fib_work;
	struct fib_notifier_info *info = ptr;

	/* Only FIBs in the initial net namespace are offloaded. */
	if (!net_eq(info->net, &init_net))
		return NOTIFY_DONE;

	/* Atomic notifier context — defer the actual processing to the
	 * ordered workqueue, which preserves event ordering.
	 */
	fib_work = kzalloc(sizeof(*fib_work), GFP_ATOMIC);
	if (WARN_ON(!fib_work))
		return NOTIFY_BAD;

	INIT_DELAYED_WORK(&fib_work->dw, mlxsw_sp_router_fib_event_work);
	fib_work->mlxsw_sp = mlxsw_sp;
	fib_work->event = event;

	switch (event) {
	case FIB_EVENT_ENTRY_ADD: /* fall through */
	case FIB_EVENT_ENTRY_DEL:
		memcpy(&fib_work->fen_info, ptr, sizeof(fib_work->fen_info));
		/* Take reference on fib_info to prevent it from being
		 * freed while work is queued. Release it afterwards.
		 */
		fib_info_hold(fib_work->fen_info.fi);
		break;
	}

	mlxsw_core_schedule_odw(&fib_work->dw, 0);

	return NOTIFY_DONE;
}

/* Callback passed to register_fib_notifier(); executed when the FIB dump
 * performed during registration was found inconsistent and must be
 * retried from a clean state.
 */
static void mlxsw_sp_router_fib_dump_flush(struct notifier_block *nb)
{
	struct mlxsw_sp *mlxsw_sp = container_of(nb, struct mlxsw_sp, fib_nb);

	/* Flush pending FIB notifications and then flush the device's
	 * table before requesting another dump. The FIB notification
	 * block is unregistered, so no need to take RTNL.
	 */
	mlxsw_core_flush_owq();
	mlxsw_sp_router_fib_flush(mlxsw_sp);
}

int mlxsw_sp_router_init(struct mlxsw_sp *mlxsw_sp)
{
	int err;
@@ -1995,9 +2059,15 @@ int mlxsw_sp_router_init(struct mlxsw_sp *mlxsw_sp)
		goto err_neigh_init;

	mlxsw_sp->fib_nb.notifier_call = mlxsw_sp_router_fib_event;
	register_fib_notifier(&mlxsw_sp->fib_nb);
	err = register_fib_notifier(&mlxsw_sp->fib_nb,
				    mlxsw_sp_router_fib_dump_flush);
	if (err)
		goto err_register_fib_notifier;

	return 0;

err_register_fib_notifier:
	mlxsw_sp_neigh_fini(mlxsw_sp);
err_neigh_init:
	mlxsw_sp_vrs_fini(mlxsw_sp);
err_vrs_init:
+1 −0
Original line number Diff line number Diff line
@@ -72,6 +72,7 @@ struct rocker {
	struct rocker_dma_ring_info event_ring;
	struct notifier_block fib_nb;
	struct rocker_world_ops *wops;
	struct workqueue_struct *rocker_owq;
	void *wpriv;
};

+72 −12
Original line number Diff line number Diff line
@@ -28,6 +28,7 @@
#include <linux/if_bridge.h>
#include <linux/bitops.h>
#include <linux/ctype.h>
#include <linux/workqueue.h>
#include <net/switchdev.h>
#include <net/rtnetlink.h>
#include <net/netevent.h>
@@ -2165,28 +2166,70 @@ static const struct switchdev_ops rocker_port_switchdev_ops = {
	.switchdev_port_obj_dump	= rocker_port_obj_dump,
};

static int rocker_router_fib_event(struct notifier_block *nb,
				   unsigned long event, void *ptr)
/* A deferred FIB notification for rocker. Allocated with GFP_ATOMIC in
 * the notifier callback and handled in process context by
 * rocker_router_fib_event_work().
 */
struct rocker_fib_event_work {
	struct work_struct work;		/* queued on rocker->rocker_owq */
	struct fib_entry_notifier_info fen_info; /* copy of the notifier payload */
	struct rocker *rocker;			/* owning device instance */
	unsigned long event;			/* FIB_EVENT_* being deferred */
};

static void rocker_router_fib_event_work(struct work_struct *work)
{
	struct rocker *rocker = container_of(nb, struct rocker, fib_nb);
	struct fib_entry_notifier_info *fen_info = ptr;
	struct rocker_fib_event_work *fib_work =
		container_of(work, struct rocker_fib_event_work, work);
	struct rocker *rocker = fib_work->rocker;
	int err;

	switch (event) {
	/* Protect internal structures from changes */
	rtnl_lock();
	switch (fib_work->event) {
	case FIB_EVENT_ENTRY_ADD:
		err = rocker_world_fib4_add(rocker, fen_info);
		err = rocker_world_fib4_add(rocker, &fib_work->fen_info);
		if (err)
			rocker_world_fib4_abort(rocker);
		else
		fib_info_put(fib_work->fen_info.fi);
		break;
	case FIB_EVENT_ENTRY_DEL:
		rocker_world_fib4_del(rocker, fen_info);
		rocker_world_fib4_del(rocker, &fib_work->fen_info);
		fib_info_put(fib_work->fen_info.fi);
		break;
	case FIB_EVENT_RULE_ADD: /* fall through */
	case FIB_EVENT_RULE_DEL:
		rocker_world_fib4_abort(rocker);
		break;
	}
	rtnl_unlock();
	kfree(fib_work);
}

/* Called with rcu_read_lock() */
static int rocker_router_fib_event(struct notifier_block *nb,
				   unsigned long event, void *ptr)
{
	struct rocker *rocker = container_of(nb, struct rocker, fib_nb);
	struct rocker_fib_event_work *fib_work;

	/* Atomic notifier context — defer processing to the per-device
	 * ordered workqueue so events are handled in order.
	 */
	fib_work = kzalloc(sizeof(*fib_work), GFP_ATOMIC);
	if (WARN_ON(!fib_work))
		return NOTIFY_BAD;

	INIT_WORK(&fib_work->work, rocker_router_fib_event_work);
	fib_work->rocker = rocker;
	fib_work->event = event;

	switch (event) {
	case FIB_EVENT_ENTRY_ADD: /* fall through */
	case FIB_EVENT_ENTRY_DEL:
		memcpy(&fib_work->fen_info, ptr, sizeof(fib_work->fen_info));
		/* Take reference on fib_info to prevent it from being
		 * freed while work is queued. Release it afterwards.
		 */
		fib_info_hold(fib_work->fen_info.fi);
		break;
	}

	queue_work(rocker->rocker_owq, &fib_work->work);

	return NOTIFY_DONE;
}

@@ -2754,6 +2797,21 @@ static int rocker_probe(struct pci_dev *pdev, const struct pci_device_id *id)
		goto err_request_event_irq;
	}

	rocker->rocker_owq = alloc_ordered_workqueue(rocker_driver_name,
						     WQ_MEM_RECLAIM);
	if (!rocker->rocker_owq) {
		err = -ENOMEM;
		goto err_alloc_ordered_workqueue;
	}

	/* Only FIBs pointing to our own netdevs are programmed into
	 * the device, so no need to pass a callback.
	 */
	rocker->fib_nb.notifier_call = rocker_router_fib_event;
	err = register_fib_notifier(&rocker->fib_nb, NULL);
	if (err)
		goto err_register_fib_notifier;

	rocker->hw.id = rocker_read64(rocker, SWITCH_ID);

	err = rocker_probe_ports(rocker);
@@ -2762,15 +2820,16 @@ static int rocker_probe(struct pci_dev *pdev, const struct pci_device_id *id)
		goto err_probe_ports;
	}

	rocker->fib_nb.notifier_call = rocker_router_fib_event;
	register_fib_notifier(&rocker->fib_nb);

	dev_info(&pdev->dev, "Rocker switch with id %*phN\n",
		 (int)sizeof(rocker->hw.id), &rocker->hw.id);

	return 0;

err_probe_ports:
	unregister_fib_notifier(&rocker->fib_nb);
err_register_fib_notifier:
	destroy_workqueue(rocker->rocker_owq);
err_alloc_ordered_workqueue:
	free_irq(rocker_msix_vector(rocker, ROCKER_MSIX_VEC_EVENT), rocker);
err_request_event_irq:
	free_irq(rocker_msix_vector(rocker, ROCKER_MSIX_VEC_CMD), rocker);
@@ -2796,9 +2855,10 @@ static void rocker_remove(struct pci_dev *pdev)
{
	struct rocker *rocker = pci_get_drvdata(pdev);

	rocker_remove_ports(rocker);
	unregister_fib_notifier(&rocker->fib_nb);
	rocker_write32(rocker, CONTROL, ROCKER_CONTROL_RESET);
	rocker_remove_ports(rocker);
	destroy_workqueue(rocker->rocker_owq);
	free_irq(rocker_msix_vector(rocker, ROCKER_MSIX_VEC_EVENT), rocker);
	free_irq(rocker_msix_vector(rocker, ROCKER_MSIX_VEC_CMD), rocker);
	rocker_dma_rings_fini(rocker);
Loading