[rds-devel] [patch rfc] destroy rds ib devices in remove_one callback

Zach Brown zach.brown at oracle.com
Thu Nov 4 12:13:59 PDT 2010


So here's a potential fix for those crashes that we've seen which look
like ib_dealloc_fmr() trying to follow function pointers through a freed
ib_device.  It passes basic rmmod testing over here.

The question I have is whether it's safe to wait on krdsd from the
remove_one callback like this.  I don't know if the IB stack could need
to use the context that remove_one is running in to do any of the work
that connection shutdown waits for in krdsd.  In particular, draining
the rings.  Andy?  Do we know anything about these contexts?

- z

-=--=-

RDS/IB: ensure that ib_device won't be used after .remove_one

We're seeing crashes where ib_dealloc_fmr() appears to be trying to call
through a freed ib_device.  Our .remove_one can return while we still
have device references that may flush after .remove_one's caller frees
the ib_device.

This reworks our rds_ib_device freeing path.  Rather than queueing
freeing after the last reference count release, we have remove_one wait
for the last release and then free the rds_ib_device before returning.

Signed-off-by: Zach Brown <zach.brown at oracle.com>

diff -up a/net/rds/ib.c b/net/rds/ib.c
--- a/net/rds/ib.c	2010-11-04 10:58:37.000000000 -0700
+++ b/net/rds/ib.c	2010-11-04 11:16:25.000000000 -0700
@@ -86,14 +86,12 @@ void rds_ib_dev_shutdown(struct rds_ib_d
 }
 
 /*
- * rds_ib_destroy_mr_pool() blocks on a few things and mrs drop references
- * from interrupt context so we push freing off into a work struct in krdsd.
+ * This is only called from our .remove_one() ib callback once all other
+ * references to the ib_device have been released.  It can block.
  */
-static void rds_ib_dev_free(struct work_struct *work)
+static void rds_ib_dev_free(struct rds_ib_device *rds_ibdev)
 {
 	struct rds_ib_ipaddr *i_ipaddr, *i_next;
-	struct rds_ib_device *rds_ibdev = container_of(work,
-					struct rds_ib_device, free_work);
 
 	if (rds_ibdev->mr_pool)
 		rds_ib_destroy_mr_pool(rds_ibdev->mr_pool);
@@ -114,7 +112,7 @@ void rds_ib_dev_put(struct rds_ib_device
 {
 	BUG_ON(atomic_read(&rds_ibdev->refcount) <= 0);
 	if (atomic_dec_and_test(&rds_ibdev->refcount))
-		queue_work(rds_wq, &rds_ibdev->free_work);
+		complete(&rds_ibdev->free_comp);
 }
 
 void rds_ib_add_one(struct ib_device *device)
@@ -141,7 +139,7 @@ void rds_ib_add_one(struct ib_device *de
 
 	spin_lock_init(&rds_ibdev->spinlock);
 	atomic_set(&rds_ibdev->refcount, 1);
-	INIT_WORK(&rds_ibdev->free_work, rds_ib_dev_free);
+	init_completion(&rds_ibdev->free_comp);
 
 	rds_ibdev->max_wrs = dev_attr->max_qp_wr;
 	rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE);
@@ -221,11 +219,12 @@ struct rds_ib_device *rds_ib_get_client_
 }
 
 /*
- * The IB stack is letting us know that a device is going away.  This can
- * happen if the underlying HCA driver is removed or if PCI hotplug is removing
- * the pci function, for example.
+ * The IB stack is letting us know that a device is going away, most
+ * commonly during rmmod.  This can be called at any time and can be
+ * racing with any other RDS path.
  *
- * This can be called at any time and can be racing with any other RDS path.
+ * The ib_device will be freed after this returns so we must ensure that
+ * we won't reference the device after returning from here.
  */
 void rds_ib_remove_one(struct ib_device *device)
 {
@@ -235,11 +234,10 @@ void rds_ib_remove_one(struct ib_device 
 	if (!rds_ibdev)
 		return;
 
-	rds_ib_dev_shutdown(rds_ibdev);
-
 	/* stop connection attempts from getting a reference to this device. */
 	ib_set_client_data(device, &rds_ib_client, NULL);
 
+	/* stop new memory registration from seeing the device */
 	down_write(&rds_ib_devices_lock);
 	list_del_rcu(&rds_ibdev->list);
 	up_write(&rds_ib_devices_lock);
@@ -252,6 +250,13 @@ void rds_ib_remove_one(struct ib_device 
 	synchronize_rcu();
 	rds_ib_dev_put(rds_ibdev);
 	rds_ib_dev_put(rds_ibdev);
+
+	/* trigger shutdown on all connections using the device */
+	rds_ib_dev_shutdown(rds_ibdev);
+
+	/* wait for shutdown to drop all refs, and free */
+	wait_for_completion(&rds_ibdev->free_comp);
+	rds_ib_dev_free(rds_ibdev);
 }
 
 struct ib_client rds_ib_client = {
diff -up a/net/rds/ib.h b/net/rds/ib.h
--- a/net/rds/ib.h	2010-11-04 11:06:47.000000000 -0700
+++ b/net/rds/ib.h	2010-11-04 11:08:34.000000000 -0700
@@ -180,7 +180,7 @@ struct rds_ib_device {
 	unsigned int		max_responder_resources;
 	spinlock_t		spinlock;	/* protect the above */
 	atomic_t		refcount;
-	struct work_struct	free_work;
+	struct completion	free_comp;
 };
 
 /* bits for i_ack_flags */



More information about the rds-devel mailing list