[rds-devel] [patch rfc] destroy rds ib devices in remove_one callback
Zach Brown
zach.brown at oracle.com
Thu Nov 4 12:13:59 PDT 2010
So here's a potential fix for those crashes that we've seen which look
like ib_dealloc_fmr() trying to follow function pointers through a freed
ib_device. It passes basic rmmod testing over here.
The question I have is whether it's safe to wait on krdsd from the
remove_one callback like this. I don't know if the IB stack could need
to use the context that remove_one is running in to do any of the work
that connection shutdown waits for in krdsd. In particular, draining
the rings. Andy? Do we know anything about these contexts?
- z
-=--=-
RDS/IB: ensure that ib_device won't be used after .remove_one
We're seeing crashes where ib_dealloc_fmr() appears to be trying to call
through a freed ib_device. Our .remove_one can return while we still
have device references that may flush after .remove_one's caller frees
the ib_device.
This reworks our rds_ib_device freeing path. Rather than queueing
freeing after the last reference count release, we have remove_one wait
for the last release and then free the rds_ib_device before returning.
Signed-off-by: Zach Brown <zach.brown at oracle.com>
diff -up a/net/rds/ib.c b/net/rds/ib.c
--- a/net/rds/ib.c 2010-11-04 10:58:37.000000000 -0700
+++ b/net/rds/ib.c 2010-11-04 11:16:25.000000000 -0700
@@ -86,14 +86,12 @@ void rds_ib_dev_shutdown(struct rds_ib_d
}
/*
- * rds_ib_destroy_mr_pool() blocks on a few things and mrs drop references
- * from interrupt context so we push freing off into a work struct in krdsd.
+ * This is only called from our .remove_one() ib callback once all other
+ * references to the ib_device have been released. It can block.
*/
-static void rds_ib_dev_free(struct work_struct *work)
+static void rds_ib_dev_free(struct rds_ib_device *rds_ibdev)
{
struct rds_ib_ipaddr *i_ipaddr, *i_next;
- struct rds_ib_device *rds_ibdev = container_of(work,
- struct rds_ib_device, free_work);
if (rds_ibdev->mr_pool)
rds_ib_destroy_mr_pool(rds_ibdev->mr_pool);
@@ -114,7 +112,7 @@ void rds_ib_dev_put(struct rds_ib_device
{
BUG_ON(atomic_read(&rds_ibdev->refcount) <= 0);
if (atomic_dec_and_test(&rds_ibdev->refcount))
- queue_work(rds_wq, &rds_ibdev->free_work);
+ complete(&rds_ibdev->free_comp);
}
void rds_ib_add_one(struct ib_device *device)
@@ -141,7 +139,7 @@ void rds_ib_add_one(struct ib_device *de
spin_lock_init(&rds_ibdev->spinlock);
atomic_set(&rds_ibdev->refcount, 1);
- INIT_WORK(&rds_ibdev->free_work, rds_ib_dev_free);
+ init_completion(&rds_ibdev->free_comp);
rds_ibdev->max_wrs = dev_attr->max_qp_wr;
rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE);
@@ -221,11 +219,12 @@ struct rds_ib_device *rds_ib_get_client_
}
/*
- * The IB stack is letting us know that a device is going away. This can
- * happen if the underlying HCA driver is removed or if PCI hotplug is removing
- * the pci function, for example.
+ * The IB stack is letting us know that a device is going away, most
+ * commonly during rmmod. This can be called at any time and can be
+ * racing with any other RDS path.
*
- * This can be called at any time and can be racing with any other RDS path.
+ * The ib_device will be freed after this returns so we must ensure that
+ * we won't reference the device after returning from here.
*/
void rds_ib_remove_one(struct ib_device *device)
{
@@ -235,11 +234,10 @@ void rds_ib_remove_one(struct ib_device
if (!rds_ibdev)
return;
- rds_ib_dev_shutdown(rds_ibdev);
-
/* stop connection attempts from getting a reference to this device. */
ib_set_client_data(device, &rds_ib_client, NULL);
+ /* stop new memory registration from seeing the device */
down_write(&rds_ib_devices_lock);
list_del_rcu(&rds_ibdev->list);
up_write(&rds_ib_devices_lock);
@@ -252,6 +250,13 @@ void rds_ib_remove_one(struct ib_device
synchronize_rcu();
rds_ib_dev_put(rds_ibdev);
rds_ib_dev_put(rds_ibdev);
+
+ /* trigger shutdown on all connections using the device */
+ rds_ib_dev_shutdown(rds_ibdev);
+
+ /* wait for shutdown to drop all refs, and free */
+ wait_for_completion(&rds_ibdev->free_comp);
+ rds_ib_dev_free(rds_ibdev);
}
struct ib_client rds_ib_client = {
diff -up a/net/rds/ib.h b/net/rds/ib.h
--- a/net/rds/ib.h 2010-11-04 11:06:47.000000000 -0700
+++ b/net/rds/ib.h 2010-11-04 11:08:34.000000000 -0700
@@ -180,7 +180,7 @@ struct rds_ib_device {
unsigned int max_responder_resources;
spinlock_t spinlock; /* protect the above */
atomic_t refcount;
- struct work_struct free_work;
+ struct completion free_comp;
};
/* bits for i_ack_flags */
More information about the rds-devel
mailing list