[Ocfs2-commits] jlbec commits r2006 - in trunk/fs: ocfs2
ocfs2/cluster ocfs2/dlm usysfs
svn-commits at oss.oracle.com
svn-commits at oss.oracle.com
Fri Mar 18 00:01:11 CST 2005
Author: jlbec
Signed-off-by: mfasheh
Date: 2005-03-18 00:01:10 -0600 (Fri, 18 Mar 2005)
New Revision: 2006
Removed:
trunk/fs/ocfs2/cluster/clcommon.h
trunk/fs/ocfs2/cluster/gsd.c
trunk/fs/ocfs2/cluster/gsd.h
trunk/fs/ocfs2/cluster/ocfs2_tcp.h
Modified:
trunk/fs/ocfs2/cluster/Makefile
trunk/fs/ocfs2/cluster/heartbeat.c
trunk/fs/ocfs2/cluster/heartbeat.h
trunk/fs/ocfs2/cluster/nodemanager.c
trunk/fs/ocfs2/cluster/nodemanager.h
trunk/fs/ocfs2/cluster/ocfs2_nodemanager.h
trunk/fs/ocfs2/cluster/tcp.c
trunk/fs/ocfs2/cluster/tcp.h
trunk/fs/ocfs2/dlm/dlmast.c
trunk/fs/ocfs2/dlm/dlmconvert.c
trunk/fs/ocfs2/dlm/dlmfs.c
trunk/fs/ocfs2/dlm/dlmfs_compat.c
trunk/fs/ocfs2/dlm/dlmlock.c
trunk/fs/ocfs2/dlm/dlmmaster.c
trunk/fs/ocfs2/dlm/dlmmod.c
trunk/fs/ocfs2/dlm/dlmmod.h
trunk/fs/ocfs2/dlm/dlmrecovery.c
trunk/fs/ocfs2/dlm/dlmthread.c
trunk/fs/ocfs2/dlm/dlmunlock.c
trunk/fs/ocfs2/dlm/userdlm.c
trunk/fs/ocfs2/dlmglue.c
trunk/fs/ocfs2/heartbeat.c
trunk/fs/ocfs2/ocfs.h
trunk/fs/ocfs2/super.c
trunk/fs/ocfs2/vote.c
trunk/fs/usysfs/dir.c
trunk/fs/usysfs/mount.c
trunk/fs/usysfs/usysfs.h
Log:
o Merge the usysfsify branch. The O2CB ABI is now usysfs. You must
have revision 668 of ocfs2-tools or better.
Signed-off-by: mfasheh
Modified: trunk/fs/ocfs2/cluster/Makefile
===================================================================
--- trunk/fs/ocfs2/cluster/Makefile 2005-03-18 04:25:45 UTC (rev 2005)
+++ trunk/fs/ocfs2/cluster/Makefile 2005-03-18 06:01:10 UTC (rev 2006)
@@ -16,6 +16,11 @@
EXTRA_CFLAGS += -DMISSING_SOCK_CREATE_LITE
endif
+# this is going to get exciting if usysfs is merged
+EXTRA_CFLAGS += -I$(OUR_TOPDIR)/fs/usysfs/
+# XXX should be folded into the kapi stuff
+EXTRA_CFLAGS += -I$(OUR_TOPDIR)/fs/usysfs/compatinclude
+
ifeq ($(KERNELRELEASE),)
ifeq ($(KERNEL_26),)
COMPAT_LIBFS := compat_libfs.o
@@ -24,15 +29,10 @@
INSTALL_MOD_DIR := fs/ocfs2
-obj-m := ocfs2_heartbeat.o ocfs2_nodemanager.o ocfs2_tcp.o
+obj-m := ocfs2_nodemanager.o
-ocfs2_nodemanager-objs := nodemanager.o util.o transaction_file.o $(COMPAT_LIBFS)
+ocfs2_nodemanager-objs := nodemanager.o heartbeat.o tcp.o $(COMPAT_LIBFS)
-ocfs2_heartbeat-objs := heartbeat.o util.o transaction_file.o $(COMPAT_LIBFS)
-
-ocfs2_tcp-objs := gsd.o tcp.o util.o $(COMPAT_LIBFS)
-
-
ifeq ($(KERNELRELEASE),)
#
# Called from a regular "make".
@@ -40,25 +40,20 @@
SOURCES = \
compat_libfs.c \
- gsd.c \
heartbeat.c \
nodemanager.c \
tcp.c \
- transaction_file.c \
util.c
HEADERS = \
- clcommon.h \
cl_compat.h \
compat_libfs.h \
- gsd.h \
heartbeat.h \
nodemanager.h \
ocfs2_heartbeat.h \
ocfs2_nodemanager.h \
ocfs2_tcp.h \
tcp.h \
- transaction_file.h \
util.h
DIST_FILES = $(SOURCES) $(HEADERS)
Deleted: trunk/fs/ocfs2/cluster/clcommon.h
===================================================================
--- trunk/fs/ocfs2/cluster/clcommon.h 2005-03-18 04:25:45 UTC (rev 2005)
+++ trunk/fs/ocfs2/cluster/clcommon.h 2005-03-18 06:01:10 UTC (rev 2006)
@@ -1,49 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * clcommon.h
- *
- * Common stuff
- *
- * Copyright (C) 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
- * Authors: Kurt Hackel
- */
-
-#ifndef CLUSTER_CLCOMMON_H
-#define CLUSTER_CLCOMMON_H
-
-typedef struct _nm_ctxt nm_ctxt;
-typedef struct _heartbeat_ctxt heartbeat_ctxt;
-
-#define CLUSTER_DISK_UUID_LEN 32 // 16 byte binary == 32 char hex string
-
-typedef struct _cluster_disk
-{
- // uuid of disk
- char uuid[CLUSTER_DISK_UUID_LEN+1];
- // all the rest are for heartbeat
- dev_t dev;
- u32 blocksize_bits;
- u32 num_blocks;
- u64 start_block;
- util_rarray slots;
-} cluster_disk;
-
-
-#endif /* CLUSTER_CLCOMMON_H */
Deleted: trunk/fs/ocfs2/cluster/gsd.c
===================================================================
--- trunk/fs/ocfs2/cluster/gsd.c 2005-03-18 04:25:45 UTC (rev 2005)
+++ trunk/fs/ocfs2/cluster/gsd.c 2005-03-18 06:01:10 UTC (rev 2006)
@@ -1,245 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * Copyright (C) 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
- */
-#include <linux/config.h>
-#include <linux/module.h>
-#include <linux/fs.h>
-#include <linux/sched.h>
-#include <asm/uaccess.h>
-#include <linux/file.h>
-
-#include "cl_compat.h"
-#include "util.h"
-#include "clcommon.h"
-#include "nodemanager.h"
-#include "tcp.h"
-#include "gsd.h"
-
-static char *gsd_handler_buf = NULL;
-/* sigh. these will be claned up, I'm just sure of it. */
-static u8 gsd_node_num;
-static struct inode *gsd_inode;
-
-static int gsd_message_action(gsd_message *g)
-{
- int ret;
- nm_op op;
- int namelen = g->namelen;
- struct inode *node=NULL, *group=NULL;
- char name[NM_MAX_NAME_LEN+1];
-
- if (namelen > NM_MAX_NAME_LEN)
- return -EINVAL;
- strncpy(name, g->name, namelen);
- name[namelen] = '\0';
-
- memset(&op, 0, sizeof(op));
- switch (g->action) {
- case GSD_ACTION_ADD_GROUP:
- group = nm_get_group_by_name(name);
- if (group) {
- ret = 0;
- break;
- }
- op.arg_u.gc.group_num = NM_INVALID_SLOT_NUM;
- memcpy(op.arg_u.gc.name, name, namelen);
- memcpy(op.arg_u.gc.disk_uuid, name, namelen);
-
- ret = nm_create_group(gsd_handler_buf, &op);
- if (ret >= 0)
- ret = 0;
- break;
-
- case GSD_ACTION_ADD_GROUP_NODE:
- group = nm_get_group_by_name(name);
- if (!group) {
- ret = -EINVAL;
- break;
- }
- node = nm_get_group_node_by_index(group, g->from);
- if (node) {
- ret = 0;
- if (nm_get_node_global_index(node) != g->from)
- ret = -EINVAL;
- break;
- }
- op.arg_u.gc.group_num = nm_get_group_global_index(group);
- op.arg_u.gc.node_num = g->from;
- op.arg_u.gc.slot_num = g->from;
- ret = nm_add_node_to_group(gsd_handler_buf, &op);
- if (ret >= 0)
- ret = 0;
- break;
- default:
- ret = -EINVAL;
- break;
- }
-
- if (node)
- iput(node);
- if (group)
- iput(group);
- return ret;
-}
-
-static int gsd_message_handler(net_msg *msg, u32 len, void *data)
-{
- gsd_message *g = (gsd_message *)msg->buf;
- gsd_message_to_host(g);
- return gsd_message_action(g);
-}
-
-int gsd_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
- unsigned long arg)
-{
- gsd_ioc gsd_data;
- int ret = 0;
- gsd_message g;
- int response = 0;
- struct inode *to = NULL;
- struct file *file = NULL;
-
- switch (cmd) {
- case GSD_IOC_CREATE_GROUP:
- memset(&gsd_data, 0, sizeof(gsd_ioc));
- ret = copy_from_user(&gsd_data, (gsd_ioc *)arg,
- sizeof(gsd_ioc));
-
- file = fget(gsd_data.fd);
- if (!file || !file->f_dentry || !file->f_dentry->d_inode) {
- ret = -EINVAL;
- break;
- }
- to = file->f_dentry->d_inode;
-
- g.action = GSD_ACTION_ADD_GROUP;
- g.from = gsd_node_num;
- g.namelen = gsd_data.namelen;
- memcpy(g.name, gsd_data.name, gsd_data.namelen);
-
- if (to == gsd_inode) {
- /* create the group locally */
- ret = gsd_message_action(&g);
- } else {
- /* create the group on remote node */
- gsd_message_to_net(&g);
- ret = net_send_message(GSD_MESSAGE, 0, &g,
- sizeof(g), to, &response);
- if (ret == 0)
- ret = response;
- }
-
- memset(&gsd_data, 0, sizeof(gsd_ioc));
- gsd_data.status = ret;
- ret = copy_to_user((gsd_ioc *)arg, &gsd_data,
- sizeof(gsd_ioc));
- break;
-
- case GSD_IOC_ADD_GROUP_NODE:
- memset(&gsd_data, 0, sizeof(gsd_ioc));
- ret = copy_from_user(&gsd_data, (gsd_ioc *)arg,
- sizeof(gsd_ioc));
-
- file = fget(gsd_data.fd);
- if (!file || !file->f_dentry || !file->f_dentry->d_inode) {
- ret = -EINVAL;
- break;
- }
- to = file->f_dentry->d_inode;
-
- g.action = GSD_ACTION_ADD_GROUP_NODE;
- g.from = gsd_node_num;
- g.namelen = gsd_data.namelen;
- memcpy(g.name, gsd_data.name, gsd_data.namelen);
-
- if (to == gsd_inode) {
- /* create the group locally */
- ret = gsd_message_action(&g);
- } else {
- /* create the group on remote node */
- gsd_message_to_net(&g);
- ret = net_send_message(GSD_MESSAGE, 0, &g,
- sizeof(g), to, &response);
- if (ret == 0)
- ret = response;
- }
- memset(&gsd_data, 0, sizeof(gsd_ioc));
- gsd_data.status = ret;
- ret = copy_to_user((gsd_ioc *)arg, &gsd_data,
- sizeof(gsd_ioc));
- break;
- default:
- BUG();
- break;
- }
-
- if (file)
- fput(file);
-
- return ret;
-} /* net_ioctl */
-
-int gsd_setup(void)
-{
- int ret;
-
- gsd_node_num = nm_this_node(NULL);
- if (gsd_node_num >= NM_MAX_NODES) {
- printk("local nm node number not initialized!\n");
- ret = -EINVAL;
- goto out;
- }
-
- gsd_inode = nm_get_node_by_num(gsd_node_num);
- if (!gsd_inode) {
- printk("local nm node inode not initialized!\n");
- return -1;
- }
-
- /* need this stupidity until I can divorce the actual nm actions
- * from the output they send to their user buffer */
- gsd_handler_buf = (char *) __get_free_page(GFP_KERNEL);
- if (!gsd_handler_buf) {
- ret = -ENOMEM;
- goto out;
- }
-
- ret = net_register_handler(GSD_MESSAGE, 0, 0, sizeof(gsd_message),
- gsd_message_handler, NULL);
-out:
- if (ret)
- if (gsd_inode) {
- iput(gsd_inode);
- gsd_inode = NULL;
- }
-
- return ret;
-}
-
-void gsd_teardown(void)
-{
- free_page((unsigned long)gsd_handler_buf);
- if (gsd_inode) {
- iput(gsd_inode);
- gsd_inode = NULL;
- }
-}
-
Deleted: trunk/fs/ocfs2/cluster/gsd.h
===================================================================
--- trunk/fs/ocfs2/cluster/gsd.h 2005-03-18 04:25:45 UTC (rev 2005)
+++ trunk/fs/ocfs2/cluster/gsd.h 2005-03-18 06:01:10 UTC (rev 2006)
@@ -1,31 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * Copyright (C) 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#ifndef CLUSTER_GSD_H
-#define CLUSTER_GSD_H
-
-int gsd_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
- unsigned long arg);
-
-int gsd_setup(void);
-void gsd_teardown(void);
-
-#endif
Modified: trunk/fs/ocfs2/cluster/heartbeat.c
===================================================================
--- trunk/fs/ocfs2/cluster/heartbeat.c 2005-03-18 04:25:45 UTC (rev 2005)
+++ trunk/fs/ocfs2/cluster/heartbeat.c 2005-03-18 06:01:10 UTC (rev 2006)
@@ -1,12 +1,8 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
- * heartbeat.c
+ * Copyright (C) 2004, 2005 Oracle. All rights reserved.
*
- * Keeps track of alive nodes in the cluster.
- *
- * Copyright (C) 2004 Oracle. All rights reserved.
- *
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
@@ -22,11 +18,10 @@
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*
- * Authors: Kurt Hackel
+ * TODO:
+ * - make sure attributes can't be written to after object commital
*/
-
-
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/fs.h>
@@ -61,12 +56,13 @@
#include <linux/file.h>
#include <linux/bitops.h>
#include <linux/kthread.h>
+#include "usysfs.h"
#include <asm/uaccess.h>
+#include <asm/bitops.h>
#include "cl_compat.h"
#include "util.h"
-#include "clcommon.h"
#include "heartbeat.h"
#include "tcp.h"
@@ -79,32 +75,21 @@
#define __user
#endif
+static void hb_do_callbacks(int type, struct nm_node *node, int idx);
-static void hb_teardown(void);
-static void hb_nm_group_node_add_cb(void *ptr1, void *ptr2, u8 idx);
-static void hb_nm_group_node_del_cb(void *ptr1, void *ptr2, u8 idx);
-static void hb_nm_node_add_cb(void *ptr1, void *ptr2, u8 idx);
-static void hb_nm_group_add_cb(void *ptr1, void *ptr2, u8 idx);
-static int hb_init_disk_hb_group(struct inode *group, dev_t dev, u32 bits,
- u32 blocks, u64 start);
-static ssize_t write_disk(struct file *file, char *buf, size_t size);
-static void hb_do_callbacks(int type, void *ptr1, void *ptr2, int idx);
-static void hb_end_buffer_io_sync(struct buffer_head *bh, int uptodate);
-static int hb_do_node_down(struct inode *group, struct inode *node, int idx);
-static int hb_do_node_up(struct inode *group, struct inode *node, int idx);
-static int hb_do_disk_heartbeat(void *page);
-static int hb_thread(void *data);
-static void hb_complete_thread(void);
-static int hb_launch_thread(void);
-static int hb_fill_node_bytemap(struct inode *group, void *map, int size);
+/*
+ * The first heartbeat pass had one global thread that would serialize all hb
+ * callback calls. This global serializing sem should only be removed once
+ * we've made sure that all callees can deal with being called concurrently
+ * from multiple hb region threads.
+ */
+static DECLARE_RWSEM(hb_callback_sem);
+/*
+ * region setup and teardown races with node_fill_map here. We use
+ * the callback sem to protect them.
+ */
+static LIST_HEAD(hb_active_regions);
-
-/* globals */
-static spinlock_t hb_lock = SPIN_LOCK_UNLOCKED;
-static LIST_HEAD(hb_net_groups);
-static LIST_HEAD(hb_disk_groups);
-static struct task_struct *hb_task = NULL;
-
static struct hb_callback {
struct list_head list;
struct semaphore sem;
@@ -119,8 +104,38 @@
#define hbprintk0(x)
#endif
+#define HB_THREAD_MS 2000 // every 2 seconds
+struct hb_disk_slot {
+ struct buffer_head *ds_bh;
+ u64 ds_block;
+ u8 ds_node_num;
+ unsigned long ds_last_time;
+ u16 ds_margin;
+ /* the single hb-thread only ever touches these items, no locking */
+ struct list_head ds_dead_item;
+ struct list_head ds_alive_item;
+};
+/* each thread owns a region.. when we're asked to tear down the region
+ * we ask the thread to stop, who cleans up the region */
+struct hb_region {
+ struct kobject hr_kobj;
+ struct list_head hr_active_item;
+ struct task_struct *hr_task;
+ u64 hr_block_bytes;
+ u64 hr_block_bits;
+ u64 hr_start_block;
+ u8 hr_blocks;
+
+ struct block_device *hr_bdev;
+ struct hb_disk_slot *hr_slots;
+ /* a single hb-thread writer and many fill_node readers are protected */
+ rwlock_t hr_slot_list_lock;
+ struct list_head hr_alive_list;
+ struct list_head hr_dead_list;
+};
+
static void hb_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
if (uptodate)
@@ -132,24 +147,17 @@
unlock_buffer(bh);
}
-
-
-static int hb_do_node_down(struct inode *group, struct inode *node, int idx)
+static int hb_do_node_down(struct nm_node *node, int idx)
{
- //int ret;
- hbprintk("hb_do_node_down: group=%lu, node=%lu\n", group->i_ino,
- node->i_ino);
- hbprintk("NOT removing node from group\n");
- //ret = nm_remove_node_from_group(group, node);
- hb_do_callbacks(HB_NODE_DOWN_CB, group, node, idx);
+ hbprintk("hb_do_node_down: node=%u\n", node->nd_num);
+ hb_do_callbacks(HB_NODE_DOWN_CB, node, idx);
return 0;
}
-static int hb_do_node_up(struct inode *group, struct inode *node, int idx)
+static int hb_do_node_up(struct nm_node *node, int idx)
{
- hbprintk("hb_do_node_up: group=%lu, node=%lu\n", group->i_ino,
- node->i_ino);
- hb_do_callbacks(HB_NODE_UP_CB, group, node, idx);
+ hbprintk("hb_do_node_up: node=%u\n", node->nd_num);
+ hb_do_callbacks(HB_NODE_UP_CB, node, idx);
return 0;
}
@@ -167,28 +175,27 @@
}
/*
- * hb_init_disk_hb_group() sets disk->blocksize_bits which defines the block
- * size that we'll use to calculate block offsets. The blocksize of the
- * device might change under us, though. this detects when that happens
- * and tries to lookup the bh again with the newer blocksize. The exciting
- * wrinkle here is that its fatal to call __getblk() with the wrong block
- * size in some 2.6 kernels.
+ * hr_region's block_bytes attribute defines the block size that we'll use to
+ * calculate block offsets. The blocksize of the device might change under us,
+ * though. this detects when that happens and tries to lookup the bh again
+ * with the newer blocksize. The exciting wrinkle here is that its fatal to
+ * call __getblk() with the wrong block size in some 2.6 kernels.
*
- * We only ever use a few bytes of the block so it is ok that we return
- * a smaller bh than what is implied by blocksize_bits. We're just fixing
- * up addressing here.
+ * We only ever use a few bytes of the block so it is ok that we return a
+ * smaller bh than what is implied by blocksize_bits. We're just fixing up
+ * addressing here.
*/
-static struct buffer_head *hb_getblk(int orig_blkno, cluster_disk *disk)
+static struct buffer_head *hb_getblk(struct block_device *bdev, int orig_blkno,
+ int bits)
{
/* XXX getblk() takes an int block in 2.4 :/ */
int blkno;
- int bits, dev_bits;
+ int dev_bits;
struct buffer_head *bh = NULL;
- bits = disk->blocksize_bits;
blkno = orig_blkno;
- dev_bits = ocfs_dev_bits(disk->dev);
+ dev_bits = bdev->bd_inode->i_blkbits;
if (dev_bits < 0)
goto out;
@@ -197,697 +204,586 @@
else if (dev_bits > bits)
blkno >>= dev_bits - bits;
- bh = getblk(disk->dev, blkno, 1 << dev_bits);
+ bh = __getblk(bdev, blkno, 1 << dev_bits);
out:
return bh;
}
-static struct buffer_head *hb_get_locked_mapped(hb_disk_slot *slot,
- int ino,
- cluster_disk *disk)
+static struct buffer_head *hb_get_locked_mapped(struct hb_region *reg,
+ struct hb_disk_slot *slot)
{
struct buffer_head *bh = NULL;
- bh = slot->bh;
+ bh = slot->ds_bh;
if (bh) {
lock_buffer(bh);
if (buffer_mapped(bh))
goto out;
- slot->bh = NULL;
+ slot->ds_bh = NULL;
unlock_buffer(bh);
brelse(bh);
}
- slot->bh = hb_getblk(ino + disk->start_block, disk);
- if (slot->bh) {
- bh = slot->bh;
+ slot->ds_bh = hb_getblk(reg->hr_bdev, slot->ds_block,
+ reg->hr_block_bits);
+ if (slot->ds_bh) {
+ bh = slot->ds_bh;
lock_buffer(bh);
}
out:
return bh;
}
-static int hb_do_disk_heartbeat(void *page)
+static void hb_do_disk_heartbeat(struct hb_region *reg)
{
- nm_group_inode_private *priv;
- struct inode *group, *node;
- struct list_head *iter;
+ struct nm_node *node;
struct buffer_head *bh;
- hb_disk_slot *slot;
+ struct hb_disk_slot *slot;
hb_disk_heartbeat_block *hb_block;
- int rw, ino, idx, ret, i;
- struct inode **dead_nodes, **live_nodes;
- LIST_HEAD(tmplist);
- cluster_disk *disk;
+ /* only need to worry about locking when we touch the reg lists
+ * which fill_node_map sees. otherwise only we touch these
+ * lists and the slot items */
+ LIST_HEAD(newborn);
+ LIST_HEAD(deceased);
u64 cputime;
+ int i, rw;
- // NM_MAX_NODES is 255
- dead_nodes = page;
- live_nodes = page + (sizeof(struct inode *) * 256);
-
- spin_lock(&hb_lock);
- list_splice_init(&hb_disk_groups, &tmplist);
- spin_unlock(&hb_lock);
+ /* first we clear uptodate on other nodes slots and write our slot */
+ for(i = 0; i < reg->hr_blocks; i++) {
+ slot = &reg->hr_slots[i];
- list_for_each(iter, &tmplist) {
- priv = list_entry(iter, nm_group_inode_private, disk_list);
- group = priv->inode;
- disk = &priv->disk;
+ bh = hb_get_locked_mapped(reg, slot);
+ if (bh == NULL) {
+ hbprintk("getblk failed in slot %d!\n", i);
+ continue;
+ }
- memset(page, 0, PAGE_SIZE);
- down(&group->i_sem);
+ if (slot->ds_node_num == nm_this_node()) {
+ memset(bh->b_data, 0, bh->b_size);
+ hb_block = (hb_disk_heartbeat_block *)bh->b_data;
+ BUG_ON(bh->b_size < sizeof(*hb_block));
- idx = 0;
- while ((slot = nm_iterate_group_disk_slots(group, &idx))) {
+ /* TODO: time stuff */
+ cputime = OCFS_CURRENT_SECONDS;
+ if (!cputime)
+ cputime = 1;
+ hb_block->time = cpu_to_le64(cputime);
- node = slot->inode;
- if (!node) {
- hbprintk("no inode in slot %d!\n", idx);
- idx++;
- continue;
- }
- ino = nm_get_node_global_index(node);
+ set_buffer_uptodate(bh);
+ clear_buffer_dirty(bh);
+ rw = WRITE;
+ } else {
+ clear_buffer_uptodate(bh);
+ rw = READ;
+ }
- bh = hb_get_locked_mapped(slot, ino, disk);
- if (bh == NULL) {
- hbprintk("getblk failed in slot %d!\n",
- idx);
- idx++;
- continue;
- }
-
- if (ino == nm_this_node(group)) {
- memset(bh->b_data, 0, bh->b_size);
- hb_block= (hb_disk_heartbeat_block *)bh->b_data;
- BUG_ON(bh->b_size < sizeof(*hb_block));
+ bh->b_end_io = hb_end_buffer_io_sync;
+ hb_submit_bh(rw, bh);
+ }
- /* TODO: time stuff */
- cputime = OCFS_CURRENT_SECONDS;
- if (!cputime)
- cputime = 1;
- hb_block->time = cpu_to_le64(cputime);
+ /* now we read again and see what other nodes have done */
+ for(i = 0; i < reg->hr_blocks; i++) {
+ /* never, ever, generate events for our node */
+ if (i == nm_this_node())
+ continue;
+ slot = &reg->hr_slots[i];
- set_buffer_uptodate(bh);
- clear_buffer_dirty(bh);
- rw = WRITE;
- } else {
- clear_buffer_uptodate(bh);
- rw = READ;
- }
-
- bh->b_end_io = hb_end_buffer_io_sync;
- hb_submit_bh(rw, bh);
- idx++;
+ bh = slot->ds_bh;
+ if (bh == NULL) {
+ hbprintk("no bh in slot %d!\n", i);
+ continue;
}
-
- idx = 0;
- while ((slot = nm_iterate_group_disk_slots(group, &idx))) {
- bh = slot->bh;
- if (!bh) {
- hbprintk("no bh in slot %d!\n", idx);
- idx++;
- continue;
- }
+ wait_on_buffer(bh);
+ hb_block = (hb_disk_heartbeat_block *)bh->b_data;
- node = slot->inode;
- if (!node) {
- hbprintk("no inode in slot %d!\n", idx);
- idx++;
- continue;
- }
+ cputime = le64_to_cpu(hb_block->time);
+ if (slot->ds_last_time != cputime) {
+ /* the node is active */
+ if (!list_empty(&slot->ds_dead_item))
+ list_del_init(&slot->ds_dead_item);
- ino = nm_get_node_global_index(node);
+ if (list_empty(&slot->ds_alive_item))
+ list_add_tail(&slot->ds_alive_item, &newborn);
- wait_on_buffer(bh);
- hb_block = (hb_disk_heartbeat_block *)bh->b_data;
+ slot->ds_last_time = cputime;
+ slot->ds_margin = HB_DISK_MARGIN;
+ continue;
+ }
- /* TODO: time stuff */
- cputime = le64_to_cpu(hb_block->time);
- if (slot->last_time != cputime) {
- if (slot->state == HB_NODE_STATE_INIT) {
- hbprintk("first time for this node!\n");
- live_nodes[ino] = node;
- slot->state = HB_NODE_STATE_UP;
- }
- node->i_atime = mk_inode_time(cputime, 0);
- slot->last_time = cputime;
- slot->margin = HB_DISK_MARGIN;
- hb_do_callbacks(HB_NODE_RESPONDED_CB,
- group, node, idx);
- } else {
- slot->margin--;
- hbprintk("node %d missed. margin=%d\n",
- ino, slot->margin);
- }
+ /* only tick down on idlw nodes that we think are alive.
+ * this stops us from getting a sea of node down events for
+ * nodes that have never been active. */
+ if (!list_empty(&slot->ds_dead_item) ||
+ list_empty(&slot->ds_alive_item))
+ continue;
- if (ino != nm_this_node(group) && slot->margin <= 0) {
- hbprintk("node %d JUST DIED!!!!\n", ino);
- dead_nodes[ino] = node;
- slot->state = HB_NODE_STATE_DOWN;
- }
- idx++;
+ /* decrease slot margin to zero as long as we don't
+ * see any updates */
+ if (slot->ds_margin) {
+ if (--slot->ds_margin)
+ continue;
}
- up(&group->i_sem);
+ /* ok, margin is 0, it's really dead */
+ if (list_empty(&slot->ds_dead_item)) {
+ hbprintk("node %d JUST DIED!!!!\n", i);
+ list_add_tail(&slot->ds_dead_item, &deceased);
+ }
+ }
- /* Do holding group i_sem while doing node-up/down.
- * Changes may need to be made to the group, so
- * i_sem will be needed... */
- for (i=0; i<NM_MAX_NODES; i++) {
- if (live_nodes[i])
- ret = hb_do_node_up(group, live_nodes[i], i);
- else if (dead_nodes[i])
- ret = hb_do_node_down(group, dead_nodes[i], i);
+ /* we're the only thing that modifies the lists, we don't have to lock
+ * while we're just reading them. the write locks protect the
+ * fill_node_map readers. */
+ list_for_each_entry(slot, &newborn, ds_alive_item) {
+ node = nm_get_node_by_num(slot->ds_node_num);
+ if (node == NULL) {
+ hbprintk("saw hb for node %d but don't have a node\n",
+ slot->ds_node_num);
+ continue;
}
+ hb_do_node_up(node, slot->ds_node_num);
+ nm_node_put(node);
}
-
- spin_lock(&hb_lock);
- list_splice(&tmplist, &hb_disk_groups);
- spin_unlock(&hb_lock);
- return 0;
+ list_for_each_entry(slot, &deceased, ds_dead_item) {
+ node = nm_get_node_by_num(slot->ds_node_num);
+ if (node == NULL) {
+ hbprintk("node %d went down but don't have a node\n",
+ slot->ds_node_num);
+ continue;
+ }
+ hb_do_node_down(node, slot->ds_node_num);
+ nm_node_put(node);
+ }
+
+ write_lock(&reg->hr_slot_list_lock);
+ list_splice_init(&newborn, &reg->hr_alive_list);
+ list_splice_init(&deceased, &reg->hr_dead_list);
+ write_unlock(&reg->hr_slot_list_lock);
}
+/*
+ * we ride the region ref that the region dir holds. before the region
+ * dir is removed and drops it ref it will wait to tear down this
+ * thread.
+ */
static int hb_thread(void *data)
{
- void *page = data;
+ struct hb_region *reg = data;
hbprintk("hb thread running\n");
while (!kthread_should_stop()) {
- hb_do_disk_heartbeat(page);
+ hb_do_disk_heartbeat(reg);
set_current_state(TASK_UNINTERRUPTIBLE);
schedule_timeout(msecs_to_jiffies(HB_THREAD_MS));
}
hbprintk("hb thread exiting\n");
- free_page((unsigned long)page);
+
return 0;
}
-/* Launch the hb thread for the mounted volume */
-static int hb_launch_thread(void)
+void hb_init(void)
{
- void *page;
- int ret;
+ int i;
- page = (void *)__get_free_page(GFP_KERNEL);
- if (!page) {
- ret = -ENOMEM;
- goto out;
+ for (i = 0; i < ARRAY_SIZE(hb_callbacks); i++) {
+ INIT_LIST_HEAD(&hb_callbacks[i].list);
+ init_MUTEX(&hb_callbacks[i].sem);
}
+}
- hbprintk("starting hb thread...\n");
- hb_task = kthread_run(hb_thread, page, "hb_thread");
- if (IS_ERR(hb_task)) {
- hb_task = NULL;
- hbprintk("unable to launch hb thread, error=%ld",
- PTR_ERR(hb_task));
- ret = -EINVAL;
- goto out;
+/*
+ * get a map of all nodes that are heartbeating in any regions
+ */
+void hb_fill_node_map(unsigned long *map, unsigned bytes)
+{
+ struct hb_region *reg;
+ struct hb_disk_slot *slot;
+
+ BUG_ON(bytes < (BITS_TO_LONGS(NM_MAX_NODES) * sizeof(unsigned long)));
+
+ memset(map, 0, bytes);
+
+ /* callers want to serialize this map and callbacks so that they
+ * can trust that they don't miss nodes coming to the party */
+ down_read(&hb_callback_sem);
+
+ list_for_each_entry(reg, &hb_active_regions, hr_active_item) {
+ read_lock(&reg->hr_slot_list_lock);
+ list_for_each_entry(slot, &reg->hr_alive_list, ds_alive_item)
+ set_bit(slot->ds_node_num, map);
+ read_unlock(&reg->hr_slot_list_lock);
}
- /* hb_thread is responsible for freeing the page if it runs */
- page = NULL;
- ret = 0;
+ up_read(&hb_callback_sem);
-out:
- if (page)
- free_page((unsigned long)page);
- return ret;
+ /* our node is Always Up */
+ set_bit(nm_this_node(), map);
}
+EXPORT_SYMBOL(hb_fill_node_map);
-static void hb_complete_thread(void)
+/*
+ * heartbeat usysfs bits. The heartbeat set is a default set under
+ * the cluster set in nodemanager.c.
+ */
+
+static struct hb_region *to_hb_region(struct kobject *kobj)
{
- if (hb_task) {
- hbprintk("waiting for hb thread to exit\n");
- kthread_stop(hb_task);
- hb_task = NULL;
- }
+ return kobj ? container_of(kobj, struct hb_region, hr_kobj) : NULL;
}
-static int hb_init_disk_hb_group(struct inode *group, dev_t dev, u32 bits,
- u32 blocks, u64 start)
+/* drop_object only drops its ref after killing the thread, nothing should
+ * be using the region anymore. this has to clean up any state that
+ * attributes might have built up. */
+static void hb_region_release(struct kobject *kobj)
{
- int ret = -EINVAL;
- cluster_disk *disk;
- nm_group_inode_private *priv;
+ struct hb_region *reg = to_hb_region(kobj);
+ printk("releasing reg %p\n", reg);
- priv = group->u.generic_ip;
- if (!priv)
- goto leave;
+ if (reg->hr_bdev)
+ blkdev_put(reg->hr_bdev);
+ kfree(reg->hr_slots); /* might be null if never activated */
+ kfree(reg);
+}
- if (priv->state == NM_GROUP_READY)
- return 0;
+static ssize_t hb_region_block_bytes_read(struct hb_region *reg, char *page)
+{
+ return sprintf(page, "%lld\n", (long long)reg->hr_block_bytes);
+}
- /* hold an extra ref as long as hb keeps track of the group */
- igrab(group);
+static ssize_t hb_region_block_bytes_write(struct hb_region *reg,
+ const char *page,
+ size_t count)
+{
+ unsigned long long tmp;
+ char *p = (char *)page;
- disk = &priv->disk;
- if (blocks > NM_MAX_NODES)
- blocks = NM_MAX_NODES;
- disk->dev = dev;
- disk->blocksize_bits = bits;
- disk->num_blocks = blocks;
- disk->start_block = start;
- util_init_rarray(&disk->slots, sizeof(hb_disk_slot));
+ tmp = simple_strtoull(p, &p, 0);
+ if (!p || (*p && (*p != '\n')))
+ return -EINVAL;
- /* start allowing group additions */
- ret = nm_make_group_ready(group);
+ /* XXX probably very stupid. */
+ if (tmp >= 65536) /* just so we can use hweight16 */
+ return -ERANGE;
+ if (hweight16(tmp) != 1)
+ return -EINVAL;
-leave:
- if (ret < 0)
- iput(group);
+ /* XXX compare blocks against dev later on commit? */
+ reg->hr_block_bytes = tmp;
+ /* XXX is this right? */
+ reg->hr_block_bits = ffs(reg->hr_block_bytes) - 1;
- return ret;
+ return count;
}
-
+static ssize_t hb_region_start_block_read(struct hb_region *reg, char *page)
+{
+ return sprintf(page, "%lld\n", (long long)reg->hr_start_block);
+}
-static ssize_t write_disk(struct file *file, char *buf, size_t size)
+static ssize_t hb_region_start_block_write(struct hb_region *reg,
+ const char *page,
+ size_t count)
{
- hb_op *data;
- struct inode *group = NULL;
- struct file *filp = NULL;
- dev_t dev;
- int ret, tmpret;
- nm_group_inode_private *priv;
- u8 tmpmap[NM_MAX_NODES];
-
- hbprintk("write_disk\n");
+ unsigned long long tmp;
+ char *p = (char *)page;
- if (size < sizeof(*data))
- return -EINVAL;
- data = (hb_op *) buf; if (data->magic != HB_OP_MAGIC)
+ tmp = simple_strtoull(p, &p, 0);
+ if (!p || (*p && (*p != '\n')))
return -EINVAL;
- switch (data->opcode)
- {
- case HB_OP_START_DISK_HEARTBEAT:
- if (data->bits < 9 || data->bits > 12) {
- ret = sprintf(buf, "%d: bad blocksize bits! %u",
- -EINVAL, data->bits);
- break;
- }
- group = nm_get_group_by_num(data->group_num);
- if (!group || !group->u.generic_ip) {
- ret = sprintf(buf, "%d: bad group number! %u",
- -EINVAL, data->group_num);
- break;
- }
- priv = group->u.generic_ip;
- if (strncmp(priv->disk.uuid, data->disk_uuid,
- CLUSTER_DISK_UUID_LEN) != 0) {
- ret = sprintf(buf, "%d: bad disk uuid!",
- -EINVAL);
- break;
- }
- filp = fget(data->fd);
- if (!filp) {
- ret = sprintf(buf, "%d: bad fd!", -EINVAL);
- break;
- }
- dev = filp->f_dentry->d_inode->i_rdev;
- tmpret = hb_init_disk_hb_group(group, dev, data->bits,
- data->blocks,
- data->start);
- if (tmpret < 0) {
- fput(filp);
- ret = sprintf(buf, "%d: failed to init disk "
- "heartbeat for group %u!",
- -EINVAL, data->group_num);
- } else {
- ret = sprintf(buf, "0: disk heartbeat started "
- "for group %u!", data->group_num);
- }
- break;
+ /* XXX compare blocks against dev later on commit? */
+ reg->hr_start_block = tmp;
- case HB_OP_GET_NODE_MAP:
- group = nm_get_group_by_num(data->group_num);
- if (!group || !group->u.generic_ip) {
- ret = sprintf(buf, "%d: bad group number! %u",
- -EINVAL, data->group_num);
- break;
- }
-
- memset(tmpmap, 0, sizeof(tmpmap));
- if ((ret = hb_fill_node_bytemap(group, tmpmap,
- NM_MAX_NODES)) == 0) {
- ret = sprintf(buf, "0: ");
- buf += ret;
- memcpy(buf, tmpmap, sizeof(tmpmap));
- ret += sizeof(tmpmap);
- } else {
- ret = sprintf(buf, "%d: error occurred in "
- "hb_fill_node_bytemap", ret);
- }
- break;
-
- default:
- ret = sprintf(buf, "%d: bad opcode! %u", -EINVAL,
- data->opcode);
- break;
- }
-
- if (group)
- iput(group);
-
- return ret;
+ return count;
}
+static ssize_t hb_region_blocks_read(struct hb_region *reg, char *page)
+{
+ return sprintf(page, "%d\n", reg->hr_blocks);
+}
+static ssize_t hb_region_blocks_write(struct hb_region *reg,
+ const char *page,
+ size_t count)
+{
+ unsigned long long tmp;
+ char *p = (char *)page;
-extern struct file_operations transaction_ops;
+ tmp = simple_strtoull(p, &p, 0);
+ if (!p || (*p && (*p != '\n')))
+ return -EINVAL;
-/*----------------------------------------------------------------------------*/
-/*
- * populating the filesystem.
- */
-static int hb_fill_super(struct super_block * sb, void * data, int silent)
-{
- int ret;
- struct TA_write_ops *ops;
- static struct tree_descr hb_files[] = {
- [HB_Disk] = {".disk", &transaction_ops, S_IWUSR},
- /* last one */ {""}
- };
-
- ops = kmalloc(sizeof(struct TA_write_ops) +
- (sizeof(ops->write_op[0])),
- GFP_KERNEL);
- if (!ops)
+ if (tmp >= NM_MAX_NODES)
+ return -ERANGE;
+
+ reg->hr_slots = kcalloc(tmp, sizeof(struct hb_disk_slot), GFP_KERNEL);
+ if (reg->hr_slots == NULL)
return -ENOMEM;
- ops->num_ops = HB_WriteOpArraySize;
- ops->write_op[HB_Disk] = write_disk;
+ reg->hr_blocks = tmp;
- hbprintk("calling simple_fill_super...\n");
- ret = simple_fill_super(sb, 0x5551212f, hb_files);
- if (ret >= 0)
- TA_GENERIC_SB_MEMBER(sb) = ops;
- else
- kfree(ops);
- return ret;
+ return count;
}
-/* blindly copied from ocfs2 */
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
-static struct super_block *hb_get_sb(struct file_system_type *fs_type,
- int flags,
- const char *dev_name,
- void *data)
+static ssize_t hb_region_dev_read(struct hb_region *reg, char *page)
{
- return get_sb_single(fs_type, flags, data, hb_fill_super);
+ const char *str = bdevname(reg->hr_bdev, page);
+ return sprintf(page, "%s\n", str);
}
-static struct file_system_type hb_fs_type = {
- .owner = THIS_MODULE,
- .name = "hb",
- .get_sb = hb_get_sb,
- .kill_sb = kill_anon_super,
-};
-#else
-static struct super_block *hb_read_super(struct super_block *sb,
- void *data,
- int silent)
+/* this is acting as commit; we set up all of hr_bdev and hr_task or nothing */
+static ssize_t hb_region_dev_write(struct hb_region *reg, const char *page,
+ size_t count)
{
- hbprintk("welcome to hb_read_super!!!\n");
- return (hb_fill_super(sb, data, silent) < 0) ? NULL : sb;
-}
-static DECLARE_FSTYPE (hb_fs_type, "hb", hb_read_super, FS_SINGLE|FS_LITTER);
-#endif
+ long fd;
+ char *p = (char *)page;
+ struct file *filp = NULL;
+ struct inode *inode = NULL;
+ ssize_t ret = -EINVAL;
+ struct hb_disk_slot *slot;
+ int i;
-/* TODO: make callbacks all return int */
-static void hb_nm_group_node_add_cb(void *ptr1, void *ptr2, u8 idx)
-{
- hb_disk_slot *slot;
- struct inode *group = ptr1;
- struct inode *node = ptr2;
- cluster_disk *disk;
- nm_group_inode_private *priv;
- int ino, ret = 0;
- u64 block;
+ fd = simple_strtol(p, &p, 0);
+ if (!p || (*p && (*p != '\n')))
+ goto out;
- hbprintk("hb_nm_group_node_add_cb: group=%lu, node=%lu, idx=%u\n",
- group->i_ino, node->i_ino, idx);
+ if (fd < 0 || fd >= INT_MAX)
+ goto out;
- down(&group->i_sem);
- priv = group->u.generic_ip;
- if (!priv) {
- hbprintk("eek! bad group inode!\n");
- goto leave;
- }
- disk = &priv->disk;
- if (disk->uuid[0]) {
- ret = util_resize_rarray(&disk->slots, idx+1);
- if (ret < 0) {
- hbprintk("eeeeeeek!!!! failed to resize disk state "
- "data\n");
- goto leave;
- }
-
- ino = nm_get_node_global_index(node);
- if (ino > disk->num_blocks) {
- hbprintk("disk heartbeat area does not have enough "
- "blocks!\n");
- goto leave;
- }
- block = ino + disk->start_block;
-
- slot = util_rarray_idx_to_slot(&disk->slots, idx);
- if (!slot) {
- hbprintk("eeeeeeek!!!! failed to get disk state data "
- "pointer: %d\n", idx);
- goto leave;
- }
- slot->inode = igrab(node);
- slot->last_time = 0;
- slot->margin = HB_INITIAL_DISK_MARGIN;
-#warning needs to change for 2.6
- slot->bh = hb_getblk(block, disk);
- slot->state = HB_NODE_STATE_INIT;
- } else {
- hbprintk("doing nothing for group add for non-disk heartbeat "
- "group\n");
- }
-
-leave:
- up(&group->i_sem);
- return;
-}
+ filp = fget(fd);
+ if (filp == NULL)
+ goto out;
-static void hb_nm_group_node_del_cb(void *ptr1, void *ptr2, u8 idx)
-{
- hb_disk_slot *slot;
- struct inode *group = ptr1;
- struct inode *node = ptr2;
- cluster_disk *disk;
- nm_group_inode_private *priv;
- int ret = -EINVAL;
+ if (reg->hr_blocks == 0 || reg->hr_slots == NULL)
+ goto out;
- hbprintk("hb_nm_group_node_del_cb: group=%lu, node=%lu, idx=%u\n",
- group->i_ino, node->i_ino, idx);
+ inode = igrab(filp->f_mapping->host);
+ if (inode == NULL)
+ goto out;
+ reg->hr_bdev = I_BDEV(filp->f_mapping->host);
- down(&group->i_sem);
- priv = group->u.generic_ip;
- if (!priv) {
- hbprintk("eek! bad group inode!\n");
- goto leave;
+ ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ, 0);
+ if (ret) {
+ reg->hr_bdev = NULL;
+ goto out;
}
- disk = &priv->disk;
- slot = util_rarray_idx_to_slot(&disk->slots, idx);
- if (!slot) {
- hbprintk("eeeeeeek!!!! failed to get disk state data "
- "pointer: %d\n", idx);
- goto leave;
+ inode = NULL;
+
+ for(i = 0; i < reg->hr_blocks; i++) {
+ slot = &reg->hr_slots[i];
+ slot->ds_block = reg->hr_start_block + i;
+ slot->ds_node_num = i;
+ slot->ds_margin = HB_INITIAL_DISK_MARGIN;
+ INIT_LIST_HEAD(&slot->ds_alive_item);
+ INIT_LIST_HEAD(&slot->ds_dead_item);
}
- if (slot->inode!=node) {
- hbprintk("eeeeeeek!!!! node inode changed!\n");
- goto leave;
+
+ reg->hr_task = kthread_run(hb_thread, reg, "hb-%s",
+ reg->hr_kobj.k_name);
+ if (IS_ERR(reg->hr_task)) {
+ reg->hr_task = NULL;
+ goto out;
}
- iput(node);
- if (slot->bh) {
- wait_on_buffer(slot->bh);
- brelse(slot->bh);
- }
- memset(slot, 0, sizeof(hb_disk_slot));
- ret = 0;
-leave:
- up(&group->i_sem);
- hbprintk("hb_nm_group_node_del_cb done: %d\n", ret);
- return;
-}
+ down_write(&hb_callback_sem);
+ list_add_tail(&reg->hr_active_item, &hb_active_regions);
+ up_write(&hb_callback_sem);
-static void hb_nm_node_add_cb(void *ptr1, void *ptr2, u8 idx)
-{
- //struct inode *node = ptr1;
+ ret = count;
+out:
+ if (filp)
+ fput(filp);
+ if (inode)
+ iput(inode);
+ if (ret < 0) {
+ if (reg->hr_bdev) {
+ blkdev_put(reg->hr_bdev);
+ reg->hr_bdev = NULL;
+ }
+ }
+ return ret;
}
+struct hb_region_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct hb_region *, char *);
+ ssize_t (*store)(struct hb_region *, const char *, size_t);
+};
-static void hb_nm_group_add_cb(void *ptr1, void *ptr2, u8 idx)
-{
- struct inode *group = ptr1;
- nm_group_inode_private *priv;
+static struct hb_region_attribute hb_region_attr_block_bytes = {
+ .attr = { .name = "block_bytes", .mode = S_IRUGO | S_IWUSR },
+ .show = hb_region_block_bytes_read,
+ .store = hb_region_block_bytes_write,
+};
+static struct hb_region_attribute hb_region_attr_start_block = {
+ .attr = { .name = "start_block", .mode = S_IRUGO | S_IWUSR },
+ .show = hb_region_start_block_read,
+ .store = hb_region_start_block_write,
+};
+static struct hb_region_attribute hb_region_attr_blocks = {
+ .attr = { .name = "blocks", .mode = S_IRUGO | S_IWUSR },
+ .show = hb_region_blocks_read,
+ .store = hb_region_blocks_write,
+};
+static struct hb_region_attribute hb_region_attr_dev = {
+ .attr = { .name = "dev", .mode = S_IRUGO | S_IWUSR },
+ .show = hb_region_dev_read,
+ .store = hb_region_dev_write,
+};
- hbprintk("hb_nm_group_add_cb: group=%lu, idx=%u\n",
- group->i_ino, idx);
-
- priv = group->u.generic_ip;
- if (!priv) {
- hbprintk("eek! bad group inode!\n");
- return;
- }
+static struct attribute *hb_region_default_attrs[] = {
+ &hb_region_attr_block_bytes.attr,
+ &hb_region_attr_start_block.attr,
+ &hb_region_attr_blocks.attr,
+ &hb_region_attr_dev.attr,
+ NULL,
+};
- spin_lock(&hb_lock);
- list_add_tail(&priv->net_list, &hb_net_groups);
- if (priv->disk.uuid[0]) {
- hbprintk("adding priv=%p inode=%p to disk group list\n",
- priv, group);
- list_add_tail(&priv->disk_list, &hb_disk_groups);
- }
- spin_unlock(&hb_lock);
+static ssize_t hb_region_show(struct kobject *kobj,
+ struct attribute *attr,
+ char *page)
+{
+ struct hb_region *reg = to_hb_region(kobj);
+ struct hb_region_attribute *hb_region_attr =
+ container_of(attr, struct hb_region_attribute, attr);
+ ssize_t ret = 0;
+
+ if (hb_region_attr->show)
+ ret = hb_region_attr->show(reg, page);
+ return ret;
}
-static int __init init_hb(void)
+static ssize_t hb_region_store(struct kobject *kobj,
+ struct attribute *attr,
+ const char *page, size_t count)
{
- int retval=-1, i;
- hbprintk("loading heartbeat module: nodename is %s\n", nm_nodename);
+ struct hb_region *reg = to_hb_region(kobj);
+ struct hb_region_attribute *hb_region_attr =
+ container_of(attr, struct hb_region_attribute, attr);
+ ssize_t ret = -EINVAL;
- if (proc_mkdir("cluster/heartbeat", 0)) {
- // ???
- }
+ if (hb_region_attr->store)
+ ret = hb_region_attr->store(reg, page, count);
+ return ret;
+}
- //hb_net_timestamps = __get_free_page(GFP_KERNEL);
- //if (!hb_net_timestamps)
- // goto done;
+struct sysfs_ops hb_region_sysfs_ops = {
+ .show = &hb_region_show,
+ .store = &hb_region_store,
+};
- for (i=HB_NODE_DOWN_CB; i<HB_NUM_CB; i++) {
- INIT_LIST_HEAD(&hb_callbacks[i].list);
- init_MUTEX(&hb_callbacks[i].sem);
- }
+static struct ukobj_type hb_region_type = {
+ .ktype = {
+ .release = hb_region_release,
+ .sysfs_ops = &hb_region_sysfs_ops,
+ .default_attrs = hb_region_default_attrs,
+ },
+ .owner = THIS_MODULE,
+};
- if (nm_register_callback(NM_GROUP_NODE_DEL_CB, hb_nm_group_node_del_cb))
- goto done;
- if (nm_register_callback(NM_GROUP_NODE_ADD_CB, hb_nm_group_node_add_cb))
- goto done;
- if (nm_register_callback(NM_NODE_ADD_CB, hb_nm_node_add_cb))
- goto done;
- if (nm_register_callback(NM_GROUP_ADD_CB, hb_nm_group_add_cb))
- goto done;
-
- if (hb_launch_thread() < 0)
- goto done;
-
- retval = register_filesystem(&hb_fs_type);
-done:
- if (retval)
- hb_teardown();
- return retval;
-}
+/* heartbeat set */
-static void __exit exit_hb(void)
+struct hb_heartbeat_set {
+ struct ukset hs_ukset;
+ /* some stuff? */
+};
+
+static struct hb_heartbeat_set *to_hb_heartbeat_set(struct kset *kset)
{
- hb_complete_thread();
- hb_teardown();
- unregister_filesystem(&hb_fs_type);
- hbprintk("unloading heartbeat module\n");
+ return kset ?
+ container_of(to_ukset(kset), struct hb_heartbeat_set, hs_ukset)
+ : NULL;
}
-static void hb_teardown(void)
+static struct kobject *hb_heartbeat_set_make_object(struct kset *kset,
+ const char *name)
{
- nm_unregister_callback(NM_GROUP_NODE_DEL_CB, hb_nm_group_node_del_cb);
- nm_unregister_callback(NM_GROUP_NODE_ADD_CB, hb_nm_group_node_add_cb);
- nm_unregister_callback(NM_NODE_ADD_CB, hb_nm_node_add_cb);
- nm_unregister_callback(NM_GROUP_ADD_CB, hb_nm_group_add_cb);
- remove_proc_entry("cluster/heartbeat", NULL);
+ struct hb_region *reg = NULL;
+ struct kobject *ret = NULL;
- /* XXX make sure that we're not being called from any more active
- * nm callbacks, then teardown hb_callbacks */
- //if (hb_net_timestamps)
- // kfree(hb_net_timestamps);
-}
+ printk("trying to make a heartbeat object\n");
-MODULE_LICENSE("GPL");
-module_init(init_hb)
-module_exit(exit_hb)
+ reg = kcalloc(1, sizeof(struct hb_region), GFP_KERNEL);
+ if (reg == NULL)
+ goto out; /* ENOMEM */
-/*
- * hb_fill_node_bytemap()
- * 255 bytes... each byte set to 0 (not mounted) or 1 (mounted)
- *
- */
-static int hb_fill_node_bytemap(struct inode *group, void *map, int size)
-{
- hb_disk_slot *slot;
- int idx = 0;
- nm_group_inode_private *priv;
- u8 *bytemap = (u8 *)map;
-
- priv = group->u.generic_ip;
+ INIT_LIST_HEAD(&reg->hr_active_item);
+ rwlock_init(&reg->hr_slot_list_lock);
+ INIT_LIST_HEAD(&reg->hr_alive_list);
+ INIT_LIST_HEAD(&reg->hr_dead_list);
- down(&group->i_sem);
+ kobject_set_name(&reg->hr_kobj, name);
+ reg->hr_kobj.ktype = &hb_region_type.ktype;
+ kobject_init(&reg->hr_kobj);
- if (priv->disk.uuid[0]) {
- while ((slot = nm_iterate_group_disk_slots(group, &idx))) {
- if (idx >= size-1) {
- hbprintk("map size (%d) too small for "
- "index (%d)\n", size, idx);
- up(&group->i_sem);
- return -EINVAL;
- }
- if (slot->state == HB_NODE_STATE_UP)
- bytemap[idx] = 1;
- idx++;
- }
- } else {
- hbprintk("filling straight from slot bitmap for non-disk "
- "heartbeat group\n");
- idx = 0;
- while ((idx = find_next_bit(priv->slot_bitmap, NM_MAX_NODES,
- idx)) != -1) {
- if (idx == NM_MAX_NODES)
- break;
- bytemap[idx] = 1;
- }
- }
+ ret = &reg->hr_kobj;
- up(&group->i_sem);
+out:
+ if (ret == NULL)
+ kfree(reg);
- return 0;
+ return ret;
}
-int hb_fill_node_map(struct inode *group, void *map, int size)
+static void hb_heartbeat_set_drop_object(struct kset *kset,
+ struct kobject *kobj)
{
- hb_disk_slot *slot;
- int idx = 0;
- nm_group_inode_private *priv;
-
- priv = group->u.generic_ip;
+ struct hb_region *reg = to_hb_region(kobj);
- memset(map, 0, size);
- down(&group->i_sem);
+ printk("dropping hr_region %p\n", reg);
- if (priv->disk.uuid[0]) {
- while ((slot = nm_iterate_group_disk_slots(group, &idx))) {
- if (idx >= size-1) {
- hbprintk("map size (%d) too small for "
- "index (%d)\n", size, idx);
- up(&group->i_sem);
- return -EINVAL;
- }
- if (slot->state == HB_NODE_STATE_UP)
- set_bit(idx, map);
- idx++;
- }
- } else {
- hbprintk("filling straight from slot bitmap for non-disk "
- "heartbeat group\n");
- memcpy(map, priv->slot_bitmap, size);
+ down_read(&hb_callback_sem);
+ if (!list_empty(&reg->hr_active_item))
+ list_del_init(&reg->hr_active_item);
+ up_read(&hb_callback_sem);
+
+ /* stop the thread when the user removes the region dir */
+ if (reg->hr_task) {
+ kthread_stop(reg->hr_task);
+ reg->hr_task = NULL;
}
- up(&group->i_sem);
+ kobject_put(kobj);
+}
- return 0;
+struct ukobj_type hb_heartbeat_set_type = {
+ .ktype = {
+ .sysfs_ops = NULL, /* no attributes */
+ },
+ .make_object = hb_heartbeat_set_make_object,
+ .drop_object = hb_heartbeat_set_drop_object,
+ .owner = THIS_MODULE,
+};
+
+/* this is just here to avoid touching ukset in heartbeat.h which the
+ * entire damn world #includes */
+struct kset *hb_alloc_hb_set(void)
+{
+ struct hb_heartbeat_set *hs = NULL;
+ struct kset *ret = NULL;
+
+ hs = kcalloc(1, sizeof(struct hb_heartbeat_set), GFP_KERNEL);
+ if (hs == NULL)
+ goto out;
+
+ ukset_init_type_name(&hs->hs_ukset, "heartbeat",
+ &hb_heartbeat_set_type.ktype);
+
+ ret = &hs->hs_ukset.kset;
+out:
+ if (ret == NULL)
+ kfree(hs);
+ return ret;
}
-EXPORT_SYMBOL(hb_fill_node_map);
+void hb_free_hb_set(struct kset *kset)
+{
+ struct hb_heartbeat_set *hs = to_hb_heartbeat_set(kset);
+ kfree(hs);
+}
+
+
+/* hb callback registration and issuing */
+
static struct hb_callback *hbcall_from_type(int type)
{
if (type < HB_NODE_DOWN_CB || type >= HB_NUM_CB)
@@ -964,7 +860,7 @@
}
EXPORT_SYMBOL(hb_unregister_callback);
-static void hb_do_callbacks(int type, void *ptr1, void *ptr2, int idx)
+static void hb_do_callbacks(int type, struct nm_node *node, int idx)
{
struct list_head *iter;
struct hb_callback_func *f;
@@ -974,15 +870,19 @@
if (IS_ERR(hbcall))
return;
+ /* XXX not interruptible? this is in the hb thread.. */
+ down_write(&hb_callback_sem);
+
if (down_interruptible(&hbcall->sem)) {
hbprintk("missed hb callback(%d) due to EINTR!\n", type);
- return;
+ goto out;
}
list_for_each(iter, &hbcall->list) {
f = list_entry(iter, struct hb_callback_func, hc_item);
- (f->hc_func)(ptr1, ptr2, idx, f->hc_data);
+ (f->hc_func)(node, idx, f->hc_data);
}
-
up(&hbcall->sem);
+out:
+ up_write(&hb_callback_sem);
}
Modified: trunk/fs/ocfs2/cluster/heartbeat.h
===================================================================
--- trunk/fs/ocfs2/cluster/heartbeat.h 2005-03-18 04:25:45 UTC (rev 2005)
+++ trunk/fs/ocfs2/cluster/heartbeat.h 2005-03-18 06:01:10 UTC (rev 2006)
@@ -30,48 +30,15 @@
#include "ocfs2_heartbeat.h"
-enum {
- HB_NODE_STATE_INIT = 0,
- HB_NODE_STATE_DOWN,
- HB_NODE_STATE_UP
-};
-
-struct _heartbeat_ctxt
-{
- int dummy;
-};
-
-typedef struct _hb_disk_slot
-{
- struct inode *inode;
- struct buffer_head *bh;
- struct list_head list;
- unsigned long last_time;
- u16 margin;
- u16 state;
-} hb_disk_slot;
-
-
-
-#define HB_THREAD_MS 2000 // every 2 seconds
-
-
-enum {
- HB_TYPE_DISK = 0,
- HB_TYPE_NET
-};
-
-
/* callback stuff */
-
enum {
HB_NODE_DOWN_CB = 0,
HB_NODE_UP_CB,
- HB_NODE_RESPONDED_CB, // this one is very chatty
HB_NUM_CB
};
-typedef void (hb_cb_func)(struct inode *, struct inode *, int, void *);
+struct nm_node;
+typedef void (hb_cb_func)(struct nm_node *, int, void *);
struct hb_callback_func {
struct list_head hc_item;
@@ -81,28 +48,18 @@
int hc_type;
};
-enum {
- HB_Root = 1,
- HB_Disk,
- HB_WriteOpArraySize
-};
-
-
-// number of initial allowed misses
-#define HB_INITIAL_DISK_MARGIN 60
-#define HB_INITIAL_NET_MARGIN 60
-
// number of allowed misses in steady state
+#define HB_INITIAL_DISK_MARGIN 60
#define HB_DISK_MARGIN 30
-#define HB_NET_MARGIN 30
+struct kset *hb_alloc_hb_set(void);
+void hb_free_hb_set(struct kset *kset);
void hb_setup_callback(struct hb_callback_func *hc, int type, hb_cb_func *func,
void *data, int priority);
int hb_register_callback(struct hb_callback_func *hc);
int hb_unregister_callback(struct hb_callback_func *hc);
-int hb_fill_node_map(struct inode *group, void *map, int size);
+void hb_fill_node_map(unsigned long *map, unsigned bytes);
+void hb_init(void);
-
-
#endif /* CLUSTER_HEARTBEAT_H */
Modified: trunk/fs/ocfs2/cluster/nodemanager.c
===================================================================
--- trunk/fs/ocfs2/cluster/nodemanager.c 2005-03-18 04:25:45 UTC (rev 2005)
+++ trunk/fs/ocfs2/cluster/nodemanager.c 2005-03-18 06:01:10 UTC (rev 2006)
@@ -1,12 +1,8 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
- * nodemanager.c
+ * Copyright (C) 2004, 2005 Oracle. All rights reserved.
*
- * totally lame static node management placeholder
- *
- * Copyright (C) 2004 Oracle. All rights reserved.
- *
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
@@ -21,14 +17,10 @@
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
- *
- * Authors: Kurt Hackel
*/
-
#include "cl_compat.h"
#include "util.h"
-#include "clcommon.h"
#include <linux/module.h>
#include <linux/kernel.h>
@@ -64,6 +56,7 @@
#include <linux/pagemap.h>
#include <linux/hash.h>
#include <linux/bitops.h>
+#include <linux/kobject.h>
#include <asm/uaccess.h>
@@ -73,116 +66,17 @@
#include "compat_libfs.h"
#include "transaction_file.h"
+#include "usysfs.h"
#ifndef __user
#define __user
#endif
+/* for now we operate under the assertion that there can be only one
+ * cluster active at a time. Changing this will require trickling
+ * cluster references throughout where nodes are looked up */
+static struct nm_cluster *nm_single_cluster = NULL;
-/*
- * This nm module is similar to nfsd/nfsctl.c in that it uses
- * transaction files (in /proc/cluster/nm) to communicate with
- * the kernel module instead of ioctls or other means.
- *
- * Files involved:
- * /proc/cluster/nm/cluster - used to create/destroy cluster, adds
- * nodes/groups to the cluster, queries info
- * about the cluster
- * /proc/cluster/nm/group - adds/removes nodes from a group, queries
- * info about a group
- * /proc/cluster/nm/node - changes info for a node, queries info about
- * a node
- *
- * This nm implementation basically allows this node to live in exactly one
- * cluster. All "clustered" nodes that are known to this node should be
- * added to the cluster, and all nodes should see the same list of nodes in
- * the same order at all times. The "slot" number given to a node in this
- * global cluster list is fixed and never changes. Groups can be dynamically
- * created within a cluster (TODO: currently static only) and be made up of
- * one or more nodes (listed at most once) in the global list. A node may exist
- * in many groups. Also, a group may have an optional disk UUID which is simply
- * stored for later use by the heartbeat service. (The heartbeat service will
- * do disk heartbeating only for those groups with valid UUIDs.)
- *
- * USAGE:
- * For our purposes, the nm service can be autoloaded by an fstab entry or
- * manually through mount (mount -t nm none /proc/cluster/nm). Once that is
- * done, an init script (or single executable on an initrd) should be run to
- * create the static cluster info, possibly from a file like /etc/nm.conf or
- * similar. We should probably create a "dlm" or "everyone" group (with NO disk
- * heartbeating) so that the dlm service can be used with the network only.
- * This group should contain all known nodes. After this is done, the net, hb
- * and dlm modules can come up. The nm service is now ready for use, since
- * groups don't need to be created till later.
- *
- * A group services daemon can be written (by someone!? ;-) to run at this
- * point. Since the "dlm" group has everything it needs for full dlmming (since
- * it uses only network), the dlm itself can be used to arbitrate for group
- * creation, and additions/deletions from groups. Callbacks should be
- * registered with nm by other services that care on each of these events. For
- * instance, heartbeat should register a callback with nm for group creation,
- * and addition and deletion from a group so that it can make any necessary
- * changes to its heartbeating (primarily so that it can begin/end disk
- * heartbeat for any group/node that needs it).
- *
- * NOTE NOTE NOTE !!!!:
- * This is intended to be a quickie implementation. (translation: lame) I do
- * not want to step on anyone's toes who may have implemented something wayyy
- * better. If something out there "wins", we will plug into that instead. If
- * nothing really takes off, we at least have a (lame) reference to work off of.
- * However, since this implementation exists solely to make ocfs2 work, and one
- * of the major advantages of ocfs version 1 was ease of setup, we don't want
- * to move to something substantially more complicated than this (one conf
- * file).
- *
- */
-
-
-
-/* globals */
-nm_cluster cluster;
-struct super_block *single_sb;
-char *nm_nodename;
-EXPORT_SYMBOL(nm_nodename);
-static spinlock_t nm_lock = SPIN_LOCK_UNLOCKED;
-static DECLARE_MUTEX(nm_cb_sem);
-struct list_head nm_callbacks[NM_NUM_CB];
-
-
-static void nm_teardown(void);
-static int nm_create_cluster(char *buf);
-static void nm_init_cluster(nm_cluster *cluster);
-int nm_create_node(char *buf, nm_op *data);
-int nm_name_cluster(char *buf, nm_op *data);
-int nm_destroy_cluster(char *buf);
-int nm_get_cluster_num_nodes(char *buf);
-int nm_get_cluster_num_groups(char *buf);
-int nm_get_node_info(char *buf, nm_op *data);
-int nm_get_group_info(char *buf, nm_op *data);
-nm_cluster *nm_get_cluster(void);
-struct inode *nm_get_node_by_name(char *node_name);
-static void nm_do_callbacks(int type, void *ptr1, void *ptr2, u8 idx);
-
-/* support for adding files, dirs, hardlinks in /proc/cluster/nm/... */
-extern struct file_operations simple_dir_operations;
-extern struct inode_operations simple_dir_inode_operations;
-
-static inline int nm_find_next_slot(void *bitmap, int max, int request);
-static struct dentry * nm_add_file(struct super_block *s, struct dentry *parent, struct tree_descr *file, int ino);
-static struct dentry * nm_add_link(struct super_block *s, struct dentry *parent, struct tree_descr *file, int ino);
-
-static ssize_t write_node(struct file *file, char *buf, size_t size);
-static ssize_t write_group(struct file *file, char *buf, size_t size);
-static ssize_t write_cluster(struct file *file, char *buf, size_t size);
-
-static u8 nm_get_group_index(struct inode *group, struct inode *inode, struct dentry **child);
-
-#define NM_HASH_BITS 7
-#define NM_HASH_SIZE (1 << NM_HASH_BITS)
-#define NM_HASH_MASK (NM_HASH_SIZE - 1)
-
-
-
#define nmprintk(x, arg...) printk("(nm:%d) " x, current->pid, ##arg)
#define nmprintk0(x) printk("(nm:%d) " x, current->pid)
#if 0
@@ -190,1171 +84,603 @@
#define nmprintk0(x)
#endif
-static struct list_head *nm_ip_hash = NULL;
-static spinlock_t nm_ip_hash_lock;
+struct nm_cluster {
+ struct ukset cl_ukset;
+ unsigned cl_has_local:1;
+ u8 cl_local_node;
+ rwlock_t cl_nodes_lock;
+ struct nm_node *cl_nodes[NM_MAX_NODES];
+ struct rb_root cl_node_ip_tree;
+};
-static int nm_init_ip_hash(void);
-static void nm_destroy_ip_hash(void);
+struct nm_node * nm_get_node_by_num(u8 node_num)
+{
+ struct nm_node *node = NULL;
+ if (node_num >= NM_MAX_NODES || nm_single_cluster == NULL)
+ goto out;
-static void nm_destroy_ip_hash(void)
-{
- int i;
- if (!nm_ip_hash)
- return;
- for (i=0; i<NM_HASH_SIZE; i++) {
- /* TODO: cleanup */
- }
- free_page((unsigned long)nm_ip_hash);
+ read_lock(&nm_single_cluster->cl_nodes_lock);
+ node = nm_single_cluster->cl_nodes[node_num];
+ if (node) /* XXX get a ref */
+ ;
+ read_unlock(&nm_single_cluster->cl_nodes_lock);
+out:
+ return node;
}
+EXPORT_SYMBOL(nm_get_node_by_num);
-static int nm_init_ip_hash(void)
+static struct nm_node * nm_node_ip_tree_lookup(struct nm_cluster *cluster,
+ u32 ip_needle,
+ struct rb_node ***ret_p,
+ struct rb_node **ret_parent)
{
- int i;
-
- if ((PAGE_SIZE / sizeof(struct list_head)) < NM_HASH_SIZE) {
- nmprintk("eek! hash size too big for this arch!\n");
- BUG();
- }
+ struct rb_node **p = &cluster->cl_node_ip_tree.rb_node;
+ struct rb_node *parent = NULL;
+ struct nm_node *node, *ret = NULL;
- nm_ip_hash = (struct list_head *) __get_free_page(GFP_KERNEL);
- if (!nm_ip_hash)
- return -ENOMEM;
- for (i=0; i<NM_HASH_SIZE; i++)
- INIT_LIST_HEAD(&nm_ip_hash[i]);
- spin_lock_init(&nm_ip_hash_lock);
- return 0;
-}
+ while (*p) {
+ parent = *p;
+ node = rb_entry(parent, struct nm_node, nd_ip_node);
+ if (ip_needle < node->nd_ipv4_address)
+ p = &(*p)->rb_left;
+ else if (ip_needle > node->nd_ipv4_address)
+ p = &(*p)->rb_right;
+ else {
+ ret = node;
+ break;
+ }
+ }
+
+ if (ret_p != NULL)
+ *ret_p = p;
+ if (ret_parent != NULL)
+ *ret_parent = parent;
-
-
-
-static inline int nm_find_next_slot(void *bitmap, int max, int request)
-{
- int start = 0, slot_num;
- if (request != NM_INVALID_SLOT_NUM)
- start = request;
- slot_num = find_next_zero_bit (bitmap, max, start);
- if (slot_num >= max)
- return -1;
- if (request != NM_INVALID_SLOT_NUM && slot_num != request)
- return -1;
- set_bit(slot_num, bitmap);
- return slot_num;
+ return ret;
}
-
-
-
-static struct dentry * nm_add_file(struct super_block *s, struct dentry *parent,
- struct tree_descr *file, int ino)
+struct nm_node * nm_get_node_by_ip(u32 addr)
{
- struct qstr name;
- struct dentry *dentry = ERR_PTR(-EINVAL);
- struct inode *inode;
+ struct nm_node *node = NULL;
+ struct nm_cluster *cluster = nm_single_cluster;
- if (!file->name)
+ if (cluster == NULL)
goto out;
- name.name = file->name;
- name.len = strlen(name.name);
- nmprintk("adding file %.*s\n", name.len, name.name);
- name.hash = full_name_hash(name.name, name.len);
- dentry = d_alloc(parent, &name);
- if (!dentry) {
- dentry = ERR_PTR(-EINVAL);
- goto out;
- }
- inode = new_inode(s);
- if (!inode) {
- dput(dentry);
- dentry = ERR_PTR(-EINVAL);
- goto out;
- }
- inode->i_mode = file->mode;
- inode->i_uid = inode->i_gid = 0;
- inode->i_blksize = PAGE_CACHE_SIZE;
- inode->i_blocks = 0;
- inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
- if (file->mode & S_IFDIR) {
- inode->i_op = &simple_dir_inode_operations;
- inode->i_fop = &simple_dir_operations;
- } else {
- inode->i_fop = file->ops;
- }
- inode->i_ino = ino;
- insert_inode_hash(inode);
- d_add(dentry, inode);
+ read_lock(&cluster->cl_nodes_lock);
+ node = nm_node_ip_tree_lookup(cluster, addr, NULL, NULL);
+ if (node) /* XXX get a ref */
+ ;
+ read_unlock(&cluster->cl_nodes_lock);
+
out:
- return dentry;
+ return node;
}
+EXPORT_SYMBOL(nm_get_node_by_ip);
+void nm_node_put(struct nm_node *node)
+{
+ /* XXX do something */
+}
+EXPORT_SYMBOL(nm_node_put);
-static struct dentry * nm_add_link(struct super_block *s, struct dentry *parent,
- struct tree_descr *file, int ino)
+u8 nm_this_node(void)
{
- struct qstr name;
- struct dentry *dentry = ERR_PTR(-EINVAL);
- struct inode *inode;
+ u8 node_num = NM_MAX_NODES;
- if (!file->name)
- goto out;
- name.name = file->name;
- name.len = strlen(name.name);
- nmprintk("adding link %.*s\n", name.len, name.name);
- name.hash = full_name_hash(name.name, name.len);
- dentry = d_alloc(parent, &name);
- if (!dentry) {
- nmprintk("failed to d_alloc\n");
- dentry = ERR_PTR(-EINVAL);
- goto out;
- }
- inode = iget(s, ino);
- if (!inode) {
- nmprintk("failed to iget\n");
- dput(dentry);
- dentry = ERR_PTR(-EINVAL);
- goto out;
- }
- if (!inode->u.generic_ip) {
- nmprintk("bad inode: %d\n", ino);
- iput(inode);
- dput(dentry);
- dentry = ERR_PTR(-EINVAL);
- goto out;
- }
- inode->i_nlink++;
- d_add(dentry, inode);
+ if (nm_single_cluster && nm_single_cluster->cl_has_local)
+ node_num = nm_single_cluster->cl_local_node;
-out:
- return dentry;
+ return node_num;
}
+EXPORT_SYMBOL(nm_this_node);
+/* node usysfs bits */
+static struct nm_cluster *to_nm_cluster(struct kobject *kobj)
+{
+ return kobj ?
+ container_of(to_ukset(to_kset(kobj)), struct nm_cluster,
+ cl_ukset)
+ : NULL;
+}
+static struct nm_node *to_nm_node(struct kobject *kobj)
+{
+ return kobj ? container_of(kobj, struct nm_node, nd_kobj) : NULL;
+}
+static void nm_node_release(struct kobject *kobj)
+{
+ struct nm_node *node = to_nm_node(kobj);
+ printk("releasing node %p\n", node);
+}
-/* cluster, node and group transaction files.
- * here's where the actual work of nm takes place. */
+static ssize_t nm_node_num_read(struct nm_node *node, char *page)
+{
+ return sprintf(page, "%d\n", node->nd_num);
+}
-static int nm_create_cluster(char *buf)
+struct nm_cluster *to_nm_cluster_from_node(struct nm_node *node)
{
- int ret = -EINVAL;
-
- nmprintk("create cluster...\n");
-
- spin_lock(&nm_lock);
- if (cluster.state == NM_CLUSTER_UP) {
- ret = sprintf(buf, "%d: cluster already up\n", -EINVAL);
- } else {
- cluster.state = NM_CLUSTER_UP;
- ret = sprintf(buf, "0: cluster state: UP");
- }
- spin_unlock(&nm_lock);
- return ret;
+ /* through the first node_set .parent
+ * mycluster/nodes/mynode == nm_cluster->nm_node_set->nm_node */
+ return to_nm_cluster(node->nd_kobj.parent->parent);
}
-
-
-int nm_create_group(char *buf, nm_op *data)
+static ssize_t nm_node_num_write(struct nm_node *node, const char *page,
+ size_t count)
{
- struct tree_descr desc;
- struct dentry *dentry = NULL;
- struct inode *inode = NULL;
- int ino, group_num;
- int ret = -EINVAL;
- nm_group_inode_private *g = NULL;
+ struct nm_cluster *cluster = to_nm_cluster_from_node(node);
+ unsigned long tmp;
+ char *p = (char *)page;
- nmprintk("create group...\n");
+ tmp = simple_strtoul(p, &p, 0);
+ if (!p || (*p && (*p != '\n')))
+ return -EINVAL;
- data->arg_u.gc.name[NM_MAX_NAME_LEN] = '\0';
- inode = nm_get_group_by_name(data->arg_u.gc.name);
- if (inode) {
- ret = sprintf(buf, "%d: group %u (%s) already exists", -EEXIST,
- nm_get_group_global_index(inode),
- data->arg_u.gc.name);
- iput(inode);
- return ret;
- }
+ if (tmp >= NM_MAX_NODES)
+ return -ERANGE;
- group_num = data->arg_u.gc.group_num;
- if (group_num > NM_INVALID_SLOT_NUM)
- goto leave;
-
- spin_lock(&cluster.bitmap_lock);
- group_num = nm_find_next_slot(&(cluster.group_bitmap[0]), 255,
- group_num);
- spin_unlock(&cluster.bitmap_lock);
-
- if (group_num < 0) {
- nmprintk("out of group slots!\n");
- goto leave;
+ write_lock(&cluster->cl_nodes_lock);
+ if (cluster->cl_nodes[tmp])
+ p = NULL;
+ else {
+ cluster->cl_nodes[tmp] = node;
+ node->nd_num = tmp;
}
+ write_unlock(&cluster->cl_nodes_lock);
+ if (p == NULL)
+ return -EEXIST;
- ino = group_num + NM_GROUP_INODE_START;
-
- desc.name = data->arg_u.gc.name;
- desc.ops = NULL;
- desc.mode = S_IFDIR | 0755;
- dentry = nm_add_file(single_sb, single_sb->s_root, &desc, ino);
- if (IS_ERR(dentry))
- goto leave;
- inode = igrab(dentry->d_inode);
- if (!inode) {
- nmprintk("igrab failed!\n");
- goto leave;
- }
-
- g = kmalloc(sizeof(nm_group_inode_private), GFP_KERNEL);
- if (!g)
- goto leave;
-
- memset(g, 0, sizeof(nm_group_inode_private));
- memcpy(g->disk.uuid, data->arg_u.gc.disk_uuid, CLUSTER_DISK_UUID_LEN);
- spin_lock_init(&g->bitmap_lock);
- if (g->disk.uuid[0])
- g->state = NM_GROUP_NOT_READY;
- else
- g->state = NM_GROUP_READY;
- g->inode = inode;
- inode->u.generic_ip = g;
-
- ret = sprintf(buf, "0: group %u (%s) added, uuid: %s", group_num,
- data->arg_u.gc.name, g->disk.uuid);
- nm_do_callbacks(NM_GROUP_ADD_CB, inode, NULL, group_num);
-
-leave:
- if (ret < 0) {
- if (inode) {
- if (inode->u.generic_ip)
- kfree(inode->u.generic_ip);
- iput(inode);
- }
- if (dentry)
- dput(dentry);
- }
- return ret;
+ return count;
}
-EXPORT_SYMBOL(nm_create_group);
+static ssize_t nm_node_ipv4_port_read(struct nm_node *node, char *page)
+{
+ return sprintf(page, "%u\n", ntohs(node->nd_ipv4_port));
+}
-int nm_create_node(char *buf, nm_op *data)
+static ssize_t nm_node_ipv4_port_write(struct nm_node *node, const char *page,
+ size_t count)
{
- struct tree_descr desc;
- struct dentry *dentry = NULL;
- struct inode *inode = NULL;
- int ino, node_num, bucket;
- int ret = -EINVAL;
- nm_node_inode_private *n = NULL;
- struct page *page = NULL;
+ unsigned long tmp;
+ char *p = (char *)page;
- nmprintk("add cluster node ...\n");
+ tmp = simple_strtoul(p, &p, 0);
+ if (!p || (*p && (*p != '\n')))
+ return -EINVAL;
- data->arg_u.node.node_name[NM_MAX_NAME_LEN] = '\0';
- inode = nm_get_node_by_name(data->arg_u.node.node_name);
- if (inode) {
- ret = sprintf(buf, "%d: node %u (%s) already exists", -EEXIST,
- nm_get_node_global_index(inode),
- data->arg_u.node.node_name);
- iput(inode);
- return ret;
- }
+ if (tmp == 0)
+ return -EINVAL;
+ if (tmp >= (u16)-1)
+ return -ERANGE;
- node_num = data->arg_u.node.node_num;
- if (node_num > NM_INVALID_SLOT_NUM) {
- nmprintk("bad node_num: %d\n", node_num);
- goto leave;
- }
+ node->nd_ipv4_port = htons(tmp);
- spin_lock(&cluster.bitmap_lock);
- node_num = nm_find_next_slot(&(cluster.node_bitmap[0]), 255, node_num);
- spin_unlock(&cluster.bitmap_lock);
-
- if (node_num < 0) {
- nmprintk("out of node slots!\n");
- goto leave;
- }
-
- ino = node_num + NM_NODE_INODE_START;
-
- desc.name = data->arg_u.node.node_name;
- desc.ops = NULL;
- desc.mode = S_IFREG | S_IWUSR;
- dentry = nm_add_file(single_sb, single_sb->s_root, &desc, ino);
- if (IS_ERR(dentry)) {
- nmprintk("bad dentry\n");
- goto leave;
- }
- inode = igrab(dentry->d_inode);
- if (!inode) {
- nmprintk("igrab failed!\n");
- goto leave;
- }
-
- n = kmalloc(sizeof(nm_node_inode_private), GFP_KERNEL);
- if (!n) {
- nmprintk("could not kmalloc\n");
- goto leave;
- }
- memcpy(&n->node, &data->arg_u.node, sizeof(nm_node_info));
- INIT_LIST_HEAD(&n->ip_hash);
-
- spin_lock_init(&n->net.sock_lock);
- n->net.sock = NULL;
- n->net.sock_refs = 0;
- n->net.sock_pending = 0;
- n->net.defer_release = 0;
- INIT_LIST_HEAD(&n->net.pending_waiters);
- init_waitqueue_head(&n->net.waitq);
- INIT_LIST_HEAD(&n->net.handlers);
- INIT_LIST_HEAD(&n->net.active_item);
- n->net.page = NULL;
- n->net.page_off = 0;
-
- page = alloc_page(GFP_KERNEL);
- if (page == NULL) {
- nmprintk("page allocation failed\n");
- goto leave;
- }
- n->net.page = page;
-
- /* hash on first ip address */
- spin_lock(&nm_ip_hash_lock);
- bucket = hash_long(n->node.ifaces[0].addr_u.ip_addr4, NM_HASH_BITS);
- list_add_tail(&n->ip_hash, &nm_ip_hash[bucket]);
- spin_unlock(&nm_ip_hash_lock);
- nmprintk("hashed ip %d.%d.%d.%d to bucket %d\n",
- NIPQUAD(n->node.ifaces[0].addr_u.ip_addr4), bucket);
- n->inode = inode;
- inode->u.generic_ip = n;
-
- ret = sprintf(buf, "0: node %u (%s) added", node_num,
- n->node.node_name);
- nm_do_callbacks(NM_NODE_ADD_CB, inode, NULL, node_num);
-
-leave:
- if (ret < 0) {
- if (page)
- __free_page(page);
- if (inode) {
- if (inode->u.generic_ip)
- kfree(inode->u.generic_ip);
- iput(inode);
- }
- if (dentry)
- dput(dentry);
- }
- return ret;
+ return count;
}
-int nm_make_group_ready(struct inode *group)
+static ssize_t nm_node_ipv4_address_read(struct nm_node *node, char *page)
{
- nm_group_inode_private *g = group->u.generic_ip;
- if (!g)
- return -EINVAL;
- g->state = NM_GROUP_READY;
- return 0;
+ return sprintf(page, "%u.%u.%u.%u\n", NIPQUAD(node->nd_ipv4_address));
}
-EXPORT_SYMBOL(nm_make_group_ready);
-int nm_add_node_to_group(char *buf, nm_op *data)
+/* XXX this is acting as commit until commit really lands.. all this will be
+ * hoisted into the commit method */
+static ssize_t nm_node_ipv4_address_write(struct nm_node *node,
+ const char *page,
+ size_t count)
{
- struct tree_descr desc;
- struct inode *inode = NULL;
- struct dentry *dentry = NULL, *child = NULL;
- nm_group_inode_private *g = NULL;
- int group_num, slot_num;
- int ret = -EINVAL;
- u8 ino;
- char tmpname[6];
+ struct nm_cluster *cluster = to_nm_cluster_from_node(node);
+ int ret, i;
+ struct rb_node **p, *parent;
+ unsigned int octets[4];
+ u32 ipv4_addr = 0; /* network order */
- nmprintk("add node to group...\n");
+ ret = sscanf(page, "%3u.%3u.%3u.%3u", &octets[0], &octets[1],
+ &octets[2], &octets[3]);
+ if (ret != 4)
+ return -EINVAL;
- group_num = data->arg_u.gc.group_num;
- ino = data->arg_u.gc.node_num;
- slot_num = data->arg_u.gc.slot_num;
-
- /* request a certain slot, or NM_INVALID_SLOT_NUM for any slot */
- if (slot_num > NM_INVALID_SLOT_NUM)
- goto leave;
-
- if (ino >= NM_INVALID_SLOT_NUM || group_num >= NM_INVALID_SLOT_NUM)
- goto leave;
-
- inode = nm_get_group_by_num(group_num);
- if (!inode)
- goto leave;
- if (list_empty(&inode->i_dentry))
- goto leave;
- dentry = dget(list_entry(inode->i_dentry.next, struct dentry, d_alias));
- if (!dentry)
- goto leave;
- g = inode->u.generic_ip;
- if (!g)
- goto leave;
-
- if (g->state == NM_GROUP_NOT_READY) {
- ret = sprintf(buf, "%d: group disk has not been discovered. "
- "cannot add nodes.", -EROFS);
- goto leave;
+ for (i = 0; i < ARRAY_SIZE(octets); i++) {
+ if (octets[i] > 255)
+ return -ERANGE;
+ ipv4_addr |= octets[i] << (i * 8);
}
- spin_lock(&g->bitmap_lock);
- slot_num = nm_find_next_slot(&(g->slot_bitmap[0]), 255, slot_num);
- spin_unlock(&g->bitmap_lock);
- if (slot_num < 0)
- goto leave;
-
- /* create hardlink to ino with name "slot_num" */
- sprintf(tmpname, "%03u", slot_num);
- desc.name = &(tmpname[0]);
- desc.ops = NULL;
- desc.mode = 0;
- child = nm_add_link(single_sb, dentry, &desc,
- NM_NODE_INODE_START+ino);
- if (IS_ERR(child)) {
- nmprintk("error adding link for %s\n", tmpname);
- child = NULL;
- goto leave;
+ ret = 0;
+ write_lock(&cluster->cl_nodes_lock);
+ if (nm_node_ip_tree_lookup(cluster, ipv4_addr, &p, &parent))
+ ret = -EEXIST;
+ else {
+ rb_link_node(&node->nd_ip_node, parent, p);
+ rb_insert_color(&node->nd_ip_node, &cluster->cl_node_ip_tree);
}
+ write_unlock(&cluster->cl_nodes_lock);
+ if (ret)
+ return ret;
- ret = sprintf(buf, "0: node %u added to group: %.*s",
- ino, dentry->d_name.len, dentry->d_name.name);
+ memcpy(&node->nd_ipv4_address, &ipv4_addr, sizeof(ipv4_addr));
- if (!igrab(child->d_inode))
- goto leave;
- nm_do_callbacks(NM_GROUP_NODE_ADD_CB, inode, child->d_inode, slot_num);
- iput(child->d_inode);
-
-leave:
- if (dentry)
- dput(dentry);
- if (child)
- dput(child);
- if (inode)
- iput(inode);
- return ret;
+ return count;
}
-EXPORT_SYMBOL(nm_add_node_to_group);
+static ssize_t nm_node_local_read(struct nm_node *node, char *page)
+{
+ return sprintf(page, "%d\n", node->nd_local);
+}
-int nm_remove_node_from_group(struct inode *group, struct inode *node)
+static ssize_t nm_node_local_write(struct nm_node *node,
+ const char *page,
+ size_t count)
{
- struct dentry *child = NULL;
- nm_group_inode_private *g = NULL;
- int slot_num;
- int ret = -EINVAL;
+ struct nm_cluster *cluster = to_nm_cluster_from_node(node);
+ unsigned long tmp;
+ char *p = (char *)page;
+ ssize_t ret;
- nmprintk("remove node from group...\n");
+ tmp = simple_strtoul(p, &p, 0);
+ if (!p || (*p && (*p != '\n')))
+ return -EINVAL;
- slot_num = nm_get_group_index(group, node, &child);
+ tmp = !!tmp; /* boolean of whether this node wants to be local */
- if (slot_num == NM_MAX_NODES || !child)
- goto leave;
+ /* the only failure case is trying to set a new local node
+ * when a different one is already set */
+ if (tmp && tmp == cluster->cl_has_local &&
+ cluster->cl_local_node != node->nd_num)
+ return -EBUSY;
- g = group->u.generic_ip;
- if (!g)
- goto leave;
-
- nmprintk("killing the dentry now!!\n");
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
- down(&group->i_zombie);
- node->i_nlink--;
- d_delete(child);
- up(&group->i_zombie);
-#else
- down(&group->i_sem);
- node->i_nlink--;
- d_delete(child);
- up(&group->i_sem);
-#endif
- nmprintk("done killing the dentry!!\n");
+ /* bring up the rx thread if we're setting the new local
+ * node. XXX make sure port/addr are set */
+ if (tmp && !cluster->cl_has_local) {
+ ret = net_start_rx_thread(node);
+ if (ret)
+ return ret;
+ }
+ if (!tmp && cluster->cl_has_local &&
+ cluster->cl_local_node == node->nd_num) {
+ net_stop_rx_thread(node);
+ cluster->cl_local_node = 0;
+ }
- if (!igrab(node))
- goto leave;
- nm_do_callbacks(NM_GROUP_NODE_DEL_CB, group, node, slot_num);
- iput(node);
-
- spin_lock(&g->bitmap_lock);
- clear_bit(slot_num, (void *)(&g->slot_bitmap[0]));
- spin_unlock(&g->bitmap_lock);
+ node->nd_local = tmp;
+ if (node->nd_local) {
+ cluster->cl_has_local = tmp;
+ cluster->cl_local_node = node->nd_num;
+ }
- ret = 0;
-
-leave:
- if (child)
- dput(child);
- return ret;
+ return count;
}
+struct nm_node_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct nm_node *, char *);
+ ssize_t (*store)(struct nm_node *, const char *, size_t);
+};
+static struct nm_node_attribute nm_node_attr_num = {
+ .attr = { .name = "num", .mode = S_IRUGO | S_IWUSR },
+ .show = nm_node_num_read,
+ .store = nm_node_num_write,
+};
+static struct nm_node_attribute nm_node_attr_ipv4_port = {
+ .attr = { .name = "ipv4_port", .mode = S_IRUGO | S_IWUSR },
+ .show = nm_node_ipv4_port_read,
+ .store = nm_node_ipv4_port_write,
+};
+static struct nm_node_attribute nm_node_attr_ipv4_address = {
+ .attr = { .name = "ipv4_address", .mode = S_IRUGO | S_IWUSR },
+ .show = nm_node_ipv4_address_read,
+ .store = nm_node_ipv4_address_write,
+};
+static struct nm_node_attribute nm_node_attr_local = {
+ .attr = { .name = "local", .mode = S_IRUGO | S_IWUSR },
+ .show = nm_node_local_read,
+ .store = nm_node_local_write,
+};
+static struct attribute *nm_node_default_attrs[] = {
+ &nm_node_attr_num.attr,
+ &nm_node_attr_ipv4_port.attr,
+ &nm_node_attr_ipv4_address.attr,
+ &nm_node_attr_local.attr,
+ NULL,
+};
-int nm_name_cluster(char *buf, nm_op *data)
+static ssize_t nm_node_show(struct kobject *kobj,
+ struct attribute *attr,
+ char *page)
{
- int ret = -EINVAL;
+ struct nm_node *node = to_nm_node(kobj);
+ struct nm_node_attribute *nm_node_attr =
+ container_of(attr, struct nm_node_attribute, attr);
+ ssize_t ret = 0;
- nmprintk("name cluster...\n");
- spin_lock(&nm_lock);
- if (cluster.state == NM_CLUSTER_UP) {
- ret = sprintf(buf, "%d: cluster name could not be set. "
- "cluster already up.", -EINVAL);
- goto leave;
- }
- memset(cluster.name, 0, NM_MAX_NAME_LEN+1);
- memcpy(cluster.name, data->arg_u.name, NM_MAX_NAME_LEN);
- ret = sprintf(buf, "0: cluster name set: %s", cluster.name);
-leave:
- spin_unlock(&nm_lock);
+ if (nm_node_attr->show)
+ ret = nm_node_attr->show(node, page);
return ret;
}
-int nm_destroy_cluster(char *buf)
+static ssize_t nm_node_store(struct kobject *kobj,
+ struct attribute *attr,
+ const char *page, size_t count)
{
- int ret;
- nmprintk("destroy cluster...\n");
+ struct nm_node *node = to_nm_node(kobj);
+ struct nm_node_attribute *nm_node_attr =
+ container_of(attr, struct nm_node_attribute, attr);
+ ssize_t ret = -EINVAL;
- /* TODO */
- spin_lock(&nm_lock);
- nm_init_cluster(&cluster);
- ret = sprintf(buf, "0: rudely destroyed cluster!!!");
- spin_unlock(&nm_lock);
+ if (nm_node_attr->store)
+ ret = nm_node_attr->store(node, page, count);
return ret;
}
+struct sysfs_ops nm_node_sysfs_ops = {
+ .show = &nm_node_show,
+ .store = &nm_node_store,
+};
-int nm_get_cluster_num_nodes(char *buf)
-{
- int num_nodes=0, i;
-
- nmprintk("get cluster num nodes...\n");
+static struct ukobj_type nm_node_type = {
+ .ktype = {
+ .release = nm_node_release,
+ .sysfs_ops = &nm_node_sysfs_ops,
+ .default_attrs = nm_node_default_attrs,
+ },
+ .make_object = usysfs_make_no_object,
+ .owner = THIS_MODULE,
+};
- spin_lock(&cluster.bitmap_lock);
- for (i=0; i<ARRAY_SIZE(cluster.node_bitmap); i++)
- num_nodes += hweight_long(cluster.node_bitmap[i]);
- spin_unlock(&cluster.bitmap_lock);
+/* node set */
- return sprintf(buf, "0: %d", num_nodes);
-}
+struct nm_node_set {
+ struct ukset ns_ukset;
+ /* some stuff? */
+};
-int nm_get_cluster_num_groups(char *buf)
+#if 0
+static struct nm_node_set *to_nm_node_set(struct kset *kset)
{
- int num_groups=0, i;
-
- nmprintk("get cluster num groups...\n");
-
- spin_lock(&cluster.bitmap_lock);
- for (i=0; i<ARRAY_SIZE(cluster.group_bitmap); i++)
- num_groups += hweight_long(cluster.group_bitmap[i]);
- spin_unlock(&cluster.bitmap_lock);
-
- return sprintf(buf, "0: %d", num_groups);
+ return kset ?
+ container_of(to_ukset(kset), struct nm_node_set, ns_ukset)
+ : NULL;
}
+#endif
-int nm_get_group_num_nodes(struct inode *group)
+static struct kobject *nm_node_set_make_object(struct kset *kset,
+ const char *name)
{
- int num_nodes=0, i;
- nm_group_inode_private *g;
-
- nmprintk("get group num nodes...\n");
-
- g = group->u.generic_ip;
- if (!g)
- return -EINVAL;
+ struct nm_node *node = NULL;
+ struct nm_cluster *cluster = to_nm_cluster(kset->kobj.parent);
+ struct kobject *ret = NULL;
+ net_inode_private *nip;
- spin_lock(&g->bitmap_lock);
- for (i=0; i<ARRAY_SIZE(g->slot_bitmap); i++)
- num_nodes += hweight_long(g->slot_bitmap[i]);
- spin_unlock(&g->bitmap_lock);
+ printk("trying to make a node object under cluster %p\n", cluster);
- return num_nodes;
-}
+ if (strlen(name) > NM_MAX_NAME_LEN)
+ goto out; /* ENAMETOOLONG */
-void * nm_iterate_group_disk_slots(struct inode *group, int *idx)
-{
- nm_group_inode_private *priv;
- int next;
+ node = kcalloc(1, sizeof(struct nm_node), GFP_KERNEL);
+ if (node == NULL)
+ goto out; /* ENOMEM */
- if (*idx >= 255)
- return NULL;
- priv = group->u.generic_ip;
- if (!priv)
- return NULL;
- next = find_next_bit(priv->slot_bitmap, 255, *idx);
- if (next >= 255)
- return NULL;
- *idx = next;
- return util_rarray_idx_to_slot(&priv->disk.slots, next);
-}
-EXPORT_SYMBOL(nm_iterate_group_disk_slots);
+ strcpy(node->nd_name, name); /* use kobj.name instead? */
+ node->nd_num = NM_MAX_NODES;
-int nm_get_node_info(char *buf, nm_op *data)
-{
- int ret, tmpret, i;
- nm_node_inode_private *priv;
- nm_network_iface *n;
- struct inode *inode = NULL;
- struct dentry *dentry;
- u8 node_num;
- u16 vers;
-
- ret = -EINVAL;
- node_num = data->arg_u.index;
- inode = nm_get_node_by_num(node_num);
- if (inode) {
- dentry = list_entry(inode->i_dentry.next, struct dentry,
- d_alias);
- priv = inode->u.generic_ip;
- ret = sprintf(buf, "0: global_index=%u\n"
- "name=%.*s\n",
- priv->node.node_num, dentry->d_name.len,
- dentry->d_name.name);
- buf += ret;
- for (i=0; i<NM_MAX_IFACES; i++) {
- n = &priv->node.ifaces[i];
- vers = ntohs(n->ip_version);
- nmprintk("ip_version=%u, vers=%u\n",
- n->ip_version, vers);
- if (vers!=4 && vers!=6)
- continue;
- /* TODO: how to print ipv6? */
- tmpret = sprintf(buf, "iface%d.port=%u\n"
- "iface%d.version=%d\n"
- "iface%d.addr=%d.%d.%d.%d\n",
- i, ntohs(n->ip_port), i, vers, i,
- NIPQUAD(n->addr_u.ip_addr4));
- buf += tmpret;
- ret += tmpret;
- }
- iput(inode);
+ /* this should be somewhere else */
+ nip = &node->nd_net_inode_private;
+ spin_lock_init(&nip->sock_lock);
+ INIT_LIST_HEAD(&nip->pending_waiters);
+ init_waitqueue_head(&nip->waitq);
+ INIT_LIST_HEAD(&nip->handlers);
+ INIT_LIST_HEAD(&nip->active_item);
+ nip->page = alloc_page(GFP_KERNEL);
+ if (nip->page == NULL) {
+ nmprintk("page allocation failed\n");
+ goto out; /* ENOMEM */
}
- return ret;
-}
-int nm_get_group_info(char *buf, nm_op *data)
-{
- int ret, tmpret;
- nm_group_inode_private *g = NULL;
- struct inode *inode = NULL;
- u8 group_num;
- struct dentry *dentry, *child;
+ kobject_set_name(&node->nd_kobj, name);
+ node->nd_kobj.ktype = &nm_node_type.ktype;
+ kobject_init(&node->nd_kobj);
- ret = -EINVAL;
- group_num = data->arg_u.index;
- inode = nm_get_group_by_num(group_num);
- if (inode) {
- g = inode->u.generic_ip;
- dentry = list_entry(inode->i_dentry.next, struct dentry,
- d_alias);
- ret = sprintf(buf, "0: group_num=%u\n"
- "name=%.*s\n"
- "disk_uuid=%s\n",
- group_num, dentry->d_name.len,
- dentry->d_name.name, g->disk.uuid);
- buf += ret;
+ ret = &node->nd_kobj;
- spin_lock(&dcache_lock);
- list_for_each_entry(child, &dentry->d_subdirs, d_child) {
- tmpret = sprintf(buf, "%.*s\n", child->d_name.len,
- child->d_name.name);
- buf += tmpret;
- ret += tmpret;
- }
- spin_unlock(&dcache_lock);
- iput(inode);
- }
- return ret;
-}
+out:
+ if (ret == NULL)
+ kfree(node);
-
-
-static ssize_t write_cluster(struct file *file, char *buf, size_t size)
-{
- nm_op *data;
- int ret;
- u8 me;
-
- nmprintk("write_cluster\n");
-
- if (size < sizeof(*data))
- return -EINVAL;
- data = (nm_op *) buf;
- if (data->magic != NM_OP_MAGIC)
- return -EINVAL;
-
- switch (data->opcode) {
- case NM_OP_CREATE_CLUSTER:
- ret = nm_create_cluster(buf);
- break;
- case NM_OP_CREATE_GROUP:
- ret = nm_create_group(buf, data);
- break;
- case NM_OP_NAME_CLUSTER:
- ret = nm_name_cluster(buf, data);
- break;
- case NM_OP_DESTROY_CLUSTER:
- ret = nm_destroy_cluster(buf);
- break;
- case NM_OP_ADD_CLUSTER_NODE:
- ret = nm_create_node(buf, data);
- break;
- case NM_OP_GET_CLUSTER_NUM_NODES:
- ret = nm_get_cluster_num_nodes(buf);
- break;
- case NM_OP_GET_GLOBAL_NODE_NUM:
- ret = 0;
- me = nm_this_node(NULL);
- if (me >= NM_MAX_NODES)
- ret = -EINVAL;
- ret = sprintf(buf, "%d: %u", ret, me);
- break;
- default:
- ret = sprintf(buf, "%d: bad opcode: %u", -EINVAL,
- data->opcode);
- break;
- }
- nmprintk("leaving!\n");
return ret;
}
-static ssize_t write_node(struct file *file, char *buf, size_t size)
+static void nm_node_set_drop_object(struct kset *kset, struct kobject *kobj)
{
- nm_op *data;
- int ret;
-
- nmprintk("write_node\n");
+ struct nm_node *node = to_nm_node(kobj);
+ struct nm_cluster *cluster = to_nm_cluster(node->nd_kobj.parent);
+ int node_not_in_nodes_array = 0;
- if (size < sizeof(*data))
- return -EINVAL;
- data = (nm_op *) buf;
- if (data->magic != NM_OP_MAGIC)
- return -EINVAL;
+ /* please don't try this yet, needs proper refcounts of nodes too */
+ BUG();
- switch (data->opcode) {
- case NM_OP_GET_NODE_INFO:
- ret = nm_get_node_info(buf, data);
- break;
- default:
- ret = sprintf(buf, "%d: bad opcode: %u", -EINVAL,
- data->opcode);
- break;
- }
- nmprintk("leaving!\n");
- return ret;
-}
+ if (cluster->cl_has_local)
+ net_stop_rx_thread(node);
-static ssize_t write_group(struct file *file, char *buf, size_t size)
-{
- nm_op *data;
- int ret;
-
- nmprintk("write_group\n");
+ /* XXX sloppy */
+ if (node->nd_ipv4_address)
+ rb_erase(&node->nd_ip_node, &cluster->cl_node_ip_tree);
- if (size < sizeof(*data))
- return -EINVAL;
- data = (nm_op *) buf;
- if (data->magic != NM_OP_MAGIC)
- return -EINVAL;
+ /* XXX call into net to stop this node from trading messages */
- nmprintk("opcode is %u, add_group is %u\n", data->opcode,
- NM_OP_ADD_GROUP_NODE);
- switch (data->opcode) {
- case NM_OP_GET_GROUP_INFO:
- ret = nm_get_group_info(buf, data);
- break;
-
- case NM_OP_ADD_GROUP_NODE:
- ret = nm_add_node_to_group(buf, data);
- break;
-
- default:
- ret = sprintf(buf, "%d: bad opcode: %u",
- -EINVAL, data->opcode);
- break;
+ write_lock(&cluster->cl_nodes_lock);
+ if (node->nd_num != NM_MAX_NODES) {
+ if (cluster->cl_nodes[node->nd_num] != node)
+ node_not_in_nodes_array = 1;
+ else {
+ cluster->cl_nodes[node->nd_num] = NULL;
+ node->nd_num = NM_MAX_NODES;
+ }
}
- nmprintk("leaving!\n");
- return ret;
-}
+ write_unlock(&cluster->cl_nodes_lock);
+ BUG_ON(node_not_in_nodes_array);
-
-struct inode * nm_get_group_by_num(u8 group_num)
-{
- struct inode *inode = iget(single_sb, group_num + NM_GROUP_INODE_START);
- if (!inode)
- return NULL;
- if (!inode->u.generic_ip) {
- iput(inode);
- return NULL;
- }
- return inode;
-}
-EXPORT_SYMBOL(nm_get_group_by_num);
-
-struct inode * nm_get_node_by_num(u8 node_num)
-{
- struct inode *inode = iget(single_sb, node_num + NM_NODE_INODE_START);
- if (!inode)
- return NULL;
- if (!inode->u.generic_ip) {
- iput(inode);
- return NULL;
- }
- return inode;
+ kobject_put(kobj);
}
-EXPORT_SYMBOL(nm_get_node_by_num);
-/* ipv4 only for now... */
-struct inode * nm_get_node_by_ip(u32 addr)
-{
- int bucket;
- struct list_head *iter;
- nm_node_inode_private *priv;
- struct inode *ret = NULL;
-
- bucket = hash_long(addr, NM_HASH_BITS);
+static struct ukobj_type nm_node_set_type = {
+ .ktype = {
+ .sysfs_ops = NULL, /* no attributes */
+ },
+ .make_object = nm_node_set_make_object,
+ .drop_object = nm_node_set_drop_object,
+ .owner = THIS_MODULE,
+};
- spin_lock(&nm_ip_hash_lock);
- list_for_each(iter, &nm_ip_hash[bucket]) {
- priv = list_entry(iter, nm_node_inode_private, ip_hash);
- if (priv->node.ifaces[0].addr_u.ip_addr4 == addr) {
- ret = igrab(priv->inode);
- break;
- }
-
- }
- spin_unlock(&nm_ip_hash_lock);
- return ret;
-}
-EXPORT_SYMBOL(nm_get_node_by_ip);
+/* cluster */
-nm_cluster * nm_get_cluster(void)
+static void nm_cluster_release(struct kobject *kobj)
{
- return &cluster;
-}
+ struct nm_cluster *cluster = to_nm_cluster(kobj);
-struct inode * nm_get_group_node_by_index(struct inode *group, u8 index)
-{
- struct dentry *dentry = NULL, *parent;
- struct inode *inode = NULL;
- char tmpname[6];
+ printk("releasing cluster %p\n", cluster);
- if (list_empty(&group->i_dentry))
- return NULL;
- parent = dget(list_entry(group->i_dentry.next, struct dentry, d_alias));
- if (!parent)
- return NULL;
-
- sprintf(tmpname, "%03u", index);
- dentry = lookup_one_len(tmpname, parent, strlen(tmpname));
- if (!IS_ERR(dentry)) {
- inode = dentry->d_inode;
- if (inode) {
- inode = igrab(inode);
- if (!inode->u.generic_ip || !S_ISREG (inode->i_mode)) {
- nmprintk("bad inode!\n");
- iput(inode);
- inode = NULL;
- }
- }
- if (!inode)
- dput(dentry);
- }
- dput(parent);
- return inode;
+ kfree(cluster->cl_ukset.default_sets);
+ kfree(cluster);
}
-EXPORT_SYMBOL(nm_get_group_node_by_index);
-struct inode * __nm_get_node_by_name(const char *node_name, int dir)
-{
- struct dentry *dentry = NULL;
- struct inode *inode = NULL;
+static struct ukobj_type nm_cluster_type = {
+ .ktype = {
+ .release = nm_cluster_release,
+ .sysfs_ops = NULL, /* no attributes */
+ },
+ .make_object = usysfs_make_no_object,
+ .owner = THIS_MODULE,
+};
- NM_ASSERT(node_name);
- NM_ASSERT(single_sb);
- NM_ASSERT(single_sb->s_root);
+/* cluster set */
- dentry = lookup_one_len(node_name, single_sb->s_root,
- strlen(node_name));
- if (!IS_ERR(dentry)) {
- inode = dentry->d_inode;
- if (inode) {
- inode = igrab(inode);
- if (!inode->u.generic_ip ||
- (dir && !S_ISDIR (inode->i_mode)) ||
- (!dir && !S_ISREG (inode->i_mode))) {
- nmprintk("bad inode!\n");
- iput(inode);
- inode = NULL;
- }
- }
- }
- return inode;
-}
-EXPORT_SYMBOL(__nm_get_node_by_name);
+struct nm_cluster_set {
+ struct ukset cs_ukset;
+ /* some stuff? */
+};
-
-/*
- * if group is NULL: return the global index for this node
- * if group is non NULL: return the index within the group of this node
- *
- * NOTE: currently getting the group index is slow
- * will need to change this somehow
- */
-u8 nm_this_node(struct inode *group)
+#if 0
+static struct nm_cluster_set *to_nm_cluster_set(struct kset *kset)
{
- struct inode *inode = NULL;
- struct dentry *child = NULL;
- u8 node_num = NM_MAX_NODES;
-
- inode = nm_get_node_by_name(nm_nodename);
- if (inode && inode->u.generic_ip) {
- if (group)
- node_num = nm_get_group_index(group, inode, &child);
- else
- node_num = nm_get_node_global_index(inode);
-
- }
- iput(inode);
- dput(child);
- //nmprintk("for group=%p, this node is %u\n", group, node_num);
- return node_num;
+ return kset ?
+ container_of(to_ukset(kset), struct nm_cluster_set, cs_ukset)
+ : NULL;
}
-EXPORT_SYMBOL(nm_this_node);
+#endif
-/* slow */
-static u8 nm_get_group_index(struct inode *group, struct inode *inode,
- struct dentry **child)
+static struct kset *nm_cluster_set_make_kset(struct kset *kset,
+ const char *name)
{
- struct dentry *tmp = NULL, *parent = NULL;
- u8 slot_num = NM_MAX_NODES;
- struct list_head *iter;
- char tmpname[6];
- char *err;
+ struct nm_cluster *cluster = NULL;
+ struct nm_node_set *ns = NULL;
+ struct kset *hb_kset = NULL, *ret = NULL;
+ void *defs = NULL;
- *child = NULL;
- parent = NULL;
- if (list_empty(&group->i_dentry))
- goto leave;
- parent = dget(list_entry(group->i_dentry.next, struct dentry, d_alias));
- if (!parent)
- goto leave;
-
- spin_lock(&dcache_lock);
- list_for_each(iter, &parent->d_subdirs) {
- tmp = list_entry(iter, struct dentry, d_child);
- if (tmp->d_inode == inode)
- break;
- tmp = NULL;
- }
- if (tmp)
- dget_locked(tmp);
- spin_unlock(&dcache_lock);
+ printk("trying to make a cluster object\n");
- if (!tmp || tmp->d_name.len > 3)
- goto leave;
- strncpy(tmpname, tmp->d_name.name, tmp->d_name.len);
- tmpname[tmp->d_name.len] = '\0';
- err=NULL;
- slot_num = simple_strtoul(tmpname, &err, 10);
-
- if (*err != '\0')
- slot_num = NM_MAX_NODES; // error
- else
- *child = dget(tmp); // done, get extra ref for child
-
-leave:
- dput(parent);
- dput(tmp);
+ /* this runs under the parent dir's i_sem; there can be only
+ * one caller in here at a time */
+ if (nm_single_cluster)
+ goto out; /* ENOSPC */
- return slot_num;
-}
+ cluster = kcalloc(1, sizeof(struct nm_cluster), GFP_KERNEL);
+ ns = kcalloc(1, sizeof(struct nm_node_set), GFP_KERNEL);
+ defs = kcalloc(3, sizeof(struct kset *), GFP_KERNEL);
+ hb_kset = hb_alloc_hb_set();
+ if (cluster == NULL || ns == NULL || hb_kset == NULL || defs == NULL)
+ goto out;
-int nm_register_callback(int type, void (*func)(void *, void *, u8))
-{
- nm_callback_func *f;
+ ukset_init_type_name(&cluster->cl_ukset, name, &nm_cluster_type.ktype);
+ ukset_init_type_name(&ns->ns_ukset, "nodes", &nm_node_set_type.ktype);
- if (type < NM_NODE_ADD_CB || type > NM_GROUP_NODE_DEL_CB)
- return -EINVAL;
- f = kmalloc(sizeof(nm_callback_func), GFP_KERNEL);
- if (f == NULL)
- return -ENOMEM;
- memset(f, 0, sizeof(nm_callback_func));
- f->func = func;
- down(&nm_cb_sem);
- list_add_tail(&f->list, &nm_callbacks[type]);
- up(&nm_cb_sem);
- return 0;
-}
-EXPORT_SYMBOL(nm_register_callback);
+ cluster->cl_ukset.default_sets = defs;
+ cluster->cl_ukset.default_sets[0] = &ns->ns_ukset.kset;
+ cluster->cl_ukset.default_sets[1] = hb_kset;
+ cluster->cl_ukset.default_sets[2] = NULL;
+ rwlock_init(&cluster->cl_nodes_lock);
+ cluster->cl_node_ip_tree = RB_ROOT;
-int nm_unregister_callback(int type, void (*func)(void *, void *, u8))
-{
- struct list_head *iter, *tmpiter;
- int ret = -EINVAL;
- nm_callback_func *f;
+ ret = &cluster->cl_ukset.kset;
+ nm_single_cluster = cluster;
- if (type < NM_NODE_ADD_CB || type > NM_GROUP_NODE_DEL_CB)
- return ret;
-
- down(&nm_cb_sem);
- list_for_each_safe(iter, tmpiter, &nm_callbacks[type]) {
- f = list_entry (iter, nm_callback_func, list);
- if (f->func == func) {
- list_del(&f->list);
- kfree(f);
- ret = 0;
- break;
- }
+out:
+ if (ret == NULL) {
+ kfree(cluster);
+ kfree(ns);
+ hb_free_hb_set(hb_kset);
+ kfree(defs);
}
- up(&nm_cb_sem);
+
return ret;
}
-EXPORT_SYMBOL(nm_unregister_callback);
-static void nm_do_callbacks(int type, void *ptr1, void *ptr2, u8 idx)
+static void nm_cluster_set_drop_object(struct kset *kset, struct kobject *kobj)
{
- struct list_head *iter;
- nm_callback_func *f;
-
- down(&nm_cb_sem);
- list_for_each(iter, &nm_callbacks[type]) {
- f = list_entry (iter, nm_callback_func, list);
- (f->func) (ptr1, ptr2, idx);
- }
- up(&nm_cb_sem);
-}
+ struct nm_cluster *cluster = to_nm_cluster(kobj);
+ BUG_ON(nm_single_cluster != cluster);
+ nm_single_cluster = NULL;
-static void nm_teardown(void)
-{
- remove_proc_entry("cluster/nm", NULL);
- remove_proc_entry("cluster", NULL);
+ kobject_put(kobj);
}
-static void nm_init_cluster(nm_cluster *cluster)
-{
- int i;
- memset(cluster, 0, sizeof(nm_cluster));
- cluster->state = NM_CLUSTER_DOWN;
- spin_lock_init(&cluster->bitmap_lock);
-
- for (i=NM_NODE_ADD_CB; i<=NM_GROUP_NODE_DEL_CB; i++)
- INIT_LIST_HEAD(&nm_callbacks[i]);
-}
+static struct ukobj_type nm_cluster_set_type = {
+ .ktype = {
+ .sysfs_ops = NULL, /* no attributes */
+ },
+ .make_kset = nm_cluster_set_make_kset,
+ .drop_object = nm_cluster_set_drop_object,
+ .owner = THIS_MODULE,
+};
+static struct nm_cluster_set nm_cluster_set = {
+ .cs_ukset = {
+ .kset = {
+ .kobj = {
+ .name = "cluster",
+ .ktype = &nm_cluster_set_type.ktype,
+ },
+ },
+ },
+};
-
-
-
-/*----------------------------------------------------------------------------*/
-/*
- * populating the filesystem.
- */
-static int nm_fill_super(struct super_block * sb, void * data, int silent)
+static void __exit exit_nm(void)
{
- int ret, sz;
- struct TA_write_ops *ops;
- static struct tree_descr nm_files[] = {
- [NM_Cluster] = {".cluster", &transaction_ops, S_IWUSR},
- [NM_Node] = {".node", &transaction_ops, S_IWUSR},
- [NM_Group] = {".group", &transaction_ops, S_IWUSR},
- /* last one */ {""}
- };
-
- sz = sizeof(nm_files) / sizeof(struct tree_descr);
- ops = kmalloc(sizeof(struct TA_write_ops) +
- (sz * sizeof(ops->write_op[0])),
- GFP_KERNEL);
- if (!ops)
- return -ENOMEM;
-
- ops->num_ops = sz;
- ops->write_op[NM_Cluster] = write_cluster;
- ops->write_op[NM_Node] = write_node;
- ops->write_op[NM_Group] = write_group;
-
- single_sb = NULL;
- nmprintk("calling simple_fill_super...\n");
- ret = simple_fill_super(sb, 0x98675309, nm_files);
- if (ret >= 0) {
- TA_GENERIC_SB_MEMBER(sb) = ops;
- single_sb = sb;
- } else {
- kfree(ops);
- }
- return ret;
+ nmprintk("unloading nm module\n");
+ /* XXX sync with hb callbacks and shut down hb? */
+ net_unregister_hb_callbacks();
+ usysfs_unregister_subsystem(&nm_cluster_set.cs_ukset.kset);
}
-/* blindly copied from ocfs2 */
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
-static struct super_block *nm_get_sb(struct file_system_type *fs_type,
- int flags,
- const char *dev_name,
- void *data)
+static int __init init_nm(void)
{
- return get_sb_single(fs_type, flags, data, nm_fill_super);
-}
+ int ret;
-static struct file_system_type nm_fs_type = {
- .owner = THIS_MODULE,
- .name = "nm",
- .get_sb = nm_get_sb,
- .kill_sb = kill_anon_super,
-};
-#else
-static struct super_block *nm_read_super(struct super_block *sb,
- void *data,
- int silent)
-{
- nmprintk("welcome to nm_read_super!!!\n");
- return (nm_fill_super(sb, data, silent) < 0) ? NULL : sb;
-}
+ hb_init();
+ ret = net_register_hb_callbacks();
+ if (ret)
+ goto out;
-static DECLARE_FSTYPE (nm_fs_type, "nm", nm_read_super, FS_SINGLE|FS_LITTER);
-#endif
-
-static int __init init_nm(void)
-{
- int retval;
- nm_nodename = kmalloc(strlen(system_utsname.nodename) + 1, GFP_KERNEL);
- if (nm_nodename==NULL) {
- nmprintk("could not allocate a few bytes for nodename!\n");
- return -ENOMEM;
+ ukset_init(&nm_cluster_set.cs_ukset);
+ ret = usysfs_register_subsystem(&nm_cluster_set.cs_ukset.kset);
+ if (ret) {
+ printk(KERN_ERR "nodemanager: Registration returned %d\n", ret);
+ goto out;
}
- strcpy(nm_nodename, system_utsname.nodename);
- nmprintk("loading nm module: nodename is %s\n", nm_nodename);
- if (nm_init_ip_hash() < 0) {
- nmprintk("failed to allocate node IP hash\n");
- return -ENOMEM;
- }
+out:
+ if (ret)
+ exit_nm();
- nm_init_cluster(&cluster);
-
- if (proc_mkdir("cluster", 0)) {
- if (proc_mkdir("cluster/nm", 0)) {
- }
- }
- nmprintk("calling register_filesystem\n");
- retval = register_filesystem(&nm_fs_type);
- nmprintk("done calling register_filesystem: ret=%d\n", retval);
- if (retval)
- nm_teardown();
- return retval;
+ return ret;
}
-static void __exit exit_nm(void)
-{
- nm_teardown();
- unregister_filesystem(&nm_fs_type);
- nm_destroy_ip_hash();
- kfree(nm_nodename);
- nmprintk("unloading nm module\n");
-}
-
-
-
-
MODULE_LICENSE("GPL");
module_init(init_nm)
module_exit(exit_nm)
Modified: trunk/fs/ocfs2/cluster/nodemanager.h
===================================================================
--- trunk/fs/ocfs2/cluster/nodemanager.h 2005-03-18 04:25:45 UTC (rev 2005)
+++ trunk/fs/ocfs2/cluster/nodemanager.h 2005-03-18 06:01:10 UTC (rev 2006)
@@ -33,54 +33,22 @@
#include "ocfs2_nodemanager.h"
-
-struct _nm_ctxt
-{
- int dummy;
-};
-
-enum {
- NM_CLUSTER_DOWN=0,
- NM_CLUSTER_UP
-};
-
-enum {
- NM_GROUP_NOT_READY=0,
- NM_GROUP_READY
-};
-
-enum {
- NM_Root = 1,
- NM_Cluster,
- NM_Node,
- NM_Group,
-};
-
-typedef struct _nm_cluster
-{
- char name[NM_MAX_NAME_LEN+1];
- int state;
- spinlock_t bitmap_lock;
- unsigned long group_bitmap[BITS_TO_LONGS(NM_MAX_NODES)];
- unsigned long node_bitmap[BITS_TO_LONGS(NM_MAX_NODES)];
-} nm_cluster;
-
-
-typedef struct _nm_group_inode_private
-{
- struct inode *inode;
- struct list_head net_list;
- struct list_head disk_list;
- cluster_disk disk;
- int state;
- spinlock_t bitmap_lock;
- unsigned long slot_bitmap[BITS_TO_LONGS(NM_MAX_NODES)];
-} nm_group_inode_private;
-
/* TODO: move this */
+/*
+ * this stores the per-socket state for each socket that we associate
+ * with a node. for remote nodes this is a socket that is established
+ * on demand and trades messages. For a local node this is just a listening
+ * socket that spawns message sockets from other nodes.
+ */
struct sock;
+/* this is still called net_inode_private for hysterical raisins. one
+ * has to draw the cleanup line somewhere.. */
typedef struct _net_inode_private
{
+ /* only used by the local node. */
+ struct task_struct *rx_thread;
+ /* the rest is for remote nodes */
+
/* sockets themselves don't seem to have a nice way to refcount them
* above sock_release. one could use iget/iput, but that seems
* to interact poory with sock_release() itself calling iput. */
@@ -97,99 +65,33 @@
struct page *page;
size_t page_off;
+
void (*orig_state_change)(struct sock *sk);
void (*orig_error_report)(struct sock *sk);
void (*orig_data_ready)(struct sock *sk, int bytes);
} net_inode_private;
-typedef struct _nm_node_inode_private
-{
- struct inode *inode;
- nm_node_info node;
- struct list_head ip_hash;
- net_inode_private net;
-} nm_node_inode_private;
+struct nm_node {
+ struct kobject nd_kobj;
+ char nd_name[NM_MAX_NAME_LEN+1]; /* replace? */
+ __u8 nd_num;
+ /* only one address per node, as attributes, for now. both
+ * in network order */
+ __u32 nd_ipv4_address;
+ __u16 nd_ipv4_port;
+ struct rb_node nd_ip_node;
+ /* there can be only one local node for now */
+ int nd_local;
-
-/* callback stuff */
-
-enum {
- NM_NODE_ADD_CB = 0,
- NM_NODE_DEL_CB,
- NM_GROUP_ADD_CB,
- NM_GROUP_DEL_CB,
- NM_GROUP_NODE_ADD_CB,
- NM_GROUP_NODE_DEL_CB,
- NM_NUM_CB
+ /* we're making simple assertions that a node can only have one network
+ * identity and report at one place in a heartbeat */
+ net_inode_private nd_net_inode_private;
};
-typedef void (nm_cb_func)(void *, void *, u8);
+u8 nm_this_node(void);
-typedef struct _nm_callback_func
-{
- struct list_head list;
- nm_cb_func *func;
- //void (*func)(void *, void *, u8);
-} nm_callback_func;
+struct nm_node * nm_get_node_by_num(u8 node_num);
+struct nm_node * nm_get_node_by_ip(u32 addr);
+void nm_node_put(struct nm_node *node);
-
-
-
-u8 nm_this_node(struct inode *group);
-nm_cluster * nm_get_cluster(void);
-int nm_register_callback(int type, void (*func)(void *, void *, u8));
-int nm_unregister_callback(int type, void (*func)(void *, void *, u8));
-int nm_get_group_num_nodes(struct inode *group);
-int nm_make_group_ready(struct inode *group);
-void * nm_iterate_group_disk_slots(struct inode *group, int *idx);
-int nm_remove_node_from_group(struct inode *group, struct inode *node);
-int nm_create_group(char *buf, nm_op *data);
-int nm_add_node_to_group(char *buf, nm_op *data);
-
-extern char *nm_nodename;
-
-
-struct inode * nm_get_group_by_num(u8 group_num);
-struct inode * nm_get_node_by_num(u8 node_num);
-struct inode * __nm_get_node_by_name(const char *node_name, int dir);
-struct inode * nm_get_node_by_ip(u32 addr);
-struct inode * nm_get_group_node_by_index(struct inode *group, u8 index);
-
-static inline struct inode * nm_get_node_by_name(char *node_name)
-{
- return __nm_get_node_by_name(node_name, 0);
-}
-static inline struct inode * nm_get_group_by_name(const char *group_name)
-{
- return __nm_get_node_by_name(group_name, 1);
-}
-
-
-static inline int nm_get_node_global_index(struct inode *node)
-{
- return (node->i_ino - NM_NODE_INODE_START);
-}
-static inline int nm_get_group_global_index(struct inode *group)
-{
- return (group->i_ino - NM_GROUP_INODE_START);
-}
-
-static inline int nm_valid_ino(int ino)
-{
-#if 0
- // these should never be referred to in kernel
- if (ino >= NM_Cluster && ino <= NM_Group)
- return 1;
-#endif
- if (ino >= NM_NODE_INODE_START &&
- ino < NM_NODE_INODE_START + NM_MAX_NODES)
- return 1;
- if (ino >= NM_GROUP_INODE_START &&
- ino < NM_GROUP_INODE_START + NM_MAX_NODES)
- return 1;
- return 0;
-}
-
-
-
#endif /* CLUSTER_NODEMANAGER_H */
Modified: trunk/fs/ocfs2/cluster/ocfs2_nodemanager.h
===================================================================
--- trunk/fs/ocfs2/cluster/ocfs2_nodemanager.h 2005-03-18 04:25:45 UTC (rev 2005)
+++ trunk/fs/ocfs2/cluster/ocfs2_nodemanager.h 2005-03-18 06:01:10 UTC (rev 2006)
@@ -43,29 +43,6 @@
#define NM_GROUP_INODE_START 200000
#define NM_NODE_INODE_START 100000
-
-
-typedef struct _nm_network_iface
-{
- __u16 ip_port; /* for simplicity, just define exactly one port for this if */
- __u16 ip_version;
- union {
- __u32 ip_addr4; /* IPv4 address in NBO */
- __u32 ip_addr6[4]; /* IPv6 address in NBO */
- } addr_u;
-} nm_network_iface;
-
-typedef struct _nm_node_info
-{
- __u8 node_num;
- __u8 pad1;
- __u16 pad2;
- __u32 pad3;
- char node_name[NM_MAX_NAME_LEN+1];
- char pad4[63];
- nm_network_iface ifaces[NM_MAX_IFACES];
-} nm_node_info;
-
/* transaction file nm_op stuff */
#define NM_OP_MAGIC 0xbeaf
@@ -101,7 +78,7 @@
union {
__u8 index;
char name[NM_MAX_NAME_LEN+1];
- nm_node_info node;
+// nm_node_info node;
nm_group_change gc;
} arg_u;
} nm_op;
Deleted: trunk/fs/ocfs2/cluster/ocfs2_tcp.h
===================================================================
--- trunk/fs/ocfs2/cluster/ocfs2_tcp.h 2005-03-18 04:25:45 UTC (rev 2005)
+++ trunk/fs/ocfs2/cluster/ocfs2_tcp.h 2005-03-18 06:01:10 UTC (rev 2006)
@@ -1,46 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * ocfs2_tcp.h
- *
- * Copyright (C) 2002, 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#ifndef _OCFS2_TCP_H
-#define _OCFS2_TCP_H
-
-typedef struct _gsd_ioc
-{
- int fd;
- int namelen;
- char name[NM_MAX_NAME_LEN+1];
- int status;
-} gsd_ioc;
-
-typedef struct _net_ioc
-{
- __u32 status;
-} net_ioc;
-
-#define NET_IOC_MAGIC 'O'
-#define NET_IOC_ACTIVATE _IOR(NET_IOC_MAGIC, 1, net_ioc)
-#define NET_IOC_GETSTATE _IOR(NET_IOC_MAGIC, 2, net_ioc)
-#define GSD_IOC_CREATE_GROUP _IOR(NET_IOC_MAGIC, 3, gsd_ioc)
-#define GSD_IOC_ADD_GROUP_NODE _IOR(NET_IOC_MAGIC, 4, gsd_ioc)
-
-#endif /* _OCFS2_TCP_H */
Modified: trunk/fs/ocfs2/cluster/tcp.c
===================================================================
--- trunk/fs/ocfs2/cluster/tcp.c 2005-03-18 04:25:45 UTC (rev 2005)
+++ trunk/fs/ocfs2/cluster/tcp.c 2005-03-18 06:01:10 UTC (rev 2006)
@@ -63,17 +63,22 @@
* - handers must be callable from bh context
* but it really depends on what the semantics and messages are.
*
- * XXX we should resolve these before release
+ * asap
+ * - only have lookup succeed for active nodes (fully configured)
+ * - only initiate connections if rx thread is running?
+ * - don't allow node rmdir if it has socket and rx thread is running
+ * - tear down all node sockets on rx thread exit
+ * - have rx thread stop active tx and wait for them
+ * - make sure ->net.page gets torn down with net_inode_private
+ * - tear down sockets on exit.. via removing their inodes?
+ *
+ * XXX
* - disable preemt before calling rx handler when debugging
* - find explicit stack call to drain rx queue
* - add trivial version trading message at the start of a conn
* - go nuts adding static
* - nsc waiting is buggy, should be on socket.. wake w/err if socket dies
* - compare socks in attach_sock so both size don't close
- * - implement net_remove_handlers
- * - make sure ->net.page gets torn down with net_inode_private
- * - tear down sockets on exit.. via removing their inodes?
- * - simplify rx thread exit path (completion, etc)
*/
#include <linux/module.h>
@@ -101,7 +106,6 @@
#include "util.h"
-#include "gsd.h"
#include "heartbeat.h"
#include "tcp.h"
#include "nodemanager.h"
@@ -145,10 +149,6 @@
#define sk_state_change state_change
#endif
-static u16 ip_version, ip_port;
-static struct inode *net_inode = NULL;
-static u8 net_node_num;
-
/* all this state should eventually be brought up by object activation
* and tied to that object rather than being globally valid at insmod */
static spinlock_t net_handler_lock = SPIN_LOCK_UNLOCKED;
@@ -159,6 +159,7 @@
static spinlock_t net_active_lock = SPIN_LOCK_UNLOCKED;
static LIST_HEAD(net_active_list);
+/* XXX someday we'll need better accounting */
static struct task_struct *net_recv_task = NULL;
static inline void net_abort_status_return(net_status_ctxt *nsc)
@@ -169,27 +170,16 @@
spin_unlock(&net_status_lock);
}
-static int net_register_hb_callbacks(void);
-static void net_unregister_hb_callbacks(void);
-
/////////////////////
-static void net_shutdown(void);
-static int net_startup(void);
-static int __init net_driver_entry (void);
-static int net_init_driver(void);
-static void __exit net_driver_exit (void);
static int net_add_handler(net_msg_handler *nmh);
-static void net_remove_handlers(void);
-static struct socket *net_init_tcp_recv_sock(void);
+static struct socket *net_init_tcp_recv_sock(u16 port);
static int net_receive_thread(void *data);
static int net_receive(void);
static void net_try_accept(struct socket *sock);
static int net_process_message(struct socket *sock, net_msg *hdr);
-static int net_ioctl (struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg);
-static int net_sock_addref_or_connect(struct inode *inode,
- struct socket **sock_ret);
-static void net_sock_decref(struct inode *inode, int error);
+static int net_sock_addref_or_connect(u8 node_num, struct socket **sock_ret);
+static void net_sock_decref(struct socket *sock, int error);
//////////////////////
@@ -213,161 +203,19 @@
spin_unlock(&net_handler_lock);
}
-
-DECLARE_MUTEX(net_state_lock);
-u32 net_driver_state = NET_DRIVER_UNINITED;
-u32 net_num_dispatched = 0;
-
-
-/*
- * net_driver_entry()
- *
- * Driver entry point. Called on insmod.
- */
-static int __init net_driver_entry (void)
+int net_start_rx_thread(struct nm_node *node)
{
- struct proc_dir_entry *de;
- de = proc_mkdir("cluster/net", 0);
- if (!de)
- return -1;
- de->proc_fops->ioctl = net_ioctl;
-
- if (net_register_hb_callbacks())
- return -1;
-
- netprintk0("Loaded net Driver module\n");
- return 0;
-} /* net_driver_entry */
-
-static int net_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
- unsigned long arg)
-{
- net_ioc data;
- int ret = 0;
- struct file *file = NULL;
-
- if (_IOC_TYPE (cmd) != NET_IOC_MAGIC) {
- ret = -ENOTTY;
- goto exit_ioctl;
- }
-
- switch (cmd) {
- case NET_IOC_ACTIVATE:
- memset(&data, 0, sizeof(net_ioc));
- down(&net_state_lock);
- data.status = net_driver_state;
- if (net_driver_state == NET_DRIVER_UNINITED) {
- ret = net_init_driver();
- if (ret < 0) {
- netprintk("error trying to activate net driver: %d\n", ret);
- data.status = NET_DRIVER_UNINITED;
- } else {
- netprintk0("activated net driver!\n");
- net_driver_state = data.status = NET_DRIVER_READY;
- }
- }
- up(&net_state_lock);
-
- ret = copy_to_user ((net_ioc *) arg, &data,
- sizeof (net_ioc));
- break;
- case NET_IOC_GETSTATE:
- memset(&data, 0, sizeof(net_ioc));
- down(&net_state_lock);
- data.status = net_driver_state;
- up(&net_state_lock);
- ret = copy_to_user ((net_ioc *) arg, &data,
- sizeof (net_ioc));
- break;
-
- case GSD_IOC_CREATE_GROUP:
- case GSD_IOC_ADD_GROUP_NODE:
- ret = gsd_ioctl(inode, filp, cmd, arg);
- break;
- default:
- ret = -ENOTTY;
- break;
- }
-
-exit_ioctl:
-
- if (file)
- fput(file);
-
- return ret;
-} /* net_ioctl */
-
-static int net_init_driver(void)
-{
- nm_node_info *info;
- nm_node_inode_private *priv;
-
- /* get the global node number for this node */
- net_node_num = nm_this_node(NULL);
- if (net_node_num >= NM_MAX_NODES) {
- netprintk0("local nm node number not initialized!\n");
- return -1;
- }
- net_inode = nm_get_node_by_num(net_node_num);
- if (!net_inode) {
- netprintk0("local nm node inode not initialized!\n");
- return -1;
- }
- priv = (nm_node_inode_private *)net_inode->u.generic_ip;
- if (!priv) {
- iput(net_inode);
- netprintk0("local nm node info not initialized!\n");
- return -1;
- }
- info = &priv->node;
- ip_version = info->ifaces[0].ip_version;
- ip_port = info->ifaces[0].ip_port;
-
- if (net_startup() < 0)
- return -1;
-
- if (gsd_setup() < 0)
- return -1;
-
- return 0;
-} /* net_init_driver*/
-
-
-/*
- * net_driver_exit()
- *
- * Called on rmmod
- */
-static void __exit net_driver_exit (void)
-{
- down(&net_state_lock);
- if (net_driver_state == NET_DRIVER_READY) {
- netprintk0("shutting down network\n");
- net_shutdown();
- netprintk0("removing all net driver handlers\n");
- net_remove_handlers();
- gsd_teardown();
- if (net_inode)
- iput(net_inode);
- net_driver_state = NET_DRIVER_UNINITED;
- }
- up(&net_state_lock);
- remove_proc_entry("cluster/net", NULL);
- net_unregister_hb_callbacks();
- netprintk0("Unloading net driver module\n");
- return;
-} /* net_driver_exit */
-
-
-static int net_startup(void)
-{
struct socket *sock;
+ net_inode_private *net = &node->nd_net_inode_private;
int ret = 0;
+ BUG_ON(net->rx_thread != NULL);
+ BUG_ON(net_recv_task != NULL);
+
/* if the thread was setting up the rx socket we'd like to have it
* communicate errors back to us here. us setting up the socket
* and passing it to the thread is easier */
- sock = net_init_tcp_recv_sock();
+ sock = net_init_tcp_recv_sock(node->nd_ipv4_port);
if (IS_ERR(sock)) {
ret = PTR_ERR(sock);
goto out;
@@ -375,10 +223,11 @@
netprintk0("starting net receive thread...\n");
- net_recv_task = kthread_run(net_receive_thread, sock, "netrecv");
- if (IS_ERR(net_recv_task)) {
- ret = PTR_ERR(net_recv_task);
- net_recv_task = NULL;
+ net->rx_thread = kthread_run(net_receive_thread, sock,
+ "netrecv-%s", node->nd_name);
+ if (IS_ERR(net->rx_thread)) {
+ ret = PTR_ERR(net->rx_thread);
+ net->rx_thread = NULL;
netprintk("unable to launch net receive thread, error=%ld\n",
(long)ret);
goto out;
@@ -386,6 +235,7 @@
/* once the thread is running it has ownership of the sock */
sock = NULL;
+ net_recv_task = net->rx_thread;
out:
if (sock)
@@ -393,12 +243,21 @@
return 0;
}
-static void net_shutdown(void)
+void net_stop_rx_thread(struct nm_node *node)
{
- if (net_recv_task) {
+ net_inode_private *net = &node->nd_net_inode_private;
+ if (net->rx_thread) {
netprintk("waiting for net thread to exit....\n");
- kthread_stop(net_recv_task);
+ kthread_stop(net->rx_thread);
+ net->rx_thread = NULL;
+ net_recv_task = NULL;
}
+
+ /* XXX if we stop the thread we've cut off the rx path for all the
+ * nodes.. we should walk their net_inode_privates and tear down their
+ * sockets. tx shouldn't bring up a conn if there is no
+ * rx thread and rmdir should sync with the rx thread and tx
+ * references.. ugh. */
}
static int net_rx_should_wake(struct socket *sock)
@@ -534,16 +393,6 @@
return ret;
}
-
-/* TODO Fix */
-static void net_remove_handlers(void)
-{
- /* TODO: make an iterator in nm for running over each global inode
- * do I have this already? then call destroy on each. last put
- * will do the work. doesnt matter if it's slow. this is only
- * on shutdown... */
-}
-
static int net_recv_tcp_msg(struct socket *sock, void *data, size_t len)
{
int ret;
@@ -613,7 +462,7 @@
}
int net_send_message_iov(u32 msg_type, u32 key, struct iovec *caller_iov,
- size_t caller_iovlen, struct inode *inode,
+ size_t caller_iovlen, u8 target_node,
int *status)
{
int ret;
@@ -626,13 +475,8 @@
struct iovec *iov = NULL;
struct socket *sock = NULL;
- BUG_ON(current == net_recv_task);
+ BUG_ON(net_recv_task && (current == net_recv_task));
- if (!inode || !inode->u.generic_ip) {
- netprintk0("bad inode, cannot send message\n");
- ret = -EINVAL;
- goto out;
- }
if (caller_iovlen == 0) {
netprintk0("bad iovec array length\n");
ret = -EINVAL;
@@ -648,7 +492,7 @@
goto out;
}
- ret = net_sock_addref_or_connect(inode, &sock);
+ ret = net_sock_addref_or_connect(target_node, &sock);
if (ret)
goto out;
@@ -688,8 +532,7 @@
nsc.msg_num = msg->msg_num;
nsc.sys_status = NET_ERR_NONE;
nsc.status = 0;
- /* XXX: Should be using group index here. */
- nsc.target_node = nm_get_node_global_index(inode);
+ nsc.target_node = target_node;
init_waitqueue_entry(&sleep, current);
add_wait_queue(&nsc.wq, &sleep);
@@ -732,7 +575,7 @@
if (cleanup_wq)
remove_wait_queue(&nsc.wq, &sleep);
if (sock)
- net_sock_decref(inode, cleanup_sock);
+ net_sock_decref(sock, cleanup_sock);
if (iov)
kfree(iov);
if (msg)
@@ -758,13 +601,14 @@
* - status will not be set on return code != 0
*/
int net_send_message(u32 msg_type, u32 key, void *data, u32 len,
- struct inode *inode, int *status)
+ u8 target_node, int *status)
{
struct iovec iov = {
.iov_base = data,
.iov_len = len,
};
- return net_send_message_iov(msg_type, key, &iov, 1, inode, status);
+ return net_send_message_iov(msg_type, key, &iov, 1,
+ target_node, status);
}
EXPORT_SYMBOL(net_send_message);
@@ -887,9 +731,7 @@
static int net_receive(void)
{
- struct inode *inode;
LIST_HEAD(snapshot_list);
- nm_node_inode_private *priv;
net_inode_private *net;
struct socket *sock;
net_msg *hdr;
@@ -915,14 +757,13 @@
list_del_init(&net->active_item);
spin_unlock_bh(&net_active_lock);
- priv = container_of(net, nm_node_inode_private, net);
- inode = priv->inode;
sock = NULL;
err = 0;
read_eagain = 0;
read_some = 0;
+ /* basically a manual addref that doesn't connect :/ */
spin_lock_bh(&net->sock_lock);
if (net->sock && !net->sock_pending) {
sock = net->sock;
@@ -1009,9 +850,9 @@
spin_unlock_bh(&net_active_lock);
netprintk("net %p finished reading with %d\n", net, err);
- if (err < 0 && err != -EAGAIN) {
+ if (sock && err < 0 && err != -EAGAIN) {
netprintk("socket saw err %d, closing\n", err);
- net_sock_decref(inode, err);
+ net_sock_decref(sock, err);
}
}
@@ -1064,8 +905,11 @@
netprintk("node %u died, killed %d messages\n", node, num_kills);
}
-static void net_hb_node_down_cb(struct inode *group,
- struct inode *node,
+/* this callback is registered on insmod and torn down on rmmod.
+ * the list and locks that it uses to kill messages are statically
+ * defined so it should be ok.. it just has to carefully be called
+ * after hb is ready and before hb is torn down */
+static void net_hb_node_down_cb(struct nm_node *node,
int node_num,
void *data)
{
@@ -1075,7 +919,7 @@
static struct hb_callback_func *net_hb_down = NULL;
#define NET_HB_NODE_DOWN_PRI (0x1)
-static int net_register_hb_callbacks(void)
+int net_register_hb_callbacks(void)
{
net_hb_down = kmalloc(sizeof(*net_hb_down), GFP_KERNEL);
if (!net_hb_down)
@@ -1087,7 +931,7 @@
return hb_register_callback(net_hb_down);
}
-static void net_unregister_hb_callbacks(void)
+void net_unregister_hb_callbacks(void)
{
int status;
@@ -1153,7 +997,6 @@
if (syserr != NET_ERR_NONE)
goto out_respond;
- net_num_dispatched++;
handler_status = (hnd->func)(hdr, sizeof(net_msg) + hdr->data_len, hnd->data);
out_respond:
@@ -1301,7 +1144,7 @@
state_change(sk);
}
-static int net_start_connect(net_inode_private *net, nm_node_info *node)
+static int net_start_connect(net_inode_private *net, u32 addr, u16 port)
{
struct socket *sock = NULL;
struct sock *sk;
@@ -1325,9 +1168,9 @@
}
memset (&remoteaddr, 0, sizeof (remoteaddr));
- remoteaddr.sin_family = net_ip_version_to_family(node->ifaces[0].ip_version);
- remoteaddr.sin_addr.s_addr = node->ifaces[0].addr_u.ip_addr4;
- remoteaddr.sin_port = node->ifaces[0].ip_port;
+ remoteaddr.sin_family = AF_INET;
+ remoteaddr.sin_addr.s_addr = addr;
+ remoteaddr.sin_port = port;
net->sock = sock;
@@ -1380,19 +1223,14 @@
set_fs(oldfs);
}
-static void net_sock_decref(struct inode *inode, int error)
+static void net_sock_decref(struct socket *sock, int error)
{
net_inode_private *net = NULL;
- nm_node_inode_private *priv;
- struct socket *sock = NULL;
int release = 0;
- priv = inode->u.generic_ip;
- if (!priv) {
- netprintk("bad inode %p\n", inode);
- return;
- }
- net = &priv->net;
+ /* we hold a ref, this should be stable */
+ net = sock->sk->sk_user_data;
+ BUG_ON(net == NULL);
spin_lock_bh(&net->sock_lock);
@@ -1441,24 +1279,23 @@
return empty;
}
-static int net_sock_addref_or_connect(struct inode *inode,
- struct socket **sock_ret)
+static int net_sock_addref_or_connect(u8 target_node, struct socket **sock_ret)
{
- nm_node_inode_private *priv;
- nm_node_info *node;
+ struct nm_node *node = NULL;
net_inode_private *net = NULL;
struct socket *sock = NULL;
int ret = 0, wait = 0, set_pending = 0;
struct waiting_for_sock wfs;
- priv = inode->u.generic_ip;
- if (!priv) {
- netprintk("bad inode %p\n", inode);
+ /* XXX think about passing refs around.. */
+ node = nm_get_node_by_num(target_node);
+ if (node == NULL) {
+ netprintk("node %u unknown\n", target_node);
ret = -EINVAL;
goto out;
}
- net = &priv->net;
- node = &priv->node;
+ /* XXX verify that node is fully configured, rx thread is going */
+ net = &node->nd_net_inode_private;
spin_lock_bh(&net->sock_lock);
if (net->sock && !net->sock_pending) {
@@ -1481,7 +1318,8 @@
spin_unlock_bh(&net->sock_lock);
if (set_pending) {
- ret = net_start_connect(net, node);
+ ret = net_start_connect(net, node->nd_ipv4_address,
+ node->nd_ipv4_port);
if (ret)
goto out;
}
@@ -1518,6 +1356,8 @@
}
if (sock)
*sock_ret = sock;
+ if (node)
+ nm_node_put(node);
BUG_ON(ret == 0 && sock == NULL);
netprintk("addref for net %p gave %d\n", net, ret);
@@ -1529,8 +1369,7 @@
int error, slen;
struct sockaddr_in sin;
struct socket *new_sock = NULL;
- struct inode *inode = NULL;
- nm_node_inode_private *priv;
+ struct nm_node *node = NULL;
BUG_ON(sock == NULL);
error = sock_create_lite(sock->sk->sk_family,
@@ -1555,24 +1394,21 @@
netprintk("attempt to connect from %u.%u.%u.%u:%04x\n",
NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
- inode = nm_get_node_by_ip(sin.sin_addr.s_addr);
- if (inode == NULL) {
+ node = nm_get_node_by_ip(sin.sin_addr.s_addr);
+ if (node == NULL) {
netprintk0("connect from unknown host...\n");
net_send_error(new_sock, NET_UNKNOWN_HOST);
goto out;
}
- priv = inode->u.generic_ip;
- BUG_ON(priv == NULL);
+ netprintk("connect from known host: %s\n", node->nd_name);
- netprintk("connect from known host: %s\n", priv->node.node_name);
-
if (ntohs(sin.sin_port) >= 1024)
netprintk("warning: connect from unprivileged port: "
"%u.%u.%u.%u:%d\n", NIPQUAD(sin.sin_addr.s_addr),
ntohs(sin.sin_port));
- error = net_attach_sock(&priv->net, new_sock);
+ error = net_attach_sock(&node->nd_net_inode_private, new_sock);
if (error == -EEXIST)
net_send_error(new_sock, NET_ALREADY_CONNECTED);
@@ -1582,36 +1418,33 @@
net_sock_drain(new_sock);
sock_release(new_sock);
}
- if (inode)
- iput(inode);
}
+ if (node)
+ nm_node_put(node);
return;
}
-static struct socket *net_init_tcp_recv_sock(void)
+static struct socket *net_init_tcp_recv_sock(u16 port)
{
struct sockaddr_in sin;
struct socket *sock;
int error;
- error = sock_create(net_ip_version_to_family(ip_version),
- SOCK_STREAM, IPPROTO_TCP,
- &sock);
+ error = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
if (error < 0) {
netprintk("unable to create socket, error=%d\n", error);
goto bail;
}
memset(&sin, 0, sizeof(sin));
- sin.sin_family = net_ip_version_to_family(ip_version);
+ sin.sin_family = PF_INET;
sin.sin_addr.s_addr = htonl(INADDR_ANY);
- sin.sin_port = ip_port;
+ sin.sin_port = port;
- error = sock->ops->bind(sock, (struct sockaddr *)&sin,
- sizeof(sin));
+ error = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin));
if (error < 0) {
netprintk ("unable to bind socket to port %d, error=%d\n",
- ntohs(ip_port), error);
+ ntohs(port), error);
goto bail;
}
@@ -1629,7 +1462,3 @@
BUG_ON(sock == NULL);
return sock;
}
-
-MODULE_LICENSE("GPL");
-module_init (net_driver_entry);
-module_exit (net_driver_exit);
Modified: trunk/fs/ocfs2/cluster/tcp.h
===================================================================
--- trunk/fs/ocfs2/cluster/tcp.h 2005-03-18 04:25:45 UTC (rev 2005)
+++ trunk/fs/ocfs2/cluster/tcp.h 2005-03-18 06:01:10 UTC (rev 2006)
@@ -41,17 +41,11 @@
#include <linux/in.h>
/*
- * FIXME: no need for clcommon.h and nodemanager.h except for
+ * FIXME: no need for nodemanager.h except for
* NM_MAX_NAME_LEN...shouldn't that be something or somewhere else?
*/
-#include "clcommon.h"
#include "nodemanager.h"
-#include "ocfs2_tcp.h"
-
-#define NET_DISP_THREAD_MS 5000 /* TODO */
-#define NET_RECV_THREAD_MS 5000 /* TODO */
-
enum net_system_error {
NET_ERR_NONE = 0,
NET_ERR_NO_HNDLR,
@@ -155,24 +149,6 @@
}
-static inline int net_ip_version_to_family(u16 ip_version)
-{
- printk("ip_version passed: %u, host byteorder: %u\n", ip_version, ntohs(ip_version));
- return PF_INET;
- switch (ntohs(ip_version)) {
- case 4:
- return PF_INET;
- case 6:
- return PF_INET6;
- default:
- BUG();
- }
-
- return 4;
-}
-
-
-
/* TODO: figure this out.... */
static inline int net_link_down(int err, struct socket *sock)
{
@@ -213,33 +189,16 @@
int net_register_handler(u32 msg_type, u32 key, int flags,
u32 max_len, net_msg_handler_func *func, void *data);
int net_init_tcp_sock(struct inode *inode);
-int net_send_message(u32 msg_type, u32 key, void *data, u32 len, struct inode *inode, int *status);
+int net_send_message(u32 msg_type, u32 key, void *data, u32 len,
+ u8 target_node, int *status);
int net_send_message_iov(u32 msg_type, u32 key, struct iovec *iov,
- size_t iovlen, struct inode *inode, int *status);
+ size_t iovlen, u8 target_node, int *status);
int net_broadcast_message(u32 msg_type, u32 key, void *data, u32 len, struct inode *group);
net_msg_handler * net_lookup_handler(u32 msg_type, u32 key);
-#define GSD_MESSAGE 130
-#define GSD_ACTION_ADD_GROUP (0x01)
-#define GSD_ACTION_ADD_GROUP_NODE (0x02)
+int net_register_hb_callbacks(void);
+void net_unregister_hb_callbacks(void);
+int net_start_rx_thread(struct nm_node *node);
+void net_stop_rx_thread(struct nm_node *node);
-typedef struct _gsd_message
-{
- u8 from;
- u8 action;
- u8 namelen;
- u8 pad1;
- u32 pad2;
- u8 name[NM_MAX_NAME_LEN];
-} gsd_message;
-
-static inline void gsd_message_to_net(gsd_message *g)
-{
- /* do nothing */
-}
-static inline void gsd_message_to_host(gsd_message *g)
-{
- /* do nothing */
-}
-
#endif /* CLUSTER_TCP_H */
Modified: trunk/fs/ocfs2/dlm/dlmast.c
===================================================================
--- trunk/fs/ocfs2/dlm/dlmast.c 2005-03-18 04:25:45 UTC (rev 2005)
+++ trunk/fs/ocfs2/dlm/dlmast.c 2005-03-18 06:01:10 UTC (rev 2006)
@@ -42,7 +42,6 @@
#include "util.h"
-#include "cluster/clcommon.h"
#include "cluster/heartbeat.h"
#include "cluster/nodemanager.h"
#include "cluster/tcp.h"
@@ -101,18 +100,18 @@
DLM_ASSERT(lksb);
/* only updates if this node masters the lockres */
- if (res->owner == dlm->group_index) {
+ if (res->owner == dlm->node_num) {
spin_lock(&res->spinlock);
/* check the lksb flags for the direction */
if (lksb->flags & DLM_LKSB_GET_LVB) {
dlmprintk("getting lvb from lockres for %s node\n",
- lock->ml.node == dlm->group_index ? "master" :
+ lock->ml.node == dlm->node_num ? "master" :
"remote");
memcpy(lksb->lvb, res->lvb, DLM_LVB_LEN);
} else if (lksb->flags & DLM_LKSB_PUT_LVB) {
dlmprintk("setting lvb from lockres for %s node\n",
- lock->ml.node == dlm->group_index ? "master" :
+ lock->ml.node == dlm->node_num ? "master" :
"remote");
memcpy(res->lvb, lksb->lvb, DLM_LVB_LEN);
}
@@ -136,7 +135,7 @@
DLM_ASSERT(lksb);
fn = lock->ast;
DLM_ASSERT(fn);
- DLM_ASSERT(lock->ml.node == dlm->group_index);
+ DLM_ASSERT(lock->ml.node == dlm->node_num);
dlm_update_lvb(dlm, res, lock);
(*fn)(lock->astdata);
@@ -155,7 +154,7 @@
DLM_ASSERT(res);
lksb = lock->lksb;
DLM_ASSERT(lksb);
- DLM_ASSERT(lock->ml.node != dlm->group_index);
+ DLM_ASSERT(lock->ml.node != dlm->node_num);
ret = 0;
dlm_update_lvb(dlm, res, lock);
@@ -172,7 +171,7 @@
dlm_bastlockfunc_t *fn = lock->bast;
dlmprintk0("\n");
- DLM_ASSERT(lock->ml.node == dlm->group_index);
+ DLM_ASSERT(lock->ml.node == dlm->node_num);
DLM_ASSERT(fn);
(*fn)(lock->astdata, blocked_type);
@@ -242,7 +241,7 @@
}
/* cannot get a proxy ast message if this node owns it */
- DLM_ASSERT(res->owner != dlm->group_index);
+ DLM_ASSERT(res->owner != dlm->node_num);
dlmprintk("lockres %.*s\n", res->lockname.len, res->lockname.name);
if (!dlm_is_recovery_lock(past->name, past->namelen))
@@ -329,7 +328,6 @@
{
int ret = 0;
dlm_proxy_ast past;
- struct inode *inode = NULL;
struct iovec iov[2];
size_t iovlen = 1;
@@ -338,7 +336,7 @@
msg_type, blocked_type);
memset(&past, 0, sizeof(dlm_proxy_ast));
- past.node_idx = dlm->group_index;
+ past.node_idx = dlm->node_num;
past.type = msg_type;
past.blocked_type = blocked_type;
past.namelen = res->lockname.len;
@@ -355,14 +353,9 @@
iovlen++;
}
- ret = -EINVAL;
- inode = nm_get_group_node_by_index(dlm->group, lock->ml.node);
- if (inode) {
- dlm_proxy_ast_to_net(&past);
- ret = net_send_message_iov(DLM_PROXY_AST_MSG, dlm->key,
- iov, iovlen, inode, NULL);
- iput(inode);
- }
+ dlm_proxy_ast_to_net(&past);
+ ret = net_send_message_iov(DLM_PROXY_AST_MSG, dlm->key, iov, iovlen,
+ lock->ml.node, NULL);
if (ret < 0)
dlmprintk("(%d) dlm_send_proxy_ast: returning %d\n",
current->pid, ret);
Modified: trunk/fs/ocfs2/dlm/dlmconvert.c
===================================================================
--- trunk/fs/ocfs2/dlm/dlmconvert.c 2005-03-18 04:25:45 UTC (rev 2005)
+++ trunk/fs/ocfs2/dlm/dlmconvert.c 2005-03-18 06:01:10 UTC (rev 2006)
@@ -42,7 +42,6 @@
#include "util.h"
-#include "cluster/clcommon.h"
#include "cluster/heartbeat.h"
#include "cluster/nodemanager.h"
#include "cluster/tcp.h"
@@ -189,7 +188,7 @@
res->lockname.name, dlm_lock_mode_name(type));
/* immediately grant the new lock type */
lock->lksb->status = DLM_NORMAL;
- if (lock->ml.node == dlm->group_index)
+ if (lock->ml.node == dlm->node_num)
dlmprintk0("doing in-place convert for nonlocal lock\n");
lock->ml.type = type;
status = DLM_NORMAL;
@@ -306,7 +305,6 @@
dlm_lock_resource *res,
dlm_lock *lock, int flags, int type)
{
- struct inode *inode = NULL;
dlm_convert_lock convert;
int tmpret;
dlm_status ret;
@@ -317,7 +315,7 @@
dlmprintk0("\n");
memset(&convert, 0, sizeof(dlm_convert_lock));
- convert.node_idx = dlm->group_index;
+ convert.node_idx = dlm->node_num;
convert.requested_type = type;
convert.cookie = lock->ml.cookie;
convert.namelen = res->lockname.len;
@@ -334,21 +332,16 @@
iovlen++;
}
- ret = DLM_NOLOCKMGR;
- inode = nm_get_group_node_by_index(dlm->group, res->owner);
- if (inode) {
- dlm_convert_lock_to_net(&convert);
- tmpret = net_send_message_iov(DLM_CONVERT_LOCK_MSG, dlm->key,
- iov, iovlen, inode, &status);
- if (tmpret >= 0) {
- // successfully sent and received
- ret = status; // this is already a dlm_status
- } else {
- dlmprintk("error occurred in net_send_message: %d\n",
- tmpret);
- ret = dlm_err_to_dlm_status(tmpret);
- }
- iput(inode);
+ dlm_convert_lock_to_net(&convert);
+ tmpret = net_send_message_iov(DLM_CONVERT_LOCK_MSG, dlm->key,
+ iov, iovlen, res->owner, &status);
+ if (tmpret >= 0) {
+ // successfully sent and received
+ ret = status; // this is already a dlm_status
+ } else {
+ dlmprintk("error occurred in net_send_message: %d\n",
+ tmpret);
+ ret = dlm_err_to_dlm_status(tmpret);
}
return ret;
Modified: trunk/fs/ocfs2/dlm/dlmfs.c
===================================================================
--- trunk/fs/ocfs2/dlm/dlmfs.c 2005-03-18 04:25:45 UTC (rev 2005)
+++ trunk/fs/ocfs2/dlm/dlmfs.c 2005-03-18 06:01:10 UTC (rev 2006)
@@ -40,7 +40,6 @@
#include "util.h"
-#include "cluster/clcommon.h"
#include "cluster/nodemanager.h"
#include "cluster/heartbeat.h"
#include "cluster/tcp.h"
Modified: trunk/fs/ocfs2/dlm/dlmfs_compat.c
===================================================================
--- trunk/fs/ocfs2/dlm/dlmfs_compat.c 2005-03-18 04:25:45 UTC (rev 2005)
+++ trunk/fs/ocfs2/dlm/dlmfs_compat.c 2005-03-18 06:01:10 UTC (rev 2006)
@@ -5,7 +5,6 @@
#include "util.h"
-#include "cluster/clcommon.h"
#include "cluster/nodemanager.h"
#include "cluster/tcp.h"
Modified: trunk/fs/ocfs2/dlm/dlmlock.c
===================================================================
--- trunk/fs/ocfs2/dlm/dlmlock.c 2005-03-18 04:25:45 UTC (rev 2005)
+++ trunk/fs/ocfs2/dlm/dlmlock.c 2005-03-18 06:01:10 UTC (rev 2006)
@@ -42,7 +42,6 @@
#include "util.h"
-#include "cluster/clcommon.h"
#include "cluster/heartbeat.h"
#include "cluster/nodemanager.h"
#include "cluster/tcp.h"
@@ -197,7 +196,6 @@
dlm_lock_resource *res,
dlm_lock *lock, int flags)
{
- struct inode *inode = NULL;
dlm_create_lock create;
int tmpret, status = 0;
dlm_status ret;
@@ -205,29 +203,23 @@
dlmprintk0("\n");
memset(&create, 0, sizeof(create));
- create.node_idx = dlm->group_index;
+ create.node_idx = dlm->node_num;
create.requested_type = lock->ml.type;
create.cookie = lock->ml.cookie;
create.namelen = res->lockname.len;
create.flags = flags;
strncpy(create.name, res->lockname.name, create.namelen);
- ret = DLM_NOLOCKMGR;
- inode = nm_get_group_node_by_index(dlm->group, res->owner);
- if (inode) {
- dlm_create_lock_to_net(&create);
- tmpret = net_send_message(DLM_CREATE_LOCK_MSG, dlm->key,
- &create, sizeof(create),
- inode, &status);
- if (tmpret >= 0) {
- // successfully sent and received
- ret = status; // this is already a dlm_status
- } else {
- dlmprintk("error occurred in net_send_message: %d\n",
- tmpret);
- ret = dlm_err_to_dlm_status(tmpret);
- }
- iput(inode);
+ dlm_create_lock_to_net(&create);
+ tmpret = net_send_message(DLM_CREATE_LOCK_MSG, dlm->key, &create,
+ sizeof(create), res->owner, &status);
+ if (tmpret >= 0) {
+ // successfully sent and received
+ ret = status; // this is already a dlm_status
+ } else {
+ dlmprintk("error occurred in net_send_message: %d\n",
+ tmpret);
+ ret = dlm_err_to_dlm_status(tmpret);
}
return ret;
Modified: trunk/fs/ocfs2/dlm/dlmmaster.c
===================================================================
--- trunk/fs/ocfs2/dlm/dlmmaster.c 2005-03-18 04:25:45 UTC (rev 2005)
+++ trunk/fs/ocfs2/dlm/dlmmaster.c 2005-03-18 06:01:10 UTC (rev 2006)
@@ -42,7 +42,6 @@
#include "util.h"
-#include "cluster/clcommon.h"
#include "cluster/heartbeat.h"
#include "cluster/nodemanager.h"
#include "cluster/tcp.h"
@@ -243,8 +242,8 @@
/* copy off the node_map and register hb callbacks on our copy */
memcpy(mle->node_map, dlm->domain_map, sizeof(mle->node_map));
memcpy(mle->vote_map, dlm->domain_map, sizeof(mle->vote_map));
- clear_bit(dlm->group_index, mle->vote_map);
- clear_bit(dlm->group_index, mle->node_map);
+ clear_bit(dlm->node_num, mle->vote_map);
+ clear_bit(dlm->node_num, mle->node_map);
/* attach the mle to the domain node up/down events */
__dlm_mle_attach_hb_events(dlm, mle);
@@ -273,14 +272,11 @@
void dlm_mle_node_down(dlm_ctxt *dlm, dlm_master_list_entry *mle,
- struct inode *group, struct inode *node, int idx)
+ struct nm_node *node, int idx)
{
DLM_ASSERT(mle);
DLM_ASSERT(dlm);
- if (dlm->group != group)
- return;
-
spin_lock(&mle->spinlock);
if (!test_bit(idx, mle->node_map))
@@ -298,14 +294,11 @@
}
void dlm_mle_node_up(dlm_ctxt *dlm, dlm_master_list_entry *mle,
- struct inode *group, struct inode *node, int idx)
+ struct nm_node *node, int idx)
{
DLM_ASSERT(mle);
DLM_ASSERT(dlm);
- if (dlm->group != group)
- return;
-
spin_lock(&mle->spinlock);
#if 0
@@ -365,7 +358,7 @@
{
assert_spin_locked(&res->spinlock);
- if (owner == dlm->group_index)
+ if (owner == dlm->node_num)
atomic_inc(&dlm->local_resources);
else if (owner == DLM_LOCK_RES_OWNER_UNKNOWN)
atomic_inc(&dlm->unknown_resources);
@@ -384,7 +377,7 @@
if (owner == res->owner)
return;
- if (res->owner == dlm->group_index)
+ if (res->owner == dlm->node_num)
atomic_dec(&dlm->local_resources);
else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN)
atomic_dec(&dlm->unknown_resources);
@@ -590,7 +583,7 @@
/* caller knows it's safe to assume it's not mastered elsewhere
* DONE! return right away */
spin_lock(&res->spinlock);
- dlm_change_lockres_owner(dlm, res, dlm->group_index);
+ dlm_change_lockres_owner(dlm, res, dlm->node_num);
__dlm_insert_lock(dlm, res);
spin_unlock(&res->spinlock);
spin_unlock(&dlm->spinlock);
@@ -730,11 +723,11 @@
/* have all nodes responded? */
if (voting_done) {
bit = find_next_bit(mle->maybe_map, NM_MAX_NODES, 0);
- if (dlm->group_index <= bit) {
+ if (dlm->node_num <= bit) {
/* my node number is lowest.
* now tell other nodes that I am
* mastering this. */
- mle->master = dlm->group_index;
+ mle->master = dlm->node_num;
assert = 1;
sleep = 0;
}
@@ -766,7 +759,7 @@
ret = 0; /* done */
if (assert) {
- m = dlm->group_index;
+ m = dlm->node_num;
ret = dlm_do_assert_master(dlm, res->lockname.name,
res->lockname.len, mle->vote_map);
if (ret) {
@@ -809,13 +802,12 @@
static int dlm_do_master_request(dlm_master_list_entry *mle, int to)
{
- struct inode *inode = NULL;
dlm_ctxt *dlm = mle->dlm;
dlm_master_request request;
int ret, response=0;
memset(&request, 0, sizeof(request));
- request.node_idx = dlm->group_index;
+ request.node_idx = dlm->node_num;
if (mle->type == DLM_MLE_BLOCK) {
request.namelen = mle->u.name.len;
strncpy(request.name, mle->u.name.name, request.namelen);
@@ -825,53 +817,47 @@
request.namelen);
}
- ret = -EINVAL;
- inode = nm_get_group_node_by_index(dlm->group, to);
- if (inode) {
- dlm_master_request_to_net(&request);
- ret = net_send_message(DLM_MASTER_REQUEST_MSG, dlm->key,
- &request, sizeof(request),
- inode, &response);
- iput(inode);
- if (ret >= 0) {
- spin_lock(&mle->spinlock);
- switch (response) {
- case DLM_MASTER_RESP_YES:
- set_bit(to, mle->response_map);
- // dlmprintk("woot! node %u is the "
- // "master!\n", to);
- mle->master = to;
- break;
- case DLM_MASTER_RESP_NO:
- // dlmprintk("node %u is not the "
- // "master, not in-progress\n", to);
- set_bit(to, mle->response_map);
- break;
- case DLM_MASTER_RESP_MAYBE:
- // dlmprintk("node %u is not the "
- // "master, but IS in-progress\n", to);
- set_bit(to, mle->response_map);
- set_bit(to, mle->maybe_map);
- break;
- case DLM_MASTER_RESP_ERROR:
- dlmprintk("node %u hit an -ENOMEM! "
- "try everything again\n", to);
- mle->error = 1;
- break;
- default:
- dlmprintk("bad response! %u\n",
- response);
- ret = -EINVAL;
- break;
- }
- spin_unlock(&mle->spinlock);
- } else {
- dlmprintk("net_send_message returned %d!\n", ret);
- }
- } else {
- dlmprintk("nm_get_group_node_by_index failed to find inode "
- "for node %d!\n", to);
- }
+ dlm_master_request_to_net(&request);
+ ret = net_send_message(DLM_MASTER_REQUEST_MSG, dlm->key, &request,
+ sizeof(request), to, &response);
+ if (ret < 0) {
+ dlmprintk("net_send_message returned %d!\n", ret);
+ goto out;
+ }
+
+ spin_lock(&mle->spinlock);
+ switch (response) {
+ case DLM_MASTER_RESP_YES:
+ set_bit(to, mle->response_map);
+ // dlmprintk("woot! node %u is the "
+ // "master!\n", to);
+ mle->master = to;
+ break;
+ case DLM_MASTER_RESP_NO:
+ // dlmprintk("node %u is not the "
+ // "master, not in-progress\n", to);
+ set_bit(to, mle->response_map);
+ break;
+ case DLM_MASTER_RESP_MAYBE:
+ // dlmprintk("node %u is not the "
+ // "master, but IS in-progress\n", to);
+ set_bit(to, mle->response_map);
+ set_bit(to, mle->maybe_map);
+ break;
+ case DLM_MASTER_RESP_ERROR:
+ dlmprintk("node %u hit an -ENOMEM! "
+ "try everything again\n", to);
+ mle->error = 1;
+ break;
+ default:
+ dlmprintk("bad response! %u\n",
+ response);
+ ret = -EINVAL;
+ break;
+ }
+ spin_unlock(&mle->spinlock);
+
+out:
return ret;
}
@@ -923,16 +909,16 @@
* the node that called us */
memcpy(nodemap, dlm->domain_map, sizeof(nodemap));
clear_bit(request->node_idx, nodemap);
- clear_bit(dlm->group_index, nodemap);
+ clear_bit(dlm->node_num, nodemap);
while ((bit = find_next_bit(nodemap, NM_MAX_NODES,
- dlm->group_index)) < NM_MAX_NODES) {
+ dlm->node_num)) < NM_MAX_NODES) {
clear_bit(bit, nodemap);
}
spin_unlock(&dlm->spinlock);
/* take care of the easy cases up front */
spin_lock(&res->spinlock);
- if (res->owner == dlm->group_index) {
+ if (res->owner == dlm->node_num) {
spin_unlock(&res->spinlock);
// dlmprintk0("this node is the master\n");
response = DLM_MASTER_RESP_YES;
@@ -1067,7 +1053,6 @@
int dlm_do_assert_master(dlm_ctxt *dlm, const char *lockname,
unsigned int namelen, void *nodemap)
{
- struct inode *inode = NULL;
dlm_assert_master assert;
int to, tmpret;
dlm_node_iter iter;
@@ -1083,24 +1068,13 @@
while ((to = dlm_node_iter_next(&iter)) >= 0) {
// dlmprintk("sending assert master to %d\n", to);
memset(&assert, 0, sizeof(assert));
- assert.node_idx = dlm->group_index;
+ assert.node_idx = dlm->node_num;
assert.namelen = namelen;
strncpy(assert.name, lockname, namelen);
- inode = nm_get_group_node_by_index(dlm->group, to);
- if (!inode) {
- tmpret = -EINVAL;
- dlmprintk("could not get nm info for node %d! "
- "need to retry this whole thing\n", to);
- ret = tmpret;
- break;
- }
-
dlm_assert_master_to_net(&assert);
tmpret = net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key,
- &assert, sizeof(assert), inode, NULL);
- iput(inode);
-
+ &assert, sizeof(assert), to, NULL);
if (tmpret < 0) {
// TODO
// dlmprintk("assert_master returned %d!\n", tmpret);
Modified: trunk/fs/ocfs2/dlm/dlmmod.c
===================================================================
--- trunk/fs/ocfs2/dlm/dlmmod.c 2005-03-18 04:25:45 UTC (rev 2005)
+++ trunk/fs/ocfs2/dlm/dlmmod.c 2005-03-18 06:01:10 UTC (rev 2006)
@@ -43,7 +43,6 @@
#include "util.h"
-#include "cluster/clcommon.h"
#include "cluster/heartbeat.h"
#include "cluster/nodemanager.h"
#include "cluster/tcp.h"
@@ -82,7 +81,6 @@
LIST_HEAD(dlm_domains);
spinlock_t dlm_domain_lock = SPIN_LOCK_UNLOCKED;
DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events);
-u8 dlm_global_index = NM_MAX_NODES;
static spinlock_t dlm_cookie_lock = SPIN_LOCK_UNLOCKED;
static u64 dlm_next_cookie = 1;
@@ -196,10 +194,6 @@
if (status < 0)
return -1;
- dlm_global_index = nm_this_node(NULL);
- if (dlm_global_index == NM_MAX_NODES)
- return -1;
-
status = dlm_register_net_handlers();
if (status)
return -1;
@@ -307,7 +301,7 @@
res = dlm_lockres_grab(dlm, lock->lockres);
down_read(&dlm->recovery_sem);
- if (res->owner == dlm->group_index)
+ if (res->owner == dlm->node_num)
status = dlmconvert_master(dlm, res, lock, flags, mode);
else
status = dlmconvert_remote(dlm, res, lock, flags, mode);
@@ -350,7 +344,7 @@
lock->ml.type = mode;
lock->ml.convert_type = LKM_IVMODE;
lock->ml.highest_blocked = LKM_IVMODE;
- lock->ml.node = dlm->group_index;
+ lock->ml.node = dlm->node_num;
lock->ast = ast;
lock->bast = bast;
lock->astdata = data;
@@ -371,7 +365,7 @@
}
}
- if (res->owner == dlm->group_index)
+ if (res->owner == dlm->node_num)
status = dlmlock_master(dlm, res, lock, flags);
else
status = dlmlock_remote(dlm, res, lock, flags);
@@ -439,7 +433,7 @@
DLM_ASSERT(res);
dlmprintk("lock=%p res=%p\n", lock, res);
- if (res->owner == dlm->group_index) {
+ if (res->owner == dlm->node_num) {
status = dlmunlock_master(dlm, res, lock, lksb, flags,
&call_ast);
dlmprintk("done calling dlmunlock_master: returned %d, "
@@ -597,9 +591,6 @@
if (dlm->name)
kfree(dlm->name);
- if (dlm->group)
- iput(dlm->group);
-
kfree(dlm);
}
@@ -783,31 +774,22 @@
unsigned int node)
{
int status;
- struct inode *node_inode;
dlm_exit_domain leave_msg;
dlmprintk("Asking node %u if we can leave the domain %s me = %u\n",
- node, dlm->name, dlm->group_index);
+ node, dlm->name, dlm->node_num);
- node_inode = nm_get_group_node_by_index(dlm->group, node);
- if (!node_inode) {
- status = -EINVAL;
- dlmprintk("Could not get inode for node %u!\n", node);
- goto bail;
- }
-
memset(&leave_msg, 0, sizeof(leave_msg));
- leave_msg.node_idx = dlm->group_index;
+ leave_msg.node_idx = dlm->node_num;
dlm_exit_domin_to_net(&leave_msg);
status = net_send_message(DLM_EXIT_DOMAIN_MSG, dlm->key,
- &leave_msg, sizeof(leave_msg), node_inode,
+ &leave_msg, sizeof(leave_msg), node,
NULL);
- iput(node_inode);
+
dlmprintk("status return %d from net_send_message\n", status);
-bail:
return status;
}
@@ -822,7 +804,7 @@
spin_lock(&dlm->spinlock);
/* Clear ourselves from the domain map */
- clear_bit(dlm->group_index, dlm->domain_map);
+ clear_bit(dlm->node_num, dlm->domain_map);
while ((node = find_next_bit(dlm->domain_map, NM_MAX_NODES, 0))
!= -1) {
if (node > NM_MAX_NODES)
@@ -1026,27 +1008,18 @@
unsigned int node)
{
int status;
- struct inode *node_inode;
dlm_cancel_join cancel_msg;
- node_inode = nm_get_group_node_by_index(dlm->group, node);
- if (!node_inode) {
- status = -EINVAL;
- dlmprintk("Could not get inode for node %u!\n", node);
- goto bail;
- }
-
memset(&cancel_msg, 0, sizeof(cancel_msg));
- cancel_msg.node_idx = dlm->group_index;
+ cancel_msg.node_idx = dlm->node_num;
cancel_msg.name_len = strlen(dlm->name);
strncpy(cancel_msg.domain, dlm->name, cancel_msg.name_len);
dlm_cancel_join_to_net(&cancel_msg);
status = net_send_message(DLM_CANCEL_JOIN_MSG, DLM_MOD_KEY,
- &cancel_msg, sizeof(cancel_msg), node_inode,
+ &cancel_msg, sizeof(cancel_msg), node,
NULL);
- iput(node_inode);
if (status < 0) {
dlmprintk("net_send_message returned %d!\n", status);
goto bail;
@@ -1074,7 +1047,7 @@
if (node >= NM_MAX_NODES)
break;
- if (node == dlm->group_index)
+ if (node == dlm->node_num)
continue;
tmpstat = dlm_send_one_join_cancel(dlm, node);
@@ -1095,27 +1068,18 @@
{
int status, retval;
dlm_query_join_request join_msg;
- struct inode *node_inode;
dlmprintk("querying node %d\n", node);
- node_inode = nm_get_group_node_by_index(dlm->group, node);
- if (!node_inode) {
- status = -EINVAL;
- dlmprintk("Could not get inode for node %u!\n", node);
- goto bail;
- }
-
memset(&join_msg, 0, sizeof(join_msg));
- join_msg.node_idx = dlm->group_index;
+ join_msg.node_idx = dlm->node_num;
join_msg.name_len = strlen(dlm->name);
strncpy(join_msg.domain, dlm->name, join_msg.name_len);
dlm_query_join_request_to_net(&join_msg);
status = net_send_message(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, &join_msg,
- sizeof(join_msg), node_inode, &retval);
- iput(node_inode);
+ sizeof(join_msg), node, &retval);
if (status < 0 && status != -ENOPROTOOPT && status != -ENOTCONN) {
dlmprintk("net_send_message returned %d!\n", status);
goto bail;
@@ -1152,33 +1116,22 @@
unsigned int node)
{
int status;
- struct inode *node_inode;
dlm_assert_joined assert_msg;
dlmprintk("Sending join assert to node %u\n", node);
- node_inode = nm_get_group_node_by_index(dlm->group, node);
- if (!node_inode) {
- status = -EINVAL;
- dlmprintk("Could not get inode for node %u!\n", node);
- goto bail;
- }
-
memset(&assert_msg, 0, sizeof(assert_msg));
- assert_msg.node_idx = dlm->group_index;
+ assert_msg.node_idx = dlm->node_num;
assert_msg.name_len = strlen(dlm->name);
strncpy(assert_msg.domain, dlm->name, assert_msg.name_len);
dlm_assert_joined_to_net(&assert_msg);
status = net_send_message(DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY,
- &assert_msg, sizeof(assert_msg), node_inode,
- NULL);
- iput(node_inode);
+ &assert_msg, sizeof(assert_msg), node, NULL);
if (status < 0)
dlmprintk("net_send_message returned %d!\n", status);
-bail:
return status;
}
@@ -1194,7 +1147,7 @@
if (node >= NM_MAX_NODES)
break;
- if (node == dlm->group_index)
+ if (node == dlm->node_num)
continue;
do {
@@ -1250,7 +1203,7 @@
static int dlm_try_to_join_domain(dlm_ctxt *dlm)
{
- int status, tmpstat, node;
+ int status = 0, tmpstat, node;
struct domain_join_ctxt *ctxt;
enum dlm_query_join_response response;
@@ -1265,17 +1218,12 @@
/* group sem locking should work for us here -- we're already
* registered for heartbeat events so filling this should be
* atomic wrt getting those handlers called. */
- status = hb_fill_node_map(dlm->group, dlm->live_nodes_map,
- sizeof(dlm->live_nodes_map));
- if (status < 0) {
- dlmprintk("I couldn't fill my node map!\n");
- goto bail;
- }
+ hb_fill_node_map(dlm->live_nodes_map, sizeof(dlm->live_nodes_map));
spin_lock(&dlm->spinlock);
memcpy(ctxt->live_map, dlm->live_nodes_map, sizeof(ctxt->live_map));
- __dlm_set_joining_node(dlm, dlm->group_index);
+ __dlm_set_joining_node(dlm, dlm->node_num);
spin_unlock(&dlm->spinlock);
@@ -1285,7 +1233,7 @@
if (node >= NM_MAX_NODES)
break;
- if (node == dlm->group_index)
+ if (node == dlm->node_num)
continue;
status = dlm_request_join(dlm, node, &response);
@@ -1314,7 +1262,7 @@
spin_lock(&dlm->spinlock);
memcpy(dlm->domain_map, ctxt->yes_resp_map,
sizeof(ctxt->yes_resp_map));
- set_bit(dlm->group_index, dlm->domain_map);
+ set_bit(dlm->node_num, dlm->domain_map);
spin_unlock(&dlm->spinlock);
dlm_send_join_asserts(dlm, ctxt->yes_resp_map);
@@ -1467,17 +1415,11 @@
}
static dlm_ctxt *dlm_alloc_ctxt(const char *domain,
- struct inode *group,
u32 key)
{
int i;
dlm_ctxt *dlm = NULL;
- /* if for some reason we can't get a reference on the group
- * inode (required) then don't even try the rest. */
- if (!igrab(group))
- goto leave;
-
dlm = kmalloc(sizeof(dlm_ctxt), GFP_KERNEL);
if (!dlm) {
dlmprintk0("could not allocate dlm_ctxt\n");
@@ -1508,6 +1450,7 @@
strcpy(dlm->name, domain);
dlm->key = key;
+ dlm->node_num = nm_this_node();
spin_lock_init(&dlm->spinlock);
spin_lock_init(&dlm->master_lock);
@@ -1526,10 +1469,6 @@
INIT_LIST_HEAD(&dlm->mle_hb_events);
init_rwsem(&dlm->recovery_sem);
- /* this eats the reference we got above. */
- dlm->group = group;
- dlm->group_index = nm_this_node(group);
-
dlm->joining_node = DLM_LOCK_RES_OWNER_UNKNOWN;
init_waitqueue_head(&dlm->dlm_join_events);
@@ -1555,23 +1494,19 @@
* dlm_register_domain: one-time setup per "domain"
*/
dlm_ctxt * dlm_register_domain(const char *domain,
- const char *group_name,
u32 key)
{
int ret;
dlm_ctxt *dlm = NULL;
dlm_ctxt *new_ctxt = NULL;
- struct inode *group = NULL;
if (strlen(domain) > NM_MAX_NAME_LEN) {
dlmprintk0("domain name length too long\n");
goto leave;
}
- group = nm_get_group_by_name(group_name);
- if (!group) {
- dlmprintk("no nm group %s for domain %s!\n",
- group_name, domain);
+ if (nm_this_node() == NM_MAX_NODES) {
+ dlmprintk0("a local node has not been configured\n");
goto leave;
}
@@ -1607,7 +1542,7 @@
if (!new_ctxt) {
spin_unlock(&dlm_domain_lock);
- new_ctxt = dlm_alloc_ctxt(domain, group, key);
+ new_ctxt = dlm_alloc_ctxt(domain, key);
if (new_ctxt)
goto retry;
goto leave;
@@ -1632,9 +1567,6 @@
if (new_ctxt)
dlm_free_ctxt_mem(new_ctxt);
- if (group)
- iput(group);
-
return dlm;
}
EXPORT_SYMBOL(dlm_register_domain);
@@ -1682,8 +1614,8 @@
struct list_head *bucket;
int i;
- printk("dlm_ctxt: %s, group=%u, key=%u\n",
- dlm->name, dlm->group_index, dlm->key);
+ printk("dlm_ctxt: %s, node=%u, key=%u\n",
+ dlm->name, dlm->node_num, dlm->key);
printk("some bug here... should not have to check for this...\n");
if (!dlm || !dlm->name) {
printk("wtf... dlm=%p\n", dlm);
Modified: trunk/fs/ocfs2/dlm/dlmmod.h
===================================================================
--- trunk/fs/ocfs2/dlm/dlmmod.h 2005-03-18 04:25:45 UTC (rev 2005)
+++ trunk/fs/ocfs2/dlm/dlmmod.h 2005-03-18 06:01:10 UTC (rev 2006)
@@ -213,9 +213,8 @@
spinlock_t spinlock;
struct rw_semaphore recovery_sem;
char *name;
- struct inode *group;
+ u8 node_num;
u32 key;
- u8 group_index;
u8 joining_node;
wait_queue_head_t dlm_join_events;
unsigned long live_nodes_map[BITS_TO_LONGS(NM_MAX_NODES)];
@@ -718,7 +717,6 @@
}
dlm_ctxt * dlm_register_domain(const char *domain,
- const char *group_name,
u32 key);
void dlm_unregister_domain(dlm_ctxt *dlm);
void dlm_get(dlm_ctxt *dlm);
@@ -783,8 +781,8 @@
int dlm_nm_init(dlm_ctxt *dlm);
int dlm_heartbeat_init(dlm_ctxt *dlm);
-void dlm_hb_node_down_cb(struct inode *group, struct inode *node, int idx, void *data);
-void dlm_hb_node_up_cb(struct inode *group, struct inode *node, int idx, void *data);
+void dlm_hb_node_down_cb(struct nm_node *node, int idx, void *data);
+void dlm_hb_node_up_cb(struct nm_node *node, int idx, void *data);
int dlm_hb_node_dead(dlm_ctxt *dlm, int node);
int __dlm_hb_node_dead(dlm_ctxt *dlm, int node);
@@ -806,9 +804,9 @@
void dlm_init_lock(dlm_lock *newlock, int type, u8 node, u64 cookie);
void dlm_mle_node_down(dlm_ctxt *dlm, dlm_master_list_entry *mle,
- struct inode *group, struct inode *node, int idx);
+ struct nm_node *node, int idx);
void dlm_mle_node_up(dlm_ctxt *dlm, dlm_master_list_entry *mle,
- struct inode *group, struct inode *node, int idx);
+ struct nm_node *node, int idx);
int dlm_do_assert_master(dlm_ctxt *dlm, const char *lockname,
unsigned int namelen, void *nodemap);
Modified: trunk/fs/ocfs2/dlm/dlmrecovery.c
===================================================================
--- trunk/fs/ocfs2/dlm/dlmrecovery.c 2005-03-18 04:25:45 UTC (rev 2005)
+++ trunk/fs/ocfs2/dlm/dlmrecovery.c 2005-03-18 06:01:10 UTC (rev 2006)
@@ -41,7 +41,6 @@
#include "util.h"
-#include "cluster/clcommon.h"
#include "cluster/heartbeat.h"
#include "cluster/nodemanager.h"
#include "cluster/tcp.h"
@@ -84,7 +83,7 @@
res->state |= DLM_LOCK_RES_RECOVERING;
list_del(&res->recovering);
list_add_tail(&res->recovering, &dlm->reco.resources);
- } else if (res->owner == dlm->group_index) {
+ } else if (res->owner == dlm->node_num) {
list_for_each_safe(iter2, tmpiter, &res->granted) {
lock = list_entry (iter2, dlm_lock, list);
if (lock->ml.node == dead_node) {
@@ -114,7 +113,7 @@
}
-void dlm_hb_node_down_cb(struct inode *group, struct inode *node, int idx, void *data)
+void dlm_hb_node_down_cb(struct nm_node *node, int idx, void *data)
{
dlm_ctxt *dlm = data;
dlm_master_list_entry *mle;
@@ -136,7 +135,7 @@
/* notify any mles attached to the heartbeat events */
list_for_each(iter, &dlm->mle_hb_events) {
mle = list_entry(iter, dlm_master_list_entry, hb_events);
- dlm_mle_node_down(dlm, mle, group, node, idx);
+ dlm_mle_node_down(dlm, mle, node, idx);
}
if (!test_bit(idx, dlm->domain_map)) {
@@ -162,7 +161,7 @@
dlm_put(dlm);
}
-void dlm_hb_node_up_cb(struct inode *group, struct inode *node, int idx, void *data)
+void dlm_hb_node_up_cb(struct nm_node *node, int idx, void *data)
{
dlm_ctxt *dlm = data;
dlm_master_list_entry *mle;
@@ -178,7 +177,7 @@
/* notify any mles attached to the heartbeat events */
list_for_each(iter, &dlm->mle_hb_events) {
mle = list_entry(iter, dlm_master_list_entry, hb_events);
- dlm_mle_node_up(dlm, mle, group, node, idx);
+ dlm_mle_node_up(dlm, mle, node, idx);
}
spin_unlock(&dlm->spinlock);
Modified: trunk/fs/ocfs2/dlm/dlmthread.c
===================================================================
--- trunk/fs/ocfs2/dlm/dlmthread.c 2005-03-18 04:25:45 UTC (rev 2005)
+++ trunk/fs/ocfs2/dlm/dlmthread.c 2005-03-18 06:01:10 UTC (rev 2006)
@@ -44,7 +44,6 @@
#include "util.h"
#include "cluster/cl_compat.h"
-#include "cluster/clcommon.h"
#include "cluster/heartbeat.h"
#include "cluster/nodemanager.h"
#include "cluster/tcp.h"
@@ -54,7 +53,6 @@
extern spinlock_t dlm_domain_lock;
extern struct list_head dlm_domains;
-extern u8 dlm_global_index;
static int dlm_thread(void *data);
struct task_struct *dlm_thread_task;
@@ -119,7 +117,7 @@
/* Since we can't migrate locks yet, for now we only handle
* non locally mastered locks. */
spin_lock(&lockres->spinlock);
- master = lockres->owner == dlm->group_index;
+ master = lockres->owner == dlm->node_num;
spin_unlock(&lockres->spinlock);
dlmprintk("purging lockres %.*s, master = %d\n", lockres->lockname.len,
@@ -185,9 +183,8 @@
void dlm_shuffle_lists(dlm_ctxt *dlm, dlm_lock_resource *res)
{
dlm_lock *lock, *target;
- struct list_head *iter, *tmpiter;
+ struct list_head *iter;
struct list_head *head;
- s8 hi;
int can_grant = 1;
dlmprintk("shuffle res %.*s\n", res->lockname.len, res->lockname.name);
@@ -329,7 +326,7 @@
spin_lock(&res->spinlock);
/* don't shuffle secondary queues */
- if ((res->owner == dlm->group_index) &&
+ if ((res->owner == dlm->node_num) &&
!(res->state & DLM_LOCK_RES_DIRTY)) {
list_add_tail(&res->dirty, &dlm->dirty_list);
res->state |= DLM_LOCK_RES_DIRTY;
@@ -400,7 +397,7 @@
dlmprintk0("delivering an ast for this lockres\n");
list_del_init(&lock->ast_list);
- if (lock->ml.node != dlm->group_index) {
+ if (lock->ml.node != dlm->node_num) {
if (dlm_do_remote_ast(dlm, res, lock) < 0)
dlmprintk("eek\n");
} else
@@ -422,7 +419,7 @@
dlmprintk("delivering a bast for this lockres "
"(blocked = %d\n", hi);
- if (lock->ml.node != dlm->group_index) {
+ if (lock->ml.node != dlm->node_num) {
if (dlm_send_proxy_bast(dlm, res, lock, hi) < 0)
dlmprintk0("eeek\n");
} else
@@ -453,7 +450,7 @@
spin_lock(&res->spinlock);
list_del_init(&res->dirty);
res->state &= ~DLM_LOCK_RES_DIRTY;
- BUG_ON(res->owner != dlm->group_index);
+ BUG_ON(res->owner != dlm->node_num);
spin_unlock(&res->spinlock);
dlm_shuffle_lists(dlm, res);
Modified: trunk/fs/ocfs2/dlm/dlmunlock.c
===================================================================
--- trunk/fs/ocfs2/dlm/dlmunlock.c 2005-03-18 04:25:45 UTC (rev 2005)
+++ trunk/fs/ocfs2/dlm/dlmunlock.c 2005-03-18 06:01:10 UTC (rev 2006)
@@ -42,7 +42,6 @@
#include "util.h"
-#include "cluster/clcommon.h"
#include "cluster/heartbeat.h"
#include "cluster/nodemanager.h"
#include "cluster/tcp.h"
@@ -91,9 +90,9 @@
flags & LKM_VALBLK);
if (master_node)
- DLM_ASSERT(res->owner == dlm->group_index);
+ DLM_ASSERT(res->owner == dlm->node_num);
else
- DLM_ASSERT(res->owner != dlm->group_index);
+ DLM_ASSERT(res->owner != dlm->node_num);
spin_lock(&dlm->spinlock);
/* We want to be sure that we're not freeing a lock
@@ -202,7 +201,6 @@
dlm_lockstatus *lksb,
int flags)
{
- struct inode *inode = NULL;
dlm_unlock_lock unlock;
int tmpret;
dlm_status ret;
@@ -213,7 +211,7 @@
dlmprintk0("\n");
memset(&unlock, 0, sizeof(unlock));
- unlock.node_idx = dlm->group_index;
+ unlock.node_idx = dlm->node_num;
unlock.flags = flags;
unlock.cookie = lock->ml.cookie;
unlock.namelen = res->lockname.len;
@@ -229,27 +227,21 @@
iovlen++;
}
- ret = DLM_NOLOCKMGR;
- lksb->status = DLM_NOLOCKMGR;
- inode = nm_get_group_node_by_index(dlm->group, res->owner);
- if (inode) {
- dlm_unlock_lock_to_net(&unlock);
- tmpret = net_send_message_iov(DLM_UNLOCK_LOCK_MSG, dlm->key,
- iov, iovlen, inode, &status);
- if (tmpret >= 0) {
- // successfully sent and received
- if (status == DLM_CANCELGRANT)
- ret = DLM_NORMAL;
- else
- ret = status;
- lksb->status = status;
- } else {
- dlmprintk("error occurred in net_send_message: %d\n",
- tmpret);
- ret = dlm_err_to_dlm_status(tmpret);
- lksb->status = ret;
- }
- iput(inode);
+ dlm_unlock_lock_to_net(&unlock);
+ tmpret = net_send_message_iov(DLM_UNLOCK_LOCK_MSG, dlm->key,
+ iov, iovlen, res->owner, &status);
+ if (tmpret >= 0) {
+ // successfully sent and received
+ if (status == DLM_CANCELGRANT)
+ ret = DLM_NORMAL;
+ else
+ ret = status;
+ lksb->status = status;
+ } else {
+ dlmprintk("error occurred in net_send_message: %d\n",
+ tmpret);
+ ret = dlm_err_to_dlm_status(tmpret);
+ lksb->status = ret;
}
return ret;
Modified: trunk/fs/ocfs2/dlm/userdlm.c
===================================================================
--- trunk/fs/ocfs2/dlm/userdlm.c 2005-03-18 04:25:45 UTC (rev 2005)
+++ trunk/fs/ocfs2/dlm/userdlm.c 2005-03-18 06:01:10 UTC (rev 2006)
@@ -37,7 +37,6 @@
#include "util.h"
-#include "cluster/clcommon.h"
#include "cluster/nodemanager.h"
#include "cluster/heartbeat.h"
#include "cluster/tcp.h"
@@ -637,7 +636,7 @@
snprintf(domain, name->len + 1, "%.*s", name->len, name->name);
- dlm = dlm_register_domain(domain, domain, dlm_key);
+ dlm = dlm_register_domain(domain, dlm_key);
kfree(domain);
return dlm;
Modified: trunk/fs/ocfs2/dlmglue.c
===================================================================
--- trunk/fs/ocfs2/dlmglue.c 2005-03-18 04:25:45 UTC (rev 2005)
+++ trunk/fs/ocfs2/dlmglue.c 2005-03-18 06:01:10 UTC (rev 2006)
@@ -34,7 +34,6 @@
#include <linux/kthread.h>
#include <cluster/util.h>
-#include <cluster/clcommon.h>
#include <cluster/heartbeat.h>
#include <cluster/nodemanager.h>
#include <cluster/tcp.h>
@@ -1617,10 +1616,10 @@
/* used by the dlm code to make message headers unique, each
* node in this domain must agree on this. */
- dlm_key = crc32(0, osb->group_name, strlen(osb->group_name));
+ dlm_key = crc32(0, osb->uuid_str, strlen(osb->uuid_str));
- /* for now, group_name == domain */
- dlm = dlm_register_domain(osb->group_name, osb->group_name, dlm_key);
+ /* for now, uuid == domain */
+ dlm = dlm_register_domain(osb->uuid_str, dlm_key);
if (!dlm) {
/* This is a best guess on return value... */
status = -ENOMEM;
Modified: trunk/fs/ocfs2/heartbeat.c
===================================================================
--- trunk/fs/ocfs2/heartbeat.c 2005-03-18 04:25:45 UTC (rev 2005)
+++ trunk/fs/ocfs2/heartbeat.c 2005-03-18 06:01:10 UTC (rev 2006)
@@ -32,8 +32,8 @@
#include <linux/highmem.h>
#include <cluster/util.h>
-#include <cluster/clcommon.h>
#include <cluster/heartbeat.h>
+#include <cluster/nodemanager.h>
#include <dlm/dlmcommon.h>
#include "ocfs_log.h"
@@ -53,6 +53,7 @@
#define OCFS2_HB_NODE_DOWN_PRI (0x0000002)
#define OCFS2_HB_NODE_UP_PRI OCFS2_HB_NODE_DOWN_PRI
+#if 0
static void ocfs2_hb_node_down_cb(struct inode *group,
struct inode *node,
int node_num,
@@ -61,6 +62,7 @@
struct inode *node,
int node_num,
void *data);
+#endif
static inline void __ocfs_node_map_set_bit(ocfs_node_map *map,
int bit);
@@ -79,16 +81,12 @@
ocfs_node_map_init(&osb->umount_map);
}
-static void ocfs2_hb_node_down_cb(struct inode *group,
- struct inode *node,
+static void ocfs2_hb_node_down_cb(struct nm_node *node,
int node_num,
void *data)
{
ocfs_super *osb = data;
- if (osb->group_inode != group)
- return;
-
OCFS_ASSERT(osb->node_num != node_num);
printk("ocfs2: node down event for %d\n", node_num);
@@ -104,16 +102,12 @@
ocfs_recovery_thread(osb, node_num);
}
-static void ocfs2_hb_node_up_cb(struct inode *group,
- struct inode *node,
+static void ocfs2_hb_node_up_cb(struct nm_node *node,
int node_num,
void *data)
{
ocfs_super *osb = data;
- if (osb->group_inode != group)
- return;
-
OCFS_ASSERT(osb->node_num != node_num);
printk("ocfs2: node up event for %d\n", node_num);
Modified: trunk/fs/ocfs2/ocfs.h
===================================================================
--- trunk/fs/ocfs2/ocfs.h 2005-03-18 04:25:45 UTC (rev 2005)
+++ trunk/fs/ocfs2/ocfs.h 2005-03-18 06:01:10 UTC (rev 2006)
@@ -43,7 +43,6 @@
#endif
#include "cluster/util.h"
-#include "cluster/clcommon.h"
#include "cluster/nodemanager.h"
#include "cluster/heartbeat.h"
#include "cluster/tcp.h"
@@ -331,6 +330,7 @@
u64 bitmap_blkno;
u32 bitmap_cpg;
u8 *uuid;
+ char *uuid_str;
u8 *vol_label;
u64 first_cluster_group_blkno;
u32 fs_generation;
@@ -373,8 +373,6 @@
ocfs_alloc_stats alloc_stats;
char dev_str[20]; /* "major,minor" of the device */
- char *group_name;
- struct inode *group_inode;
dlm_ctxt *dlm;
ocfs2_lock_res super_lockres;
Modified: trunk/fs/ocfs2/super.c
===================================================================
--- trunk/fs/ocfs2/super.c 2005-03-18 04:25:45 UTC (rev 2005)
+++ trunk/fs/ocfs2/super.c 2005-03-18 06:01:10 UTC (rev 2006)
@@ -43,7 +43,6 @@
#include <linux/inet.h>
#include <cluster/util.h>
-#include <cluster/clcommon.h>
#include <cluster/nodemanager.h>
#include <dlm/dlmcommon.h>
@@ -129,12 +128,12 @@
#endif /* Linux 2.4 stuff */
-static int ocfs_parse_options (char *options, u32 * uid, u32 * gid, int * reclaim_id, char **group_name);
+static int ocfs_parse_options (char *options, u32 * uid, u32 * gid, int * reclaim_id);
static int __init ocfs_driver_entry (void);
static void __exit ocfs_driver_exit (void);
static void ocfs_put_super (struct super_block *sb);
static int ocfs_mount_volume (struct super_block *sb, int reclaim_id,
- char **group_name, struct inode *root);
+ struct inode *root);
static void ocfs_dismount_volume(struct super_block *sb);
static int ocfs_initialize_mem_lists (void);
static void ocfs_free_mem_lists (void);
@@ -151,7 +150,7 @@
static int ocfs_init_global_system_inodes(ocfs_super *osb);
static int ocfs_init_local_system_inodes(ocfs_super *osb);
static int ocfs_release_system_inodes(ocfs_super *osb);
-static int ocfs2_fill_local_node_info(ocfs_super *osb, char **group_name);
+static int ocfs2_fill_local_node_info(ocfs_super *osb);
static int ocfs2_complete_mount_recovery(ocfs_super *osb);
static int ocfs_check_volume(ocfs_super * osb);
static int ocfs_verify_volume(ocfs2_dinode *di, struct buffer_head *bh,
@@ -316,12 +315,11 @@
u32 uid = current->fsuid;
u32 gid = current->fsgid;
int reclaim_id;
- char *group_name = NULL;
ocfs_super *osb = NULL;
LOG_ENTRY_ARGS ("%p, %p, %i", sb, data, silent);
- if (ocfs_parse_options (data, &uid, &gid, &reclaim_id, &group_name) != 0) {
+ if (ocfs_parse_options (data, &uid, &gid, &reclaim_id) != 0) {
status = -EINVAL;
LOG_ERROR_STR ("ocfs_read_super: bad mount option");
goto read_super_error;
@@ -334,7 +332,7 @@
/* this is needed to support O_LARGE_FILE */
sb->s_maxbytes = OCFS_LINUX_MAX_FILE_SIZE;
- status = ocfs_mount_volume (sb, reclaim_id, &group_name, NULL);
+ status = ocfs_mount_volume (sb, reclaim_id, NULL);
if (status < 0)
goto read_super_error;
@@ -376,9 +374,6 @@
LOG_EXIT_STATUS(status);
}
- if (group_name)
- kfree(group_name);
-
LOG_EXIT_STATUS(status);
return status;
@@ -391,9 +386,6 @@
if (inode)
iput (inode);
- if (group_name)
- kfree(group_name);
-
LOG_EXIT_STATUS(status);
return status;
}
@@ -430,12 +422,11 @@
*
* e.g., gid=9999,uid=9999,[no]cache,reclaimid
*/
-static int ocfs_parse_options (char *options, u32 * uid, u32 * gid, int * reclaim_id, char **group_name)
+static int ocfs_parse_options (char *options, u32 * uid, u32 * gid, int * reclaim_id)
{
char *c;
char *value;
int ret = 1;
- int size;
LOG_ENTRY ();
@@ -476,24 +467,6 @@
}
} else if (!strcmp (c, "reclaimid")) {
*reclaim_id = 1;
- } else if (!strcmp(c, "group")) {
- if (!value || !*value) {
- LOG_ERROR_STR
- ("group option requires an argument");
- goto bail;
- }
- LOG_TRACE_ARGS("group name passed = %s\n", value);
-
- size = strlen(value) + 1;
- *group_name = kmalloc(size, GFP_KERNEL);
- if (!(*group_name)) {
- LOG_ERROR_STATUS(-ENOMEM);
- goto bail;
- }
- memset(*group_name, 0, size);
- printk("ocfs2: group name passed = %s, size = %d\n",
- value, size);
- strcpy(*group_name, value);
} else {
LOG_ERROR_ARGS ("Invalid mount option: %s", c);
goto bail;
@@ -844,44 +817,24 @@
return 0;
}
-static int ocfs2_fill_local_node_info(ocfs_super *osb, char **group_name)
+/* ocfs2 1.0 only allows one cluster and node identity per kernel image. */
+static int ocfs2_fill_local_node_info(ocfs_super *osb)
{
- int status, i;
- struct inode *group = NULL;
- char *p;
+ int status;
- if (group_name) {
- osb->group_name = *group_name;
- *group_name = NULL;
- } else {
- osb->group_name = kmalloc(NM_MAX_NAME_LEN + 1, GFP_KERNEL);
- if (!osb->group_name) {
- status = -ENOMEM;
- LOG_ERROR_STATUS(status);
- goto bail;
- }
- memset(osb->group_name, 0, NM_MAX_NAME_LEN + 1);
- for (i = 0, p = osb->uuid; i < MAX_VOL_ID_LENGTH; i++, p += 2)
- sprintf(p, "%02X", osb->uuid[i]);
- }
-
- group = nm_get_group_by_name(osb->group_name);
- if (!group) {
- printk("ocfs2: could not join group \"%s\"\n",
- osb->group_name);
- status = -EINVAL;
+ /* XXX hold a ref on the node while mounted? easy enough, if
+ * desirable. */
+ osb->node_num = nm_this_node();
+ if (osb->node_num == NM_MAX_NODES) {
+ printk("ocfs2: could not find this host's node number\n");
+ status = -ENOENT;
goto bail;
}
- osb->group_inode = group;
- osb->node_num = nm_this_node(group);
+ printk("ocfs2: I am node %d\n", osb->node_num);
- printk("ocfs2: I am node %d, a member of group %s\n", osb->node_num,
- osb->group_name);
-
status = 0;
bail:
-
return status;
}
@@ -890,7 +843,7 @@
*
*/
static int ocfs_mount_volume (struct super_block *sb, int reclaim_id,
- char **group_name, struct inode *root)
+ struct inode *root)
{
int status, sector_size;
int unlock_super = 0;
@@ -930,7 +883,7 @@
goto leave;
}
- status = ocfs2_fill_local_node_info(osb, group_name);
+ status = ocfs2_fill_local_node_info(osb);
if (status < 0) {
LOG_ERROR_STATUS (status);
goto leave;
@@ -1062,6 +1015,8 @@
ocfs2_put_slot(osb);
+ ocfs_release_system_inodes(osb);
+
ocfs2_dlm_shutdown(osb);
ocfs2_clear_hb_callbacks(osb);
@@ -1072,13 +1027,37 @@
MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev),
OcfsGlobalCtxt.node_name, osb->node_num);
- ocfs_release_system_inodes(osb);
-
ocfs_delete_osb (osb);
kfree(osb);
sb->s_dev = 0;
} /* ocfs_dismount_volume */
+static int osb_setup_uuid(ocfs_super *osb, const unsigned char *uuid,
+ unsigned uuid_bytes)
+{
+ int i, ret;
+ char *ptr;
+
+ BUG_ON(uuid_bytes != MAX_VOL_ID_LENGTH);
+
+ osb->uuid_str = kcalloc(1, MAX_VOL_ID_LENGTH*2 + 1, GFP_KERNEL);
+ if (osb->uuid_str == NULL)
+ return -ENOMEM;
+
+ memcpy(osb->uuid, uuid, MAX_VOL_ID_LENGTH);
+
+ for (i = 0, ptr = osb->uuid_str; i < MAX_VOL_ID_LENGTH; i++) {
+ /* print with null */
+ ret = snprintf(ptr, 3, "%02X", uuid[i]);
+ if (ret != 2) /* drop super cleans up */
+ return -EINVAL;
+ /* then only advace past the last char */
+ ptr += 2;
+ }
+
+ return 0;
+}
+
/*
* ocfs_initialize_osb()
*
@@ -1205,21 +1184,21 @@
goto bail;
}
+ if (osb_setup_uuid(osb, di->id2.i_super.s_uuid,
+ sizeof(di->id2.i_super.s_uuid))) {
+ LOG_ERROR_ARGS("Out of memory trying to setup our uuid.\n");
+ status = -ENOMEM;
+ goto bail;
+ }
+
strncpy(osb->vol_label, di->id2.i_super.s_label, 63);
osb->vol_label[63] = '\0';
- memcpy(osb->uuid, di->id2.i_super.s_uuid, MAX_VOL_ID_LENGTH);
osb->root_blkno = le64_to_cpu(di->id2.i_super.s_root_blkno);
osb->system_dir_blkno = le64_to_cpu(di->id2.i_super.s_system_dir_blkno);
osb->first_cluster_group_blkno = le64_to_cpu(di->id2.i_super.s_first_cluster_group);
osb->fs_generation = le32_to_cpu(di->i_fs_generation);
printk("vol_label: %s\n", osb->vol_label);
- {
- int ttt;
- printk("uuid: ");
- for (ttt=0; ttt<16; ttt++)
- printk("%02x ", di->id2.i_super.s_uuid[ttt]);
- printk("\n");
- }
+ printk("uuid: %s\n", osb->uuid_str);
printk("root_blkno=%llu, system_dir_blkno=%llu\n", osb->root_blkno, osb->system_dir_blkno);
atomic_set (&osb->vol_state, VOLUME_INIT);
@@ -1500,19 +1479,15 @@
if (osb->slot_info)
ocfs2_free_slot_info(osb->slot_info);
- if (osb->group_inode)
- iput(osb->group_inode);
-
/* FIXME
* This belongs in journal shutdown, but because we have to
* allocate osb->journal at the start of ocfs_initalize_osb(),
* we free it here.
*/
kfree(osb->journal);
- if (osb->group_name)
- kfree(osb->group_name);
if (osb->local_alloc_copy)
kfree(osb->local_alloc_copy);
+ kfree(osb->uuid_str);
memset (osb, 0, sizeof (ocfs_super));
LOG_EXIT ();
Modified: trunk/fs/ocfs2/vote.c
===================================================================
--- trunk/fs/ocfs2/vote.c 2005-03-18 04:25:45 UTC (rev 2005)
+++ trunk/fs/ocfs2/vote.c 2005-03-18 06:01:10 UTC (rev 2006)
@@ -32,7 +32,6 @@
#include <linux/kthread.h>
#include <cluster/util.h>
-#include <cluster/clcommon.h>
#include <cluster/heartbeat.h>
#include <cluster/nodemanager.h>
#include <cluster/tcp.h>
@@ -223,7 +222,6 @@
u64 blkno;
enum ocfs2_vote_request request;
struct inode *inode = NULL;
- struct inode *remote_node;
ocfs2_msg_hdr *hdr = &msg->v_hdr;
ocfs2_response_msg response;
@@ -304,21 +302,15 @@
response.r_hdr.h_node_num = htonl(osb->node_num);
response.r_response = htonl(vote_response);
- remote_node = nm_get_node_by_num(node_num);
- if (!remote_node) {
- LOG_ERROR_ARGS("Couldn't get inode for node %u!\n", node_num);
- } else {
- net_status = net_send_message(OCFS2_MESSAGE_TYPE_RESPONSE,
- osb->net_key,
- &response,
- sizeof(ocfs2_response_msg),
- remote_node,
- NULL);
- if (net_status < 0)
- LOG_ERROR_ARGS("message to node %u fails with error "
- "%d!\n", node_num, net_status);
- iput(remote_node);
- }
+ net_status = net_send_message(OCFS2_MESSAGE_TYPE_RESPONSE,
+ osb->net_key,
+ &response,
+ sizeof(ocfs2_response_msg),
+ node_num,
+ NULL);
+ if (net_status < 0)
+ LOG_ERROR_ARGS("message to node %u fails with error "
+ "%d!\n", node_num, net_status);
if (inode)
iput(inode);
@@ -510,7 +502,6 @@
{
int status, i, remote_err;
ocfs2_net_wait_ctxt *w = NULL;
- struct inode *remote_node;
int dequeued = 0;
LOG_ENTRY();
@@ -534,21 +525,13 @@
i);
ocfs_node_map_set_bit(osb, &w->n_node_map, i);
- remote_node = nm_get_node_by_num(i);
- if (!remote_node) {
- status = -EINVAL;
- LOG_ERROR_STATUS(status);
- goto bail;
- }
-
remote_err = 0;
status = net_send_message(OCFS2_MESSAGE_TYPE_VOTE,
osb->net_key,
request,
sizeof(*request),
- remote_node,
+ i,
&remote_err);
- iput(remote_node);
if (status == -ETIMEDOUT) {
LOG_TRACE_ARGS("remote node %d timed out!\n",
i);
Modified: trunk/fs/usysfs/dir.c
===================================================================
--- trunk/fs/usysfs/dir.c 2005-03-18 04:25:45 UTC (rev 2005)
+++ trunk/fs/usysfs/dir.c 2005-03-18 06:01:10 UTC (rev 2006)
@@ -385,6 +385,7 @@
continue;
list_del_init(&sd->s_sibling);
child = sd->s_dentry;
+ /* And now we're faking rmdir. Viro must hate me */
down(&child->d_inode->i_sem);
usysfs_drop_set(sd->s_element);
child->d_inode->i_flags |= S_DEAD;
@@ -449,6 +450,7 @@
int i;
if (ukset && ukset->default_sets) {
+ /* FYI, we're faking mkdir here */
down(&dentry->d_inode->i_sem);
for (i = 0; ukset->default_sets[i]; i++) {
Modified: trunk/fs/usysfs/mount.c
===================================================================
--- trunk/fs/usysfs/mount.c 2005-03-18 04:25:45 UTC (rev 2005)
+++ trunk/fs/usysfs/mount.c 2005-03-18 06:01:10 UTC (rev 2006)
@@ -66,9 +66,17 @@
{
kset_init_copy(&ukset->kset);
}
-
EXPORT_SYMBOL_GPL(ukset_init);
+void ukset_init_type_name(struct ukset *ukset, const char *name,
+ struct kobj_type *ktype)
+{
+ kobject_set_name(&ukset->kset.kobj, name);
+ ukset->kset.kobj.ktype = ktype;
+ ukset_init(ukset);
+}
+EXPORT_SYMBOL(ukset_init_type_name);
+
static int usysfs_fill_super(struct super_block *sb, void *data, int silent)
{
struct inode *inode;
Modified: trunk/fs/usysfs/usysfs.h
===================================================================
--- trunk/fs/usysfs/usysfs.h 2005-03-18 04:25:45 UTC (rev 2005)
+++ trunk/fs/usysfs/usysfs.h 2005-03-18 06:01:10 UTC (rev 2006)
@@ -85,7 +85,9 @@
return kset ? container_of(kset, struct ukset, kset) : NULL;
}
-extern void ukset_init(struct ukset *ukset);
+void ukset_init(struct ukset *ukset);
+void ukset_init_type_name(struct ukset *ukset, const char *name,
+ struct kobj_type *ktype);
/**
More information about the Ocfs2-commits
mailing list